Switch to unified view

a b/src/python/samples/rclmbox.py
1
#!/usr/bin/env python
2
3
import mailbox
4
import email.header
5
import email.utils
6
#import sys
7
import recollq
8
import os
9
import stat
10
11
# Mailbox file to index.  Alternate test mailbox used during development:
#   /Users/dockes/projets/fulltext/testrecoll/mail/fred
mbfile = "/Users/dockes/mail/outmail"
# Configuration directory passed as confdir= to recollq.connect().
rclconf = "/Users/dockes/.recoll-test"
14
15
def header_value(msg, nm, to_utf=False):
    """Return header *nm* of *msg* as decoded text.

    msg    -- any object with a mapping-style ``get(name)`` method
              (e.g. an ``email.message.Message``).
    nm     -- header name to look up.
    to_utf -- when true, return the result encoded as UTF-8 bytes instead
              of a unicode string.

    Returns "" when the header is absent.  Otherwise the RFC 2047 encoded
    words are decoded and every decoded part is followed by a single
    space (so the result ends with a space).

    Undecodable bytes and unknown charset names never raise: offending
    bytes are replaced with U+FFFD so that one malformed header cannot
    abort a whole indexing run.
    """
    value = msg.get(nm)
    if value is None:
        return ""

    # Unfold the header: drop the line breaks of continuation lines.
    value = value.replace("\n", "").replace("\r", "")

    pieces = []
    for raw, charset in email.header.decode_header(value):
        # decode_header() yields byte strings for encoded words (and for
        # every part on Python 2); plain text parts may already be text.
        if isinstance(raw, bytes):
            try:
                raw = raw.decode(charset or "ascii", "replace")
            except LookupError:
                # Charset name unknown to Python: keep the ASCII subset.
                raw = raw.decode("ascii", "replace")
        pieces.append(raw + u" ")
    univalue = u"".join(pieces)

    if to_utf:
        return univalue.encode('utf-8')
    return univalue
34
35
class mbox_indexer:
    """Index the messages of one mbox file through a recollq database.

    Each message becomes one document whose unique document identifier
    (udi) is "<mbox path>:<message number>", with numbering starting at 1.
    The signature stored with every document is "<file mtime>:<file size>"
    of the mbox file as a whole.
    """

    def __init__(self, mbfile):
        """Record the path, mtime and size of *mbfile*.

        Raises OSError (from os.stat) if the file cannot be examined.
        """
        self.mbfile = mbfile
        stdata = os.stat(mbfile)
        # Index the stat tuple (integer values) so that sig() stays an
        # integer-based string.
        self.fmtime = stdata[stat.ST_MTIME]
        self.fbytes = stdata[stat.ST_SIZE]
        # Number of the message currently being indexed.
        self.msgnum = 1

    def sig(self):
        """Return the "<mtime>:<size>" signature of the mbox file."""
        return str(self.fmtime) + ":" + str(self.fbytes)

    def udi(self, msgnum):
        """Return the unique document identifier for message *msgnum*."""
        return self.mbfile + ":" + str(msgnum)

    def index(self, db):
        """Index every message of the mailbox into *db*.

        The whole file is skipped when db.needUpdate() answers false for
        the first message's udi and the current file signature.
        Always returns None.
        """
        if not db.needUpdate(self.udi(1), self.sig()):
            return None

        # Restart numbering so that a repeated call yields the same udis.
        self.msgnum = 1
        mb = mailbox.mbox(self.mbfile)
        try:
            # Iterate lazily instead of loading all messages with values().
            for msg in mb:
                self.index_message(db, msg)
                self.msgnum += 1
        finally:
            mb.close()

    def _message_mtime(self, msg):
        """Return the message date as an epoch-seconds string.

        Falls back to the mbox file mtime when the Date header is
        missing, unparsable or out of range.
        """
        date_text = header_value(msg, "Date").strip()
        tm = email.utils.parsedate_tz(date_text) if date_text else None
        if tm is not None:
            try:
                return str(email.utils.mktime_tz(tm))
            except (OverflowError, ValueError):
                pass
        return str(self.fmtime)

    def _message_text(self, msg, author, subject):
        """Build the indexed text: main headers plus all text/plain parts."""
        chunks = [
            u"From: " + author + u"\n",
            u"To: " + header_value(msg, "To") + u"\n",
            u"Subject: " + subject + u"\n",
            u"\n",
        ]
        for part in msg.walk():
            if part.is_multipart():
                continue
            if part.get_content_type().lower() != "text/plain":
                continue
            payload = part.get_payload(None, True)
            if payload is None:
                continue
            charset = part.get_content_charset("iso-8859-1")
            try:
                body = payload.decode(charset, "replace")
            except LookupError:
                # Charset name unknown to Python: iso-8859-1 maps every
                # byte, so this cannot fail.
                body = payload.decode("iso-8859-1")
            chunks.append(body)
        return u"".join(chunks)

    def index_message(self, db, msg):
        """Create a recollq.Doc for *msg* and store it in *db*.

        The document is stored under self.udi(self.msgnum), so
        self.msgnum must hold the number of *msg* when this is called.
        Doc fields utf8fn, origcharset, keywords and abstract are left
        unset.
        """
        author = header_value(msg, "From")
        subject = header_value(msg, "Subject")

        doc = recollq.Doc()
        doc.author = author
        doc.url = "file://" + self.mbfile
        doc.ipath = str(self.msgnum)
        doc.mimetype = "message/rfc822"
        doc.mtime = self._message_mtime(msg)
        doc.title = subject
        doc.fbytes = str(self.fbytes)

        text = self._message_text(msg, author, subject)
        doc.text = text
        # NOTE(review): this is the character count of the decoded text,
        # not a byte count -- confirm what recollq expects for dbytes.
        doc.dbytes = str(len(text))
        doc.sig = self.sig()

        db.addOrUpdate(self.udi(self.msgnum), u"", doc)
104
105
106
# Script body: connect to the index configured in rclconf (opened with
# writable=1), then index the mailbox named by mbfile.
db = recollq.connect(writable=1, confdir=rclconf)
mbidx = mbox_indexer(mbfile)
mbidx.index(db)