|
a |
|
b/src/python/samples/rclmbox.py |
|
|
1 |
#!/usr/bin/env python
|
|
|
2 |
|
|
|
3 |
import mailbox
|
|
|
4 |
import email.header
|
|
|
5 |
import email.utils
|
|
|
6 |
#import sys
|
|
|
7 |
import recollq
|
|
|
8 |
import os
|
|
|
9 |
import stat
|
|
|
10 |
|
|
|
11 |
# Sample settings: edit these two paths to match the local setup.
#
# Mbox file whose messages get indexed.  Alternative test mailbox:
#mbfile = "/Users/dockes/projets/fulltext/testrecoll/mail/fred"
mbfile = "/Users/dockes/mail/outmail"

# Recoll configuration directory, passed as confdir= when connecting.
rclconf = "/Users/dockes/.recoll-test"
|
|
|
14 |
|
|
|
15 |
def header_value(msg, nm, to_utf=False):
    """Return the decoded value of header `nm` from message `msg`.

    RFC 2047 encoded words are decoded to unicode text.  Every decoded
    chunk is followed by a single space, so a non-empty result always
    ends with a space (historical behaviour, kept for callers).

    msg    -- any object with a get(name) method returning the raw header
              string or None (e.g. an email.message.Message).
    nm     -- header name to look up.
    to_utf -- when true, return the result encoded as UTF-8 bytes instead
              of unicode text.

    Returns "" when the header is absent.
    """
    value = msg.get(nm)
    if value is None:
        return ""

    # Unfold continuation lines before handing the value to the decoder.
    value = value.replace("\n", "").replace("\r", "")

    pieces = []
    for data, charset in email.header.decode_header(value):
        if isinstance(data, bytes):
            # Chunks without a declared charset are treated as ASCII.
            # Undecodable bytes are replaced rather than aborting the
            # whole indexing run on one malformed header.
            try:
                data = data.decode(charset or "ascii", "replace")
            except LookupError:
                # Charset name unknown to Python: latin-1 never fails.
                data = data.decode("iso-8859-1")
        pieces.append(data)

    univalue = u"".join(piece + u" " for piece in pieces)
    if to_utf:
        return univalue.encode('utf-8')
    return univalue
|
|
|
34 |
|
|
|
35 |
class mbox_indexer:
    """Index every message of one mbox file into a Recoll database.

    Each message becomes one document whose unique identifier (udi) is
    "<mbox path>:<message number>", with numbering starting at 1.
    """

    def __init__(self, mbfile):
        """Record the mbox path and stat it for its mtime and size.

        Raises OSError (from os.stat) when `mbfile` cannot be stat'ed.
        """
        self.mbfile = mbfile
        stdata = os.stat(mbfile)
        self.fmtime = stdata[stat.ST_MTIME]
        self.fbytes = stdata[stat.ST_SIZE]
        # Number of the message currently being indexed (1-based).
        self.msgnum = 1

    def sig(self):
        """Return the file signature "<mtime>:<size>" used for up-to-date checks."""
        return str(self.fmtime) + ":" + str(self.fbytes)

    def udi(self, msgnum):
        """Return the unique document identifier for message `msgnum`."""
        return self.mbfile + ":" + str(msgnum)

    def index(self, db):
        """Index all messages of the mbox into `db`.

        Nothing is done (and None is returned) when `db` reports that the
        first message is already up to date for the current signature.
        """
        if not db.needUpdate(self.udi(1), self.sig()):
            return None

        # Restart numbering so that calling index() again yields the same
        # udi/ipath values instead of continuing from the previous run.
        self.msgnum = 1
        mb = mailbox.mbox(self.mbfile)
        try:
            # Iterate lazily instead of loading every message at once.
            for msg in mb:
                self.index_message(db, msg)
                self.msgnum += 1
        finally:
            mb.close()

    def _body_text(self, msg):
        """Return the concatenated decoded text/plain parts of `msg`.

        A part's declared charset is used, defaulting to iso-8859-1.
        Unknown charsets fall back to iso-8859-1 and undecodable bytes are
        replaced, so a single malformed message cannot abort indexing.
        """
        chunks = []
        for part in msg.walk():
            if part.is_multipart():
                continue
            if part.get_content_type().lower() != "text/plain":
                continue
            payload = part.get_payload(None, True)
            if payload is None:
                continue
            charset = part.get_content_charset("iso-8859-1")
            try:
                chunks.append(payload.decode(charset, "replace"))
            except LookupError:
                chunks.append(payload.decode("iso-8859-1"))
        return u"".join(chunks)

    def index_message(self, db, msg):
        """Build a Recoll document for `msg` and add or update it in `db`.

        The document is stored under the udi of the current self.msgnum.
        """
        author = header_value(msg, "From")
        subject = header_value(msg, "Subject")

        doc = recollq.Doc()
        doc.author = author
        doc.url = "file://" + self.mbfile
        doc.ipath = str(self.msgnum)
        doc.mimetype = "message/rfc822"

        # mtime: the message Date header when usable, else the file mtime.
        mtime = self.fmtime
        tm = email.utils.parsedate_tz(header_value(msg, "Date"))
        if tm is not None:
            try:
                mtime = email.utils.mktime_tz(tm)
            except (OverflowError, ValueError):
                # Date parsed but is not convertible: keep the file mtime.
                pass
        doc.mtime = str(mtime)

        doc.title = subject
        doc.fbytes = str(self.fbytes)

        # text: a small header summary, a blank line, then the plain-text
        # body parts.
        text = u"From: " + author + u"\n"
        text += u"To: " + header_value(msg, "To") + u"\n"
        text += u"Subject: " + subject + u"\n"
        text += u"\n"
        text += self._body_text(msg)
        doc.text = text

        # NOTE(review): this is the length in characters of the unicode
        # text, not an encoded byte count -- confirm what dbytes expects.
        doc.dbytes = str(len(text))
        doc.sig = self.sig()
        db.addOrUpdate(self.udi(self.msgnum), u"", doc)
|
|
|
104 |
|
|
|
105 |
|
|
|
106 |
# Script driver: connect to the Recoll index described by the configuration
# directory `rclconf` (requested with writable=1), then index the sample mbox.
db = recollq.connect(confdir=rclconf, writable=1)

mbidx = mbox_indexer(mbfile)
mbidx.index(db)
|