--- a
+++ b/src/python/samples/rclmbox.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+import mailbox
+import email.header
+import email.utils
+#import sys
+import recollq
+import os
+import stat
+
# Hard-coded sample settings for this demo script.
#
# mbfile: path of the mbox file to index. An alternative test mailbox is
#     /Users/dockes/projets/fulltext/testrecoll/mail/fred
# rclconf: directory handed to recollq.connect() as its confdir argument.
mbfile = "/Users/dockes/mail/outmail"
rclconf = "/Users/dockes/.recoll-test"
+
def header_value(msg, nm, to_utf=False):
    """Return header *nm* of *msg* as decoded text.

    The raw header is unfolded (CR and LF removed) and passed through
    email.header.decode_header(); each decoded fragment is appended to the
    result followed by a single space, so a non-empty result always ends
    with a space.

    Args:
        msg: any object with a dict-like ``get(name)`` method, e.g. an
            email message.
        nm: header name to look up.
        to_utf: when true, return the result encoded as UTF-8 bytes instead
            of a text string.

    Returns:
        "" when the header is absent; otherwise the decoded text (or its
        UTF-8 encoding when *to_utf* is true).
    """
    value = msg.get(nm)
    if value is None:
        return ""
    # The email package may hand back a Header object (rather than a plain
    # string) for headers holding raw 8-bit data; flatten it first.
    if isinstance(value, email.header.Header):
        value = str(value)
    # Unfold continuation lines.
    value = value.replace("\n", "").replace("\r", "")

    pieces = []
    for chunk, charset in email.header.decode_header(value):
        if isinstance(chunk, bytes):
            try:
                # "replace" keeps one bad byte from aborting the whole run.
                chunk = chunk.decode(charset or "ascii", "replace")
            except LookupError:
                # Charset name unknown to Python: iso-8859-1 maps every
                # byte, so this fallback cannot fail.
                chunk = chunk.decode("iso-8859-1")
        pieces.append(chunk)
        pieces.append(u" ")
    univalue = u"".join(pieces)

    if to_utf:
        return univalue.encode('utf-8')
    return univalue
+
class mbox_indexer(object):
    """Feed every message of one mbox file to a Recoll database.

    The file's modification time and size are captured at construction and
    combined into a signature string; index() hands that signature to the
    database's needUpdate() to decide whether any work is needed.

    Attributes:
        mbfile: path of the mbox file.
        fmtime: st_mtime of the file at construction time.
        fbytes: st_size of the file at construction time.
        msgnum: 1-based number of the message currently being indexed.
    """

    def __init__(self, mbfile):
        """Remember *mbfile* and stat it (raises OSError if that fails)."""
        self.mbfile = mbfile
        stdata = os.stat(mbfile)
        self.fmtime = stdata[stat.ST_MTIME]
        self.fbytes = stdata[stat.ST_SIZE]
        self.msgnum = 1

    def sig(self):
        """Return the file signature, formatted as "<mtime>:<size>"."""
        return str(self.fmtime) + ":" + str(self.fbytes)

    def udi(self, msgnum):
        """Return the document identifier "<mbfile>:<msgnum>"."""
        return self.mbfile + ":" + str(msgnum)

    def index(self, db):
        """Index all messages of the mailbox into *db*.

        Nothing is done when db.needUpdate() returns a false value for the
        first message's identifier and the current file signature.

        Args:
            db: object providing needUpdate(udi, sig) and
                addOrUpdate(udi, parent_udi, doc).

        Returns:
            None in all cases.
        """
        if not db.needUpdate(self.udi(1), self.sig()):
            return None
        mb = mailbox.mbox(self.mbfile)
        try:
            # Restart numbering on every run so that calling index() again
            # yields the same identifiers instead of continuing the count.
            self.msgnum = 1
            # Iterating the mailbox loads messages one at a time rather
            # than building the full list first.
            for msg in mb:
                self.index_message(db, msg)
                self.msgnum += 1
        finally:
            mb.close()
        return None

    def index_message(self, db, msg):
        """Build a recollq.Doc for *msg* and store it in *db*.

        The document text is a small From/To/Subject header summary followed
        by the decoded content of every text/plain part. The document is
        stored under udi(self.msgnum) with an empty parent identifier.
        """
        sender = header_value(msg, "From")
        subject = header_value(msg, "Subject")

        doc = recollq.Doc()
        doc.author = sender
        doc.url = "file://" + self.mbfile
        doc.ipath = str(self.msgnum)
        doc.mimetype = "message/rfc822"
        doc.mtime = self._message_mtime(msg)
        doc.title = subject
        doc.fbytes = str(self.fbytes)

        chunks = [
            u"From: ", sender, u"\n",
            u"To: ", header_value(msg, "To"), u"\n",
            u"Subject: ", subject, u"\n",
            u"\n",
        ]
        for part in msg.walk():
            if part.is_multipart():
                continue
            # get_content_type() is documented to return a lower-case value.
            if part.get_content_type() == "text/plain":
                chunks.append(self._part_text(part))
        text = u"".join(chunks)

        doc.text = text
        doc.dbytes = str(len(text))
        doc.sig = self.sig()
        db.addOrUpdate(self.udi(self.msgnum), u"", doc)

    def _message_mtime(self, msg):
        """Return the message's Date header as an epoch-seconds string.

        Falls back to the mailbox file's mtime when the header is missing,
        unparseable, or out of the representable range.
        """
        dte = header_value(msg, "Date").strip()
        tm = email.utils.parsedate_tz(dte) if dte else None
        if tm is not None:
            try:
                # int() keeps the value integral even where mktime_tz
                # returns a float.
                return str(int(email.utils.mktime_tz(tm)))
            except (OverflowError, ValueError):
                pass
        return str(self.fmtime)

    @staticmethod
    def _part_text(part):
        """Return the decoded body of a non-multipart *part* as text."""
        raw = part.get_payload(None, True)
        if raw is None:
            return u""
        charset = part.get_content_charset("iso-8859-1")
        try:
            return raw.decode(charset, "replace")
        except LookupError:
            # Charset name unknown to Python; iso-8859-1 accepts any byte.
            return raw.decode("iso-8859-1")
+
+
# Script body: open the index described by rclconf in writable mode, then
# push every message of the sample mailbox into it.
db = recollq.connect(writable=1, confdir=rclconf)

mbidx = mbox_indexer(mbfile)
mbidx.index(db)