Parent: [d282f8] (diff)

Child: [094991] (diff)

Download this file

rclmbox.py    115 lines (104 with data), 3.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
"""An example that uses python tools to parse mbox/rfcxxx format and index
messages. Not supposed to run as-is or be really useful"""
import mailbox
import email.header
import email.utils
#import sys
import recoll
import os
import stat
# Path of the mbox file handed to mbox_indexer at the bottom of this script.
# The commented-out line is an alternate mailbox path.
#mbfile = "/Users/dockes/projets/fulltext/testrecoll/mail/fred"
mbfile = "/Users/dockes/mail/outmail"
# Directory passed to recoll.connect() as its confdir argument.
rclconf = "/Users/dockes/.recoll-test"
def _decode_header_chunk(chunk, charset):
    """Turn one (chunk, charset) pair from decode_header() into text."""
    if not isinstance(chunk, bytes):
        # Already text (decode_header may hand back the header unchanged).
        return chunk
    if charset is not None:
        try:
            return chunk.decode(charset, "replace")
        except LookupError:
            # Unknown or misspelled charset label: fall back to guessing.
            pass
    try:
        return chunk.decode("utf-8")
    except UnicodeDecodeError:
        # latin-1 maps every byte value, so this decode cannot fail.
        return chunk.decode("latin-1")


def header_value(msg, nm, to_utf = False):
    """Return header *nm* of *msg* as decoded text.

    msg    -- any object with a dict-like get(name) method returning the raw
              header string or None (e.g. an email message).
    nm     -- header name, e.g. "Subject".
    to_utf -- when true, return the result encoded as UTF-8 bytes instead
              of a unicode string.

    Returns "" when the header is absent. Otherwise every decoded chunk of
    the header is followed by a single space, so the result ends with a
    space. Undecodable bytes and unknown charsets never raise: they are
    replaced or decoded with a fallback charset.
    """
    # Local import: only needed for the error path below.
    from email.errors import HeaderParseError

    value = msg.get(nm)
    if value is None:
        return ""
    # Unfold continuation lines before looking for RFC 2047 encoded words.
    value = value.replace("\n", "").replace("\r", "")
    try:
        parts = email.header.decode_header(value)
    except HeaderParseError:
        # Malformed encoded word: keep the raw header rather than fail.
        parts = [(value, None)]
    univalue = u"".join(
        _decode_header_chunk(chunk, charset) + u" " for chunk, charset in parts
    )
    if to_utf:
        return univalue.encode('utf-8')
    return univalue
class mbox_indexer:
    """Index every message of one mbox file into a recoll database.

    Attributes set by the constructor:
      mbfile -- path of the mbox file
      fmtime -- modification time of the file, from os.stat()
      fbytes -- size of the file in bytes, from os.stat()
      msgnum -- 1-based number of the message currently being indexed;
                used as the document ipath and as part of its udi
    """

    def __init__(self, mbfile):
        """Record the path, mtime and size of *mbfile*.

        Raises OSError (from os.stat) when the file cannot be accessed.
        """
        self.mbfile = mbfile
        stdata = os.stat(mbfile)
        self.fmtime = stdata[stat.ST_MTIME]
        self.fbytes = stdata[stat.ST_SIZE]
        self.msgnum = 1

    def sig(self):
        """Return the "mtime:size" signature string of the mbox file."""
        return str(self.fmtime) + ":" + str(self.fbytes)

    def udi(self, msgnum):
        """Return the "path:msgnum" identifier for message *msgnum*."""
        return self.mbfile + ":" + str(msgnum)

    def index(self, db):
        """Index all messages into *db*; always returns None.

        Nothing is done when db.needUpdate() reports that the first
        message is current for the file signature.
        """
        if not db.needUpdate(self.udi(1), self.sig()):
            return None
        # Restart numbering so that calling index() again on the same
        # object yields the same ipath/udi values as the first call.
        self.msgnum = 1
        mb = mailbox.mbox(self.mbfile)
        try:
            # itervalues() yields messages one at a time instead of
            # building a list of the whole mailbox first.
            for msg in mb.itervalues():
                self.index_message(db, msg)
                self.msgnum += 1
        finally:
            mb.close()
        return None

    def index_message(self, db, msg):
        """Build a recoll.Doc for *msg* and store it in *db*.

        The document is stored under udi(self.msgnum) through
        db.addOrUpdate().
        """
        # Decode each header once; the values are reused for the document
        # fields and for the indexed text.
        author = header_value(msg, "From")
        to = header_value(msg, "To")
        subject = header_value(msg, "Subject")

        doc = recoll.Doc()
        doc.author = author
        doc.recipient = to + " " + header_value(msg, "Cc")
        doc.url = "file://" + self.mbfile
        doc.ipath = str(self.msgnum)
        doc.mimetype = "message/rfc822"
        doc.mtime = str(self._message_mtime(msg))
        doc.title = subject
        doc.fbytes = str(self.fbytes)
        text = self._message_text(msg, author, to, subject)
        doc.text = text
        # NOTE(review): this is the character count of the text, not its
        # encoded byte size -- confirm which one recoll expects here.
        doc.dbytes = str(len(text))
        doc.sig = self.sig()
        db.addOrUpdate(self.udi(self.msgnum), doc)

    def _message_mtime(self, msg):
        """Return the Date header as a timestamp, or the file mtime."""
        tm = email.utils.parsedate_tz(header_value(msg, "Date"))
        if tm is None:
            return self.fmtime
        try:
            return email.utils.mktime_tz(tm)
        except (OverflowError, ValueError):
            # Date parsed but is outside the representable range.
            return self.fmtime

    def _message_text(self, msg, author, to, subject):
        """Return a short header summary followed by all text/plain parts."""
        chunks = [
            u"From: " + author + u"\n",
            u"To: " + to + u"\n",
            u"Subject: " + subject + u"\n",
            u"\n",
        ]
        for part in msg.walk():
            if part.is_multipart():
                continue
            if part.get_content_type().lower() != "text/plain":
                continue
            payload = part.get_payload(None, True)
            if payload is None:
                continue
            charset = part.get_content_charset("iso-8859-1")
            try:
                # "replace" keeps one bad byte from aborting the whole run.
                chunks.append(payload.decode(charset, "replace"))
            except LookupError:
                # Charset label unknown to Python: use the default charset.
                chunks.append(payload.decode("iso-8859-1"))
        return u"".join(chunks)
# Open the index configured in rclconf with writable=1, then feed it every
# message of the mailbox named by mbfile.
db = recoll.connect(writable=1, confdir=rclconf)
mbidx = mbox_indexer(mbfile)
mbidx.index(db)