|
a/src/internfile/mh_mail.cpp |
|
b/src/internfile/mh_mail.cpp |
|
... |
|
... |
42 |
#include "mime.h"
|
42 |
#include "mime.h"
|
43 |
|
43 |
|
44 |
using namespace std;
|
44 |
using namespace std;
|
45 |
|
45 |
|
46 |
static const int maxdepth = 20;
|
46 |
static const int maxdepth = 20;
|
47 |
static const string cstr_recipient = "recipient";
|
47 |
static const string cstr_mail_charset("charset");
|
48 |
static const string cstr_modificationdate = "modificationdate";
|
|
|
49 |
static const string cstr_title = "title";
|
|
|
50 |
static const string cstr_msgid = "msgid";
|
|
|
51 |
static const string cstr_abstract = "abstract";
|
|
|
52 |
|
48 |
|
53 |
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &mt)
|
49 |
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &mt)
|
54 |
: RecollFilter(cnf, mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
|
50 |
: RecollFilter(cnf, mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
|
55 |
{
|
51 |
{
|
56 |
|
52 |
|
|
... |
|
... |
98 |
|
94 |
|
99 |
// Yes, we read the file twice. It would be possible in theory to add
|
95 |
// Yes, we read the file twice. It would be possible in theory to add
|
100 |
// the md5 computation to the mime analysis, but ...
|
96 |
// the md5 computation to the mime analysis, but ...
|
101 |
string md5, xmd5, reason;
|
97 |
string md5, xmd5, reason;
|
102 |
if (MD5File(fn, md5, &reason)) {
|
98 |
if (MD5File(fn, md5, &reason)) {
|
103 |
m_metaData[cstr_md5] = MD5HexPrint(md5, xmd5);
|
99 |
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
|
104 |
} else {
|
100 |
} else {
|
105 |
LOGERR(("MimeHandlerMail: cant compute md5 for [%s]: %s\n", fn.c_str(),
|
101 |
LOGERR(("MimeHandlerMail: cant compute md5 for [%s]: %s\n", fn.c_str(),
|
106 |
reason.c_str()));
|
102 |
reason.c_str()));
|
107 |
}
|
103 |
}
|
108 |
|
104 |
|
|
... |
|
... |
130 |
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
|
126 |
LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
|
131 |
delete m_stream;
|
127 |
delete m_stream;
|
132 |
|
128 |
|
133 |
string md5, xmd5;
|
129 |
string md5, xmd5;
|
134 |
MD5String(msgtxt, md5);
|
130 |
MD5String(msgtxt, md5);
|
135 |
m_metaData[cstr_md5] = MD5HexPrint(md5, xmd5);
|
131 |
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
|
136 |
|
132 |
|
137 |
m_stream = new stringstream(msgtxt);
|
133 |
m_stream = new stringstream(msgtxt);
|
138 |
delete m_bincdoc;
|
134 |
delete m_bincdoc;
|
139 |
m_bincdoc = new Binc::MimeDocument;
|
135 |
m_bincdoc = new Binc::MimeDocument;
|
140 |
m_bincdoc->parseFull(*m_stream);
|
136 |
m_bincdoc->parseFull(*m_stream);
|
|
... |
|
... |
170 |
if (!m_havedoc)
|
166 |
if (!m_havedoc)
|
171 |
return false;
|
167 |
return false;
|
172 |
bool res = false;
|
168 |
bool res = false;
|
173 |
|
169 |
|
174 |
if (m_idx == -1) {
|
170 |
if (m_idx == -1) {
|
175 |
m_metaData[cstr_mimetype] = cstr_textplain;
|
171 |
m_metaData[cstr_dj_keymt] = cstr_textplain;
|
176 |
res = processMsg(m_bincdoc, 0);
|
172 |
res = processMsg(m_bincdoc, 0);
|
177 |
LOGDEB1(("MimeHandlerMail::next_document: mimetype %s\n",
|
173 |
LOGDEB1(("MimeHandlerMail::next_document: mimetype %s\n",
|
178 |
m_metaData[cstr_mimetype].c_str()));
|
174 |
m_metaData[cstr_dj_keymt].c_str()));
|
179 |
const string& txt = m_metaData[cstr_content];
|
175 |
const string& txt = m_metaData[cstr_dj_keycontent];
|
180 |
if (m_startoftext < txt.size())
|
176 |
if (m_startoftext < txt.size())
|
181 |
m_metaData[cstr_abstract] =
|
177 |
m_metaData[cstr_dj_keyabstract] =
|
182 |
truncate_to_word(txt.substr(m_startoftext), 250);
|
178 |
truncate_to_word(txt.substr(m_startoftext), 250);
|
183 |
} else {
|
179 |
} else {
|
184 |
m_metaData[cstr_abstract].clear();
|
180 |
m_metaData[cstr_dj_keyabstract].clear();
|
185 |
res = processAttach();
|
181 |
res = processAttach();
|
186 |
}
|
182 |
}
|
187 |
m_idx++;
|
183 |
m_idx++;
|
188 |
m_havedoc = m_idx < (int)m_attachments.size();
|
184 |
m_havedoc = m_idx < (int)m_attachments.size();
|
189 |
if (!m_havedoc) {
|
185 |
if (!m_havedoc) {
|
|
... |
|
... |
233 |
m_havedoc = false;
|
229 |
m_havedoc = false;
|
234 |
return false;
|
230 |
return false;
|
235 |
}
|
231 |
}
|
236 |
MHMailAttach *att = m_attachments[m_idx];
|
232 |
MHMailAttach *att = m_attachments[m_idx];
|
237 |
|
233 |
|
238 |
m_metaData[cstr_mimetype] = att->m_contentType;
|
234 |
m_metaData[cstr_dj_keymt] = att->m_contentType;
|
239 |
m_metaData[cstr_charset] = att->m_charset;
|
235 |
m_metaData[cstr_dj_keycharset] = att->m_charset;
|
240 |
m_metaData["filename"] = att->m_filename;
|
236 |
m_metaData[cstr_dj_keyfn] = att->m_filename;
|
241 |
// Change the title to something helpful
|
237 |
// Change the title to something helpful
|
242 |
m_metaData[cstr_title] = att->m_filename + " (" + m_subject + ")";
|
238 |
m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")";
|
243 |
LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n",
|
239 |
LOGDEB1((" processAttach:ct [%s] cs [%s] fn [%s]\n",
|
244 |
att->m_contentType.c_str(),
|
240 |
att->m_contentType.c_str(),
|
245 |
att->m_charset.c_str(),
|
241 |
att->m_charset.c_str(),
|
246 |
att->m_filename.c_str()));
|
242 |
att->m_filename.c_str()));
|
247 |
|
243 |
|
248 |
m_metaData[cstr_content] = string();
|
244 |
m_metaData[cstr_dj_keycontent] = string();
|
249 |
string& body = m_metaData[cstr_content];
|
245 |
string& body = m_metaData[cstr_dj_keycontent];
|
250 |
att->m_part->getBody(body, 0, att->m_part->bodylength);
|
246 |
att->m_part->getBody(body, 0, att->m_part->bodylength);
|
251 |
string decoded;
|
247 |
string decoded;
|
252 |
const string *bdp;
|
248 |
const string *bdp;
|
253 |
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
|
249 |
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
|
254 |
return false;
|
250 |
return false;
|
|
... |
|
... |
257 |
body = decoded;
|
253 |
body = decoded;
|
258 |
|
254 |
|
259 |
// Special case for text/plain content. Internfile should deal
|
255 |
// Special case for text/plain content. Internfile should deal
|
260 |
// with this but it expects text/plain to be utf-8 already, so we
|
256 |
// with this but it expects text/plain to be utf-8 already, so we
|
261 |
// handle the transcoding if needed
|
257 |
// handle the transcoding if needed
|
262 |
if (m_metaData[cstr_mimetype] == cstr_textplain) {
|
258 |
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
|
263 |
string utf8;
|
259 |
string utf8;
|
264 |
if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) {
|
260 |
if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) {
|
265 |
LOGERR((" processAttach: transcode to utf-8 failed "
|
261 |
LOGERR((" processAttach: transcode to utf-8 failed "
|
266 |
"for charset [%s]\n", m_metaData[cstr_charset].c_str()));
|
262 |
"for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
|
267 |
// can't transcode at all -> data is garbage just erase it
|
263 |
// can't transcode at all -> data is garbage just erase it
|
268 |
body.clear();
|
264 |
body.clear();
|
269 |
} else {
|
265 |
} else {
|
270 |
body = utf8;
|
266 |
body = utf8;
|
271 |
}
|
267 |
}
|
272 |
}
|
268 |
}
|
273 |
|
269 |
|
274 |
// Special case for application/octet-stream: try to better
|
270 |
// Special case for application/octet-stream: try to better
|
275 |
// identify content, using file name if set
|
271 |
// identify content, using file name if set
|
276 |
if (m_metaData[cstr_mimetype] == "application/octet-stream" &&
|
272 |
if (m_metaData[cstr_dj_keymt] == "application/octet-stream" &&
|
277 |
!m_metaData["filename"].empty()) {
|
273 |
!m_metaData[cstr_dj_keyfn].empty()) {
|
278 |
string mt = mimetype(m_metaData["filename"], 0,
|
274 |
string mt = mimetype(m_metaData[cstr_dj_keyfn], 0,
|
279 |
m_config, false);
|
275 |
m_config, false);
|
280 |
if (!mt.empty())
|
276 |
if (!mt.empty())
|
281 |
m_metaData[cstr_mimetype] = mt;
|
277 |
m_metaData[cstr_dj_keymt] = mt;
|
282 |
}
|
278 |
}
|
283 |
|
279 |
|
284 |
// Ipath
|
280 |
// Ipath
|
285 |
char nbuf[20];
|
281 |
char nbuf[20];
|
286 |
sprintf(nbuf, "%d", m_idx);
|
282 |
sprintf(nbuf, "%d", m_idx);
|
287 |
m_metaData[cstr_ipath] = nbuf;
|
283 |
m_metaData[cstr_dj_keyipath] = nbuf;
|
288 |
|
284 |
|
289 |
return true;
|
285 |
return true;
|
290 |
}
|
286 |
}
|
291 |
|
287 |
|
292 |
// Transform a single message into a document. The subject becomes the
|
288 |
// Transform a single message into a document. The subject becomes the
|
|
... |
|
... |
306 |
// Return true anyway, better to index partially than not at all
|
302 |
// Return true anyway, better to index partially than not at all
|
307 |
return true;
|
303 |
return true;
|
308 |
}
|
304 |
}
|
309 |
|
305 |
|
310 |
// Handle some headers.
|
306 |
// Handle some headers.
|
311 |
string& text = m_metaData[cstr_content];
|
307 |
string& text = m_metaData[cstr_dj_keycontent];
|
312 |
Binc::HeaderItem hi;
|
308 |
Binc::HeaderItem hi;
|
313 |
string transcoded;
|
309 |
string transcoded;
|
314 |
if (doc->h.getFirstHeader("From", hi)) {
|
310 |
if (doc->h.getFirstHeader("From", hi)) {
|
315 |
rfc2047_decode(hi.getValue(), transcoded);
|
311 |
rfc2047_decode(hi.getValue(), transcoded);
|
316 |
if (preview())
|
312 |
if (preview())
|
317 |
text += string("From: ");
|
313 |
text += string("From: ");
|
318 |
text += transcoded + cstr_newline;
|
314 |
text += transcoded + cstr_newline;
|
319 |
if (depth == 1) {
|
315 |
if (depth == 1) {
|
320 |
m_metaData[cstr_author] = transcoded;
|
316 |
m_metaData[cstr_dj_keyauthor] = transcoded;
|
321 |
}
|
317 |
}
|
322 |
}
|
318 |
}
|
323 |
if (doc->h.getFirstHeader("To", hi)) {
|
319 |
if (doc->h.getFirstHeader("To", hi)) {
|
324 |
rfc2047_decode(hi.getValue(), transcoded);
|
320 |
rfc2047_decode(hi.getValue(), transcoded);
|
325 |
if (preview())
|
321 |
if (preview())
|
326 |
text += string("To: ");
|
322 |
text += string("To: ");
|
327 |
text += transcoded + cstr_newline;
|
323 |
text += transcoded + cstr_newline;
|
328 |
if (depth == 1) {
|
324 |
if (depth == 1) {
|
329 |
m_metaData[cstr_recipient] = transcoded;
|
325 |
m_metaData[cstr_dj_keyrecipient] = transcoded;
|
330 |
}
|
326 |
}
|
331 |
}
|
327 |
}
|
332 |
if (doc->h.getFirstHeader("Cc", hi)) {
|
328 |
if (doc->h.getFirstHeader("Cc", hi)) {
|
333 |
rfc2047_decode(hi.getValue(), transcoded);
|
329 |
rfc2047_decode(hi.getValue(), transcoded);
|
334 |
if (preview())
|
330 |
if (preview())
|
335 |
text += string("Cc: ");
|
331 |
text += string("Cc: ");
|
336 |
text += transcoded + cstr_newline;
|
332 |
text += transcoded + cstr_newline;
|
337 |
if (depth == 1) {
|
333 |
if (depth == 1) {
|
338 |
m_metaData[cstr_recipient] += " " + transcoded;
|
334 |
m_metaData[cstr_dj_keyrecipient] += " " + transcoded;
|
339 |
}
|
335 |
}
|
340 |
}
|
336 |
}
|
341 |
if (doc->h.getFirstHeader("Message-Id", hi)) {
|
337 |
if (doc->h.getFirstHeader("Message-Id", hi)) {
|
342 |
if (depth == 1) {
|
338 |
if (depth == 1) {
|
343 |
m_metaData[cstr_msgid] = hi.getValue();
|
339 |
m_metaData[cstr_dj_keymsgid] = hi.getValue();
|
344 |
trimstring(m_metaData[cstr_msgid], "<>");
|
340 |
trimstring(m_metaData[cstr_dj_keymsgid], "<>");
|
345 |
}
|
341 |
}
|
346 |
}
|
342 |
}
|
347 |
if (doc->h.getFirstHeader("Date", hi)) {
|
343 |
if (doc->h.getFirstHeader("Date", hi)) {
|
348 |
rfc2047_decode(hi.getValue(), transcoded);
|
344 |
rfc2047_decode(hi.getValue(), transcoded);
|
349 |
if (depth == 1) {
|
345 |
if (depth == 1) {
|
350 |
time_t t = rfc2822DateToUxTime(transcoded);
|
346 |
time_t t = rfc2822DateToUxTime(transcoded);
|
351 |
if (t != (time_t)-1) {
|
347 |
if (t != (time_t)-1) {
|
352 |
char ascuxtime[100];
|
348 |
char ascuxtime[100];
|
353 |
sprintf(ascuxtime, "%ld", (long)t);
|
349 |
sprintf(ascuxtime, "%ld", (long)t);
|
354 |
m_metaData[cstr_modificationdate] = ascuxtime;
|
350 |
m_metaData[cstr_dj_keymd] = ascuxtime;
|
355 |
} else {
|
351 |
} else {
|
356 |
// Leave mtime field alone, ftime will be used instead.
|
352 |
// Leave mtime field alone, ftime will be used instead.
|
357 |
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
|
353 |
LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
|
358 |
}
|
354 |
}
|
359 |
}
|
355 |
}
|
|
... |
|
... |
362 |
text += transcoded + cstr_newline;
|
358 |
text += transcoded + cstr_newline;
|
363 |
}
|
359 |
}
|
364 |
if (doc->h.getFirstHeader("Subject", hi)) {
|
360 |
if (doc->h.getFirstHeader("Subject", hi)) {
|
365 |
rfc2047_decode(hi.getValue(), transcoded);
|
361 |
rfc2047_decode(hi.getValue(), transcoded);
|
366 |
if (depth == 1) {
|
362 |
if (depth == 1) {
|
367 |
m_metaData[cstr_title] = transcoded;
|
363 |
m_metaData[cstr_dj_keytitle] = transcoded;
|
368 |
m_subject = transcoded;
|
364 |
m_subject = transcoded;
|
369 |
}
|
365 |
}
|
370 |
if (preview())
|
366 |
if (preview())
|
371 |
text += string("Subject: ");
|
367 |
text += string("Subject: ");
|
372 |
text += transcoded + cstr_newline;
|
368 |
text += transcoded + cstr_newline;
|
|
... |
|
... |
391 |
LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
|
387 |
LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
|
392 |
doc->isMultipart(), doc->getSubType().c_str()));
|
388 |
doc->isMultipart(), doc->getSubType().c_str()));
|
393 |
walkmime(doc, depth);
|
389 |
walkmime(doc, depth);
|
394 |
|
390 |
|
395 |
LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n",
|
391 |
LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n",
|
396 |
m_metaData[cstr_content].c_str()));
|
392 |
m_metaData[cstr_dj_keycontent].c_str()));
|
397 |
return true;
|
393 |
return true;
|
398 |
}
|
394 |
}
|
399 |
|
395 |
|
400 |
// Recursively walk the message mime parts and concatenate all the
|
396 |
// Recursively walk the message mime parts and concatenate all the
|
401 |
// inline html or text that we find anywhere.
|
397 |
// inline html or text that we find anywhere.
|
|
... |
|
... |
413 |
if (depth++ >= maxdepth) {
|
409 |
if (depth++ >= maxdepth) {
|
414 |
LOGINFO(("walkmime: max depth (%d) exceeded\n", maxdepth));
|
410 |
LOGINFO(("walkmime: max depth (%d) exceeded\n", maxdepth));
|
415 |
return;
|
411 |
return;
|
416 |
}
|
412 |
}
|
417 |
|
413 |
|
418 |
string& out = m_metaData[cstr_content];
|
414 |
string& out = m_metaData[cstr_dj_keycontent];
|
419 |
|
415 |
|
420 |
if (doc->isMultipart()) {
|
416 |
if (doc->isMultipart()) {
|
421 |
LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n",
|
417 |
LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n",
|
422 |
doc->isMultipart(), doc->getSubType().c_str()));
|
418 |
doc->isMultipart(), doc->getSubType().c_str()));
|
423 |
// We only handle alternative, related and mixed (no digests).
|
419 |
// We only handle alternative, related and mixed (no digests).
|
|
... |
|
... |
525 |
// 8 bit chars exist in a message that is stated as us-ascii. Ie the
|
521 |
// 8 bit chars exist in a message that is stated as us-ascii. Ie the
|
526 |
// mailer used by yahoo support ('KANA') does this. We could convert
|
522 |
// mailer used by yahoo support ('KANA') does this. We could convert
|
527 |
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
|
523 |
// to iso-8859 only if the transfer-encoding is 8 bit, or test for
|
528 |
// actual 8 bit chars, but what the heck, let's use 8859-1 as default
|
524 |
// actual 8 bit chars, but what the heck, let's use 8859-1 as default
|
529 |
string charset;
|
525 |
string charset;
|
530 |
it = content_type.params.find(string(cstr_charset));
|
526 |
it = content_type.params.find(cstr_mail_charset);
|
531 |
if (it != content_type.params.end())
|
527 |
if (it != content_type.params.end())
|
532 |
charset = it->second;
|
528 |
charset = it->second;
|
533 |
if (charset.empty() ||
|
529 |
if (charset.empty() ||
|
534 |
!stringlowercmp("us-ascii", charset) ||
|
530 |
!stringlowercmp("us-ascii", charset) ||
|
535 |
!stringlowercmp("default", charset) ||
|
531 |
!stringlowercmp("default", charset) ||
|
|
... |
|
... |
607 |
m_forPreview ? "view" : "index");
|
603 |
m_forPreview ? "view" : "index");
|
608 |
mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
604 |
mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
|
609 |
mh.set_document_string(body);
|
605 |
mh.set_document_string(body);
|
610 |
mh.next_document();
|
606 |
mh.next_document();
|
611 |
map<string, string>::const_iterator it =
|
607 |
map<string, string>::const_iterator it =
|
612 |
mh.get_meta_data().find(cstr_content);
|
608 |
mh.get_meta_data().find(cstr_dj_keycontent);
|
613 |
if (it != mh.get_meta_data().end())
|
609 |
if (it != mh.get_meta_data().end())
|
614 |
out += it->second;
|
610 |
out += it->second;
|
615 |
} else {
|
611 |
} else {
|
616 |
// Transcode to utf-8
|
612 |
// Transcode to utf-8
|
617 |
LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
|
613 |
LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
|