Switch to unified view

a/src/internfile/mh_mail.cpp b/src/internfile/mh_mail.cpp
...
...
42
#include "mime.h"
42
#include "mime.h"
43
43
44
using namespace std;
44
using namespace std;
45
45
46
static const int maxdepth = 20;
46
static const int maxdepth = 20;
47
static const string cstr_recipient = "recipient";
47
static const string cstr_mail_charset("charset");
48
static const string cstr_modificationdate = "modificationdate";
49
static const string cstr_title = "title";
50
static const string cstr_msgid = "msgid";
51
static const string cstr_abstract = "abstract";
52
48
53
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &mt) 
49
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &mt) 
54
    : RecollFilter(cnf, mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
50
    : RecollFilter(cnf, mt), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
55
{
51
{
56
52
...
...
98
94
99
    // Yes, we read the file twice. It would be possible in theory to add
95
    // Yes, we read the file twice. It would be possible in theory to add
100
    // the md5 computation to the mime analysis, but ...
96
    // the md5 computation to the mime analysis, but ...
101
    string md5, xmd5, reason;
97
    string md5, xmd5, reason;
102
    if (MD5File(fn, md5, &reason)) {
98
    if (MD5File(fn, md5, &reason)) {
103
    m_metaData[cstr_md5] = MD5HexPrint(md5, xmd5);
99
    m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
104
    } else {
100
    } else {
105
    LOGERR(("MimeHandlerMail: cant compute md5 for [%s]: %s\n", fn.c_str(),
101
    LOGERR(("MimeHandlerMail: cant compute md5 for [%s]: %s\n", fn.c_str(),
106
        reason.c_str()));
102
        reason.c_str()));
107
    }
103
    }
108
104
...
...
130
    LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
126
    LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
131
    delete m_stream;
127
    delete m_stream;
132
128
133
    string md5, xmd5;
129
    string md5, xmd5;
134
    MD5String(msgtxt, md5);
130
    MD5String(msgtxt, md5);
135
    m_metaData[cstr_md5] = MD5HexPrint(md5, xmd5);
131
    m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
136
132
137
    m_stream = new stringstream(msgtxt);
133
    m_stream = new stringstream(msgtxt);
138
    delete m_bincdoc;
134
    delete m_bincdoc;
139
    m_bincdoc = new Binc::MimeDocument;
135
    m_bincdoc = new Binc::MimeDocument;
140
    m_bincdoc->parseFull(*m_stream);
136
    m_bincdoc->parseFull(*m_stream);
...
...
170
    if (!m_havedoc)
166
    if (!m_havedoc)
171
    return false;
167
    return false;
172
    bool res = false;
168
    bool res = false;
173
169
174
    if (m_idx == -1) {
170
    if (m_idx == -1) {
175
    m_metaData[cstr_mimetype] = cstr_textplain;
171
    m_metaData[cstr_dj_keymt] = cstr_textplain;
176
    res = processMsg(m_bincdoc, 0);
172
    res = processMsg(m_bincdoc, 0);
177
    LOGDEB1(("MimeHandlerMail::next_document: mimetype %s\n",
173
    LOGDEB1(("MimeHandlerMail::next_document: mimetype %s\n",
178
        m_metaData[cstr_mimetype].c_str()));
174
        m_metaData[cstr_dj_keymt].c_str()));
179
        const string& txt = m_metaData[cstr_content];
175
        const string& txt = m_metaData[cstr_dj_keycontent];
180
        if (m_startoftext < txt.size())
176
        if (m_startoftext < txt.size())
181
            m_metaData[cstr_abstract] = 
177
            m_metaData[cstr_dj_keyabstract] = 
182
                truncate_to_word(txt.substr(m_startoftext), 250);
178
                truncate_to_word(txt.substr(m_startoftext), 250);
183
    } else {
179
    } else {
184
        m_metaData[cstr_abstract].clear();
180
        m_metaData[cstr_dj_keyabstract].clear();
185
    res = processAttach();
181
    res = processAttach();
186
    }
182
    }
187
    m_idx++;
183
    m_idx++;
188
    m_havedoc = m_idx < (int)m_attachments.size();
184
    m_havedoc = m_idx < (int)m_attachments.size();
189
    if (!m_havedoc) {
185
    if (!m_havedoc) {
...
...
233
    m_havedoc = false;
229
    m_havedoc = false;
234
    return false;
230
    return false;
235
    }
231
    }
236
    MHMailAttach *att = m_attachments[m_idx];
232
    MHMailAttach *att = m_attachments[m_idx];
237
233
238
    m_metaData[cstr_mimetype] = att->m_contentType;
234
    m_metaData[cstr_dj_keymt] = att->m_contentType;
239
    m_metaData[cstr_charset] = att->m_charset;
235
    m_metaData[cstr_dj_keycharset] = att->m_charset;
240
    m_metaData["filename"] = att->m_filename;
236
    m_metaData[cstr_dj_keyfn] = att->m_filename;
241
    // Change the title to something helpful
237
    // Change the title to something helpful
242
    m_metaData[cstr_title] = att->m_filename + "  (" + m_subject + ")";
238
    m_metaData[cstr_dj_keytitle] = att->m_filename + "  (" + m_subject + ")";
243
    LOGDEB1(("  processAttach:ct [%s] cs [%s] fn [%s]\n", 
239
    LOGDEB1(("  processAttach:ct [%s] cs [%s] fn [%s]\n", 
244
        att->m_contentType.c_str(),
240
        att->m_contentType.c_str(),
245
        att->m_charset.c_str(),
241
        att->m_charset.c_str(),
246
        att->m_filename.c_str()));
242
        att->m_filename.c_str()));
247
243
248
    m_metaData[cstr_content] = string();
244
    m_metaData[cstr_dj_keycontent] = string();
249
    string& body = m_metaData[cstr_content];
245
    string& body = m_metaData[cstr_dj_keycontent];
250
    att->m_part->getBody(body, 0, att->m_part->bodylength);
246
    att->m_part->getBody(body, 0, att->m_part->bodylength);
251
    string decoded;
247
    string decoded;
252
    const string *bdp;
248
    const string *bdp;
253
    if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
249
    if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
254
    return false;
250
    return false;
...
...
257
    body = decoded;
253
    body = decoded;
258
254
259
    // Special case for text/plain content. Internfile should deal
255
    // Special case for text/plain content. Internfile should deal
260
    // with this but it expects text/plain to be utf-8 already, so we
256
    // with this but it expects text/plain to be utf-8 already, so we
261
    // handle the transcoding if needed
257
    // handle the transcoding if needed
262
    if (m_metaData[cstr_mimetype] == cstr_textplain) {
258
    if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
263
    string utf8;
259
    string utf8;
264
    if (!transcode(body, utf8, m_metaData[cstr_charset], "UTF-8")) {
260
    if (!transcode(body, utf8, m_metaData[cstr_dj_keycharset], "UTF-8")) {
265
        LOGERR(("  processAttach: transcode to utf-8 failed "
261
        LOGERR(("  processAttach: transcode to utf-8 failed "
266
            "for charset [%s]\n", m_metaData[cstr_charset].c_str()));
262
            "for charset [%s]\n", m_metaData[cstr_dj_keycharset].c_str()));
267
        // can't transcode at all -> data is garbage just erase it
263
        // can't transcode at all -> data is garbage just erase it
268
        body.clear();
264
        body.clear();
269
    } else {
265
    } else {
270
        body = utf8;
266
        body = utf8;
271
    }
267
    }
272
    }
268
    }
273
269
274
    // Special case for application/octet-stream: try to better
270
    // Special case for application/octet-stream: try to better
275
    // identify content, using file name if set
271
    // identify content, using file name if set
276
    if (m_metaData[cstr_mimetype] == "application/octet-stream" &&
272
    if (m_metaData[cstr_dj_keymt] == "application/octet-stream" &&
277
  !m_metaData["filename"].empty()) {
273
  !m_metaData[cstr_dj_keyfn].empty()) {
278
    string mt = mimetype(m_metaData["filename"], 0,    
274
    string mt = mimetype(m_metaData[cstr_dj_keyfn], 0,   
279
                 m_config, false);
275
                 m_config, false);
280
    if (!mt.empty()) 
276
    if (!mt.empty()) 
281
        m_metaData[cstr_mimetype] = mt;
277
        m_metaData[cstr_dj_keymt] = mt;
282
    }
278
    }
283
279
284
    // Ipath
280
    // Ipath
285
    char nbuf[20];
281
    char nbuf[20];
286
    sprintf(nbuf, "%d", m_idx);
282
    sprintf(nbuf, "%d", m_idx);
287
    m_metaData[cstr_ipath] = nbuf;
283
    m_metaData[cstr_dj_keyipath] = nbuf;
288
284
289
    return true;
285
    return true;
290
}
286
}
291
287
292
// Transform a single message into a document. The subject becomes the
288
// Transform a single message into a document. The subject becomes the
...
...
306
    // Return true anyway, better to index partially than not at all
302
    // Return true anyway, better to index partially than not at all
307
    return true;
303
    return true;
308
    }
304
    }
309
    
305
    
310
    // Handle some headers. 
306
    // Handle some headers. 
311
    string& text = m_metaData[cstr_content];
307
    string& text = m_metaData[cstr_dj_keycontent];
312
    Binc::HeaderItem hi;
308
    Binc::HeaderItem hi;
313
    string transcoded;
309
    string transcoded;
314
    if (doc->h.getFirstHeader("From", hi)) {
310
    if (doc->h.getFirstHeader("From", hi)) {
315
    rfc2047_decode(hi.getValue(), transcoded);
311
    rfc2047_decode(hi.getValue(), transcoded);
316
    if (preview())
312
    if (preview())
317
        text += string("From: ");
313
        text += string("From: ");
318
    text += transcoded + cstr_newline;
314
    text += transcoded + cstr_newline;
319
    if (depth == 1) {
315
    if (depth == 1) {
320
        m_metaData[cstr_author] = transcoded;
316
        m_metaData[cstr_dj_keyauthor] = transcoded;
321
    }
317
    }
322
    }
318
    }
323
    if (doc->h.getFirstHeader("To", hi)) {
319
    if (doc->h.getFirstHeader("To", hi)) {
324
    rfc2047_decode(hi.getValue(), transcoded);
320
    rfc2047_decode(hi.getValue(), transcoded);
325
    if (preview())
321
    if (preview())
326
        text += string("To: ");
322
        text += string("To: ");
327
    text += transcoded + cstr_newline;
323
    text += transcoded + cstr_newline;
328
    if (depth == 1) {
324
    if (depth == 1) {
329
        m_metaData[cstr_recipient] = transcoded;
325
        m_metaData[cstr_dj_keyrecipient] = transcoded;
330
    }
326
    }
331
    }
327
    }
332
    if (doc->h.getFirstHeader("Cc", hi)) {
328
    if (doc->h.getFirstHeader("Cc", hi)) {
333
    rfc2047_decode(hi.getValue(), transcoded);
329
    rfc2047_decode(hi.getValue(), transcoded);
334
    if (preview())
330
    if (preview())
335
        text += string("Cc: ");
331
        text += string("Cc: ");
336
    text += transcoded + cstr_newline;
332
    text += transcoded + cstr_newline;
337
    if (depth == 1) {
333
    if (depth == 1) {
338
        m_metaData[cstr_recipient] += " " + transcoded;
334
        m_metaData[cstr_dj_keyrecipient] += " " + transcoded;
339
    }
335
    }
340
    }
336
    }
341
    if (doc->h.getFirstHeader("Message-Id", hi)) {
337
    if (doc->h.getFirstHeader("Message-Id", hi)) {
342
    if (depth == 1) {
338
    if (depth == 1) {
343
        m_metaData[cstr_msgid] =  hi.getValue();
339
        m_metaData[cstr_dj_keymsgid] =  hi.getValue();
344
            trimstring(m_metaData[cstr_msgid], "<>");
340
            trimstring(m_metaData[cstr_dj_keymsgid], "<>");
345
    }
341
    }
346
    }
342
    }
347
    if (doc->h.getFirstHeader("Date", hi)) {
343
    if (doc->h.getFirstHeader("Date", hi)) {
348
    rfc2047_decode(hi.getValue(), transcoded);
344
    rfc2047_decode(hi.getValue(), transcoded);
349
    if (depth == 1) {
345
    if (depth == 1) {
350
        time_t t = rfc2822DateToUxTime(transcoded);
346
        time_t t = rfc2822DateToUxTime(transcoded);
351
        if (t != (time_t)-1) {
347
        if (t != (time_t)-1) {
352
        char ascuxtime[100];
348
        char ascuxtime[100];
353
        sprintf(ascuxtime, "%ld", (long)t);
349
        sprintf(ascuxtime, "%ld", (long)t);
354
        m_metaData[cstr_modificationdate] = ascuxtime;
350
        m_metaData[cstr_dj_keymd] = ascuxtime;
355
        } else {
351
        } else {
356
        // Leave mtime field alone, ftime will be used instead.
352
        // Leave mtime field alone, ftime will be used instead.
357
        LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
353
        LOGDEB(("rfc2822Date...: failed: [%s]\n", transcoded.c_str()));
358
        }
354
        }
359
    }
355
    }
...
...
362
    text += transcoded + cstr_newline;
358
    text += transcoded + cstr_newline;
363
    }
359
    }
364
    if (doc->h.getFirstHeader("Subject", hi)) {
360
    if (doc->h.getFirstHeader("Subject", hi)) {
365
    rfc2047_decode(hi.getValue(), transcoded);
361
    rfc2047_decode(hi.getValue(), transcoded);
366
    if (depth == 1) {
362
    if (depth == 1) {
367
        m_metaData[cstr_title] = transcoded;
363
        m_metaData[cstr_dj_keytitle] = transcoded;
368
        m_subject = transcoded;
364
        m_subject = transcoded;
369
    }
365
    }
370
    if (preview())
366
    if (preview())
371
        text += string("Subject: ");
367
        text += string("Subject: ");
372
    text += transcoded + cstr_newline;
368
    text += transcoded + cstr_newline;
...
...
391
    LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
387
    LOGDEB2(("MimeHandlerMail::processMsg:ismultipart %d mime subtype '%s'\n",
392
        doc->isMultipart(), doc->getSubType().c_str()));
388
        doc->isMultipart(), doc->getSubType().c_str()));
393
    walkmime(doc, depth);
389
    walkmime(doc, depth);
394
390
395
    LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", 
391
    LOGDEB2(("MimeHandlerMail::processMsg:text:[%s]\n", 
396
        m_metaData[cstr_content].c_str()));
392
        m_metaData[cstr_dj_keycontent].c_str()));
397
    return true;
393
    return true;
398
}
394
}
399
395
400
// Recursively walk the message mime parts and concatenate all the
396
// Recursively walk the message mime parts and concatenate all the
401
// inline html or text that we find anywhere.  
397
// inline html or text that we find anywhere.  
...
...
413
    if (depth++ >= maxdepth) {
409
    if (depth++ >= maxdepth) {
414
    LOGINFO(("walkmime: max depth (%d) exceeded\n", maxdepth));
410
    LOGINFO(("walkmime: max depth (%d) exceeded\n", maxdepth));
415
    return;
411
    return;
416
    }
412
    }
417
413
418
    string& out = m_metaData[cstr_content];
414
    string& out = m_metaData[cstr_dj_keycontent];
419
415
420
    if (doc->isMultipart()) {
416
    if (doc->isMultipart()) {
421
    LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n", 
417
    LOGDEB2(("walkmime: ismultipart %d subtype '%s'\n", 
422
        doc->isMultipart(), doc->getSubType().c_str()));
418
        doc->isMultipart(), doc->getSubType().c_str()));
423
    // We only handle alternative, related and mixed (no digests). 
419
    // We only handle alternative, related and mixed (no digests). 
...
...
525
    // 8 bit chars exist in a message that is stated as us-ascii. Ie the 
521
    // 8 bit chars exist in a message that is stated as us-ascii. Ie the 
526
    // mailer used by yahoo support ('KANA') does this. We could convert 
522
    // mailer used by yahoo support ('KANA') does this. We could convert 
527
    // to iso-8859 only if the transfer-encoding is 8 bit, or test for
523
    // to iso-8859 only if the transfer-encoding is 8 bit, or test for
528
    // actual 8 bit chars, but what the heck, let's use 8859-1 as default
524
    // actual 8 bit chars, but what the heck, let's use 8859-1 as default
529
    string charset;
525
    string charset;
530
    it = content_type.params.find(string(cstr_charset));
526
    it = content_type.params.find(cstr_mail_charset);
531
    if (it != content_type.params.end())
527
    if (it != content_type.params.end())
532
    charset = it->second;
528
    charset = it->second;
533
    if (charset.empty() || 
529
    if (charset.empty() || 
534
    !stringlowercmp("us-ascii", charset) || 
530
    !stringlowercmp("us-ascii", charset) || 
535
    !stringlowercmp("default", charset) || 
531
    !stringlowercmp("default", charset) || 
...
...
607
            m_forPreview ? "view" : "index");
603
            m_forPreview ? "view" : "index");
608
    mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
604
    mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
609
    mh.set_document_string(body);
605
    mh.set_document_string(body);
610
    mh.next_document();
606
    mh.next_document();
611
    map<string, string>::const_iterator it = 
607
    map<string, string>::const_iterator it = 
612
        mh.get_meta_data().find(cstr_content);
608
        mh.get_meta_data().find(cstr_dj_keycontent);
613
    if (it != mh.get_meta_data().end())
609
    if (it != mh.get_meta_data().end())
614
        out += it->second;
610
        out += it->second;
615
    } else {
611
    } else {
616
    // Transcode to utf-8 
612
    // Transcode to utf-8 
617
    LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));
613
    LOGDEB1(("walkmime: transcoding from %s to UTF-8\n", charset.c_str()));