Switch to unified view

a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp
...
...
20
#include <stdio.h>
20
#include <stdio.h>
21
#include <errno.h>
21
#include <errno.h>
22
#include <sys/types.h>
22
#include <sys/types.h>
23
#include "safesysstat.h"
23
#include "safesysstat.h"
24
#include <time.h>
24
#include <time.h>
25
26
#if defined(_WIN32)
27
#define USING_STD_REGEX
28
#endif
29
30
#ifdef USING_STD_REGEX
31
#include <regex>
32
#else
33
#include <regex.h>
34
#endif
35
25
36
#include <cstring>
26
#include <cstring>
37
#include <map>
27
#include <map>
38
#include <mutex>
28
#include <mutex>
39
29
...
...
361
// Update, 2008-08-29: some old? Thunderbird versions apparently use a date
351
// Update, 2008-08-29: some old? Thunderbird versions apparently use a date
362
// in "Date: " header format, like:   From - Mon, 8 May 2006 10:57:32
352
// in "Date: " header format, like:   From - Mon, 8 May 2006 10:57:32
363
// This was added as an alternative format. By the way it also fools "mail" and
353
// This was added as an alternative format. By the way it also fools "mail" and
364
// emacs-vm, Recoll is not alone
354
// emacs-vm, Recoll is not alone
365
// Update: 2009-11-27: word after From may be quoted string: From "john bull"
355
// Update: 2009-11-27: word after From may be quoted string: From "john bull"
366
static const  char *frompat =  
356
static const string frompat{
367
"^From[ ]+([^ ]+|\"[^\"]+\")[ ]+"    // 'From (toto@tutu|"john bull") '
357
"^From[ ]+([^ ]+|\"[^\"]+\")[ ]+"    // 'From (toto@tutu|"john bull") '
368
"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Fri Oct 26
358
"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Fri Oct 26
369
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+"             // Time, seconds optional
359
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+"             // Time, seconds optional
370
"([^ ]+[ ]+)?"                                        // Optional tz
360
"([^ ]+[ ]+)?"                                        // Optional tz
371
"[12][0-9][0-9][0-9]"            // Year, unanchored, more data may follow
361
"[12][0-9][0-9][0-9]"            // Year, unanchored, more data may follow
372
"|"      // Or standard mail Date: header format
362
"|"      // Or standard mail Date: header format
373
"^From[ ]+[^ ]+[ ]+"                                   // From toto@tutu
363
"^From[ ]+[^ ]+[ ]+"                                   // From toto@tutu
374
"[[:alpha:]]{3},[ ]+[0-3]?[0-9][ ]+[[:alpha:]]{3}[ ]+" // Mon, 8 May
364
"[[:alpha:]]{3},[ ]+[0-3]?[0-9][ ]+[[:alpha:]]{3}[ ]+" // Mon, 8 May
375
"[12][0-9][0-9][0-9][ ]+"                              // Year
365
"[12][0-9][0-9][0-9][ ]+"                              // Year
376
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?"                  // Time, secs optional
366
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?"                  // Time, secs optional
377
    ;
367
    };
378
368
379
// Extreme thunderbird brokiness. Will sometimes use From lines
369
// Extreme thunderbird brokiness. Will sometimes use From lines
380
// exactly like: From ^M (From followed by space and eol). We only
370
// exactly like: From ^M (From followed by space and eol). We only
381
// test for this if QUIRKS_TBIRD is set
371
// test for this if QUIRKS_TBIRD is set
382
static const char *miniTbirdFrom = "^From $";
372
static const string miniTbirdFrom{"^From $"};
383
#ifndef USING_STD_REGEX
384
static regex_t fromregex;
385
static regex_t minifromregex;
386
#define M_regexec(A,B,C,D,E) regexec(&(A),B,C,D,E)
387
#else
388
basic_regex<char> fromregex;
389
basic_regex<char> minifromregex;
390
#define REG_NOSUB std::regex_constants::nosubs
391
#define REG_EXTENDED std::regex_constants::extended
392
#define M_regexec(A, B, C, D, E) (!regex_match(B,A))
393
373
394
#endif
374
static SimpleRegexp fromregex(frompat, SimpleRegexp::SRE_NOSUB);
395
375
static SimpleRegexp minifromregex(miniTbirdFrom, SimpleRegexp::SRE_NOSUB);
396
static bool regcompiled;
397
static std::mutex o_regex_mutex;
398
399
static void compileregexes()
400
{
401
    std::unique_lock<std::mutex> locker(o_regex_mutex);
402
    // As the initial test of regcompiled is unprotected the value may
403
    // have changed while we were waiting for the lock. Test again now
404
    // that we are alone.
405
    if (regcompiled)
406
  return;
407
#ifndef USING_STD_REGEX
408
    regcomp(&fromregex, frompat, REG_NOSUB|REG_EXTENDED);
409
    regcomp(&minifromregex, miniTbirdFrom, REG_NOSUB|REG_EXTENDED);
410
#else
411
    fromregex = basic_regex<char>(frompat, REG_NOSUB | REG_EXTENDED);
412
    minifromregex = basic_regex<char>(miniTbirdFrom, REG_NOSUB | REG_EXTENDED);
413
#endif
414
    regcompiled = true;
415
}
416
376
417
bool MimeHandlerMbox::next_document()
377
bool MimeHandlerMbox::next_document()
418
{
378
{
419
    if (m_vfp == 0) {
379
    if (m_vfp == 0) {
420
    LOGERR("MimeHandlerMbox::next_document: not open\n");
380
    LOGERR("MimeHandlerMbox::next_document: not open\n");
...
...
430
    } else if (m_forPreview) {
390
    } else if (m_forPreview) {
431
    // Can't preview an mbox. 
391
    // Can't preview an mbox. 
432
    LOGDEB("MimeHandlerMbox::next_document: can't preview folders!\n");
392
    LOGDEB("MimeHandlerMbox::next_document: can't preview folders!\n");
433
    return false;
393
    return false;
434
    }
394
    }
435
    LOGDEB0("MimeHandlerMbox::next_document: fn " << (m_fn) << ", msgnum " << (m_msgnum) << " mtarg " << (mtarg) << " \n");
395
    LOGDEB0("MimeHandlerMbox::next_document: fn " << m_fn << ", msgnum " <<
396
            m_msgnum << " mtarg " << mtarg << " \n");
436
    if (mtarg == 0)
397
    if (mtarg == 0)
437
    mtarg = -1;
398
    mtarg = -1;
438
399
439
    if (!regcompiled) {
440
  compileregexes();
441
    }
442
400
443
    // If we are called to retrieve a specific message, seek to bof
401
    // If we are called to retrieve a specific message, seek to bof
444
    // (then scan up to the message). This is for the case where the
402
    // (then scan up to the message). This is for the case where the
445
    // same object is reused to fetch several messages (else the fp is
403
    // same object is reused to fetch several messages (else the fp is
446
    // just opened no need for a seek).  We could also check if the
404
    // just opened no need for a seek).  We could also check if the
...
...
450
    // object).  So:
408
    // object).  So:
451
    bool storeoffsets = true;
409
    bool storeoffsets = true;
452
    if (mtarg > 0) {
410
    if (mtarg > 0) {
453
        mbhoff_type off;
411
        mbhoff_type off;
454
        line_type line;
412
        line_type line;
455
        LOGDEB0("MimeHandlerMbox::next_doc: mtarg " << (mtarg) << " m_udi[" << (m_udi) << "]\n");
413
        LOGDEB0("MimeHandlerMbox::next_doc: mtarg " << mtarg << " m_udi[" <<
414
                m_udi << "]\n");
456
        if (!m_udi.empty() && 
415
        if (!m_udi.empty() && 
457
            (off = o_mcache.get_offset(m_config, m_udi, mtarg)) >= 0 && 
416
            (off = o_mcache.get_offset(m_config, m_udi, mtarg)) >= 0 && 
458
            fseeko(fp, (off_t)off, SEEK_SET) >= 0 && 
417
            fseeko(fp, (off_t)off, SEEK_SET) >= 0 && 
459
            fgets(line, LL, fp) &&
418
            fgets(line, LL, fp) &&
460
            (!M_regexec(fromregex, line, 0, 0, 0) || 
419
            (fromregex(line) || ((m_quirks & MBOXQUIRK_TBIRD) && 
461
       ((m_quirks & MBOXQUIRK_TBIRD) && 
420
                                 minifromregex(line)))    ) {
462
        !M_regexec(minifromregex, line, 0, 0, 0)))    ) {
463
                LOGDEB0("MimeHandlerMbox: Cache: From_ Ok\n");
421
                LOGDEB0("MimeHandlerMbox: Cache: From_ Ok\n");
464
                fseeko(fp, (off_t)off, SEEK_SET);
422
                fseeko(fp, (off_t)off, SEEK_SET);
465
                m_msgnum = mtarg -1;
423
                m_msgnum = mtarg -1;
466
        storeoffsets = false;
424
        storeoffsets = false;
467
        } else {
425
        } else {
...
...
485
        break;
443
        break;
486
    }
444
    }
487
    m_lineno++;
445
    m_lineno++;
488
    int ll;
446
    int ll;
489
    stripendnl(line, ll);
447
    stripendnl(line, ll);
490
  LOGDEB2("mhmbox:next: hadempty " << (hademptyline) << " lineno " << (m_lineno) << " ll " << (ll) << " Line: [" << (line) << "]\n");
448
  LOGDEB2("mhmbox:next: hadempty " << hademptyline << " lineno " <<
449
                m_lineno << " ll " << ll << " Line: [" << line << "]\n");
491
    if (hademptyline) {
450
    if (hademptyline) {
492
        if (ll > 0) {
451
        if (ll > 0) {
493
        // Non-empty line with empty line flag set, reset flag
452
        // Non-empty line with empty line flag set, reset flag
494
        // and check regex.
453
        // and check regex.
495
        if (!(m_quirks & MBOXQUIRK_TBIRD)) {
454
        if (!(m_quirks & MBOXQUIRK_TBIRD)) {
...
...
499
            hademptyline = false;
458
            hademptyline = false;
500
        }
459
        }
501
        /* The 'F' compare is redundant but it improves performance
460
        /* The 'F' compare is redundant but it improves performance
502
           A LOT */
461
           A LOT */
503
        if (line[0] == 'F' && (
462
        if (line[0] == 'F' && (
504
          !M_regexec(fromregex, line, 0, 0, 0) || 
463
                        fromregex(line) || 
505
          ((m_quirks & MBOXQUIRK_TBIRD) && 
464
                        ((m_quirks & MBOXQUIRK_TBIRD) && minifromregex(line)))
506
           !M_regexec(minifromregex, line, 0, 0, 0)))
507
            ) {
465
            ) {
508
          LOGDEB0("MimeHandlerMbox: msgnum " << (m_msgnum) << ", From_ at line " << (m_lineno) << ": [" << (line) << "]\n");
466
          LOGDEB0("MimeHandlerMbox: msgnum " << m_msgnum <<
467
                            ", From_ at line " << m_lineno << ": [" << line
468
                            << "]\n");
509
            if (storeoffsets)
469
            if (storeoffsets)
510
            m_offsets.push_back(message_end);
470
            m_offsets.push_back(message_end);
511
            m_msgnum++;
471
            m_msgnum++;
512
            if ((mtarg <= 0 && m_msgnum > 1) || 
472
            if ((mtarg <= 0 && m_msgnum > 1) || 
513
            (mtarg > 0 && m_msgnum > mtarg)) {
473
            (mtarg > 0 && m_msgnum > mtarg)) {
...
...
526
        // Accumulate message lines
486
        // Accumulate message lines
527
        line[ll] = '\n';
487
        line[ll] = '\n';
528
        line[ll+1] = 0;
488
        line[ll+1] = 0;
529
        msgtxt += line;
489
        msgtxt += line;
530
        if (msgtxt.size() > max_mbox_member_size) {
490
        if (msgtxt.size() > max_mbox_member_size) {
531
      LOGERR("mh_mbox: huge message (more than " << (max_mbox_member_size/(1024*1024)) << " MB) inside " << (m_fn) << ", giving up\n");
491
      LOGERR("mh_mbox: huge message (more than " <<
492
                       max_mbox_member_size/(1024*1024) << " MB) inside " <<
493
                       m_fn << ", giving up\n");
532
        return false;
494
        return false;
533
        }
495
        }
534
    }
496
    }
535
    }
497
    }
536
    LOGDEB2("Message text length " << (msgtxt.size()) << "\n");
498
    LOGDEB2("Message text length " << msgtxt.size() << "\n");
537
    LOGDEB2("Message text: [" << (msgtxt) << "]\n");
499
    LOGDEB2("Message text: [" << msgtxt << "]\n");
538
    char buf[20];
500
    char buf[20];
539
    // m_msgnum was incremented when hitting the next From_ or eof, so the data
501
    // m_msgnum was incremented when hitting the next From_ or eof, so the data
540
    // is for m_msgnum - 1
502
    // is for m_msgnum - 1
541
    sprintf(buf, "%d", m_msgnum - 1); 
503
    sprintf(buf, "%d", m_msgnum - 1); 
542
    m_metaData[cstr_dj_keyipath] = buf;
504
    m_metaData[cstr_dj_keyipath] = buf;