|
a/src/internfile/mh_mbox.cpp |
|
b/src/internfile/mh_mbox.cpp |
|
... |
|
... |
20 |
#include <stdio.h>
|
20 |
#include <stdio.h>
|
21 |
#include <errno.h>
|
21 |
#include <errno.h>
|
22 |
#include <sys/types.h>
|
22 |
#include <sys/types.h>
|
23 |
#include "safesysstat.h"
|
23 |
#include "safesysstat.h"
|
24 |
#include <time.h>
|
24 |
#include <time.h>
|
25 |
|
|
|
26 |
#if defined(_WIN32)
|
|
|
27 |
#define USING_STD_REGEX
|
|
|
28 |
#endif
|
|
|
29 |
|
|
|
30 |
#ifdef USING_STD_REGEX
|
|
|
31 |
#include <regex>
|
|
|
32 |
#else
|
|
|
33 |
#include <regex.h>
|
|
|
34 |
#endif
|
|
|
35 |
|
25 |
|
36 |
#include <cstring>
|
26 |
#include <cstring>
|
37 |
#include <map>
|
27 |
#include <map>
|
38 |
#include <mutex>
|
28 |
#include <mutex>
|
39 |
|
29 |
|
|
... |
|
... |
361 |
// Update, 2008-08-29: some old? Thunderbird versions apparently use a date
|
351 |
// Update, 2008-08-29: some old? Thunderbird versions apparently use a date
|
362 |
// in "Date: " header format, like: From - Mon, 8 May 2006 10:57:32
|
352 |
// in "Date: " header format, like: From - Mon, 8 May 2006 10:57:32
|
363 |
// This was added as an alternative format. By the way it also fools "mail" and
|
353 |
// This was added as an alternative format. By the way it also fools "mail" and
|
364 |
// emacs-vm, Recoll is not alone
|
354 |
// emacs-vm, Recoll is not alone
|
365 |
// Update: 2009-11-27: word after From may be quoted string: From "john bull"
|
355 |
// Update: 2009-11-27: word after From may be quoted string: From "john bull"
|
366 |
static const char *frompat =
|
356 |
static const string frompat{
|
367 |
"^From[ ]+([^ ]+|\"[^\"]+\")[ ]+" // 'From (toto@tutu|"john bull") '
|
357 |
"^From[ ]+([^ ]+|\"[^\"]+\")[ ]+" // 'From (toto@tutu|"john bull") '
|
368 |
"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Fri Oct 26
|
358 |
"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Fri Oct 26
|
369 |
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+" // Time, seconds optional
|
359 |
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+" // Time, seconds optional
|
370 |
"([^ ]+[ ]+)?" // Optional tz
|
360 |
"([^ ]+[ ]+)?" // Optional tz
|
371 |
"[12][0-9][0-9][0-9]" // Year, unanchored, more data may follow
|
361 |
"[12][0-9][0-9][0-9]" // Year, unanchored, more data may follow
|
372 |
"|" // Or standard mail Date: header format
|
362 |
"|" // Or standard mail Date: header format
|
373 |
"^From[ ]+[^ ]+[ ]+" // From toto@tutu
|
363 |
"^From[ ]+[^ ]+[ ]+" // From toto@tutu
|
374 |
"[[:alpha:]]{3},[ ]+[0-3]?[0-9][ ]+[[:alpha:]]{3}[ ]+" // Mon, 8 May
|
364 |
"[[:alpha:]]{3},[ ]+[0-3]?[0-9][ ]+[[:alpha:]]{3}[ ]+" // Mon, 8 May
|
375 |
"[12][0-9][0-9][0-9][ ]+" // Year
|
365 |
"[12][0-9][0-9][0-9][ ]+" // Year
|
376 |
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?" // Time, secs optional
|
366 |
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?" // Time, secs optional
|
377 |
;
|
367 |
};
|
378 |
|
368 |
|
379 |
// Extreme Thunderbird brokenness. Will sometimes use From lines
|
369 |
// Extreme Thunderbird brokenness. Will sometimes use From lines
|
380 |
// exactly like: From ^M (From followed by space and eol). We only
|
370 |
// exactly like: From ^M (From followed by space and eol). We only
|
381 |
// test for this if QUIRKS_TBIRD is set
|
371 |
// test for this if QUIRKS_TBIRD is set
|
382 |
static const char *miniTbirdFrom = "^From $";
|
372 |
static const string miniTbirdFrom{"^From $"};
|
383 |
#ifndef USING_STD_REGEX
|
|
|
384 |
static regex_t fromregex;
|
|
|
385 |
static regex_t minifromregex;
|
|
|
386 |
#define M_regexec(A,B,C,D,E) regexec(&(A),B,C,D,E)
|
|
|
387 |
#else
|
|
|
388 |
basic_regex<char> fromregex;
|
|
|
389 |
basic_regex<char> minifromregex;
|
|
|
390 |
#define REG_NOSUB std::regex_constants::nosubs
|
|
|
391 |
#define REG_EXTENDED std::regex_constants::extended
|
|
|
392 |
#define M_regexec(A, B, C, D, E) (!regex_match(B,A))
|
|
|
393 |
|
373 |
|
394 |
#endif
|
374 |
static SimpleRegexp fromregex(frompat, SimpleRegexp::SRE_NOSUB);
|
395 |
|
375 |
static SimpleRegexp minifromregex(miniTbirdFrom, SimpleRegexp::SRE_NOSUB);
|
396 |
static bool regcompiled;
|
|
|
397 |
static std::mutex o_regex_mutex;
|
|
|
398 |
|
|
|
399 |
static void compileregexes()
|
|
|
400 |
{
|
|
|
401 |
std::unique_lock<std::mutex> locker(o_regex_mutex);
|
|
|
402 |
// As the initial test of regcompiled is unprotected the value may
|
|
|
403 |
// have changed while we were waiting for the lock. Test again now
|
|
|
404 |
// that we are alone.
|
|
|
405 |
if (regcompiled)
|
|
|
406 |
return;
|
|
|
407 |
#ifndef USING_STD_REGEX
|
|
|
408 |
regcomp(&fromregex, frompat, REG_NOSUB|REG_EXTENDED);
|
|
|
409 |
regcomp(&minifromregex, miniTbirdFrom, REG_NOSUB|REG_EXTENDED);
|
|
|
410 |
#else
|
|
|
411 |
fromregex = basic_regex<char>(frompat, REG_NOSUB | REG_EXTENDED);
|
|
|
412 |
minifromregex = basic_regex<char>(miniTbirdFrom, REG_NOSUB | REG_EXTENDED);
|
|
|
413 |
#endif
|
|
|
414 |
regcompiled = true;
|
|
|
415 |
}
|
|
|
416 |
|
376 |
|
417 |
bool MimeHandlerMbox::next_document()
|
377 |
bool MimeHandlerMbox::next_document()
|
418 |
{
|
378 |
{
|
419 |
if (m_vfp == 0) {
|
379 |
if (m_vfp == 0) {
|
420 |
LOGERR("MimeHandlerMbox::next_document: not open\n");
|
380 |
LOGERR("MimeHandlerMbox::next_document: not open\n");
|
|
... |
|
... |
430 |
} else if (m_forPreview) {
|
390 |
} else if (m_forPreview) {
|
431 |
// Can't preview an mbox.
|
391 |
// Can't preview an mbox.
|
432 |
LOGDEB("MimeHandlerMbox::next_document: can't preview folders!\n");
|
392 |
LOGDEB("MimeHandlerMbox::next_document: can't preview folders!\n");
|
433 |
return false;
|
393 |
return false;
|
434 |
}
|
394 |
}
|
435 |
LOGDEB0("MimeHandlerMbox::next_document: fn " << (m_fn) << ", msgnum " << (m_msgnum) << " mtarg " << (mtarg) << " \n");
|
395 |
LOGDEB0("MimeHandlerMbox::next_document: fn " << m_fn << ", msgnum " <<
|
|
|
396 |
m_msgnum << " mtarg " << mtarg << " \n");
|
436 |
if (mtarg == 0)
|
397 |
if (mtarg == 0)
|
437 |
mtarg = -1;
|
398 |
mtarg = -1;
|
438 |
|
399 |
|
439 |
if (!regcompiled) {
|
|
|
440 |
compileregexes();
|
|
|
441 |
}
|
|
|
442 |
|
400 |
|
443 |
// If we are called to retrieve a specific message, seek to bof
|
401 |
// If we are called to retrieve a specific message, seek to bof
|
444 |
// (then scan up to the message). This is for the case where the
|
402 |
// (then scan up to the message). This is for the case where the
|
445 |
// same object is reused to fetch several messages (else the fp is
|
403 |
// same object is reused to fetch several messages (else the fp is
|
446 |
// just opened no need for a seek). We could also check if the
|
404 |
// just opened no need for a seek). We could also check if the
|
|
... |
|
... |
450 |
// object). So:
|
408 |
// object). So:
|
451 |
bool storeoffsets = true;
|
409 |
bool storeoffsets = true;
|
452 |
if (mtarg > 0) {
|
410 |
if (mtarg > 0) {
|
453 |
mbhoff_type off;
|
411 |
mbhoff_type off;
|
454 |
line_type line;
|
412 |
line_type line;
|
455 |
LOGDEB0("MimeHandlerMbox::next_doc: mtarg " << (mtarg) << " m_udi[" << (m_udi) << "]\n");
|
413 |
LOGDEB0("MimeHandlerMbox::next_doc: mtarg " << mtarg << " m_udi[" <<
|
|
|
414 |
m_udi << "]\n");
|
456 |
if (!m_udi.empty() &&
|
415 |
if (!m_udi.empty() &&
|
457 |
(off = o_mcache.get_offset(m_config, m_udi, mtarg)) >= 0 &&
|
416 |
(off = o_mcache.get_offset(m_config, m_udi, mtarg)) >= 0 &&
|
458 |
fseeko(fp, (off_t)off, SEEK_SET) >= 0 &&
|
417 |
fseeko(fp, (off_t)off, SEEK_SET) >= 0 &&
|
459 |
fgets(line, LL, fp) &&
|
418 |
fgets(line, LL, fp) &&
|
460 |
(!M_regexec(fromregex, line, 0, 0, 0) ||
|
419 |
(fromregex(line) || ((m_quirks & MBOXQUIRK_TBIRD) &&
|
461 |
((m_quirks & MBOXQUIRK_TBIRD) &&
|
420 |
minifromregex(line))) ) {
|
462 |
!M_regexec(minifromregex, line, 0, 0, 0))) ) {
|
|
|
463 |
LOGDEB0("MimeHandlerMbox: Cache: From_ Ok\n");
|
421 |
LOGDEB0("MimeHandlerMbox: Cache: From_ Ok\n");
|
464 |
fseeko(fp, (off_t)off, SEEK_SET);
|
422 |
fseeko(fp, (off_t)off, SEEK_SET);
|
465 |
m_msgnum = mtarg -1;
|
423 |
m_msgnum = mtarg -1;
|
466 |
storeoffsets = false;
|
424 |
storeoffsets = false;
|
467 |
} else {
|
425 |
} else {
|
|
... |
|
... |
485 |
break;
|
443 |
break;
|
486 |
}
|
444 |
}
|
487 |
m_lineno++;
|
445 |
m_lineno++;
|
488 |
int ll;
|
446 |
int ll;
|
489 |
stripendnl(line, ll);
|
447 |
stripendnl(line, ll);
|
490 |
LOGDEB2("mhmbox:next: hadempty " << (hademptyline) << " lineno " << (m_lineno) << " ll " << (ll) << " Line: [" << (line) << "]\n");
|
448 |
LOGDEB2("mhmbox:next: hadempty " << hademptyline << " lineno " <<
|
|
|
449 |
m_lineno << " ll " << ll << " Line: [" << line << "]\n");
|
491 |
if (hademptyline) {
|
450 |
if (hademptyline) {
|
492 |
if (ll > 0) {
|
451 |
if (ll > 0) {
|
493 |
// Non-empty line with empty line flag set, reset flag
|
452 |
// Non-empty line with empty line flag set, reset flag
|
494 |
// and check regex.
|
453 |
// and check regex.
|
495 |
if (!(m_quirks & MBOXQUIRK_TBIRD)) {
|
454 |
if (!(m_quirks & MBOXQUIRK_TBIRD)) {
|
|
... |
|
... |
499 |
hademptyline = false;
|
458 |
hademptyline = false;
|
500 |
}
|
459 |
}
|
501 |
/* The 'F' compare is redundant but it improves performance
|
460 |
/* The 'F' compare is redundant but it improves performance
|
502 |
A LOT */
|
461 |
A LOT */
|
503 |
if (line[0] == 'F' && (
|
462 |
if (line[0] == 'F' && (
|
504 |
!M_regexec(fromregex, line, 0, 0, 0) ||
|
463 |
fromregex(line) ||
|
505 |
((m_quirks & MBOXQUIRK_TBIRD) &&
|
464 |
((m_quirks & MBOXQUIRK_TBIRD) && minifromregex(line)))
|
506 |
!M_regexec(minifromregex, line, 0, 0, 0)))
|
|
|
507 |
) {
|
465 |
) {
|
508 |
LOGDEB0("MimeHandlerMbox: msgnum " << (m_msgnum) << ", From_ at line " << (m_lineno) << ": [" << (line) << "]\n");
|
466 |
LOGDEB0("MimeHandlerMbox: msgnum " << m_msgnum <<
|
|
|
467 |
", From_ at line " << m_lineno << ": [" << line
|
|
|
468 |
<< "]\n");
|
509 |
if (storeoffsets)
|
469 |
if (storeoffsets)
|
510 |
m_offsets.push_back(message_end);
|
470 |
m_offsets.push_back(message_end);
|
511 |
m_msgnum++;
|
471 |
m_msgnum++;
|
512 |
if ((mtarg <= 0 && m_msgnum > 1) ||
|
472 |
if ((mtarg <= 0 && m_msgnum > 1) ||
|
513 |
(mtarg > 0 && m_msgnum > mtarg)) {
|
473 |
(mtarg > 0 && m_msgnum > mtarg)) {
|
|
... |
|
... |
526 |
// Accumulate message lines
|
486 |
// Accumulate message lines
|
527 |
line[ll] = '\n';
|
487 |
line[ll] = '\n';
|
528 |
line[ll+1] = 0;
|
488 |
line[ll+1] = 0;
|
529 |
msgtxt += line;
|
489 |
msgtxt += line;
|
530 |
if (msgtxt.size() > max_mbox_member_size) {
|
490 |
if (msgtxt.size() > max_mbox_member_size) {
|
531 |
LOGERR("mh_mbox: huge message (more than " << (max_mbox_member_size/(1024*1024)) << " MB) inside " << (m_fn) << ", giving up\n");
|
491 |
LOGERR("mh_mbox: huge message (more than " <<
|
|
|
492 |
max_mbox_member_size/(1024*1024) << " MB) inside " <<
|
|
|
493 |
m_fn << ", giving up\n");
|
532 |
return false;
|
494 |
return false;
|
533 |
}
|
495 |
}
|
534 |
}
|
496 |
}
|
535 |
}
|
497 |
}
|
536 |
LOGDEB2("Message text length " << (msgtxt.size()) << "\n");
|
498 |
LOGDEB2("Message text length " << msgtxt.size() << "\n");
|
537 |
LOGDEB2("Message text: [" << (msgtxt) << "]\n");
|
499 |
LOGDEB2("Message text: [" << msgtxt << "]\n");
|
538 |
char buf[20];
|
500 |
char buf[20];
|
539 |
// m_msgnum was incremented when hitting the next From_ or eof, so the data
|
501 |
// m_msgnum was incremented when hitting the next From_ or eof, so the data
|
540 |
// is for m_msgnum - 1
|
502 |
// is for m_msgnum - 1
|
541 |
sprintf(buf, "%d", m_msgnum - 1);
|
503 |
sprintf(buf, "%d", m_msgnum - 1);
|
542 |
m_metaData[cstr_dj_keyipath] = buf;
|
504 |
m_metaData[cstr_dj_keyipath] = buf;
|