|
a/src/internfile/mh_mbox.cpp |
|
b/src/internfile/mh_mbox.cpp |
|
... |
|
... |
127 |
// and my own's.
|
127 |
// and my own's.
|
128 |
// Update, 2008-08-29: some old? Thunderbird versions apparently use a date
|
128 |
// Update, 2008-08-29: some old? Thunderbird versions apparently use a date
|
129 |
// in "Date: " header format, like: From - Mon, 8 May 2006 10:57:32
|
129 |
// in "Date: " header format, like: From - Mon, 8 May 2006 10:57:32
|
130 |
// This was added as an alternative format. By the way it also fools "mail" and
|
130 |
// This was added as an alternative format. By the way it also fools "mail" and
|
131 |
// emacs-vm, Recoll is not alone
|
131 |
// emacs-vm, Recoll is not alone
|
132 |
//
|
132 |
// Update: 2009-11-27: word after From may be quoted string: From "john bull"
|
133 |
static const char *frompat =
|
133 |
static const char *frompat =
|
134 |
#if 0 //1.9.0
|
134 |
#if 0 //1.9.0
|
135 |
"^From .* [1-2][0-9][0-9][0-9]$";
|
135 |
"^From .* [1-2][0-9][0-9][0-9]$";
|
136 |
#endif
|
136 |
#endif
|
137 |
#if 1
|
137 |
#if 1
|
138 |
"^From[ ]+[^ ]+[ ]+" // From whatever
|
138 |
"^From[ ]+([^ ]+|\"[^\"]+\")[ ]+" // 'From (toto@tutu|"john bull") '
|
139 |
"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Date
|
139 |
"[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Fri Oct 26
|
140 |
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+" // Time, seconds optional
|
140 |
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+" // Time, seconds optional
|
141 |
"([^ ]+[ ]+)?" // Optional tz
|
141 |
"([^ ]+[ ]+)?" // Optional tz
|
142 |
"[12][0-9][0-9][0-9]" // Year, unanchored, more data may follow
|
142 |
"[12][0-9][0-9][0-9]" // Year, unanchored, more data may follow
|
143 |
"|" // Or standard mail Date: header format
|
143 |
"|" // Or standard mail Date: header format
|
144 |
"^From[ ]+[^ ]+[ ]+" // From toto@tutu
|
144 |
"^From[ ]+[^ ]+[ ]+" // From toto@tutu
|
145 |
"[[:alpha:]]{3},[ ]+[0-3]?[0-9][ ]+[[:alpha:]]{3}[ ]+" // Date Mon, 8 May
|
145 |
"[[:alpha:]]{3},[ ]+[0-3]?[0-9][ ]+[[:alpha:]]{3}[ ]+" // Mon, 8 May
|
146 |
"[12][0-9][0-9][0-9][ ]+" // Year
|
146 |
"[12][0-9][0-9][0-9][ ]+" // Year
|
147 |
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?" // Time, secs optional: 10:57(:32)?
|
147 |
"[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?" // Time, secs optional
|
148 |
;
|
148 |
;
|
149 |
#endif
|
149 |
#endif
|
150 |
// "([ ]+[-+][0-9]{4})?$"
|
150 |
// "([ ]+[-+][0-9]{4})?$"
|
151 |
static regex_t fromregex;
|
151 |
static regex_t fromregex;
|
152 |
static bool regcompiled;
|
152 |
static bool regcompiled;
|
|
... |
|
... |
333 |
while (mh.has_documents()) {
|
333 |
while (mh.has_documents()) {
|
334 |
if (!mh.next_document()) {
|
334 |
if (!mh.next_document()) {
|
335 |
cerr << "next_document failed" << endl;
|
335 |
cerr << "next_document failed" << endl;
|
336 |
exit(1);
|
336 |
exit(1);
|
337 |
}
|
337 |
}
|
|
|
338 |
map<string, string>::const_iterator it =
|
|
|
339 |
mh.get_meta_data().find("content");
|
|
|
340 |
int size;
|
|
|
341 |
if (it == mh.get_meta_data().end()) {
|
|
|
342 |
size = -1;
|
|
|
343 |
} else {
|
|
|
344 |
size = it->second.length();
|
|
|
345 |
}
|
|
|
346 |
cout << "Doc " << docnt << " size " << size << endl;
|
338 |
docnt++;
|
347 |
docnt++;
|
339 |
}
|
348 |
}
|
340 |
cout << docnt << " documents found in " << filename << endl;
|
349 |
cout << docnt << " documents found in " << filename << endl;
|
341 |
exit(0);
|
350 |
exit(0);
|
342 |
}
|
351 |
}
|