Switch to unified view

a/src/internfile/mh_mbox.cpp b/src/internfile/mh_mbox.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.1 2006-12-15 12:40:24 dockes Exp $ (C) 2005 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: mh_mbox.cpp,v 1.2 2007-10-03 14:53:37 dockes Exp $ (C) 2005 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
15
 *   You should have received a copy of the GNU General Public License
15
 *   You should have received a copy of the GNU General Public License
16
 *   along with this program; if not, write to the
16
 *   along with this program; if not, write to the
17
 *   Free Software Foundation, Inc.,
17
 *   Free Software Foundation, Inc.,
18
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
18
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19
 */
19
 */
20
20
#ifndef TEST_MH_MBOX
21
#include <stdio.h>
21
#include <stdio.h>
22
#include <fcntl.h>
22
#include <fcntl.h>
23
#include <errno.h>
23
#include <errno.h>
24
#include <unistd.h>
24
#include <unistd.h>
25
#include <time.h>
25
#include <time.h>
...
...
61
    }
61
    }
62
    m_havedoc = true;
62
    m_havedoc = true;
63
    return true;
63
    return true;
64
}
64
}
65
65
66
static const  char *frompat = "^From .* [1-2][0-9][0-9][0-9][\r]*\n$";
66
#define LL 1024
67
typedef char line_type[LL+10];
68
// Remove every trailing '\n' and '\r' character from 'line', in place.
// On return 'll' holds the length of the stripped string (0 if the line
// was empty or consisted only of end-of-line characters).
static inline void stripendnl(line_type& line, int& ll)
{
    for (ll = strlen(line); ll > 0; ll--) {
        const char last = line[ll - 1];
        if (last != '\n' && last != '\r')
            return;
        // Overwrite the end-of-line character with the terminator.
        line[ll - 1] = 0;
    }
}
79
80
// The mbox format uses lines beginning with 'From ' as separator.
81
// Mailers are supposed to quote any other lines beginning with 
82
// 'From ', turning it into '>From '. This should make it easy to detect
83
// message boundaries by matching a '^From ' regular expression
84
// Unfortunately this quoting is quite often incorrect in the real world.
85
//
86
// The rest of the format for the line is somewhat variable, but there will 
87
// be a 4 digit year somewhere... 
88
// The canonical format is the following, with a 24-character date:
89
//         From toto@tutu.com Sat Sep 30 16:44:06 2000
90
// This resulted in the following pattern for versions up to 1.9.0:
91
//         "^From .* [1-2][0-9][0-9][0-9]$"
92
//
93
// Some mailers add a time zone to the date, this is non-"standard", 
94
// but happens, like in: 
95
//    From toto@truc.com Sat Sep 30 16:44:06 2000 -0400 
96
//
97
// This is taken into account in the new regexp, which also matches more
98
// of the date format, to catch a few actual issues like
99
//     From http://www.itu.int/newsroom/press/releases/1998/NP-2.html:
100
// Note that this *should* have been quoted.
101
//
102
// http://www.qmail.org/man/man5/mbox.html seems to indicate that the
103
// fact that From_ is normally preceded by a blank line should not be
104
// used, but we do it anyway (for now).
105
// The same source indicates that arbitrary data can follow the date field
106
//
107
// A variety of pathological From_ lines:
108
//   Bad date format:
109
//      From uucp Wed May 22 11:28 GMT 1996
110
//   Added timezone at the end (ok, part of the "any data" after the date)
111
//      From qian2@fas.harvard.edu Sat Sep 30 16:44:06 2000 -0400
112
//  Emacs VM botch ? Adds tz between hour and year
113
//      From dockes Wed Feb 23 10:31:20 +0100 2005
114
//      From dockes Fri Dec  1 20:36:39 +0100 2006
115
// The modified regexp gives the exact same results on the ietf mail archive
116
// and on my own.
117
// Pattern used to recognize an mbox "From " separator line.
//
// It uses POSIX *extended* regular expression syntax (interval
// expressions, '?', grouping), so it must be compiled with REG_EXTENDED.
// The pattern is anchored at the start of the line only: the year is
// deliberately left unanchored because arbitrary data (for example a
// time zone) may follow it.
//
// Versions up to 1.9.0 used the much looser
// "^From .* [1-2][0-9][0-9][0-9]$" instead.
static const char *const frompat =
    "^From[ ]+[^ ]+[ ]+"                                  // From toto@tutu
    "[[:alpha:]]{3}[ ]+[[:alpha:]]{3}[ ]+[0-3 ][0-9][ ]+" // Date
    "[0-2][0-9]:[0-5][0-9](:[0-5][0-9])?[ ]+"             // Time, seconds optional
    "([^ ]+[ ]+)?"                                        // Optional tz
    "[12][0-9][0-9][0-9]"            // Year, unanchored, more data may follow
    ;
67
// Compiled form of frompat. next_document() compiles it on first use and
// then sets regcompiled so that the compilation is not repeated on later
// calls.
static regex_t fromregex;
130
static regex_t fromregex;
68
static bool regcompiled;
131
static bool regcompiled;
69
132
70
bool MimeHandlerMbox::next_document()
133
bool MimeHandlerMbox::next_document()
71
{
134
{
...
...
79
    FILE *fp = (FILE *)m_vfp;
142
    FILE *fp = (FILE *)m_vfp;
80
    int mtarg = 0;
143
    int mtarg = 0;
81
    if (m_ipath != "") {
144
    if (m_ipath != "") {
82
    sscanf(m_ipath.c_str(), "%d", &mtarg);
145
    sscanf(m_ipath.c_str(), "%d", &mtarg);
83
    } else if (m_forPreview) {
146
    } else if (m_forPreview) {
84
    // Can't preview an mbox
147
    // Can't preview an mbox. 
148
  LOGDEB(("MimeHandlerMbox::next_document: can't preview folders!\n"));
85
    return false;
149
    return false;
86
    }
150
    }
87
    LOGDEB(("MimeHandlerMbox::next_document: fn %s, msgnum %d mtarg %d \n", 
151
    LOGDEB0(("MimeHandlerMbox::next_document: fn %s, msgnum %d mtarg %d \n", 
88
        m_fn.c_str(), m_msgnum, mtarg));
152
        m_fn.c_str(), m_msgnum, mtarg));
89
153
90
    if (!regcompiled) {
154
    if (!regcompiled) {
91
    regcomp(&fromregex, frompat, REG_NOSUB);
155
    regcomp(&fromregex, frompat, REG_NOSUB|REG_EXTENDED);
92
    regcompiled = true;
156
    regcompiled = true;
93
    }
157
    }
94
158
95
    // If we are called to retrieve a specific message, seek to bof
159
    // If we are called to retrieve a specific message, seek to bof
96
    // (then scan up to the message). This is for the case where the
160
    // (then scan up to the message). This is for the case where the
...
...
111
    string& msgtxt = m_metaData["content"];
175
    string& msgtxt = m_metaData["content"];
112
    msgtxt.erase();
176
    msgtxt.erase();
113
    do  {
177
    do  {
114
    // Look for next 'From ' Line, start of message. Set start to
178
    // Look for next 'From ' Line, start of message. Set start to
115
    // line after this
179
    // line after this
116
  char line[501];
180
  line_type line;
117
    for (;;) {
181
    for (;;) {
118
        if (!fgets(line, 500, fp)) {
182
        if (!fgets(line, LL, fp)) {
119
        // Eof hit while looking for 'From ' -> file done. We'd need
183
        // Eof hit while looking for 'From ' -> file done. We'd need
120
        // another return code here
184
        // another return code here
185
      LOGDEB2(("MimeHandlerMbox:next: hit eof while looking for "
186
           "start From_ line\n"));
121
        return false;
187
        return false;
122
        }
188
        }
123
      if (line[0] == '\n' || line[0] == '\r') {
189
      m_lineno++;
190
      int ll;
191
      stripendnl(line, ll);
192
      LOGDEB2(("Start: hadempty %d ll %d Line: [%s]\n", 
193
          hademptyline, ll, line));
194
      if (ll <= 0) {
124
        hademptyline = true;
195
        hademptyline = true;
125
        continue;
196
        continue;
126
        }
197
        }
127
        if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
198
        if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
199
      LOGDEB0(("MimeHandlerMbox: From_ at line %d: [%s]\n",
200
          m_lineno, line));
128
        start = ftello(fp);
201
        start = ftello(fp);
129
        m_msgnum++;
202
        m_msgnum++;
130
        break;
203
        break;
131
        }
204
        }
132
        hademptyline = false;
205
        hademptyline = false;
133
    }
206
    }
134
207
135
    // Look for next 'From ' line or eof, end of message.
208
    // Look for next 'From ' line or eof, end of message.
136
    for (;;) {
209
    for (;;) {
137
        end = ftello(fp);
210
        end = ftello(fp);
138
        if (!fgets(line, 500, fp)) {
211
        if (!fgets(line, LL, fp)) {
139
        if (ferror(fp) || feof(fp))
212
        if (ferror(fp) || feof(fp))
140
            iseof = true;
213
            iseof = true;
141
        break;
214
        break;
142
        }
215
        }
216
      m_lineno++;
217
      int ll;
218
      stripendnl(line, ll);
219
      LOGDEB2(("End: hadempty %d ll %d Line: [%s]\n", 
220
          hademptyline, ll, line));
143
        if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
221
        if (hademptyline && !regexec(&fromregex, line, 0, 0, 0)) {
222
      // Rewind to start of "From " line
223
      fseek(fp, end, SEEK_SET);
224
      m_lineno--;
144
        break;
225
        break;
145
        }
226
        }
146
        if (mtarg <= 0 || m_msgnum == mtarg) {
227
        if (mtarg <= 0 || m_msgnum == mtarg) {
228
      line[ll] = '\n';
229
      line[ll+1] = 0;
147
        msgtxt += line;
230
        msgtxt += line;
148
        }
231
        }
149
      if (line[0] == '\n' || line[0] == '\r') {
232
      if (ll <= 0) {
150
        hademptyline = true;
233
        hademptyline = true;
151
        } else {
234
        } else {
152
        hademptyline = false;
235
        hademptyline = false;
153
        }
236
        }
154
    }
237
    }
155
  fseek(fp, end, SEEK_SET);
238
156
    } while (mtarg > 0 && m_msgnum < mtarg);
239
    } while (mtarg > 0 && m_msgnum < mtarg);
157
240
158
    LOGDEB2(("Message text: [%s]\n", msgtxt.c_str()));
241
    LOGDEB1(("Message text: [%s]\n", msgtxt.c_str()));
159
    char buf[20];
242
    char buf[20];
160
    sprintf(buf, "%d", m_msgnum);
243
    sprintf(buf, "%d", m_msgnum);
161
    m_metaData["ipath"] = buf;
244
    m_metaData["ipath"] = buf;
162
    m_metaData["mimetype"] = "message/rfc822";
245
    m_metaData["mimetype"] = "message/rfc822";
163
    if (iseof)
246
    if (iseof) {
247
  LOGDEB2(("MimeHandlerMbox::next: eof hit\n"));
164
    m_havedoc = false;
248
    m_havedoc = false;
249
    }
165
    return msgtxt.empty() ? false : true;
250
    return msgtxt.empty() ? false : true;
166
}
251
}
252
253
#else // Test driver ->
254
255
#include <stdio.h>
256
#include <stdlib.h>
257
#include <unistd.h>
258
#include <errno.h>
259
#include <string.h>
260
261
#include <iostream>
262
#include <string>
263
using namespace std;
264
265
#include "rclinit.h"
266
#include "mh_mbox.h"
267
268
static char *thisprog;
269
270
static char usage [] =
271
"  \n\n"
272
;
273
static void
274
Usage(void)
275
{
276
    fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
277
    exit(1);
278
}
279
280
int main(int argc, char **argv)
281
{
282
  thisprog = argv[0];
283
  argc--; argv++;
284
285
  while (argc > 0 && **argv == '-') {
286
    (*argv)++;
287
    if (!(**argv))
288
      /* Cas du "adb - core" */
289
      Usage();
290
    while (**argv)
291
      switch (*(*argv)++) {
292
      default: Usage();   break;
293
      }
294
    argc--; argv++;
295
  }
296
297
  if (argc != 1)
298
    Usage();
299
  string filename = *argv++;argc--;
300
  string reason;
301
  RclConfig *conf = recollinit(RclInitFlags(0), 0, 0, reason, 0);
302
  if (conf == 0) {
303
      cerr << "init failed " << reason << endl;
304
      exit(1);
305
  }
306
  MimeHandlerMbox mh("text/x-mail");
307
  if (!mh.set_document_file(filename)) {
308
      cerr << "set_document_file failed" << endl;
309
      exit(1);
310
  }
311
  int docnt = 0;
312
  while (mh.has_documents()) {
313
      if (!mh.next_document()) {
314
    cerr << "next_document failed" << endl;
315
    exit(1);
316
      }
317
      docnt++;
318
  }
319
  cout << docnt << " documents found in " << filename << endl;
320
  exit(0);
321
}
322
323
324
#endif // TEST_MH_MBOX