/* -*- Mode: c++; -*- */
/*  --------------------------------------------------------------------
 *  Filename:
 *    mime-parsefull.cc
 *
 *  Description:
 *    Implementation of main mime parser components
 *  --------------------------------------------------------------------
 *  Copyright 2002-2004 Andreas Aardal Hanssen
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
 *  --------------------------------------------------------------------
 */
#ifdef HAVE_CONFIG_H
27
#include <config.h>
28
#endif
29
30
#include "mime.h"
31
#include "mime-utils.h"
32
#include "mime-inputsource.h"
33
#include "convert.h"
34
#include <string>
35
#include <vector>
36
#include <map>
37
#include <exception>
38
#include <iostream>
39
40
#include <string.h>
41
#include <ctype.h>
42
#include <stdio.h>
43
#include <errno.h>
44
45
// Input source shared by every parsing routine in this file. It is
// created (or replaced) by MimeDocument::parseFull() for the file
// descriptor being parsed, and read through getChar()/ungetChar().
Binc::MimeInputSource *mimeSource = 0;
46
47
using namespace ::std;
48
49
//------------------------------------------------------------------------
50
void Binc::MimeDocument::parseFull(int fd) const
51
{
52
  if (allIsParsed)
53
    return;
54
55
  allIsParsed = true;
56
57
  if (!mimeSource || mimeSource->getFileDescriptor() != fd) {
58
    delete mimeSource;
59
    mimeSource = new MimeInputSource(fd);
60
  } else {
61
    mimeSource->reset();
62
  }
63
64
  headerstartoffsetcrlf = 0;
65
  headerlength = 0;
66
  bodystartoffsetcrlf = 0;
67
  bodylength = 0;
68
  size = 0;
69
  messagerfc822 = false;
70
  multipart = false;
71
72
  int bsize = 0;
73
  string bound;
74
  MimePart::parseFull(bound, bsize);
75
76
  // eat any trailing junk to get the correct size
77
  char c;
78
  while (mimeSource->getChar(&c));
79
80
  size = mimeSource->getOffset();
81
}
82
83
//------------------------------------------------------------------------
84
// Read one (possibly folded) header field from the shared input source
// and add it to *header.
//
// header: receives the name/value pair that was read.
// nlines: incremented for every '\n' consumed.
//
// Returns true when another header field may follow, false when the
// header section is finished (blank line, start of body or end of
// input).
static bool parseOneHeaderLine(Binc::Header *header, unsigned int *nlines)
{
  using namespace ::Binc;

  char ch;
  string fieldName;
  string fieldBody;

  // Phase 1: collect the field name, which ends at the first ':'.
  while (mimeSource->getChar(&ch)) {
    if (ch == ':')
      break;

    if (ch == '\r') {
      // A CR before any colon means this line is not a header field:
      // push back everything read on this line (the CR included) and
      // let the caller treat it as the start of the body.
      int pushBack = (int) fieldName.length() + 1;
      while (pushBack-- > 0)
        mimeSource->ungetChar();
      return false;
    }

    fieldName += ch;
  }

  // Phase 2: collect the field body. 'window' always holds the last
  // four characters read, newest last.
  char window[4] = { '\0', '\0', '\0', '\0' };
  bool hitEof = false;
  bool sawBlankLine = false;

  for (;;) {
    if (!mimeSource->getChar(&ch)) {
      hitEof = true;
      break;
    }

    if (ch == '\n')
      ++*nlines;

    memmove(window, window + 1, 3);
    window[3] = ch;

    if (strncmp(window, "\r\n\r\n", 4) == 0) {
      sawBlankLine = true;
      break;
    }

    // A line that does not start with whitespace is not a
    // continuation, so the field collected so far is complete.
    const bool atLineStart = (window[2] == '\n');
    const bool isContinuation = (ch == ' ' || ch == '\t');
    if (atLineStart && !isContinuation) {
      // Drop the line terminator that was appended to the body.
      if (fieldBody.length() > 2)
        fieldBody.resize(fieldBody.length() - 2);

      trim(fieldBody);
      header->add(fieldName, fieldBody);

      if (ch == '\r') {
        // Blank line: swallow the character after the CR and report
        // the end of the header section.
        mimeSource->getChar(&ch);
        return false;
      }

      // The character belongs to the next field: give it back.
      mimeSource->ungetChar();
      if (ch == '\n')
        --*nlines;
      return true;
    }

    fieldBody += ch;
  }

  // The loop only ends on end of input or on an explicit blank line;
  // store whatever was collected for the last field.
  if (!fieldName.empty()) {
    if (fieldBody.length() > 2)
      fieldBody.resize(fieldBody.length() - 2);
    header->add(fieldName, fieldBody);
  }

  return !hitEof && !sawBlankLine;
}
165
166
//------------------------------------------------------------------------
167
// Read header fields into *header until parseOneHeaderLine() reports
// that no further field follows. nlines is passed through unchanged.
static void parseHeader(Binc::Header *header, unsigned int *nlines)
{
  bool moreFields;
  do {
    moreFields = parseOneHeaderLine(header, nlines);
  } while (moreFields);
}
172
173
//------------------------------------------------------------------------
174
static void analyzeHeader(Binc::Header *header, bool *multipart,
175
            bool *messagerfc822, string *subtype,
176
            string *boundary)
177
{
178
  using namespace ::Binc;
179
180
  // Do simple parsing of headers to determine the
181
  // type of message (multipart,messagerfc822 etc)
182
  HeaderItem ctype;
183
  if (header->getFirstHeader("content-type", ctype)) {
184
    vector<string> types;
185
    split(ctype.getValue(), ";", types);
186
187
    if (types.size() > 0) {
188
      // first element should describe content type
189
      string tmp = types[0];
190
      trim(tmp);
191
      vector<string> v;
192
      split(tmp, "/", v);
193
      string key, value;
194
195
      key = (v.size() > 0) ? v[0] : "text";
196
      value = (v.size() > 1) ? v[1] : "plain";
197
      lowercase(key);
198
199
      if (key == "multipart") {
200
  *multipart = true;
201
  lowercase(value);
202
  *subtype = value;
203
      } else if (key == "message") {
204
  lowercase(value);
205
  if (value == "rfc822")
206
    *messagerfc822 = true;
207
      }
208
    }
209
210
    for (vector<string>::const_iterator i = types.begin();
211
   i != types.end(); ++i) {
212
      string element = *i;
213
      trim(element);
214
215
      if (element.find("=") != string::npos) {
216
  string::size_type pos = element.find('=');
217
  string key = element.substr(0, pos);
218
  string value = element.substr(pos + 1);
219
  
220
  lowercase(key);
221
  trim(key);
222
223
  if (key == "boundary") {
224
    trim(value, " \"");
225
    *boundary = value;
226
  }
227
      }
228
    }
229
  }
230
}
231
232
static void parseMessageRFC822(vector<Binc::MimePart> *members,
233
                 bool *foundendofpart,
234
                 unsigned int *bodylength,
235
                 unsigned int *nbodylines,
236
                 const string &toboundary)
237
{
238
  using namespace ::Binc;
239
240
  // message rfc822 means a completely enclosed mime document. we
241
  // call the parser recursively, and pass on the boundary string
242
  // that we got. when parse() finds this boundary, it returns 0. if
243
  // it finds the end boundary (boundary + "--"), it returns != 0.
244
  MimePart m;
245
246
  unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
247
    
248
  // parsefull returns the number of bytes that need to be removed
249
  // from the body because of the terminating boundary string.
250
  int bsize = 0;
251
  if (m.parseFull(toboundary, bsize))
252
    *foundendofpart = true;
253
254
  // make sure bodylength doesn't overflow    
255
  *bodylength = mimeSource->getOffset();
256
  if (*bodylength >= bodystartoffsetcrlf) {
257
    *bodylength -= bodystartoffsetcrlf;
258
    if (*bodylength >= (unsigned int) bsize) {
259
      *bodylength -= (unsigned int) bsize;
260
    } else {
261
      *bodylength = 0;
262
    }
263
  } else {
264
    *bodylength = 0;
265
  }
266
267
  *nbodylines += m.getNofLines();
268
269
  members->push_back(m);
270
}
271
272
static bool skipUntilBoundary(const string &delimiter,
273
                unsigned int *nlines, bool *eof)
274
{
275
  int endpos = delimiter.length();
276
  char *delimiterqueue = 0;
277
  int delimiterpos = 0;
278
  const char *delimiterStr = delimiter.c_str();
279
  if (delimiter != "") {
280
    delimiterqueue = new char[endpos];
281
    memset(delimiterqueue, 0, endpos);
282
  }
283
284
  // first, skip to the first delimiter string. Anything between the
285
  // header and the first delimiter string is simply ignored (it's
286
  // usually a text message intended for non-mime clients)
287
  char c;
288
289
  bool foundBoundary = false;
290
  for (;;) {    
291
    if (!mimeSource->getChar(&c)) {
292
      *eof = true;
293
      break;
294
    }
295
296
    if (c == '\n')
297
      ++*nlines;
298
299
    // if there is no delimiter, we just read until the end of the
300
    // file.
301
    if (!delimiterqueue)
302
      continue;
303
304
    delimiterqueue[delimiterpos++ % endpos] = c;
305
306
    if (compareStringToQueue(delimiterStr, delimiterqueue,
307
               delimiterpos, endpos)) {
308
      foundBoundary = true;
309
      break;
310
    }
311
  }
312
313
  delete [] delimiterqueue;
314
  delimiterqueue = 0;
315
316
  return foundBoundary;
317
}
318
319
320
static void parseMultipart(const string &boundary,
321
             const string &toboundary,
322
             bool *eof,
323
             unsigned int *nlines,
324
             int *boundarysize,
325
             bool *foundendofpart,
326
             unsigned int *bodylength,
327
             vector<Binc::MimePart> *members)
328
{
329
  using namespace ::Binc;
330
  unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
331
332
  // multipart parsing starts with skipping to the first
333
  // boundary. then we call parse() for all parts. the last parse()
334
  // command will return a code indicating that it found the last
335
  // boundary of this multipart. Note that the first boundary does
336
  // not have to start with CRLF.
337
  string delimiter = "--" + boundary;
338
339
  skipUntilBoundary(delimiter, nlines, eof);
340
341
  if (!eof)
342
    *boundarysize = delimiter.size();
343
344
  // Read two more characters. This may be CRLF, it may be "--" and
345
  // it may be any other two characters.
346
  char a;
347
  if (!mimeSource->getChar(&a))
348
    *eof = true;
349
350
  if (a == '\n')
351
    ++*nlines; 
352
353
  char b;
354
  if (!mimeSource->getChar(&b))
355
    *eof = true;
356
    
357
  if (b == '\n')
358
    ++*nlines;
359
    
360
  // If we find two dashes after the boundary, then this is the end
361
  // of boundary marker.
362
  if (!*eof) {
363
    if (a == '-' && b == '-') {
364
      *foundendofpart = true;
365
      *boundarysize += 2;
366
  
367
      if (!mimeSource->getChar(&a))
368
  *eof = true;
369
  
370
      if (a == '\n')
371
  ++*nlines; 
372
  
373
      if (!mimeSource->getChar(&b))
374
  *eof = true;
375
  
376
      if (b == '\n')
377
  ++*nlines;
378
    }
379
380
    if (a == '\r' && b == '\n') {
381
      // This exception is to handle a special case where the
382
      // delimiter of one part is not followed by CRLF, but
383
      // immediately followed by a CRLF prefixed delimiter.
384
      if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
385
  *eof = true; 
386
      else if (a == '-' && b == '-') {
387
  mimeSource->ungetChar();
388
  mimeSource->ungetChar();
389
  mimeSource->ungetChar();
390
  mimeSource->ungetChar();
391
      } else {
392
  mimeSource->ungetChar();
393
  mimeSource->ungetChar();
394
      }
395
396
      *boundarysize += 2;
397
    } else {
398
      mimeSource->ungetChar();
399
      mimeSource->ungetChar();
400
    }
401
  }
402
403
  // read all mime parts.
404
  if (!*foundendofpart && !*eof) {
405
    bool quit = false;
406
    do {
407
      MimePart m;
408
409
      // If parseFull returns != 0, then it encountered the multipart's
410
      // final boundary.
411
      int bsize = 0;
412
      if (m.parseFull(boundary, bsize)) {
413
  quit = true;
414
  *boundarysize = bsize;
415
      }
416
417
      members->push_back(m);
418
419
    } while (!quit);
420
  }
421
422
  if (!*foundendofpart && !*eof) {
423
    // multipart parsing starts with skipping to the first
424
    // boundary. then we call parse() for all parts. the last parse()
425
    // command will return a code indicating that it found the last
426
    // boundary of this multipart. Note that the first boundary does
427
    // not have to start with CRLF.
428
    string delimiter = "\r\n--" + toboundary;
429
430
    skipUntilBoundary(delimiter, nlines, eof);
431
432
    if (!*eof)
433
      *boundarysize = delimiter.size();
434
435
    // Read two more characters. This may be CRLF, it may be "--" and
436
    // it may be any other two characters.
437
    char a = '\0';
438
    if (!mimeSource->getChar(&a))
439
      *eof = true;
440
441
    if (a == '\n')
442
      ++*nlines; 
443
444
    char b = '\0';
445
    if (!mimeSource->getChar(&b))
446
      *eof = true;
447
    
448
    if (b == '\n')
449
      ++*nlines;
450
    
451
    // If we find two dashes after the boundary, then this is the end
452
    // of boundary marker.
453
    if (!*eof) {
454
      if (a == '-' && b == '-') {
455
  *foundendofpart = true;
456
  *boundarysize += 2;
457
  
458
  if (!mimeSource->getChar(&a))
459
    *eof = true;
460
  
461
  if (a == '\n')
462
    ++*nlines; 
463
  
464
  if (!mimeSource->getChar(&b))
465
    *eof = true;
466
  
467
  if (b == '\n')
468
    ++*nlines;
469
      }
470
471
      if (a == '\r' && b == '\n') {
472
  // This exception is to handle a special case where the
473
  // delimiter of one part is not followed by CRLF, but
474
  // immediately followed by a CRLF prefixed delimiter.
475
  if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
476
    *eof = true; 
477
  else if (a == '-' && b == '-') {
478
    mimeSource->ungetChar();
479
    mimeSource->ungetChar();
480
    mimeSource->ungetChar();
481
    mimeSource->ungetChar();
482
  } else {
483
    mimeSource->ungetChar();
484
    mimeSource->ungetChar();
485
  }
486
487
  *boundarysize += 2;
488
      } else {
489
  mimeSource->ungetChar();
490
  mimeSource->ungetChar();
491
      }
492
    }
493
  }
494
495
  // make sure bodylength doesn't overflow    
496
  *bodylength = mimeSource->getOffset();
497
  if (*bodylength >= bodystartoffsetcrlf) {
498
    *bodylength -= bodystartoffsetcrlf;
499
    if (*bodylength >= (unsigned int) *boundarysize) {
500
      *bodylength -= (unsigned int) *boundarysize;
501
    } else {
502
      *bodylength = 0;
503
    }
504
  } else {
505
    *bodylength = 0;
506
  }
507
}
508
509
static void parseSinglePart(const string &toboundary,
510
              int *boundarysize,
511
              unsigned int *nbodylines,
512
              unsigned int *nlines,
513
              bool *eof, bool *foundendofpart,
514
              unsigned int *bodylength)
515
{
516
  using namespace ::Binc;
517
  unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
518
519
  // If toboundary is empty, then we read until the end of the
520
  // file. Otherwise we will read until we encounter toboundary.
521
  string _toboundary; 
522
  if (toboundary != "") {
523
    _toboundary = "\r\n--";
524
    _toboundary += toboundary;
525
  }
526
527
  //  if (skipUntilBoundary(_toboundary, nlines, eof))
528
  //    *boundarysize = _toboundary.length();
529
530
  char *boundaryqueue = 0;
531
  int endpos = _toboundary.length();
532
  if (toboundary != "") {
533
    boundaryqueue = new char[endpos];
534
    memset(boundaryqueue, 0, endpos);
535
  }
536
  int boundarypos = 0;
537
538
  *boundarysize = 0;
539
540
  const char *_toboundaryStr = _toboundary.c_str();
541
  string line;
542
  bool toboundaryIsEmpty = (toboundary == "");
543
  char c;
544
  while (mimeSource->getChar(&c)) {
545
    if (c == '\n') { ++*nbodylines; ++*nlines; }
546
547
    if (toboundaryIsEmpty)
548
      continue;
549
550
    // find boundary
551
    boundaryqueue[boundarypos++ % endpos] = c;
552
      
553
    if (compareStringToQueue(_toboundaryStr, boundaryqueue,
554
               boundarypos, endpos)) {
555
      *boundarysize = _toboundary.length();
556
      break;
557
    }
558
  }
559
560
  delete [] boundaryqueue;
561
562
  if (toboundary != "") {
563
    char a;
564
    if (!mimeSource->getChar(&a))
565
      *eof = true;
566
567
    if (a == '\n')
568
      ++*nlines;
569
    char b;
570
    if (!mimeSource->getChar(&b))
571
      *eof = true;
572
573
    if (b == '\n') 
574
      ++*nlines;
575
576
    if (a == '-' && b == '-') {
577
      *boundarysize += 2;
578
      *foundendofpart = true;
579
      if (!mimeSource->getChar(&a))
580
  *eof = true;
581
582
      if (a == '\n')
583
  ++*nlines;
584
585
      if (!mimeSource->getChar(&b))
586
  *eof = true;
587
    
588
      if (b == '\n')
589
  ++*nlines;
590
    }
591
592
    if (a == '\r' && b == '\n') {
593
      // This exception is to handle a special case where the
594
      // delimiter of one part is not followed by CRLF, but
595
      // immediately followed by a CRLF prefixed delimiter.
596
      if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
597
  *eof = true; 
598
      else if (a == '-' && b == '-') {
599
  mimeSource->ungetChar();
600
  mimeSource->ungetChar();
601
  mimeSource->ungetChar();
602
  mimeSource->ungetChar();
603
      } else {
604
  mimeSource->ungetChar();
605
  mimeSource->ungetChar();
606
      }
607
608
      *boundarysize += 2;
609
    } else {
610
      mimeSource->ungetChar();
611
      mimeSource->ungetChar();
612
    }
613
  }
614
615
  // make sure bodylength doesn't overflow    
616
  *bodylength = mimeSource->getOffset();
617
  if (*bodylength >= bodystartoffsetcrlf) {
618
    *bodylength -= bodystartoffsetcrlf;
619
    if (*bodylength >= (unsigned int) *boundarysize) {
620
      *bodylength -= (unsigned int) *boundarysize;
621
    } else {
622
      *bodylength = 0;
623
    }
624
  } else {
625
    *bodylength = 0;
626
  }
627
628
}
629
630
//------------------------------------------------------------------------
631
int Binc::MimePart::parseFull(const string &toboundary,
632
                int &boundarysize) const
633
{
634
  headerstartoffsetcrlf = mimeSource->getOffset();
635
636
  // Parse the header of this mime part.
637
  parseHeader(&h, &nlines);
638
639
  // Headerlength includes the seperating CRLF. Body starts after the
640
  // CRLF.
641
  headerlength = mimeSource->getOffset() - headerstartoffsetcrlf;
642
  bodystartoffsetcrlf = mimeSource->getOffset();
643
  bodylength = 0;
644
645
  // Determine the type of mime part by looking at fields in the
646
  // header.
647
  analyzeHeader(&h, &multipart, &messagerfc822, &subtype, &boundary);
648
649
  bool eof = false;
650
  bool foundendofpart = false;
651
652
  if (messagerfc822) {
653
    parseMessageRFC822(&members, &foundendofpart, &bodylength,
654
             &nbodylines, toboundary);
655
656
  } else if (multipart) {
657
    parseMultipart(boundary, toboundary, &eof, &nlines, &boundarysize,
658
         &foundendofpart, &bodylength,
659
         &members);
660
  } else {
661
    parseSinglePart(toboundary, &boundarysize, &nbodylines, &nlines,
662
          &eof, &foundendofpart, &bodylength);
663
  }
664
665
  return (eof || foundendofpart) ? 1 : 0;
666
}