recoll / Code / Diff of /src/bincimapmime/mime-parsefull.cc

Diff of /src/bincimapmime/mime-parsefull.cc [000000] .. [670d7c]

Switch to side-by-side view

--- a
+++ b/src/bincimapmime/mime-parsefull.cc
@@ -0,0 +1,666 @@
+/* -*- Mode: c++; -*- */
+/*  --------------------------------------------------------------------
+ *  Filename:
+ *    mime-parsefull.cc
+ *  
+ *  Description:
+ *    Implementation of main mime parser components
+ *  --------------------------------------------------------------------
+ *  Copyright 2002-2004 Andreas Aardal Hanssen
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
+ *  --------------------------------------------------------------------
+ */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "mime.h"
+#include "mime-utils.h"
+#include "mime-inputsource.h"
+#include "convert.h"
+#include <string>
+#include <vector>
+#include <map>
+#include <exception>
+#include <iostream>
+
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <errno.h>
+
+Binc::MimeInputSource *mimeSource = 0;
+
+using namespace ::std;
+
+//------------------------------------------------------------------------
+void Binc::MimeDocument::parseFull(int fd) const
+{
+  if (allIsParsed)
+    return;
+
+  allIsParsed = true;
+
+  if (!mimeSource || mimeSource->getFileDescriptor() != fd) {
+    delete mimeSource;
+    mimeSource = new MimeInputSource(fd);
+  } else {
+    mimeSource->reset();
+  }
+
+  headerstartoffsetcrlf = 0;
+  headerlength = 0;
+  bodystartoffsetcrlf = 0;
+  bodylength = 0;
+  size = 0;
+  messagerfc822 = false;
+  multipart = false;
+
+  int bsize = 0;
+  string bound;
+  MimePart::parseFull(bound, bsize);
+
+  // eat any trailing junk to get the correct size
+  char c;
+  while (mimeSource->getChar(&c));
+
+  size = mimeSource->getOffset();
+}
+
+//------------------------------------------------------------------------
+static bool parseOneHeaderLine(Binc::Header *header, unsigned int *nlines)
+{
+  using namespace ::Binc;
+  char c;
+  bool eof = false;
+  char cqueue[4];
+  string name;
+  string content;
+
+  while (mimeSource->getChar(&c)) {
+    // If we encounter a \r before we got to the first ':', then
+    // rewind back to the start of the line and assume we're at the
+    // start of the body.
+    if (c == '\r') {
+      for (int i = 0; i < (int) name.length() + 1; ++i)
+	mimeSource->ungetChar();
+      return false;
+    }
+
+    // A colon marks the end of the header name
+    if (c == ':') break;
+
+    // Otherwise add to the header name
+    name += c;
+  }
+
+  cqueue[0] = '\0';
+  cqueue[1] = '\0';
+  cqueue[2] = '\0';
+  cqueue[3] = '\0';
+
+  // Read until the end of the header.
+  bool endOfHeaders = false;
+  while (!endOfHeaders) {
+    if (!mimeSource->getChar(&c)) {
+      eof = true;
+      break;
+    }
+
+    if (c == '\n') ++*nlines;
+
+    for (int i = 0; i < 3; ++i)
+      cqueue[i] = cqueue[i + 1];
+    cqueue[3] = c;
+
+    if (strncmp(cqueue, "\r\n\r\n", 4) == 0) {
+      endOfHeaders = true;
+      break;
+    }
+
+    // If the last character was a newline, and the first now is not
+    // whitespace, then rewind one character and store the current
+    // key,value pair.
+    if (cqueue[2] == '\n' && c != ' ' && c != '\t') {
+      if (content.length() > 2)
+	content.resize(content.length() - 2);
+
+      trim(content);
+      header->add(name, content);
+
+      if (c != '\r') {
+	mimeSource->ungetChar();
+	if (c == '\n') --*nlines;
+	return true;
+      }
+	
+      mimeSource->getChar(&c);
+      return false;
+    }
+
+    content += c;
+  }
+
+  if (name != "") {
+    if (content.length() > 2)
+      content.resize(content.length() - 2);
+    header->add(name, content);
+  }
+
+  return !(eof || endOfHeaders);
+}
+
+//------------------------------------------------------------------------
+static void parseHeader(Binc::Header *header, unsigned int *nlines)
+{
+  while (parseOneHeaderLine(header, nlines))
+  { }
+}
+
+//------------------------------------------------------------------------
+static void analyzeHeader(Binc::Header *header, bool *multipart,
+			  bool *messagerfc822, string *subtype,
+			  string *boundary)
+{
+  using namespace ::Binc;
+
+  // Do simple parsing of headers to determine the
+  // type of message (multipart,messagerfc822 etc)
+  HeaderItem ctype;
+  if (header->getFirstHeader("content-type", ctype)) {
+    vector<string> types;
+    split(ctype.getValue(), ";", types);
+
+    if (types.size() > 0) {
+      // first element should describe content type
+      string tmp = types[0];
+      trim(tmp);
+      vector<string> v;
+      split(tmp, "/", v);
+      string key, value;
+
+      key = (v.size() > 0) ? v[0] : "text";
+      value = (v.size() > 1) ? v[1] : "plain";
+      lowercase(key);
+
+      if (key == "multipart") {
+	*multipart = true;
+	lowercase(value);
+	*subtype = value;
+      } else if (key == "message") {
+	lowercase(value);
+	if (value == "rfc822")
+	  *messagerfc822 = true;
+      }
+    }
+
+    for (vector<string>::const_iterator i = types.begin();
+	 i != types.end(); ++i) {
+      string element = *i;
+      trim(element);
+
+      if (element.find("=") != string::npos) {
+	string::size_type pos = element.find('=');
+	string key = element.substr(0, pos);
+	string value = element.substr(pos + 1);
+	
+	lowercase(key);
+	trim(key);
+
+	if (key == "boundary") {
+	  trim(value, " \"");
+	  *boundary = value;
+	}
+      }
+    }
+  }
+}
+
+static void parseMessageRFC822(vector<Binc::MimePart> *members,
+			       bool *foundendofpart,
+			       unsigned int *bodylength,
+			       unsigned int *nbodylines,
+			       const string &toboundary)
+{
+  using namespace ::Binc;
+
+  // message rfc822 means a completely enclosed mime document. we
+  // call the parser recursively, and pass on the boundary string
+  // that we got. when parse() finds this boundary, it returns 0. if
+  // it finds the end boundary (boundary + "--"), it returns != 0.
+  MimePart m;
+
+  unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
+    
+  // parsefull returns the number of bytes that need to be removed
+  // from the body because of the terminating boundary string.
+  int bsize = 0;
+  if (m.parseFull(toboundary, bsize))
+    *foundendofpart = true;
+
+  // make sure bodylength doesn't overflow    
+  *bodylength = mimeSource->getOffset();
+  if (*bodylength >= bodystartoffsetcrlf) {
+    *bodylength -= bodystartoffsetcrlf;
+    if (*bodylength >= (unsigned int) bsize) {
+      *bodylength -= (unsigned int) bsize;
+    } else {
+      *bodylength = 0;
+    }
+  } else {
+    *bodylength = 0;
+  }
+
+  *nbodylines += m.getNofLines();
+
+  members->push_back(m);
+}
+
+static bool skipUntilBoundary(const string &delimiter,
+			      unsigned int *nlines, bool *eof)
+{
+  int endpos = delimiter.length();
+  char *delimiterqueue = 0;
+  int delimiterpos = 0;
+  const char *delimiterStr = delimiter.c_str();
+  if (delimiter != "") {
+    delimiterqueue = new char[endpos];
+    memset(delimiterqueue, 0, endpos);
+  }
+
+  // first, skip to the first delimiter string. Anything between the
+  // header and the first delimiter string is simply ignored (it's
+  // usually a text message intended for non-mime clients)
+  char c;
+
+  bool foundBoundary = false;
+  for (;;) {    
+    if (!mimeSource->getChar(&c)) {
+      *eof = true;
+      break;
+    }
+
+    if (c == '\n')
+      ++*nlines;
+
+    // if there is no delimiter, we just read until the end of the
+    // file.
+    if (!delimiterqueue)
+      continue;
+
+    delimiterqueue[delimiterpos++ % endpos] = c;
+
+    if (compareStringToQueue(delimiterStr, delimiterqueue,
+			     delimiterpos, endpos)) {
+      foundBoundary = true;
+      break;
+    }
+  }
+
+  delete [] delimiterqueue;
+  delimiterqueue = 0;
+
+  return foundBoundary;
+}
+
+
+static void parseMultipart(const string &boundary,
+			   const string &toboundary,
+			   bool *eof,
+			   unsigned int *nlines,
+			   int *boundarysize,
+			   bool *foundendofpart,
+			   unsigned int *bodylength,
+			   vector<Binc::MimePart> *members)
+{
+  using namespace ::Binc;
+  unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
+
+  // multipart parsing starts with skipping to the first
+  // boundary. then we call parse() for all parts. the last parse()
+  // command will return a code indicating that it found the last
+  // boundary of this multipart. Note that the first boundary does
+  // not have to start with CRLF.
+  string delimiter = "--" + boundary;
+
+  skipUntilBoundary(delimiter, nlines, eof);
+
+  if (!eof)
+    *boundarysize = delimiter.size();
+
+  // Read two more characters. This may be CRLF, it may be "--" and
+  // it may be any other two characters.
+  char a;
+  if (!mimeSource->getChar(&a))
+    *eof = true;
+
+  if (a == '\n')
+    ++*nlines; 
+
+  char b;
+  if (!mimeSource->getChar(&b))
+    *eof = true;
+    
+  if (b == '\n')
+    ++*nlines;
+    
+  // If we find two dashes after the boundary, then this is the end
+  // of boundary marker.
+  if (!*eof) {
+    if (a == '-' && b == '-') {
+      *foundendofpart = true;
+      *boundarysize += 2;
+	
+      if (!mimeSource->getChar(&a))
+	*eof = true;
+	
+      if (a == '\n')
+	++*nlines; 
+	
+      if (!mimeSource->getChar(&b))
+	*eof = true;
+	
+      if (b == '\n')
+	++*nlines;
+    }
+
+    if (a == '\r' && b == '\n') {
+      // This exception is to handle a special case where the
+      // delimiter of one part is not followed by CRLF, but
+      // immediately followed by a CRLF prefixed delimiter.
+      if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
+	*eof = true; 
+      else if (a == '-' && b == '-') {
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+      } else {
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+      }
+
+      *boundarysize += 2;
+    } else {
+      mimeSource->ungetChar();
+      mimeSource->ungetChar();
+    }
+  }
+
+  // read all mime parts.
+  if (!*foundendofpart && !*eof) {
+    bool quit = false;
+    do {
+      MimePart m;
+
+      // If parseFull returns != 0, then it encountered the multipart's
+      // final boundary.
+      int bsize = 0;
+      if (m.parseFull(boundary, bsize)) {
+	quit = true;
+	*boundarysize = bsize;
+      }
+
+      members->push_back(m);
+
+    } while (!quit);
+  }
+
+  if (!*foundendofpart && !*eof) {
+    // multipart parsing starts with skipping to the first
+    // boundary. then we call parse() for all parts. the last parse()
+    // command will return a code indicating that it found the last
+    // boundary of this multipart. Note that the first boundary does
+    // not have to start with CRLF.
+    string delimiter = "\r\n--" + toboundary;
+
+    skipUntilBoundary(delimiter, nlines, eof);
+
+    if (!*eof)
+      *boundarysize = delimiter.size();
+
+    // Read two more characters. This may be CRLF, it may be "--" and
+    // it may be any other two characters.
+    char a = '\0';
+    if (!mimeSource->getChar(&a))
+      *eof = true;
+
+    if (a == '\n')
+      ++*nlines; 
+
+    char b = '\0';
+    if (!mimeSource->getChar(&b))
+      *eof = true;
+    
+    if (b == '\n')
+      ++*nlines;
+    
+    // If we find two dashes after the boundary, then this is the end
+    // of boundary marker.
+    if (!*eof) {
+      if (a == '-' && b == '-') {
+	*foundendofpart = true;
+	*boundarysize += 2;
+	
+	if (!mimeSource->getChar(&a))
+	  *eof = true;
+	
+	if (a == '\n')
+	  ++*nlines; 
+	
+	if (!mimeSource->getChar(&b))
+	  *eof = true;
+	
+	if (b == '\n')
+	  ++*nlines;
+      }
+
+      if (a == '\r' && b == '\n') {
+	// This exception is to handle a special case where the
+	// delimiter of one part is not followed by CRLF, but
+	// immediately followed by a CRLF prefixed delimiter.
+	if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
+	  *eof = true; 
+	else if (a == '-' && b == '-') {
+	  mimeSource->ungetChar();
+	  mimeSource->ungetChar();
+	  mimeSource->ungetChar();
+	  mimeSource->ungetChar();
+	} else {
+	  mimeSource->ungetChar();
+	  mimeSource->ungetChar();
+	}
+
+	*boundarysize += 2;
+      } else {
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+      }
+    }
+  }
+
+  // make sure bodylength doesn't overflow    
+  *bodylength = mimeSource->getOffset();
+  if (*bodylength >= bodystartoffsetcrlf) {
+    *bodylength -= bodystartoffsetcrlf;
+    if (*bodylength >= (unsigned int) *boundarysize) {
+      *bodylength -= (unsigned int) *boundarysize;
+    } else {
+      *bodylength = 0;
+    }
+  } else {
+    *bodylength = 0;
+  }
+}
+
+static void parseSinglePart(const string &toboundary,
+			    int *boundarysize,
+			    unsigned int *nbodylines,
+			    unsigned int *nlines,
+			    bool *eof, bool *foundendofpart,
+			    unsigned int *bodylength)
+{
+  using namespace ::Binc;
+  unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
+
+  // If toboundary is empty, then we read until the end of the
+  // file. Otherwise we will read until we encounter toboundary.
+  string _toboundary; 
+  if (toboundary != "") {
+    _toboundary = "\r\n--";
+    _toboundary += toboundary;
+  }
+
+  //  if (skipUntilBoundary(_toboundary, nlines, eof))
+  //    *boundarysize = _toboundary.length();
+
+  char *boundaryqueue = 0;
+  int endpos = _toboundary.length();
+  if (toboundary != "") {
+    boundaryqueue = new char[endpos];
+    memset(boundaryqueue, 0, endpos);
+  }
+  int boundarypos = 0;
+
+  *boundarysize = 0;
+
+  const char *_toboundaryStr = _toboundary.c_str();
+  string line;
+  bool toboundaryIsEmpty = (toboundary == "");
+  char c;
+  while (mimeSource->getChar(&c)) {
+    if (c == '\n') { ++*nbodylines; ++*nlines; }
+
+    if (toboundaryIsEmpty)
+      continue;
+
+    // find boundary
+    boundaryqueue[boundarypos++ % endpos] = c;
+      
+    if (compareStringToQueue(_toboundaryStr, boundaryqueue,
+			     boundarypos, endpos)) {
+      *boundarysize = _toboundary.length();
+      break;
+    }
+  }
+
+  delete [] boundaryqueue;
+
+  if (toboundary != "") {
+    char a;
+    if (!mimeSource->getChar(&a))
+      *eof = true;
+
+    if (a == '\n')
+      ++*nlines;
+    char b;
+    if (!mimeSource->getChar(&b))
+      *eof = true;
+
+    if (b == '\n') 
+      ++*nlines;
+
+    if (a == '-' && b == '-') {
+      *boundarysize += 2;
+      *foundendofpart = true;
+      if (!mimeSource->getChar(&a))
+	*eof = true;
+
+      if (a == '\n')
+	++*nlines;
+
+      if (!mimeSource->getChar(&b))
+	*eof = true;
+	  
+      if (b == '\n')
+	++*nlines;
+    }
+
+    if (a == '\r' && b == '\n') {
+      // This exception is to handle a special case where the
+      // delimiter of one part is not followed by CRLF, but
+      // immediately followed by a CRLF prefixed delimiter.
+      if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b))
+	*eof = true; 
+      else if (a == '-' && b == '-') {
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+      } else {
+	mimeSource->ungetChar();
+	mimeSource->ungetChar();
+      }
+
+      *boundarysize += 2;
+    } else {
+      mimeSource->ungetChar();
+      mimeSource->ungetChar();
+    }
+  }
+
+  // make sure bodylength doesn't overflow    
+  *bodylength = mimeSource->getOffset();
+  if (*bodylength >= bodystartoffsetcrlf) {
+    *bodylength -= bodystartoffsetcrlf;
+    if (*bodylength >= (unsigned int) *boundarysize) {
+      *bodylength -= (unsigned int) *boundarysize;
+    } else {
+      *bodylength = 0;
+    }
+  } else {
+    *bodylength = 0;
+  }
+
+}
+
+//------------------------------------------------------------------------
+int Binc::MimePart::parseFull(const string &toboundary,
+			      int &boundarysize) const
+{
+  headerstartoffsetcrlf = mimeSource->getOffset();
+
+  // Parse the header of this mime part.
+  parseHeader(&h, &nlines);
+
+  // Headerlength includes the seperating CRLF. Body starts after the
+  // CRLF.
+  headerlength = mimeSource->getOffset() - headerstartoffsetcrlf;
+  bodystartoffsetcrlf = mimeSource->getOffset();
+  bodylength = 0;
+
+  // Determine the type of mime part by looking at fields in the
+  // header.
+  analyzeHeader(&h, &multipart, &messagerfc822, &subtype, &boundary);
+
+  bool eof = false;
+  bool foundendofpart = false;
+
+  if (messagerfc822) {
+    parseMessageRFC822(&members, &foundendofpart, &bodylength,
+		       &nbodylines, toboundary);
+
+  } else if (multipart) {
+    parseMultipart(boundary, toboundary, &eof, &nlines, &boundarysize,
+		   &foundendofpart, &bodylength,
+		   &members);
+  } else {
+    parseSinglePart(toboundary, &boundarysize, &nbodylines, &nlines,
+		    &eof, &foundendofpart, &bodylength);
+  }
+
+  return (eof || foundendofpart) ? 1 : 0;
+}