recoll / Code / Diff of /src/utils/mimeparse.cpp

Diff of /src/utils/mimeparse.cpp [6f66d9] .. [6cfe82]

Switch to side-by-side view

--- a/src/utils/mimeparse.cpp
+++ b/src/utils/mimeparse.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.2 2005-03-17 14:02:06 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.3 2005-03-25 09:40:28 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 
 #ifndef TEST_MIMEPARSE
@@ -7,71 +7,345 @@
 #include <string>
 #include <ctype.h>
 #include <stdio.h>
+#include <ctype.h>
 
 #include "mimeparse.h"
 
 using namespace std;
-#define WHITE " \t\n"
-
-static void stripw_lc(string &in)
-{
-    // fprintf(stderr, "In: '%s'\n", in.c_str());
-    string::size_type pos, pos1;
-    pos = in.find_first_not_of(WHITE);
-    if (pos == string::npos) {
-	// All white
-	in = "";
-	return;
-    }
-    in.replace(0, pos, "");
-    pos1 = in.find_last_not_of(WHITE); 
-    if (pos1 != in.length() -1)
-	in  = in.replace(pos1+1, string::npos, "");
-    string::iterator i;
-    for (i = in.begin(); i != in.end(); i++)
-	*i = tolower(*i);
-}
-
-MimeHeaderValue parseMimeHeaderValue(const string &ein)
-{
-    string in = ein;
-    MimeHeaderValue out;
-    string::size_type pos;
-
-    pos = in.find_first_not_of(WHITE);
-    if (pos == string::npos)
-	return out;
-    in = in.substr(pos, string::npos);
-    if ((pos = in.find_first_of(";")) == string::npos) {
-	out.value = in;
-	return out;
-    } 
-    out.value = in.substr(0, pos);
-    stripw_lc(out.value);
-    in = in.substr(pos+1, string::npos);
+
+// Parsing a header value. Only content-type has parameters, but the
+// other headers are compatible with the content-type syntax; they simply
+// do not use parameters. So we can parse them all like content-type:
+//    headertype: value [; paramname=paramvalue] ...
+// The value and the paramvalues can be quoted strings, and there can
+// be comments in between.
+
+
+
+// The lexical token returned by find_next_token
+class Lexical {
+ public:
+    enum kind {none, token, separator};
+    kind   what;
+    string value;
+    string error;
+    char quote;
+    Lexical() : what(none), quote(0) {}
+    void reset() {what = none; value.erase(); error.erase();quote = 0;}
+};
+
+// Skip mime comment. This must be called with in[start] == '('
+int skip_comment(const string &in, unsigned int start, Lexical &lex)
+{
+    int commentlevel = 0;
+    for (; start < in.size(); start++) {
+	if (in[start] == '\\') {
+	    // Skip escaped char. 
+	    if (start+1 < in.size()) {
+		start++;
+		continue;
+	    } else {
+		lex.error.append("\\ at end of string ");
+		return string::npos;
+	    }
+	}
+	if (in[start] == '(')
+	    commentlevel++;
+	if (in[start] == ')') {
+	    if (--commentlevel == 0)
+		break;
+	}
+    }
+    if (start == in.size()) {
+	lex.error.append("Unclosed comment ");
+	return string::npos;
+    }
+    return start;
+}
+
+// Skip initial whitespace and (possibly nested) comments. 
+int skip_whitespace_and_comment(const string &in, unsigned int start, 
+				Lexical &lex)
+{
+    while (1) {
+	if ((start = in.find_first_not_of(" \t\r\n", start)) == string::npos)
+	    return in.size();
+	if (in[start] == '(') {
+	    if ((start = skip_comment(in, start, lex)) == string::npos)
+		return string::npos;
+	} else {
+	    break;
+	}
+    }
+    return start;
+}
+
+/// Find next token in mime header value string. 
+/// @return the next starting position in string, string::npos for error 
+///   (ie unbalanced quoting)
+/// @param in the input string
+/// @param start the starting position
+/// @param lex  the returned token and its description
+/// @param delims separators we should look for
+int find_next_token(const string &in, unsigned int start, 
+		    Lexical &lex, string delims = ";=")
+{
+    char oquot, cquot;
+
+    start = skip_whitespace_and_comment(in, start, lex);
+    if (start == string::npos || start == in.size())
+	return start;
+
+    // Begins with separator ? return it.
+    unsigned int delimi = delims.find_first_of(in[start]);
+    if (delimi != string::npos) {
+	lex.what = Lexical::separator;
+	lex.value = delims[delimi];
+	return start+1;
+    }
+
+    // Check for start of quoted string
+    oquot = in[start];
+    switch (oquot) {
+    case '<': cquot = '>';break;
+    case '"': cquot = '"';break;
+    default: cquot = 0; break;
+    }
+
+    if (cquot != 0) {
+	// Quoted string parsing
+	unsigned int end;
+	start++; // Skip quote character
+	for (end = start;end < in.size() && in[end] != cquot; end++) {
+	    if (in[end] == '\\') {
+		// Skip escaped char. 
+		if (end+1 < in.size()) {
+		    end++;
+		} else {
+		    // backslash at end of string: error
+		    lex.error.append("\\ at end of string ");
+		    return string::npos;
+		}
+	    }
+	}
+	if (end == in.size()) {
+	    // Found end of string before closing quote character: error
+	    lex.error.append("Unclosed quoted string ");
+	    return string::npos;
+	}
+	lex.what = Lexical::token;
+	lex.value = in.substr(start, end-start);
+	lex.quote = oquot;
+	return ++end;
+    } else {
+	unsigned int end = in.find_first_of(delims + " \t(", start);
+	lex.what = Lexical::token;
+	lex.quote = 0;
+	if (end == string::npos) {
+	    end = in.size();
+	    lex.value = in.substr(start);
+	} else {
+	    lex.value = in.substr(start, end-start);
+	}
+	return end;
+    }
+}
+
+void stringtolower(string &out, const string& in)
+{
+    for (unsigned int i = 0; i < in.size(); i++)
+	out.append(1, char(tolower(in[i])));
+}
+
+bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed)
+{
+    parsed.value.erase();
+    parsed.params.clear();
+
+    Lexical lex;
+    unsigned int start = 0;
+    start = find_next_token(value, start, lex);
+    if (start == string::npos || lex.what != Lexical::token) 
+	return false;
+
+    parsed.value = lex.value;
+
     for (;;) {
-	// Skip whitespace
-	if ((pos = in.find_first_not_of(WHITE)) == string::npos)
-	    return out;
-	in = in.substr(pos, string::npos);
-
-	if ((pos = in.find_first_of("=")) == string::npos)
-	    return out;
-	string pname = in.substr(0, pos);
-	stripw_lc(pname);
-	in = in.substr(pos+1, string::npos);
-
-	pos = in.find_first_of(";");
-	string pvalue = in.substr(0, pos);
-	stripw_lc(pvalue);
-	out.params[pname] = pvalue;
-	if (pos == string::npos)
-	    return out;
-	in = in.substr(pos+1, string::npos);
-    }
-
-    return out;
-
+	string paramname, paramvalue;
+	lex.reset();
+	start = find_next_token(value, start, lex);
+	if (start == value.size())
+	    return true;
+	if (start == string::npos)
+	    return false;
+	if (lex.what == Lexical::separator && lex.value[0] == ';')
+	    continue;
+	if (lex.what != Lexical::token) 
+	    return false;
+	stringtolower(paramname, lex.value);
+
+	start = find_next_token(value, start, lex);
+	if (start == string::npos || lex.what != Lexical::separator || 
+	    lex.value[0] != '=') 
+	    return false;
+
+	start = find_next_token(value, start, lex);
+	if (start == string::npos || lex.what != Lexical::token)
+	    return false;
+	paramvalue = lex.value;
+	parsed.params[paramname] = paramvalue;
+    }
+    return true;
+}
+
+// Decode a string encoded with quoted-printable encoding. 
+bool qp_decode(const string& in, string &out) 
+{
+    out.reserve(in.length());
+    unsigned int ii;
+    for (ii = 0; ii < in.length(); ii++) {
+	if (in[ii] == '=') {
+	    ii++; // Skip '='
+	    if(ii >= in.length() - 1) { // Need at least 2 more chars
+		break;
+	    } else if (in[ii] == '\r' && in[ii+1] == '\n') { // Soft nl, skip
+		ii++;
+	    } else if (in[ii] != '\n' && in[ii] != '\r') { // decode
+		char c = in[ii];
+		char co;
+		if(c >= 'A' && c <= 'F') {
+		    co = char((c - 'A' + 10) * 16);
+		} else if (c >= 'a' && c <= 'f') {
+		    co = char((c - 'a' + 10) * 16);
+		} else if (c >= '0' && c <= '9') {
+		    co = char((c - '0') * 16);
+		} else {
+		    return false;
+		}
+		if(++ii >= in.length()) 
+		    break;
+		c = in[ii];
+		if (c >= 'A' && c <= 'F') {
+		    co += char(c - 'A' + 10);
+		} else if (c >= 'a' && c <= 'f') {
+		    co += char(c - 'a' + 10);
+		} else if (c >= '0' && c <= '9') {
+		    co += char(c - '0');
+		} else {
+		    return false;
+		}
+		out += co;
+	    }
+	} else {
+	    out += in[ii];
+	}
+    }
+    return true;
+}
+
+
+// This is adapted from FreeBSD's code.
+static const char Base64[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char Pad64 = '=';
+bool base64_decode(const string& in, string& out)
+{
+    int io = 0, state = 0, ch;
+    char *pos;
+    unsigned int ii = 0;
+    out.reserve(in.length());
+
+    for (ii = 0; ii < in.length(); ii++) {
+	ch = in[ii];
+	if (isspace((unsigned char)ch))        /* Skip whitespace anywhere. */
+	    continue;
+
+	if (ch == Pad64)
+	    break;
+
+	pos = strchr(Base64, ch);
+	if (pos == 0) 		/* A non-base64 character. */
+	    return false;
+
+	switch (state) {
+	case 0:
+	    out[io] = (pos - Base64) << 2;
+	    state = 1;
+	    break;
+	case 1:
+	    out[io]   |=  (pos - Base64) >> 4;
+	    out[io+1]  = ((pos - Base64) & 0x0f) << 4 ;
+	    io++;
+	    state = 2;
+	    break;
+	case 2:
+	    out[io]   |=  (pos - Base64) >> 2;
+	    out[io+1]  = ((pos - Base64) & 0x03) << 6;
+	    io++;
+	    state = 3;
+	    break;
+	case 3:
+	    out[io] |= (pos - Base64);
+	    io++;
+	    state = 0;
+	    break;
+	default:
+	    return false;
+	}
+    }
+
+    /*
+     * We are done decoding Base-64 chars.  Let's see if we ended
+     * on a byte boundary, and/or with erroneous trailing characters.
+     */
+
+    if (ch == Pad64) {		/* We got a pad char. */
+	ch = in[ii++];		/* Skip it, get next. */
+	switch (state) {
+	case 0:		/* Invalid = in first position */
+	case 1:		/* Invalid = in second position */
+	    return false;
+
+	case 2:		/* Valid, means one byte of info */
+			/* Skip any number of spaces. */
+	    for (; ii < in.length(); ch = in[ii++])
+		if (!isspace((unsigned char)ch))
+		    break;
+	    /* Make sure there is another trailing = sign. */
+	    if (ch != Pad64)
+		return false;
+	    ch = in[ii++];		/* Skip the = */
+	    /* Fall through to "single trailing =" case. */
+	    /* FALLTHROUGH */
+
+	case 3:		/* Valid, means two bytes of info */
+			/*
+			 * We know this char is an =.  Is there anything but
+			 * whitespace after it?
+			 */
+	    for ((void)NULL; ii < in.length(); ch = in[ii++])
+		if (!isspace((unsigned char)ch))
+		    return false;
+
+	    /*
+	     * Now make sure for cases 2 and 3 that the "extra"
+	     * bits that slopped past the last full byte were
+	     * zeros.  If we don't check them, they become a
+	     * subliminal channel.
+	     */
+	    if (out[io] != 0)
+		return false;
+	}
+    } else {
+	/*
+	 * We ended by seeing the end of the string.  Make sure we
+	 * have no partial bytes lying around.
+	 */
+	if (state != 0)
+	    return false;
+    }
+
+    return true;
 }
 
 #else 
@@ -82,19 +356,47 @@
 int
 main(int argc, const char **argv)
 {
+#if 0
+    //    const char *tr = "text/html; charset=utf-8; otherparam=garb";
+    const char *tr = "text/html;charset = UTF-8 ; otherparam=garb; \n"
+	"QUOTEDPARAM=\"quoted value\"";
 
     MimeHeaderValue parsed;
 
-    //    const char *tr = "text/html; charset=utf-8; otherparam=garb";
-    const char *tr = "text/html;charset = UTF-8 ; otherparam=garb;";
-
-    parsed = parseMimeHeaderValue(tr);
+    if (!parseMimeHeaderValue(tr, parsed)) {
+	fprintf(stderr, "PARSE ERROR\n");
+    }
     
     printf("'%s' \n", parsed.value.c_str());
     map<string, string>::iterator it;
     for (it = parsed.params.begin();it != parsed.params.end();it++) {
 	printf("  '%s' = '%s'\n", it->first.c_str(), it->second.c_str());
     }
+#elif 0
+    const char *qp = "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme "
+	"agrave is: '=E0' probable skipped decode error: =\n"
+	"Actual decode error =xx this wont show";
+
+    string out;
+    if (!qp_decode(string(qp), out)) {
+	fprintf(stderr, "qp_decode returned error\n");
+    }
+    printf("Decoded: '%s'\n", out.c_str());
+#else
+    //'C'est à boire qu'il nous faut éviter l'excès.'
+    //'Deuxième ligne'
+    //'Troisième ligne'
+    //'Et la fin (pas de nl). '
+    const char *b64 = 
+ "Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n"
+	"IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n";
+
+    string out;
+    if (!base64_decode(string(b64), out)) {
+	fprintf(stderr, "base64_decode returned error\n");
+    }
+    printf("Decoded: '%s'\n", out.c_str());
+#endif
 }
 
 #endif // TEST_MIMEPARSE