recoll / Code / [763b5f] /src/utils/mimeparse.cpp

[763b5f]: src / utils / mimeparse.cpp History

mimeparse.cpp 557 lines (504 with data), 13.7 kB

#ifndef lint
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.4 2005-10-15 12:18:04 dockes Exp $ (C) 2004 J.F.Dockes";
#endif

#ifndef TEST_MIMEPARSE

#include <string>
#include <ctype.h>
#include <stdio.h>
#include <ctype.h>

#include "mimeparse.h"

using namespace std;

// Parsing a header value. Only content-type has parameters, but
// others are compatible with content-type syntax, only, parameters
// are not used. So we can parse all like content-type:
//    headertype: value [; paramname=paramvalue] ...
// Value and paramvalues can be quoted strings, and there can be
// comments in there



// The lexical token returned by find_next_token
class Lexical {
 public:
    enum kind {none, token, separator};
    kind   what;
    string value;
    string error;
    char quote;
    Lexical() : what(none), quote(0) {}
    void reset() {what = none; value.erase(); error.erase();quote = 0;}
};

// Skip mime comment. This must be called with in[start] == '('
int skip_comment(const string &in, unsigned int start, Lexical &lex)
{
    int commentlevel = 0;
    for (; start < in.size(); start++) {
	if (in[start] == '\\') {
	    // Skip escaped char. 
	    if (start+1 < in.size()) {
		start++;
		continue;
	    } else {
		lex.error.append("\\ at end of string ");
		return string::npos;
	    }
	}
	if (in[start] == '(')
	    commentlevel++;
	if (in[start] == ')') {
	    if (--commentlevel == 0)
		break;
	}
    }
    if (start == in.size()) {
	lex.error.append("Unclosed comment ");
	return string::npos;
    }
    return start;
}

// Skip initial whitespace and (possibly nested) comments. 
int skip_whitespace_and_comment(const string &in, unsigned int start, 
				Lexical &lex)
{
    while (1) {
	if ((start = in.find_first_not_of(" \t\r\n", start)) == string::npos)
	    return in.size();
	if (in[start] == '(') {
	    if ((start = skip_comment(in, start, lex)) == string::npos)
		return string::npos;
	} else {
	    break;
	}
    }
    return start;
}

/// Find next token in mime header value string. 
/// @return the next starting position in string, string::npos for error 
///   (ie unbalanced quoting)
/// @param in the input string
/// @param start the starting position
/// @param lex  the returned token and its description
/// @param delims separators we should look for
int find_next_token(const string &in, unsigned int start, 
		    Lexical &lex, string delims = ";=")
{
    char oquot, cquot;

    start = skip_whitespace_and_comment(in, start, lex);
    if (start == string::npos || start == in.size())
	return start;

    // Begins with separator ? return it.
    unsigned int delimi = delims.find_first_of(in[start]);
    if (delimi != string::npos) {
	lex.what = Lexical::separator;
	lex.value = delims[delimi];
	return start+1;
    }

    // Check for start of quoted string
    oquot = in[start];
    switch (oquot) {
    case '<': cquot = '>';break;
    case '"': cquot = '"';break;
    default: cquot = 0; break;
    }

    if (cquot != 0) {
	// Quoted string parsing
	unsigned int end;
	start++; // Skip quote character
	for (end = start;end < in.size() && in[end] != cquot; end++) {
	    if (in[end] == '\\') {
		// Skip escaped char. 
		if (end+1 < in.size()) {
		    end++;
		} else {
		    // backslash at end of string: error
		    lex.error.append("\\ at end of string ");
		    return string::npos;
		}
	    }
	}
	if (end == in.size()) {
	    // Found end of string before closing quote character: error
	    lex.error.append("Unclosed quoted string ");
	    return string::npos;
	}
	lex.what = Lexical::token;
	lex.value = in.substr(start, end-start);
	lex.quote = oquot;
	return ++end;
    } else {
	unsigned int end = in.find_first_of(delims + " \t(", start);
	lex.what = Lexical::token;
	lex.quote = 0;
	if (end == string::npos) {
	    end = in.size();
	    lex.value = in.substr(start);
	} else {
	    lex.value = in.substr(start, end-start);
	}
	return end;
    }
}

void stringtolower(string &out, const string& in)
{
    for (unsigned int i = 0; i < in.size(); i++)
	out.append(1, char(tolower(in[i])));
}

bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed)
{
    parsed.value.erase();
    parsed.params.clear();

    Lexical lex;
    unsigned int start = 0;
    start = find_next_token(value, start, lex);
    if (start == string::npos || lex.what != Lexical::token) 
	return false;

    parsed.value = lex.value;

    for (;;) {
	string paramname, paramvalue;
	lex.reset();
	start = find_next_token(value, start, lex);
	if (start == value.size())
	    return true;
	if (start == string::npos)
	    return false;
	if (lex.what == Lexical::separator && lex.value[0] == ';')
	    continue;
	if (lex.what != Lexical::token) 
	    return false;
	stringtolower(paramname, lex.value);

	start = find_next_token(value, start, lex);
	if (start == string::npos || lex.what != Lexical::separator || 
	    lex.value[0] != '=') 
	    return false;

	start = find_next_token(value, start, lex);
	if (start == string::npos || lex.what != Lexical::token)
	    return false;
	paramvalue = lex.value;
	parsed.params[paramname] = paramvalue;
    }
    return true;
}

// Decode a string encoded with quoted-printable encoding. 
bool qp_decode(const string& in, string &out) 
{
    out.reserve(in.length());
    unsigned int ii;
    for (ii = 0; ii < in.length(); ii++) {
	if (in[ii] == '=') {
	    ii++; // Skip '='
	    if(ii >= in.length() - 1) { // Need at least 2 more chars
		break;
	    } else if (in[ii] == '\r' && in[ii+1] == '\n') { // Soft nl, skip
		ii++;
	    } else if (in[ii] != '\n' && in[ii] != '\r') { // decode
		char c = in[ii];
		char co;
		if(c >= 'A' && c <= 'F') {
		    co = char((c - 'A' + 10) * 16);
		} else if (c >= 'a' && c <= 'f') {
		    co = char((c - 'a' + 10) * 16);
		} else if (c >= '0' && c <= '9') {
		    co = char((c - '0') * 16);
		} else {
		    return false;
		}
		if(++ii >= in.length()) 
		    break;
		c = in[ii];
		if (c >= 'A' && c <= 'F') {
		    co += char(c - 'A' + 10);
		} else if (c >= 'a' && c <= 'f') {
		    co += char(c - 'a' + 10);
		} else if (c >= '0' && c <= '9') {
		    co += char(c - '0');
		} else {
		    return false;
		}
		out += co;
	    }
	} else {
	    out += in[ii];
	}
    }
    return true;
}


// This is adapted from FreeBSD's code.
static const char Base64[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
static const char Pad64 = '=';
bool base64_decode(const string& in, string& out)
{
    int io = 0, state = 0, ch;
    char *pos;
    unsigned int ii = 0;
    out.reserve(in.length());

    for (ii = 0; ii < in.length(); ii++) {
	ch = in[ii];
	if (isspace((unsigned char)ch))        /* Skip whitespace anywhere. */
	    continue;

	if (ch == Pad64)
	    break;

	pos = strchr(Base64, ch);
	if (pos == 0) 		/* A non-base64 character. */
	    return false;

	switch (state) {
	case 0:
	    out[io] = (pos - Base64) << 2;
	    state = 1;
	    break;
	case 1:
	    out[io]   |=  (pos - Base64) >> 4;
	    out[io+1]  = ((pos - Base64) & 0x0f) << 4 ;
	    io++;
	    state = 2;
	    break;
	case 2:
	    out[io]   |=  (pos - Base64) >> 2;
	    out[io+1]  = ((pos - Base64) & 0x03) << 6;
	    io++;
	    state = 3;
	    break;
	case 3:
	    out[io] |= (pos - Base64);
	    io++;
	    state = 0;
	    break;
	default:
	    return false;
	}
    }

    /*
     * We are done decoding Base-64 chars.  Let's see if we ended
     * on a byte boundary, and/or with erroneous trailing characters.
     */

    if (ch == Pad64) {		/* We got a pad char. */
	ch = in[ii++];		/* Skip it, get next. */
	switch (state) {
	case 0:		/* Invalid = in first position */
	case 1:		/* Invalid = in second position */
	    return false;

	case 2:		/* Valid, means one byte of info */
			/* Skip any number of spaces. */
	    for (; ii < in.length(); ch = in[ii++])
		if (!isspace((unsigned char)ch))
		    break;
	    /* Make sure there is another trailing = sign. */
	    if (ch != Pad64)
		return false;
	    ch = in[ii++];		/* Skip the = */
	    /* Fall through to "single trailing =" case. */
	    /* FALLTHROUGH */

	case 3:		/* Valid, means two bytes of info */
			/*
			 * We know this char is an =.  Is there anything but
			 * whitespace after it?
			 */
	    for ((void)NULL; ii < in.length(); ch = in[ii++])
		if (!isspace((unsigned char)ch))
		    return false;

	    /*
	     * Now make sure for cases 2 and 3 that the "extra"
	     * bits that slopped past the last full byte were
	     * zeros.  If we don't check them, they become a
	     * subliminal channel.
	     */
	    if (out[io] != 0)
		return false;
	}
    } else {
	/*
	 * We ended by seeing the end of the string.  Make sure we
	 * have no partial bytes lying around.
	 */
	if (state != 0)
	    return false;
    }

    return true;
}

#include "transcode.h"
#include "smallut.h"

// Decode a parsed encoded word
static bool rfc2047_decodeParsed(const std::string& charset, 
				 const std::string& encoding, 
				 const std::string& value, 
				 std::string &utf8)
{
    //    fprintf(stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n",
    //	    charset.c_str(), encoding.c_str(), value.c_str());
    utf8 = "";

    string decoded;
    if (!stringlowercmp("b", encoding)) {
	if (!base64_decode(value, decoded))
	    return false;
	//	fprintf(stderr, "FromB64: [%s]\n", decoded.c_str());
    } else if (!stringlowercmp("q", encoding)) {
	if (!qp_decode(value, decoded))
	    return false;
	// Need to translate _ to ' ' here
	string temp;
	for (string::size_type pos = 0; pos < decoded.length(); pos++)
	    if (decoded[pos] == '_')
		temp += ' ';
	    else 
		temp += decoded[pos];
	decoded = temp;
	//	fprintf(stderr, "FromQP: [%s]\n", decoded.c_str());
    } else {
	//	fprintf(stderr, "Bad encoding [%s]\n", encoding.c_str());
	return false;
    }

    if (!transcode(decoded, utf8, charset, "UTF-8")) {
	//	fprintf(stderr, "Transcode failed\n");
	return false;
    }
    return true;
}

// Parse a mail header encoded value
typedef enum  {rfc2047base, rfc2047open_eq, rfc2047charset, rfc2047encoding, 
	       rfc2047value, rfc2047close_q} Rfc2047States;

bool rfc2047_decode(const std::string& in, std::string &out) 
{
    Rfc2047States state = rfc2047base;
    string encoding, charset, value, utf8;

    out = "";

    for (unsigned int ii = 0; ii < in.length(); ii++) {
	char ch = in[ii];
	switch (state) {
	case rfc2047base: 
	    {
		switch (ch) {
		case '=': state = rfc2047open_eq; break;
		default: value += ch;
		}
	    }
	    break;
	case rfc2047open_eq: 
	    {
		switch (ch) {
		case '?': 
		    {
			// Transcode current (unencoded part) value:
			// we sometimes find 8-bit chars in
			// there. Interpret as Iso8859.
			if (value.length() > 0) {
			    transcode(value, utf8, "ISO8859-1", "UTF-8");
			    out += utf8;
			    value = "";
			}
			state = rfc2047charset; 
		    }
		    break;
		default: state = rfc2047base; out += '='; out += ch;break;
		}
	    } 
	    break;
	case rfc2047charset: 
	    {
		switch (ch) {
		case '?': state = rfc2047encoding; break;
		default: charset += ch; break;
		}
	    } 
	    break;
	case rfc2047encoding: 
	    {
		switch (ch) {
		case '?': state = rfc2047value; break;
		default: encoding += ch; break;
		}
	    }
	    break;
	case rfc2047value: 
	    {
		switch (ch) {
		case '?': state = rfc2047close_q; break;
		default: value += ch;break;
		}
	    }
	    break;
	case rfc2047close_q: 
	    {
		switch (ch) {
		case '=': 
		    {
			string utf8;
			state = rfc2047base; 
			if (!rfc2047_decodeParsed(charset, encoding, value, 
						  utf8)) {
			    return false;
			}
			out += utf8;
			charset = encoding = value = "";
		    }
		    break;
		default: state = rfc2047value; value += '?';value += ch;break;
		}
	    }
	    break;
	default: // ??
	    return false;
	}
    }

    if (value.length() > 0) {
	transcode(value, utf8, "ISO8859-1", "UTF-8");
	out += utf8;
	value = "";
    }
    if (state != rfc2047base) 
	return false;
    return true;
}

#else 

#include <string>
#include "mimeparse.h"
using namespace std;
int
main(int argc, const char **argv)
{
#if 0
    //    const char *tr = "text/html; charset=utf-8; otherparam=garb";
    const char *tr = "text/html;charset = UTF-8 ; otherparam=garb; \n"
	"QUOTEDPARAM=\"quoted value\"";

    MimeHeaderValue parsed;

    if (!parseMimeHeaderValue(tr, parsed)) {
	fprintf(stderr, "PARSE ERROR\n");
    }
    
    printf("'%s' \n", parsed.value.c_str());
    map<string, string>::iterator it;
    for (it = parsed.params.begin();it != parsed.params.end();it++) {
	printf("  '%s' = '%s'\n", it->first.c_str(), it->second.c_str());
    }
#elif 0
    const char *qp = "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme "
	"agrave is: '=E0' probable skipped decode error: =\n"
	"Actual decode error =xx this wont show";

    string out;
    if (!qp_decode(string(qp), out)) {
	fprintf(stderr, "qp_decode returned error\n");
    }
    printf("Decoded: '%s'\n", out.c_str());
#elif 0
    //'C'est � boire qu'il nous faut �viter l'exc�s.'
    //'Deuxi�me ligne'
    //'Troisi�me ligne'
    //'Et la fin (pas de nl). '
    const char *b64 = 
 "Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n"
	"IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n";

    string out;
    if (!base64_decode(string(b64), out)) {
	fprintf(stderr, "base64_decode returned error\n");
    }
    printf("Decoded: '%s'\n", out.c_str());
#elif 1
    char line [1024];
    string out;
    while (fgets(line, 1023, stdin)) {
	int l = strlen(line);
	if (l == 0)
	    continue;
	line[l-1] = 0;
	fprintf(stderr, "Line: [%s]\n", line);
	rfc2047_decode(line, out);
	fprintf(stderr, "Out:  [%s]\n", out.c_str());
    }
#endif
}

#endif // TEST_MIMEPARSE