recoll / Code / [98688b] /src/utils/mimeparse.cpp

[98688b]: src / utils / mimeparse.cpp History

mimeparse.cpp 984 lines (893 with data), 28.4 kB

// Version identification string (expanded RCS $Id$ keyword). It is
// compiled out when the `lint` macro is defined.
#ifndef lint
static char rcsid[] = "@(#$Id: mimeparse.cpp,v 1.21 2008-07-01 11:51:51 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#ifndef TEST_MIMEPARSE

#include <string>
#include <vector>

#include <ctype.h>
#include <stdio.h>
#include <ctype.h>
#include <time.h>
#include <cstdlib>
#include <cstring>

#include "mimeparse.h"
#include "base64.h"
#include "transcode.h"
#include "smallut.h"


// Unless NO_NAMESPACES is defined, the code below refers to standard
// library names (string, vector, ...) without the std:: prefix.
#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */

// Uncomment the following line to turn on the DPRINT debug traces.
//#define DEBUG_MIMEPARSE 
// DPRINT takes ONE argument which is a complete, parenthesized fprintf
// argument list, e.g. DPRINT((stderr, "x=%d\n", x)). It expands to
// nothing unless DEBUG_MIMEPARSE is defined.
#ifdef DEBUG_MIMEPARSE
#define DPRINT(X) fprintf X
#else
#define DPRINT(X)
#endif

// Parsing a header value. Only content-type and content-disposition
// have parameters, but the other headers are compatible with the
// content-type syntax (they just carry no parameters). So we can parse
// them all like:
//
//    headertype: value [; paramname=paramvalue] ...
//
// Value and paramvalues can be quoted strings, and there can be
// comments too. Note that RFC2047 is explicitly forbidden for
// parameter values (RFC2231 must be used), but I have seen it used
// anyway (ie: thunderbird 1.0)
//
// Ref: RFC2045/6/7 (MIME) RFC2183/2231 (content-disposition and encodings)



/** Decode a MIME parameter value encoded according to rfc2231
 *
 * Example input when the input charset is empty:
 *     [iso-8859-1'french'RE%A0%3A_Smoke_Tests%20bla]
 * Or, when the charset is already set: RE%A0%3A_Smoke_Tests%20bla
 *
 * @param in input string, ascii with rfc2231 markup
 * @param out output string, receives the result of transcoding the
 *      %-decoded data from 'charset' to UTF-8
 * @param charset if empty on input, 'in' is expected to look like
 *      charset'lang'more%20stuff and 'charset' is set from its first
 *      field (the language field is skipped). If not empty, the whole of
 *      'in' is taken to be the %XX encoded part.
 * @return false if 'charset' was empty and one of the two single quote
 *      delimiters is missing, or if the transcoding fails. true else.
 */
bool rfc2231_decode(const string &in, string &out, string &charset)
{
    // Offset of the %-encoded payload inside 'in'
    string::size_type payload = 0;

    if (charset.empty()) {
        // First field: the character set name
        const string::size_type firstQuote = in.find('\'');
        if (firstQuote == string::npos)
            return false;
        charset = in.substr(0, firstQuote);

        // Second field: the language. We have no use for it for now,
        // just step over it.
        const string::size_type secondQuote = in.find('\'', firstQuote + 1);
        if (secondQuote == string::npos)
            return false;
        payload = secondQuote + 1;
    }

    // Undo the %XX escapes (the qp_decode() result is not checked),
    // then convert to UTF-8
    string decoded;
    qp_decode(in.substr(payload), decoded, '%');
    return transcode(decoded, out, charset, "UTF-8") ? true : false;
}


/////////////////////////////////////////
/// Decoding of MIME fields values and parameters

// One lexical element as returned by find_next_token(): either a token
// (bare or quoted string) or a single separator character.
class Lexical {
public:
    enum kind {none, token, separator};

    kind   what;   // What was found, 'none' until something is stored
    string value;  // Token text (quotes stripped) or separator character
    string error;  // Accumulated error messages, empty if all went well
    char   quote;  // Opening quote character for a quoted token, else 0

    Lexical()
        : what(none), value(), error(), quote(0)
    {
    }

    // Return to the freshly constructed state
    void reset()
    {
        what = none;
        value.clear();
        error.clear();
        quote = 0;
    }
};

// Skip a (possibly nested) mime comment. This must be called with
// in[start] == '('.
//
// @return the position just after the ')' which closes the comment. If
//   the comment is not closed, or if the string ends with a backslash,
//   a message is appended to lex.error and in.size() is returned.
//
// Note: the position returned has to be the one FOLLOWING the closing
// parenthesis. Returning the position of the ')' itself would let the
// caller see the ')' as the first character of the next token.
static string::size_type 
skip_comment(const string &in, string::size_type start, Lexical &lex)
{
    int depth = 0;
    for (; start < in.size(); start++) {
        const char c = in[start];
        if (c == '\\') {
            // Backslash: the next character is escaped and can't open
            // or close a comment.
            if (start + 1 >= in.size()) {
                lex.error.append("\\ at end of string ");
                return in.size();
            }
            start++;
            continue;
        }
        if (c == '(') {
            depth++;
        } else if (c == ')') {
            if (--depth == 0)
                return start + 1;
        }
    }
    // Reached the end of the string without closing all the levels
    if (depth != 0)
        lex.error.append("Unclosed comment ");
    return in.size();
}

// Skip initial whitespace (space, tab, CR, LF) and comments.
// @return the position of the first character which is neither
//   whitespace nor the start of a comment, in.size() if there is nothing
//   else in the string, or string::npos if skip_comment() returns
//   string::npos.
static string::size_type 
skip_whitespace_and_comment(const string &in, string::size_type start, 
                            Lexical &lex)
{
    for (;;) {
        start = in.find_first_not_of(" \t\r\n", start);
        if (start == string::npos) {
            // Nothing but whitespace up to the end
            return in.size();
        }
        if (in[start] != '(') {
            // Found the beginning of actual data
            return start;
        }
        // Comment: let skip_comment() process it, then look again from
        // the position it returns.
        start = skip_comment(in, start, lex);
        if (start == string::npos)
            return string::npos;
    }
}

/// Extract the next lexical element from a mime header value string.
///
/// Leading whitespace and comments are skipped, then the result is one of:
///  - a separator: one of the characters in delims,
///  - a quoted token: text between double quotes or between '<' and '>'.
///    The enclosing quotes are stripped, backslash escapes inside the
///    text are kept as they are,
///  - a bare token: runs up to the next delimiter, whitespace or '('.
/// If nothing is left in the input, lex is not modified.
///
/// @return the next starting position in string, in.size() if the input
///    is exhausted, string::npos for error (unclosed quoted string or
///    backslash at end of string, a message is appended to lex.error)
/// @param in the input string
/// @param start the starting position
/// @param lex  the returned token and its description
/// @param delims separators we should look for
static string::size_type 
find_next_token(const string &in, string::size_type start, 
                Lexical &lex, string delims = ";=")
{
    start = skip_whitespace_and_comment(in, start, lex);
    if (start == string::npos || start == in.size())
        return in.size();

    const char first = in[start];

    // Separator ? Return it as a one character value
    const string::size_type delimIdx = delims.find_first_of(first);
    if (delimIdx != string::npos) {
        lex.what = Lexical::separator;
        lex.value = delims[delimIdx];
        return start + 1;
    }

    // Closing character to look for if this is a quoted string, 0 else
    char closer = 0;
    if (first == '<')
        closer = '>';
    else if (first == '"')
        closer = '"';

    if (closer == 0) {
        // Bare token: ends at the first delimiter, whitespace or
        // comment start, or at the end of the string
        const string::size_type stop = 
            in.find_first_of(delims + "\r\n \t(", start);
        lex.what = Lexical::token;
        lex.quote = 0;
        if (stop == string::npos) {
            lex.value = in.substr(start);
            return in.size();
        }
        lex.value = in.substr(start, stop - start);
        return stop;
    }

    // Quoted string: look for the closing character, stepping over
    // backslash-escaped characters
    const string::size_type body = start + 1;
    string::size_type pos = body;
    while (pos < in.size() && in[pos] != closer) {
        if (in[pos] == '\\') {
            if (pos + 1 >= in.size()) {
                // backslash at end of string: error
                lex.error.append("\\ at end of string ");
                return string::npos;
            }
            // Step over the backslash, the escaped char goes next
            pos++;
        }
        pos++;
    }
    if (pos == in.size()) {
        // Found end of string before closing quote character: error
        lex.error.append("Unclosed quoted string ");
        return string::npos;
    }
    lex.what = Lexical::token;
    lex.value = in.substr(body, pos - body);
    lex.quote = first;
    return pos + 1;
}

// Helper types for handling rfc2231 parameter value continuations.

// One piece of a parameter value.
struct Chunk {
    bool   decode;  // Set if the piece was flagged as encoded (name ends with *)
    string value;   // Raw text of the piece

    Chunk()
        : decode(false), value()
    {
    }
};

// All the pieces for one parameter name, stored at their index.
struct Chunks {
    vector<Chunk> chunks;
};

// Append a lowercased copy of 'in' to 'out'.
//
// Note that 'out' is NOT cleared: the converted characters are added
// after whatever it already holds.
// Each character goes through tolower() as an unsigned char value:
// calling tolower() with a negative value (8 bit character on a platform
// where char is signed) is undefined behaviour.
void stringtolower(string &out, const string& in)
{
    out.reserve(out.size() + in.size());
    for (string::size_type i = 0; i < in.size(); i++) {
        const unsigned char uc = static_cast<unsigned char>(in[i]);
        out.append(1, char(tolower(uc)));
    }
}

// Parse MIME field value. Should look like:
//  somevalue ; param1=val1;param2=val2
bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed)
{
    parsed.value.erase();
    parsed.params.clear();

    Lexical lex;
    string::size_type start = 0;

    // Get the field value
    start = find_next_token(value, start, lex);
    if (start == string::npos || lex.what != Lexical::token) 
	return false;
    parsed.value = lex.value;

    map<string, string> rawparams;
    // Look for parameters
    for (;;) {
	string paramname, paramvalue;
	lex.reset();
	start = find_next_token(value, start, lex);
	if (start == value.size())
	    break;
	if (start == string::npos) {
	    //fprintf(stderr, "Find_next_token error(1)\n");
	    return false;
	}
	if (lex.what == Lexical::separator && lex.value[0] == ';')
	    continue;
	if (lex.what != Lexical::token) 
	    return false;
	stringtolower(paramname, lex.value);

	start = find_next_token(value, start, lex);
	if (start == string::npos || lex.what != Lexical::separator || 
	    lex.value[0] != '=') {
	    //fprintf(stderr, "Find_next_token error (2)\n");
	    return false;
	}

	start = find_next_token(value, start, lex);
	if (start == string::npos || lex.what != Lexical::token) {
	    //fprintf(stderr, "Parameter has no value!");
	    return false;
	}
	paramvalue = lex.value;
	rawparams[paramname] = paramvalue;
	//fprintf(stderr, "RAW: name [%s], value [%s]\n", paramname.c_str(),
	//		paramvalue.c_str());
    }
    //    fprintf(stderr, "Number of raw params %d\n", rawparams.size());

    // RFC2231 handling: 
    // - if a parameter name ends in * it must be decoded 
    // - If a parameter name looks line name*ii[*] it is a
    //   partial value, and must be concatenated with other such.
    
    map<string, Chunks> chunks;
    for (map<string, string>::const_iterator it = rawparams.begin(); 
	 it != rawparams.end(); it++) {
	string nm = it->first;
	//	fprintf(stderr, "NM: [%s]\n", nm.c_str());
	if (nm.empty()) // ??
	    continue;

	Chunk chunk;
	if (nm[nm.length()-1] == '*') {
	    nm.erase(nm.length() - 1);
	    chunk.decode = true;
	} else
	    chunk.decode = false;
	//	fprintf(stderr, "NM1: [%s]\n", nm.c_str());

	chunk.value = it->second;

	// Look for another asterisk in nm. If none, assign index 0
	string::size_type aster;
	int idx = 0;
	if ((aster = nm.rfind("*")) != string::npos) {
	    string num = nm.substr(aster+1);
	    //fprintf(stderr, "NUM: [%s]\n", num.c_str());
	    nm.erase(aster);
	    idx = atoi(num.c_str());
	}
	Chunks empty;
	if (chunks.find(nm) == chunks.end())
	    chunks[nm] = empty;
	chunks[nm].chunks.resize(idx+1);
	chunks[nm].chunks[idx] = chunk;
	//fprintf(stderr, "CHNKS: nm [%s], idx %d, decode %d, value [%s]\n", 
	// nm.c_str(), idx, int(chunk.decode), chunk.value.c_str());
    }

    // For each parameter name, concatenate its chunks and possibly
    // decode Note that we pass the whole concatenated string to
    // decoding if the first chunk indicates that decoding is needed,
    // which is not right because there might be uncoded chunks
    // according to the rfc.
    for (map<string, Chunks>::const_iterator it = chunks.begin(); 
	 it != chunks.end(); it++) {
	if (it->second.chunks.empty())
	    continue;
	string nm = it->first;
	// Create the name entry
	if (parsed.params.find(nm) == parsed.params.end())
	    parsed.params[nm].clear();
	// Concatenate all chunks and decode the whole if the first one needs
	// to. Yes, this is not quite right.
	string value;
	for (vector<Chunk>::const_iterator vi = it->second.chunks.begin();
	     vi != it->second.chunks.end(); vi++) {
	    value += vi->value;
	}
	if (it->second.chunks[0].decode) {
	    string charset;
	    rfc2231_decode(value, parsed.params[nm], charset);
	} else {
	    // rfc2047 MUST NOT but IS used by some agents
	    rfc2047_decode(value, parsed.params[nm]);
	}
	//fprintf(stderr, "FINAL: nm [%s], value [%s]\n", 
	//nm.c_str(), parsed.params[nm].c_str());
    }
    
    return true;
}

// Value of an hexadecimal digit character, or -1 if the character is
// not an hexadecimal digit.
static int qp_hexDigitValue(char c)
{
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= '0' && c <= '9')
        return c - '0';
    return -1;
}

// Decode a string encoded with quoted-printable encoding. 
// The escape character is a parameter so that the code can also be used
// for rfc2231 % encoding, even if the end of line processing is not
// useful in this case.
//
// The decoded data is appended to 'out'. An escape character must be
// followed by at least 2 characters, else the decoding just stops there
// (this is not reported as an error). An escape followed by CR LF, or by
// a lone CR or LF, is a soft line break and yields nothing.
//
// @return false if an escape is followed by something which is neither
//   a line break nor 2 hexadecimal digits (what was decoded up to this
//   point stays in 'out'), true else.
bool qp_decode(const string& in, string &out, char esc) 
{
    const string::size_type len = in.length();
    out.reserve(len);

    string::size_type pos = 0;
    while (pos < len) {
        const char cur = in[pos++];
        if (cur != esc) {
            // Ordinary character: straight copy
            out += cur;
            continue;
        }

        // pos is now the index of the character following the escape.
        // Need at least 2 more chars (len is at least 1 here).
        if (pos >= len - 1)
            break;

        const char c1 = in[pos];
        if (c1 == '\r' || c1 == '\n') {
            // Soft nl, skip: 2 characters for CR LF, else 1
            if (c1 == '\r' && in[pos + 1] == '\n')
                pos += 2;
            else
                pos += 1;
            continue;
        }

        // 2 hex digits
        const int hi = qp_hexDigitValue(c1);
        if (hi < 0)
            return false;
        const int lo = qp_hexDigitValue(in[pos + 1]);
        if (lo < 0)
            return false;
        out += char(hi * 16 + lo);
        pos += 2;
    }
    return true;
}

// Decode the text part of an rfc2047 encoded word and convert it to UTF-8.
//
// @param charset character set name from the encoded word
// @param encoding the encoding field: "b" (base64) or "q" (the
//    quoted-printable variant), compared through stringlowercmp().
// @param value the encoded text
// @param utf8 output: decoded text, transcoded to UTF-8. Cleared on entry.
// @return false if the encoding is not one of the above, or if decoding
//    or transcoding fails.
static bool rfc2047_decodeParsed(const std::string& charset, 
                                 const std::string& encoding, 
                                 const std::string& value, 
                                 std::string &utf8)
{
    DPRINT((stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n",
            charset.c_str(), encoding.c_str(), value.c_str()));
    utf8.clear();

    string decoded;
    if (!stringlowercmp("b", encoding)) {
        if (!base64_decode(value, decoded))
            return false;
        DPRINT((stderr, "FromB64: [%s]\n", decoded.c_str()));
    } else if (!stringlowercmp("q", encoding)) {
        // In the "Q" encoding, '_' stands for a space. The translation
        // has to be performed BEFORE the =XX decoding: an underscore
        // which results from decoding =5F is a real underscore and must
        // be left alone.
        string spaced(value);
        for (string::size_type pos = 0; pos < spaced.length(); pos++) {
            if (spaced[pos] == '_')
                spaced[pos] = ' ';
        }
        if (!qp_decode(spaced, decoded))
            return false;
        DPRINT((stderr, "FromQP: [%s]\n", decoded.c_str()));
    } else {
        DPRINT((stderr, "Bad encoding [%s]\n", encoding.c_str()));
        return false;
    }

    if (!transcode(decoded, utf8, charset, "UTF-8")) {
        DPRINT((stderr, "Transcode failed\n"));
        return false;
    }
    return true;
}

// Parse a mail header value encoded according to RFC2047. 
// This is not supposed to be used for MIME parameter values, but it
// happens.
//
// The input is a mix of plain text and of encoded words which look like
//     =?charset?encoding?encoded text?=
// Plain text is taken to be ISO-8859-1 (we sometimes find 8-bit chars in
// there) and converted to UTF-8. Encoded words are decoded by
// rfc2047_decodeParsed().
//
// @param in the header value
// @param out output, UTF-8. Cleared on entry. 
// @return false if an encoded word can't be decoded, or if the input
//    ends inside an encoded word or just after an '='. 'out' then holds
//    what could be decoded.
//
// Bugs: 
//    - We should turn off decoding while inside quoted strings
//

// States for the rfc2047_decode() parser. In input order: plain text,
// just seen '=', then inside each of the 3 fields of the encoded word,
// and last: seen a '?' inside the encoded text, which may begin the
// closing "?=".
typedef enum  {rfc2047ready, rfc2047open_eq, 
               rfc2047charset, rfc2047encoding, 
               rfc2047value, rfc2047close_q} Rfc2047States;

bool rfc2047_decode(const std::string& in, std::string &out) 
{
    DPRINT((stderr, "rfc2047_decode: [%s]\n", in.c_str()));

    Rfc2047States state = rfc2047ready;
    // 'value' accumulates either the current run of plain text or the
    // text field of the current encoded word, depending on the state.
    string encoding, charset, value, utf8;

    out.clear();

    for (string::size_type ii = 0; ii < in.length(); ii++) {
        const char ch = in[ii];
        switch (state) {
        case rfc2047ready: 
            DPRINT((stderr, "STATE: ready, ch %c\n", ch));
            // '=' may begin an encoded word. Anything else is plain text.
            if (ch == '=')
                state = rfc2047open_eq;
            else
                value += ch;
            break;

        case rfc2047open_eq: 
            DPRINT((stderr, "STATE: open_eq, ch %c\n", ch));
            if (ch == '?') {
                // Beginning of encoded word. Transcode and output the
                // current (unencoded part) value.
                if (value.length() > 0) {
                    transcode(value, utf8, "ISO-8859-1", "UTF-8");
                    out += utf8;
                    value.clear();
                }
                state = rfc2047charset; 
            } else {
                // The '=' was plain text. It must go to the plain text
                // accumulator together with the current character, not
                // directly to the output: this would put it in front of
                // the text already waiting in 'value'.
                value += '=';
                value += ch;
                state = rfc2047ready;
            }
            break;

        case rfc2047charset: 
            DPRINT((stderr, "STATE: charset, ch %c\n", ch));
            if (ch == '?')
                state = rfc2047encoding;
            else
                charset += ch;
            break;

        case rfc2047encoding: 
            DPRINT((stderr, "STATE: encoding, ch %c\n", ch));
            if (ch == '?')
                state = rfc2047value;
            else
                encoding += ch;
            break;

        case rfc2047value: 
            DPRINT((stderr, "STATE: value, ch %c\n", ch));
            if (ch == '?')
                state = rfc2047close_q;
            else
                value += ch;
            break;

        case rfc2047close_q: 
            DPRINT((stderr, "STATE: close_q, ch %c\n", ch));
            if (ch == '=') {
                DPRINT((stderr, "End of encoded area. Charset %s, Encoding %s\n", charset.c_str(), encoding.c_str()));
                string word;
                state = rfc2047ready; 
                if (!rfc2047_decodeParsed(charset, encoding, value, word))
                    return false;
                out += word;
                charset.clear();
                encoding.clear();
                value.clear();
            } else {
                // The '?' was part of the encoded text
                state = rfc2047value;
                value += '?';
                value += ch;
            }
            break;

        default: // ??
            DPRINT((stderr, "STATE: default ?? ch %c\n", ch));
            return false;
        }
    }

    // Output what remains in the accumulator
    if (value.length() > 0) {
        transcode(value, utf8, "ISO-8859-1", "UTF-8");
        out += utf8;
        value.clear();
    }
    if (state != rfc2047ready) 
        return false;
    return true;
}

// Set DEBUGDATE to 1 to get traces from the date parsing code. DATEDEB
// takes one argument which is a complete, parenthesized, fprintf
// argument list.
#define DEBUGDATE 0
#if DEBUGDATE
#define DATEDEB(X) fprintf X
#else
#define DATEDEB(X)
#endif

// Convert rfc822 date to unix time. A date string normally looks like:
//  Mon, 3 Jul 2006 09:51:58 +0200
// But there are many close common variations
// And also hopeless things like: Fri Nov  3 13:13:33 2006
//
// @param dt the date string
// @return the Unix time, or (time_t)-1 if the string can't be parsed
//   (too few fields, unknown month name). A missing, malformed or
//   unknown time zone is not an error: the date is then taken to be UTC.
time_t rfc2822DateToUxTime(const string& dt)
{
    // Strip everything up to first comma if any, we don't need weekday,
    // then break into tokens
    list<string> toks;
    const string::size_type comma = dt.find_first_of(",");
    if (comma != string::npos) {
        if (comma == dt.length() - 1) {
            DATEDEB((stderr, "Bad rfc822 date format (short1): [%s]\n", 
                     dt.c_str()));
            return (time_t)-1;
        }
        string date = dt.substr(comma + 1);
        stringToTokens(date, toks, " \t:");
    } else {
        // No comma. Enter strangeland
        stringToTokens(dt, toks, " \t:");
        // Test for date like: Sun Nov 19 06:18:41 2006
        //                      0   1  2   3 4  5  6
        // and change to:      19 Nov 2006 06:18:41
        if (toks.size() == 7) {
            list<string>::iterator it0 = toks.begin();
            if (it0->length() == 3 &&
                it0->find_first_of("0123456789") == string::npos) {
                list<string>::iterator it2 = it0;
                for (int i = 0; i < 2; i++) it2++;
                list<string>::iterator it6 = it2;
                for (int i = 0; i < 4; i++) it6++;
                // Day of month goes first, year takes its place, and the
                // weekday ends up last, where it is discarded.
                iter_swap(it0, it2);
                iter_swap(it6, it2);
                toks.erase(it6);
            }
        }
    }

#if DEBUGDATE
    for (list<string>::iterator it = toks.begin(); it != toks.end(); it++) {
        DATEDEB((stderr, "[%s] ", it->c_str()));
    }
    DATEDEB((stderr, "\n"));
#endif

    if (toks.size() < 6) {
        DATEDEB((stderr, "Bad rfc822 date format (toks cnt): [%s]\n", 
                 dt.c_str()));
        return (time_t)-1;
    }

    if (toks.size() == 6) {
        // Probably no timezone, sometimes happens
        toks.push_back("+0000");
    }

    // From here on there are at least 7 tokens: 
    //   day month year hour min sec zone

    struct tm tms;
    memset(&tms, 0, sizeof(tms));

    // Load struct tm with appropriate tokens, possibly converting
    // when needed

    list<string>::iterator it = toks.begin();

    // Day of month: no conversion needed
    tms.tm_mday = atoi(it->c_str());
    it++;

    // Month. Only Jan-Dec are legal. January, February do happen
    // though. Convert to 0-11
    static const char *const shortMonths[12] = {
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
    };
    static const char *const longMonths[12] = {
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December"
    };
    int month = -1;
    for (int m = 0; m < 12; m++) {
        if (*it == shortMonths[m] || *it == longMonths[m]) {
            month = m;
            break;
        }
    }
    if (month < 0) {
        DATEDEB((stderr, "Bad rfc822 date format (month): [%s]\n", 
                 dt.c_str()));
        return (time_t)-1;
    }
    tms.tm_mon = month;
    it++;

    // Year. Struct tm counts from 1900. 2 char years are quite rare
    // but do happen. I've seen 00 happen so count small values from 2000
    tms.tm_year = atoi(it->c_str());
    if (it->length() == 2) {
        if (tms.tm_year < 10)
            tms.tm_year += 2000;
        else
            tms.tm_year += 1900;
    }
    if (tms.tm_year > 1900)
        tms.tm_year -= 1900;
    it++;

    // Hour minute second need no adjustments
    tms.tm_hour = atoi(it->c_str()); it++;
    tms.tm_min  = atoi(it->c_str()); it++;
    tms.tm_sec  = atoi(it->c_str()); it++;

    // Timezone is supposed to be either +-XYZT or a zone name.
    // zonesecs is the correction to ADD to the time value computed as if
    // the date was UTC. So it is negative for zones east of Greenwich
    // (ie: -7200 for +0200). It stays 0 if the zone can't be used.
    int zonesecs = 0;
    const string& zone = *it;
    if (zone.length() < 1) {
        DATEDEB((stderr, "Bad rfc822 date format (zlen): [%s]\n", dt.c_str()));
        return (time_t)-1;
    }
    if (zone[0] == '-' || zone[0] == '+') {
        // Note that +xy:zt (instead of +xyzt) sometimes happen, we
        // may want to process it one day
        bool ok = zone.length() >= 5;
        for (int i = 1; ok && i <= 4; i++) {
            if (!isdigit(static_cast<unsigned char>(zone[i])))
                ok = false;
        }
        if (ok) {
            const int hours = (zone[1] - '0') * 10 + (zone[2] - '0');
            const int minutes = (zone[3] - '0') * 10 + (zone[4] - '0');
            // The last 2 digits are MINUTES, they must be converted to
            // seconds too
            zonesecs = 3600 * hours + 60 * minutes;
            if (zone[0] == '+')
                zonesecs = -zonesecs;
            DATEDEB((stderr, "Tz: [%s] -> %d\n", zone.c_str(), zonesecs));
        } else {
            // Forget tz
            DATEDEB((stderr, "Bad rfc822 date format (zlen1): [%s]\n", 
                     dt.c_str()));
        }
    } else {
        // Zone name. The value is the number of hours to add to get UTC.
        static const struct {
            const char *name;
            int hours;
        } zoneNames[] = {
            // Single letter "military" zones
            {"A", 1}, {"B", 2}, {"C", 3}, {"D", 4}, {"E", 5}, {"F", 6},
            {"G", 7}, {"H", 8}, {"I", 9}, {"K", 10}, {"L", 11}, {"M", 12},
            {"N", -1}, {"O", -2}, {"P", -3}, {"Q", -4}, {"R", -5}, {"S", -6},
            {"T", -7}, {"U", -8}, {"V", -9}, {"W", -10}, {"X", -11}, 
            {"Y", -12}, {"Z", 0}, 
            // Standard names
            {"UT", 0}, {"GMT", 0}, {"EST", 5}, {"EDT", 4}, {"CST", 6}, 
            {"CDT", 5}, {"MST", 7}, {"MDT", 6}, {"PST", 8}, {"PDT", 7},
            // Non standard names
            // IST: Standard Time (or Irish Summer Time?) is actually +5.5
            {"CET", -1}, {"JST", -9}, {"IST", -5}, {"WET", 0}, {"MET", -1}
        };
        const unsigned int nzones = sizeof(zoneNames) / sizeof(zoneNames[0]);
        unsigned int zi;
        for (zi = 0; zi < nzones; zi++) {
            if (zone == zoneNames[zi].name)
                break;
        }
        if (zi < nzones) {
            zonesecs = 3600 * zoneNames[zi].hours;
            DATEDEB((stderr, "Tz: [%s] -> %d\n", zone.c_str(), zonesecs));
        } else {
            // Forget tz
            DATEDEB((stderr, "Bad rfc822 date format (zname): [%s]\n", 
                     dt.c_str()));
        }
    }

    // Compute the UTC Unix time value
#ifndef sun
    time_t tim = timegm(&tms);
#else
    // No timegm on Sun. Use mktime, then correct for local timezone
    time_t tim = mktime(&tms);
    // altzone and timezone hold the difference in seconds between UTC
    // and local. They are negative for places east of greenwich
    // 
    // mktime takes our buffer to be local time, so it adds timezone
    // to the conversion result (if timezone is < 0 it's currently
    // earlier in greenwich). 
    //
    // We have to subtract it back (hey! hopefully! maybe we have to
    // add it). Who can really know?
    tim -= timezone;
#endif

    // And add in the correction from the email's Tz
    tim += zonesecs;

    DATEDEB((stderr, "Date: %s  uxtime %ld \n", ctime(&tim), tim));
    return tim;
}

#else 

#include <time.h>

#include <string>
#include "mimeparse.h"
#include "readfile.h"


using namespace std;
// Functions exercised by the test driver, declared explicitly here
extern bool rfc2231_decode(const string& in, string& out, string& charset); 
extern time_t rfc2822DateToUxTime(const string& date);
// Program name for the messages, set from argv[0] in main()
static const char *thisprog;

// Command line options summary. The "-1" option runs the rfc2231 decoder.
static char usage [] =
"-p: header value and parameter test\n"
"-q: qp decoding\n"
"-b: base64\n"
"-7: rfc2047\n"
"-1: rfc2231\n"
"-t: date time\n"
"  \n\n"
;
// Print the program name and the options summary to stderr, then
// terminate the program with exit status 1.
static void Usage()
{
    fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
    exit(1);
}

// Bit mask of the options found on the command line. main() sets the
// OPT_x bit when it sees the option letter x, then tests the bits to
// choose what to run.
static int     op_flags;
// NOTE(review): OPT_MOINS is not referenced in the visible code
// ("moins" is French for "minus").
#define OPT_MOINS 0x1
#define OPT_p	  0x2 
#define OPT_q	  0x4 
#define OPT_b     0x8
#define OPT_7     0x10
#define OPT_1     0x20
#define OPT_t     0x40
// Test driver entry point (this part of the file is compiled when
// TEST_MIMEPARSE is defined).
//
// The command line is made of single letter options, see the 'usage'
// text. Only one test is run per invocation, the first one selected in
// the order of the if / else if chain below: -p, -q, -b, -7 or -1, -t.
// Any non-option argument is an error. The program ends through
// exit(0), or exit(1) from Usage() or on a base64 decoding error.
//
// NOTE(review): the 'count' variable and the 'b1:' label are not
// referenced anywhere in the function.
int
main(int argc, const char **argv)
{
  int count = 10;
    
  thisprog = argv[0];
  argc--; argv++;

  // Process the options: every argument beginning with '-' is a
  // group of option letters.
  while (argc > 0 && **argv == '-') {
    (*argv)++;
    if (!(**argv))
      /* Case of a lone "-" argument (as in "adb - core") */
      Usage();
    while (**argv)
      switch (*(*argv)++) {
      case 'p':	op_flags |= OPT_p; break;
      case 'q':	op_flags |= OPT_q; break;
      case 'b':	op_flags |= OPT_b; break;
      case '1':	op_flags |= OPT_1; break;
      case '7':	op_flags |= OPT_7; break;
      case 't':	op_flags |= OPT_t; break;
      default: Usage();	break;
      }
  b1: argc--; argv++;
  }

  // No other arguments are accepted
  if (argc != 0)
    Usage();

  if (op_flags & OPT_p) {
      // Mime header value and parameters extraction
      // The samples exercise, in order: spaces around '=' and a quoted
      // value, a folded (CRLF) header, and rfc2231 continuations mixing
      // encoded and plain chunks.
      const char *tr[] = {
	  "text/html;charset = UTF-8 ; otherparam=garb; \n"
	  "QUOTEDPARAM=\"quoted value\"",

	  "text/plain; charset=ASCII\r\n name=\"809D3016_5691DPS_5.2.LIC\"",

	  "application/x-stuff;"
	  "title*0*=us-ascii'en'This%20is%20even%20more%20;"
	  "title*1*=%2A%2A%2Afun%2A%2A%2A%20;"
	  "title*2=\"isn't it!\""
      };
      
      for (unsigned int i = 0; i < sizeof(tr) / sizeof(char *); i++) {
	  MimeHeaderValue parsed;
	  if (!parseMimeHeaderValue(tr[i], parsed)) {
	      fprintf(stderr, "PARSE ERROR for [%s]\n", tr[i]);
	  }
	  // What was parsed is printed even after an error
	  printf("Field value: [%s]\n", parsed.value.c_str());
	  map<string, string>::iterator it;
	  for (it = parsed.params.begin();it != parsed.params.end();it++) {
	      // Print the title line once, and only if there are parameters
	      if (it == parsed.params.begin())
		  printf("Parameters:\n");
	      printf("  [%s] = [%s]\n", it->first.c_str(), it->second.c_str());
	  }
      }

  } else if (op_flags & OPT_q) {
      // Quoted printable stuff
      // The sample has valid escapes, a soft line break, and ends with
      // an invalid escape (=xx) which should make the decoding fail.
      const char *qp = 
	  "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme "
	  "agrave is: '=E0' probable skipped decode error: =\n"
	  "Actual decode error =xx this wont show";

      string out;
      if (!qp_decode(string(qp), out)) {
	  fprintf(stderr, "qp_decode returned error\n");
      }
      printf("Decoded: '%s'\n", out.c_str());
  } else if (op_flags & OPT_b) {
      // Base64
      // The encoded data below is the following 4 lines of French text
      // (the accented characters are single ISO-8859-1 bytes):
      //'C'est à boire qu'il nous faut éviter l'excès.'
      //'Deuxième ligne'
      //'Troisième ligne'
      //'Et la fin (pas de nl). '
      const char *b64 = 
 "Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n"
 "IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n";

      string out;
      if (!base64_decode(string(b64), out)) {
	  fprintf(stderr, "base64_decode returned error\n");
	  exit(1);
      }
      printf("Decoded: [%s]\n", out.c_str());
#if 0
      // Disabled: decode base64 data read from a file
      string coded, decoded;
      const char *fname = "/tmp/recoll_decodefail";
      if (!file_to_string(fname, coded)) {
	  fprintf(stderr, "Cant read %s\n", fname);
	  exit(1);
      }
    
      if (!base64_decode(coded, decoded)) {
	  fprintf(stderr, "base64_decode returned error\n");
	  exit(1);
      }
      printf("Decoded: [%s]\n", decoded.c_str());
#endif

  } else if (op_flags & (OPT_7|OPT_1)) {
      // rfc2047 (-7) or rfc2231 (-1): decode each line from stdin
      char line [1024];
      string out;
      bool res;
      while (fgets(line, 1023, stdin)) {
	  int l = strlen(line);
	  if (l == 0)
	      continue;
	  // Drop the last character, which is expected to be the newline.
	  // NOTE(review): it is dropped even if it is not a newline.
	  line[l-1] = 0;
	  fprintf(stderr, "Line: [%s]\n", line);
	  // Empty charset: rfc2231_decode will look for it in the data
	  string charset;
	  if (op_flags & OPT_7) {
	      res = rfc2047_decode(line, out);
	  } else {
	      res = rfc2231_decode(line, out, charset);
	  }
	  if (res)
	      fprintf(stderr, "Out:  [%s] cs %s\n", out.c_str(), charset.c_str());
	  else
	      fprintf(stderr, "Decoding failed\n");
      }
  } else if (op_flags & OPT_t) {
      // Date conversion: a few builtin samples first, then the lines
      // read from stdin. The converted value is displayed as local time.
      time_t t;
	
      const char *dates[] = {
	  " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)",
	  " Mon, 3 Jul 2006 09:51:58 +0200",
	  " Wed, 13 Sep 2006 08:19:48 GMT-07:00",
	  " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)",
	  " Sat, 23 Dec 89 19:27:12 EST",
	  "   13 Jan 90 08:23:29 GMT"};

      for (unsigned int i = 0; i <sizeof(dates) / sizeof(char *); i++) {
	  t = rfc2822DateToUxTime(dates[i]);
	  struct tm *tm = localtime(&t);
	  char datebuf[100];
	  strftime(datebuf, 99, "&nbsp;%Y-%m-%d&nbsp;%H:%M:%S %z", tm);
	  printf("[%s] -> [%s]\n", dates[i], datebuf);
      }
      printf("Enter date:\n");
      char line [1024];
      while (fgets(line, 1023, stdin)) {
	  int l = strlen(line);
	  if (l == 0) continue;
	  // Drop the last character (expected to be the newline)
	  line[l-1] = 0;
	  t = rfc2822DateToUxTime(line);
	  struct tm *tm = localtime(&t);
	  char datebuf[100];
	  strftime(datebuf, 99, "&nbsp;%Y-%m-%d&nbsp;%H:%M:%S %z", tm);
	  printf("[%s] -> [%s]\n", line, datebuf);
      }


  }
  exit(0);
}

#endif // TEST_MIMEPARSE