recoll / Code / Diff of /src/internfile/mh

Diff of /src/internfile/mh_mail.cpp [d4f838] .. [92b930]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.15 2006-09-05 08:05:02 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...

#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */




MimeHandlerMail::~MimeHandlerMail()
{
    if (m_vfp) {
    fclose((FILE *)m_vfp);
    m_vfp = 0;
...
    if (doc.h.getFirstHeader("Subject", hi)) {
    rfc2047_decode(hi.getValue(), transcoded);
    docout.text += string("Subject: ") + transcoded + string("\n");
    }

    LOGDEB2(("MimeHandlerMail::processone:ismultipart %d mime subtype '%s'\n",
        doc.isMultipart(), doc.getSubType().c_str()));
    walkmime(docout.text, doc, 0);

    LOGDEB2(("MimeHandlerMail::processone:text:[%s]\n", docout.text.c_str()));
    return MimeHandler::MHDone;
}

// Recursively walk the message mime parts and concatenate all the
// inline html or text that we find anywhere.
void MimeHandlerMail::walkmime(string &out, Binc::MimePart& doc, int depth)

{
    if (depth > 5) {
    LOGINFO(("walkmime: max depth exceeded\n"));
    return;
    }
...
    // process it. For mixed and related, we process each part.
    std::vector<Binc::MimePart>::iterator it;
    if (!stringicmp("mixed", doc.getSubType()) || 
        !stringicmp("related", doc.getSubType())) {
        for (it = doc.members.begin(); it != doc.members.end();it++) {
        walkmime(out, *it, depth+1);
        }
    } else if (!stringicmp("alternative", doc.getSubType())) {
        std::vector<Binc::MimePart>::iterator ittxt, ithtml;
        ittxt = ithtml = doc.members.end();
        int i = 1;
...
        else if (!stringlowercmp("text/html", content_type.value)) 
            ithtml = it;
        }
        if (ittxt != doc.members.end()) {
        LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
        walkmime(out, *ittxt, depth+1);
        } else if (ithtml != doc.members.end()) {
        LOGDEB2(("walkmime: alternative: chose text/html part\n"))
        walkmime(out, *ithtml, depth+1);
        }
    }
    } else {
  // "Simple" part. See what it is:


    // Get and parse content-type header.
    Binc::HeaderItem hi;
    string ctt = "text/plain";
    if (doc.h.getFirstHeader("Content-Type", hi)) {
        ctt = hi.getValue();
    }
    LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
    MimeHeaderValue content_type;
    parseMimeHeaderValue(ctt, content_type);
      
  // Get and parse Content-Disposition header
  string ctd = "inline";
  if (doc.h.getFirstHeader("Content-Disposition", hi)) {
      ctd = hi.getValue();
  }
  MimeHeaderValue content_disposition;
  parseMimeHeaderValue(ctd, content_disposition);

  LOGDEB2(("Content_disposition:[%s]\n", 
      content_disposition.value.c_str()));

  // Attachment part: its body is not processed here. We only append the
  // file name (passed through rfc2047_decode) to the output text so that
  // it gets indexed and, when previewing, wrap it in a visible
  // "[Attachment: ...]" marker so the reader sees that it was there.
  if (!stringlowercmp("attachment", content_disposition.value)) {
      // "NoFileName" is the placeholder used when the Content-Disposition
      // header carries no filename parameter.
      string rafn = "NoFileName", afn;
      map<string,string>::const_iterator it;
      it = content_disposition.params.find(string("filename"));
      // Compare against end() of the map that was actually searched.
      // Testing against content_type.params.end() compared iterators
      // from two different containers, so a missing filename could go
      // undetected and the end() iterator be dereferenced.
      if (it != content_disposition.params.end())
          rafn = it->second;
      rfc2047_decode(rafn, afn);
      out += "\n";
      if (m_forPreview)
          out += "[Attachment: ";
      out += afn;
      if (m_forPreview)
          out += "]";
      out += "\n\n";
      // Attachment: we're done with this part
      return;
  }

  // The only other disposition that interests us is "inline", and then
  // this has to be plain text or html
  if (stringlowercmp("inline", content_disposition.value)) {
      return;
  }
    if (stringlowercmp("text/plain", content_type.value) && 
        stringlowercmp("text/html", content_type.value)) {
        return;
    }

...
        !stringlowercmp("x-user-defined", charset) || 
        !stringlowercmp("x-unknown", charset) || 
        !stringlowercmp("unknown", charset) ) {
        charset = "iso-8859-1";
    }












    // Content transfer encoding
    string cte = "7bit";
    if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
        cte = hi.getValue();
...
    LOGDEB2(("walkmime: final: body start offset %d, length %d\n", 
         doc.getBodyStartOffset(), doc.getBodyLength()));
    string body;
    doc.getBody(body, 0, doc.bodylength);

    // Decode according to content transfer encoding
    if (!stringlowercmp("quoted-printable", cte)) {
        string decoded;
        if (!qp_decode(body, decoded)) {
        LOGERR(("walkmime: quoted-printable decoding failed !\n"));
        return;
...
        return;
        }
        body = decoded;
    }

  // Handle html stripping and transcoding to utf8
  string utf8;
    if (!stringlowercmp("text/html", content_type.value)) {
        MimeHandlerHtml mh;
        Rcl::Doc hdoc;
        mh.charsethint = charset;
        mh.mkDoc(m_conf, "", body, content_type.value,  hdoc);
        utf8 = hdoc.text;
    } else {
        // Transcode to utf-8 
        if (!transcode(body, utf8, charset, "UTF-8")) {
        LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
            charset.c_str()));
      utf8 = body;
        }
    }

    out += string("\r\n") + utf8;
    LOGDEB2(("walkmime: out now: [%s]\n", out.c_str()));
    }
}

	a/src/internfile/mh_mail.cpp		b/src/internfile/mh_mail.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.14 2006-04-07 08:51:15 dockes Exp $ (C) 2005 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.15 2006-09-05 08:05:02 dockes Exp $ (C) 2005 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
45		45
46	#ifndef NO_NAMESPACES	46	#ifndef NO_NAMESPACES
47	using namespace std;	47	using namespace std;
48	#endif /* NO_NAMESPACES */	48	#endif /* NO_NAMESPACES */
49		49
50	static void
51	walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, int depth);
52
53	MimeHandlerMail::~MimeHandlerMail()	50	MimeHandlerMail::~MimeHandlerMail()
54	{	51	{
55	if (m_vfp) {	52	if (m_vfp) {
56	fclose((FILE *)m_vfp);	53	fclose((FILE *)m_vfp);
57	m_vfp = 0;	54	m_vfp = 0;
	...		...
239	if (doc.h.getFirstHeader("Subject", hi)) {	236	if (doc.h.getFirstHeader("Subject", hi)) {
240	rfc2047_decode(hi.getValue(), transcoded);	237	rfc2047_decode(hi.getValue(), transcoded);
241	docout.text += string("Subject: ") + transcoded + string("\n");	238	docout.text += string("Subject: ") + transcoded + string("\n");
242	}	239	}
243		240
244	LOGDEB2(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n",	241	LOGDEB2(("MimeHandlerMail::processone:ismultipart %d mime subtype '%s'\n",
245	doc.isMultipart(), doc.getSubType().c_str()));	242	doc.isMultipart(), doc.getSubType().c_str()));
246	walkmime(m_conf, docout.text, doc, 0);	243	walkmime(docout.text, doc, 0);
247		244
248	// LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));	245	LOGDEB2(("MimeHandlerMail::processone:text:[%s]\n", docout.text.c_str()));
249	return MimeHandler::MHDone;	246	return MimeHandler::MHDone;
250	}	247	}
251		248
252	// Recursively walk the message mime parts and concatenate all the	249	// Recursively walk the message mime parts and concatenate all the
253	// inline html or text that we find anywhere.	250	// inline html or text that we find anywhere.
254	static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc,	251	void MimeHandlerMail::walkmime(string &out, Binc::MimePart& doc, int depth)
255	int depth)
256	{	252	{
257	if (depth > 5) {	253	if (depth > 5) {
258	LOGINFO(("walkmime: max depth exceeded\n"));	254	LOGINFO(("walkmime: max depth exceeded\n"));
259	return;	255	return;
260	}	256	}
	...		...
267	// process it For mixed and related, we process each part.	263	// process it For mixed and related, we process each part.
268	std::vector<Binc::MimePart>::iterator it;	264	std::vector<Binc::MimePart>::iterator it;
269	if (!stringicmp("mixed", doc.getSubType()) \|\|	265	if (!stringicmp("mixed", doc.getSubType()) \|\|
270	!stringicmp("related", doc.getSubType())) {	266	!stringicmp("related", doc.getSubType())) {
271	for (it = doc.members.begin(); it != doc.members.end();it++) {	267	for (it = doc.members.begin(); it != doc.members.end();it++) {
272	walkmime(cnf, out, *it, depth+1);	268	walkmime(out, *it, depth+1);
273	}	269	}
274	} else if (!stringicmp("alternative", doc.getSubType())) {	270	} else if (!stringicmp("alternative", doc.getSubType())) {
275	std::vector<Binc::MimePart>::iterator ittxt, ithtml;	271	std::vector<Binc::MimePart>::iterator ittxt, ithtml;
276	ittxt = ithtml = doc.members.end();	272	ittxt = ithtml = doc.members.end();
277	int i = 1;	273	int i = 1;
	...		...
290	else if (!stringlowercmp("text/html", content_type.value))	286	else if (!stringlowercmp("text/html", content_type.value))
291	ithtml = it;	287	ithtml = it;
292	}	288	}
293	if (ittxt != doc.members.end()) {	289	if (ittxt != doc.members.end()) {
294	LOGDEB2(("walkmime: alternative: chose text/plain part\n"))	290	LOGDEB2(("walkmime: alternative: chose text/plain part\n"))
295	walkmime(cnf, out, *ittxt, depth+1);	291	walkmime(out, *ittxt, depth+1);
296	} else if (ithtml != doc.members.end()) {	292	} else if (ithtml != doc.members.end()) {
297	LOGDEB2(("walkmime: alternative: chose text/html part\n"))	293	LOGDEB2(("walkmime: alternative: chose text/html part\n"))
298	walkmime(cnf, out, *ithtml, depth+1);	294	walkmime(out, *ithtml, depth+1);
299	}	295	}
300	}	296	}
301	} else {	297	} else {
302	// If content-type is text or html and content-disposition is inline,	298	// "Simple" part. See what it is:
303	// decode and add to text.
304		299
305	// Get and parse content-type header.	300	// Get and parse content-type header.
306	Binc::HeaderItem hi;	301	Binc::HeaderItem hi;
307	string ctt = "text/plain";	302	string ctt = "text/plain";
308	if (doc.h.getFirstHeader("Content-Type", hi)) {	303	if (doc.h.getFirstHeader("Content-Type", hi)) {
309	ctt = hi.getValue();	304	ctt = hi.getValue();
310	}	305	}
311	LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));	306	LOGDEB2(("walkmime:content-type: %s\n", ctt.c_str()));
312	MimeHeaderValue content_type;	307	MimeHeaderValue content_type;
313	parseMimeHeaderValue(ctt, content_type);	308	parseMimeHeaderValue(ctt, content_type);
		309
		310	// Get and parse Content-Disposition header
		311	string ctd = "inline";
		312	if (doc.h.getFirstHeader("Content-Disposition", hi)) {
		313	ctd = hi.getValue();
		314	}
		315	MimeHeaderValue content_disposition;
		316	parseMimeHeaderValue(ctd, content_disposition);
		317
		318	LOGDEB2(("Content_disposition:[%s]\n",
		319	content_disposition.value.c_str()));
		320
		321	// If this is an attachment, we index the file name if any and, when
		322	// previewing, at least show that it was there.
		323	if (!stringlowercmp("attachment", content_disposition.value)) {
		324	string rafn = "NoFileName", afn;
		325	map<string,string>::const_iterator it;
		326	it = content_disposition.params.find(string("filename"));
		327	if (it != content_type.params.end())
		328	rafn = it->second;
		329	rfc2047_decode(rafn, afn);
		330	out += "\n";
		331	if (m_forPreview)
		332	out += "[Attachment: ";
		333	out += afn;
		334	if (m_forPreview)
		335	out += "]";
		336	out += "\n\n";
		337	// Attachment: we're done with this part
		338	return;
		339	}
		340
		341	// The only other disposition that interests us is "inline", and then
		342	// this has to be plain text or html
		343	if (stringlowercmp("inline", content_disposition.value)) {
		344	return;
		345	}
314	if (stringlowercmp("text/plain", content_type.value) &&	346	if (stringlowercmp("text/plain", content_type.value) &&
315	stringlowercmp("text/html", content_type.value)) {	347	stringlowercmp("text/html", content_type.value)) {
316	return;	348	return;
317	}	349	}
318		350
	...		...
332	!stringlowercmp("x-user-defined", charset) \|\|	364	!stringlowercmp("x-user-defined", charset) \|\|
333	!stringlowercmp("x-unknown", charset) \|\|	365	!stringlowercmp("x-unknown", charset) \|\|
334	!stringlowercmp("unknown", charset) ) {	366	!stringlowercmp("unknown", charset) ) {
335	charset = "iso-8859-1";	367	charset = "iso-8859-1";
336	}	368	}
337
338	// Content disposition
339	string ctd = "inline";
340	if (doc.h.getFirstHeader("Content-Disposition", hi)) {
341	ctd = hi.getValue();
342	}
343	MimeHeaderValue content_disposition;
344	parseMimeHeaderValue(ctd, content_disposition);
345	if (stringlowercmp("inline", content_disposition.value)) {
346	return;
347	}
348		369
349	// Content transfer encoding	370	// Content transfer encoding
350	string cte = "7bit";	371	string cte = "7bit";
351	if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {	372	if (doc.h.getFirstHeader("Content-Transfer-Encoding", hi)) {
352	cte = hi.getValue();	373	cte = hi.getValue();
	...		...
355	LOGDEB2(("walkmime: final: body start offset %d, length %d\n",	376	LOGDEB2(("walkmime: final: body start offset %d, length %d\n",
356	doc.getBodyStartOffset(), doc.getBodyLength()));	377	doc.getBodyStartOffset(), doc.getBodyLength()));
357	string body;	378	string body;
358	doc.getBody(body, 0, doc.bodylength);	379	doc.getBody(body, 0, doc.bodylength);
359		380
360	// Decode content transfer encoding	381	// Decode according to content transfer encoding
361	if (!stringlowercmp("quoted-printable", cte)) {	382	if (!stringlowercmp("quoted-printable", cte)) {
362	string decoded;	383	string decoded;
363	if (!qp_decode(body, decoded)) {	384	if (!qp_decode(body, decoded)) {
364	LOGERR(("walkmime: quoted-printable decoding failed !\n"));	385	LOGERR(("walkmime: quoted-printable decoding failed !\n"));
365	return;	386	return;
	...		...
379	return;	400	return;
380	}	401	}
381	body = decoded;	402	body = decoded;
382	}	403	}
383		404
384	string transcoded;	405	// Handle html stripping and transcoding to utf8
		406	string utf8;
385	if (!stringlowercmp("text/html", content_type.value)) {	407	if (!stringlowercmp("text/html", content_type.value)) {
386	MimeHandlerHtml mh;	408	MimeHandlerHtml mh;
387	Rcl::Doc hdoc;	409	Rcl::Doc hdoc;
388	mh.charsethint = charset;	410	mh.charsethint = charset;
389	mh.mkDoc(cnf, "", body, content_type.value, hdoc);	411	mh.mkDoc(m_conf, "", body, content_type.value, hdoc);
390	transcoded = hdoc.text;	412	utf8 = hdoc.text;
391	} else {	413	} else {
392	// Transcode to utf-8	414	// Transcode to utf-8
393	if (!transcode(body, transcoded, charset, "UTF-8")) {	415	if (!transcode(body, utf8, charset, "UTF-8")) {
394	LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",	416	LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
395	charset.c_str()));	417	charset.c_str()));
396	transcoded = body;	418	utf8 = body;
397	}	419	}
398	}	420	}
399		421
400	out += string("\r\n") + transcoded;	422	out += string("\r\n") + utf8;
401	LOGDEB2(("walkmime: out now: [%s]\n", out.c_str()));	423	LOGDEB2(("walkmime: out now: [%s]\n", out.c_str()));
402	}	424	}
403	}	425	}