/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include "safeunistd.h"
#include <time.h>
#include <cstdlib>
#include "safesysstat.h"
#include <map>
#include <sstream>
#include "cstr.h"
#include "mimehandler.h"
#include "readfile.h"
#include "transcode.h"
#include "mimeparse.h"
#include "mh_mail.h"
#include "log.h"
#include "smallut.h"
#include "mh_html.h"
#include "rclconfig.h"
#include "mimetype.h"
#include "md5ut.h"
// binc imap mime definitions
#include "mime.h"
using namespace std;
// Nesting limit: processMsg() and walkmime() compare their depth argument
// against this value before going deeper into the message structure.
static const int maxdepth = 20;
// Key looked up in the parsed Content-Type parameters to find the part's
// character set (see walkmime()).
static const string cstr_mail_charset("charset");
// Build a mail handler and load the configured list of additional mail
// headers to process.
//
// The names come from the "mail" section of the fields configuration. For
// each name, the associated configuration value is stored in
// m_addProcdHdrs[name]; processMsg() later uses that value as the metadata
// key under which the header content is stored.
//
// @param cnf configuration object, forwarded to RecollFilter
// @param id  handler identifier, forwarded to RecollFilter
MimeHandlerMail::MimeHandlerMail(RclConfig *cnf, const string &id)
    : RecollFilter(cnf, id), m_bincdoc(0), m_fd(-1), m_stream(0), m_idx(-1)
{
    // Look for additional headers to be processed as per config.
    // There is deliberately no empty() test in front of the loop: iterating
    // over an empty list does nothing, and a statement-less test would end
    // up guarding the loop and skip it whenever headers ARE configured.
    const vector<string> hdrnames = m_config->getFieldSectNames("mail");
    for (const auto& hdrname : hdrnames) {
        // The return value is ignored on purpose: a missing parameter just
        // leaves an empty entry, which processMsg() skips.
        (void)m_config->getFieldConfParam(hdrname, "mail",
                                          m_addProcdHdrs[hdrname]);
    }
}
void MimeHandlerMail::clear()
delete m_bincdoc; m_bincdoc = 0;
if (m_fd >= 0) {
m_fd = -1;
delete m_stream; m_stream = 0;
m_idx = -1;
m_startoftext = 0;
for (vector<MHMailAttach*>::iterator it = m_attachments.begin();
it != m_attachments.end(); it++) {
delete *it;
bool MimeHandlerMail::set_document_file_impl(const string& mt, const string &fn)
LOGDEB("MimeHandlerMail::set_document_file(" << fn << ")\n");
if (m_fd >= 0) {
m_fd = -1;
if (!m_forPreview) {
// Yes, we read the file twice. It would be possible in theory
// to add the md5 computation to the mime analysis, but ...
string md5, xmd5, reason;
if (MD5File(fn, md5, &reason)) {
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
} else {
LOGERR("MimeHandlerMail: md5 [" << fn << "]: " << reason << "\n");
m_fd = open(fn.c_str(), 0);
if (m_fd < 0) {
LOGERR("MimeHandlerMail::set_document_file: open(" << fn <<
") errno " << errno << "\n");
return false;
#if defined O_NOATIME && O_NOATIME != 0
if (fcntl(m_fd, F_SETFL, O_NOATIME) < 0) {
// perror("fcntl");
delete m_bincdoc;
m_bincdoc = new Binc::MimeDocument;
if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) {
LOGERR("MimeHandlerMail::mkDoc: mime parse error for " << fn << "\n");
return false;
m_havedoc = true;
return true;
bool MimeHandlerMail::set_document_string_impl(const string& mt,
const string& msgtxt)
LOGDEB2("Message text: [" << msgtxt << "]\n");
delete m_stream;
if (!m_forPreview) {
string md5, xmd5;
MD5String(msgtxt, md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
if ((m_stream = new stringstream(msgtxt)) == 0 || !m_stream->good()) {
LOGERR("MimeHandlerMail::set_document_string: stream create error."
"msgtxt.size() " << msgtxt.size() << "\n");
return false;
delete m_bincdoc;
if ((m_bincdoc = new Binc::MimeDocument) == 0) {
LOGERR("MimeHandlerMail::set_doc._string: new Binc:Document failed. "
"Out of memory?");
return false;
if (!m_bincdoc->isHeaderParsed() && !m_bincdoc->isAllParsed()) {
LOGERR("MimeHandlerMail::set_document_string: mime parse error\n");
return false;
m_havedoc = true;
return true;
bool MimeHandlerMail::skip_to_document(const string& ipath)
LOGDEB("MimeHandlerMail::skip_to_document(" << ipath << ")\n");
if (m_idx == -1) {
// No decoding done yet. If ipath is null need do nothing
if (ipath.empty() || ipath == "-1")
return true;
// ipath points to attachment: need to decode message
if (!next_document()) {
LOGERR("MimeHandlerMail::skip_to_doc: next_document failed\n");
return false;
m_idx = atoi(ipath.c_str());
return true;
bool MimeHandlerMail::next_document()
LOGDEB("MimeHandlerMail::next_document m_idx " << m_idx << " m_havedoc " <<
m_havedoc << "\n");
if (!m_havedoc)
return false;
bool res = false;
if (m_idx == -1) {
m_metaData[cstr_dj_keymt] = cstr_textplain;
res = processMsg(m_bincdoc, 0);
LOGDEB1("MimeHandlerMail::next_document: mt " <<
m_metaData[cstr_dj_keymt] << ", att cnt " <<
m_attachments.size() << "\n");
const string& txt = m_metaData[cstr_dj_keycontent];
if (m_startoftext < txt.size())
m_metaData[cstr_dj_keyabstract] =
truncate_to_word(txt.substr(m_startoftext), 250);
if (m_attachments.size() > 0) {
m_metaData[cstr_dj_keyanc] = "t";
} else {
res = processAttach();
m_havedoc = m_idx < (int)m_attachments.size();
if (!m_havedoc) {
m_reason = "Subdocument index too high";
return res;
// Decode according to content transfer encoding. May actually do nothing,
// which will be indicated by the *respp argument pointing to the original
// text on exit
static bool decodeBody(const string& cte, // Content transfer encoding
const string& body, // Source text
string& decoded, // Decoded text if actual decoding
const string** respp // Decoding Indicator
// By default, there is no encoding (7bit,8bit,raw). Also in case of
// decoding error
*respp = &body;
if (!stringlowercmp("quoted-printable", cte)) {
if (!qp_decode(body, decoded)) {
LOGERR("decodeBody: quoted-printable decoding failed !\n");
LOGDEB(" Body: \n" << body << "\n");
return false;
*respp = &decoded;
} else if (!stringlowercmp("base64", cte)) {
if (!base64_decode(body, decoded)) {
// base64 encoding errors are actually relatively common
LOGERR("decodeBody: base64 decoding failed !\n");
LOGDEB(" Body: \n" << body << "\n");
return false;
*respp = &decoded;
return true;
bool MimeHandlerMail::processAttach()
LOGDEB("MimeHandlerMail::processAttach() m_idx " << m_idx << "\n");
if (!m_havedoc)
return false;
if (m_idx >= (int)m_attachments.size()) {
m_havedoc = false;
return false;
MHMailAttach *att = m_attachments[m_idx];
m_metaData[cstr_dj_keymt] = att->m_contentType;
m_metaData[cstr_dj_keyorigcharset] = att->m_charset;
m_metaData[cstr_dj_keycharset] = att->m_charset;
m_metaData[cstr_dj_keyfn] = att->m_filename;
m_metaData[cstr_dj_keytitle] = att->m_filename + " (" + m_subject + ")";
LOGDEB1(" processAttach:ct [" << att->m_contentType << "] cs [" <<
att->m_charset << "] fn [" << att->m_filename << "]\n");
// Erase current content and replace
string& body = m_metaData[cstr_dj_keycontent];
att->m_part->getBody(body, 0, att->m_part->bodylength);
string decoded;
const string *bdp;
if (!decodeBody(att->m_contentTransferEncoding, body, decoded, &bdp)) {
return false;
if (bdp != &body)
// Special case for application/octet-stream: try to better
// identify content, using file name if set
if (m_metaData[cstr_dj_keymt] == "application/octet-stream" &&
!m_metaData[cstr_dj_keyfn].empty()) {
string mt = mimetype(m_metaData[cstr_dj_keyfn], 0,
m_config, false);
if (!mt.empty())
m_metaData[cstr_dj_keymt] = mt;
// Special case for text/plain content. Internfile should deal
// with this but it expects text/plain to be utf-8 already, so we
// handle the transcoding if needed. Same kind of issue for the MD5
if (m_metaData[cstr_dj_keymt] == cstr_textplain) {
if (!txtdcode("MimeHandlerMail::processAttach")) {
} else if (!m_forPreview) {
string md5, xmd5;
MD5String(body, md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
// Ipath
char nbuf[20];
sprintf(nbuf, "%d", m_idx);
m_metaData[cstr_dj_keyipath] = nbuf;
return true;
// Turn one message into document data.
//
// The From, To, Cc, Date and Subject header values are appended to the
// content text, one per line, prefixed by the header name when preview()
// is set. For the top-level message only (depth 0 on entry), they are also
// stored as metadata: author, recipient (To, then Cc appended), message id,
// modification time (if the date can be converted) and title. Additional
// headers listed in m_addProcdHdrs are stored under their configured field
// names. The MIME structure is then walked by walkmime() to collect the
// inline text and record the attachments.
//
// When called recursively for a message/rfc822 part (depth not zero on
// entry), only the text is touched.
//
// @param doc   MIME part holding the message
// @param depth nesting level on entry, 0 for the top-level message
// @return always true: reaching maxdepth just stops the processing, it is
//         better to index partially than not at all
bool MimeHandlerMail::processMsg(Binc::MimePart *doc, int depth)
{
    LOGDEB2("MimeHandlerMail::processMsg: depth " << depth << "\n");
    if (depth++ >= maxdepth) {
        // Have to stop somewhere
        LOGINFO("MimeHandlerMail::processMsg: maxdepth " << maxdepth <<
                " exceeded\n");
        // Return true anyway, better to index partially than not at all
        return true;
    }
    // depth was incremented by the test above: 1 means top-level message
    const bool isTopLevel = (depth == 1);

    // Handle some headers.
    string& text = m_metaData[cstr_dj_keycontent];
    Binc::HeaderItem hdr;
    string value;

    // Append the current header value to the text as one line, with the
    // header name in front of it in preview mode only.
    auto appendHeaderLine = [&](const char *label) {
        if (preview())
            text += string(label);
        text += value + cstr_newline;
    };

    if (doc->h.getFirstHeader("From", hdr)) {
        rfc2047_decode(hdr.getValue(), value);
        appendHeaderLine("From: ");
        if (isTopLevel) {
            m_metaData[cstr_dj_keyauthor] = value;
        }
    }

    if (doc->h.getFirstHeader("To", hdr)) {
        rfc2047_decode(hdr.getValue(), value);
        appendHeaderLine("To: ");
        if (isTopLevel) {
            m_metaData[cstr_dj_keyrecipient] = value;
        }
    }

    if (doc->h.getFirstHeader("Cc", hdr)) {
        rfc2047_decode(hdr.getValue(), value);
        appendHeaderLine("Cc: ");
        if (isTopLevel) {
            m_metaData[cstr_dj_keyrecipient] += " " + value;
        }
    }

    // The message id is metadata only, it is not added to the text
    if (isTopLevel && doc->h.getFirstHeader("Message-Id", hdr)) {
        string& msgid = m_metaData[cstr_dj_keymsgid];
        msgid = hdr.getValue();
        trimstring(msgid, "<>");
    }

    if (doc->h.getFirstHeader("Date", hdr)) {
        rfc2047_decode(hdr.getValue(), value);
        if (isTopLevel) {
            const time_t uxtime = rfc2822DateToUxTime(value);
            if (uxtime == (time_t)-1) {
                // Leave mtime field alone, ftime will be used instead.
                LOGDEB("rfc2822Date...: failed: [" << value << "]\n");
            } else {
                char ascuxtime[100];
                sprintf(ascuxtime, "%ld", (long)uxtime);
                m_metaData[cstr_dj_keymd] = ascuxtime;
            }
        }
        appendHeaderLine("Date: ");
    }

    if (doc->h.getFirstHeader("Subject", hdr)) {
        rfc2047_decode(hdr.getValue(), value);
        if (isTopLevel) {
            m_metaData[cstr_dj_keytitle] = value;
            // Kept for building the attachment titles in processAttach()
            m_subject = value;
        }
        appendHeaderLine("Subject: ");
    }

    // Check for the presence of configured additional headers and possibly
    // add them to the metadata (with appropriate field name).
    for (auto& extra : m_addProcdHdrs) {
        const bool hasFieldName = !extra.second.empty();
        if (hasFieldName && doc->h.getFirstHeader(extra.first, hdr)) {
            // Email headers are supposedly ASCII, but we force
            // transcode to UTF-8 anyway so that at least partial
            // indexing can be done if there are 8bit chars in there.
            transcode(hdr.getValue(), m_metaData[extra.second], "CP1252", "UTF-8");
        }
    }

    // Blank line after the headers. The body text starts after it, which
    // next_document() uses to compute the abstract.
    text += '\n';
    m_startoftext = text.size();

    LOGDEB2("MimeHandlerMail::processMsg:ismultipart " <<
            doc->isMultipart() << " mime subtype '"<<doc->getSubType()<< "'\n");
    walkmime(doc, depth);

    LOGDEB2("MimeHandlerMail::processMsg:text:[" <<
            m_metaData[cstr_dj_keycontent] << "]\n");
    return true;
}
// Recursively walk the message mime parts and concatenate all the
// inline html or text that we find anywhere.
// RFC2046 reminder:
// Top level media types:
// Simple: text, image, audio, video, application,
// Composite: multipart, message.
// multipart can be mixed, signed, alternative, parallel, digest.
// message/rfc822 may also be of interest.
//
// Parameters:
//   doc   - the MIME part to examine
//   depth - nesting level; it is compared with maxdepth, incremented, and
//           the incremented value is passed to the recursive calls
// The collected text is appended to m_metaData[cstr_dj_keycontent]. For parts
// which are not inline text or html, an MHMailAttach record is filled in
// with the content type, file name, charset, transfer encoding and part
// pointer.
//
// NOTE(review): as shown here the function has no braces and several
// statements look missing or truncated (see the notes below). Check this
// block against the complete source before relying on these comments.
void MimeHandlerMail::walkmime(Binc::MimePart* doc, int depth)
LOGDEB2("MimeHandlerMail::walkmime: depth " << depth << "\n");
// NOTE(review): only the log call is visible for the "too deep" case;
// presumably the function is meant to return here - confirm.
if (depth++ >= maxdepth) {
LOGINFO("walkmime: max depth (" << maxdepth << ") exceeded\n");
// All the text found is accumulated in the content metadata entry
string& out = m_metaData[cstr_dj_keycontent];
if (doc->isMultipart()) {
LOGDEB2("walkmime: ismultipart " << doc->isMultipart() <<
" subtype '" << doc->getSubType() << "'\n");
// We only handle mixed, signed, related and alternative (no digests).
std::vector<Binc::MimePart>::iterator it;
if (!stringicmp("mixed", doc->getSubType()) ||
!stringicmp("signed", doc->getSubType()) ||
!stringicmp("related", doc->getSubType())) {
// Multipart mixed, signed and related: process each part.
for (it = doc->members.begin(); it != doc->members.end();it++) {
walkmime(&(*it), depth);
} else if (!stringicmp("alternative", doc->getSubType())) {
// Multipart/alternative: look for a text/plain part, then html.
// Process if found
std::vector<Binc::MimePart>::iterator ittxt, ithtml;
// end() means "no such part found"
ittxt = ithtml = doc->members.end();
// i is the 1-based part number, only used in the log message
int i = 1;
for (it = doc->members.begin();
it != doc->members.end(); it++, i++) {
// Get and parse content-type header
Binc::HeaderItem hi;
// NOTE(review): nothing visible skips the part when it has no
// Content-Type header; presumably a 'continue' belongs here - confirm.
if (!it->h.getFirstHeader("Content-Type", hi)) {
LOGDEB("walkmime:no ctent-type header for part "<<i<< "\n");
MimeHeaderValue content_type;
parseMimeHeaderValue(hi.getValue(), content_type);
LOGDEB2("walkmime: C-type: " << content_type.value << "\n");
// Remember the text/plain and text/html alternatives. If several parts
// have the same type, the last one wins.
if (!stringlowercmp(cstr_textplain, content_type.value))
ittxt = it;
else if (!stringlowercmp("text/html", content_type.value))
ithtml = it;
// Prefer a non-empty text/plain part, else use the html one if any
if (ittxt != doc->members.end() && ittxt->bodylength) {
LOGDEB2("walkmime: alternative: choose text/plain. Size: " <<
ittxt->bodylength << endl);
walkmime(&(*ittxt), depth);
} else if (ithtml != doc->members.end()) {
LOGDEB2("walkmime: alternative: choose text/html.\n");
walkmime(&(*ithtml), depth);
// NOTE(review): no return is visible at the end of the multipart case;
// presumably the code below is only meant for non-multipart parts - confirm.
// Part is not multipart: it must be either simple or message. Take
// a look at interesting headers and a possible filename parameter
// Get and parse content-type header.
Binc::HeaderItem hi;
// text/plain is assumed when there is no Content-Type header
string ctt = cstr_textplain;
if (doc->h.getFirstHeader("Content-Type", hi)) {
ctt = hi.getValue();
LOGDEB2("walkmime:content-type: " << ctt << "\n");
MimeHeaderValue content_type;
parseMimeHeaderValue(ctt, content_type);
// Get and parse Content-Disposition header
// "inline" is assumed when there is no Content-Disposition header
string ctd = "inline";
if (doc->h.getFirstHeader("Content-Disposition", hi)) {
ctd = hi.getValue();
MimeHeaderValue content_disposition;
parseMimeHeaderValue(ctd, content_disposition);
LOGDEB2("Content_disposition:[" << content_disposition.value << "]\n");
// Label used in the bracketed marker output in preview mode.
// NOTE(review): as shown, the second assignment always overwrites the
// first one; presumably an 'else' is missing - confirm.
string dispindic;
if (stringlowercmp("inline", content_disposition.value))
dispindic = "Attachment";
dispindic = "Inline";
// See if we have a filename.
// First choice: Content-Disposition "filename" parameter, then the
// Content-Type "name" parameter.
string filename;
map<string,string>::const_iterator it;
it = content_disposition.params.find(string("filename"));
if (it != content_disposition.params.end())
filename = it->second;
if (filename.empty()) {
it = content_type.params.find(string("name"));
if (it != content_type.params.end())
filename = it->second;
if (doc->isMessageRFC822()) {
LOGDEB2("walkmime: message/RFC822 part\n");
// The first part is the already parsed message. Call
// processMsg instead of walkmime so that mail headers get
// printed. The depth will tell it what to do
// NOTE(review): nothing visible stops processing when members is empty,
// yet members[0] is used below; presumably a return is missing - confirm.
if (doc->members.empty()) {
// Separator and file name in front of the embedded message text. The
// bracketed type marker is only output in preview mode.
out += "\n";
if (m_forPreview)
out += "[" + dispindic + " " + content_type.value + ": ";
out += filename;
if (m_forPreview)
out += "]";
out += "\n\n";
processMsg(&doc->members[0], depth);
// "Simple" part.
LOGDEB2("walkmime: simple part\n");
// Normally the default charset is us-ascii. But it happens that 8
// bit chars exist in a message that is stated as us-ascii. Ie the
// mailer used by yahoo support ('KANA') does this. We could
// convert to iso-8859 only if the transfer-encoding is 8 bit, or
// test for actual 8 bit chars, but what the heck, let's use
// 8859-1 (actually CP1252 which is compatible, but with more
// useful chars) as default.
string charset;
it = content_type.params.find(cstr_mail_charset);
if (it != content_type.params.end())
charset = it->second;
// Missing or meaningless charset names are replaced by the "maildefcharset"
// configuration value, or CP1252 if that is not set.
if (charset.empty() ||
!stringlowercmp("us-ascii", charset) ||
!stringlowercmp("default", charset) ||
!stringlowercmp("x-user-defined", charset) ||
!stringlowercmp("x-unknown", charset) ||
!stringlowercmp("unknown", charset) ) {
if (!m_config->getConfParam("maildefcharset", charset))
charset = "CP1252";
// Content transfer encoding
// "7bit" is assumed when there is no Content-Transfer-Encoding header
string cte = "7bit";
if (doc->h.getFirstHeader("Content-Transfer-Encoding", hi)) {
cte = hi.getValue();
// If the Content-Disposition is not inline, we treat it as
// attachment, as per rfc2183.
// If it is inline but not text or html, same thing.
// Some early MIME msgs have "text" instead of "text/plain" as type
if (stringlowercmp("inline", content_disposition.value) ||
(stringlowercmp(cstr_textplain, content_type.value) &&
stringlowercmp("text", content_type.value) &&
stringlowercmp("text/html", content_type.value)) ) {
// The attachment file name, if any, is added to the message text
if (!filename.empty()) {
out += "\n";
if (m_forPreview)
out += "[" + dispindic + " " + content_type.value + ": ";
out += filename;
if (m_forPreview)
out += "]";
out += "\n\n";
// Record what processAttach() will need to extract this part later.
// NOTE(review): as shown, att is dereferenced right after the null test,
// and no statement stores it in m_attachments or returns; presumably
// those lines are missing - confirm.
MHMailAttach *att = new MHMailAttach;
if (att == 0) {
LOGERR("Out of memory\n");
att->m_contentType = content_type.value;
att->m_filename = filename;
att->m_charset = charset;
att->m_contentTransferEncoding = cte;
att->m_part = doc;
LOGDEB("walkmime: attachmnt: ct [" << att->m_contentType <<
"] cte [" << att->m_contentTransferEncoding << "] cs [" <<
att->m_charset << "] fn [" << filename << "]\n");
// We are dealing with an inline part of text/plain or text/html
// type. We can't just return a text or html subdoc and let the
// filter stack work: this would create another subdocument, but
// we want instead to decode a body part of this message document.
LOGDEB2("walkmime: final: body start offset " <<
doc->getBodyStartOffset()<<", length "<<doc->getBodyLength()<<"\n");
string body;
doc->getBody(body, 0, doc->bodylength);
// Undo the content transfer encoding. decodeBody() points bdp at
// 'decoded' if it actually decoded something, else at 'body'.
string decoded;
const string *bdp;
if (!decodeBody(cte, body, decoded, &bdp)) {
LOGERR("MimeHandlerMail::walkmime: failed decoding body\n");
// NOTE(review): this test has no visible statement of its own; presumably
// it is meant to move the decoded text into 'body' - confirm.
if (bdp != &body)
// Handle html stripping and transcoding to utf8
if (!stringlowercmp("text/html", content_type.value)) {
// Html part: a temporary html handler is used to process the body.
// NOTE(review): the next statements look truncated (an unfinished call
// before the view/index argument, nothing after 'it =') - confirm against
// the full source.
MimeHandlerHtml mh(m_config, "1234");
m_forPreview ? "view" : "index");
mh.set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
mh.set_document_string("text/html", body);
map<string, string>::const_iterator it =
if (it != mh.get_meta_data().end())
out += it->second;
} else {
string utf8;
// Transcode to utf-8
LOGDEB1("walkmime: transcoding from " << charset << " to UTF-8\n");
if (!transcode(body, utf8, charset, cstr_utf8)) {
LOGERR("walkmime: transcode failed from cs '" << charset <<
"' to UTF-8\n");
// Transcoding failed: use the text as it is rather than lose it
out += body;
} else {
out += utf8;
// Make sure that the text ends with a newline
if (out.length() && out[out.length()-1] != '\n')
out += '\n';
LOGDEB2("walkmime: out now: [" << out << "]\n");