recoll / Code / Diff of /src/internfile/mh

Diff of /src/internfile/mh_mail.cpp [d392d3] .. [04b279]

Switch to side-by-side view

--- a/src/internfile/mh_mail.cpp
+++ b/src/internfile/mh_mail.cpp
@@ -1,11 +1,14 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.1 2005-03-25 09:40:27 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: mh_mail.cpp,v 1.2 2005-03-31 10:04:07 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 
+#include <stdio.h>
 #include <fcntl.h>
 #include <errno.h>
 
 #include <map>
+#include <sstream>
+using std::stringstream;
 using std::map;
 
 #include "mimehandler.h"
@@ -19,65 +22,183 @@
 #include "debuglog.h"
 #include "smallut.h"
 #include "mimeparse.h"
+#include "html.h"
+
+// binc imap mime definitions
+#include "mime.h"
+
+static void 
+walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, int depth);
 
 using namespace std;
 
+MimeHandlerMail::~MimeHandlerMail()
+{
+    if (vfp) {
+	fclose((FILE *)vfp);
+	vfp = 0;
+    }
+}
+
 // We are called for two different file types: mbox-type folders
-// holding multiple messages, and maildir-type files with one rfc822
-// message
+// holding multiple messages, and maildir-type files with one message
 MimeHandler::Status 
 MimeHandlerMail::worker(RclConfig *cnf, const string &fn, 
-			const string &mtype, Rcl::Doc &docout, string&)
+			const string &mtype, Rcl::Doc &docout, string& ipath)
 {
     LOGDEB(("MimeHandlerMail::worker: %s [%s]\n", mtype.c_str(), fn.c_str()));
     conf = cnf;
 
     if (!stringlowercmp("message/rfc822", mtype)) {
-	return processone(fn, docout);
+	ipath = "";
+	int fd;
+	if ((fd = open(fn.c_str(), 0)) < 0) {
+	    LOGERR(("MimeHandlerMail::worker: open(%s) errno %d\n",
+		    fn.c_str(), errno));
+	    return MimeHandler::MHError;
+	}
+	Binc::MimeDocument doc;
+	doc.parseFull(fd);
+	MimeHandler::Status ret = processone(fn, doc, docout);
+	close(fd);
+	return ret;
     } else  if (!stringlowercmp("text/x-mail", mtype)) {
+	return processmbox(fn, docout, ipath);
+    } else // uh-oh: neither message/rfc822 nor text/x-mail
 	return MimeHandler::MHError;
-    } else
+}
+
+MimeHandler::Status 
+MimeHandlerMail::processmbox(const string &fn, Rcl::Doc &docout, string& ipath)
+{
+    int mtarg = 0;
+    if (ipath != "") {
+	sscanf(ipath.c_str(), "%d", &mtarg);
+    }
+    LOGDEB(("MimeHandlerMail::processmbox: fn %s, mtarg %d\n", fn.c_str(),
+	    mtarg));
+
+    FILE *fp;
+    if (vfp) {
+	fp = (FILE *)vfp;
+    } else {
+	fp = fopen(fn.c_str(), "r");
+	if (fp == 0) {
+	    LOGERR(("MimeHandlerMail::processmbox: error opening %s\n", 
+		    fn.c_str()));
+	    return MimeHandler::MHError;
+	}
+	vfp = fp;
+    }
+    if (mtarg > 0) {
+	fseek(fp, 0, SEEK_SET);
+	msgnum = 0;
+    }
+
+    off_t start, end;
+    bool iseof = false;
+    do  {
+	// Look for next 'From ' Line, start of message. Set start to
+	// line after this
+	char line[301];
+	for (;;) {
+	    if (!fgets(line, 300, fp)) {
+		// Eof hit while looking for 'From ' -> file done. We'd need
+		// another return code here
+		return MimeHandler::MHError;
+	    }
+
+	    if (!strncmp("From ", line, 5)) {
+		start = ftello(fp);
+		break;
+	    }
+	}
+
+	// Look for next 'From ' line or eof, end of message (we let a
+	// spurious empty line in)
+	for (;;) {
+	    end = ftello(fp);
+	    if (!fgets(line, 300, fp) || !strncmp("From ", line, 5)) {
+		if (ferror(fp) || feof(fp))
+		    iseof = true;
+		break;
+	    }
+	}
+	msgnum++;
+	LOGDEB(("MimeHandlerMail::processmbox: got msg %d\n", msgnum));
+	fseek(fp, end, SEEK_SET);
+    } while (mtarg > 0 && msgnum < mtarg);
+
+
+    size_t size = end - start;
+    fseek(fp, start, SEEK_SET);
+    char *cp = (char *)malloc(size);
+    if (cp == 0) {
+	LOGERR(("MimeHandlerMail::processmbox: malloc(%d) failed\n", size));
 	return MimeHandler::MHError;
-}
-
-
-#include "mime.h"
-
-const char *hnames[] = {"Subject", "Content-type"};
-int nh = sizeof(hnames) / sizeof(char *);
-
-void walkmime(string &out, Binc::MimePart& doc, int fd, int depth);
+    }
+    if (fread(cp, 1, size, fp) != size) {
+	LOGERR(("MimeHandlerMail::processmbox: fread failed (errno %d)\n",
+		errno));
+	free(cp);
+	return MimeHandler::MHError;
+    }
+    string msgbuf(cp, size);
+    free(cp);
+    stringstream s(msgbuf);
+    Binc::MimeDocument doc;
+    doc.parseFull(s);
+    MimeHandler::Status ret = processone(fn, doc, docout);
+    if (ret == MimeHandler::MHError)
+	return ret;
+    char buf[20];
+    sprintf(buf, "%d", msgnum);
+    ipath = buf;
+    return iseof ? MimeHandler::MHDone : 
+	(mtarg > 0) ? MimeHandler::MHDone : MimeHandler::MHAgain;
+}
+
 
 // Transform a single message into a document. The subject becomes the
 // title, and any simple body part with a content-type of text or html
 // and content-disposition inline gets concatenated as text.
 MimeHandler::Status 
-MimeHandlerMail::processone(const string &fn, Rcl::Doc &docout)
-{
-    int fd;
-    if ((fd = open(fn.c_str(), 0)) < 0) {
-	LOGERR(("MimeHandlerMail::processone: open(%s) errno %d\n",
-		fn.c_str(), errno));
+MimeHandlerMail::processone(const string &fn, Binc::MimeDocument& doc, 
+			    Rcl::Doc &docout)
+{
+    if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
+	LOGERR(("MimeHandlerMail::processone: mime parse error for %s\n", 
+		fn.c_str()));
 	return MimeHandler::MHError;
     }
-    Binc::MimeDocument doc;
-    doc.parseFull(fd);
-
-    if (!doc.isHeaderParsed() && !doc.isAllParsed()) {
-	LOGERR(("MimeHandlerMail::processone: parse error for %s\n", 
-		fn.c_str()));
-	close(fd);
-	return MimeHandler::MHError;
-    }
+
+    // Handle some headers. We should process rfc2047 encoding here
+    Binc::HeaderItem hi;
+    if (doc.h.getFirstHeader("Subject", hi)) {
+	docout.title = hi.getValue();
+    }
+    if (doc.h.getFirstHeader("From", hi)) {
+	docout.text += string("From: ") + hi.getValue() + string("\n");
+    }
+    if (doc.h.getFirstHeader("To", hi)) {
+	docout.text += string("To: ") + hi.getValue() + string("\n");
+    }
+    if (doc.h.getFirstHeader("Date", hi)) {
+	docout.text += string("Date: ") + hi.getValue() + string("\n");
+    }
+
     LOGDEB(("MimeHandlerMail::processone: ismultipart %d mime subtype '%s'\n", 
 	    doc.isMultipart(), doc.getSubType().c_str()));
-    walkmime(docout.text, doc, fd, 0);
-    close(fd);
-    LOGDEB(("MimeHandlerMail::processone: text: '%s'\n",  docout.text.c_str()));
-    return MimeHandler::MHError;
-}
-
-void walkmime(string &out, Binc::MimePart& doc, int fd, int depth)
+    walkmime(conf, docout.text, doc, 0);
+
+    LOGDEB(("MimeHandlerMail::processone: text: '%s'\n", docout.text.c_str()));
+    return MimeHandler::MHDone;
+}
+
+// Recursively walk the message mime parts and concatenate all the
+// inline html or text that we find anywhere.
+static void walkmime(RclConfig *cnf, string &out, Binc::MimePart& doc, 
+		     int depth)
 {
     if (depth > 5) {
 	LOGINFO(("walkmime: max depth exceeded\n"));
@@ -88,12 +209,12 @@
 	LOGDEB(("walkmime: ismultipart %d subtype '%s'\n", 
 		doc.isMultipart(), doc.getSubType().c_str()));
 	// We only handle alternative and mixed for now. For
-	// alternative, we look for a text/plain part, else html and process it
-	// For mixed, we process each part.
+	// alternative, we look for a text/plain part, else html and
+	// process it. For mixed, we process each part.
 	std::vector<Binc::MimePart>::iterator it;
 	if (!stringicmp("mixed", doc.getSubType())) {
 	    for (it = doc.members.begin(); it != doc.members.end();it++) {
-		walkmime(out, *it, fd, depth+1);
+		walkmime(cnf, out, *it, depth+1);
 	    }
 	} else if (!stringicmp("alternative", doc.getSubType())) {
 	    std::vector<Binc::MimePart>::iterator ittxt, ithtml;
@@ -103,7 +224,17 @@
 		Binc::HeaderItem hi;
 		if (!doc.h.getFirstHeader("Content-Type", hi)) 
 		    continue;
-		LOGDEB(("walkmime:content-type: %s\n", hi.getValue().c_str()));
+		MimeHeaderValue content_type;
+		parseMimeHeaderValue(hi.getValue(), content_type);
+		if (!stringlowercmp("text/plain", content_type.value))
+		    ittxt = it;
+		else if (!stringlowercmp("text/html", content_type.value)) 
+		    ithtml = it;
+	    }
+	    if (ittxt != doc.members.end()) {
+		walkmime(cnf, out, *ittxt, depth+1);
+	    } else if (ithtml != doc.members.end()) {
+		walkmime(cnf, out, *ithtml, depth+1);
 	    }
 	}
     } else {
@@ -149,30 +280,36 @@
 	LOGDEB(("walkmime: final: body start offset %d, length %d\n", 
 		doc.getBodyStartOffset(), doc.getBodyLength()));
 	string body;
-	doc.getBody(fd, body, 0, doc.bodylength);
+	doc.getBody(body, 0, doc.bodylength);
 
 	// Decode content transfer encoding
-	if (stringlowercmp("quoted-printable", content_disposition.value)) {
+	if (!stringlowercmp("quoted-printable", cte)) {
 	    string decoded;
 	    qp_decode(body, decoded);
 	    body = decoded;
-	} else if (stringlowercmp("base64", content_disposition.value)) {
+	} else if (!stringlowercmp("base64", cte)) {
 	    string decoded;
 	    base64_decode(body, decoded);
 	    body = decoded;
 	}
 
 
-        // Transcode to utf-8 
 	string transcoded;
-	if (!transcode(body, transcoded, charset, "UTF-8")) {
-	    LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
-		    charset.c_str()));
-	    transcoded = body;
+	if (!stringlowercmp("text/html", content_type.value)) {
+	    MimeHandlerHtml mh;
+	    Rcl::Doc hdoc;
+	    mh.charsethint = charset;
+	    mh.worker1(cnf, "", body, content_type.value,  hdoc);
+	    transcoded = hdoc.text;
+	} else {
+	    // Transcode to utf-8 
+	    if (!transcode(body, transcoded, charset, "UTF-8")) {
+		LOGERR(("walkmime: transcode failed from cs '%s' to UTF-8\n",
+			charset.c_str()));
+		transcoded = body;
+	    }
 	}
 
 	out += string("\r\n") + transcoded;
     }
 }
-
-