recoll / Code / Diff of /src/internfile/mh

Diff of /src/internfile/mh_html.cpp [0c74bd] .. [a2659b]

Switch to side-by-side view

--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -70,7 +70,7 @@
     m_filename.erase();
 
     string charset = m_defcharset;
-    LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n", 
+    LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n", 
 	    charset.c_str()));
 
     // - We first try to convert from the default configured charset
@@ -79,14 +79,13 @@
     // - During parsing, if we find a charset parameter, and it differs from
     //   what we started with, we abort and restart with the parameter value
     //   instead of the configuration one.
-    LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
-
 
     MyHtmlParser result;
     for (int pass = 0; pass < 2; pass++) {
 	string transcoded;
 	LOGDEB(("Html::mkDoc: pass %d\n", pass));
 	MyHtmlParser p;
+
 	// Try transcoding. If it fails, use original text.
 	int ecnt;
 	if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
@@ -94,21 +93,21 @@
 		    "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
 	    transcoded = m_html;
 	    // We don't know the charset, at all
-	    p.ocharset = p.charset = charset = "";
+	    p.reset_charsets();
+	    charset = "";
 	} else {
 	    if (ecnt) {
 		if (pass == 0) {
 		    LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
-			    "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
+			    "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
 		} else {
 		    LOGERR(("textHtmlToDoc: final transcode had %d errors for "
-			    "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
+			    "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
 		}
 	    }
-	    // ocharset has the putative source charset, transcoded is now
+	    // charset has the putative source charset, transcoded is now
 	    // in utf-8
-	    p.ocharset = charset;
-	    p.charset = "utf-8";
+	    p.set_charsets(charset, "utf-8");
 	}
 
 	try {
@@ -118,14 +117,19 @@
 	    break;
 	} catch (bool diag) {
 	    result = p;
-	    if (diag == true)
+	    if (diag == true) {
+		// Parser throws true at end of text. ok
 		break;
+	    }
+
 	    LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
-		    charset.c_str(),result.doccharset.c_str()));
-	    if (!result.doccharset.empty() && 
-		!samecharset(result.doccharset, result.ocharset)) {
+		    charset.c_str(), result.get_charset().c_str()));
+	    if (!result.get_charset().empty() && 
+		!samecharset(result.get_charset(), result.fromcharset)) {
 		LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
-		charset = result.doccharset;
+		// Set the origin charset as specified in document before
+		// transcoding again
+		charset = result.get_charset();
 	    } else {
 		LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
 		return false;
@@ -133,7 +137,7 @@
 	}
     }
 
-    m_metaData["origcharset"] = m_defcharset;
+    m_metaData["origcharset"] = result.get_charset();
     m_metaData["content"] = result.dump;
     m_metaData["charset"] = "utf-8";
     // Avoid setting empty values which would crush ones possibly inherited