recoll / Code / Diff of /src/internfile/mh

Diff of /src/internfile/mh_html.cpp [441820] .. [c5ebe0]

Switch to side-by-side view

--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -49,6 +49,7 @@
 	LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
 	return false;
     }
+    m_filename = fn;
     return set_document_string(otext);
 }
 
@@ -64,8 +65,13 @@
     if (m_havedoc == false)
 	return false;
     m_havedoc = false;
+    // If set_doc(fn), take note of file name.
+    string fn = m_filename;
+    m_filename.erase();
+
     string charset = m_defcharset;
-    LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",charset.c_str()));
+    LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n", 
+	    charset.c_str()));
 
     // - We first try to convert from the default configured charset
     //   (which may depend of the current directory) to utf-8. If this
@@ -82,13 +88,23 @@
 	LOGDEB(("Html::mkDoc: pass %d\n", pass));
 	MyHtmlParser p;
 	// Try transcoding. If it fails, use original text.
-	if (!transcode(m_html, transcoded, charset, "UTF-8")) {
-	    LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
-		    charset.c_str()));
+	int ecnt;
+	if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
+	    LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"
+		    "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
 	    transcoded = m_html;
 	    // We don't know the charset, at all
 	    p.ocharset = p.charset = charset = "";
 	} else {
+	    if (ecnt) {
+		if (pass == 0) {
+		    LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
+			    "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
+		} else {
+		    LOGERR(("textHtmlToDoc: final transcode had %d errors for "
+			    "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
+		}
+	    }
 	    // ocharset has the putative source charset, transcoded is now
 	    // in utf-8
 	    p.ocharset = charset;