recoll / Code / Diff of /src/internfile/mh

Diff of /src/internfile/mh_html.cpp [0c74bd] .. [f56c94]

Switch to unified view


...
    // If set_doc(fn), take note of file name.
    string fn = m_filename;
    m_filename.erase();

    string charset = m_defcharset;
    LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n", 
        charset.c_str()));

    // - We first try to convert from the default configured charset
    //   (which may depend on the current directory) to utf-8. If this
    //   fails, we keep the original text
    // - During parsing, if we find a charset parameter, and it differs from
    //   what we started with, we abort and restart with the parameter value
    //   instead of the configuration one.



    MyHtmlParser result;
    for (int pass = 0; pass < 2; pass++) {
    string transcoded;
    LOGDEB(("Html::mkDoc: pass %d\n", pass));
    MyHtmlParser p;

    // Try transcoding. If it fails, use original text.
    int ecnt;
    if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
        LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"
            "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
        transcoded = m_html;
        // We don't know the charset, at all
      p.reset_charsets();
      charset = "";
    } else {
        if (ecnt) {
        if (pass == 0) {
            LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
                "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
        } else {
            LOGERR(("textHtmlToDoc: final transcode had %d errors for "
                "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
        }
        }
        // charset has the putative source charset, transcoded is now
        // in utf-8
      p.set_charsets(charset, "utf-8");

    }

    try {
        p.parse_html(transcoded);
        // No exception: ok?
        result = p;
        break;
    } catch (bool diag) {
        result = p;
        if (diag == true) {
      // Parser throws true at end of text. ok
        break;
      }

        LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
            charset.c_str(), result.get_charset().c_str()));
        if (!result.get_charset().empty() && 
        !samecharset(result.get_charset(), result.fromcharset)) {
        LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
      // Set the origin charset as specified in document before
      // transcoding again
        charset = result.get_charset();
        } else {
        LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
        return false;
        }
    }
    }

    m_metaData["origcharset"] = result.get_charset();
    m_metaData["content"] = result.dump;
    m_metaData["charset"] = "utf-8";
    // Avoid setting empty values which would overwrite ones possibly inherited
    // from parent (if we're an attachment)
    if (!result.dmtime.empty())

	a/src/internfile/mh_html.cpp		b/src/internfile/mh_html.cpp
	...		...
68	// If set_doc(fn), take note of file name.	68	// If set_doc(fn), take note of file name.
69	string fn = m_filename;	69	string fn = m_filename;
70	m_filename.erase();	70	m_filename.erase();
71		71
72	string charset = m_defcharset;	72	string charset = m_defcharset;
73	LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",	73	LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n",
74	charset.c_str()));	74	charset.c_str()));
75		75
76	// - We first try to convert from the default configured charset	76	// - We first try to convert from the default configured charset
77	// (which may depend of the current directory) to utf-8. If this	77	// (which may depend of the current directory) to utf-8. If this
78	// fails, we keep the original text	78	// fails, we keep the original text
79	// - During parsing, if we find a charset parameter, and it differs from	79	// - During parsing, if we find a charset parameter, and it differs from
80	// what we started with, we abort and restart with the parameter value	80	// what we started with, we abort and restart with the parameter value
81	// instead of the configuration one.	81	// instead of the configuration one.
82	LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
83
84		82
85	MyHtmlParser result;	83	MyHtmlParser result;
86	for (int pass = 0; pass < 2; pass++) {	84	for (int pass = 0; pass < 2; pass++) {
87	string transcoded;	85	string transcoded;
88	LOGDEB(("Html::mkDoc: pass %d\n", pass));	86	LOGDEB(("Html::mkDoc: pass %d\n", pass));
89	MyHtmlParser p;	87	MyHtmlParser p;
		88
90	// Try transcoding. If it fails, use original text.	89	// Try transcoding. If it fails, use original text.
91	int ecnt;	90	int ecnt;
92	if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {	91	if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
93	LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"	92	LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"
94	"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));	93	"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
95	transcoded = m_html;	94	transcoded = m_html;
96	// We don't know the charset, at all	95	// We don't know the charset, at all
97	p.ocharset = p.charset = charset = "";	96	p.reset_charsets();
		97	charset = "";
98	} else {	98	} else {
99	if (ecnt) {	99	if (ecnt) {
100	if (pass == 0) {	100	if (pass == 0) {
101	LOGDEB(("textHtmlToDoc: init transcode had %d errors for "	101	LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
102	"[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));	102	"[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
103	} else {	103	} else {
104	LOGERR(("textHtmlToDoc: final transcode had %d errors for "	104	LOGERR(("textHtmlToDoc: final transcode had %d errors for "
105	"[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));	105	"[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
106	}	106	}
107	}	107	}
108	// ocharset has the putative source charset, transcoded is now	108	// charset has the putative source charset, transcoded is now
109	// in utf-8	109	// in utf-8
110	p.ocharset = charset;	110	p.set_charsets(charset, "utf-8");
111	p.charset = "utf-8";
112	}	111	}
113		112
114	try {	113	try {
115	p.parse_html(transcoded);	114	p.parse_html(transcoded);
116	// No exception: ok?	115	// No exception: ok?
117	result = p;	116	result = p;
118	break;	117	break;
119	} catch (bool diag) {	118	} catch (bool diag) {
120	result = p;	119	result = p;
121	if (diag == true)	120	if (diag == true) {
		121	// Parser throws true at end of text. ok
122	break;	122	break;
		123	}
		124
123	LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",	125	LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
124	charset.c_str(),result.doccharset.c_str()));	126	charset.c_str(), result.get_charset().c_str()));
125	if (!result.doccharset.empty() &&	127	if (!result.get_charset().empty() &&
126	!samecharset(result.doccharset, result.ocharset)) {	128	!samecharset(result.get_charset(), result.fromcharset)) {
127	LOGDEB(("textHtmlToDoc: reparse for charsets\n"));	129	LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
		130	// Set the origin charset as specified in document before
		131	// transcoding again
128	charset = result.doccharset;	132	charset = result.get_charset();
129	} else {	133	} else {
130	LOGERR(("textHtmlToDoc:: error: non charset exception\n"));	134	LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
131	return false;	135	return false;
132	}	136	}
133	}	137	}
134	}	138	}
135		139
136	m_metaData["origcharset"] = m_defcharset;	140	m_metaData["origcharset"] = result.get_charset();
137	m_metaData["content"] = result.dump;	141	m_metaData["content"] = result.dump;
138	m_metaData["charset"] = "utf-8";	142	m_metaData["charset"] = "utf-8";
139	// Avoid setting empty values which would crush ones possibly inherited	143	// Avoid setting empty values which would crush ones possibly inherited
140	// from parent (if we're an attachment)	144	// from parent (if we're an attachment)
141	if (!result.dmtime.empty())	145	if (!result.dmtime.empty())