recoll / Code / Diff of /src/internfile/mh_html.cpp

Diff of /src/internfile/mh_html.cpp [0b1827] .. [3ca33b]

Switch to unified view


...
class MyHtmlParser : public HtmlParser {
 public:
    bool in_script_tag;
    bool in_style_tag;
    string title, sample, keywords, dump;
    string ocharset; // This is the charset our user thinks the doc was
    string charset; // This is the charset it was supposedly converted to
    string doccharset; // Set this to value of charset parameter in header
    bool indexing_allowed;
    void process_text(const string &text);
    void opening_tag(const string &tag, const map<string,string> &p);
    void closing_tag(const string &tag);
...
            string value = i->second;
            MimeHeaderValue p = parseMimeHeaderValue(value);
            map<string, string>::const_iterator k;
            if ((k = p.params.find("charset")) != p.params.end()) {
            doccharset = k->second;
            if (doccharset != ocharset) {
              LOGDEB1(("Doc specified charset '%s' "
                   "differs from announced '%s'\n",
                   doccharset.c_str(), ocharset.c_str()));
                throw true;
          }
            }
        }
        }
    }
    } else if (tag == "p" || tag == "br") {
  dump += "\n";
    } else if (tag == "script") {
    in_script_tag = true;
    } else if (tag == "style") {
    in_style_tag = true;
    } else if (tag == "body") {
...
    if (conf->guesscharset) {
    charset = csguess(otext, conf->defcharset);
    } else
    charset = conf->defcharset;

    LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));


    MyHtmlParser pres;
    for (int pass = 0; pass < 2; pass++) {
    string transcoded;



    MyHtmlParser p;
    // Try transcoding. If it fails, use original text.
    if (!transcode(otext, transcoded, charset, "UTF-8")) {
        LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
            charset.c_str()));
        transcoded = otext;
        // We don't know the charset, at all
        p.ocharset = p.charset = charset = "";
    } else {
        // ocharset has the putative source charset, transcoded is now
        // in utf-8
      p.ocharset = charset;
        p.charset = "utf-8";
    }

    try {
        p.parse_html(transcoded);
    } catch (bool) {
        pres = p;
      if (!pres.doccharset.empty() && 
      pres.doccharset != pres.ocharset) {
        LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
          "reparse\n", charset.c_str(),pres.doccharset.c_str()));

        charset = pres.doccharset;
        } else
        break;
    }
    }

    Rcl::Doc out;
    out.origcharset = charset;
    out.text = pres.dump;
    //    LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
    out.title = pres.title;
    out.keywords = pres.keywords;
    out.abstract = pres.sample;
    docout = out;
    return true;

	a/src/internfile/mh_html.cpp		b/src/internfile/mh_html.cpp
	...		...
35	class MyHtmlParser : public HtmlParser {	35	class MyHtmlParser : public HtmlParser {
36	public:	36	public:
37	bool in_script_tag;	37	bool in_script_tag;
38	bool in_style_tag;	38	bool in_style_tag;
39	string title, sample, keywords, dump;	39	string title, sample, keywords, dump;
40	string charset; // This is the charset our user thinks the doc is in	40	string ocharset; // This is the charset our user thinks the doc was
		41	string charset; // This is the charset it was supposedly converted to
41	string doccharset; // Set this to value of charset parameter in header	42	string doccharset; // Set this to value of charset parameter in header
42	bool indexing_allowed;	43	bool indexing_allowed;
43	void process_text(const string &text);	44	void process_text(const string &text);
44	void opening_tag(const string &tag, const map<string,string> &p);	45	void opening_tag(const string &tag, const map<string,string> &p);
45	void closing_tag(const string &tag);	46	void closing_tag(const string &tag);
	...		...
123	string value = i->second;	124	string value = i->second;
124	MimeHeaderValue p = parseMimeHeaderValue(value);	125	MimeHeaderValue p = parseMimeHeaderValue(value);
125	map<string, string>::const_iterator k;	126	map<string, string>::const_iterator k;
126	if ((k = p.params.find("charset")) != p.params.end()) {	127	if ((k = p.params.find("charset")) != p.params.end()) {
127	doccharset = k->second;	128	doccharset = k->second;
128	if (doccharset != charset)	129	if (doccharset != ocharset) {
		130	LOGDEB1(("Doc specified charset '%s' "
		131	"differs from announced '%s'\n",
		132	doccharset.c_str(), ocharset.c_str()));
129	throw true;	133	throw true;
		134	}
130	}	135	}
131	}	136	}
132	}	137	}
133	}	138	}
		139	} else if (tag == "p" || tag == "br") {
		140	dump += "\n";
134	} else if (tag == "script") {	141	} else if (tag == "script") {
135	in_script_tag = true;	142	in_script_tag = true;
136	} else if (tag == "style") {	143	} else if (tag == "style") {
137	in_style_tag = true;	144	in_style_tag = true;
138	} else if (tag == "body") {	145	} else if (tag == "body") {
	...		...
177	if (conf->guesscharset) {	184	if (conf->guesscharset) {
178	charset = csguess(otext, conf->defcharset);	185	charset = csguess(otext, conf->defcharset);
179	} else	186	} else
180	charset = conf->defcharset;	187	charset = conf->defcharset;
181		188
182	LOGDEB(("textHtmlToDoc: charset before parsing: %s\n",	189	LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
183	charset.c_str()));
184		190
185	MyHtmlParser pres;	191	MyHtmlParser pres;
186	for (int pass = 0; pass < 2; pass++) {	192	for (int pass = 0; pass < 2; pass++) {
187	string transcoded;	193	string transcoded;
188	LOGDEB(("textHtmlToDoc: transcode from %s to %s\n",
189	charset.c_str(), "UTF-8"));
190		194
191	MyHtmlParser p;	195	MyHtmlParser p;
192	// Try transcoding. If it fails, use original text.	196	// Try transcoding. If it fails, use original text.
193	if (!transcode(otext, transcoded, charset, "UTF-8")) {	197	if (!transcode(otext, transcoded, charset, "UTF-8")) {
194	LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",	198	LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
195	charset.c_str()));	199	charset.c_str()));
196	transcoded = otext;	200	transcoded = otext;
197	// We don't know the charset, at all	201	// We don't know the charset, at all
198	p.charset = charset = "";	202	p.ocharset = p.charset = charset = "";
199	} else {	203	} else {
200	// charset has the putative source charset, transcoded is now	204	// ocharset has the putative source charset, transcoded is now
201	// in utf-8	205	// in utf-8
		206	p.ocharset = charset;
202	p.charset = "utf-8";	207	p.charset = "utf-8";
203	}	208	}
204		209
205	try {	210	try {
206	p.parse_html(transcoded);	211	p.parse_html(transcoded);
207	} catch (bool) {	212	} catch (bool) {
208	pres = p;	213	pres = p;
209	if (!pres.doccharset.empty() && pres.doccharset != charset) {	214	if (!pres.doccharset.empty() &&
		215	pres.doccharset != pres.ocharset) {
210	LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"	216	LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
211	"reparse\n", charset.c_str(),	217	"reparse\n", charset.c_str(),pres.doccharset.c_str()));
212	pres.doccharset.c_str()));
213	charset = pres.doccharset;	218	charset = pres.doccharset;
214	} else	219	} else
215	break;	220	break;
216	}	221	}
217	}	222	}
218		223
219	Rcl::Doc out;	224	Rcl::Doc out;
220	out.origcharset = charset;	225	out.origcharset = charset;
221	out.text = pres.dump;	226	out.text = pres.dump;
		227	// LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
222	out.title = pres.title;	228	out.title = pres.title;
223	out.keywords = pres.keywords;	229	out.keywords = pres.keywords;
224	out.abstract = pres.sample;	230	out.abstract = pres.sample;
225	docout = out;	231	docout = out;
226	return true;	232	return true;