--- a/src/internfile/mh_html.cpp
+++ b/src/internfile/mh_html.cpp
@@ -37,7 +37,8 @@
bool in_script_tag;
bool in_style_tag;
string title, sample, keywords, dump;
- string charset; // This is the charset our user thinks the doc is in
+ string ocharset; // This is the charset our user thinks the doc was in
+ string charset; // This is the charset it was supposedly converted to
string doccharset; // Set this to value of charset parameter in header
bool indexing_allowed;
void process_text(const string &text);
@@ -125,12 +126,18 @@
map<string, string>::const_iterator k;
if ((k = p.params.find("charset")) != p.params.end()) {
doccharset = k->second;
- if (doccharset != charset)
+ if (doccharset != ocharset) {
+ LOGDEB1(("Doc specified charset '%s' "
+ "differs from announced '%s'\n",
+ doccharset.c_str(), ocharset.c_str()));
throw true;
+ }
}
}
}
}
+ } else if (tag == "p" || tag == "br") {
+ dump += "\n";
} else if (tag == "script") {
in_script_tag = true;
} else if (tag == "style") {
@@ -179,14 +186,11 @@
} else
charset = conf->defcharset;
- LOGDEB(("textHtmlToDoc: charset before parsing: %s\n",
- charset.c_str()));
+ LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
MyHtmlParser pres;
for (int pass = 0; pass < 2; pass++) {
string transcoded;
- LOGDEB(("textHtmlToDoc: transcode from %s to %s\n",
- charset.c_str(), "UTF-8"));
MyHtmlParser p;
// Try transcoding. If it fails, use original text.
@@ -195,10 +199,11 @@
charset.c_str()));
transcoded = otext;
// We don't know the charset, at all
- p.charset = charset = "";
+ p.ocharset = p.charset = charset = "";
} else {
- // charset has the putative source charset, transcoded is now
+ // ocharset has the putative source charset, transcoded is now
// in utf-8
+ p.ocharset = charset;
p.charset = "utf-8";
}
@@ -206,10 +211,10 @@
p.parse_html(transcoded);
} catch (bool) {
pres = p;
- if (!pres.doccharset.empty() && pres.doccharset != charset) {
+ if (!pres.doccharset.empty() &&
+ pres.doccharset != pres.ocharset) {
LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
- "reparse\n", charset.c_str(),
- pres.doccharset.c_str()));
+ "reparse\n", charset.c_str(),pres.doccharset.c_str()));
charset = pres.doccharset;
} else
break;
@@ -219,6 +224,7 @@
Rcl::Doc out;
out.origcharset = charset;
out.text = pres.dump;
+ // LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
out.title = pres.title;
out.keywords = pres.keywords;
out.abstract = pres.sample;