|
a/src/internfile/mh_html.cpp |
|
b/src/internfile/mh_html.cpp |
|
... |
|
... |
35 |
class MyHtmlParser : public HtmlParser {
|
35 |
class MyHtmlParser : public HtmlParser {
|
36 |
public:
|
36 |
public:
|
37 |
bool in_script_tag;
|
37 |
bool in_script_tag;
|
38 |
bool in_style_tag;
|
38 |
bool in_style_tag;
|
39 |
string title, sample, keywords, dump;
|
39 |
string title, sample, keywords, dump;
|
40 |
string charset; // This is the charset our user thinks the doc is in
|
40 |
string ocharset; // This is the charset our user thinks the doc was
|
|
|
41 |
string charset; // This is the charset it was supposedly converted to
|
41 |
string doccharset; // Set this to value of charset parameter in header
|
42 |
string doccharset; // Set this to value of charset parameter in header
|
42 |
bool indexing_allowed;
|
43 |
bool indexing_allowed;
|
43 |
void process_text(const string &text);
|
44 |
void process_text(const string &text);
|
44 |
void opening_tag(const string &tag, const map<string,string> &p);
|
45 |
void opening_tag(const string &tag, const map<string,string> &p);
|
45 |
void closing_tag(const string &tag);
|
46 |
void closing_tag(const string &tag);
|
|
... |
|
... |
123 |
string value = i->second;
|
124 |
string value = i->second;
|
124 |
MimeHeaderValue p = parseMimeHeaderValue(value);
|
125 |
MimeHeaderValue p = parseMimeHeaderValue(value);
|
125 |
map<string, string>::const_iterator k;
|
126 |
map<string, string>::const_iterator k;
|
126 |
if ((k = p.params.find("charset")) != p.params.end()) {
|
127 |
if ((k = p.params.find("charset")) != p.params.end()) {
|
127 |
doccharset = k->second;
|
128 |
doccharset = k->second;
|
128 |
if (doccharset != charset)
|
129 |
if (doccharset != ocharset) {
|
|
|
130 |
LOGDEB1(("Doc specified charset '%s' "
|
|
|
131 |
"differs from announced '%s'\n",
|
|
|
132 |
doccharset.c_str(), ocharset.c_str()));
|
129 |
throw true;
|
133 |
throw true;
|
|
|
134 |
}
|
130 |
}
|
135 |
}
|
131 |
}
|
136 |
}
|
132 |
}
|
137 |
}
|
133 |
}
|
138 |
}
|
|
|
139 |
} else if (tag == "p" || tag == "br") {
|
|
|
140 |
dump += "\n";
|
134 |
} else if (tag == "script") {
|
141 |
} else if (tag == "script") {
|
135 |
in_script_tag = true;
|
142 |
in_script_tag = true;
|
136 |
} else if (tag == "style") {
|
143 |
} else if (tag == "style") {
|
137 |
in_style_tag = true;
|
144 |
in_style_tag = true;
|
138 |
} else if (tag == "body") {
|
145 |
} else if (tag == "body") {
|
|
... |
|
... |
177 |
if (conf->guesscharset) {
|
184 |
if (conf->guesscharset) {
|
178 |
charset = csguess(otext, conf->defcharset);
|
185 |
charset = csguess(otext, conf->defcharset);
|
179 |
} else
|
186 |
} else
|
180 |
charset = conf->defcharset;
|
187 |
charset = conf->defcharset;
|
181 |
|
188 |
|
182 |
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n",
|
189 |
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
|
183 |
charset.c_str()));
|
|
|
184 |
|
190 |
|
185 |
MyHtmlParser pres;
|
191 |
MyHtmlParser pres;
|
186 |
for (int pass = 0; pass < 2; pass++) {
|
192 |
for (int pass = 0; pass < 2; pass++) {
|
187 |
string transcoded;
|
193 |
string transcoded;
|
188 |
LOGDEB(("textHtmlToDoc: transcode from %s to %s\n",
|
|
|
189 |
charset.c_str(), "UTF-8"));
|
|
|
190 |
|
194 |
|
191 |
MyHtmlParser p;
|
195 |
MyHtmlParser p;
|
192 |
// Try transcoding. If it fails, use original text.
|
196 |
// Try transcoding. If it fails, use original text.
|
193 |
if (!transcode(otext, transcoded, charset, "UTF-8")) {
|
197 |
if (!transcode(otext, transcoded, charset, "UTF-8")) {
|
194 |
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
198 |
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
195 |
charset.c_str()));
|
199 |
charset.c_str()));
|
196 |
transcoded = otext;
|
200 |
transcoded = otext;
|
197 |
// We don't know the charset, at all
|
201 |
// We don't know the charset, at all
|
198 |
p.charset = charset = "";
|
202 |
p.ocharset = p.charset = charset = "";
|
199 |
} else {
|
203 |
} else {
|
200 |
// charset has the putative source charset, transcoded is now
|
204 |
// ocharset has the putative source charset, transcoded is now
|
201 |
// in utf-8
|
205 |
// in utf-8
|
|
|
206 |
p.ocharset = charset;
|
202 |
p.charset = "utf-8";
|
207 |
p.charset = "utf-8";
|
203 |
}
|
208 |
}
|
204 |
|
209 |
|
205 |
try {
|
210 |
try {
|
206 |
p.parse_html(transcoded);
|
211 |
p.parse_html(transcoded);
|
207 |
} catch (bool) {
|
212 |
} catch (bool) {
|
208 |
pres = p;
|
213 |
pres = p;
|
209 |
if (!pres.doccharset.empty() && pres.doccharset != charset) {
|
214 |
if (!pres.doccharset.empty() &&
|
|
|
215 |
pres.doccharset != pres.ocharset) {
|
210 |
LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
|
216 |
LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
|
211 |
"reparse\n", charset.c_str(),
|
217 |
"reparse\n", charset.c_str(),pres.doccharset.c_str()));
|
212 |
pres.doccharset.c_str()));
|
|
|
213 |
charset = pres.doccharset;
|
218 |
charset = pres.doccharset;
|
214 |
} else
|
219 |
} else
|
215 |
break;
|
220 |
break;
|
216 |
}
|
221 |
}
|
217 |
}
|
222 |
}
|
218 |
|
223 |
|
219 |
Rcl::Doc out;
|
224 |
Rcl::Doc out;
|
220 |
out.origcharset = charset;
|
225 |
out.origcharset = charset;
|
221 |
out.text = pres.dump;
|
226 |
out.text = pres.dump;
|
|
|
227 |
// LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
|
222 |
out.title = pres.title;
|
228 |
out.title = pres.title;
|
223 |
out.keywords = pres.keywords;
|
229 |
out.keywords = pres.keywords;
|
224 |
out.abstract = pres.sample;
|
230 |
out.abstract = pres.sample;
|
225 |
docout = out;
|
231 |
docout = out;
|
226 |
return true;
|
232 |
return true;
|