|
a/src/internfile/myhtmlparse.cpp |
|
b/src/internfile/myhtmlparse.cpp |
|
... |
|
... |
179 |
in_pre_tag(false),
|
179 |
in_pre_tag(false),
|
180 |
pending_space(false),
|
180 |
pending_space(false),
|
181 |
indexing_allowed(true)
|
181 |
indexing_allowed(true)
|
182 |
{
|
182 |
{
|
183 |
// The default html document charset is iso-8859-1. We'll update
|
183 |
// The default html document charset is iso-8859-1. We'll update
|
184 |
// this value from the encoding tag if found.
|
184 |
// this value from the encoding tag if found. Actually use cp1252 which
|
185 |
charset = "iso-8859-1";
|
185 |
// is a superset of iso-8859-1 (adds printable characters in 0x80-0x9f).
|
|
|
186 |
charset = "CP1252";
|
186 |
}
|
187 |
}
|
187 |
|
188 |
|
188 |
void MyHtmlParser::decode_entities(string &s)
|
189 |
void MyHtmlParser::decode_entities(string &s)
|
189 |
{
|
190 |
{
|
190 |
LOGDEB2(("MyHtmlParser::decode_entities\n"));
|
191 |
LOGDEB2(("MyHtmlParser::decode_entities\n"));
|
|
... |
|
... |
400 |
parseMimeHeaderValue(content, p);
|
401 |
parseMimeHeaderValue(content, p);
|
401 |
map<string, string>::const_iterator k;
|
402 |
map<string, string>::const_iterator k;
|
402 |
if ((k = p.params.find(cstr_html_charset)) !=
|
403 |
if ((k = p.params.find(cstr_html_charset)) !=
|
403 |
p.params.end()) {
|
404 |
p.params.end()) {
|
404 |
charset = k->second;
|
405 |
charset = k->second;
|
|
|
406 |
if (!charset.empty() &&
|
405 |
if (!samecharset(charset, fromcharset)) {
|
407 |
!samecharset(charset, fromcharset)) {
|
406 |
LOGDEB1(("Doc http-equiv charset '%s' "
|
408 |
LOGDEB1(("Doc http-equiv charset '%s' "
|
407 |
"differs from dir deflt '%s'\n",
|
409 |
"differs from dir deflt '%s'\n",
|
408 |
charset.c_str(),
|
410 |
charset.c_str(),
|
409 |
fromcharset.c_str()));
|
411 |
fromcharset.c_str()));
|
410 |
throw false;
|
412 |
throw false;
|
|
... |
|
... |
416 |
string newcharset;
|
418 |
string newcharset;
|
417 |
if (get_parameter(cstr_html_charset, newcharset)) {
|
419 |
if (get_parameter(cstr_html_charset, newcharset)) {
|
418 |
// HTML5 added: <meta charset="...">
|
420 |
// HTML5 added: <meta charset="...">
|
419 |
lowercase_term(newcharset);
|
421 |
lowercase_term(newcharset);
|
420 |
charset = newcharset;
|
422 |
charset = newcharset;
|
|
|
423 |
if (!charset.empty() &&
|
421 |
if (!samecharset(charset, fromcharset)) {
|
424 |
!samecharset(charset, fromcharset)) {
|
422 |
LOGDEB1(("Doc html5 charset '%s' "
|
425 |
LOGDEB1(("Doc html5 charset '%s' "
|
423 |
"differs from dir deflt '%s'\n",
|
426 |
"differs from dir deflt '%s'\n",
|
424 |
charset.c_str(),
|
427 |
charset.c_str(),
|
425 |
fromcharset.c_str()));
|
428 |
fromcharset.c_str()));
|
426 |
throw false;
|
429 |
throw false;
|