|
a/src/internfile/myhtmlparse.cpp |
|
b/src/internfile/myhtmlparse.cpp |
|
... |
|
... |
35 |
#include "myhtmlparse.h"
|
35 |
#include "myhtmlparse.h"
|
36 |
#include "indextext.h" // for lowercase_term()
|
36 |
#include "indextext.h" // for lowercase_term()
|
37 |
#include "mimeparse.h"
|
37 |
#include "mimeparse.h"
|
38 |
#include "smallut.h"
|
38 |
#include "smallut.h"
|
39 |
#include "cancelcheck.h"
|
39 |
#include "cancelcheck.h"
|
40 |
#include "debuglog.h"
|
40 |
#include "log.h"
|
41 |
#include "transcode.h"
|
41 |
#include "transcode.h"
|
42 |
|
42 |
|
43 |
static const string cstr_html_charset("charset");
|
43 |
static const string cstr_html_charset("charset");
|
44 |
static const string cstr_html_content("content");
|
44 |
static const string cstr_html_content("content");
|
45 |
|
45 |
|
|
... |
|
... |
191 |
charset = "CP1252";
|
191 |
charset = "CP1252";
|
192 |
}
|
192 |
}
|
193 |
|
193 |
|
194 |
void MyHtmlParser::decode_entities(string &s)
|
194 |
void MyHtmlParser::decode_entities(string &s)
|
195 |
{
|
195 |
{
|
196 |
LOGDEB2(("MyHtmlParser::decode_entities\n"));
|
196 |
LOGDEB2("MyHtmlParser::decode_entities\n" );
|
197 |
// This has no meaning whatsoever if the character encoding is unknown,
|
197 |
// This has no meaning whatsoever if the character encoding is unknown,
|
198 |
// so don't do it. If charset known, caller has converted text to utf-8,
|
198 |
// so don't do it. If charset known, caller has converted text to utf-8,
|
199 |
// and this is also how we translate entities
|
199 |
// and this is also how we translate entities
|
200 |
// if (tocharset != "utf-8")
|
200 |
// if (tocharset != "utf-8")
|
201 |
// return;
|
201 |
// return;
|
|
... |
|
... |
259 |
// browser would display.
|
259 |
// browser would display.
|
260 |
// We keep whitespace inside <pre> tags
|
260 |
// We keep whitespace inside <pre> tags
|
261 |
void
|
261 |
void
|
262 |
MyHtmlParser::process_text(const string &text)
|
262 |
MyHtmlParser::process_text(const string &text)
|
263 |
{
|
263 |
{
|
264 |
LOGDEB2(("process_text: title %d script %d style %d pre %d "
|
264 |
LOGDEB2("process_text: title " << (in_title_tag) << " script " << (in_script_tag) << " style " << (in_style_tag) << " pre " << (in_pre_tag) << " pending_space " << (pending_space) << " txt [" << (text) << "]\n" );
|
265 |
"pending_space %d txt [%s]\n",
|
|
|
266 |
in_title_tag,
|
|
|
267 |
in_script_tag,
|
|
|
268 |
in_style_tag,
|
|
|
269 |
in_pre_tag,
|
|
|
270 |
pending_space,
|
|
|
271 |
text.c_str()));
|
|
|
272 |
CancelCheck::instance().checkCancel();
|
265 |
CancelCheck::instance().checkCancel();
|
273 |
|
266 |
|
274 |
if (!in_script_tag && !in_style_tag) {
|
267 |
if (!in_script_tag && !in_style_tag) {
|
275 |
if (in_title_tag) {
|
268 |
if (in_title_tag) {
|
276 |
titledump += text;
|
269 |
titledump += text;
|
|
... |
|
... |
305 |
}
|
298 |
}
|
306 |
|
299 |
|
307 |
bool
|
300 |
bool
|
308 |
MyHtmlParser::opening_tag(const string &tag)
|
301 |
MyHtmlParser::opening_tag(const string &tag)
|
309 |
{
|
302 |
{
|
310 |
LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
|
303 |
LOGDEB2("opening_tag: [" << (tag) << "]\n" );
|
311 |
#if 0
|
304 |
#if 0
|
312 |
cout << "TAG: " << tag << ": " << endl;
|
305 |
cout << "TAG: " << tag << ": " << endl;
|
313 |
map<string, string>::const_iterator x;
|
306 |
map<string, string>::const_iterator x;
|
314 |
for (x = p.begin(); x != p.end(); x++) {
|
307 |
for (x = p.begin(); x != p.end(); x++) {
|
315 |
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
308 |
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
|
... |
|
... |
417 |
if ((k = p.params.find(cstr_html_charset)) !=
|
410 |
if ((k = p.params.find(cstr_html_charset)) !=
|
418 |
p.params.end()) {
|
411 |
p.params.end()) {
|
419 |
charset = k->second;
|
412 |
charset = k->second;
|
420 |
if (!charset.empty() &&
|
413 |
if (!charset.empty() &&
|
421 |
!samecharset(charset, fromcharset)) {
|
414 |
!samecharset(charset, fromcharset)) {
|
422 |
LOGDEB1(("Doc http-equiv charset '%s' "
|
415 |
LOGDEB1("Doc http-equiv charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
|
423 |
"differs from dir deflt '%s'\n",
|
|
|
424 |
charset.c_str(),
|
|
|
425 |
fromcharset.c_str()));
|
|
|
426 |
throw false;
|
416 |
throw false;
|
427 |
}
|
417 |
}
|
428 |
}
|
418 |
}
|
429 |
}
|
419 |
}
|
430 |
}
|
420 |
}
|
|
... |
|
... |
434 |
// HTML5 added: <meta charset="...">
|
424 |
// HTML5 added: <meta charset="...">
|
435 |
lowercase_term(newcharset);
|
425 |
lowercase_term(newcharset);
|
436 |
charset = newcharset;
|
426 |
charset = newcharset;
|
437 |
if (!charset.empty() &&
|
427 |
if (!charset.empty() &&
|
438 |
!samecharset(charset, fromcharset)) {
|
428 |
!samecharset(charset, fromcharset)) {
|
439 |
LOGDEB1(("Doc html5 charset '%s' "
|
429 |
LOGDEB1("Doc html5 charset '" << (charset) << "' differs from dir deflt '" << (fromcharset) << "'\n" );
|
440 |
"differs from dir deflt '%s'\n",
|
|
|
441 |
charset.c_str(),
|
|
|
442 |
fromcharset.c_str()));
|
|
|
443 |
throw false;
|
430 |
throw false;
|
444 |
}
|
431 |
}
|
445 |
}
|
432 |
}
|
446 |
break;
|
433 |
break;
|
447 |
} else if (tag == "marquee" || tag == "menu" || tag == "multicol")
|
434 |
} else if (tag == "marquee" || tag == "menu" || tag == "multicol")
|
|
... |
|
... |
492 |
}
|
479 |
}
|
493 |
|
480 |
|
494 |
bool
|
481 |
bool
|
495 |
MyHtmlParser::closing_tag(const string &tag)
|
482 |
MyHtmlParser::closing_tag(const string &tag)
|
496 |
{
|
483 |
{
|
497 |
LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
|
484 |
LOGDEB2("closing_tag: [" << (tag) << "]\n" );
|
498 |
if (tag.empty()) return true;
|
485 |
if (tag.empty()) return true;
|
499 |
switch (tag[0]) {
|
486 |
switch (tag[0]) {
|
500 |
case 'a':
|
487 |
case 'a':
|
501 |
if (tag == "address") pending_space = true;
|
488 |
if (tag == "address") pending_space = true;
|
502 |
break;
|
489 |
break;
|
|
... |
|
... |
590 |
// better than nothing.
|
577 |
// better than nothing.
|
591 |
void
|
578 |
void
|
592 |
MyHtmlParser::do_eof()
|
579 |
MyHtmlParser::do_eof()
|
593 |
{
|
580 |
{
|
594 |
}
|
581 |
}
|
|
|
582 |
|