|
a/src/internfile/myhtmlparse.cpp |
|
b/src/internfile/myhtmlparse.cpp |
|
... |
|
... |
175 |
|
175 |
|
176 |
MyHtmlParser::MyHtmlParser()
|
176 |
MyHtmlParser::MyHtmlParser()
|
177 |
: in_script_tag(false),
|
177 |
: in_script_tag(false),
|
178 |
in_style_tag(false),
|
178 |
in_style_tag(false),
|
179 |
in_pre_tag(false),
|
179 |
in_pre_tag(false),
|
|
|
180 |
in_title_tag(false),
|
180 |
pending_space(false),
|
181 |
pending_space(false),
|
181 |
indexing_allowed(true)
|
182 |
indexing_allowed(true)
|
182 |
{
|
183 |
{
|
183 |
// The default html document charset is iso-8859-1. We'll update
|
184 |
// The default html document charset is iso-8859-1. We'll update
|
184 |
// this value from the encoding tag if found. Actually use cp1252 which
|
185 |
// this value from the encoding tag if found. Actually use cp1252 which
|
|
... |
|
... |
254 |
// browser would display.
|
255 |
// browser would display.
|
255 |
// We keep whitespace inside <pre> tags
|
256 |
// We keep whitespace inside <pre> tags
|
256 |
void
|
257 |
void
|
257 |
MyHtmlParser::process_text(const string &text)
|
258 |
MyHtmlParser::process_text(const string &text)
|
258 |
{
|
259 |
{
|
259 |
LOGDEB2(("process_text: pending_space %d txt [%s]\n", pending_space,
|
260 |
LOGDEB2(("process_text: title %d script %d style %d pre %d "
|
|
|
261 |
"pending_space %d txt [%s]\n",
|
|
|
262 |
in_title_tag,
|
|
|
263 |
in_script_tag,
|
|
|
264 |
in_style_tag,
|
|
|
265 |
in_pre_tag,
|
|
|
266 |
pending_space,
|
260 |
text.c_str()));
|
267 |
text.c_str()));
|
261 |
CancelCheck::instance().checkCancel();
|
268 |
CancelCheck::instance().checkCancel();
|
262 |
|
269 |
|
263 |
if (!in_script_tag && !in_style_tag) {
|
270 |
if (!in_script_tag && !in_style_tag) {
|
|
|
271 |
if (in_title_tag) {
|
|
|
272 |
titledump += text;
|
264 |
if (!in_pre_tag) {
|
273 |
} else if (!in_pre_tag) {
|
265 |
string::size_type b = 0;
|
274 |
string::size_type b = 0;
|
266 |
bool only_space = true;
|
275 |
bool only_space = true;
|
267 |
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
|
276 |
while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
|
268 |
only_space = false;
|
277 |
only_space = false;
|
269 |
// If space specifically needed or chunk begins with
|
278 |
// If space specifically needed or chunk begins with
|
|
... |
|
... |
459 |
} else if (tag == "select")
|
468 |
} else if (tag == "select")
|
460 |
pending_space = true;
|
469 |
pending_space = true;
|
461 |
break;
|
470 |
break;
|
462 |
case 't':
|
471 |
case 't':
|
463 |
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
472 |
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
|
|
473 |
tag == "th") {
|
464 |
tag == "th") pending_space = true;
|
474 |
pending_space = true;
|
|
|
475 |
} else if (tag == "title") {
|
|
|
476 |
in_title_tag = true;
|
|
|
477 |
}
|
465 |
break;
|
478 |
break;
|
466 |
case 'u':
|
479 |
case 'u':
|
467 |
if (tag == "ul") pending_space = true;
|
480 |
if (tag == "ul") pending_space = true;
|
468 |
break;
|
481 |
break;
|
469 |
case 'x':
|
482 |
case 'x':
|
|
... |
|
... |
540 |
}
|
553 |
}
|
541 |
if (tag == "select") pending_space = true;
|
554 |
if (tag == "select") pending_space = true;
|
542 |
break;
|
555 |
break;
|
543 |
case 't':
|
556 |
case 't':
|
544 |
if (tag == "title") {
|
557 |
if (tag == "title") {
|
|
|
558 |
in_title_tag = false;
|
545 |
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
|
559 |
if (meta.find("title") == meta.end()|| meta["title"].empty()) {
|
546 |
meta["title"] = dump;
|
560 |
meta["title"] = titledump;
|
547 |
dump.clear();
|
561 |
titledump.clear();
|
548 |
}
|
562 |
}
|
549 |
break;
|
563 |
break;
|
550 |
}
|
564 |
}
|
551 |
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
565 |
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
552 |
tag == "th") pending_space = true;
|
566 |
tag == "th") pending_space = true;
|