|
a/src/internfile/myhtmlparse.cpp |
|
b/src/internfile/myhtmlparse.cpp |
|
... |
|
... |
174 |
static NamedEntsInitializer namedEntsInitializerInstance;
|
174 |
static NamedEntsInitializer namedEntsInitializerInstance;
|
175 |
|
175 |
|
176 |
MyHtmlParser::MyHtmlParser()
|
176 |
MyHtmlParser::MyHtmlParser()
|
177 |
: in_script_tag(false),
|
177 |
: in_script_tag(false),
|
178 |
in_style_tag(false),
|
178 |
in_style_tag(false),
|
179 |
in_body_tag(false),
|
|
|
180 |
in_pre_tag(false),
|
179 |
in_pre_tag(false),
|
181 |
pending_space(false),
|
180 |
pending_space(false),
|
182 |
indexing_allowed(true)
|
181 |
indexing_allowed(true)
|
183 |
{
|
182 |
{
|
184 |
// The default html document charset is iso-8859-1. We'll update
|
183 |
// The default html document charset is iso-8859-1. We'll update
|
|
... |
|
... |
306 |
switch (tag[0]) {
|
305 |
switch (tag[0]) {
|
307 |
case 'a':
|
306 |
case 'a':
|
308 |
if (tag == "address") pending_space = true;
|
307 |
if (tag == "address") pending_space = true;
|
309 |
break;
|
308 |
break;
|
310 |
case 'b':
|
309 |
case 'b':
|
311 |
if (tag == "body") {
|
310 |
// body: some bad docs have several opening body tags and
|
312 |
dump.resize(0);
|
311 |
// even text before the body is displayed by Opera and
|
313 |
in_body_tag = true;
|
312 |
// Firefox. We used to reset the dump each time we saw a
|
314 |
break;
|
313 |
// body tag, but I can't see any reason to do so.
|
315 |
}
|
|
|
316 |
if (tag == "blockquote" || tag == "br") {
|
314 |
if (tag == "blockquote" || tag == "br") {
|
317 |
dump += '\n';
|
315 |
dump += '\n';
|
318 |
pending_space = true;
|
316 |
pending_space = true;
|
319 |
}
|
317 |
}
|
320 |
break;
|
318 |
break;
|
|
... |
|
... |
473 |
if (tag == "address") pending_space = true;
|
471 |
if (tag == "address") pending_space = true;
|
474 |
break;
|
472 |
break;
|
475 |
case 'b':
|
473 |
case 'b':
|
476 |
if (tag == "body") {
|
474 |
if (tag == "body") {
|
477 |
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
475 |
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
478 |
in_body_tag = false;
|
476 |
// We used to signal an end of doc here by returning
|
479 |
return false;
|
477 |
// false but the browsers just ignore body and html
|
|
|
478 |
// closing tags if there is further text, so it seems right
|
|
|
479 |
// to do the same
|
|
|
480 |
break;
|
480 |
}
|
481 |
}
|
481 |
if (tag == "blockquote" || tag == "br") pending_space = true;
|
482 |
if (tag == "blockquote" || tag == "br") pending_space = true;
|
482 |
break;
|
483 |
break;
|
483 |
case 'c':
|
484 |
case 'c':
|
484 |
if (tag == "center") pending_space = true;
|
485 |
if (tag == "center") pending_space = true;
|
|
... |
|
... |
560 |
// But we don't throw any more. Whatever text we've extracted up to now is
|
561 |
// But we don't throw any more. Whatever text we've extracted up to now is
|
561 |
// better than nothing.
|
562 |
// better than nothing.
|
562 |
void
|
563 |
void
|
563 |
MyHtmlParser::do_eof()
|
564 |
MyHtmlParser::do_eof()
|
564 |
{
|
565 |
{
|
565 |
// if (!in_body_tag)
|
|
|
566 |
// throw(false);
|
|
|
567 |
}
|
566 |
}
|