Switch to unified view

a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
...
...
174
static NamedEntsInitializer namedEntsInitializerInstance;
174
static NamedEntsInitializer namedEntsInitializerInstance;
175
175
176
MyHtmlParser::MyHtmlParser()
176
MyHtmlParser::MyHtmlParser()
177
    : in_script_tag(false),
177
    : in_script_tag(false),
178
      in_style_tag(false),
178
      in_style_tag(false),
179
      in_body_tag(false),
180
      in_pre_tag(false),
179
      in_pre_tag(false),
181
      pending_space(false),
180
      pending_space(false),
182
      indexing_allowed(true)
181
      indexing_allowed(true)
183
{
182
{
184
    // The default html document charset is iso-8859-1. We'll update
183
    // The default html document charset is iso-8859-1. We'll update
...
...
306
    switch (tag[0]) {
305
    switch (tag[0]) {
307
    case 'a':
306
    case 'a':
308
        if (tag == "address") pending_space = true;
307
        if (tag == "address") pending_space = true;
309
        break;
308
        break;
310
    case 'b':
309
    case 'b':
311
      if (tag == "body") {
310
      // body: some bad docs have several opening body tags and
312
      dump.resize(0);
311
      // even text before the body is displayed by Opera and
313
      in_body_tag = true;
312
      // Firefox.  We used to reset the dump each time we saw a
314
      break;
313
      // body tag, but I can't see any reason to do so.
315
      }
316
        if (tag == "blockquote" || tag == "br") {
314
        if (tag == "blockquote" || tag == "br") {
317
        dump += '\n';
315
        dump += '\n';
318
        pending_space = true;
316
        pending_space = true;
319
        }
317
        }
320
        break;
318
        break;
...
...
473
        if (tag == "address") pending_space = true;
471
        if (tag == "address") pending_space = true;
474
        break;
472
        break;
475
    case 'b':
473
    case 'b':
476
        if (tag == "body") {
474
        if (tag == "body") {
477
        LOGDEB1(("Myhtmlparse: body close tag found\n"));
475
        LOGDEB1(("Myhtmlparse: body close tag found\n"));
478
      in_body_tag = false;
476
      // We used to signal an end of doc here by returning
479
      return false;
477
      // false but the browsers just ignore body and html
478
      // closing tags if there is further text, so it seems right
479
      // to do the same
480
      break;
480
        }
481
        }
481
        if (tag == "blockquote" || tag == "br") pending_space = true;
482
        if (tag == "blockquote" || tag == "br") pending_space = true;
482
        break;
483
        break;
483
    case 'c':
484
    case 'c':
484
        if (tag == "center") pending_space = true;
485
        if (tag == "center") pending_space = true;
...
...
560
// But we don't throw any more. Whatever text we've extracted up to now is
561
// But we don't throw any more. Whatever text we've extracted up to now is
561
// better than nothing.
562
// better than nothing.
562
void
563
void
563
MyHtmlParser::do_eof()
564
MyHtmlParser::do_eof()
564
{
565
{
565
    //    if (!in_body_tag)
566
    //    throw(false);
567
}
566
}