Switch to unified view

a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
...
...
175
175
176
MyHtmlParser::MyHtmlParser()
176
MyHtmlParser::MyHtmlParser()
177
    : in_script_tag(false),
177
    : in_script_tag(false),
178
      in_style_tag(false),
178
      in_style_tag(false),
179
      in_pre_tag(false),
179
      in_pre_tag(false),
180
      in_title_tag(false),
180
      pending_space(false),
181
      pending_space(false),
181
      indexing_allowed(true)
182
      indexing_allowed(true)
182
{
183
{
183
    // The default html document charset is iso-8859-1. We'll update
184
    // The default html document charset is iso-8859-1. We'll update
184
    // this value from the encoding tag if found. Actually use cp1252 which
185
    // this value from the encoding tag if found. Actually use cp1252 which
...
...
254
// browser would display.
255
// browser would display.
255
// We keep whitespace inside <pre> tags
256
// We keep whitespace inside <pre> tags
256
void
257
void
257
MyHtmlParser::process_text(const string &text)
258
MyHtmlParser::process_text(const string &text)
258
{
259
{
259
    LOGDEB2(("process_text: pending_space %d txt [%s]\n", pending_space,
260
    LOGDEB2(("process_text: title %d script %d style %d pre %d "
261
       "pending_space %d txt [%s]\n", 
262
       in_title_tag,
263
       in_script_tag,
264
       in_style_tag,
265
       in_pre_tag,
266
       pending_space,
260
        text.c_str()));
267
         text.c_str()));
261
    CancelCheck::instance().checkCancel();
268
    CancelCheck::instance().checkCancel();
262
269
263
    if (!in_script_tag && !in_style_tag) {
270
    if (!in_script_tag && !in_style_tag) {
271
  if (in_title_tag) {
272
      titledump += text;
264
    if (!in_pre_tag) {
273
    } else if (!in_pre_tag) {
265
        string::size_type b = 0;
274
        string::size_type b = 0;
266
        bool only_space = true;
275
        bool only_space = true;
267
        while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
276
        while ((b = text.find_first_not_of(WHITESPACE, b)) != string::npos) {
268
        only_space = false;
277
        only_space = false;
269
        // If space specifically needed or chunk begins with
278
        // If space specifically needed or chunk begins with
...
...
459
        } else if (tag == "select") 
468
        } else if (tag == "select") 
460
        pending_space = true;
469
        pending_space = true;
461
        break;
470
        break;
462
    case 't':
471
    case 't':
463
        if (tag == "table" || tag == "td" || tag == "textarea" ||
472
        if (tag == "table" || tag == "td" || tag == "textarea" ||
473
      tag == "th") {
464
        tag == "th") pending_space = true;
474
        pending_space = true;
475
      } else if (tag == "title") {
476
      in_title_tag = true;
477
      }
465
        break;
478
        break;
466
    case 'u':
479
    case 'u':
467
        if (tag == "ul") pending_space = true;
480
        if (tag == "ul") pending_space = true;
468
        break;
481
        break;
469
    case 'x':
482
    case 'x':
...
...
540
        }
553
        }
541
        if (tag == "select") pending_space = true;
554
        if (tag == "select") pending_space = true;
542
        break;
555
        break;
543
    case 't':
556
    case 't':
544
        if (tag == "title") {
557
        if (tag == "title") {
558
      in_title_tag = false;
545
        if (meta.find("title") == meta.end()|| meta["title"].empty()) {
559
        if (meta.find("title") == meta.end()|| meta["title"].empty()) {
546
            meta["title"] = dump;
560
            meta["title"] = titledump;
547
            dump.clear();
561
            titledump.clear();
548
        }
562
        }
549
        break;
563
        break;
550
        }
564
        }
551
        if (tag == "table" || tag == "td" || tag == "textarea" ||
565
        if (tag == "table" || tag == "td" || tag == "textarea" ||
552
        tag == "th") pending_space = true;
566
        tag == "th") pending_space = true;