Switch to unified view

a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
...
...
35
#include "myhtmlparse.h"
35
#include "myhtmlparse.h"
36
#include "indextext.h" // for lowercase_term()
36
#include "indextext.h" // for lowercase_term()
37
#include "mimeparse.h"
37
#include "mimeparse.h"
38
#include "smallut.h"
38
#include "smallut.h"
39
#include "cancelcheck.h"
39
#include "cancelcheck.h"
40
#include "debuglog.h"
40
#include "log.h"
41
#include "transcode.h"
41
#include "transcode.h"
42
42
43
static const string cstr_html_charset("charset");
43
static const string cstr_html_charset("charset");
44
static const string cstr_html_content("content");
44
static const string cstr_html_content("content");
45
45
...
...
191
    charset = "CP1252";
191
    charset = "CP1252";
192
}
192
}
193
193
194
void MyHtmlParser::decode_entities(string &s)
194
void MyHtmlParser::decode_entities(string &s)
195
{
195
{
196
    LOGDEB2(("MyHtmlParser::decode_entities\n"));
196
    LOGDEB2("MyHtmlParser::decode_entities\n" );
197
    // This has no meaning whatsoever if the character encoding is unknown,
197
    // This has no meaning whatsoever if the character encoding is unknown,
198
    // so don't do it. If charset known, caller has converted text to utf-8, 
198
    // so don't do it. If charset known, caller has converted text to utf-8, 
199
    // and this is also how we translate entities
199
    // and this is also how we translate entities
200
    //    if (tocharset != "utf-8")
200
    //    if (tocharset != "utf-8")
201
    //      return;
201
    //      return;
...
...
259
// browser would display.
259
// browser would display.
260
// We keep whitespace inside <pre> tags
260
// We keep whitespace inside <pre> tags
261
void
261
void
262
MyHtmlParser::process_text(const string &text)
262
MyHtmlParser::process_text(const string &text)
263
{
263
{
264
    LOGDEB2(("process_text: title %d script %d style %d pre %d "
264
    LOGDEB2("process_text: title "  << (in_title_tag) << " script "  << (in_script_tag) << " style "  << (in_style_tag) << " pre "  << (in_pre_tag) << " pending_space "  << (pending_space) << " txt ["  << (text) << "]\n" );
265
       "pending_space %d txt [%s]\n", 
266
       in_title_tag,
267
       in_script_tag,
268
       in_style_tag,
269
       in_pre_tag,
270
       pending_space,
271
       text.c_str()));
272
    CancelCheck::instance().checkCancel();
265
    CancelCheck::instance().checkCancel();
273
266
274
    if (!in_script_tag && !in_style_tag) {
267
    if (!in_script_tag && !in_style_tag) {
275
    if (in_title_tag) {
268
    if (in_title_tag) {
276
        titledump += text;
269
        titledump += text;
...
...
305
}
298
}
306
299
307
bool
300
bool
308
MyHtmlParser::opening_tag(const string &tag)
301
MyHtmlParser::opening_tag(const string &tag)
309
{
302
{
310
    LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
303
    LOGDEB2("opening_tag: ["  << (tag) << "]\n" );
311
#if 0
304
#if 0
312
    cout << "TAG: " << tag << ": " << endl;
305
    cout << "TAG: " << tag << ": " << endl;
313
    map<string, string>::const_iterator x;
306
    map<string, string>::const_iterator x;
314
    for (x = p.begin(); x != p.end(); x++) {
307
    for (x = p.begin(); x != p.end(); x++) {
315
    cout << "  " << x->first << " -> '" << x->second << "'" << endl;
308
    cout << "  " << x->first << " -> '" << x->second << "'" << endl;
...
...
417
                if ((k = p.params.find(cstr_html_charset)) != 
410
                if ((k = p.params.find(cstr_html_charset)) != 
418
                p.params.end()) {
411
                p.params.end()) {
419
                charset = k->second;
412
                charset = k->second;
420
                if (!charset.empty() && 
413
                if (!charset.empty() && 
421
                    !samecharset(charset, fromcharset)) {
414
                    !samecharset(charset, fromcharset)) {
422
                  LOGDEB1(("Doc http-equiv charset '%s' "
415
                  LOGDEB1("Doc http-equiv charset '"  << (charset) << "' differs from dir deflt '"  << (fromcharset) << "'\n" );
423
                      "differs from dir deflt '%s'\n",
424
                      charset.c_str(), 
425
                      fromcharset.c_str()));
426
                    throw false;
416
                    throw false;
427
                }
417
                }
428
                }
418
                }
429
            }
419
            }
430
            }
420
            }
...
...
434
            // HTML5 added: <meta charset="...">
424
            // HTML5 added: <meta charset="...">
435
            lowercase_term(newcharset);
425
            lowercase_term(newcharset);
436
            charset = newcharset;
426
            charset = newcharset;
437
            if (!charset.empty() && 
427
            if (!charset.empty() && 
438
            !samecharset(charset, fromcharset)) {
428
            !samecharset(charset, fromcharset)) {
439
          LOGDEB1(("Doc html5 charset '%s' "
429
          LOGDEB1("Doc html5 charset '"  << (charset) << "' differs from dir deflt '"  << (fromcharset) << "'\n" );
440
               "differs from dir deflt '%s'\n",
441
               charset.c_str(), 
442
               fromcharset.c_str()));
443
            throw false;
430
            throw false;
444
            }
431
            }
445
        }
432
        }
446
        break;
433
        break;
447
        } else if (tag == "marquee" || tag == "menu" || tag == "multicol")
434
        } else if (tag == "marquee" || tag == "menu" || tag == "multicol")
...
...
492
}
479
}
493
480
494
bool
481
bool
495
MyHtmlParser::closing_tag(const string &tag)
482
MyHtmlParser::closing_tag(const string &tag)
496
{
483
{
497
    LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
484
    LOGDEB2("closing_tag: ["  << (tag) << "]\n" );
498
    if (tag.empty()) return true;
485
    if (tag.empty()) return true;
499
    switch (tag[0]) {
486
    switch (tag[0]) {
500
    case 'a':
487
    case 'a':
501
        if (tag == "address") pending_space = true;
488
        if (tag == "address") pending_space = true;
502
        break;
489
        break;
...
...
590
// better than nothing.
577
// better than nothing.
591
void
578
void
592
MyHtmlParser::do_eof()
579
MyHtmlParser::do_eof()
593
{
580
{
594
}
581
}
582