Switch to unified view

a/src/internfile/myhtmlparse.cpp b/src/internfile/myhtmlparse.cpp
1
/* This file was copied from omega-0.8.5 and modified */
1
/* This file was copied from omega-0.8.5->1.2.6 and modified */
2
2
3
/* myhtmlparse.cc: subclass of HtmlParser for extracting text
3
/* myhtmlparse.cc: subclass of HtmlParser for extracting text
4
 *
4
 *
5
 * ----START-LICENCE----
5
 * ----START-LICENCE----
6
 * Copyright 1999,2000,2001 BrightStation PLC
6
 * Copyright 1999,2000,2001 BrightStation PLC
...
...
285
        dump += text;
285
        dump += text;
286
    }
286
    }
287
    }
287
    }
288
}
288
}
289
289
290
void
290
bool
291
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
291
MyHtmlParser::opening_tag(const string &tag)
292
{
292
{
293
    LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
293
    LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
294
#if 0
294
#if 0
295
    cout << "TAG: " << tag << ": " << endl;
295
    cout << "TAG: " << tag << ": " << endl;
296
    map<string, string>::const_iterator x;
296
    map<string, string>::const_iterator x;
297
    for (x = p.begin(); x != p.end(); x++) {
297
    for (x = p.begin(); x != p.end(); x++) {
298
    cout << "  " << x->first << " -> '" << x->second << "'" << endl;
298
    cout << "  " << x->first << " -> '" << x->second << "'" << endl;
299
    }
299
    }
300
#endif
300
#endif
301
    if (tag.empty()) return;
301
    if (tag.empty()) return true;
302
    switch (tag[0]) {
302
    switch (tag[0]) {
303
    case 'a':
303
    case 'a':
304
        if (tag == "address") pending_space = true;
304
        if (tag == "address") pending_space = true;
305
        break;
305
        break;
306
    case 'b':
306
    case 'b':
307
        if (tag == "body") {
307
        if (tag == "body") {
308
      dump = "";
308
      dump.resize(0);
309
        in_body_tag = true;
309
        in_body_tag = true;
310
        break;
310
        break;
311
        }
311
        }
312
        if (tag == "blockquote" || tag == "br") {
312
        if (tag == "blockquote" || tag == "br") {
313
        dump += '\n';
313
        dump += '\n';
...
...
349
        pending_space = true;
349
        pending_space = true;
350
        }
350
        }
351
        break;
351
        break;
352
    case 'm':
352
    case 'm':
353
        if (tag == "meta") {
353
        if (tag == "meta") {
354
      map<string, string>::const_iterator i, j;
354
      string content;
355
      if ((i = p.find("content")) != p.end()) {
355
      if (get_parameter("content", content)) {
356
          if ((j = p.find("name")) != p.end()) {
356
          string name;
357
          string name = j->second;
357
          if (get_parameter("name", name)) {
358
            lowercase_term(name);
358
            lowercase_term(name);
359
            if (name == "date") {
359
            if (name == "date") {
360
                // Yes this doesnt exist. It's output by filters
360
                // Yes this doesnt exist. It's output by filters
361
                // And the format isn't even standard http/html
361
                // And the format isn't even standard http/html
362
                // FIXME
362
                // FIXME
363
              string tmp = i->second;
364
                decode_entities(tmp);
363
                decode_entities(content);
365
                struct tm tm;
364
                struct tm tm;
366
                if (strptime(tmp.c_str(), 
365
                if (strptime(content.c_str(), 
367
                     " %Y-%m-%d %H:%M:%S ", &tm) ||
366
                     " %Y-%m-%d %H:%M:%S ", &tm) ||
368
                strptime(tmp.c_str(), 
367
                strptime(content.c_str(), 
369
                     "%Y-%m-%dT%H:%M:%S", &tm)
368
                     "%Y-%m-%dT%H:%M:%S", &tm)
370
                ) {
369
                ) {
371
                char ascuxtime[100];
370
                char ascuxtime[100];
372
                sprintf(ascuxtime, "%ld", (long)mktime(&tm));
371
                sprintf(ascuxtime, "%ld", (long)mktime(&tm));
373
                dmtime = ascuxtime;
372
                dmtime = ascuxtime;
374
                }
373
                }
375
            } else if (name == "robots") {
374
            } else if (name == "robots") {
376
            } else {
375
            } else {
377
                if (!meta[name].empty())
376
                if (!meta[name].empty())
378
                meta[name] += ' ';
377
                meta[name] += ' ';
379
              string tmp = i->second;
380
                decode_entities(tmp);
378
                decode_entities(content);
381
                meta[name] += tmp;
379
                meta[name] += content;
382
            }
380
            }
383
          } else if ((j = p.find("http-equiv")) != p.end()) {
381
          } 
384
          string hequiv = j->second;
382
          string hdr;
383
          if (get_parameter("http-equiv", hdr)) {
385
            lowercase_term(hequiv);
384
            lowercase_term(hdr);
386
            if (hequiv == "content-type") {
385
            if (hdr == "content-type") {
387
              string value = i->second;
388
                MimeHeaderValue p;
386
                MimeHeaderValue p;
389
                parseMimeHeaderValue(value, p);
387
                parseMimeHeaderValue(content, p);
390
                map<string, string>::const_iterator k;
388
                map<string, string>::const_iterator k;
391
                if ((k = p.params.find("charset")) != 
389
                if ((k = p.params.find("charset")) != 
392
                p.params.end()) {
390
                p.params.end()) {
393
                charset = k->second;
391
                charset = k->second;
394
                if (!samecharset(charset, fromcharset)) {
392
                if (!samecharset(charset, fromcharset)) {
...
...
443
        break;
441
        break;
444
    case 'x':
442
    case 'x':
445
        if (tag == "xmp") pending_space = true;
443
        if (tag == "xmp") pending_space = true;
446
        break;
444
        break;
447
    }
445
    }
446
    return true;
448
}
447
}
449
448
450
void
449
bool
451
MyHtmlParser::closing_tag(const string &tag)
450
MyHtmlParser::closing_tag(const string &tag)
452
{
451
{
453
    LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
452
    LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
454
    if (tag.empty()) return;
453
    if (tag.empty()) return true;
455
    switch (tag[0]) {
454
    switch (tag[0]) {
456
    case 'a':
455
    case 'a':
457
        if (tag == "address") pending_space = true;
456
        if (tag == "address") pending_space = true;
458
        break;
457
        break;
459
    case 'b':
458
    case 'b':
460
        if (tag == "body") {
459
        if (tag == "body") {
461
        LOGDEB1(("Myhtmlparse: body close tag found\n"));
460
        LOGDEB1(("Myhtmlparse: body close tag found\n"));
462
        in_body_tag = false;
461
        in_body_tag = false;
463
      throw true;
462
      return false;
464
        }
463
        }
465
        if (tag == "blockquote" || tag == "br") pending_space = true;
464
        if (tag == "blockquote" || tag == "br") pending_space = true;
466
        break;
465
        break;
467
    case 'c':
466
    case 'c':
468
        if (tag == "center") pending_space = true;
467
        if (tag == "center") pending_space = true;
...
...
530
        break;
529
        break;
531
    case 'x':
530
    case 'x':
532
        if (tag == "xmp") pending_space = true;
531
        if (tag == "xmp") pending_space = true;
533
        break;
532
        break;
534
    }
533
    }
534
    return true;
535
}
535
}
536
536
537
// This gets called when hitting eof. 
537
// This gets called when hitting eof. 
538
// We used to do: 
538
// We used to do: 
539
//    > If the <body> is open, do
539
//    > If the <body> is open, do