|
a/src/internfile/myhtmlparse.cpp |
|
b/src/internfile/myhtmlparse.cpp |
1 |
/* This file was copied from omega-0.8.5 and modified */
|
1 |
/* This file was copied from omega-0.8.5->1.2.6 and modified */
|
2 |
|
2 |
|
3 |
/* myhtmlparse.cc: subclass of HtmlParser for extracting text
|
3 |
/* myhtmlparse.cc: subclass of HtmlParser for extracting text
|
4 |
*
|
4 |
*
|
5 |
* ----START-LICENCE----
|
5 |
* ----START-LICENCE----
|
6 |
* Copyright 1999,2000,2001 BrightStation PLC
|
6 |
* Copyright 1999,2000,2001 BrightStation PLC
|
|
... |
|
... |
285 |
dump += text;
|
285 |
dump += text;
|
286 |
}
|
286 |
}
|
287 |
}
|
287 |
}
|
288 |
}
|
288 |
}
|
289 |
|
289 |
|
290 |
void
|
290 |
bool
|
291 |
MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)
|
291 |
MyHtmlParser::opening_tag(const string &tag)
|
292 |
{
|
292 |
{
|
293 |
LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
|
293 |
LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
|
294 |
#if 0
|
294 |
#if 0
|
295 |
cout << "TAG: " << tag << ": " << endl;
|
295 |
cout << "TAG: " << tag << ": " << endl;
|
296 |
map<string, string>::const_iterator x;
|
296 |
map<string, string>::const_iterator x;
|
297 |
for (x = p.begin(); x != p.end(); x++) {
|
297 |
for (x = p.begin(); x != p.end(); x++) {
|
298 |
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
298 |
cout << " " << x->first << " -> '" << x->second << "'" << endl;
|
299 |
}
|
299 |
}
|
300 |
#endif
|
300 |
#endif
|
301 |
if (tag.empty()) return;
|
301 |
if (tag.empty()) return true;
|
302 |
switch (tag[0]) {
|
302 |
switch (tag[0]) {
|
303 |
case 'a':
|
303 |
case 'a':
|
304 |
if (tag == "address") pending_space = true;
|
304 |
if (tag == "address") pending_space = true;
|
305 |
break;
|
305 |
break;
|
306 |
case 'b':
|
306 |
case 'b':
|
307 |
if (tag == "body") {
|
307 |
if (tag == "body") {
|
308 |
dump = "";
|
308 |
dump.resize(0);
|
309 |
in_body_tag = true;
|
309 |
in_body_tag = true;
|
310 |
break;
|
310 |
break;
|
311 |
}
|
311 |
}
|
312 |
if (tag == "blockquote" || tag == "br") {
|
312 |
if (tag == "blockquote" || tag == "br") {
|
313 |
dump += '\n';
|
313 |
dump += '\n';
|
|
... |
|
... |
349 |
pending_space = true;
|
349 |
pending_space = true;
|
350 |
}
|
350 |
}
|
351 |
break;
|
351 |
break;
|
352 |
case 'm':
|
352 |
case 'm':
|
353 |
if (tag == "meta") {
|
353 |
if (tag == "meta") {
|
354 |
map<string, string>::const_iterator i, j;
|
354 |
string content;
|
355 |
if ((i = p.find("content")) != p.end()) {
|
355 |
if (get_parameter("content", content)) {
|
356 |
if ((j = p.find("name")) != p.end()) {
|
356 |
string name;
|
357 |
string name = j->second;
|
357 |
if (get_parameter("name", name)) {
|
358 |
lowercase_term(name);
|
358 |
lowercase_term(name);
|
359 |
if (name == "date") {
|
359 |
if (name == "date") {
|
360 |
// Yes this doesnt exist. It's output by filters
|
360 |
// Yes this doesnt exist. It's output by filters
|
361 |
// And the format isn't even standard http/html
|
361 |
// And the format isn't even standard http/html
|
362 |
// FIXME
|
362 |
// FIXME
|
363 |
string tmp = i->second;
|
|
|
364 |
decode_entities(tmp);
|
363 |
decode_entities(content);
|
365 |
struct tm tm;
|
364 |
struct tm tm;
|
366 |
if (strptime(tmp.c_str(),
|
365 |
if (strptime(content.c_str(),
|
367 |
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
366 |
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
368 |
strptime(tmp.c_str(),
|
367 |
strptime(content.c_str(),
|
369 |
"%Y-%m-%dT%H:%M:%S", &tm)
|
368 |
"%Y-%m-%dT%H:%M:%S", &tm)
|
370 |
) {
|
369 |
) {
|
371 |
char ascuxtime[100];
|
370 |
char ascuxtime[100];
|
372 |
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
|
371 |
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
|
373 |
dmtime = ascuxtime;
|
372 |
dmtime = ascuxtime;
|
374 |
}
|
373 |
}
|
375 |
} else if (name == "robots") {
|
374 |
} else if (name == "robots") {
|
376 |
} else {
|
375 |
} else {
|
377 |
if (!meta[name].empty())
|
376 |
if (!meta[name].empty())
|
378 |
meta[name] += ' ';
|
377 |
meta[name] += ' ';
|
379 |
string tmp = i->second;
|
|
|
380 |
decode_entities(tmp);
|
378 |
decode_entities(content);
|
381 |
meta[name] += tmp;
|
379 |
meta[name] += content;
|
382 |
}
|
380 |
}
|
383 |
} else if ((j = p.find("http-equiv")) != p.end()) {
|
381 |
}
|
384 |
string hequiv = j->second;
|
382 |
string hdr;
|
|
|
383 |
if (get_parameter("http-equiv", hdr)) {
|
385 |
lowercase_term(hequiv);
|
384 |
lowercase_term(hdr);
|
386 |
if (hequiv == "content-type") {
|
385 |
if (hdr == "content-type") {
|
387 |
string value = i->second;
|
|
|
388 |
MimeHeaderValue p;
|
386 |
MimeHeaderValue p;
|
389 |
parseMimeHeaderValue(value, p);
|
387 |
parseMimeHeaderValue(content, p);
|
390 |
map<string, string>::const_iterator k;
|
388 |
map<string, string>::const_iterator k;
|
391 |
if ((k = p.params.find("charset")) !=
|
389 |
if ((k = p.params.find("charset")) !=
|
392 |
p.params.end()) {
|
390 |
p.params.end()) {
|
393 |
charset = k->second;
|
391 |
charset = k->second;
|
394 |
if (!samecharset(charset, fromcharset)) {
|
392 |
if (!samecharset(charset, fromcharset)) {
|
|
... |
|
... |
443 |
break;
|
441 |
break;
|
444 |
case 'x':
|
442 |
case 'x':
|
445 |
if (tag == "xmp") pending_space = true;
|
443 |
if (tag == "xmp") pending_space = true;
|
446 |
break;
|
444 |
break;
|
447 |
}
|
445 |
}
|
|
|
446 |
return true;
|
448 |
}
|
447 |
}
|
449 |
|
448 |
|
450 |
void
|
449 |
bool
|
451 |
MyHtmlParser::closing_tag(const string &tag)
|
450 |
MyHtmlParser::closing_tag(const string &tag)
|
452 |
{
|
451 |
{
|
453 |
LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
|
452 |
LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
|
454 |
if (tag.empty()) return;
|
453 |
if (tag.empty()) return true;
|
455 |
switch (tag[0]) {
|
454 |
switch (tag[0]) {
|
456 |
case 'a':
|
455 |
case 'a':
|
457 |
if (tag == "address") pending_space = true;
|
456 |
if (tag == "address") pending_space = true;
|
458 |
break;
|
457 |
break;
|
459 |
case 'b':
|
458 |
case 'b':
|
460 |
if (tag == "body") {
|
459 |
if (tag == "body") {
|
461 |
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
460 |
LOGDEB1(("Myhtmlparse: body close tag found\n"));
|
462 |
in_body_tag = false;
|
461 |
in_body_tag = false;
|
463 |
throw true;
|
462 |
return false;
|
464 |
}
|
463 |
}
|
465 |
if (tag == "blockquote" || tag == "br") pending_space = true;
|
464 |
if (tag == "blockquote" || tag == "br") pending_space = true;
|
466 |
break;
|
465 |
break;
|
467 |
case 'c':
|
466 |
case 'c':
|
468 |
if (tag == "center") pending_space = true;
|
467 |
if (tag == "center") pending_space = true;
|
|
... |
|
... |
530 |
break;
|
529 |
break;
|
531 |
case 'x':
|
530 |
case 'x':
|
532 |
if (tag == "xmp") pending_space = true;
|
531 |
if (tag == "xmp") pending_space = true;
|
533 |
break;
|
532 |
break;
|
534 |
}
|
533 |
}
|
|
|
534 |
return true;
|
535 |
}
|
535 |
}
|
536 |
|
536 |
|
537 |
// This gets called when hitting eof.
|
537 |
// This gets called when hitting eof.
|
538 |
// We used to do:
|
538 |
// We used to do:
|
539 |
// > If the <body> is open, do
|
539 |
// > If the <body> is open, do
|