|
a/src/internfile/myhtmlparse.cpp |
|
b/src/internfile/myhtmlparse.cpp |
|
... |
|
... |
358 |
if (get_parameter(cstr_html_content, content)) {
|
358 |
if (get_parameter(cstr_html_content, content)) {
|
359 |
string name;
|
359 |
string name;
|
360 |
if (get_parameter("name", name)) {
|
360 |
if (get_parameter("name", name)) {
|
361 |
lowercase_term(name);
|
361 |
lowercase_term(name);
|
362 |
if (name == "date") {
|
362 |
if (name == "date") {
|
363 |
// Yes this doesnt exist. It's output by filters
|
363 |
// Specific to Recoll filters.
|
364 |
// And the format isn't even standard http/html
|
|
|
365 |
// FIXME
|
|
|
366 |
decode_entities(content);
|
364 |
decode_entities(content);
|
367 |
struct tm tm;
|
365 |
struct tm tm;
|
368 |
if (strptime(content.c_str(),
|
366 |
if (strptime(content.c_str(),
|
369 |
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
367 |
" %Y-%m-%d %H:%M:%S ", &tm) ||
|
370 |
strptime(content.c_str(),
|
368 |
strptime(content.c_str(),
|
|
... |
|
... |
374 |
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
|
372 |
sprintf(ascuxtime, "%ld", (long)mktime(&tm));
|
375 |
dmtime = ascuxtime;
|
373 |
dmtime = ascuxtime;
|
376 |
}
|
374 |
}
|
377 |
} else if (name == "robots") {
|
375 |
} else if (name == "robots") {
|
378 |
} else {
|
376 |
} else {
|
|
|
377 |
string markup;
|
|
|
378 |
bool ishtml = false;
|
|
|
379 |
if (get_parameter("markup", markup)) {
|
|
|
380 |
if (!stringlowercmp("html", markup)) {
|
|
|
381 |
ishtml = true;
|
|
|
382 |
}
|
|
|
383 |
}
|
379 |
if (!meta[name].empty())
|
384 |
if (!meta[name].empty())
|
380 |
meta[name] += ' ';
|
385 |
meta[name] += ' ';
|
381 |
decode_entities(content);
|
386 |
decode_entities(content);
|
382 |
meta[name] += content;
|
387 |
meta[name] += content;
|
|
|
388 |
if (ishtml &&
|
|
|
389 |
meta[name].compare(0, cstr_fldhtm.size(),
|
|
|
390 |
cstr_fldhtm)) {
|
|
|
391 |
meta[name].insert(0, cstr_fldhtm);
|
|
|
392 |
}
|
383 |
}
|
393 |
}
|
384 |
}
|
394 |
}
|
385 |
string hdr;
|
395 |
string hdr;
|
386 |
if (get_parameter("http-equiv", hdr)) {
|
396 |
if (get_parameter("http-equiv", hdr)) {
|
387 |
lowercase_term(hdr);
|
397 |
lowercase_term(hdr);
|
|
... |
|
... |
415 |
fromcharset.c_str()));
|
425 |
fromcharset.c_str()));
|
416 |
throw false;
|
426 |
throw false;
|
417 |
}
|
427 |
}
|
418 |
}
|
428 |
}
|
419 |
break;
|
429 |
break;
|
420 |
}
|
|
|
421 |
if (tag == "marquee" || tag == "menu" || tag == "multicol")
|
430 |
} else if (tag == "marquee" || tag == "menu" || tag == "multicol")
|
422 |
pending_space = true;
|
431 |
pending_space = true;
|
423 |
break;
|
432 |
break;
|
424 |
case 'o':
|
433 |
case 'o':
|
425 |
if (tag == "ol" || tag == "option") pending_space = true;
|
434 |
if (tag == "ol" || tag == "option") pending_space = true;
|
426 |
break;
|
435 |
break;
|
|
... |
|
... |
439 |
break;
|
448 |
break;
|
440 |
case 's':
|
449 |
case 's':
|
441 |
if (tag == "style") {
|
450 |
if (tag == "style") {
|
442 |
in_style_tag = true;
|
451 |
in_style_tag = true;
|
443 |
break;
|
452 |
break;
|
444 |
}
|
|
|
445 |
if (tag == "script") {
|
453 |
} else if (tag == "script") {
|
446 |
in_script_tag = true;
|
454 |
in_script_tag = true;
|
447 |
break;
|
455 |
break;
|
448 |
}
|
456 |
} else if (tag == "select")
|
449 |
if (tag == "select") pending_space = true;
|
457 |
pending_space = true;
|
450 |
break;
|
458 |
break;
|
451 |
case 't':
|
459 |
case 't':
|
452 |
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
460 |
if (tag == "table" || tag == "td" || tag == "textarea" ||
|
453 |
tag == "th") pending_space = true;
|
461 |
tag == "th") pending_space = true;
|
454 |
break;
|
462 |
break;
|