recoll / Code / Diff of /src/internfile/myhtmlparse.cpp

Diff of /src/internfile/myhtmlparse.cpp [7d2928] .. [c7a241]

Switch to unified view


/* This file was copied from omega-0.8.5->1.2.6 and modified */

/* myhtmlparse.cc: subclass of HtmlParser for extracting text
 *
 * ----START-LICENCE----
 * Copyright 1999,2000,2001 BrightStation PLC
...
        dump += text;
    }
    }
}

// Handle an opening HTML tag.
//
// Dispatches on the first character of the tag name, then on the full name.
// Tag attributes are fetched on demand with get_parameter().
// For the tags visible here:
//   - <address>, <xmp>: request a pending space in the text dump.
//   - <body>: empty the dump and set in_body_tag.
//   - <blockquote>, <br>: append a newline to the dump.
//   - <meta>: record date / generic metadata and look for a charset
//     declaration in an http-equiv content-type header.
//
// @param tag  tag name; it is compared against lowercase literals only.
// @return true on every path visible in this excerpt.
//         NOTE(review): presumably true means "continue parsing" for the
//         base class -- confirm against the HtmlParser declaration.
bool
MyHtmlParser::opening_tag(const string &tag)
{
    LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
#if 0
    // NOTE(review): this disabled debug dump still iterates over 'p', which
    // is not a parameter of this function; it would need updating before
    // being re-enabled.
    cout << "TAG: " << tag << ": " << endl;
    map<string, string>::const_iterator x;
    for (x = p.begin(); x != p.end(); x++) {
    cout << "  " << x->first << " -> '" << x->second << "'" << endl;
    }
#endif
    // Nothing to dispatch on for an empty tag name (also protects tag[0]).
    if (tag.empty()) return true;
    switch (tag[0]) {
    case 'a':
        if (tag == "address") pending_space = true;
        break;
    case 'b':
        if (tag == "body") {
      // Discard whatever text was accumulated before <body>.
      dump.resize(0);
        in_body_tag = true;
        break;
        }
        if (tag == "blockquote" || tag == "br") {
        dump += '\n';
...
        pending_space = true;
        }
        break;
    case 'm':
        if (tag == "meta") {
      // Only <meta> tags carrying a "content" attribute are of interest.
      string content;
      if (get_parameter("content", content)) {
          // "name" and "http-equiv" are tested independently: a tag
          // carrying both attributes goes through both branches.
          string name;
          if (get_parameter("name", name)) {
            lowercase_term(name);
            if (name == "date") {
                // Yes this doesn't exist. It's output by filters
                // And the format isn't even standard http/html
                // FIXME

                decode_entities(content);
                // Two layouts are tried: "Y-m-d H:M:S" (surrounding
                // whitespace allowed) and "Y-m-dTH:M:S".
                // NOTE(review): tm is not initialised before strptime();
                // any field strptime() leaves untouched (e.g. tm_isdst)
                // would be indeterminate when read by mktime() -- confirm.
                struct tm tm;
                if (strptime(content.c_str(), 
                     " %Y-%m-%d %H:%M:%S ", &tm) ||
                strptime(content.c_str(), 
                     "%Y-%m-%dT%H:%M:%S", &tm)
                ) {
                // Store the time as a decimal string of the time_t
                // value computed by mktime().
                char ascuxtime[100];
                sprintf(ascuxtime, "%ld", (long)mktime(&tm));
                dmtime = ascuxtime;
                }
            } else if (name == "robots") {
                // Deliberately ignored: robots directives are not stored.
            } else {
                // Any other named meta: append to the existing value for
                // this name, separating repeated entries with a space.
                if (!meta[name].empty())
                meta[name] += ' ';

                decode_entities(content);
                meta[name] += content;
            }
          } 
          string hdr;
          if (get_parameter("http-equiv", hdr)) {
            lowercase_term(hdr);
            if (hdr == "content-type") {

                // Parse the header value to extract a "charset" parameter.
                // NOTE(review): if the tag also had a "name" attribute,
                // 'content' was already entity-decoded in place above.
                MimeHeaderValue p;
                parseMimeHeaderValue(content, p);
                map<string, string>::const_iterator k;
                if ((k = p.params.find("charset")) != 
                p.params.end()) {
                charset = k->second;
                if (!samecharset(charset, fromcharset)) {
...
        break;
    case 'x':
        if (tag == "xmp") pending_space = true;
        break;
    }
    return true;
}

// Handle a closing HTML tag.
//
// Dispatches on the first character of the tag name, then on the full name.
// For the tags visible here, closing <address>, <blockquote>, <br>,
// <center> and <xmp> request a pending space in the text dump.
//
// @param tag  tag name; it is compared against lowercase literals only.
// @return false when </body> is seen, true on every other visible path.
//         NOTE(review): presumably false asks the base parser to stop
//         processing the document -- confirm against the HtmlParser
//         declaration.
bool
MyHtmlParser::closing_tag(const string &tag)
{
    LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
    // Nothing to dispatch on for an empty tag name (also protects tag[0]).
    if (tag.empty()) return true;
    switch (tag[0]) {
    case 'a':
        if (tag == "address") pending_space = true;
        break;
    case 'b':
        if (tag == "body") {
        LOGDEB1(("Myhtmlparse: body close tag found\n"));
        // Leaving the body: clear the flag and report it through the
        // return value.
        in_body_tag = false;
      return false;
        }
        if (tag == "blockquote" || tag == "br") pending_space = true;
        break;
    case 'c':
        if (tag == "center") pending_space = true;
...
        break;
    case 'x':
        if (tag == "xmp") pending_space = true;
        break;
    }
    return true;
}

// This gets called when hitting eof. 
// We used to do: 
//    > If the <body> is open, do

	a/src/internfile/myhtmlparse.cpp		b/src/internfile/myhtmlparse.cpp
1	/* This file was copied from omega-0.8.5 and modified */	1	/* This file was copied from omega-0.8.5->1.2.6 and modified */
2		2
3	/* myhtmlparse.cc: subclass of HtmlParser for extracting text	3	/* myhtmlparse.cc: subclass of HtmlParser for extracting text
4	*	4	*
5	* ----START-LICENCE----	5	* ----START-LICENCE----
6	* Copyright 1999,2000,2001 BrightStation PLC	6	* Copyright 1999,2000,2001 BrightStation PLC
	...		...
285	dump += text;	285	dump += text;
286	}	286	}
287	}	287	}
288	}	288	}
289		289
290	void	290	bool
291	MyHtmlParser::opening_tag(const string &tag, const map<string,string> &p)	291	MyHtmlParser::opening_tag(const string &tag)
292	{	292	{
293	LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));	293	LOGDEB2(("opening_tag: [%s]\n", tag.c_str()));
294	#if 0	294	#if 0
295	cout << "TAG: " << tag << ": " << endl;	295	cout << "TAG: " << tag << ": " << endl;
296	map<string, string>::const_iterator x;	296	map<string, string>::const_iterator x;
297	for (x = p.begin(); x != p.end(); x++) {	297	for (x = p.begin(); x != p.end(); x++) {
298	cout << " " << x->first << " -> '" << x->second << "'" << endl;	298	cout << " " << x->first << " -> '" << x->second << "'" << endl;
299	}	299	}
300	#endif	300	#endif
301	if (tag.empty()) return;	301	if (tag.empty()) return true;
302	switch (tag[0]) {	302	switch (tag[0]) {
303	case 'a':	303	case 'a':
304	if (tag == "address") pending_space = true;	304	if (tag == "address") pending_space = true;
305	break;	305	break;
306	case 'b':	306	case 'b':
307	if (tag == "body") {	307	if (tag == "body") {
308	dump = "";	308	dump.resize(0);
309	in_body_tag = true;	309	in_body_tag = true;
310	break;	310	break;
311	}	311	}
312	if (tag == "blockquote" \|\| tag == "br") {	312	if (tag == "blockquote" \|\| tag == "br") {
313	dump += '\n';	313	dump += '\n';
	...		...
349	pending_space = true;	349	pending_space = true;
350	}	350	}
351	break;	351	break;
352	case 'm':	352	case 'm':
353	if (tag == "meta") {	353	if (tag == "meta") {
354	map<string, string>::const_iterator i, j;	354	string content;
355	if ((i = p.find("content")) != p.end()) {	355	if (get_parameter("content", content)) {
356	if ((j = p.find("name")) != p.end()) {	356	string name;
357	string name = j->second;	357	if (get_parameter("name", name)) {
358	lowercase_term(name);	358	lowercase_term(name);
359	if (name == "date") {	359	if (name == "date") {
360	// Yes this doesnt exist. It's output by filters	360	// Yes this doesnt exist. It's output by filters
361	// And the format isn't even standard http/html	361	// And the format isn't even standard http/html
362	// FIXME	362	// FIXME
363	string tmp = i->second;
364	decode_entities(tmp);	363	decode_entities(content);
365	struct tm tm;	364	struct tm tm;
366	if (strptime(tmp.c_str(),	365	if (strptime(content.c_str(),
367	" %Y-%m-%d %H:%M:%S ", &tm) \|\|	366	" %Y-%m-%d %H:%M:%S ", &tm) \|\|
368	strptime(tmp.c_str(),	367	strptime(content.c_str(),
369	"%Y-%m-%dT%H:%M:%S", &tm)	368	"%Y-%m-%dT%H:%M:%S", &tm)
370	) {	369	) {
371	char ascuxtime[100];	370	char ascuxtime[100];
372	sprintf(ascuxtime, "%ld", (long)mktime(&tm));	371	sprintf(ascuxtime, "%ld", (long)mktime(&tm));
373	dmtime = ascuxtime;	372	dmtime = ascuxtime;
374	}	373	}
375	} else if (name == "robots") {	374	} else if (name == "robots") {
376	} else {	375	} else {
377	if (!meta[name].empty())	376	if (!meta[name].empty())
378	meta[name] += ' ';	377	meta[name] += ' ';
379	string tmp = i->second;
380	decode_entities(tmp);	378	decode_entities(content);
381	meta[name] += tmp;	379	meta[name] += content;
382	}	380	}
383	} else if ((j = p.find("http-equiv")) != p.end()) {	381	}
384	string hequiv = j->second;	382	string hdr;
		383	if (get_parameter("http-equiv", hdr)) {
385	lowercase_term(hequiv);	384	lowercase_term(hdr);
386	if (hequiv == "content-type") {	385	if (hdr == "content-type") {
387	string value = i->second;
388	MimeHeaderValue p;	386	MimeHeaderValue p;
389	parseMimeHeaderValue(value, p);	387	parseMimeHeaderValue(content, p);
390	map<string, string>::const_iterator k;	388	map<string, string>::const_iterator k;
391	if ((k = p.params.find("charset")) !=	389	if ((k = p.params.find("charset")) !=
392	p.params.end()) {	390	p.params.end()) {
393	charset = k->second;	391	charset = k->second;
394	if (!samecharset(charset, fromcharset)) {	392	if (!samecharset(charset, fromcharset)) {
	...		...
443	break;	441	break;
444	case 'x':	442	case 'x':
445	if (tag == "xmp") pending_space = true;	443	if (tag == "xmp") pending_space = true;
446	break;	444	break;
447	}	445	}
		446	return true;
448	}	447	}
449		448
450	void	449	bool
451	MyHtmlParser::closing_tag(const string &tag)	450	MyHtmlParser::closing_tag(const string &tag)
452	{	451	{
453	LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));	452	LOGDEB2(("closing_tag: [%s]\n", tag.c_str()));
454	if (tag.empty()) return;	453	if (tag.empty()) return true;
455	switch (tag[0]) {	454	switch (tag[0]) {
456	case 'a':	455	case 'a':
457	if (tag == "address") pending_space = true;	456	if (tag == "address") pending_space = true;
458	break;	457	break;
459	case 'b':	458	case 'b':
460	if (tag == "body") {	459	if (tag == "body") {
461	LOGDEB1(("Myhtmlparse: body close tag found\n"));	460	LOGDEB1(("Myhtmlparse: body close tag found\n"));
462	in_body_tag = false;	461	in_body_tag = false;
463	throw true;	462	return false;
464	}	463	}
465	if (tag == "blockquote" \|\| tag == "br") pending_space = true;	464	if (tag == "blockquote" \|\| tag == "br") pending_space = true;
466	break;	465	break;
467	case 'c':	466	case 'c':
468	if (tag == "center") pending_space = true;	467	if (tag == "center") pending_space = true;
	...		...
530	break;	529	break;
531	case 'x':	530	case 'x':
532	if (tag == "xmp") pending_space = true;	531	if (tag == "xmp") pending_space = true;
533	break;	532	break;
534	}	533	}
		534	return true;
535	}	535	}
536		536
537	// This gets called when hitting eof.	537	// This gets called when hitting eof.
538	// We used to do:	538	// We used to do:
539	// > If the <body> is open, do	539	// > If the <body> is open, do