recoll / Code / Diff of /src/internfile/myhtmlparse.cpp

Diff of /src/internfile/myhtmlparse.cpp [3872f8] .. [4141db]

Switch to unified view


...
                     " %Y-%m-%d %H:%M:%S ", &tm)) {
                char ascuxtime[100];
                sprintf(ascuxtime, "%ld", (long)mktime(&tm));
                dmtime = ascuxtime;
                }
          } 
#if 0 // We're not a robot, so we don't care about robots metainfo
            else if (name == "robots") {
                string val = i->second;
                decode_entities(val);
                lowercase_term(val);
                if (val.find("none") != string::npos ||
                val.find("noindex") != string::npos) {
                indexing_allowed = false;
                LOGDEB1(("myhtmlparse: robots/noindex\n"));
                throw false;
                }
            }
#endif // 0
            } else if ((j = p.find("http-equiv")) != p.end()) {
            string hequiv = j->second;
            lowercase_term(hequiv);
            if (hequiv == "content-type") {
                string value = i->second;
...
        if (tag == "xmp") pending_space = true;
        break;
    }
}

// End-of-input hook: this gets called when hitting eof.
//
// Historical behavior, kept here for reference. We used to do:
//    > If the <body> is open, do
//    > something with the text (that is, don't throw up). Else, things are
//    > too weird, throw an error. We don't get called if the parser finds
//    > a closing body tag (exception gets thrown by closing_tag())
//
// Current behavior: we don't throw any more. Whatever text we've extracted
// up to now is better than nothing, so this function is intentionally a
// no-op.
void
MyHtmlParser::do_eof()
{
    // Former check, deliberately disabled (see the comment above):
    //    if (!in_body_tag)
    //        throw(false);
}

	a/src/internfile/myhtmlparse.cpp		b/src/internfile/myhtmlparse.cpp
	...		...
165	" %Y-%m-%d %H:%M:%S ", &tm)) {	165	" %Y-%m-%d %H:%M:%S ", &tm)) {
166	char ascuxtime[100];	166	char ascuxtime[100];
167	sprintf(ascuxtime, "%ld", (long)mktime(&tm));	167	sprintf(ascuxtime, "%ld", (long)mktime(&tm));
168	dmtime = ascuxtime;	168	dmtime = ascuxtime;
169	}	169	}
		170	}
		171	#if 0 // We're not a robot, so we don't care about robots metainfo
170	} else if (name == "robots") {	172	else if (name == "robots") {
171	string val = i->second;	173	string val = i->second;
172	decode_entities(val);	174	decode_entities(val);
173	lowercase_term(val);	175	lowercase_term(val);
174	if (val.find("none") != string::npos \|\|	176	if (val.find("none") != string::npos \|\|
175	val.find("noindex") != string::npos) {	177	val.find("noindex") != string::npos) {
176	indexing_allowed = false;	178	indexing_allowed = false;
177	LOGDEB1(("myhtmlparse: robots/noindex\n"));	179	LOGDEB1(("myhtmlparse: robots/noindex\n"));
178	throw false;	180	throw false;
179	}	181	}
180	}	182	}
		183	#endif // 0
181	} else if ((j = p.find("http-equiv")) != p.end()) {	184	} else if ((j = p.find("http-equiv")) != p.end()) {
182	string hequiv = j->second;	185	string hequiv = j->second;
183	lowercase_term(hequiv);	186	lowercase_term(hequiv);
184	if (hequiv == "content-type") {	187	if (hequiv == "content-type") {
185	string value = i->second;	188	string value = i->second;
	...		...
330	if (tag == "xmp") pending_space = true;	333	if (tag == "xmp") pending_space = true;
331	break;	334	break;
332	}	335	}
333	}	336	}
334		337
335	// This gets called when hitting eof. If the <body> is open, do	338	// This gets called when hitting eof.
		339	// We used to do:
		340	// > If the <body> is open, do
336	// something with the text (that is, don't throw up). Else, things are	341	// > something with the text (that is, don't throw up). Else, things are
337	// too weird, throw an error. We don't get called if the parser finds	342	// > too weird, throw an error. We don't get called if the parser finds
338	// a closing body tag (exception gets thrown by closing_tag())	343	// > a closing body tag (exception gets thrown by closing_tag())
		344	// But we don't throw any more. Whatever text we've extracted up to now is
		345	// better than nothing.
339	void	346	void
340	MyHtmlParser::do_eof()	347	MyHtmlParser::do_eof()
341	{	348	{
342	if (!in_body_tag)	349	// if (!in_body_tag)
343	throw(false);	350	// throw(false);
344	}	351	}