recoll / Code / Diff of /src/internfile/myhtmlparse.cpp

Diff of /src/internfile/myhtmlparse.cpp [f9835a] .. [e6191b]

Switch to unified view


...
static NamedEntsInitializer namedEntsInitializerInstance;

MyHtmlParser::MyHtmlParser()
    : in_script_tag(false),
      in_style_tag(false),

      in_pre_tag(false),
      pending_space(false),
      indexing_allowed(true)
{
    // The default html document charset is iso-8859-1. We'll update
...
    switch (tag[0]) {
    case 'a':
        if (tag == "address") pending_space = true;
        break;
    case 'b':
      // body: some bad docs have several opening body tags and
      // even text before the body is displayed by Opera and
      // Firefox.  We used to reset the dump each time we saw a
      // body tag, but I can't see any reason to do so.

        if (tag == "blockquote" || tag == "br") {
        dump += '\n';
        pending_space = true;
        }
        break;
...
        if (tag == "address") pending_space = true;
        break;
    case 'b':
        if (tag == "body") {
        LOGDEB1(("Myhtmlparse: body close tag found\n"));
      // We used to signal an end of doc here by returning
      // false but the browsers just ignore body and html
      // closing tags if there is further text, so it seems right
      // to do the same
      break;
        }
        if (tag == "blockquote" || tag == "br") pending_space = true;
        break;
    case 'c':
        if (tag == "center") pending_space = true;
...
// But we don't throw any more. Whatever text we've extracted up to now is
// better than nothing.
void
MyHtmlParser::do_eof()
{
    // Intentionally a no-op: nothing is thrown or signalled from here, so
    // whatever text has been extracted up to this point is kept as is.
    // NOTE(review): presumably called by the base parser when the input
    // ends -- confirm against the caller.
}

	a/src/internfile/myhtmlparse.cpp		b/src/internfile/myhtmlparse.cpp
	...		...
174	static NamedEntsInitializer namedEntsInitializerInstance;	174	static NamedEntsInitializer namedEntsInitializerInstance;
175		175
176	MyHtmlParser::MyHtmlParser()	176	MyHtmlParser::MyHtmlParser()
177	: in_script_tag(false),	177	: in_script_tag(false),
178	in_style_tag(false),	178	in_style_tag(false),
179	in_body_tag(false),
180	in_pre_tag(false),	179	in_pre_tag(false),
181	pending_space(false),	180	pending_space(false),
182	indexing_allowed(true)	181	indexing_allowed(true)
183	{	182	{
184	// The default html document charset is iso-8859-1. We'll update	183	// The default html document charset is iso-8859-1. We'll update
	...		...
306	switch (tag[0]) {	305	switch (tag[0]) {
307	case 'a':	306	case 'a':
308	if (tag == "address") pending_space = true;	307	if (tag == "address") pending_space = true;
309	break;	308	break;
310	case 'b':	309	case 'b':
311	if (tag == "body") {	310	// body: some bad docs have several opening body tags and
312	dump.resize(0);	311	// even text before the body is displayed by Opera and
313	in_body_tag = true;	312	// Firefox. We used to reset the dump each time we saw a
314	break;	313	// body tag, but I can't see any reason to do so.
315	}
316	if (tag == "blockquote" \|\| tag == "br") {	314	if (tag == "blockquote" \|\| tag == "br") {
317	dump += '\n';	315	dump += '\n';
318	pending_space = true;	316	pending_space = true;
319	}	317	}
320	break;	318	break;
	...		...
473	if (tag == "address") pending_space = true;	471	if (tag == "address") pending_space = true;
474	break;	472	break;
475	case 'b':	473	case 'b':
476	if (tag == "body") {	474	if (tag == "body") {
477	LOGDEB1(("Myhtmlparse: body close tag found\n"));	475	LOGDEB1(("Myhtmlparse: body close tag found\n"));
478	in_body_tag = false;	476	// We used to signal an end of doc here by returning
479	return false;	477	// false but the browsers just ignore body and html
		478	// closing tags if there is further text, so it seems right
		479	// to do the same
		480	break;
480	}	481	}
481	if (tag == "blockquote" \|\| tag == "br") pending_space = true;	482	if (tag == "blockquote" \|\| tag == "br") pending_space = true;
482	break;	483	break;
483	case 'c':	484	case 'c':
484	if (tag == "center") pending_space = true;	485	if (tag == "center") pending_space = true;
	...		...
560	// But we don't throw any more. Whatever text we've extracted up to now is	561	// But we don't throw any more. Whatever text we've extracted up to now is
561	// better than nothing.	562	// better than nothing.
562	void	563	void
563	MyHtmlParser::do_eof()	564	MyHtmlParser::do_eof()
564	{	565	{
565	// if (!in_body_tag)
566	// throw(false);
567	}	566	}