recoll / Code / Diff of /src/internfile/htmlparse.cpp

Diff of /src/internfile/htmlparse.cpp [d2b54d] .. [012254]

Switch to unified view


...
 * USA
 * -----END-LICENCE-----
 */

#ifndef lint
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.4 2005-12-08 08:44:14 dockes Exp $ ";
#endif

//#include <config.h>

#include <algorithm>
...
{
    map<string,string> Param;
    string::const_iterator start = body.begin();

    while (1) {



    string::const_iterator p = start;

  // Eat text until we find an HTML tag, a comment, or the end
  // of document.  Ignore isolated occurrences of `<' which don't
  // start a tag or comment
    while (1) {
        p = find(p, body.end(), '<');
        if (p == body.end()) break;
        char ch = *(p + 1);
        // tag, closing tag, comment (or SGML declaration), or PHP
        if (isalpha(ch) || ch == '/' || ch == '!' || ch == '?') break;
        p++; 
    }

  // Process text
  if (p > start || p == body.end()) {

        string text = body.substr(start - body.begin(), p - start);
        decode_entities(text);
        process_text(text);
    }

    if (p == body.end()) {
      do_eof();
      break;
  }

    start = p + 1;
   
    if (start == body.end()) break;


	a/src/internfile/htmlparse.cpp		b/src/internfile/htmlparse.cpp
	...		...
21	* USA	21	* USA
22	* -----END-LICENCE-----	22	* -----END-LICENCE-----
23	*/	23	*/
24		24
25	#ifndef lint	25	#ifndef lint
26	static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.3 2005-11-24 07:16:15 dockes Exp $ ";	26	static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.4 2005-12-08 08:44:14 dockes Exp $ ";
27	#endif	27	#endif
28		28
29	//#include <config.h>	29	//#include <config.h>
30		30
31	#include <algorithm>	31	#include <algorithm>
	...		...
271	{	271	{
272	map<string,string> Param;	272	map<string,string> Param;
273	string::const_iterator start = body.begin();	273	string::const_iterator start = body.begin();
274		274
275	while (1) {	275	while (1) {
276	// Skip through until we find an HTML tag, a comment, or the end of
277	// document. Ignore isolated occurences of `<' which don't start
278	// a tag or comment
279	string::const_iterator p = start;	276	string::const_iterator p = start;
		277
		278	// Eat text until we find an HTML tag, a comment, or the end
		279	// of document. Ignore isolated occurences of `<' which don't
		280	// start a tag or comment
280	while (1) {	281	while (1) {
281	p = find(p, body.end(), '<');	282	p = find(p, body.end(), '<');
282	if (p == body.end()) break;	283	if (p == body.end()) break;
283	char ch = *(p + 1);	284	char ch = *(p + 1);
284	// tag, closing tag, comment (or SGML declaration), or PHP	285	// tag, closing tag, comment (or SGML declaration), or PHP
285	if (isalpha(ch) \|\| ch == '/' \|\| ch == '!' \|\| ch == '?') break;	286	if (isalpha(ch) \|\| ch == '/' \|\| ch == '!' \|\| ch == '?') break;
286	p++;	287	p++;
287	}	288	}
288		289
289		290	// Process text
290	// process text up to start of tag	291	if (p > start \|\| p == body.end()) {
291	if (p > start) {
292	string text = body.substr(start - body.begin(), p - start);	292	string text = body.substr(start - body.begin(), p - start);
293	decode_entities(text);	293	decode_entities(text);
294	process_text(text);	294	process_text(text);
295	}	295	}
296		296
297	if (p == body.end()) break;	297	if (p == body.end()) {
		298	do_eof();
		299	break;
		300	}
298		301
299	start = p + 1;	302	start = p + 1;
300		303
301	if (start == body.end()) break;	304	if (start == body.end()) break;
302		305