recoll / Code / Diff of /src/internfile/htmlparse.cpp

Diff of /src/internfile/htmlparse.cpp [ae8ff5] .. [7ad5f2]

Switch to side-by-side view

--- a/src/internfile/htmlparse.cpp
+++ b/src/internfile/htmlparse.cpp
@@ -23,7 +23,7 @@
  */
 
 #ifndef lint
-static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.3 2005-11-24 07:16:15 dockes Exp $ ";
+static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.4 2005-12-08 08:44:14 dockes Exp $ ";
 #endif
 
 //#include <config.h>
@@ -273,10 +273,11 @@
     string::const_iterator start = body.begin();
 
     while (1) {
-	// Skip through until we find an HTML tag, a comment, or the end of
-	// document.  Ignore isolated occurences of `<' which don't start
-	// a tag or comment
 	string::const_iterator p = start;
+
+	// Eat text until we find an HTML tag, a comment, or the end
+	// of document.  Ignore isolated occurrences of `<' which don't
+	// start a tag or comment
 	while (1) {
 	    p = find(p, body.end(), '<');
 	    if (p == body.end()) break;
@@ -286,15 +287,17 @@
 	    p++; 
 	}
 
-
-	// process text up to start of tag
-	if (p > start) {
+	// Process the text preceding the tag, or the trailing text at end of document
+	if (p > start || p == body.end()) {
 	    string text = body.substr(start - body.begin(), p - start);
 	    decode_entities(text);
 	    process_text(text);
 	}
 
-	if (p == body.end()) break;
+	if (p == body.end()) {
+	    do_eof();
+	    break;
+	}
 
 	start = p + 1;