--- a/src/internfile/htmlparse.cpp
+++ b/src/internfile/htmlparse.cpp
@@ -1,10 +1,10 @@
-/* This file was copied/updated from xapian-omega-1.0.1 and modified */
+/* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */
/* htmlparse.cc: simple HTML parser for omega indexer
*
* Copyright 1999,2000,2001 BrightStation PLC
* Copyright 2001 Ananova Ltd
- * Copyright 2002,2006 Olly Betts
+ * Copyright 2002,2006,2007,2008,2009,2010,2011 Olly Betts
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
@@ -30,6 +30,14 @@
#include <ctype.h>
#include <cstring>
+inline void  // Helper: lowercase every byte of `str` in place using tolower().
+lowercase_string(string &str)
+{
+ for (string::iterator i = str.begin(); i != str.end(); ++i) {
+ *i = tolower(static_cast<unsigned char>(*i)); // cast first: tolower() is undefined for negative char values
+ }
+}
+
map<string, unsigned int> HtmlParser::named_ents;
inline static bool
@@ -73,6 +81,15 @@
p_whitespaceeqgt(char c)
{
return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
+}
+
+bool  // Fetch attribute `param` from the `parameters` map; returns true and sets `value` only if present.
+HtmlParser::get_parameter(const string & param, string & value) const
+{
+ map<string, string>::const_iterator i = parameters.find(param); // exact-match lookup; the tag parser lowercases names before inserting
+ if (i == parameters.end()) return false; // not found: `value` is left unmodified
+ value = i->second;
+ return true;
}
HtmlParser::HtmlParser()
@@ -151,12 +168,12 @@
{
in_script = false;
- map<string,string> Param;
+ parameters.clear();
string::const_iterator start = body.begin();
while (true) {
// Skip through until we find an HTML tag, a comment, or the end of
- // document. Ignore isolated occurences of `<' which don't start
+ // document. Ignore isolated occurrences of `<' which don't start
// a tag or comment.
string::const_iterator p = start;
while (true) {
@@ -166,6 +183,7 @@
// Tag, closing tag, or comment (or SGML declaration).
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
+
if (ch == '?') {
// PHP code or XML declaration.
// XML declaration is only valid at the start of the first line.
@@ -181,7 +199,7 @@
if (decl_end == body.end()) break;
// Default charset for XML is UTF-8.
- charset = "UTF-8";
+ charset = "utf-8";
string decl(p + 6, decl_end);
size_t enc = decl.find("encoding");
@@ -205,7 +223,7 @@
break;
}
- p++;
+ p++;
}
// Process text up to start of tag.
@@ -286,66 +304,83 @@
start = find_if(start, body.end(), p_nottag);
string tag = body.substr(p - body.begin(), start - p);
// convert tagname to lowercase
- for (string::iterator i = tag.begin(); i != tag.end(); ++i)
- *i = tolower(static_cast<unsigned char>(*i));
-
+ lowercase_string(tag);
+
if (closing) {
- closing_tag(tag);
+ if (!closing_tag(tag))
+ return;
if (in_script && tag == "script") in_script = false;
-
+
/* ignore any bogus parameters on closing tags */
p = find(start, body.end(), '>');
if (p == body.end()) break;
start = p + 1;
} else {
+ bool empty_element = false;
+ // FIXME: parse parameters lazily.
while (start < body.end() && *start != '>') {
string name, value;
p = find_if(start, body.end(), p_whitespaceeqgt);
- name = body.substr(start - body.begin(), p - start);
-
+ size_t name_len = p - start;
+ if (name_len == 1) {
+ if (*start == '/' && p < body.end() && *p == '>') {
+ // E.g. <tag foo="bar" />
+ start = p;
+ empty_element = true;
+ break;
+ }
+ }
+
+ name.assign(body, start - body.begin(), name_len);
+
p = find_if(p, body.end(), p_notwhitespace);
-
+
start = p;
if (start != body.end() && *start == '=') {
- int quote;
-
start = find_if(start + 1, body.end(), p_notwhitespace);
p = body.end();
-
- quote = *start;
+
+ int quote = *start;
if (quote == '"' || quote == '\'') {
start++;
p = find(start, body.end(), quote);
}
-
+
if (p == body.end()) {
// unquoted or no closing quote
p = find_if(start, body.end(), p_whitespacegt);
-
- value = body.substr(start - body.begin(), p - start);
-
- start = find_if(p, body.end(), p_notwhitespace);
- } else {
- value = body.substr(start - body.begin(), p - start);
}
-
- if (name.size()) {
+ value.assign(body, start - body.begin(), p - start);
+ start = find_if(p, body.end(), p_notwhitespace);
+
+ if (!name.empty()) {
// convert parameter name to lowercase
- string::iterator i;
- for (i = name.begin(); i != name.end(); ++i)
- *i = tolower(static_cast<unsigned char>(*i));
+ lowercase_string(name);
// in case of multiple entries, use the first
// (as Netscape does)
- if (Param.find(name) == Param.end())
- Param[name] = value;
+ parameters.insert(make_pair(name, value));
}
}
}
- opening_tag(tag, Param);
- Param.clear();
+#if 0
+ cout << "<" << tag;
+ map<string, string>::const_iterator x;
+ for (x = parameters.begin(); x != parameters.end(); x++) {
+ cout << " " << x->first << "=\"" << x->second << "\"";
+ }
+ cout << ">\n";
+#endif
+ if (!opening_tag(tag))
+ return;
+ parameters.clear();
+
+ if (empty_element) {
+ if (!closing_tag(tag))
+ return;
+ }
// In <script> tags we ignore opening tags to avoid problems
// with "a<b".