|
a/src/internfile/htmlparse.cpp |
|
b/src/internfile/htmlparse.cpp |
1 |
/* This file was copied/updated from xapian-omega-1.0.1 and modified */
|
1 |
/* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */
|
2 |
|
2 |
|
3 |
/* htmlparse.cc: simple HTML parser for omega indexer
|
3 |
/* htmlparse.cc: simple HTML parser for omega indexer
|
4 |
*
|
4 |
*
|
5 |
* Copyright 1999,2000,2001 BrightStation PLC
|
5 |
* Copyright 1999,2000,2001 BrightStation PLC
|
6 |
* Copyright 2001 Ananova Ltd
|
6 |
* Copyright 2001 Ananova Ltd
|
7 |
* Copyright 2002,2006 Olly Betts
|
7 |
* Copyright 2002,2006,2007,2008,2009,2010,2011 Olly Betts
|
8 |
*
|
8 |
*
|
9 |
* This program is free software; you can redistribute it and/or
|
9 |
* This program is free software; you can redistribute it and/or
|
10 |
* modify it under the terms of the GNU General Public License as
|
10 |
* modify it under the terms of the GNU General Public License as
|
11 |
* published by the Free Software Foundation; either version 2 of the
|
11 |
* published by the Free Software Foundation; either version 2 of the
|
12 |
* License, or (at your option) any later version.
|
12 |
* License, or (at your option) any later version.
|
|
... |
|
... |
28 |
#include "htmlparse.h"
|
28 |
#include "htmlparse.h"
|
29 |
#include <stdio.h>
|
29 |
#include <stdio.h>
|
30 |
#include <ctype.h>
|
30 |
#include <ctype.h>
|
31 |
#include <cstring>
|
31 |
#include <cstring>
|
32 |
|
32 |
|
|
|
33 |
/** Convert every character of @a str to lower case, in place.
 *
 *  Each character is cast to unsigned char before being handed to tolower():
 *  passing a negative char value (any byte with the high bit set on platforms
 *  where char is signed) to the <ctype.h> functions is undefined behaviour.
 *
 *  @param str  String to modify; an empty string is left untouched.
 */
inline void
lowercase_string(std::string &str)
{
    for (std::string::iterator i = str.begin(); i != str.end(); ++i) {
	// tolower() returns int; narrow back to char explicitly.
	*i = static_cast<char>(tolower(static_cast<unsigned char>(*i)));
    }
}
|
|
|
40 |
|
33 |
map<string, unsigned int> HtmlParser::named_ents;
|
41 |
map<string, unsigned int> HtmlParser::named_ents;
|
34 |
|
42 |
|
35 |
inline static bool
|
43 |
inline static bool
|
36 |
p_notdigit(char c)
|
44 |
p_notdigit(char c)
|
37 |
{
|
45 |
{
|
|
... |
|
... |
71 |
|
79 |
|
72 |
/** Predicate for find_if(): does @a c terminate an attribute name?
 *
 *  parse_html() scans forward with this predicate to find the end of a
 *  parameter name inside a tag.  The name stops at whitespace, at '='
 *  (the value follows) or at '>' (the tag ends).
 *
 *  The cast to unsigned char keeps isspace() well defined for bytes with the
 *  high bit set.
 */
inline static bool
p_whitespaceeqgt(char c)
{
    return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
}
|
|
|
85 |
|
|
|
86 |
bool
|
|
|
87 |
HtmlParser::get_parameter(const string & param, string & value) const
|
|
|
88 |
{
|
|
|
89 |
map<string, string>::const_iterator i = parameters.find(param);
|
|
|
90 |
if (i == parameters.end()) return false;
|
|
|
91 |
value = i->second;
|
|
|
92 |
return true;
|
76 |
}
|
93 |
}
|
77 |
|
94 |
|
78 |
HtmlParser::HtmlParser()
|
95 |
HtmlParser::HtmlParser()
|
79 |
{
|
96 |
{
|
80 |
// RECOLL: no need to initialize these entities, we use those from
|
97 |
// RECOLL: no need to initialize these entities, we use those from
|
|
... |
|
... |
149 |
void
|
166 |
void
|
150 |
HtmlParser::parse_html(const string &body)
|
167 |
HtmlParser::parse_html(const string &body)
|
151 |
{
|
168 |
{
|
152 |
in_script = false;
|
169 |
in_script = false;
|
153 |
|
170 |
|
154 |
map<string,string> Param;
|
171 |
parameters.clear();
|
155 |
string::const_iterator start = body.begin();
|
172 |
string::const_iterator start = body.begin();
|
156 |
|
173 |
|
157 |
while (true) {
|
174 |
while (true) {
|
158 |
// Skip through until we find an HTML tag, a comment, or the end of
|
175 |
// Skip through until we find an HTML tag, a comment, or the end of
|
159 |
// document. Ignore isolated occurences of `<' which don't start
|
176 |
// document. Ignore isolated occurrences of `<' which don't start
|
160 |
// a tag or comment.
|
177 |
// a tag or comment.
|
161 |
string::const_iterator p = start;
|
178 |
string::const_iterator p = start;
|
162 |
while (true) {
|
179 |
while (true) {
|
163 |
p = find(p, body.end(), '<');
|
180 |
p = find(p, body.end(), '<');
|
164 |
if (p == body.end()) break;
|
181 |
if (p == body.end()) break;
|
165 |
unsigned char ch = *(p + 1);
|
182 |
unsigned char ch = *(p + 1);
|
166 |
|
183 |
|
167 |
// Tag, closing tag, or comment (or SGML declaration).
|
184 |
// Tag, closing tag, or comment (or SGML declaration).
|
168 |
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
185 |
if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
|
|
|
186 |
|
169 |
if (ch == '?') {
|
187 |
if (ch == '?') {
|
170 |
// PHP code or XML declaration.
|
188 |
// PHP code or XML declaration.
|
171 |
// XML declaration is only valid at the start of the first line.
|
189 |
// XML declaration is only valid at the start of the first line.
|
172 |
// FIXME: need to deal with BOMs...
|
190 |
// FIXME: need to deal with BOMs...
|
173 |
if (p != body.begin() || body.size() < 20) break;
|
191 |
if (p != body.begin() || body.size() < 20) break;
|
|
... |
|
... |
179 |
|
197 |
|
180 |
string::const_iterator decl_end = find(p + 6, body.end(), '?');
|
198 |
string::const_iterator decl_end = find(p + 6, body.end(), '?');
|
181 |
if (decl_end == body.end()) break;
|
199 |
if (decl_end == body.end()) break;
|
182 |
|
200 |
|
183 |
// Default charset for XML is UTF-8.
|
201 |
// Default charset for XML is UTF-8.
|
184 |
charset = "UTF-8";
|
202 |
charset = "utf-8";
|
185 |
|
203 |
|
186 |
string decl(p + 6, decl_end);
|
204 |
string decl(p + 6, decl_end);
|
187 |
size_t enc = decl.find("encoding");
|
205 |
size_t enc = decl.find("encoding");
|
188 |
if (enc == string::npos) break;
|
206 |
if (enc == string::npos) break;
|
189 |
|
207 |
|
|
... |
|
... |
203 |
if (enc != string::npos)
|
221 |
if (enc != string::npos)
|
204 |
charset = decl.substr(enc, enc_end - enc);
|
222 |
charset = decl.substr(enc, enc_end - enc);
|
205 |
|
223 |
|
206 |
break;
|
224 |
break;
|
207 |
}
|
225 |
}
|
208 |
p++;
|
226 |
p++;
|
209 |
}
|
227 |
}
|
210 |
|
228 |
|
211 |
// Process text up to start of tag.
|
229 |
// Process text up to start of tag.
|
212 |
if (p > start || p == body.end()) {
|
230 |
if (p > start || p == body.end()) {
|
213 |
string text = body.substr(start - body.begin(), p - start);
|
231 |
string text = body.substr(start - body.begin(), p - start);
|
|
... |
|
... |
284 |
|
302 |
|
285 |
p = start;
|
303 |
p = start;
|
286 |
start = find_if(start, body.end(), p_nottag);
|
304 |
start = find_if(start, body.end(), p_nottag);
|
287 |
string tag = body.substr(p - body.begin(), start - p);
|
305 |
string tag = body.substr(p - body.begin(), start - p);
|
288 |
// convert tagname to lowercase
|
306 |
// convert tagname to lowercase
|
289 |
for (string::iterator i = tag.begin(); i != tag.end(); ++i)
|
307 |
lowercase_string(tag);
|
290 |
*i = tolower(static_cast<unsigned char>(*i));
|
308 |
|
291 |
|
|
|
292 |
if (closing) {
|
309 |
if (closing) {
|
293 |
closing_tag(tag);
|
310 |
if (!closing_tag(tag))
|
|
|
311 |
return;
|
294 |
if (in_script && tag == "script") in_script = false;
|
312 |
if (in_script && tag == "script") in_script = false;
|
295 |
|
313 |
|
296 |
/* ignore any bogus parameters on closing tags */
|
314 |
/* ignore any bogus parameters on closing tags */
|
297 |
p = find(start, body.end(), '>');
|
315 |
p = find(start, body.end(), '>');
|
298 |
if (p == body.end()) break;
|
316 |
if (p == body.end()) break;
|
299 |
start = p + 1;
|
317 |
start = p + 1;
|
300 |
} else {
|
318 |
} else {
|
|
|
319 |
bool empty_element = false;
|
|
|
320 |
// FIXME: parse parameters lazily.
|
301 |
while (start < body.end() && *start != '>') {
|
321 |
while (start < body.end() && *start != '>') {
|
302 |
string name, value;
|
322 |
string name, value;
|
303 |
|
323 |
|
304 |
p = find_if(start, body.end(), p_whitespaceeqgt);
|
324 |
p = find_if(start, body.end(), p_whitespaceeqgt);
|
305 |
|
325 |
|
306 |
name = body.substr(start - body.begin(), p - start);
|
326 |
size_t name_len = p - start;
|
|
|
327 |
if (name_len == 1) {
|
|
|
328 |
if (*start == '/' && p < body.end() && *p == '>') {
|
|
|
329 |
// E.g. <tag foo="bar" />
|
|
|
330 |
start = p;
|
|
|
331 |
empty_element = true;
|
|
|
332 |
break;
|
|
|
333 |
}
|
307 |
|
334 |
}
|
|
|
335 |
|
|
|
336 |
name.assign(body, start - body.begin(), name_len);
|
|
|
337 |
|
308 |
p = find_if(p, body.end(), p_notwhitespace);
|
338 |
p = find_if(p, body.end(), p_notwhitespace);
|
309 |
|
339 |
|
310 |
start = p;
|
340 |
start = p;
|
311 |
if (start != body.end() && *start == '=') {
|
341 |
if (start != body.end() && *start == '=') {
|
312 |
int quote;
|
|
|
313 |
|
|
|
314 |
start = find_if(start + 1, body.end(), p_notwhitespace);
|
342 |
start = find_if(start + 1, body.end(), p_notwhitespace);
|
315 |
|
343 |
|
316 |
p = body.end();
|
344 |
p = body.end();
|
317 |
|
345 |
|
318 |
quote = *start;
|
346 |
int quote = *start;
|
319 |
if (quote == '"' || quote == '\'') {
|
347 |
if (quote == '"' || quote == '\'') {
|
320 |
start++;
|
348 |
start++;
|
321 |
p = find(start, body.end(), quote);
|
349 |
p = find(start, body.end(), quote);
|
322 |
}
|
350 |
}
|
323 |
|
351 |
|
324 |
if (p == body.end()) {
|
352 |
if (p == body.end()) {
|
325 |
// unquoted or no closing quote
|
353 |
// unquoted or no closing quote
|
326 |
p = find_if(start, body.end(), p_whitespacegt);
|
354 |
p = find_if(start, body.end(), p_whitespacegt);
|
327 |
|
|
|
328 |
value = body.substr(start - body.begin(), p - start);
|
|
|
329 |
|
|
|
330 |
start = find_if(p, body.end(), p_notwhitespace);
|
|
|
331 |
} else {
|
|
|
332 |
value = body.substr(start - body.begin(), p - start);
|
|
|
333 |
}
|
355 |
}
|
334 |
|
356 |
value.assign(body, start - body.begin(), p - start);
|
|
|
357 |
start = find_if(p, body.end(), p_notwhitespace);
|
|
|
358 |
|
335 |
if (name.size()) {
|
359 |
if (!name.empty()) {
|
336 |
// convert parameter name to lowercase
|
360 |
// convert parameter name to lowercase
|
337 |
string::iterator i;
|
361 |
lowercase_string(name);
|
338 |
for (i = name.begin(); i != name.end(); ++i)
|
|
|
339 |
*i = tolower(static_cast<unsigned char>(*i));
|
|
|
340 |
// in case of multiple entries, use the first
|
362 |
// in case of multiple entries, use the first
|
341 |
// (as Netscape does)
|
363 |
// (as Netscape does)
|
342 |
if (Param.find(name) == Param.end())
|
364 |
parameters.insert(make_pair(name, value));
|
343 |
Param[name] = value;
|
|
|
344 |
}
|
365 |
}
|
345 |
}
|
366 |
}
|
346 |
}
|
367 |
}
|
347 |
opening_tag(tag, Param);
|
368 |
#if 0
|
|
|
369 |
cout << "<" << tag;
|
|
|
370 |
map<string, string>::const_iterator x;
|
|
|
371 |
for (x = parameters.begin(); x != parameters.end(); x++) {
|
|
|
372 |
cout << " " << x->first << "=\"" << x->second << "\"";
|
|
|
373 |
}
|
|
|
374 |
cout << ">\n";
|
|
|
375 |
#endif
|
|
|
376 |
if (!opening_tag(tag))
|
|
|
377 |
return;
|
348 |
Param.clear();
|
378 |
parameters.clear();
|
|
|
379 |
|
|
|
380 |
if (empty_element) {
|
|
|
381 |
if (!closing_tag(tag))
|
|
|
382 |
return;
|
|
|
383 |
}
|
349 |
|
384 |
|
350 |
// In <script> tags we ignore opening tags to avoid problems
|
385 |
// In <script> tags we ignore opening tags to avoid problems
|
351 |
// with "a<b".
|
386 |
// with "a<b".
|
352 |
if (tag == "script") in_script = true;
|
387 |
if (tag == "script") in_script = true;
|
353 |
|
388 |
|