Switch to unified view

a/src/internfile/htmlparse.cpp b/src/internfile/htmlparse.cpp
1
/* This file was copied/updated from xapian-omega-1.0.1 and modified */
1
/* This file was copied/updated from xapian-omega-1.0.1 to 1.2.6 and modified */
2
2
3
/* htmlparse.cc: simple HTML parser for omega indexer
3
/* htmlparse.cc: simple HTML parser for omega indexer
4
 *
4
 *
5
 * Copyright 1999,2000,2001 BrightStation PLC
5
 * Copyright 1999,2000,2001 BrightStation PLC
6
 * Copyright 2001 Ananova Ltd
6
 * Copyright 2001 Ananova Ltd
7
 * Copyright 2002,2006 Olly Betts
7
 * Copyright 2002,2006,2007,2008,2009,2010,2011 Olly Betts
8
 *
8
 *
9
 * This program is free software; you can redistribute it and/or
9
 * This program is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU General Public License as
10
 * modify it under the terms of the GNU General Public License as
11
 * published by the Free Software Foundation; either version 2 of the
11
 * published by the Free Software Foundation; either version 2 of the
12
 * License, or (at your option) any later version.
12
 * License, or (at your option) any later version.
...
...
28
#include "htmlparse.h"
28
#include "htmlparse.h"
29
#include <stdio.h>
29
#include <stdio.h>
30
#include <ctype.h>
30
#include <ctype.h>
31
#include <cstring>
31
#include <cstring>
32
32
33
inline void
34
lowercase_string(string &str)
35
{
36
    for (string::iterator i = str.begin(); i != str.end(); ++i) {
37
  *i = tolower(static_cast<unsigned char>(*i));
38
    }
39
}
40
33
map<string, unsigned int> HtmlParser::named_ents;
41
map<string, unsigned int> HtmlParser::named_ents;
34
42
35
inline static bool
43
inline static bool
36
p_notdigit(char c)
44
p_notdigit(char c)
37
{
45
{
...
...
71
79
72
inline static bool
80
inline static bool
73
p_whitespaceeqgt(char c)
81
p_whitespaceeqgt(char c)
74
{
82
{
75
    return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
83
    return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
84
}
85
86
bool
87
HtmlParser::get_parameter(const string & param, string & value) const
88
{
89
    map<string, string>::const_iterator i = parameters.find(param);
90
    if (i == parameters.end()) return false;
91
    value = i->second;
92
    return true;
76
}
93
}
77
94
78
HtmlParser::HtmlParser()
95
HtmlParser::HtmlParser()
79
{
96
{
80
    // RECOLL: no need to initialize these entities, we use those from
97
    // RECOLL: no need to initialize these entities, we use those from
...
...
149
void
166
void
150
HtmlParser::parse_html(const string &body)
167
HtmlParser::parse_html(const string &body)
151
{
168
{
152
    in_script = false;
169
    in_script = false;
153
170
154
    map<string,string> Param;
171
    parameters.clear();
155
    string::const_iterator start = body.begin();
172
    string::const_iterator start = body.begin();
156
173
157
    while (true) {
174
    while (true) {
158
    // Skip through until we find an HTML tag, a comment, or the end of
175
    // Skip through until we find an HTML tag, a comment, or the end of
159
    // document.  Ignore isolated occurences of `<' which don't start
176
    // document.  Ignore isolated occurrences of `<' which don't start
160
    // a tag or comment.    
177
    // a tag or comment.    
161
    string::const_iterator p = start;
178
    string::const_iterator p = start;
162
    while (true) {
179
    while (true) {
163
        p = find(p, body.end(), '<');
180
        p = find(p, body.end(), '<');
164
        if (p == body.end()) break;
181
        if (p == body.end()) break;
165
        unsigned char ch = *(p + 1);
182
        unsigned char ch = *(p + 1);
166
183
167
        // Tag, closing tag, or comment (or SGML declaration).
184
        // Tag, closing tag, or comment (or SGML declaration).
168
        if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
185
        if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
186
169
        if (ch == '?') {
187
        if (ch == '?') {
170
        // PHP code or XML declaration.
188
        // PHP code or XML declaration.
171
        // XML declaration is only valid at the start of the first line.
189
        // XML declaration is only valid at the start of the first line.
172
        // FIXME: need to deal with BOMs...
190
        // FIXME: need to deal with BOMs...
173
        if (p != body.begin() || body.size() < 20) break;
191
        if (p != body.begin() || body.size() < 20) break;
...
...
179
197
180
        string::const_iterator decl_end = find(p + 6, body.end(), '?');
198
        string::const_iterator decl_end = find(p + 6, body.end(), '?');
181
        if (decl_end == body.end()) break;
199
        if (decl_end == body.end()) break;
182
200
183
        // Default charset for XML is UTF-8.
201
        // Default charset for XML is UTF-8.
184
        charset = "UTF-8";
202
        charset = "utf-8";
185
203
186
        string decl(p + 6, decl_end);
204
        string decl(p + 6, decl_end);
187
        size_t enc = decl.find("encoding");
205
        size_t enc = decl.find("encoding");
188
        if (enc == string::npos) break;
206
        if (enc == string::npos) break;
189
207
...
...
203
        if (enc != string::npos)
221
        if (enc != string::npos)
204
            charset = decl.substr(enc, enc_end - enc);
222
            charset = decl.substr(enc, enc_end - enc);
205
223
206
        break;
224
        break;
207
        }
225
        }
208
        p++; 
226
        p++;
209
    }
227
    }
210
228
211
    // Process text up to start of tag.
229
    // Process text up to start of tag.
212
    if (p > start || p == body.end()) {
230
    if (p > start || p == body.end()) {
213
        string text = body.substr(start - body.begin(), p - start);
231
        string text = body.substr(start - body.begin(), p - start);
...
...
284
          
302
          
285
        p = start;
303
        p = start;
286
        start = find_if(start, body.end(), p_nottag);
304
        start = find_if(start, body.end(), p_nottag);
287
        string tag = body.substr(p - body.begin(), start - p);
305
        string tag = body.substr(p - body.begin(), start - p);
288
        // convert tagname to lowercase
306
        // convert tagname to lowercase
289
      for (string::iterator i = tag.begin(); i != tag.end(); ++i)
307
      lowercase_string(tag);
290
      *i = tolower(static_cast<unsigned char>(*i));
308
291
         
292
        if (closing) {
309
        if (closing) {
293
        closing_tag(tag);
310
        if (!closing_tag(tag))
311
          return;
294
        if (in_script && tag == "script") in_script = false;
312
        if (in_script && tag == "script") in_script = false;
295
         
313
296
        /* ignore any bogus parameters on closing tags */
314
        /* ignore any bogus parameters on closing tags */
297
        p = find(start, body.end(), '>');
315
        p = find(start, body.end(), '>');
298
        if (p == body.end()) break;
316
        if (p == body.end()) break;
299
        start = p + 1;
317
        start = p + 1;
300
        } else {
318
        } else {
319
      bool empty_element = false;
320
      // FIXME: parse parameters lazily.
301
        while (start < body.end() && *start != '>') {
321
        while (start < body.end() && *start != '>') {
302
            string name, value;
322
            string name, value;
303
323
304
            p = find_if(start, body.end(), p_whitespaceeqgt);
324
            p = find_if(start, body.end(), p_whitespaceeqgt);
305
325
306
          name = body.substr(start - body.begin(), p - start);
326
          size_t name_len = p - start;
327
          if (name_len == 1) {
328
          if (*start == '/' && p < body.end() && *p == '>') {
329
              // E.g. <tag foo="bar" />
330
              start = p;
331
              empty_element = true;
332
              break;
333
          }
307
               
334
            }
335
336
          name.assign(body, start - body.begin(), name_len);
337
308
            p = find_if(p, body.end(), p_notwhitespace);
338
            p = find_if(p, body.end(), p_notwhitespace);
309
            
339
310
            start = p;
340
            start = p;
311
            if (start != body.end() && *start == '=') {
341
            if (start != body.end() && *start == '=') {
312
          int quote;
313
             
314
            start = find_if(start + 1, body.end(), p_notwhitespace);
342
            start = find_if(start + 1, body.end(), p_notwhitespace);
315
343
316
            p = body.end();
344
            p = body.end();
317
             
345
318
            quote = *start;
346
            int quote = *start;
319
            if (quote == '"' || quote == '\'') {
347
            if (quote == '"' || quote == '\'') {
320
                start++;
348
                start++;
321
                p = find(start, body.end(), quote);
349
                p = find(start, body.end(), quote);
322
            }
350
            }
323
             
351
324
            if (p == body.end()) {
352
            if (p == body.end()) {
325
                // unquoted or no closing quote
353
                // unquoted or no closing quote
326
                p = find_if(start, body.end(), p_whitespacegt);
354
                p = find_if(start, body.end(), p_whitespacegt);
327
              
328
              value = body.substr(start - body.begin(), p - start);
329
330
              start = find_if(p, body.end(), p_notwhitespace);
331
          } else {
332
              value = body.substr(start - body.begin(), p - start);
333
            }
355
            }
334
             
356
          value.assign(body, start - body.begin(), p - start);
357
          start = find_if(p, body.end(), p_notwhitespace);
358
335
            if (name.size()) {
359
            if (!name.empty()) {
336
                // convert parameter name to lowercase
360
                // convert parameter name to lowercase
337
              string::iterator i;
361
              lowercase_string(name);
338
              for (i = name.begin(); i != name.end(); ++i)
339
              *i = tolower(static_cast<unsigned char>(*i));
340
                // in case of multiple entries, use the first
362
                // in case of multiple entries, use the first
341
                // (as Netscape does)
363
                // (as Netscape does)
342
              if (Param.find(name) == Param.end())
364
              parameters.insert(make_pair(name, value));
343
              Param[name] = value;
344
            }
365
            }
345
            }
366
            }
346
        }
367
        }
347
      opening_tag(tag, Param);
368
#if 0
369
      cout << "<" << tag;
370
      map<string, string>::const_iterator x;
371
      for (x = parameters.begin(); x != parameters.end(); x++) {
372
          cout << " " << x->first << "=\"" << x->second << "\"";
373
      }
374
      cout << ">\n";
375
#endif
376
      if (!opening_tag(tag))
377
          return;
348
        Param.clear();
378
        parameters.clear();
379
380
      if (empty_element) {
381
          if (!closing_tag(tag))
382
          return;
383
      }
349
384
350
        // In <script> tags we ignore opening tags to avoid problems
385
        // In <script> tags we ignore opening tags to avoid problems
351
        // with "a<b".
386
        // with "a<b".
352
        if (tag == "script") in_script = true;
387
        if (tag == "script") in_script = true;
353
388