Switch to unified view

a/src/internfile/htmlparse.cpp b/src/internfile/htmlparse.cpp
1
/* This file was copied from omega-0.8.5 and modified */
1
/* This file was copied/updated from xapian-omega-1.0.1 and modified */
2
2
3
/* htmlparse.cc: simple HTML parser for omega indexer
3
/* htmlparse.cc: simple HTML parser for omega indexer
4
 *
4
 *
5
 * ----START-LICENCE----
6
 * Copyright 1999,2000,2001 BrightStation PLC
5
 * Copyright 1999,2000,2001 BrightStation PLC
7
 * Copyright 2001 Ananova Ltd
6
 * Copyright 2001 Ananova Ltd
8
 * Copyright 2002 Olly Betts
7
 * Copyright 2002,2006 Olly Betts
9
 *
8
 *
10
 * This program is free software; you can redistribute it and/or
9
 * This program is free software; you can redistribute it and/or
11
 * modify it under the terms of the GNU General Public License as
10
 * modify it under the terms of the GNU General Public License as
12
 * published by the Free Software Foundation; either version 2 of the
11
 * published by the Free Software Foundation; either version 2 of the
13
 * License, or (at your option) any later version.
12
 * License, or (at your option) any later version.
...
...
17
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
 * GNU General Public License for more details.
17
 * GNU General Public License for more details.
19
 *
18
 *
20
 * You should have received a copy of the GNU General Public License
19
 * You should have received a copy of the GNU General Public License
21
 * along with this program; if not, write to the Free Software
20
 * along with this program; if not, write to the Free Software
22
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
21
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
23
 * USA
22
 * USA
24
 * -----END-LICENCE-----
25
 */
23
 */
26
24
27
#ifndef lint
25
#ifndef lint
28
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.6 2006-01-30 11:15:27 dockes Exp $ ";
26
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.7 2007-06-19 10:28:40 dockes Exp $ ";
29
#endif
27
#endif
30
28
31
//#include <config.h>
32
33
#include <algorithm>
29
#include <algorithm>
34
#ifndef NO_NAMESPACES
35
using namespace std;
36
//using std::find;
30
using std::find;
37
//using std::find_if;
31
using std::find_if;
38
#endif /* NO_NAMESPACES */
39
#include "htmlparse.h"
32
#include "htmlparse.h"
40
#include <stdio.h>
33
#include <stdio.h>
41
#include <ctype.h>
34
#include <ctype.h>
42
35
43
#include "transcode.h"
44
45
map<string, string> HtmlParser::named_ents;
36
map<string, unsigned int> HtmlParser::named_ents;
46
47
inline static bool
48
p_alpha(char c)
49
{
50
    return isalpha(c);
51
}
52
37
53
inline static bool
38
inline static bool
54
p_notdigit(char c)
39
p_notdigit(char c)
55
{
40
{
56
    return !isdigit(c);
41
    return !isdigit(static_cast<unsigned char>(c));
57
}
42
}
58
43
59
inline static bool
44
inline static bool
60
p_notxdigit(char c)
45
p_notxdigit(char c)
61
{
46
{
62
    return !isxdigit(c);
47
    return !isxdigit(static_cast<unsigned char>(c));
63
}
48
}
64
49
65
inline static bool
50
inline static bool
66
p_notalnum(char c)
51
p_notalnum(char c)
67
{
52
{
68
    return !isalnum(c);
53
    return !isalnum(static_cast<unsigned char>(c));
69
}
54
}
70
55
71
inline static bool
56
inline static bool
72
p_notwhitespace(char c)
57
p_notwhitespace(char c)
73
{
58
{
74
    return !isspace(c);
59
    return !isspace(static_cast<unsigned char>(c));
75
}
60
}
76
61
77
inline static bool
62
inline static bool
78
p_nottag(char c)
63
p_nottag(char c)
79
{
64
{
80
    return !isalnum(c) && c != '.' && c != '-';
65
    return !isalnum(static_cast<unsigned char>(c)) &&
66
  c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
81
}
67
}
82
68
83
inline static bool
69
inline static bool
84
p_whitespacegt(char c)
70
p_whitespacegt(char c)
85
{
71
{
86
    return isspace(c) || c == '>';
72
    return isspace(static_cast<unsigned char>(c)) || c == '>';
87
}
73
}
88
74
89
inline static bool
75
inline static bool
90
p_whitespaceeqgt(char c)
76
p_whitespaceeqgt(char c)
91
{
77
{
92
    return isspace(c) || c == '=' || c == '>';
78
    return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
93
}
94
95
/*
96
 * The following array was taken from Estraier. Estraier was
97
 * written by Mikio Hirabayashi. 
98
 *                Copyright (C) 2003-2004 Mikio Hirabayashi
99
 * The version where this comes from 
100
 * is covered by the GNU licence, as this file.*/
101
static const char *epairs[] = {
102
    /* basic symbols */
103
    "amp", "&", "lt", "<", "gt", ">", "quot", "\"", "apos", "'",
104
    /* ISO-8859-1 */
105
    "nbsp", "\xc2\xa0", "iexcl", "\xc2\xa1", "cent", "\xc2\xa2",
106
    "pound", "\xc2\xa3", "curren", "\xc2\xa4", "yen", "\xc2\xa5",
107
    "brvbar", "\xc2\xa6", "sect", "\xc2\xa7", "uml", "\xc2\xa8",
108
    "copy", "\xc2\xa9", "ordf", "\xc2\xaa", "laquo", "\xc2\xab",
109
    "not", "\xc2\xac", "shy", "\xc2\xad", "reg", "\xc2\xae",
110
    "macr", "\xc2\xaf", "deg", "\xc2\xb0", "plusmn", "\xc2\xb1",
111
    "sup2", "\xc2\xb2", "sup3", "\xc2\xb3", "acute", "\xc2\xb4",
112
    "micro", "\xc2\xb5", "para", "\xc2\xb6", "middot", "\xc2\xb7",
113
    "cedil", "\xc2\xb8", "sup1", "\xc2\xb9", "ordm", "\xc2\xba",
114
    "raquo", "\xc2\xbb", "frac14", "\xc2\xbc", "frac12", "\xc2\xbd",
115
    "frac34", "\xc2\xbe", "iquest", "\xc2\xbf", "Agrave", "\xc3\x80",
116
    "Aacute", "\xc3\x81", "Acirc", "\xc3\x82", "Atilde", "\xc3\x83",
117
    "Auml", "\xc3\x84", "Aring", "\xc3\x85", "AElig", "\xc3\x86",
118
    "Ccedil", "\xc3\x87", "Egrave", "\xc3\x88", "Eacute", "\xc3\x89",
119
    "Ecirc", "\xc3\x8a", "Euml", "\xc3\x8b", "Igrave", "\xc3\x8c",
120
    "Iacute", "\xc3\x8d", "Icirc", "\xc3\x8e", "Iuml", "\xc3\x8f",
121
    "ETH", "\xc3\x90", "Ntilde", "\xc3\x91", "Ograve", "\xc3\x92",
122
    "Oacute", "\xc3\x93", "Ocirc", "\xc3\x94", "Otilde", "\xc3\x95",
123
    "Ouml", "\xc3\x96", "times", "\xc3\x97", "Oslash", "\xc3\x98",
124
    "Ugrave", "\xc3\x99", "Uacute", "\xc3\x9a", "Ucirc", "\xc3\x9b",
125
    "Uuml", "\xc3\x9c", "Yacute", "\xc3\x9d", "THORN", "\xc3\x9e",
126
    "szlig", "\xc3\x9f", "agrave", "\xc3\xa0", "aacute", "\xc3\xa1",
127
    "acirc", "\xc3\xa2", "atilde", "\xc3\xa3", "auml", "\xc3\xa4",
128
    "aring", "\xc3\xa5", "aelig", "\xc3\xa6", "ccedil", "\xc3\xa7",
129
    "egrave", "\xc3\xa8", "eacute", "\xc3\xa9", "ecirc", "\xc3\xaa",
130
    "euml", "\xc3\xab", "igrave", "\xc3\xac", "iacute", "\xc3\xad",
131
    "icirc", "\xc3\xae", "iuml", "\xc3\xaf", "eth", "\xc3\xb0",
132
    "ntilde", "\xc3\xb1", "ograve", "\xc3\xb2", "oacute", "\xc3\xb3",
133
    "ocirc", "\xc3\xb4", "otilde", "\xc3\xb5", "ouml", "\xc3\xb6",
134
    "divide", "\xc3\xb7", "oslash", "\xc3\xb8", "ugrave", "\xc3\xb9",
135
    "uacute", "\xc3\xba", "ucirc", "\xc3\xbb", "uuml", "\xc3\xbc",
136
    "yacute", "\xc3\xbd", "thorn", "\xc3\xbe", "yuml", "\xc3\xbf",
137
    /* ISO-10646 */
138
    "fnof", "\xc6\x92", "Alpha", "\xce\x91", "Beta", "\xce\x92",
139
    "Gamma", "\xce\x93", "Delta", "\xce\x94", "Epsilon", "\xce\x95",
140
    "Zeta", "\xce\x96", "Eta", "\xce\x97", "Theta", "\xce\x98",
141
    "Iota", "\xce\x99", "Kappa", "\xce\x9a", "Lambda", "\xce\x9b",
142
    "Mu", "\xce\x9c", "Nu", "\xce\x9d", "Xi", "\xce\x9e",
143
    "Omicron", "\xce\x9f", "Pi", "\xce\xa0", "Rho", "\xce\xa1",
144
    "Sigma", "\xce\xa3", "Tau", "\xce\xa4", "Upsilon", "\xce\xa5",
145
    "Phi", "\xce\xa6", "Chi", "\xce\xa7", "Psi", "\xce\xa8",
146
    "Omega", "\xce\xa9", "alpha", "\xce\xb1", "beta", "\xce\xb2",
147
    "gamma", "\xce\xb3", "delta", "\xce\xb4", "epsilon", "\xce\xb5",
148
    "zeta", "\xce\xb6", "eta", "\xce\xb7", "theta", "\xce\xb8",
149
    "iota", "\xce\xb9", "kappa", "\xce\xba", "lambda", "\xce\xbb",
150
    "mu", "\xce\xbc", "nu", "\xce\xbd", "xi", "\xce\xbe",
151
    "omicron", "\xce\xbf", "pi", "\xcf\x80", "rho", "\xcf\x81",
152
    "sigmaf", "\xcf\x82", "sigma", "\xcf\x83", "tau", "\xcf\x84",
153
    "upsilon", "\xcf\x85", "phi", "\xcf\x86", "chi", "\xcf\x87",
154
    "psi", "\xcf\x88", "omega", "\xcf\x89", "thetasym", "\xcf\x91",
155
    "upsih", "\xcf\x92", "piv", "\xcf\x96", "bull", "\xe2\x80\xa2",
156
    "hellip", "\xe2\x80\xa6", "prime", "\xe2\x80\xb2", "Prime", "\xe2\x80\xb3",
157
    "oline", "\xe2\x80\xbe", "frasl", "\xe2\x81\x84", "weierp", "\xe2\x84\x98",
158
    "image", "\xe2\x84\x91", "real", "\xe2\x84\x9c", "trade", "\xe2\x84\xa2",
159
    "alefsym", "\xe2\x84\xb5", "larr", "\xe2\x86\x90", "uarr", "\xe2\x86\x91",
160
    "rarr", "\xe2\x86\x92", "darr", "\xe2\x86\x93", "harr", "\xe2\x86\x94",
161
    "crarr", "\xe2\x86\xb5", "lArr", "\xe2\x87\x90", "uArr", "\xe2\x87\x91",
162
    "rArr", "\xe2\x87\x92", "dArr", "\xe2\x87\x93", "hArr", "\xe2\x87\x94",
163
    "forall", "\xe2\x88\x80", "part", "\xe2\x88\x82", "exist", "\xe2\x88\x83",
164
    "empty", "\xe2\x88\x85", "nabla", "\xe2\x88\x87", "isin", "\xe2\x88\x88",
165
    "notin", "\xe2\x88\x89", "ni", "\xe2\x88\x8b", "prod", "\xe2\x88\x8f",
166
    "sum", "\xe2\x88\x91", "minus", "\xe2\x88\x92", "lowast", "\xe2\x88\x97",
167
    "radic", "\xe2\x88\x9a", "prop", "\xe2\x88\x9d", "infin", "\xe2\x88\x9e",
168
    "ang", "\xe2\x88\xa0", "and", "\xe2\x88\xa7", "or", "\xe2\x88\xa8",
169
    "cap", "\xe2\x88\xa9", "cup", "\xe2\x88\xaa", "int", "\xe2\x88\xab",
170
    "there4", "\xe2\x88\xb4", "sim", "\xe2\x88\xbc", "cong", "\xe2\x89\x85",
171
    "asymp", "\xe2\x89\x88", "ne", "\xe2\x89\xa0", "equiv", "\xe2\x89\xa1",
172
    "le", "\xe2\x89\xa4", "ge", "\xe2\x89\xa5", "sub", "\xe2\x8a\x82",
173
    "sup", "\xe2\x8a\x83", "nsub", "\xe2\x8a\x84", "sube", "\xe2\x8a\x86",
174
    "supe", "\xe2\x8a\x87", "oplus", "\xe2\x8a\x95", "otimes", "\xe2\x8a\x97",
175
    "perp", "\xe2\x8a\xa5", "sdot", "\xe2\x8b\x85", "lceil", "\xe2\x8c\x88",
176
    "rceil", "\xe2\x8c\x89", "lfloor", "\xe2\x8c\x8a", "rfloor", "\xe2\x8c\x8b",
177
    "lang", "\xe2\x8c\xa9", "rang", "\xe2\x8c\xaa", "loz", "\xe2\x97\x8a",
178
    "spades", "\xe2\x99\xa0", "clubs", "\xe2\x99\xa3", "hearts", "\xe2\x99\xa5",
179
    "diams", "\xe2\x99\xa6", "OElig", "\xc5\x92", "oelig", "\xc5\x93",
180
    "Scaron", "\xc5\xa0", "scaron", "\xc5\xa1", "Yuml", "\xc5\xb8",
181
    "circ", "\xcb\x86", "tilde", "\xcb\x9c", "ensp", "\xe2\x80\x82",
182
    "emsp", "\xe2\x80\x83", "thinsp", "\xe2\x80\x89", "zwnj", "\xe2\x80\x8c",
183
    "zwj", "\xe2\x80\x8d", "lrm", "\xe2\x80\x8e", "rlm", "\xe2\x80\x8f",
184
    "ndash", "\xe2\x80\x93", "mdash", "\xe2\x80\x94", "lsquo", "\xe2\x80\x98",
185
    "rsquo", "\xe2\x80\x99", "sbquo", "\xe2\x80\x9a", "ldquo", "\xe2\x80\x9c",
186
    "rdquo", "\xe2\x80\x9d", "bdquo", "\xe2\x80\x9e", "dagger", "\xe2\x80\xa0",
187
    "Dagger", "\xe2\x80\xa1", "permil", "\xe2\x80\xb0", "lsaquo", "\xe2\x80\xb9",
188
    "rsaquo", "\xe2\x80\xba", "euro", "\xe2\x82\xac",
189
    NULL, NULL
190
};
79
}
191
80
192
HtmlParser::HtmlParser()
81
HtmlParser::HtmlParser()
193
{
82
{
83
    // RECOLL: no need to initialize these entities, we use those from
84
    // myhtmlparse
85
#if 0
86
    static const struct ent { const char *n; unsigned int v; } ents[] = {
87
#include "namedentities.h"
88
  { NULL, 0 }
89
    };
194
    if (named_ents.empty()) {
90
    if (named_ents.empty()) {
195
  for (int i = 0;;) {
91
  const struct ent *i = ents;
196
      const char *ent;
92
  while (i->n) {
197
      const char *val;
198
      ent = epairs[i++];
199
      if (ent == 0) 
200
      break;
201
      val = epairs[i++];
202
      if (val == 0) 
203
      break;
204
        named_ents[string(ent)] = val;
93
        named_ents[string(i->n)] = i->v;
94
      ++i;
205
    }
95
    }
206
    }
96
    }
97
#endif
207
}
98
}
208
99
209
void
100
void
210
HtmlParser::decode_entities(string &s)
101
HtmlParser::decode_entities(string &s)
211
{
102
{
212
    // This has no meaning whatsoever if the character encoding is unknown,
103
    // Not used for recoll. Kept here to minimize the amount of diffs
213
    // so don't do it. If charset known, caller has converted text to utf-8, 
104
#if 0
214
    // and this is also how we translate entities
215
    //    if (charset != "utf-8")
216
    //        return;
217
218
    // We need a const_iterator version of s.end() - otherwise the
105
    // We need a const_iterator version of s.end() - otherwise the
219
    // find() and find_if() templates don't work...
106
    // find() and find_if() templates don't work...
220
    string::const_iterator amp = s.begin(), s_end = s.end();
107
    string::const_iterator amp = s.begin(), s_end = s.end();
221
    while ((amp = find(amp, s_end, '&')) != s_end) {
108
    while ((amp = find(amp, s_end, '&')) != s_end) {
222
    unsigned int val = 0;
109
    unsigned int val = 0;
223
    string::const_iterator end, p = amp + 1;
110
    string::const_iterator end, p = amp + 1;
224
  string subs;
225
    if (p != s_end && *p == '#') {
111
    if (p != s_end && *p == '#') {
226
        p++;
112
        p++;
227
        if (p != s_end && tolower(*p) == 'x') {
113
        if (p != s_end && (*p == 'x' || *p == 'X')) {
228
        // hex
114
        // hex
229
        p++;
115
        p++;
230
        end = find_if(p, s_end, p_notxdigit);
116
        end = find_if(p, s_end, p_notxdigit);
231
        sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
117
        sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
232
        } else {
118
        } else {
...
...
235
        val = atoi(s.substr(p - s.begin(), end - p).c_str());
121
        val = atoi(s.substr(p - s.begin(), end - p).c_str());
236
        }
122
        }
237
    } else {
123
    } else {
238
        end = find_if(p, s_end, p_notalnum);
124
        end = find_if(p, s_end, p_notalnum);
239
        string code = s.substr(p - s.begin(), end - p);
125
        string code = s.substr(p - s.begin(), end - p);
240
        map<string, string>::const_iterator i;
126
        map<string, unsigned int>::const_iterator i;
241
        i = named_ents.find(code);
127
        i = named_ents.find(code);
242
        if (i != named_ents.end()) 
128
        if (i != named_ents.end()) val = i->second;
243
      subs = i->second;
244
    }
129
    }
245
246
    if (end < s_end && *end == ';') 
130
    if (end < s_end && *end == ';') end++;
247
      end++;
248
  
249
    if (val) {
131
    if (val) {
250
      // The code is the code position for a unicode char. We need
251
      // to translate it to an utf-8 string.
252
      string utf16be;
253
      utf16be += char(val / 256);
254
      utf16be += char(val % 256);
255
      transcode(utf16be, subs, "UTF-16BE", "UTF-8");
256
  } 
257
258
  if (subs.length() > 0) {
259
        string::size_type amp_pos = amp - s.begin();
132
        string::size_type amp_pos = amp - s.begin();
133
      if (val < 0x80) {
134
      s.replace(amp_pos, end - amp, 1u, char(val));
135
      } else {
136
      // Convert unicode value val to UTF-8.
137
      char seq[4];
138
      unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
260
        s.replace(amp_pos, end - amp, subs);
139
      s.replace(amp_pos, end - amp, seq, len);
140
      }
261
        s_end = s.end();
141
        s_end = s.end();
262
        // We've modified the string, so the iterators are no longer
142
        // We've modified the string, so the iterators are no longer
263
        // valid...
143
        // valid...
264
        amp = s.begin() + amp_pos + subs.length();
144
        amp = s.begin() + amp_pos + 1;
265
    } else {
145
    } else {
266
        amp = end;
146
        amp = end;
267
    }
147
    }
268
    }
148
    }
149
#endif
269
}
150
}
270
151
271
void
152
void
272
HtmlParser::parse_html(const string &body)
153
HtmlParser::parse_html(const string &body)
273
{
154
{
155
    in_script = false;
156
274
    map<string,string> Param;
157
    map<string,string> Param;
275
    string::const_iterator start = body.begin();
158
    string::const_iterator start = body.begin();
276
159
277
    while (1) {
160
    while (true) {
161
  // Skip through until we find an HTML tag, a comment, or the end of
162
  // document.  Ignore isolated occurences of `<' which don't start
163
  // a tag or comment.    
278
    string::const_iterator p = start;
164
    string::const_iterator p = start;
279
280
  // Eat text until we find an HTML tag, a comment, or the end
281
  // of document.  Ignore isolated occurences of `<' which don't
282
  // start a tag or comment
283
    while (1) {
165
    while (true) {
284
        p = find(p, body.end(), '<');
166
        p = find(p, body.end(), '<');
285
        if (p == body.end()) break;
167
        if (p == body.end()) break;
286
        char ch = *(p + 1);
168
        unsigned char ch = *(p + 1);
169
287
        // tag, closing tag, comment (or SGML declaration), or PHP
170
        // Tag, closing tag, or comment (or SGML declaration).
288
        if (isalpha(ch) || ch == '/' || ch == '!' || ch == '?') break;
171
        if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
172
      if (ch == '?') {
173
      // PHP code or XML declaration.
174
      // XML declaration is only valid at the start of the first line.
175
      // FIXME: need to deal with BOMs...
176
      if (p != body.begin() || body.size() < 20) break;
177
178
      // XML declaration looks something like this:
179
      // <?xml version="1.0" encoding="UTF-8"?>
180
      if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
181
      if (strchr(" \t\r\n", p[5]) == NULL) break;
182
183
      string::const_iterator decl_end = find(p + 6, body.end(), '?');
184
      if (decl_end == body.end()) break;
185
186
      // Default charset for XML is UTF-8.
187
      charset = "UTF-8";
188
189
      string decl(p + 6, decl_end);
190
      size_t enc = decl.find("encoding");
191
      if (enc == string::npos) break;
192
193
      enc = decl.find_first_not_of(" \t\r\n", enc + 8);
194
      if (enc == string::npos || enc == decl.size()) break;
195
196
      if (decl[enc] != '=') break;
197
      
198
      enc = decl.find_first_not_of(" \t\r\n", enc + 1);
199
      if (enc == string::npos || enc == decl.size()) break;
200
201
      if (decl[enc] != '"' && decl[enc] != '\'') break;
202
203
      char quote = decl[enc++];
204
      size_t enc_end = decl.find(quote, enc);
205
206
      if (enc != string::npos)
207
          charset = decl.substr(enc, enc_end - enc);
208
209
      break;
210
      }
289
        p++; 
211
        p++; 
290
    }
212
    }
291
213
292
  // Process text
214
  // Process text up to start of tag.
293
    if (p > start || p == body.end()) {
215
    if (p > start || p == body.end()) {
294
        string text = body.substr(start - body.begin(), p - start);
216
        string text = body.substr(start - body.begin(), p - start);
295
        decode_entities(text);
217
        decode_entities(text);
296
        process_text(text);
218
        process_text(text);
297
    }
219
    }
...
...
308
    if (*start == '!') {
230
    if (*start == '!') {
309
        if (++start == body.end()) break;
231
        if (++start == body.end()) break;
310
        if (++start == body.end()) break;
232
        if (++start == body.end()) break;
311
        // comment or SGML declaration
233
        // comment or SGML declaration
312
        if (*(start - 1) == '-' && *start == '-') {
234
        if (*(start - 1) == '-' && *start == '-') {
313
      start = find(start + 1, body.end(), '>');
235
      ++start;
236
      string::const_iterator close = find(start, body.end(), '>');
314
        // unterminated comment swallows rest of document
237
        // An unterminated comment swallows rest of document
315
        // (like NS, but unlike MSIE iirc)
238
        // (like Netscape, but unlike MSIE IIRC)
316
        if (start == body.end()) break;
239
        if (close == body.end()) break;
317
      
240
318
      p = start;
241
      p = close;
319
        // look for -->
242
        // look for -->
320
        while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
243
        while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
321
            p = find(p + 1, body.end(), '>');
244
            p = find(p + 1, body.end(), '>');
322
245
246
      if (p != body.end()) {
247
          // Check for htdig's "ignore this bit" comments.
248
          if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
249
          string::size_type i;
250
          i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
251
          if (i == string::npos) break;
252
          start = body.begin() + i + 21;
253
          continue;
254
          }
323
        // If we found --> skip to there, otherwise
255
            // If we found --> skip to there.
256
          start = p;
257
      } else {
324
        // skip to the first > we found (as Netscape does)
258
            // Otherwise skip to the first > we found (as Netscape does).
325
      if (p != body.end()) start = p;
259
          start = close;
260
      }
326
        } else {
261
        } else {
327
        // just an SGML declaration, perhaps giving the DTD - ignore it
262
        // just an SGML declaration, perhaps giving the DTD - ignore it
328
        start = find(start - 1, body.end(), '>');
263
        start = find(start - 1, body.end(), '>');
329
        if (start == body.end()) break;
264
        if (start == body.end()) break;
330
        }
265
        }
...
...
352
          
287
          
353
        p = start;
288
        p = start;
354
        start = find_if(start, body.end(), p_nottag);
289
        start = find_if(start, body.end(), p_nottag);
355
        string tag = body.substr(p - body.begin(), start - p);
290
        string tag = body.substr(p - body.begin(), start - p);
356
        // convert tagname to lowercase
291
        // convert tagname to lowercase
357
        for (string::iterator i = tag.begin(); i != tag.end(); i++)
292
        for (string::iterator i = tag.begin(); i != tag.end(); ++i)
358
      *i = tolower(*i);
293
      *i = tolower(static_cast<unsigned char>(*i));
359
           
294
           
360
        if (closing) {
295
        if (closing) {
361
        closing_tag(tag);
296
        closing_tag(tag);
297
      if (in_script && tag == "script") in_script = false;
362
           
298
           
363
        /* ignore any bogus parameters on closing tags */
299
        /* ignore any bogus parameters on closing tags */
364
        p = find(start, body.end(), '>');
300
        p = find(start, body.end(), '>');
365
        if (p == body.end()) break;
301
        if (p == body.end()) break;
366
        start = p + 1;
302
        start = p + 1;
...
...
400
            }
336
            }
401
               
337
               
402
            if (name.size()) {
338
            if (name.size()) {
403
                // convert parameter name to lowercase
339
                // convert parameter name to lowercase
404
                string::iterator i;
340
                string::iterator i;
405
                for (i = name.begin(); i != name.end(); i++)
341
                for (i = name.begin(); i != name.end(); ++i)
406
              *i = tolower(*i);
342
              *i = tolower(static_cast<unsigned char>(*i));
407
                // in case of multiple entries, use the first
343
                // in case of multiple entries, use the first
408
                // (as Netscape does)
344
                // (as Netscape does)
409
                if (Param.find(name) == Param.end())
345
                if (Param.find(name) == Param.end())
410
                Param[name] = value;
346
                Param[name] = value;
411
            }
347
            }
412
            }
348
            }
413
        }
349
        }
414
        opening_tag(tag, Param);
350
        opening_tag(tag, Param);
415
        Param.clear();
351
        Param.clear();
416
352
353
      // In <script> tags we ignore opening tags to avoid problems
354
      // with "a<b".
355
      if (tag == "script") in_script = true;
356
417
        if (start != body.end() && *start == '>') ++start;
357
        if (start != body.end() && *start == '>') ++start;
418
        }
358
        }
419
    }
359
    }
420
    }
360
    }
421
}
361
}