Switch to unified view

a/src/internfile/htmlparse.cpp b/src/internfile/htmlparse.cpp
...
...
20
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
21
 * USA
21
 * USA
22
 * -----END-LICENCE-----
22
 * -----END-LICENCE-----
23
 */
23
 */
24
24
25
#ifndef lint
26
static char rcsid[] = "@(#$Id: htmlparse.cpp,v 1.2 2005-01-28 08:50:17 dockes Exp $ ";
27
#endif
28
25
#include <config.h>
29
//#include <config.h>
26
30
27
#include <algorithm>
31
#include <algorithm>
28
using std::find;
32
using std::find;
29
using std::find_if;
33
using std::find_if;
30
#include "htmlparse.h"
34
#include "htmlparse.h"
31
#include <stdio.h>
35
#include <stdio.h>
32
#include <ctype.h>
36
#include <ctype.h>
33
37
38
#include "transcode.h"
39
34
map<string, unsigned int> HtmlParser::named_ents;
40
// Table shared by all HtmlParser objects: maps an HTML entity name to the
// text that decode_entities() substitutes for the entity.
map<string, string> HtmlParser::named_ents;
35
41
36
inline static bool
42
inline static bool
37
p_alpha(char c)
43
p_alpha(char c)
38
{
44
{
39
    return isalpha(c);
45
    return isalpha(c);
...
...
79
p_whitespaceeqgt(char c)
85
p_whitespaceeqgt(char c)
80
{
86
{
81
    return isspace(c) || c == '=' || c == '>';
87
    return isspace(c) || c == '=' || c == '>';
82
}
88
}
83
89
90
/*
 * Table of HTML named character entities, laid out as consecutive
 * (entity name, UTF-8 replacement text) pairs and terminated by a
 * (NULL, NULL) pair.
 *
 * Every replacement text must be non-empty: decode_entities() leaves
 * an entity untouched when its replacement is the empty string, so
 * "amp" has to map to "&" for "&amp;" to be decoded at all.
 *
 * The following array was taken from Estraier. Estraier was
 * written by Mikio Hirabayashi.
 *                Copyright (C) 2003-2004 Mikio Hirabayashi
 * The version this comes from is covered by the GNU licence, like
 * this file.
 */
static const char *epairs[] = {
    /* basic symbols */
    "amp", "&", "lt", "<", "gt", ">", "quot", "\"", "apos", "'",
    /* ISO-8859-1 */
    "nbsp", "\xc2\xa0", "iexcl", "\xc2\xa1", "cent", "\xc2\xa2",
    "pound", "\xc2\xa3", "curren", "\xc2\xa4", "yen", "\xc2\xa5",
    "brvbar", "\xc2\xa6", "sect", "\xc2\xa7", "uml", "\xc2\xa8",
    "copy", "\xc2\xa9", "ordf", "\xc2\xaa", "laquo", "\xc2\xab",
    "not", "\xc2\xac", "shy", "\xc2\xad", "reg", "\xc2\xae",
    "macr", "\xc2\xaf", "deg", "\xc2\xb0", "plusmn", "\xc2\xb1",
    "sup2", "\xc2\xb2", "sup3", "\xc2\xb3", "acute", "\xc2\xb4",
    "micro", "\xc2\xb5", "para", "\xc2\xb6", "middot", "\xc2\xb7",
    "cedil", "\xc2\xb8", "sup1", "\xc2\xb9", "ordm", "\xc2\xba",
    "raquo", "\xc2\xbb", "frac14", "\xc2\xbc", "frac12", "\xc2\xbd",
    "frac34", "\xc2\xbe", "iquest", "\xc2\xbf", "Agrave", "\xc3\x80",
    "Aacute", "\xc3\x81", "Acirc", "\xc3\x82", "Atilde", "\xc3\x83",
    "Auml", "\xc3\x84", "Aring", "\xc3\x85", "AElig", "\xc3\x86",
    "Ccedil", "\xc3\x87", "Egrave", "\xc3\x88", "Eacute", "\xc3\x89",
    "Ecirc", "\xc3\x8a", "Euml", "\xc3\x8b", "Igrave", "\xc3\x8c",
    "Iacute", "\xc3\x8d", "Icirc", "\xc3\x8e", "Iuml", "\xc3\x8f",
    "ETH", "\xc3\x90", "Ntilde", "\xc3\x91", "Ograve", "\xc3\x92",
    "Oacute", "\xc3\x93", "Ocirc", "\xc3\x94", "Otilde", "\xc3\x95",
    "Ouml", "\xc3\x96", "times", "\xc3\x97", "Oslash", "\xc3\x98",
    "Ugrave", "\xc3\x99", "Uacute", "\xc3\x9a", "Ucirc", "\xc3\x9b",
    "Uuml", "\xc3\x9c", "Yacute", "\xc3\x9d", "THORN", "\xc3\x9e",
    "szlig", "\xc3\x9f", "agrave", "\xc3\xa0", "aacute", "\xc3\xa1",
    "acirc", "\xc3\xa2", "atilde", "\xc3\xa3", "auml", "\xc3\xa4",
    "aring", "\xc3\xa5", "aelig", "\xc3\xa6", "ccedil", "\xc3\xa7",
    "egrave", "\xc3\xa8", "eacute", "\xc3\xa9", "ecirc", "\xc3\xaa",
    "euml", "\xc3\xab", "igrave", "\xc3\xac", "iacute", "\xc3\xad",
    "icirc", "\xc3\xae", "iuml", "\xc3\xaf", "eth", "\xc3\xb0",
    "ntilde", "\xc3\xb1", "ograve", "\xc3\xb2", "oacute", "\xc3\xb3",
    "ocirc", "\xc3\xb4", "otilde", "\xc3\xb5", "ouml", "\xc3\xb6",
    "divide", "\xc3\xb7", "oslash", "\xc3\xb8", "ugrave", "\xc3\xb9",
    "uacute", "\xc3\xba", "ucirc", "\xc3\xbb", "uuml", "\xc3\xbc",
    "yacute", "\xc3\xbd", "thorn", "\xc3\xbe", "yuml", "\xc3\xbf",
    /* ISO-10646 */
    "fnof", "\xc6\x92", "Alpha", "\xce\x91", "Beta", "\xce\x92",
    "Gamma", "\xce\x93", "Delta", "\xce\x94", "Epsilon", "\xce\x95",
    "Zeta", "\xce\x96", "Eta", "\xce\x97", "Theta", "\xce\x98",
    "Iota", "\xce\x99", "Kappa", "\xce\x9a", "Lambda", "\xce\x9b",
    "Mu", "\xce\x9c", "Nu", "\xce\x9d", "Xi", "\xce\x9e",
    "Omicron", "\xce\x9f", "Pi", "\xce\xa0", "Rho", "\xce\xa1",
    "Sigma", "\xce\xa3", "Tau", "\xce\xa4", "Upsilon", "\xce\xa5",
    "Phi", "\xce\xa6", "Chi", "\xce\xa7", "Psi", "\xce\xa8",
    "Omega", "\xce\xa9", "alpha", "\xce\xb1", "beta", "\xce\xb2",
    "gamma", "\xce\xb3", "delta", "\xce\xb4", "epsilon", "\xce\xb5",
    "zeta", "\xce\xb6", "eta", "\xce\xb7", "theta", "\xce\xb8",
    "iota", "\xce\xb9", "kappa", "\xce\xba", "lambda", "\xce\xbb",
    "mu", "\xce\xbc", "nu", "\xce\xbd", "xi", "\xce\xbe",
    "omicron", "\xce\xbf", "pi", "\xcf\x80", "rho", "\xcf\x81",
    "sigmaf", "\xcf\x82", "sigma", "\xcf\x83", "tau", "\xcf\x84",
    "upsilon", "\xcf\x85", "phi", "\xcf\x86", "chi", "\xcf\x87",
    "psi", "\xcf\x88", "omega", "\xcf\x89", "thetasym", "\xcf\x91",
    "upsih", "\xcf\x92", "piv", "\xcf\x96", "bull", "\xe2\x80\xa2",
    "hellip", "\xe2\x80\xa6", "prime", "\xe2\x80\xb2", "Prime", "\xe2\x80\xb3",
    "oline", "\xe2\x80\xbe", "frasl", "\xe2\x81\x84", "weierp", "\xe2\x84\x98",
    "image", "\xe2\x84\x91", "real", "\xe2\x84\x9c", "trade", "\xe2\x84\xa2",
    "alefsym", "\xe2\x84\xb5", "larr", "\xe2\x86\x90", "uarr", "\xe2\x86\x91",
    "rarr", "\xe2\x86\x92", "darr", "\xe2\x86\x93", "harr", "\xe2\x86\x94",
    "crarr", "\xe2\x86\xb5", "lArr", "\xe2\x87\x90", "uArr", "\xe2\x87\x91",
    "rArr", "\xe2\x87\x92", "dArr", "\xe2\x87\x93", "hArr", "\xe2\x87\x94",
    "forall", "\xe2\x88\x80", "part", "\xe2\x88\x82", "exist", "\xe2\x88\x83",
    "empty", "\xe2\x88\x85", "nabla", "\xe2\x88\x87", "isin", "\xe2\x88\x88",
    "notin", "\xe2\x88\x89", "ni", "\xe2\x88\x8b", "prod", "\xe2\x88\x8f",
    "sum", "\xe2\x88\x91", "minus", "\xe2\x88\x92", "lowast", "\xe2\x88\x97",
    "radic", "\xe2\x88\x9a", "prop", "\xe2\x88\x9d", "infin", "\xe2\x88\x9e",
    "ang", "\xe2\x88\xa0", "and", "\xe2\x88\xa7", "or", "\xe2\x88\xa8",
    "cap", "\xe2\x88\xa9", "cup", "\xe2\x88\xaa", "int", "\xe2\x88\xab",
    "there4", "\xe2\x88\xb4", "sim", "\xe2\x88\xbc", "cong", "\xe2\x89\x85",
    "asymp", "\xe2\x89\x88", "ne", "\xe2\x89\xa0", "equiv", "\xe2\x89\xa1",
    "le", "\xe2\x89\xa4", "ge", "\xe2\x89\xa5", "sub", "\xe2\x8a\x82",
    "sup", "\xe2\x8a\x83", "nsub", "\xe2\x8a\x84", "sube", "\xe2\x8a\x86",
    "supe", "\xe2\x8a\x87", "oplus", "\xe2\x8a\x95", "otimes", "\xe2\x8a\x97",
    "perp", "\xe2\x8a\xa5", "sdot", "\xe2\x8b\x85", "lceil", "\xe2\x8c\x88",
    "rceil", "\xe2\x8c\x89", "lfloor", "\xe2\x8c\x8a", "rfloor", "\xe2\x8c\x8b",
    "lang", "\xe2\x8c\xa9", "rang", "\xe2\x8c\xaa", "loz", "\xe2\x97\x8a",
    "spades", "\xe2\x99\xa0", "clubs", "\xe2\x99\xa3", "hearts", "\xe2\x99\xa5",
    "diams", "\xe2\x99\xa6", "OElig", "\xc5\x92", "oelig", "\xc5\x93",
    "Scaron", "\xc5\xa0", "scaron", "\xc5\xa1", "Yuml", "\xc5\xb8",
    "circ", "\xcb\x86", "tilde", "\xcb\x9c", "ensp", "\xe2\x80\x82",
    "emsp", "\xe2\x80\x83", "thinsp", "\xe2\x80\x89", "zwnj", "\xe2\x80\x8c",
    "zwj", "\xe2\x80\x8d", "lrm", "\xe2\x80\x8e", "rlm", "\xe2\x80\x8f",
    "ndash", "\xe2\x80\x93", "mdash", "\xe2\x80\x94", "lsquo", "\xe2\x80\x98",
    "rsquo", "\xe2\x80\x99", "sbquo", "\xe2\x80\x9a", "ldquo", "\xe2\x80\x9c",
    "rdquo", "\xe2\x80\x9d", "bdquo", "\xe2\x80\x9e", "dagger", "\xe2\x80\xa0",
    "Dagger", "\xe2\x80\xa1", "permil", "\xe2\x80\xb0", "lsaquo", "\xe2\x80\xb9",
    "rsaquo", "\xe2\x80\xba", "euro", "\xe2\x82\xac",
    NULL, NULL
};
186
84
HtmlParser::HtmlParser()
{
    // named_ents is a static member: once some parser object has filled it,
    // later constructions have nothing left to do.
    if (!named_ents.empty())
        return;

    // epairs holds consecutive (entity name, replacement text) entries.
    // Walk it two slots at a time and stop at the first null slot; the
    // replacement slot is only looked at when the name slot is non-null.
    for (const char **pair = epairs; pair[0] != 0 && pair[1] != 0; pair += 2) {
        named_ents[string(pair[0])] = pair[1];
    }
}
199
203
200
void
204
void
201
HtmlParser::decode_entities(string &s)
205
HtmlParser::decode_entities(string &s)
202
{
206
{
207
    // This has no meaning whatsoever if the character encoding is unknown,
208
    // so don't do it. If charset known, caller has converted text to utf-8, 
209
    // and this is also how we translate entities
210
    //    if (charset != "utf-8")
211
    //        return;
212
203
    // We need a const_iterator version of s.end() - otherwise the
213
    // We need a const_iterator version of s.end() - otherwise the
204
    // find() and find_if() templates don't work...
214
    // find() and find_if() templates don't work...
205
    string::const_iterator amp = s.begin(), s_end = s.end();
215
    string::const_iterator amp = s.begin(), s_end = s.end();
206
    while ((amp = find(amp, s_end, '&')) != s_end) {
216
    while ((amp = find(amp, s_end, '&')) != s_end) {
207
    unsigned int val = 0;
217
    unsigned int val = 0;
208
    string::const_iterator end, p = amp + 1;
218
    string::const_iterator end, p = amp + 1;
219
  string subs;
209
    if (p != s_end && *p == '#') {
220
    if (p != s_end && *p == '#') {
210
        p++;
221
        p++;
211
        if (p != s_end && tolower(*p) == 'x') {
222
        if (p != s_end && tolower(*p) == 'x') {
212
        // hex
223
        // hex
213
        p++;
224
        p++;
...
...
219
        val = atoi(s.substr(p - s.begin(), end - p).c_str());
230
        val = atoi(s.substr(p - s.begin(), end - p).c_str());
220
        }
231
        }
221
    } else {
232
    } else {
222
        end = find_if(p, s_end, p_notalnum);
233
        end = find_if(p, s_end, p_notalnum);
223
        string code = s.substr(p - s.begin(), end - p);
234
        string code = s.substr(p - s.begin(), end - p);
224
        map<string, unsigned int>::const_iterator i;
235
        map<string, string>::const_iterator i;
225
        i = named_ents.find(code);
236
        i = named_ents.find(code);
226
        if (i != named_ents.end()) val = i->second;
237
        if (i != named_ents.end()) 
238
      subs = i->second;
227
    }
239
    }
240
228
    if (end < s_end && *end == ';') end++;
241
    if (end < s_end && *end == ';') 
242
      end++;
243
  
229
    if (val) {
244
    if (val) {
245
      // The code is the code position for a unicode char. We need
246
      // to translate it to an utf-8 string.
247
      string utf16be;
248
      utf16be += char(val / 256);
249
      utf16be += char(val % 256);
250
      transcode(utf16be, subs, "UTF-16BE", "UTF-8");
251
  } 
252
253
  if (subs.length() > 0) {
230
        string::size_type amp_pos = amp - s.begin();
254
        string::size_type amp_pos = amp - s.begin();
231
        s.replace(amp_pos, end - amp, 1u, char(val));
255
        s.replace(amp_pos, end - amp, subs);
232
        s_end = s.end();
256
        s_end = s.end();
233
        // We've modified the string, so the iterators are no longer
257
        // We've modified the string, so the iterators are no longer
234
        // valid...
258
        // valid...
235
        amp = s.begin() + amp_pos + 1;
259
        amp = s.begin() + amp_pos + subs.length();
236
    } else {
260
    } else {
237
        amp = end;
261
        amp = end;
238
    }
262
    }
239
    }
263
    }
240
}
264
}