|
a/src/internfile/myhtmlparse.cpp |
|
b/src/internfile/myhtmlparse.cpp |
|
... |
|
... |
32 |
#include "mimeparse.h"
|
32 |
#include "mimeparse.h"
|
33 |
#include "smallut.h"
|
33 |
#include "smallut.h"
|
34 |
#include "cancelcheck.h"
|
34 |
#include "cancelcheck.h"
|
35 |
#include "debuglog.h"
|
35 |
#include "debuglog.h"
|
36 |
#include "transcode.h"
|
36 |
#include "transcode.h"
|
37 |
|
|
|
38 |
map<string, string> MyHtmlParser::my_named_ents;
|
|
|
39 |
|
37 |
|
40 |
inline static bool
|
38 |
inline static bool
|
41 |
p_notdigit(char c)
|
39 |
p_notdigit(char c)
|
42 |
{
|
40 |
{
|
43 |
return !isdigit(static_cast<unsigned char>(c));
|
41 |
return !isdigit(static_cast<unsigned char>(c));
|
|
... |
|
... |
149 |
"rdquo", "\xe2\x80\x9d", "bdquo", "\xe2\x80\x9e", "dagger", "\xe2\x80\xa0",
|
147 |
"rdquo", "\xe2\x80\x9d", "bdquo", "\xe2\x80\x9e", "dagger", "\xe2\x80\xa0",
|
150 |
"Dagger", "\xe2\x80\xa1", "permil", "\xe2\x80\xb0", "lsaquo", "\xe2\x80\xb9",
|
148 |
"Dagger", "\xe2\x80\xa1", "permil", "\xe2\x80\xb0", "lsaquo", "\xe2\x80\xb9",
|
151 |
"rsaquo", "\xe2\x80\xba", "euro", "\xe2\x82\xac",
|
149 |
"rsaquo", "\xe2\x80\xba", "euro", "\xe2\x82\xac",
|
152 |
NULL, NULL
|
150 |
NULL, NULL
|
153 |
};
|
151 |
};
|
|
|
152 |
map<string, string> my_named_ents;
|
|
|
153 |
class NamedEntsInitializer {
|
|
|
154 |
public:
|
|
|
155 |
NamedEntsInitializer()
|
|
|
156 |
{
|
|
|
157 |
for (int i = 0;;) {
|
|
|
158 |
const char *ent;
|
|
|
159 |
const char *val;
|
|
|
160 |
ent = epairs[i++];
|
|
|
161 |
if (ent == 0)
|
|
|
162 |
break;
|
|
|
163 |
val = epairs[i++];
|
|
|
164 |
if (val == 0)
|
|
|
165 |
break;
|
|
|
166 |
my_named_ents[string(ent)] = val;
|
|
|
167 |
}
|
|
|
168 |
}
|
|
|
169 |
};
|
|
|
170 |
static NamedEntsInitializer namedEntsInitializerInstance;
|
154 |
|
171 |
|
155 |
MyHtmlParser::MyHtmlParser()
|
172 |
MyHtmlParser::MyHtmlParser()
|
156 |
: in_script_tag(false),
|
173 |
: in_script_tag(false),
|
157 |
in_style_tag(false),
|
174 |
in_style_tag(false),
|
158 |
in_body_tag(false),
|
175 |
in_body_tag(false),
|
|
... |
|
... |
161 |
indexing_allowed(true)
|
178 |
indexing_allowed(true)
|
162 |
{
|
179 |
{
|
163 |
// The default html document charset is iso-8859-1. We'll update
|
180 |
// The default html document charset is iso-8859-1. We'll update
|
164 |
// this value from the encoding tag if found.
|
181 |
// this value from the encoding tag if found.
|
165 |
charset = "iso-8859-1";
|
182 |
charset = "iso-8859-1";
|
166 |
|
|
|
167 |
if (my_named_ents.empty()) {
|
|
|
168 |
for (int i = 0;;) {
|
|
|
169 |
const char *ent;
|
|
|
170 |
const char *val;
|
|
|
171 |
ent = epairs[i++];
|
|
|
172 |
if (ent == 0)
|
|
|
173 |
break;
|
|
|
174 |
val = epairs[i++];
|
|
|
175 |
if (val == 0)
|
|
|
176 |
break;
|
|
|
177 |
my_named_ents[string(ent)] = val;
|
|
|
178 |
}
|
|
|
179 |
}
|
|
|
180 |
}
|
183 |
}
|
181 |
|
184 |
|
182 |
void MyHtmlParser::decode_entities(string &s)
|
185 |
void MyHtmlParser::decode_entities(string &s)
|
183 |
{
|
186 |
{
|
184 |
LOGDEB2(("MyHtmlParser::decode_entities\n"));
|
187 |
LOGDEB2(("MyHtmlParser::decode_entities\n"));
|