Switch to unified view

a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
...
...
35
class MyHtmlParser : public HtmlParser {
35
class MyHtmlParser : public HtmlParser {
36
 public:
36
 public:
37
    bool in_script_tag;
37
    bool in_script_tag;
38
    bool in_style_tag;
38
    bool in_style_tag;
39
    string title, sample, keywords, dump;
39
    string title, sample, keywords, dump;
40
    string charset; // This is the charset our user thinks the doc is in
40
    string ocharset; // This is the charset our user thinks the doc was
41
    string charset; // This is the charset it was supposedly converted to
41
    string doccharset; // Set this to value of charset parameter in header
42
    string doccharset; // Set this to value of charset parameter in header
42
    bool indexing_allowed;
43
    bool indexing_allowed;
43
    void process_text(const string &text);
44
    void process_text(const string &text);
44
    void opening_tag(const string &tag, const map<string,string> &p);
45
    void opening_tag(const string &tag, const map<string,string> &p);
45
    void closing_tag(const string &tag);
46
    void closing_tag(const string &tag);
...
...
123
            string value = i->second;
124
            string value = i->second;
124
            MimeHeaderValue p = parseMimeHeaderValue(value);
125
            MimeHeaderValue p = parseMimeHeaderValue(value);
125
            map<string, string>::const_iterator k;
126
            map<string, string>::const_iterator k;
126
            if ((k = p.params.find("charset")) != p.params.end()) {
127
            if ((k = p.params.find("charset")) != p.params.end()) {
127
            doccharset = k->second;
128
            doccharset = k->second;
128
            if (doccharset != charset)
129
            if (doccharset != ocharset) {
130
              LOGDEB1(("Doc specified charset '%s' "
131
                   "differs from announced '%s'\n",
132
                   doccharset.c_str(), ocharset.c_str()));
129
                throw true;
133
                throw true;
134
          }
130
            }
135
            }
131
        }
136
        }
132
        }
137
        }
133
    }
138
    }
139
    } else if (tag == "p" || tag == "br") {
140
  dump += "\n";
134
    } else if (tag == "script") {
141
    } else if (tag == "script") {
135
    in_script_tag = true;
142
    in_script_tag = true;
136
    } else if (tag == "style") {
143
    } else if (tag == "style") {
137
    in_style_tag = true;
144
    in_style_tag = true;
138
    } else if (tag == "body") {
145
    } else if (tag == "body") {
...
...
177
    if (conf->guesscharset) {
184
    if (conf->guesscharset) {
178
    charset = csguess(otext, conf->defcharset);
185
    charset = csguess(otext, conf->defcharset);
179
    } else
186
    } else
180
    charset = conf->defcharset;
187
    charset = conf->defcharset;
181
188
182
    LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", 
189
    LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
183
      charset.c_str()));
184
190
185
    MyHtmlParser pres;
191
    MyHtmlParser pres;
186
    for (int pass = 0; pass < 2; pass++) {
192
    for (int pass = 0; pass < 2; pass++) {
187
    string transcoded;
193
    string transcoded;
188
  LOGDEB(("textHtmlToDoc: transcode from %s to %s\n", 
189
      charset.c_str(), "UTF-8"));
190
194
191
    MyHtmlParser p;
195
    MyHtmlParser p;
192
    // Try transcoding. If it fails, use original text.
196
    // Try transcoding. If it fails, use original text.
193
    if (!transcode(otext, transcoded, charset, "UTF-8")) {
197
    if (!transcode(otext, transcoded, charset, "UTF-8")) {
194
        LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
198
        LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
195
            charset.c_str()));
199
            charset.c_str()));
196
        transcoded = otext;
200
        transcoded = otext;
197
        // We don't know the charset, at all
201
        // We don't know the charset, at all
198
        p.charset = charset = "";
202
        p.ocharset = p.charset = charset = "";
199
    } else {
203
    } else {
200
        // charset has the putative source charset, transcoded is now
204
        // ocharset has the putative source charset, transcoded is now
201
        // in utf-8
205
        // in utf-8
206
      p.ocharset = charset;
202
        p.charset = "utf-8";
207
        p.charset = "utf-8";
203
    }
208
    }
204
209
205
    try {
210
    try {
206
        p.parse_html(transcoded);
211
        p.parse_html(transcoded);
207
    } catch (bool) {
212
    } catch (bool) {
208
        pres = p;
213
        pres = p;
209
      if (!pres.doccharset.empty() && pres.doccharset != charset) {
214
      if (!pres.doccharset.empty() && 
215
      pres.doccharset != pres.ocharset) {
210
        LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
216
        LOGDEB(("textHtmlToDoc: charset '%s' doc charset '%s',"
211
          "reparse\n", charset.c_str(), 
217
          "reparse\n", charset.c_str(),pres.doccharset.c_str()));
212
          pres.doccharset.c_str()));
213
        charset = pres.doccharset;
218
        charset = pres.doccharset;
214
        } else
219
        } else
215
        break;
220
        break;
216
    }
221
    }
217
    }
222
    }
218
223
219
    Rcl::Doc out;
224
    Rcl::Doc out;
220
    out.origcharset = charset;
225
    out.origcharset = charset;
221
    out.text = pres.dump;
226
    out.text = pres.dump;
227
    //    LOGDEB(("textHtmlToDoc: dump : %s\n", pres.dump.c_str()));
222
    out.title = pres.title;
228
    out.title = pres.title;
223
    out.keywords = pres.keywords;
229
    out.keywords = pres.keywords;
224
    out.abstract = pres.sample;
230
    out.abstract = pres.sample;
225
    docout = out;
231
    docout = out;
226
    return true;
232
    return true;