Switch to unified view

a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
...
...
68
    // If set_doc(fn), take note of file name.
68
    // If set_doc(fn), take note of file name.
69
    string fn = m_filename;
69
    string fn = m_filename;
70
    m_filename.erase();
70
    m_filename.erase();
71
71
72
    string charset = m_defcharset;
72
    string charset = m_defcharset;
73
    LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n", 
73
    LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n", 
74
        charset.c_str()));
74
        charset.c_str()));
75
75
76
    // - We first try to convert from the default configured charset
76
    // - We first try to convert from the default configured charset
77
    //   (which may depend on the current directory) to utf-8. If this
77
    //   (which may depend on the current directory) to utf-8. If this
78
    //   fails, we keep the original text
78
    //   fails, we keep the original text
79
    // - During parsing, if we find a charset parameter, and it differs from
79
    // - During parsing, if we find a charset parameter, and it differs from
80
    //   what we started with, we abort and restart with the parameter value
80
    //   what we started with, we abort and restart with the parameter value
81
    //   instead of the configuration one.
81
    //   instead of the configuration one.
82
    LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
83
84
82
85
    MyHtmlParser result;
83
    MyHtmlParser result;
86
    for (int pass = 0; pass < 2; pass++) {
84
    for (int pass = 0; pass < 2; pass++) {
87
    string transcoded;
85
    string transcoded;
88
    LOGDEB(("Html::mkDoc: pass %d\n", pass));
86
    LOGDEB(("Html::mkDoc: pass %d\n", pass));
89
    MyHtmlParser p;
87
    MyHtmlParser p;
88
90
    // Try transcoding. If it fails, use original text.
89
    // Try transcoding. If it fails, use original text.
91
    int ecnt;
90
    int ecnt;
92
    if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
91
    if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
93
        LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"
92
        LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"
94
            "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
93
            "[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
95
        transcoded = m_html;
94
        transcoded = m_html;
96
        // We don't know the charset, at all
95
        // We don't know the charset, at all
97
      p.ocharset = p.charset = charset = "";
96
      p.reset_charsets();
97
      charset = "";
98
    } else {
98
    } else {
99
        if (ecnt) {
99
        if (ecnt) {
100
        if (pass == 0) {
100
        if (pass == 0) {
101
            LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
101
            LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
102
                "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
102
                "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
103
        } else {
103
        } else {
104
            LOGERR(("textHtmlToDoc: final transcode had %d errors for "
104
            LOGERR(("textHtmlToDoc: final transcode had %d errors for "
105
                "[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
105
                "[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
106
        }
106
        }
107
        }
107
        }
108
        // ocharset has the putative source charset, transcoded is now
108
        // charset has the putative source charset, transcoded is now
109
        // in utf-8
109
        // in utf-8
110
      p.ocharset = charset;
110
      p.set_charsets(charset, "utf-8");
111
      p.charset = "utf-8";
112
    }
111
    }
113
112
114
    try {
113
    try {
115
        p.parse_html(transcoded);
114
        p.parse_html(transcoded);
116
        // No exception: ok?
115
        // No exception: ok?
117
        result = p;
116
        result = p;
118
        break;
117
        break;
119
    } catch (bool diag) {
118
    } catch (bool diag) {
120
        result = p;
119
        result = p;
121
        if (diag == true)
120
        if (diag == true) {
121
      // Parser throws true at end of text. ok
122
        break;
122
        break;
123
      }
124
123
        LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
125
        LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
124
            charset.c_str(),result.doccharset.c_str()));
126
            charset.c_str(), result.get_charset().c_str()));
125
        if (!result.doccharset.empty() && 
127
        if (!result.get_charset().empty() && 
126
        !samecharset(result.doccharset, result.ocharset)) {
128
        !samecharset(result.get_charset(), result.fromcharset)) {
127
        LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
129
        LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
130
      // Set the origin charset as specified in document before
131
      // transcoding again
128
        charset = result.doccharset;
132
        charset = result.get_charset();
129
        } else {
133
        } else {
130
        LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
134
        LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
131
        return false;
135
        return false;
132
        }
136
        }
133
    }
137
    }
134
    }
138
    }
135
139
136
    m_metaData["origcharset"] = m_defcharset;
140
    m_metaData["origcharset"] = result.get_charset();
137
    m_metaData["content"] = result.dump;
141
    m_metaData["content"] = result.dump;
138
    m_metaData["charset"] = "utf-8";
142
    m_metaData["charset"] = "utf-8";
139
    // Avoid setting empty values which would crush ones possibly inherited
143
    // Avoid setting empty values which would crush ones possibly inherited
140
    // from parent (if we're an attachment)
144
    // from parent (if we're an attachment)
141
    if (!result.dmtime.empty())
145
    if (!result.dmtime.empty())