|
a/src/internfile/mh_html.cpp |
|
b/src/internfile/mh_html.cpp |
|
... |
|
... |
68 |
// If set_doc(fn), take note of file name.
|
68 |
// If set_doc(fn), take note of file name.
|
69 |
string fn = m_filename;
|
69 |
string fn = m_filename;
|
70 |
m_filename.erase();
|
70 |
m_filename.erase();
|
71 |
|
71 |
|
72 |
string charset = m_defcharset;
|
72 |
string charset = m_defcharset;
|
73 |
LOGDEB(("textHtmlToDoc: next_document. defcharset: %s\n",
|
73 |
LOGDEB(("textHtmlToDoc: next_document. defcharset before parsing: [%s]\n",
|
74 |
charset.c_str()));
|
74 |
charset.c_str()));
|
75 |
|
75 |
|
76 |
// - We first try to convert from the default configured charset
|
76 |
// - We first try to convert from the default configured charset
|
77 |
// (which may depend of the current directory) to utf-8. If this
|
77 |
// (which may depend of the current directory) to utf-8. If this
|
78 |
// fails, we keep the original text
|
78 |
// fails, we keep the original text
|
79 |
// - During parsing, if we find a charset parameter, and it differs from
|
79 |
// - During parsing, if we find a charset parameter, and it differs from
|
80 |
// what we started with, we abort and restart with the parameter value
|
80 |
// what we started with, we abort and restart with the parameter value
|
81 |
// instead of the configuration one.
|
81 |
// instead of the configuration one.
|
82 |
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
|
|
|
83 |
|
|
|
84 |
|
82 |
|
85 |
MyHtmlParser result;
|
83 |
MyHtmlParser result;
|
86 |
for (int pass = 0; pass < 2; pass++) {
|
84 |
for (int pass = 0; pass < 2; pass++) {
|
87 |
string transcoded;
|
85 |
string transcoded;
|
88 |
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
86 |
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
89 |
MyHtmlParser p;
|
87 |
MyHtmlParser p;
|
|
|
88 |
|
90 |
// Try transcoding. If it fails, use original text.
|
89 |
// Try transcoding. If it fails, use original text.
|
91 |
int ecnt;
|
90 |
int ecnt;
|
92 |
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
|
91 |
if (!transcode(m_html, transcoded, charset, "UTF-8", &ecnt)) {
|
93 |
LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"
|
92 |
LOGDEB(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8 for"
|
94 |
"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
|
93 |
"[%s]", charset.c_str(), fn.empty()?"unknown":fn.c_str()));
|
95 |
transcoded = m_html;
|
94 |
transcoded = m_html;
|
96 |
// We don't know the charset, at all
|
95 |
// We don't know the charset, at all
|
97 |
p.ocharset = p.charset = charset = "";
|
96 |
p.reset_charsets();
|
|
|
97 |
charset = "";
|
98 |
} else {
|
98 |
} else {
|
99 |
if (ecnt) {
|
99 |
if (ecnt) {
|
100 |
if (pass == 0) {
|
100 |
if (pass == 0) {
|
101 |
LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
|
101 |
LOGDEB(("textHtmlToDoc: init transcode had %d errors for "
|
102 |
"[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
|
102 |
"[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
|
103 |
} else {
|
103 |
} else {
|
104 |
LOGERR(("textHtmlToDoc: final transcode had %d errors for "
|
104 |
LOGERR(("textHtmlToDoc: final transcode had %d errors for "
|
105 |
"[%s]", ecnt, fn.empty()?"unknown":fn.c_str()));
|
105 |
"[%s]\n", ecnt, fn.empty()?"unknown":fn.c_str()));
|
106 |
}
|
106 |
}
|
107 |
}
|
107 |
}
|
108 |
// ocharset has the putative source charset, transcoded is now
|
108 |
// charset has the putative source charset, transcoded is now
|
109 |
// in utf-8
|
109 |
// in utf-8
|
110 |
p.ocharset = charset;
|
110 |
p.set_charsets(charset, "utf-8");
|
111 |
p.charset = "utf-8";
|
|
|
112 |
}
|
111 |
}
|
113 |
|
112 |
|
114 |
try {
|
113 |
try {
|
115 |
p.parse_html(transcoded);
|
114 |
p.parse_html(transcoded);
|
116 |
// No exception: ok?
|
115 |
// No exception: ok?
|
117 |
result = p;
|
116 |
result = p;
|
118 |
break;
|
117 |
break;
|
119 |
} catch (bool diag) {
|
118 |
} catch (bool diag) {
|
120 |
result = p;
|
119 |
result = p;
|
121 |
if (diag == true)
|
120 |
if (diag == true) {
|
|
|
121 |
// Parser throws true at end of text. ok
|
122 |
break;
|
122 |
break;
|
|
|
123 |
}
|
|
|
124 |
|
123 |
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
125 |
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
124 |
charset.c_str(),result.doccharset.c_str()));
|
126 |
charset.c_str(), result.get_charset().c_str()));
|
125 |
if (!result.doccharset.empty() &&
|
127 |
if (!result.get_charset().empty() &&
|
126 |
!samecharset(result.doccharset, result.ocharset)) {
|
128 |
!samecharset(result.get_charset(), result.fromcharset)) {
|
127 |
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
129 |
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
|
|
130 |
// Set the origin charset as specified in document before
|
|
|
131 |
// transcoding again
|
128 |
charset = result.doccharset;
|
132 |
charset = result.get_charset();
|
129 |
} else {
|
133 |
} else {
|
130 |
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
134 |
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
131 |
return false;
|
135 |
return false;
|
132 |
}
|
136 |
}
|
133 |
}
|
137 |
}
|
134 |
}
|
138 |
}
|
135 |
|
139 |
|
136 |
m_metaData["origcharset"] = m_defcharset;
|
140 |
m_metaData["origcharset"] = result.get_charset();
|
137 |
m_metaData["content"] = result.dump;
|
141 |
m_metaData["content"] = result.dump;
|
138 |
m_metaData["charset"] = "utf-8";
|
142 |
m_metaData["charset"] = "utf-8";
|
139 |
// Avoid setting empty values which would crush ones possibly inherited
|
143 |
// Avoid setting empty values which would crush ones possibly inherited
|
140 |
// from parent (if we're an attachment)
|
144 |
// from parent (if we're an attachment)
|
141 |
if (!result.dmtime.empty())
|
145 |
if (!result.dmtime.empty())
|