|
a/src/internfile/mh_html.cpp |
|
b/src/internfile/mh_html.cpp |
|
... |
|
... |
39 |
#ifndef NO_NAMESPACES
|
39 |
#ifndef NO_NAMESPACES
|
40 |
using namespace std;
|
40 |
using namespace std;
|
41 |
#endif /* NO_NAMESPACES */
|
41 |
#endif /* NO_NAMESPACES */
|
42 |
|
42 |
|
43 |
|
43 |
|
44 |
MimeHandler::Status
|
44 |
bool MimeHandlerHtml::set_document_file(const string &fn)
|
45 |
MimeHandlerHtml::mkDoc(RclConfig *conf, const string &fn,
|
|
|
46 |
const string &mtype, Rcl::Doc &docout, string&)
|
|
|
47 |
{
|
45 |
{
|
48 |
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
|
46 |
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
|
49 |
string otext;
|
47 |
string otext;
|
50 |
if (!file_to_string(fn, otext)) {
|
48 |
if (!file_to_string(fn, otext)) {
|
51 |
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
49 |
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
52 |
return MimeHandler::MHError;
|
50 |
return false;
|
53 |
}
|
51 |
}
|
54 |
return mkDoc(conf, fn, otext, mtype, docout);
|
52 |
return set_document_string(otext);
|
55 |
}
|
53 |
}
|
56 |
|
54 |
|
57 |
MimeHandler::Status
|
55 |
bool MimeHandlerHtml::set_document_string(const string& htext)
|
58 |
MimeHandlerHtml::mkDoc(RclConfig *conf, const string &,
|
|
|
59 |
const string& htext,
|
|
|
60 |
const string &mtype, Rcl::Doc &docout)
|
|
|
61 |
{
|
56 |
{
|
62 |
//LOGDEB(("textHtmlToDoc: htext: %s\n", htext.c_str()));
|
57 |
m_html = htext;
|
63 |
// Character set handling: the initial guessed charset depends on
|
58 |
m_havedoc = true;
|
64 |
// external factors: possible hint (ie mime charset in a mail
|
59 |
return true;
|
65 |
// message), charset guessing, or default configured charset.
|
60 |
}
|
66 |
string charset;
|
|
|
67 |
if (!charsethint.empty()) {
|
|
|
68 |
charset = charsethint;
|
|
|
69 |
} else if (conf->getGuessCharset()) {
|
|
|
70 |
charset = csguess(htext, conf->getDefCharset());
|
|
|
71 |
} else
|
|
|
72 |
charset = conf->getDefCharset();
|
|
|
73 |
|
61 |
|
|
|
62 |
bool MimeHandlerHtml::next_document()
|
|
|
63 |
{
|
|
|
64 |
if (m_havedoc == false)
|
|
|
65 |
return false;
|
|
|
66 |
m_havedoc = false;
|
|
|
67 |
LOGDEB(("textHtmlToDoc: next_document\n"));
|
|
|
68 |
string charset = m_defcharset;
|
74 |
|
69 |
|
75 |
// - We first try to convert from the default configured charset
|
70 |
// - We first try to convert from the default configured charset
|
76 |
// (which may depend of the current directory) to utf-8. If this
|
71 |
// (which may depend of the current directory) to utf-8. If this
|
77 |
// fails, we keep the original text
|
72 |
// fails, we keep the original text
|
78 |
// - During parsing, if we find a charset parameter, and it differs from
|
73 |
// - During parsing, if we find a charset parameter, and it differs from
|
79 |
// what we started with, we abort and restart with the parameter value
|
74 |
// what we started with, we abort and restart with the parameter value
|
80 |
// instead of the configuration one.
|
75 |
// instead of the configuration one.
|
81 |
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
|
76 |
LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
|
82 |
|
77 |
|
83 |
MyHtmlParser result;
|
78 |
|
|
|
79 |
MyHtmlParser p(m_metaData["content"]);
|
84 |
for (int pass = 0; pass < 2; pass++) {
|
80 |
for (int pass = 0; pass < 2; pass++) {
|
85 |
string transcoded;
|
81 |
string transcoded;
|
86 |
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
82 |
LOGDEB(("Html::mkDoc: pass %d\n", pass));
|
87 |
MyHtmlParser p;
|
|
|
88 |
// Try transcoding. If it fails, use original text.
|
83 |
// Try transcoding. If it fails, use original text.
|
89 |
if (!transcode(htext, transcoded, charset, "UTF-8")) {
|
84 |
if (!transcode(m_html, transcoded, charset, "UTF-8")) {
|
90 |
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
85 |
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
91 |
charset.c_str()));
|
86 |
charset.c_str()));
|
92 |
transcoded = htext;
|
87 |
transcoded = m_html;
|
93 |
// We don't know the charset, at all
|
88 |
// We don't know the charset, at all
|
94 |
p.ocharset = p.charset = charset = "";
|
89 |
p.ocharset = p.charset = charset = "";
|
95 |
} else {
|
90 |
} else {
|
96 |
// ocharset has the putative source charset, transcoded is now
|
91 |
// ocharset has the putative source charset, transcoded is now
|
97 |
// in utf-8
|
92 |
// in utf-8
|
|
... |
|
... |
100 |
}
|
95 |
}
|
101 |
|
96 |
|
102 |
try {
|
97 |
try {
|
103 |
p.parse_html(transcoded);
|
98 |
p.parse_html(transcoded);
|
104 |
// No exception: ok?
|
99 |
// No exception: ok?
|
105 |
result = p;
|
|
|
106 |
break;
|
100 |
break;
|
107 |
} catch (bool diag) {
|
101 |
} catch (bool diag) {
|
108 |
result = p;
|
|
|
109 |
if (diag == true)
|
102 |
if (diag == true)
|
110 |
break;
|
103 |
break;
|
111 |
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
104 |
LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
|
112 |
charset.c_str(),result.doccharset.c_str()));
|
105 |
charset.c_str(), p.doccharset.c_str()));
|
113 |
if (!result.doccharset.empty() &&
|
106 |
if (!p.doccharset.empty() &&
|
114 |
!samecharset(result.doccharset, result.ocharset)) {
|
107 |
!samecharset(p.doccharset, p.ocharset)) {
|
115 |
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
108 |
LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
|
116 |
charset = result.doccharset;
|
109 |
charset = p.doccharset;
|
117 |
} else {
|
110 |
} else {
|
118 |
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
111 |
LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
|
119 |
return MimeHandler::MHError;
|
112 |
return false;
|
120 |
}
|
113 |
}
|
121 |
}
|
114 |
}
|
122 |
}
|
115 |
}
|
123 |
|
116 |
|
124 |
docout.origcharset = charset;
|
117 |
m_metaData["origcharset"] = m_defcharset;
|
125 |
docout.text = result.dump;
|
118 |
m_metaData["charset"] = "utf-8";
|
126 |
//LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str()));
|
119 |
m_metaData["title"] = p.title;
|
127 |
docout.title = result.title;
|
120 |
m_metaData["keywords"] = p.keywords;
|
128 |
docout.keywords = result.keywords;
|
121 |
m_metaData["modificationdate"] = p.dmtime;
|
129 |
docout.abstract = result.sample;
|
122 |
m_metaData["sample"] = p.sample;
|
130 |
docout.dmtime = result.dmtime;
|
123 |
m_metaData["mimetype"] = "text/plain";
|
131 |
return MimeHandler::MHDone;
|
124 |
return true;
|
132 |
}
|
125 |
}
|