|
a/src/internfile/mh_html.cpp |
|
b/src/internfile/mh_html.cpp |
|
... |
|
... |
30 |
#include "readfile.h"
|
30 |
#include "readfile.h"
|
31 |
#include "transcode.h"
|
31 |
#include "transcode.h"
|
32 |
#include "mimeparse.h"
|
32 |
#include "mimeparse.h"
|
33 |
#include "myhtmlparse.h"
|
33 |
#include "myhtmlparse.h"
|
34 |
#include "indextext.h"
|
34 |
#include "indextext.h"
|
|
|
35 |
#include "html.h"
|
35 |
|
36 |
|
36 |
#include <iostream>
|
37 |
#include <iostream>
|
37 |
using namespace std;
|
38 |
using namespace std;
|
38 |
|
39 |
|
39 |
|
40 |
|
40 |
bool textHtmlToDoc(RclConfig *conf, const string &fn,
|
41 |
bool MimeHandlerHtml::worker(RclConfig *conf, const string &fn,
|
41 |
const string &mtype, Rcl::Doc &docout)
|
42 |
const string &mtype, Rcl::Doc &docout)
|
42 |
{
|
43 |
{
|
43 |
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
|
44 |
LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
|
44 |
string otext;
|
45 |
string otext;
|
45 |
if (!file_to_string(fn, otext)) {
|
46 |
if (!file_to_string(fn, otext)) {
|
46 |
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
47 |
LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
|
47 |
return false;
|
48 |
return false;
|
48 |
}
|
49 |
}
|
49 |
|
50 |
return worker1(conf, fn, otext, mtype, docout);
|
|
|
51 |
}
|
|
|
52 |
|
|
|
53 |
bool MimeHandlerHtml::worker1(RclConfig *conf, const string &fn,
|
|
|
54 |
const string& htext,
|
|
|
55 |
const string &mtype, Rcl::Doc &docout)
|
|
|
56 |
{
|
50 |
// Character set handling:
|
57 |
// Character set handling:
|
51 |
|
58 |
|
52 |
// - We first try to convert from the default configured charset
|
59 |
// - We first try to convert from the default configured charset
|
53 |
// (which may depend on the current directory) to utf-8. If this
|
60 |
// (which may depend on the current directory) to utf-8. If this
|
54 |
// fails, we keep the original text
|
61 |
// fails, we keep the original text
|
55 |
// - During parsing, if we find a charset parameter, and it differs from
|
62 |
// - During parsing, if we find a charset parameter, and it differs from
|
56 |
// what we started with, we abort and restart with the parameter value
|
63 |
// what we started with, we abort and restart with the parameter value
|
57 |
// instead of the configuration one.
|
64 |
// instead of the configuration one.
|
58 |
string charset;
|
65 |
string charset;
|
59 |
if (conf->guesscharset) {
|
66 |
if (conf->guesscharset) {
|
60 |
charset = csguess(otext, conf->defcharset);
|
67 |
charset = csguess(htext, conf->defcharset);
|
61 |
} else
|
68 |
} else
|
62 |
charset = conf->defcharset;
|
69 |
charset = conf->defcharset;
|
63 |
|
70 |
|
64 |
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
|
71 |
LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
|
65 |
|
72 |
|
|
... |
|
... |
67 |
for (int pass = 0; pass < 2; pass++) {
|
74 |
for (int pass = 0; pass < 2; pass++) {
|
68 |
string transcoded;
|
75 |
string transcoded;
|
69 |
|
76 |
|
70 |
MyHtmlParser p;
|
77 |
MyHtmlParser p;
|
71 |
// Try transcoding. If it fails, use original text.
|
78 |
// Try transcoding. If it fails, use original text.
|
72 |
if (!transcode(otext, transcoded, charset, "UTF-8")) {
|
79 |
if (!transcode(htext, transcoded, charset, "UTF-8")) {
|
73 |
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
80 |
LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
|
74 |
charset.c_str()));
|
81 |
charset.c_str()));
|
75 |
transcoded = otext;
|
82 |
transcoded = htext;
|
76 |
// We don't know the charset, at all
|
83 |
// We don't know the charset, at all
|
77 |
p.ocharset = p.charset = charset = "";
|
84 |
p.ocharset = p.charset = charset = "";
|
78 |
} else {
|
85 |
} else {
|
79 |
// ocharset has the putative source charset, transcoded is now
|
86 |
// ocharset has the putative source charset, transcoded is now
|
80 |
// in utf-8
|
87 |
// in utf-8
|