Switch to unified view

a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
...
...
39
#ifndef NO_NAMESPACES
39
#ifndef NO_NAMESPACES
40
using namespace std;
40
using namespace std;
41
#endif /* NO_NAMESPACES */
41
#endif /* NO_NAMESPACES */
42
42
43
43
44
MimeHandler::Status 
44
bool MimeHandlerHtml::set_document_file(const string &fn)
45
MimeHandlerHtml::mkDoc(RclConfig *conf, const string &fn, 
46
          const string &mtype, Rcl::Doc &docout, string&)
47
{
45
{
48
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
46
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
49
    string otext;
47
    string otext;
50
    if (!file_to_string(fn, otext)) {
48
    if (!file_to_string(fn, otext)) {
51
    LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
49
    LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
52
  return MimeHandler::MHError;
50
  return false;
53
    }
51
    }
54
    return mkDoc(conf, fn, otext, mtype, docout);
52
    return set_document_string(otext);
55
}
53
}
56
54
57
MimeHandler::Status 
55
bool MimeHandlerHtml::set_document_string(const string& htext) 
58
MimeHandlerHtml::mkDoc(RclConfig *conf, const string &, 
59
           const string& htext,
60
           const string &mtype, Rcl::Doc &docout)
61
{
56
{
62
    //LOGDEB(("textHtmlToDoc: htext: %s\n", htext.c_str()));
57
    m_html = htext;
63
    // Character set handling: the initial guessed charset depends on
58
    m_havedoc = true;
64
    // external factors: possible hint (ie mime charset in a mail
59
    return true;
65
    // message), charset guessing, or default configured charset.
60
}
66
    string charset;
67
    if (!charsethint.empty()) {
68
  charset = charsethint;
69
    } else if (conf->getGuessCharset()) {
70
  charset = csguess(htext, conf->getDefCharset());
71
    } else
72
  charset = conf->getDefCharset();
73
61
62
bool MimeHandlerHtml::next_document()
63
{
64
    if (m_havedoc == false)
65
  return false;
66
    m_havedoc = false;
67
    LOGDEB(("textHtmlToDoc: next_document\n"));
68
    string charset = m_defcharset;
74
69
75
    // - We first try to convert from the default configured charset
70
    // - We first try to convert from the default configured charset
76
    //   (which may depend of the current directory) to utf-8. If this
71
    //   (which may depend of the current directory) to utf-8. If this
77
    //   fails, we keep the original text
72
    //   fails, we keep the original text
78
    // - During parsing, if we find a charset parameter, and it differs from
73
    // - During parsing, if we find a charset parameter, and it differs from
79
    //   what we started with, we abort and restart with the parameter value
74
    //   what we started with, we abort and restart with the parameter value
80
    //   instead of the configuration one.
75
    //   instead of the configuration one.
81
    LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
76
    LOGDEB(("textHtmlToDoc: charset before parsing: [%s]\n", charset.c_str()));
82
77
83
    MyHtmlParser result;
78
79
    MyHtmlParser p(m_metaData["content"]);
84
    for (int pass = 0; pass < 2; pass++) {
80
    for (int pass = 0; pass < 2; pass++) {
85
    string transcoded;
81
    string transcoded;
86
    LOGDEB(("Html::mkDoc: pass %d\n", pass));
82
    LOGDEB(("Html::mkDoc: pass %d\n", pass));
87
  MyHtmlParser p;
88
    // Try transcoding. If it fails, use original text.
83
    // Try transcoding. If it fails, use original text.
89
    if (!transcode(htext, transcoded, charset, "UTF-8")) {
84
    if (!transcode(m_html, transcoded, charset, "UTF-8")) {
90
        LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
85
        LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
91
            charset.c_str()));
86
            charset.c_str()));
92
        transcoded = htext;
87
        transcoded = m_html;
93
        // We don't know the charset, at all
88
        // We don't know the charset, at all
94
        p.ocharset = p.charset = charset = "";
89
        p.ocharset = p.charset = charset = "";
95
    } else {
90
    } else {
96
        // ocharset has the putative source charset, transcoded is now
91
        // ocharset has the putative source charset, transcoded is now
97
        // in utf-8
92
        // in utf-8
...
...
100
    }
95
    }
101
96
102
    try {
97
    try {
103
        p.parse_html(transcoded);
98
        p.parse_html(transcoded);
104
        // No exception: ok?
99
        // No exception: ok?
105
      result = p;
106
        break;
100
        break;
107
    } catch (bool diag) {
101
    } catch (bool diag) {
108
      result = p;
109
        if (diag == true)
102
        if (diag == true)
110
        break;
103
        break;
111
        LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
104
        LOGDEB(("textHtmlToDoc: charset [%s] doc charset [%s]\n",
112
            charset.c_str(),result.doccharset.c_str()));
105
            charset.c_str(), p.doccharset.c_str()));
113
        if (!result.doccharset.empty() && 
106
        if (!p.doccharset.empty() && 
114
        !samecharset(result.doccharset, result.ocharset)) {
107
        !samecharset(p.doccharset, p.ocharset)) {
115
        LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
108
        LOGDEB(("textHtmlToDoc: reparse for charsets\n"));
116
        charset = result.doccharset;
109
        charset = p.doccharset;
117
        } else {
110
        } else {
118
        LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
111
        LOGERR(("textHtmlToDoc:: error: non charset exception\n"));
119
      return MimeHandler::MHError;
112
      return false;
120
        }
113
        }
121
    }
114
    }
122
    }
115
    }
123
116
124
    docout.origcharset = charset;
117
    m_metaData["origcharset"] = m_defcharset;
125
    docout.text = result.dump;
118
    m_metaData["charset"] = "utf-8";
126
    //LOGDEB(("textHtmlToDoc: dump : %s\n", result.dump.c_str()));
119
    m_metaData["title"] = p.title;
127
    docout.title = result.title;
120
    m_metaData["keywords"] = p.keywords;
128
    docout.keywords = result.keywords;
121
    m_metaData["modificationdate"] = p.dmtime;
129
    docout.abstract = result.sample;
122
    m_metaData["sample"] = p.sample;
130
    docout.dmtime = result.dmtime;
123
    m_metaData["mimetype"] = "text/plain";
131
    return MimeHandler::MHDone;
124
    return true;
132
}
125
}