Switch to unified view

a/src/internfile/mh_html.cpp b/src/internfile/mh_html.cpp
...
...
30
#include "readfile.h"
30
#include "readfile.h"
31
#include "transcode.h"
31
#include "transcode.h"
32
#include "mimeparse.h"
32
#include "mimeparse.h"
33
#include "myhtmlparse.h"
33
#include "myhtmlparse.h"
34
#include "indextext.h"
34
#include "indextext.h"
35
#include "html.h"
35
36
36
#include <iostream>
37
#include <iostream>
37
using namespace std;
38
using namespace std;
38
39
39
40
40
bool textHtmlToDoc(RclConfig *conf, const string &fn, 
41
bool MimeHandlerHtml::worker(RclConfig *conf, const string &fn, 
41
             const string &mtype, Rcl::Doc &docout)
42
                 const string &mtype, Rcl::Doc &docout)
42
{
43
{
43
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
44
    LOGDEB(("textHtmlToDoc: %s\n", fn.c_str()));
44
    string otext;
45
    string otext;
45
    if (!file_to_string(fn, otext)) {
46
    if (!file_to_string(fn, otext)) {
46
    LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
47
    LOGINFO(("textHtmlToDoc: cant read: %s\n", fn.c_str()));
47
    return false;
48
    return false;
48
    }
49
    }
49
    
50
    return worker1(conf, fn, otext, mtype, docout);
51
}
52
53
bool MimeHandlerHtml::worker1(RclConfig *conf, const string &fn, 
54
               const string& htext,
55
               const string &mtype, Rcl::Doc &docout)
56
{
50
    // Character set handling:
57
    // Character set handling:
51
58
52
    // - We first try to convert from the default configured charset
59
    // - We first try to convert from the default configured charset
53
    //   (which may depend on the current directory) to utf-8. If this
60
    //   (which may depend on the current directory) to utf-8. If this
54
    //   fails, we keep the original text
61
    //   fails, we keep the original text
55
    // - During parsing, if we find a charset parameter, and it differs from
62
    // - During parsing, if we find a charset parameter, and it differs from
56
    //   what we started with, we abort and restart with the parameter value
63
    //   what we started with, we abort and restart with the parameter value
57
    //   instead of the configuration one.
64
    //   instead of the configuration one.
58
    string charset;
65
    string charset;
59
    if (conf->guesscharset) {
66
    if (conf->guesscharset) {
60
    charset = csguess(otext, conf->defcharset);
67
    charset = csguess(htext, conf->defcharset);
61
    } else
68
    } else
62
    charset = conf->defcharset;
69
    charset = conf->defcharset;
63
70
64
    LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
71
    LOGDEB(("textHtmlToDoc: charset before parsing: %s\n", charset.c_str()));
65
72
...
...
67
    for (int pass = 0; pass < 2; pass++) {
74
    for (int pass = 0; pass < 2; pass++) {
68
    string transcoded;
75
    string transcoded;
69
76
70
    MyHtmlParser p;
77
    MyHtmlParser p;
71
    // Try transcoding. If it fails, use original text.
78
    // Try transcoding. If it fails, use original text.
72
    if (!transcode(otext, transcoded, charset, "UTF-8")) {
79
    if (!transcode(htext, transcoded, charset, "UTF-8")) {
73
        LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
80
        LOGERR(("textHtmlToDoc: transcode failed from cs '%s' to UTF-8\n",
74
            charset.c_str()));
81
            charset.c_str()));
75
        transcoded = otext;
82
        transcoded = htext;
76
        // We don't know the charset, at all
83
        // We don't know the charset, at all
77
        p.ocharset = p.charset = charset = "";
84
        p.ocharset = p.charset = charset = "";
78
    } else {
85
    } else {
79
        // ocharset has the putative source charset, transcoded is now
86
        // ocharset has the putative source charset, transcoded is now
80
        // in utf-8
87
        // in utf-8