Switch to unified view

a/src/internfile/txtdcode.cpp b/src/internfile/txtdcode.cpp
...
...
13
 *  along with this program; if not, write to the Free Software
13
 *  along with this program; if not, write to the Free Software
14
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
14
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15
 */
15
 */
16
#include "autoconfig.h"
16
#include "autoconfig.h"
17
17
18
#include <sstream>
19
18
#include "cstr.h"
20
#include "cstr.h"
19
#include "transcode.h"
21
#include "transcode.h"
20
#include "mimehandler.h"
22
#include "mimehandler.h"
21
#include "log.h"
23
#include "log.h"
22
#include "smallut.h"
24
#include "smallut.h"
23
25
#include "listmem.h"
24
26
25
// Called after decoding from utf-8 failed. Handle the common case
27
// Called after decoding from utf-8 failed. Handle the common case
26
// where this is a good old 8bit-encoded text document left-over when
28
// where this is a good old 8bit-encoded text document left-over when
27
// the locale was switched to utf-8. We try to guess a charset
29
// the locale was switched to utf-8. We try to guess a charset
28
// according to the locale language and use it. This is a very rough
30
// according to the locale language and use it. This is a very rough
29
// heuristic, but may be better than discarding the data. 
31
// heuristic, but may be better than discarding the data. 
30
// If we still get a significant number of decode errors, the doc is
32
// If we still get a significant number of decode errors, the doc is
31
// quite probably binary, so just fail.
33
// quite probably binary, so just fail.
34
// Note that we could very well get a wrong transcoding (e.g. between
35
// iso-8859 variations), there is no way to detect it.
32
static bool alternate_decode(const string& in, string& out)
36
static bool alternate_decode(const string& in, string& out, const string& ocs)
33
{
37
{
34
    string lang = localelang();
35
    string code = langtocode(lang);
36
    LOGDEB("RecollFilter::txtdcode: trying alternate decode from "  << (code) << "\n" );
37
    int ecnt;
38
    int ecnt;
39
    if (samecharset(ocs, cstr_utf8)) {
40
        string lang = localelang();
41
        string code = langtocode(lang);
42
        LOGDEB("RecollFilter::txtdcode: trying alternate decode from " <<
43
               code << "\n");
38
    bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
44
        bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
39
    return ecnt > 5 ? false : ret;
45
        return ecnt > 5 ? false : ret;
46
    } else {
47
        // Give a try to utf-8 anyway, as this is self-detecting. This
48
        // handles UTF-8 docs in a non-utf-8 environment. Note that
49
        // this will almost never be called, as most encodings are
50
        // unable to detect errors so that the first try at
51
        // transcoding will have succeeded and alternate_decode() will
52
        // not be called at all.
53
        // 
54
        // To avoid this, we would have to attempt an utf-8 decode
55
        // first, but this is a costly proposition as we don't know
56
        // how much data to test, so need to test all (the beginning
57
        // of the text could be ascii even if there are 8-bit chars
58
        // later).
59
        bool ret = transcode(in, out, cstr_utf8, cstr_utf8, &ecnt);
60
        return ecnt > 5 ? false : ret;
61
    }
62
}
63
64
// Look for a Unicode byte order mark at the very beginning of the input
// and map it to a charset name.
//
// The 4-byte UTF-32 signatures are tested before the 2-byte UTF-16
// ones: the UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark
// (FF FE), so testing UTF-16LE first would make UTF-32LE undetectable.
// The price is that UTF-16LE text whose first character after the BOM
// is U+0000 is reported as UTF-32LE.
//
// @param itext the raw document bytes.
// @return "UTF-8", "UTF-32BE", "UTF-32LE", "UTF-16BE" or "UTF-16LE"
//         when the matching signature is found, else an empty string.
static std::string bomtocode(const std::string& itext)
{
#if 0
    std::ostringstream strm;
    listmem(strm, itext.c_str(), MIN(itext.size(), 8));
    LOGDEB("txtdcode:bomtocode: input " << strm.str() << "\n");
#endif

    // View the data as unsigned bytes so that comparisons with values
    // above 0x7F do not depend on the signedness of plain char.
    const unsigned char *utxt =
        reinterpret_cast<const unsigned char *>(itext.data());
    const std::string::size_type len = itext.size();

    if (len >= 3 && utxt[0] == 0xEF && utxt[1] == 0xBB && utxt[2] == 0xBF) {
        LOGDEB("txtdcode:bomtocode: UTF-8\n");
        return "UTF-8";
    }
    if (len >= 4) {
        if (utxt[0] == 0 && utxt[1] == 0 &&
            utxt[2] == 0xFE && utxt[3] == 0xFF) {
            return "UTF-32BE";
        }
        if (utxt[0] == 0xFF && utxt[1] == 0xFE &&
            utxt[2] == 0 && utxt[3] == 0) {
            return "UTF-32LE";
        }
    }
    if (len >= 2) {
        if (utxt[0] == 0xFE && utxt[1] == 0xFF) {
            return "UTF-16BE";
        }
        if (utxt[0] == 0xFF && utxt[1] == 0xFE) {
            return "UTF-16LE";
        }
    }
    // No recognized signature.
    return std::string();
}
41
91
42
bool RecollFilter::txtdcode(const string& who)
92
bool RecollFilter::txtdcode(const string& who)
43
{
93
{
44
    if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
94
    if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
45
  LOGERR(""  << (who) << "::txtdcode: called on non txt/plain: "  << (m_metaData[cstr_dj_keymt]) << "\n" );
95
  LOGERR(who << "::txtdcode: called on non txt/plain: " <<
96
               m_metaData[cstr_dj_keymt] << "\n");
46
    return false;
97
    return false;
47
    }
98
    }
48
99
49
    string& ocs = m_metaData[cstr_dj_keyorigcharset];
100
    string& ocs = m_metaData[cstr_dj_keyorigcharset];
50
    string& itext = m_metaData[cstr_dj_keycontent];
101
    string& itext = m_metaData[cstr_dj_keycontent];
51
    LOGDEB1(""  << (who) << "::txtdcode: "  << (itext.size()) << " bytes from ["  << (ocs) << "] to UTF-8\n" );
102
    LOGDEB(who << "::txtdcode: "  << itext.size() << " bytes from ["  <<
103
           ocs << "] to UTF-8\n");
52
    int ecnt;
104
    int ecnt;
53
    string otext;
105
    string otext;
106
107
    string bomfromcode = bomtocode(itext);
108
    if (!bomfromcode.empty()) {
109
        LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
110
               ocs << " to " << bomfromcode << " from BOM detection\n");
111
        ocs = bomfromcode;
112
    }
113
    
54
    bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
114
    bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
55
    if (!ret || ecnt > int(itext.size() / 100)) {
115
    if (!ret || ecnt > int(itext.size() / 100)) {
56
  LOGERR(""  << (who) << "::txtdcode: transcode "  << (itext.size()) << " bytes to UTF-8 failed for input charset ["  << (ocs) << "] ret "  << (ret) << " ecnt "  << (ecnt) << "\n" );
116
  LOGERR(who << "::txtdcode: transcode " << itext.size() <<
117
               " bytes to UTF-8 failed for input charset [" << ocs <<
118
               "] ret " << ret << " ecnt "  << ecnt << "\n");
57
119
58
  if (samecharset(ocs, cstr_utf8)) {
59
      ret = alternate_decode(itext, otext);
120
        ret = alternate_decode(itext, otext, ocs);
60
  } else {
121
61
      ret = false;
62
  }
63
    if (!ret) {
122
    if (!ret) {
64
        LOGDEB("txtdcode: failed. Doc is not text?\n" );
123
        LOGDEB("txtdcode: failed. Doc is not text?\n" );
65
        itext.erase();
124
        itext.erase();
66
        return false;
125
        return false;
67
    }
126
    }
...
...
69
128
70
    itext.swap(otext);
129
    itext.swap(otext);
71
    m_metaData[cstr_dj_keycharset] = cstr_utf8;
130
    m_metaData[cstr_dj_keycharset] = cstr_utf8;
72
    return true;
131
    return true;
73
}
132
}
74
75