|
a/src/internfile/txtdcode.cpp |
|
b/src/internfile/txtdcode.cpp |
|
... |
|
... |
24 |
|
24 |
|
25 |
// Called after decoding from utf-8 failed. Handle the common case
|
25 |
// Called after decoding from utf-8 failed. Handle the common case
|
26 |
// where this is a good old 8bit-encoded text document left-over when
|
26 |
// where this is a good old 8bit-encoded text document left-over when
|
27 |
// the locale was switched to utf-8. We try to guess a charset
|
27 |
// the locale was switched to utf-8. We try to guess a charset
|
28 |
// according to the locale language and use it. This is a very rough
|
28 |
// according to the locale language and use it. This is a very rough
|
29 |
// heuristic, but may be better than discarding the data.
|
29 |
// heuristic, but may be better than discarding the data.
|
|
|
30 |
// If we still get a significant number of decode errors, the doc is
|
|
|
31 |
// quite probably binary, so just fail.
|
30 |
static bool alternate_decode(const string& in, string& out)
|
32 |
static bool alternate_decode(const string& in, string& out)
|
31 |
{
|
33 |
{
|
32 |
string lang = localelang();
|
34 |
string lang = localelang();
|
33 |
string code = langtocode(lang);
|
35 |
string code = langtocode(lang);
|
34 |
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
|
36 |
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
|
35 |
code.c_str()));
|
37 |
code.c_str()));
|
|
|
38 |
int ecnt;
|
36 |
return transcode(in, out, code, cstr_utf8);
|
39 |
bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
|
|
|
40 |
return ecnt > 5 ? false : ret;
|
37 |
}
|
41 |
}
|
38 |
|
42 |
|
39 |
bool RecollFilter::txtdcode(const string& who)
|
43 |
bool RecollFilter::txtdcode(const string& who)
|
40 |
{
|
44 |
{
|
41 |
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
|
45 |
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
|
|
... |
|
... |
56 |
"for input charset [%s] ret %d ecnt %d\n",
|
60 |
"for input charset [%s] ret %d ecnt %d\n",
|
57 |
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
|
61 |
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
|
58 |
|
62 |
|
59 |
if (samecharset(ocs, cstr_utf8)) {
|
63 |
if (samecharset(ocs, cstr_utf8)) {
|
60 |
ret = alternate_decode(itext, otext);
|
64 |
ret = alternate_decode(itext, otext);
|
|
|
65 |
} else {
|
|
|
66 |
ret = false;
|
61 |
}
|
67 |
}
|
62 |
if (!ret) {
|
68 |
if (!ret) {
|
|
|
69 |
LOGDEB(("txtdcode: failed. Doc is not text?\n"));
|
63 |
itext.erase();
|
70 |
itext.erase();
|
64 |
return false;
|
71 |
return false;
|
65 |
}
|
72 |
}
|
66 |
}
|
73 |
}
|
67 |
|
74 |
|