|
a/src/internfile/txtdcode.cpp |
|
b/src/internfile/txtdcode.cpp |
|
... |
|
... |
13 |
* along with this program; if not, write to the Free Software
|
13 |
* along with this program; if not, write to the Free Software
|
14 |
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
14 |
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
*/
|
15 |
*/
|
16 |
#include "autoconfig.h"
|
16 |
#include "autoconfig.h"
|
17 |
|
17 |
|
|
|
18 |
#include <sstream>
|
|
|
19 |
|
18 |
#include "cstr.h"
|
20 |
#include "cstr.h"
|
19 |
#include "transcode.h"
|
21 |
#include "transcode.h"
|
20 |
#include "mimehandler.h"
|
22 |
#include "mimehandler.h"
|
21 |
#include "log.h"
|
23 |
#include "log.h"
|
22 |
#include "smallut.h"
|
24 |
#include "smallut.h"
|
23 |
|
25 |
#include "listmem.h"
|
24 |
|
26 |
|
25 |
// Called after decoding from utf-8 failed. Handle the common case
|
27 |
// Called after decoding from utf-8 failed. Handle the common case
|
26 |
// where this is a good old 8bit-encoded text document left-over when
|
28 |
// where this is a good old 8bit-encoded text document left-over when
|
27 |
// the locale was switched to utf-8. We try to guess a charset
|
29 |
// the locale was switched to utf-8. We try to guess a charset
|
28 |
// according to the locale language and use it. This is a very rough
|
30 |
// according to the locale language and use it. This is a very rough
|
29 |
// heuristic, but may be better than discarding the data.
|
31 |
// heuristic, but may be better than discarding the data.
|
30 |
// If we still get a significant number of decode errors, the doc is
|
32 |
// If we still get a significant number of decode errors, the doc is
|
31 |
// quite probably binary, so just fail.
|
33 |
// quite probably binary, so just fail.
|
|
|
34 |
// Note that we could very well get a wrong transcoding (e.g. between
|
|
|
35 |
// iso-8859 variations), there is no way to detect it.
|
32 |
static bool alternate_decode(const string& in, string& out)
|
36 |
static bool alternate_decode(const string& in, string& out, const string& ocs)
|
33 |
{
|
37 |
{
|
34 |
string lang = localelang();
|
|
|
35 |
string code = langtocode(lang);
|
|
|
36 |
LOGDEB("RecollFilter::txtdcode: trying alternate decode from " << (code) << "\n" );
|
|
|
37 |
int ecnt;
|
38 |
int ecnt;
|
|
|
39 |
if (samecharset(ocs, cstr_utf8)) {
|
|
|
40 |
string lang = localelang();
|
|
|
41 |
string code = langtocode(lang);
|
|
|
42 |
LOGDEB("RecollFilter::txtdcode: trying alternate decode from " <<
|
|
|
43 |
code << "\n");
|
38 |
bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
|
44 |
bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
|
39 |
return ecnt > 5 ? false : ret;
|
45 |
return ecnt > 5 ? false : ret;
|
|
|
46 |
} else {
|
|
|
47 |
// Give a try to utf-8 anyway, as this is self-detecting. This
|
|
|
48 |
// handles UTF-8 docs in a non-utf-8 environment. Note that
|
|
|
49 |
// this will almost never be called, as most encodings are
|
|
|
50 |
// unable to detect errors so that the first try at
|
|
|
51 |
// transcoding will have succeeded and alternate_decode() will
|
|
|
52 |
// not be called at all.
|
|
|
53 |
//
|
|
|
54 |
// To avoid this, we would have to attempt an utf-8 decode
|
|
|
55 |
// first, but this is a costly proposition as we don't know
|
|
|
56 |
// how much data to test, so need to test all (the beginning
|
|
|
57 |
// of the text could be ascii even if there are 8-bit chars
|
|
|
58 |
// later).
|
|
|
59 |
bool ret = transcode(in, out, cstr_utf8, cstr_utf8, &ecnt);
|
|
|
60 |
return ecnt > 5 ? false : ret;
|
|
|
61 |
}
|
|
|
62 |
}
|
|
|
63 |
|
|
|
64 |
// Identify the character encoding announced by a Unicode byte order
// mark (BOM) at the very beginning of the text.
//
// @param itext raw document bytes, before any transcoding.
// @return "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-32BE" or "UTF-32LE" when
//     the text starts with the corresponding mark, else an empty string.
//
// The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 ones: the
// UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark (FF FE), so
// testing UTF-16LE first would make the UTF-32LE case unreachable. The
// cost is that UTF-16LE text whose first character is U+0000 is
// reported as UTF-32LE, which is the usual way to resolve this
// ambiguity.
static string bomtocode(const string& itext)
{
#if 0
    std::ostringstream strm;
    listmem(strm, itext.c_str(), MIN(itext.size(), 8));
    LOGDEB("txtdcode:bomtocode: input " << strm.str() << "\n");
#endif

    const unsigned char *bytes =
        reinterpret_cast<const unsigned char *>(itext.data());
    const string::size_type len = itext.size();

    if (len >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB &&
        bytes[2] == 0xBF) {
        LOGDEB("txtdcode:bomtocode: UTF-8\n");
        return "UTF-8";
    }

    if (len >= 4) {
        if (bytes[0] == 0 && bytes[1] == 0 && bytes[2] == 0xFE &&
            bytes[3] == 0xFF) {
            return "UTF-32BE";
        }
        if (bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0 &&
            bytes[3] == 0) {
            return "UTF-32LE";
        }
    }

    if (len >= 2) {
        if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
            return "UTF-16BE";
        }
        if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
            return "UTF-16LE";
        }
    }

    // No recognizable mark.
    return string();
}
|
41 |
|
91 |
|
42 |
bool RecollFilter::txtdcode(const string& who)
|
92 |
bool RecollFilter::txtdcode(const string& who)
|
43 |
{
|
93 |
{
|
44 |
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
|
94 |
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
|
45 |
LOGERR("" << (who) << "::txtdcode: called on non txt/plain: " << (m_metaData[cstr_dj_keymt]) << "\n" );
|
95 |
LOGERR(who << "::txtdcode: called on non txt/plain: " <<
|
|
|
96 |
m_metaData[cstr_dj_keymt] << "\n");
|
46 |
return false;
|
97 |
return false;
|
47 |
}
|
98 |
}
|
48 |
|
99 |
|
49 |
string& ocs = m_metaData[cstr_dj_keyorigcharset];
|
100 |
string& ocs = m_metaData[cstr_dj_keyorigcharset];
|
50 |
string& itext = m_metaData[cstr_dj_keycontent];
|
101 |
string& itext = m_metaData[cstr_dj_keycontent];
|
51 |
LOGDEB1("" << (who) << "::txtdcode: " << (itext.size()) << " bytes from [" << (ocs) << "] to UTF-8\n" );
|
102 |
LOGDEB(who << "::txtdcode: " << itext.size() << " bytes from [" <<
|
|
|
103 |
ocs << "] to UTF-8\n");
|
52 |
int ecnt;
|
104 |
int ecnt;
|
53 |
string otext;
|
105 |
string otext;
|
|
|
106 |
|
|
|
107 |
string bomfromcode = bomtocode(itext);
|
|
|
108 |
if (!bomfromcode.empty()) {
|
|
|
109 |
LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
|
|
|
110 |
ocs << " to " << bomfromcode << " from BOM detection\n");
|
|
|
111 |
ocs = bomfromcode;
|
|
|
112 |
}
|
|
|
113 |
|
54 |
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
|
114 |
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
|
55 |
if (!ret || ecnt > int(itext.size() / 100)) {
|
115 |
if (!ret || ecnt > int(itext.size() / 100)) {
|
56 |
LOGERR("" << (who) << "::txtdcode: transcode " << (itext.size()) << " bytes to UTF-8 failed for input charset [" << (ocs) << "] ret " << (ret) << " ecnt " << (ecnt) << "\n" );
|
116 |
LOGERR(who << "::txtdcode: transcode " << itext.size() <<
|
|
|
117 |
" bytes to UTF-8 failed for input charset [" << ocs <<
|
|
|
118 |
"] ret " << ret << " ecnt " << ecnt << "\n");
|
57 |
|
119 |
|
58 |
if (samecharset(ocs, cstr_utf8)) {
|
|
|
59 |
ret = alternate_decode(itext, otext);
|
120 |
ret = alternate_decode(itext, otext, ocs);
|
60 |
} else {
|
121 |
|
61 |
ret = false;
|
|
|
62 |
}
|
|
|
63 |
if (!ret) {
|
122 |
if (!ret) {
|
64 |
LOGDEB("txtdcode: failed. Doc is not text?\n" );
|
123 |
LOGDEB("txtdcode: failed. Doc is not text?\n" );
|
65 |
itext.erase();
|
124 |
itext.erase();
|
66 |
return false;
|
125 |
return false;
|
67 |
}
|
126 |
}
|
|
... |
|
... |
69 |
|
128 |
|
70 |
itext.swap(otext);
|
129 |
itext.swap(otext);
|
71 |
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
130 |
m_metaData[cstr_dj_keycharset] = cstr_utf8;
|
72 |
return true;
|
131 |
return true;
|
73 |
}
|
132 |
}
|
74 |
|
|
|
75 |
|
|
|