|
a/src/internfile/txtdcode.cpp |
|
b/src/internfile/txtdcode.cpp |
|
... |
|
... |
22 |
#include "mimehandler.h"
|
22 |
#include "mimehandler.h"
|
23 |
#include "log.h"
|
23 |
#include "log.h"
|
24 |
#include "smallut.h"
|
24 |
#include "smallut.h"
|
25 |
#include "listmem.h"
|
25 |
#include "listmem.h"
|
26 |
|
26 |
|
|
|
27 |
using std::string;
|
|
|
28 |
|
27 |
// Called after decoding from utf-8 failed. Handle the common case
|
29 |
// Called after decoding from utf-8 failed. Handle the common case
|
28 |
// where this is a good old 8bit-encoded text document left-over when
|
30 |
// where this is a good old 8bit-encoded text document left-over when
|
29 |
// the locale was switched to utf-8. We try to guess a charset
|
31 |
// the locale was switched to utf-8. We try to guess a charset
|
30 |
// according to the locale language and use it. This is a very rough
|
32 |
// according to the locale language and use it. This is a very rough
|
31 |
// heuristic, but may be better than discarding the data.
|
33 |
// heuristic, but may be better than discarding the data.
|
32 |
// If we still get a significant number of decode errors, the doc is
|
34 |
// If we still get a significant number of decode errors, the doc is
|
33 |
// quite probably binary, so just fail.
|
35 |
// quite probably binary, so just fail.
|
34 |
// Note that we could very well get a wrong transcoding (e.g. between
|
36 |
// Note that we could very well get a wrong transcoding (e.g. between
|
35 |
// iso-8859 variations), there is no way to detect it.
|
37 |
// iso-8859 variations), there is no way to detect it.
|
36 |
static bool alternate_decode(const string& in, string& out, const string& ocs)
|
38 |
static bool alternate_decode(const string& in, string& out, string& ocs)
|
37 |
{
|
39 |
{
|
38 |
int ecnt;
|
40 |
int ecnt;
|
39 |
if (samecharset(ocs, cstr_utf8)) {
|
41 |
if (samecharset(ocs, cstr_utf8)) {
|
40 |
string lang = localelang();
|
42 |
string lang = localelang();
|
41 |
string code = langtocode(lang);
|
43 |
string code = langtocode(lang);
|
42 |
LOGDEB("RecollFilter::txtdcode: trying alternate decode from " <<
|
44 |
LOGDEB("RecollFilter::txtdcode: trying alternate decode from " <<
|
43 |
code << "\n");
|
45 |
code << "\n");
|
44 |
bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
|
46 |
bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
|
45 |
return ecnt > 5 ? false : ret;
|
47 |
if (ecnt > 5)
|
|
|
48 |
ret = false;
|
|
|
49 |
if (ret) {
|
|
|
50 |
ocs = code;
|
|
|
51 |
}
|
|
|
52 |
return ret;
|
46 |
} else {
|
53 |
} else {
|
47 |
// Give a try to utf-8 anyway, as this is self-detecting. This
|
54 |
// Give a try to utf-8 anyway, as this is self-detecting. This
|
48 |
// handles UTF-8 docs in a non-utf-8 environment. Note that
|
55 |
// handles UTF-8 docs in a non-utf-8 environment. Note that
|
49 |
// this will almost never be called, as most encodings are
|
56 |
// this will almost never be called, as most encodings are
|
50 |
// unable to detect errors so that the first try at
|
57 |
// unable to detect errors so that the first try at
|