|
a/src/internfile/txtdcode.cpp |
|
b/src/internfile/txtdcode.cpp |
|
... |
|
... |
13 |
* along with this program; if not, write to the Free Software
|
13 |
* along with this program; if not, write to the Free Software
|
14 |
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
14 |
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
*/
|
15 |
*/
|
16 |
#include "autoconfig.h"
|
16 |
#include "autoconfig.h"
|
17 |
|
17 |
|
18 |
#include <tr1/unordered_map>
|
|
|
19 |
using std::tr1::unordered_map;
|
|
|
20 |
|
|
|
21 |
#include "cstr.h"
|
18 |
#include "cstr.h"
|
22 |
#include "transcode.h"
|
19 |
#include "transcode.h"
|
23 |
#include "mimehandler.h"
|
20 |
#include "mimehandler.h"
|
24 |
#include "debuglog.h"
|
21 |
#include "debuglog.h"
|
25 |
#include "smallut.h"
|
22 |
#include "smallut.h"
|
26 |
|
|
|
27 |
static const char *vcountry_to_code[] = {
|
|
|
28 |
"fr", "windows-1252",
|
|
|
29 |
"al", "windows-1252",
|
|
|
30 |
"dk", "windows-1252",
|
|
|
31 |
"en", "windows-1252",
|
|
|
32 |
"de", "windows-1252",
|
|
|
33 |
"is", "windows-1252",
|
|
|
34 |
"my", "windows-1252",
|
|
|
35 |
"ie", "windows-1252",
|
|
|
36 |
"gb", "windows-1252",
|
|
|
37 |
"it", "windows-1252",
|
|
|
38 |
"lu", "windows-1252",
|
|
|
39 |
"no", "windows-1252",
|
|
|
40 |
"pt", "windows-1252",
|
|
|
41 |
"es", "windows-1252",
|
|
|
42 |
"se", "windows-1252",
|
|
|
43 |
"ba", "iso-8859-2",
|
|
|
44 |
"hr", "iso-8859-2",
|
|
|
45 |
"cz", "iso-8859-2",
|
|
|
46 |
"hu", "iso-8859-2",
|
|
|
47 |
"pl", "iso-8859-2",
|
|
|
48 |
"rs", "iso-8859-2",
|
|
|
49 |
"sk", "iso-8859-2",
|
|
|
50 |
"si", "iso-8859-2",
|
|
|
51 |
"gr", "iso-8859-7",
|
|
|
52 |
"il", "iso-8859-8",
|
|
|
53 |
"tr", "iso-8859-9",
|
|
|
54 |
"th", "iso-8859-11",
|
|
|
55 |
"lv", "iso-8859-13",
|
|
|
56 |
"lt", "iso-8859-13",
|
|
|
57 |
};
|
|
|
58 |
|
23 |
|
59 |
|
24 |
|
60 |
// Called after decoding from utf-8 failed. Handle the common case
|
25 |
// Called after decoding from utf-8 failed. Handle the common case
|
61 |
// where this is a good old 8bit-encoded text document left-over when
|
26 |
// where this is a good old 8bit-encoded text document left-over when
|
62 |
// the locale was switched to utf-8. We try to guess a charset
|
27 |
// the locale was switched to utf-8. We try to guess a charset
|
63 |
// according to the locale language and use it. This is a very rough
|
28 |
// according to the locale language and use it. This is a very rough
|
64 |
// heuristic, but may be better than discarding the data.
|
29 |
// heuristic, but may be better than discarding the data.
|
65 |
static bool alternate_decode(const string& in, string& out)
|
30 |
static bool alternate_decode(const string& in, string& out)
|
66 |
{
|
31 |
{
|
67 |
static unordered_map<string, string> country_to_code;
|
32 |
string lang = localelang();
|
68 |
if (country_to_code.empty()) {
|
33 |
string code = langtocode(lang);
|
69 |
for (unsigned int i = 0;
|
|
|
70 |
i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
|
|
|
71 |
country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
|
|
|
72 |
}
|
|
|
73 |
}
|
|
|
74 |
|
|
|
75 |
string locale = setlocale(LC_CTYPE, 0);
|
|
|
76 |
LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
|
|
|
77 |
string::size_type under = locale.find_first_of("_");
|
|
|
78 |
if (under == string::npos)
|
|
|
79 |
return false;
|
|
|
80 |
string country = locale.substr(0, under);
|
|
|
81 |
|
|
|
82 |
unordered_map<string,string>::const_iterator it =
|
|
|
83 |
country_to_code.find(country);
|
|
|
84 |
if (it == country_to_code.end())
|
|
|
85 |
return false;
|
|
|
86 |
string code = it->second;
|
|
|
87 |
|
|
|
88 |
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
|
34 |
LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
|
89 |
code.c_str()));
|
35 |
code.c_str()));
|
90 |
return transcode(in, out, code, cstr_utf8);
|
36 |
return transcode(in, out, code, cstr_utf8);
|
91 |
}
|
37 |
}
|
92 |
|
38 |
|