--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@@ -15,10 +15,80 @@
*/
#include "autoconfig.h"
+#include <tr1/unordered_map>
+using std::tr1::unordered_map;
+
#include "cstr.h"
#include "transcode.h"
#include "mimehandler.h"
#include "debuglog.h"
+#include "smallut.h"
+
+static const char *vcountry_to_code[] = {
+ "fr", "windows-1252",
+ "al", "windows-1252",
+ "dk", "windows-1252",
+ "en", "windows-1252",
+ "de", "windows-1252",
+ "is", "windows-1252",
+ "my", "windows-1252",
+ "ie", "windows-1252",
+ "gb", "windows-1252",
+ "it", "windows-1252",
+ "lu", "windows-1252",
+ "no", "windows-1252",
+ "pt", "windows-1252",
+ "es", "windows-1252",
+ "se", "windows-1252",
+ "ba", "iso-8859-2",
+ "hr", "iso-8859-2",
+ "cz", "iso-8859-2",
+ "hu", "iso-8859-2",
+ "pl", "iso-8859-2",
+ "rs", "iso-8859-2",
+ "sk", "iso-8859-2",
+ "si", "iso-8859-2",
+ "gr", "iso-8859-7",
+ "il", "iso-8859-8",
+ "tr", "iso-8859-9",
+ "th", "iso-8859-11",
+ "lv", "iso-8859-13",
+ "lt", "iso-8859-13",
+};
+
+
+// Called after decoding from utf-8 failed. Handle the common case
+// where this is a good old 8bit-encoded text document left-over when
+// the locale was switched to utf-8. We try to guess a charset
+// according to the locale language and use it. This is a very rough
+// heuristic, but may be better than discarding the data.
+static bool alternate_decode(const string& in, string& out)
+{
+ static unordered_map<string, string> country_to_code;
+ if (country_to_code.empty()) {
+ for (unsigned int i = 0;
+ i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
+ country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
+ }
+ }
+
+ string locale = setlocale(LC_CTYPE, 0);
+ LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
+ string::size_type under = locale.find_first_of("_");
+ if (under == string::npos)
+ return false;
+ string country = locale.substr(0, under);
+
+ unordered_map<string,string>::const_iterator it =
+ country_to_code.find(country);
+ if (it == country_to_code.end())
+ return false;
+ string code = it->second;
+
+ LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
+ code.c_str()));
+ return transcode(in, out, code, cstr_utf8);
+}
bool RecollFilter::txtdcode(const string& who)
{
@@ -33,17 +103,24 @@
LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n",
who.c_str(), itext.size(), ocs.c_str()));
int ecnt;
- bool ret;
string otext;
- if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) ||
- ecnt > int(itext.size() / 4)) {
+ bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
+ if (!ret || ecnt > int(itext.size() / 100)) {
LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
"for input charset [%s] ret %d ecnt %d\n",
who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
- itext.erase();
- return false;
+
+ if (samecharset(ocs, cstr_utf8)) {
+ ret = alternate_decode(itext, otext);
+ }
+ if (!ret) {
+ itext.erase();
+ return false;
+ }
}
+
itext.swap(otext);
- m_metaData[cstr_dj_keycharset] = "UTF-8";
+ m_metaData[cstr_dj_keycharset] = cstr_utf8;
return true;
}
+