recoll / Code / Diff of /src/internfile/txtdcode.cpp

Diff of /src/internfile/txtdcode.cpp [2fc294] .. [84b561]

Switch to side-by-side view

--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@@ -15,10 +15,80 @@
  */
 #include "autoconfig.h"
 
+#include <tr1/unordered_map>
+using std::tr1::unordered_map;
+
 #include "cstr.h"
 #include "transcode.h"
 #include "mimehandler.h"
 #include "debuglog.h"
+#include "smallut.h"
+
+static const char *vcountry_to_code[] = {
+    "fr", "windows-1252",
+    "al", "windows-1252", 
+    "dk", "windows-1252",
+    "en", "windows-1252",
+    "de", "windows-1252",
+    "is", "windows-1252",
+    "my", "windows-1252",
+    "ie", "windows-1252",
+    "gb", "windows-1252",
+    "it", "windows-1252",
+    "lu", "windows-1252",
+    "no", "windows-1252",
+    "pt", "windows-1252",
+    "es", "windows-1252",
+    "se", "windows-1252",
+    "ba", "iso-8859-2",
+    "hr", "iso-8859-2",
+    "cz", "iso-8859-2",
+    "hu", "iso-8859-2",
+    "pl", "iso-8859-2",
+    "rs", "iso-8859-2",
+    "sk", "iso-8859-2",
+    "si", "iso-8859-2",
+    "gr", "iso-8859-7",
+    "il", "iso-8859-8",
+    "tr", "iso-8859-9",
+    "th", "iso-8859-11",
+    "lv", "iso-8859-13",
+    "lt", "iso-8859-13",
+};
+
+
+// Called after decoding from utf-8 failed. Handle the common case
+// where this is a good old 8bit-encoded text document left-over when
+// the locale was switched to utf-8. We try to guess a charset
+// according to the locale language and use it. This is a very rough
+// heuristic, but may be better than discarding the data.
+static bool alternate_decode(const string& in, string& out)
+{
+    static unordered_map<string, string> country_to_code;
+    if (country_to_code.empty()) {
+	for (unsigned int i = 0; 
+	     i < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
+	    country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
+	}
+    }
+
+    string locale = setlocale(LC_CTYPE, 0);
+    LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));
+    string::size_type under = locale.find_first_of("_");
+    if (under == string::npos)
+	return false;
+    string country = locale.substr(0, under);
+
+    unordered_map<string,string>::const_iterator it = 
+	country_to_code.find(country);
+    if (it == country_to_code.end())
+	return false;
+    string code = it->second;
+
+    LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
+	    code.c_str()));
+    return transcode(in, out, code, cstr_utf8);
+}
 
 bool RecollFilter::txtdcode(const string& who)
 {
@@ -33,17 +103,24 @@
     LOGDEB0(("%s::txtdcode: %d bytes from [%s] to UTF-8\n", 
 	     who.c_str(), itext.size(), ocs.c_str()));
     int ecnt;
-    bool ret;
     string otext;
-    if (!(ret=transcode(itext, otext, ocs, "UTF-8", &ecnt)) || 
-	ecnt > int(itext.size() / 4)) {
+    bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
+    if (!ret || ecnt > int(itext.size() / 100)) {
 	LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
 		"for input charset [%s] ret %d ecnt %d\n", 
 		who.c_str(), itext.size(), ocs.c_str(), ret, ecnt));
-	itext.erase();
-	return false;
+
+	if (samecharset(ocs, cstr_utf8)) {
+	    ret = alternate_decode(itext, otext);
+	}
+	if (!ret) {
+	    itext.erase();
+	    return false;
+	}
     }
+
     itext.swap(otext);
-    m_metaData[cstr_dj_keycharset] = "UTF-8";
+    m_metaData[cstr_dj_keycharset] = cstr_utf8;
     return true;
 }
+