--- a/src/internfile/txtdcode.cpp
+++ b/src/internfile/txtdcode.cpp
@@ -15,12 +15,14 @@
*/
#include "autoconfig.h"
+#include <sstream>
+
#include "cstr.h"
#include "transcode.h"
#include "mimehandler.h"
#include "log.h"
#include "smallut.h"
-
+#include "listmem.h"
// Called after decoding from utf-8 failed. Handle the common case
// where this is a good old 8bit-encoded text document left-over when
@@ -29,37 +31,94 @@
// heuristic, but may be better than discarding the data.
// If we still get a significant number of decode errors, the doc is
// quite probably binary, so just fail.
-static bool alternate_decode(const string& in, string& out)
+// Be aware that the result can be a silently wrong transcoding (for
+// example a mix-up between iso-8859 variants): nothing here is able
+// to detect that.
+static bool alternate_decode(const string& in, string& out, const string& ocs)
 {
-    string lang = localelang();
-    string code = langtocode(lang);
-    LOGDEB("RecollFilter::txtdcode: trying alternate decode from " << (code) << "\n" );
     int ecnt;
-    bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
-    return ecnt > 5 ? false : ret;
+    if (!samecharset(ocs, cstr_utf8)) {
+        // The declared charset was not utf-8: still try utf-8, which
+        // is self-detecting. This covers UTF-8 documents found in a
+        // non-utf-8 environment.
+        //
+        // This path is almost never taken: most encodings cannot
+        // detect errors, so the caller's first transcoding attempt
+        // succeeds and alternate_decode() is not called at all.
+        //
+        // Avoiding that would require attempting a utf-8 decode
+        // first, which is costly: we do not know how much data to
+        // test, so all of it would have to be checked (the text may
+        // start as plain ascii and only contain 8-bit chars later).
+        const bool converted =
+            transcode(in, out, cstr_utf8, cstr_utf8, &ecnt);
+        return converted && ecnt <= 5;
+    }
+
+    // The declared charset was utf-8: retry with the charset
+    // associated with the locale language.
+    const string fallback = langtocode(localelang());
+    LOGDEB("RecollFilter::txtdcode: trying alternate decode from " <<
+           fallback << "\n");
+    const bool converted = transcode(in, out, fallback, cstr_utf8, &ecnt);
+    // More than 5 decode errors means failure, whatever transcode() said.
+    return converted && ecnt <= 5;
+}
+
+// Detect a Unicode byte order mark (BOM) at the start of the text.
+//
+// @param itext raw document bytes, before any transcoding.
+// @return the charset name implied by the BOM ("UTF-8", "UTF-16BE",
+//     "UTF-16LE", "UTF-32BE" or "UTF-32LE"), or an empty string if
+//     the text does not begin with one of these marks.
+//
+// The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 ones:
+// the UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark
+// (FF FE), so testing UTF-16LE first would make UTF-32LE impossible
+// to detect. The accepted price is that UTF-16LE text whose first
+// character is U+0000 gets reported as UTF-32LE.
+static string bomtocode(const string& itext)
+{
+#if 0
+    std::ostringstream strm;
+    listmem(strm, itext.c_str(), MIN(itext.size(), 8));
+    LOGDEB("txtdcode:bomtocode: input " << strm.str() << "\n");
+#endif
+
+    const unsigned char *utxt =
+        reinterpret_cast<const unsigned char *>(itext.c_str());
+    const string::size_type len = itext.size();
+
+    if (len >= 3 && utxt[0] == 0xEF && utxt[1] == 0xBB && utxt[2] == 0xBF) {
+        LOGDEB("txtdcode:bomtocode: UTF-8\n");
+        return "UTF-8";
+    }
+    // 4-byte marks first, see the function comment.
+    if (len >= 4 && utxt[0] == 0 && utxt[1] == 0 &&
+        utxt[2] == 0xFE && utxt[3] == 0xFF) {
+        return "UTF-32BE";
+    }
+    if (len >= 4 && utxt[0] == 0xFF && utxt[1] == 0xFE &&
+        utxt[2] == 0 && utxt[3] == 0) {
+        return "UTF-32LE";
+    }
+    if (len >= 2 && utxt[0] == 0xFE && utxt[1] == 0xFF) {
+        return "UTF-16BE";
+    }
+    if (len >= 2 && utxt[0] == 0xFF && utxt[1] == 0xFE) {
+        return "UTF-16LE";
+    }
+    // No recognized BOM.
+    return string();
 }
bool RecollFilter::txtdcode(const string& who)
{
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
- LOGERR("" << (who) << "::txtdcode: called on non txt/plain: " << (m_metaData[cstr_dj_keymt]) << "\n" );
+ LOGERR(who << "::txtdcode: called on non txt/plain: " <<
+ m_metaData[cstr_dj_keymt] << "\n");
return false;
}
string& ocs = m_metaData[cstr_dj_keyorigcharset];
string& itext = m_metaData[cstr_dj_keycontent];
- LOGDEB1("" << (who) << "::txtdcode: " << (itext.size()) << " bytes from [" << (ocs) << "] to UTF-8\n" );
+ LOGDEB(who << "::txtdcode: " << itext.size() << " bytes from [" <<
+ ocs << "] to UTF-8\n");
int ecnt;
string otext;
+
+ string bomfromcode = bomtocode(itext);
+ if (!bomfromcode.empty()) {
+ LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
+ ocs << " to " << bomfromcode << " from BOM detection\n");
+ ocs = bomfromcode;
+ }
+
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
if (!ret || ecnt > int(itext.size() / 100)) {
- LOGERR("" << (who) << "::txtdcode: transcode " << (itext.size()) << " bytes to UTF-8 failed for input charset [" << (ocs) << "] ret " << (ret) << " ecnt " << (ecnt) << "\n" );
+ LOGERR(who << "::txtdcode: transcode " << itext.size() <<
+ " bytes to UTF-8 failed for input charset [" << ocs <<
+ "] ret " << ret << " ecnt " << ecnt << "\n");
- if (samecharset(ocs, cstr_utf8)) {
- ret = alternate_decode(itext, otext);
- } else {
- ret = false;
- }
+ ret = alternate_decode(itext, otext, ocs);
+
if (!ret) {
LOGDEB("txtdcode: failed. Doc is not text?\n" );
itext.erase();
@@ -71,5 +130,3 @@
m_metaData[cstr_dj_keycharset] = cstr_utf8;
return true;
}
-
-