// txtdcode.cpp
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <sstream>
#include "cstr.h"
#include "transcode.h"
#include "mimehandler.h"
#include "log.h"
#include "smallut.h"
#include "listmem.h"
// Called after decoding from utf-8 failed. Handle the common case
// where this is a good old 8bit-encoded text document left-over when
// the locale was switched to utf-8. We try to guess a charset
// according to the locale language and use it. This is a very rough
// heuristic, but may be better than discarding the data.
// If we still get a significant number of decode errors, the doc is
// quite probably binary, so just fail.
// Note that we could very well get a wrong transcoding (e.g. between
// iso-8859 variations), there is no way to detect it.
static bool alternate_decode(const string& in, string& out, const string& ocs)
{
int ecnt;
if (samecharset(ocs, cstr_utf8)) {
string lang = localelang();
string code = langtocode(lang);
LOGDEB("RecollFilter::txtdcode: trying alternate decode from " <<
code << "\n");
bool ret = transcode(in, out, code, cstr_utf8, &ecnt);
return ecnt > 5 ? false : ret;
} else {
// Give a try to utf-8 anyway, as this is self-detecting. This
// handles UTF-8 docs in a non-utf-8 environment. Note that
// this will almost never be called, as most encodings are
// unable to detect errors so that the first try at
// transcoding will have succeeded and alternate_decode() will
// not be called at all.
//
// To avoid this, we would have to attempt an utf-8 decode
// first, but this is a costly proposition as we don't know
// how much data to test, so need to test all (the beginning
// of the text could be ascii even if there are 8-bit chars
// later).
bool ret = transcode(in, out, cstr_utf8, cstr_utf8, &ecnt);
return ecnt > 5 ? false : ret;
}
}
// Guess the input encoding from a leading Unicode byte order mark.
//
// @param itext raw document bytes.
// @return the charset name matching the BOM found at the very start of
//   itext ("UTF-8", "UTF-32BE", "UTF-32LE", "UTF-16BE" or "UTF-16LE"),
//   or an empty string if the data does not begin with one of these
//   marks.
static string bomtocode(const string& itext)
{
#if 0
    std::ostringstream strm;
    listmem(strm, itext.c_str(), MIN(itext.size(), 8));
    LOGDEB("txtdcode:bomtocode: input " << strm.str() << "\n");
#endif
    const unsigned char *utxt =
        reinterpret_cast<const unsigned char *>(itext.c_str());
    const string::size_type len = itext.size();

    if (len >= 3 && utxt[0] == 0xEF && utxt[1] == 0xBB && utxt[2] == 0xBF) {
        LOGDEB("txtdcode:bomtocode: UTF-8\n");
        return "UTF-8";
    }

    // The 4-byte UTF-32 marks must be tested before the 2-byte UTF-16
    // ones: the UTF-32LE BOM (FF FE 00 00) begins with the UTF-16LE BOM
    // (FF FE), so testing UTF-16LE first would make UTF-32LE
    // undetectable. The price is that an UTF-16LE text whose first
    // character is U+0000 is taken for UTF-32LE, which is the usual
    // trade-off for BOM sniffing.
    if (len >= 4) {
        if (utxt[0] == 0 && utxt[1] == 0 &&
            utxt[2] == 0xFE && utxt[3] == 0xFF) {
            return "UTF-32BE";
        }
        if (utxt[0] == 0xFF && utxt[1] == 0xFE &&
            utxt[2] == 0 && utxt[3] == 0) {
            return "UTF-32LE";
        }
    }

    if (len >= 2) {
        if (utxt[0] == 0xFE && utxt[1] == 0xFF) {
            return "UTF-16BE";
        }
        if (utxt[0] == 0xFF && utxt[1] == 0xFE) {
            return "UTF-16LE";
        }
    }

    // No recognized BOM.
    return string();
}
bool RecollFilter::txtdcode(const string& who)
{
if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
LOGERR(who << "::txtdcode: called on non txt/plain: " <<
m_metaData[cstr_dj_keymt] << "\n");
return false;
}
string& ocs = m_metaData[cstr_dj_keyorigcharset];
string& itext = m_metaData[cstr_dj_keycontent];
LOGDEB(who << "::txtdcode: " << itext.size() << " bytes from [" <<
ocs << "] to UTF-8\n");
int ecnt;
string otext;
string bomfromcode = bomtocode(itext);
if (!bomfromcode.empty()) {
LOGDEB(who << "::txtdcode: " << " input charset changed from " <<
ocs << " to " << bomfromcode << " from BOM detection\n");
ocs = bomfromcode;
}
bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
if (!ret || ecnt > int(itext.size() / 100)) {
LOGERR(who << "::txtdcode: transcode " << itext.size() <<
" bytes to UTF-8 failed for input charset [" << ocs <<
"] ret " << ret << " ecnt " << ecnt << "\n");
ret = alternate_decode(itext, otext, ocs);
if (!ret) {
LOGDEB("txtdcode: failed. Doc is not text?\n" );
itext.erase();
return false;
}
}
itext.swap(otext);
m_metaData[cstr_dj_keycharset] = cstr_utf8;
return true;
}