Parent: [84b561] (diff)

Child: [04c19b] (diff)

Download this file

txtdcode.cpp    127 lines (113 with data), 3.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <tr1/unordered_map>
using std::tr1::unordered_map;
#include "cstr.h"
#include "transcode.h"
#include "mimehandler.h"
#include "debuglog.h"
#include "smallut.h"
// Flat table of (key, charset) pairs: even indexes hold a lower-case
// two-letter code, the following odd index holds the 8-bit character
// set to try for it. The table must always contain an even number of
// strings.
//
// The consumer looks the table up with the part of the LC_CTYPE locale
// name that precedes the '_' (e.g. "sv" for "sv_SE"), which is a
// language code. Several of the historical keys are country codes
// ("se", "dk", "cz", "gr"...) which never match the corresponding
// locale, so the matching language codes are listed as well, with the
// same charset. The historical keys are kept unchanged.
static const char *vcountry_to_code[] = {
    // Western European: windows-1252
    "fr", "windows-1252",
    "al", "windows-1252",
    "sq", "windows-1252", // Albanian (language code for "al")
    "dk", "windows-1252",
    "da", "windows-1252", // Danish (language code for "dk")
    "en", "windows-1252",
    "de", "windows-1252",
    "is", "windows-1252",
    "my", "windows-1252",
    "ms", "windows-1252", // Malay (language code for "my")
    "ie", "windows-1252",
    "ga", "windows-1252", // Irish (language code for "ie")
    "gb", "windows-1252",
    "it", "windows-1252",
    "lu", "windows-1252",
    "lb", "windows-1252", // Luxembourgish (language code for "lu")
    "no", "windows-1252",
    "nb", "windows-1252", // Norwegian Bokmal
    "nn", "windows-1252", // Norwegian Nynorsk
    "pt", "windows-1252",
    "es", "windows-1252",
    "se", "windows-1252",
    "sv", "windows-1252", // Swedish (language code for "se")
    // Central European: iso-8859-2
    "ba", "iso-8859-2",
    "bs", "iso-8859-2",   // Bosnian (language code for "ba")
    "hr", "iso-8859-2",
    "cz", "iso-8859-2",
    "cs", "iso-8859-2",   // Czech (language code for "cz")
    "hu", "iso-8859-2",
    "pl", "iso-8859-2",
    "rs", "iso-8859-2",
    "sr", "iso-8859-2",   // Serbian (language code for "rs")
    "sk", "iso-8859-2",
    "si", "iso-8859-2",
    "sl", "iso-8859-2",   // Slovenian (language code for "si")
    // Others
    "gr", "iso-8859-7",
    "el", "iso-8859-7",   // Greek (language code for "gr")
    "il", "iso-8859-8",
    "he", "iso-8859-8",   // Hebrew (language code for "il")
    "tr", "iso-8859-9",
    "th", "iso-8859-11",
    "lv", "iso-8859-13",
    "lt", "iso-8859-13",
};
// Called after decoding from utf-8 failed. Handle the common case
// where this is a good old 8bit-encoded text document left-over when
// the locale was switched to utf-8. We try to guess a charset
// according to the locale language and use it. This is a very rough
// heuristic, but may be better than discarding the data.
//
// @param in  the raw input text.
// @param out receives the result of transcoding 'in' to UTF-8.
// @return false if the current LC_CTYPE locale can't be determined, has
//     no '_' separator (e.g. "C"), or its prefix is not in the table;
//     else the return value of transcode().
static bool alternate_decode(const string& in, string& out)
{
    // The lookup map is built from the flat key/value table on the
    // first call. NOTE(review): this lazy initialisation is not
    // protected against concurrent first calls.
    static unordered_map<string, string> country_to_code;
    if (country_to_code.empty()) {
        // 'i + 1 <' so that a trailing unpaired entry could never cause
        // a read past the end of the table.
        for (unsigned int i = 0;
             i + 1 < sizeof(vcountry_to_code) / sizeof(char *); i += 2) {
            country_to_code[vcountry_to_code[i]] = vcountry_to_code[i+1];
        }
    }

    // With a null second argument setlocale() only queries the current
    // setting. It may return a null pointer, and building a std::string
    // from a null pointer is undefined behaviour, so check first.
    const char *cloc = setlocale(LC_CTYPE, 0);
    if (cloc == 0)
        return false;
    string locale(cloc);
    LOGDEB(("RecollFilter::alternate_dcde: locale: [%s]\n", locale.c_str()));

    // Use the part of the locale name before the '_' as lookup key
    // (e.g. "fr" for "fr_FR.UTF-8").
    string::size_type under = locale.find_first_of("_");
    if (under == string::npos)
        return false;
    string country = locale.substr(0, under);

    unordered_map<string,string>::const_iterator it =
        country_to_code.find(country);
    if (it == country_to_code.end())
        return false;

    const string& code = it->second;
    LOGDEB(("RecollFilter::txtdcode: trying alternate decode from %s\n",
            code.c_str()));
    return transcode(in, out, code, cstr_utf8);
}
// Transcode the text/plain content stored in m_metaData to UTF-8, in place.
//
// The input charset is read from m_metaData[cstr_dj_keyorigcharset] and
// the text from m_metaData[cstr_dj_keycontent]. On success the content
// entry holds the transcoded text and m_metaData[cstr_dj_keycharset] is
// set to cstr_utf8.
//
// The conversion is considered failed if transcode() returns false or
// if the count it stores in 'ecnt' (presumably the number of conversion
// errors) exceeds 1% of the input size. In this case, and only if the
// declared input charset was already UTF-8, a charset guessed from the
// locale is tried. If everything fails the content entry is emptied.
//
// @param who name of the caller, only used to prefix log messages.
// @return true if the content was converted, false if the MIME type is
//     not text/plain or if the conversion failed.
bool RecollFilter::txtdcode(const string& who)
{
    // compare() returns non-zero when the strings differ.
    if (m_metaData[cstr_dj_keymt].compare(cstr_textplain)) {
        LOGERR(("%s::txtdcode: called on non txt/plain: %s\n", who.c_str(),
                m_metaData[cstr_dj_keymt].c_str()));
        return false;
    }

    string& ocs = m_metaData[cstr_dj_keyorigcharset];
    string& itext = m_metaData[cstr_dj_keycontent];
    // size() is a size_t: convert explicitly to match the %d format.
    LOGDEB1(("%s::txtdcode: %d bytes from [%s] to UTF-8\n",
             who.c_str(), int(itext.size()), ocs.c_str()));

    // Initialised so that the error message below never prints an
    // indeterminate value if transcode() fails without setting it.
    int ecnt = 0;
    string otext;
    bool ret = transcode(itext, otext, ocs, cstr_utf8, &ecnt);
    if (!ret || ecnt > int(itext.size() / 100)) {
        LOGERR(("%s::txtdcode: transcode %d bytes to UTF-8 failed "
                "for input charset [%s] ret %d ecnt %d\n",
                who.c_str(), int(itext.size()), ocs.c_str(), ret, ecnt));
        if (samecharset(ocs, cstr_utf8)) {
            // Maybe an old 8 bit document in an UTF-8 locale: guess.
            ret = alternate_decode(itext, otext);
        } else {
            // transcode() may have returned true with too many errors:
            // don't let its output be accepted below.
            ret = false;
        }
        if (!ret) {
            itext.erase();
            return false;
        }
    }

    itext.swap(otext);
    m_metaData[cstr_dj_keycharset] = cstr_utf8;
    return true;
}