|
a/unac/unac.c |
|
b/unac/unac.c |
|
... |
|
... |
14 |
* You should have received a copy of the GNU General Public License
|
14 |
* You should have received a copy of the GNU General Public License
|
15 |
* along with this program; if not, write to the Free Software
|
15 |
* along with this program; if not, write to the Free Software
|
16 |
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
17 |
*/
|
17 |
*/
|
18 |
|
18 |
|
19 |
#ifdef HAVE_CONFIG_H
|
19 |
#ifdef BUILDING_RECOLL
|
20 |
#ifdef RECOLL_DATADIR
|
|
|
21 |
#include "autoconfig.h"
|
20 |
#include "autoconfig.h"
|
22 |
#else
|
21 |
#else
|
23 |
#include "config.h"
|
22 |
#include "config.h"
|
24 |
#endif /* RECOLL */
|
23 |
#endif /* RECOLL */
|
25 |
#endif /* HAVE_CONFIG_H */
|
|
|
26 |
|
24 |
|
27 |
#ifdef RECOLL_DATADIR
|
25 |
#ifdef BUILDING_RECOLL
|
28 |
/* Yes, recoll unac is actually c++, lets face modernity, I will not be
|
26 |
/* Yes, recoll unac is actually c++, lets face modernity, I will not be
|
29 |
caught writing another binary search */
|
27 |
caught writing another binary search */
|
30 |
#include <vector>
|
28 |
#include <vector>
|
31 |
#include <map>
|
29 |
#include <map>
|
32 |
#include <string>
|
30 |
#include <string>
|
33 |
#include <algorithm>
|
31 |
#include <algorithm>
|
34 |
#include "unordered_defs.h"
|
32 |
#include <iostream>
|
|
|
33 |
#include UNORDERED_MAP_INCLUDE
|
|
|
34 |
|
35 |
using std::string;
|
35 |
using std::string;
|
36 |
|
36 |
|
37 |
#include "smallut.h"
|
37 |
#include "smallut.h"
|
38 |
|
38 |
|
39 |
/*
|
39 |
/*
|
|
... |
|
... |
50 |
if (it == except_trans.end())
|
50 |
if (it == except_trans.end())
|
51 |
return false;
|
51 |
return false;
|
52 |
trans = it->second;
|
52 |
trans = it->second;
|
53 |
return true;
|
53 |
return true;
|
54 |
}
|
54 |
}
|
55 |
#endif /* RECOLL_DATADIR */
|
55 |
#endif /* BUILDING_RECOLL*/
|
56 |
|
56 |
|
57 |
/*
|
57 |
/*
|
58 |
* If configure.in has not defined this symbol, assume const. It
|
58 |
* If configure.in has not defined this symbol, assume const. It
|
59 |
* does not harm much: a warning will be issued during compilation.
|
59 |
* does not harm much: a warning will be issued during compilation.
|
60 |
*/
|
60 |
*/
|
|
... |
|
... |
14168 |
|
14168 |
|
14169 |
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
14169 |
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
14170 |
char** outp, size_t* out_lengthp, int what)
|
14170 |
char** outp, size_t* out_lengthp, int what)
|
14171 |
{
|
14171 |
{
|
14172 |
char* out;
|
14172 |
char* out;
|
14173 |
int out_size;
|
14173 |
size_t out_size;
|
14174 |
int out_length;
|
14174 |
size_t out_length;
|
14175 |
unsigned int i;
|
14175 |
size_t i;
|
14176 |
|
14176 |
|
14177 |
out_size = in_length > 0 ? in_length : 1024;
|
14177 |
out_size = in_length > 0 ? in_length : 1024;
|
14178 |
|
14178 |
|
14179 |
out = *outp;
|
14179 |
out = *outp;
|
14180 |
out = (char*)realloc(out, out_size + 1);
|
14180 |
out = (char*)realloc(out, out_size + 1);
|
|
... |
|
... |
14188 |
out_length = 0;
|
14188 |
out_length = 0;
|
14189 |
|
14189 |
|
14190 |
for(i = 0; i < in_length; i += 2) {
|
14190 |
for(i = 0; i < in_length; i += 2) {
|
14191 |
unsigned short c;
|
14191 |
unsigned short c;
|
14192 |
unsigned short* p;
|
14192 |
unsigned short* p;
|
14193 |
int l;
|
14193 |
size_t l;
|
14194 |
int k;
|
14194 |
size_t k;
|
14195 |
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
14195 |
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
14196 |
/*
|
14196 |
/*
|
14197 |
* Lookup the tables for decomposition information
|
14197 |
* Lookup the tables for decomposition information
|
14198 |
*/
|
14198 |
*/
|
14199 |
#ifdef RECOLL_DATADIR
|
14199 |
#ifdef BUILDING_RECOLL
|
14200 |
// Exception unac/fold values set by user. There should be 3 arrays for
|
14200 |
// Exception unac/fold values set by user. There should be 3 arrays for
|
14201 |
// unac/fold/unac+fold. For now there is only one array, which used to
|
14201 |
// unac/fold/unac+fold. For now there is only one array, which used to
|
14202 |
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
14202 |
// be set for unac+fold, and is mostly or only used to prevent diacritics
|
14203 |
// removal for some chars and languages where it should not be done.
|
14203 |
// removal for some chars and languages where it should not be done.
|
14204 |
// In conformance with current usage, but incorrectly, we do the following
|
14204 |
// In conformance with current usage, but incorrectly, we do the following
|
|
... |
|
... |
14217 |
// Has to be UNAC_UNACFOLD: use table
|
14217 |
// Has to be UNAC_UNACFOLD: use table
|
14218 |
p = (unsigned short *)trans.c_str();
|
14218 |
p = (unsigned short *)trans.c_str();
|
14219 |
l = trans.size() / 2;
|
14219 |
l = trans.size() / 2;
|
14220 |
}
|
14220 |
}
|
14221 |
} else {
|
14221 |
} else {
|
14222 |
#endif /* RECOLL_DATADIR */
|
14222 |
#endif /* BUILDING_RECOLL */
|
14223 |
unac_uf_char_utf16_(c, p, l, what)
|
14223 |
unac_uf_char_utf16_(c, p, l, what)
|
14224 |
#ifdef RECOLL_DATADIR
|
14224 |
#ifdef BUILDING_RECOLL
|
14225 |
}
|
14225 |
}
|
14226 |
#endif /* RECOLL_DATADIR */
|
14226 |
#endif /* BUILDING_RECOLL */
|
14227 |
|
14227 |
|
14228 |
/*
|
14228 |
/*
|
14229 |
* Explain what's done in great detail
|
14229 |
* Explain what's done in great detail
|
14230 |
*/
|
14230 |
*/
|
14231 |
if(debug_level == UNAC_DEBUG_HIGH) {
|
14231 |
if(debug_level == UNAC_DEBUG_HIGH) {
|
|
... |
|
... |
14234 |
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
14234 |
DEBUG("unac_data%d[%d] & unac_positions[%d][%d]: ", index, unac_positions[index][position], index, position+1);
|
14235 |
DEBUG_APPEND("0x%04x => ", (c));
|
14235 |
DEBUG_APPEND("0x%04x => ", (c));
|
14236 |
if(l == 0) {
|
14236 |
if(l == 0) {
|
14237 |
DEBUG_APPEND("untouched\n");
|
14237 |
DEBUG_APPEND("untouched\n");
|
14238 |
} else {
|
14238 |
} else {
|
14239 |
int i;
|
14239 |
size_t i;
|
14240 |
for(i = 0; i < l; i++)
|
14240 |
for(i = 0; i < l; i++)
|
14241 |
DEBUG_APPEND("0x%04x ", p[i]);
|
14241 |
DEBUG_APPEND("0x%04x ", p[i]);
|
14242 |
DEBUG_APPEND("\n");
|
14242 |
DEBUG_APPEND("\n");
|
14243 |
}
|
14243 |
}
|
14244 |
}
|
14244 |
}
|
|
... |
|
... |
14434 |
*/
|
14434 |
*/
|
14435 |
if(from_utf16) {
|
14435 |
if(from_utf16) {
|
14436 |
const char* tmp = space;
|
14436 |
const char* tmp = space;
|
14437 |
size_t tmp_length = 2;
|
14437 |
size_t tmp_length = 2;
|
14438 |
if(iconv(cd, (ICONV_CONST char **) &tmp, &tmp_length, &out, &out_remain) == (size_t)-1) {
|
14438 |
if(iconv(cd, (ICONV_CONST char **) &tmp, &tmp_length, &out, &out_remain) == (size_t)-1) {
|
14439 |
if(errno == E2BIG)
|
14439 |
if(errno == E2BIG) {
|
14440 |
/* fall thru to the E2BIG case below */;
|
14440 |
/* fall thru to the E2BIG case below */;
|
14441 |
else
|
14441 |
} else {
|
14442 |
goto out;
|
14442 |
goto out;
|
|
|
14443 |
}
|
14443 |
} else {
|
14444 |
} else {
|
14444 |
/* The offending character was replaced by a SPACE, skip it. */
|
14445 |
/* The offending character was replaced by a SPACE, skip it. */
|
14445 |
in += 2;
|
14446 |
in += 2;
|
14446 |
in_length -= 2;
|
14447 |
in_length -= 2;
|
14447 |
/* And continue conversion. */
|
14448 |
/* And continue conversion. */
|
|
... |
|
... |
14453 |
case E2BIG:
|
14454 |
case E2BIG:
|
14454 |
{
|
14455 |
{
|
14455 |
/*
|
14456 |
/*
|
14456 |
* The output does not fit in the current out buffer, enlarge it.
|
14457 |
* The output does not fit in the current out buffer, enlarge it.
|
14457 |
*/
|
14458 |
*/
|
14458 |
int length = out - out_base;
|
14459 |
size_t length = out - out_base;
|
14459 |
out_size *= 2;
|
14460 |
out_size *= 2;
|
14460 |
{
|
14461 |
{
|
14461 |
char *saved = out_base;
|
14462 |
char *saved = out_base;
|
14462 |
/* +1 for null */
|
14463 |
/* +1 for null */
|
14463 |
out_base = (char *)realloc(out_base, out_size + 1);
|
14464 |
out_base = (char *)realloc(out_base, out_size + 1);
|
|
... |
|
... |
14559 |
const char* unac_version(void)
|
14560 |
const char* unac_version(void)
|
14560 |
{
|
14561 |
{
|
14561 |
return UNAC_VERSION;
|
14562 |
return UNAC_VERSION;
|
14562 |
}
|
14563 |
}
|
14563 |
|
14564 |
|
14564 |
#ifdef RECOLL_DATADIR
|
14565 |
#ifdef BUILDING_RECOLL
|
14565 |
void unac_set_except_translations(const char *spectrans)
|
14566 |
void unac_set_except_translations(const char *spectrans)
|
14566 |
{
|
14567 |
{
|
14567 |
except_trans.clear();
|
14568 |
except_trans.clear();
|
14568 |
if (!spectrans || !spectrans[0])
|
14569 |
if (!spectrans || !spectrans[0])
|
14569 |
return;
|
14570 |
return;
|
|
... |
|
... |
14612 |
|
14613 |
|
14613 |
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
14614 |
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
14614 |
free(out);
|
14615 |
free(out);
|
14615 |
}
|
14616 |
}
|
14616 |
}
|
14617 |
}
|
14617 |
#endif /* RECOLL_DATADIR */
|
14618 |
#endif /* BUILDING_RECOLL */
|