|
a/unac/unac.c |
|
b/unac/unac.c |
|
... |
|
... |
29 |
caught writing another binary search */
|
29 |
caught writing another binary search */
|
30 |
#include <vector>
|
30 |
#include <vector>
|
31 |
#include <map>
|
31 |
#include <map>
|
32 |
#include <string>
|
32 |
#include <string>
|
33 |
#include <algorithm>
|
33 |
#include <algorithm>
|
|
|
34 |
#include <tr1/unordered_map>
|
34 |
using std::string;
|
35 |
using std::string;
|
35 |
using std::vector;
|
36 |
using std::tr1::unordered_map;
|
36 |
using std::map;
|
|
|
37 |
#include "smallut.h"
|
37 |
#include "smallut.h"
|
38 |
|
38 |
|
39 |
/*
|
39 |
/*
|
40 |
Storage for the exception translations. These are chars which
|
40 |
Storage for the exception translations. These are chars which
|
41 |
should not be translated according to what UnicodeData says, but
|
41 |
should not be translated according to what UnicodeData says, but
|
42 |
instead according to some local rule. There will usually be very
|
42 |
instead according to some local rule. There will usually be very
|
43 |
few of them, but they must be looked up for every translated char.
|
43 |
few of them, but they must be looked up for every translated char.
|
44 |
|
|
|
45 |
We use a sorted vector for fastest elimination by binary search and
|
|
|
46 |
a vector<string> to store the translations
|
|
|
47 |
*/
|
44 |
*/
|
48 |
static vector<unsigned short> except_chars;
|
45 |
unordered_map<unsigned short, string> except_trans;
|
49 |
static vector<string> except_trans;
|
|
|
50 |
static inline size_t is_except_char(unsigned short c)
|
46 |
static inline bool is_except_char(unsigned short c, string& trans)
|
51 |
{
|
47 |
{
|
52 |
vector<unsigned short>::iterator it =
|
48 |
unordered_map<unsigned short, string>::const_iterator it
|
53 |
std::lower_bound(except_chars.begin(), except_chars.end(), c);
|
49 |
= except_trans.find(c);
|
54 |
if (it == except_chars.end() || *it != c) {
|
50 |
if (it == except_trans.end())
|
55 |
return (size_t(-1));
|
51 |
return false;
|
56 |
}
|
52 |
trans = it->second;
|
57 |
return std::distance(except_chars.begin(), it);
|
53 |
return true;
|
58 |
}
|
54 |
}
|
59 |
#endif /* RECOLL_DATADIR */
|
55 |
#endif /* RECOLL_DATADIR */
|
60 |
|
56 |
|
61 |
/*
|
57 |
/*
|
62 |
* If configure.in has not defined this symbol, assume const. It
|
58 |
* If configure.in has not defined this symbol, assume const. It
|
|
... |
|
... |
12713 |
// In conformance with current usage, but incorrectly, we do the following
|
12709 |
// In conformance with current usage, but incorrectly, we do the following
|
12714 |
// things for the special chars depending on the operation requested:
|
12710 |
// things for the special chars depending on the operation requested:
|
12715 |
// - unaccenting: do nothing (copy original char)
|
12711 |
// - unaccenting: do nothing (copy original char)
|
12716 |
// - unac+fold: use table
|
12712 |
// - unac+fold: use table
|
12717 |
// - fold: use the unicode data.
|
12713 |
// - fold: use the unicode data.
|
12718 |
size_t idx;
|
12714 |
string trans;
|
12719 |
if (what != UNAC_FOLD && except_chars.size() != 0 &&
|
12715 |
if (what != UNAC_FOLD && except_trans.size() != 0 &&
|
12720 |
(idx=is_except_char(c)) != (size_t)-1) {
|
12716 |
is_except_char(c, trans)) {
|
12721 |
if (what == UNAC_UNAC) {
|
12717 |
if (what == UNAC_UNAC) {
|
12722 |
// Unaccent only. Do nothing
|
12718 |
// Unaccent only. Do nothing
|
12723 |
p = 0;
|
12719 |
p = 0;
|
12724 |
l = 0;
|
12720 |
l = 0;
|
12725 |
} else {
|
12721 |
} else {
|
12726 |
// Has to be UNAC_UNACFOLD: use table
|
12722 |
// Has to be UNAC_UNACFOLD: use table
|
12727 |
p = (unsigned short *)(except_trans[idx].c_str() + 2);
|
12723 |
p = (unsigned short *)trans.c_str();
|
12728 |
l = (except_trans[idx].size() - 2) / 2;
|
12724 |
l = trans.size() / 2;
|
12729 |
}
|
12725 |
}
|
12730 |
/* if (p) {unsigned char *cp = (unsigned char *)p;
|
|
|
12731 |
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
|
|
|
12732 |
(unsigned int)cp[1]);}*/
|
|
|
12733 |
} else {
|
12726 |
} else {
|
12734 |
#endif /* RECOLL_DATADIR */
|
12727 |
#endif /* RECOLL_DATADIR */
|
12735 |
unac_uf_char_utf16_(c, p, l, what)
|
12728 |
unac_uf_char_utf16_(c, p, l, what)
|
12736 |
#ifdef RECOLL_DATADIR
|
12729 |
#ifdef RECOLL_DATADIR
|
12737 |
}
|
12730 |
}
|
|
... |
|
... |
13074 |
}
|
13067 |
}
|
13075 |
|
13068 |
|
13076 |
#ifdef RECOLL_DATADIR
|
13069 |
#ifdef RECOLL_DATADIR
|
13077 |
void unac_set_except_translations(const char *spectrans)
|
13070 |
void unac_set_except_translations(const char *spectrans)
|
13078 |
{
|
13071 |
{
|
13079 |
except_chars.clear();
|
|
|
13080 |
except_trans.clear();
|
13072 |
except_trans.clear();
|
13081 |
if (!spectrans || !spectrans[0])
|
13073 |
if (!spectrans || !spectrans[0])
|
13082 |
return;
|
13074 |
return;
|
13083 |
|
13075 |
|
13084 |
// The translation tables out of Unicode are in machine byte order (we
|
13076 |
// The translation tables out of Unicode are in machine byte order (we
|
|
... |
|
... |
13121 |
if (littleendian)
|
13113 |
if (littleendian)
|
13122 |
ch = (out[1] << 8) | (out[0] & 0xff);
|
13114 |
ch = (out[1] << 8) | (out[0] & 0xff);
|
13123 |
else
|
13115 |
else
|
13124 |
ch = (out[0] << 8) | (out[1] & 0xff);
|
13116 |
ch = (out[0] << 8) | (out[1] & 0xff);
|
13125 |
|
13117 |
|
13126 |
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
|
|
|
13127 |
except_chars.push_back(ch);
|
|
|
13128 |
// We keep ch as the first 2 bytes in the translation so that
|
|
|
13129 |
// both vectors sort identically
|
|
|
13130 |
except_trans.push_back(string((const char *)out, outsize));
|
13118 |
except_trans[ch] = string((const char *)(out + 2), outsize-2);
|
13131 |
free(out);
|
13119 |
free(out);
|
13132 |
}
|
13120 |
}
|
13133 |
std::sort(except_chars.begin(), except_chars.end());
|
|
|
13134 |
std::sort(except_trans.begin(), except_trans.end());
|
|
|
13135 |
}
|
13121 |
}
|
13136 |
#endif /* RECOLL_DATADIR */
|
13122 |
#endif /* RECOLL_DATADIR */
|