|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
58 |
// of Unicode properties, but seems to do the job well enough in most
|
58 |
// of Unicode properties, but seems to do the job well enough in most
|
59 |
// common cases
|
59 |
// common cases
|
60 |
static set<unsigned int> unicign;
|
60 |
static set<unsigned int> unicign;
|
61 |
static set<unsigned int> visiblewhite;
|
61 |
static set<unsigned int> visiblewhite;
|
62 |
|
62 |
|
63 |
// Set up character classes array and the additional unicode sets
|
63 |
class CharClassInit {
|
64 |
static void setcharclasses()
|
64 |
public:
|
65 |
{
|
65 |
CharClassInit()
|
66 |
static int init = 0;
|
66 |
{
|
67 |
if (init)
|
|
|
68 |
return;
|
|
|
69 |
unsigned int i;
|
67 |
unsigned int i;
|
70 |
|
68 |
|
71 |
// Set default value for all: SPACE
|
69 |
// Set default value for all: SPACE
|
72 |
for (i = 0 ; i < 256 ; i ++)
|
70 |
for (i = 0 ; i < 256 ; i ++)
|
73 |
charclasses[i] = SPACE;
|
71 |
charclasses[i] = SPACE;
|
74 |
|
72 |
|
75 |
char digits[] = "0123456789";
|
73 |
char digits[] = "0123456789";
|
76 |
for (i = 0; i < strlen(digits); i++)
|
74 |
for (i = 0; i < strlen(digits); i++)
|
77 |
charclasses[int(digits[i])] = DIGIT;
|
75 |
charclasses[int(digits[i])] = DIGIT;
|
78 |
|
76 |
|
79 |
char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
77 |
char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
|
80 |
for (i = 0; i < strlen(upper); i++)
|
78 |
for (i = 0; i < strlen(upper); i++)
|
81 |
charclasses[int(upper[i])] = A_ULETTER;
|
79 |
charclasses[int(upper[i])] = A_ULETTER;
|
82 |
|
80 |
|
83 |
char lower[] = "abcdefghijklmnopqrstuvwxyz";
|
81 |
char lower[] = "abcdefghijklmnopqrstuvwxyz";
|
84 |
for (i = 0; i < strlen(lower); i++)
|
82 |
for (i = 0; i < strlen(lower); i++)
|
85 |
charclasses[int(lower[i])] = A_LLETTER;
|
83 |
charclasses[int(lower[i])] = A_LLETTER;
|
86 |
|
84 |
|
87 |
char wild[] = "*?[]";
|
85 |
char wild[] = "*?[]";
|
88 |
for (i = 0; i < strlen(wild); i++)
|
86 |
for (i = 0; i < strlen(wild); i++)
|
89 |
charclasses[int(wild[i])] = WILD;
|
87 |
charclasses[int(wild[i])] = WILD;
|
90 |
|
88 |
|
91 |
char special[] = ".@+-,#'_\n\r";
|
89 |
char special[] = ".@+-,#'_\n\r";
|
92 |
for (i = 0; i < strlen(special); i++)
|
90 |
for (i = 0; i < strlen(special); i++)
|
93 |
charclasses[int(special[i])] = special[i];
|
91 |
charclasses[int(special[i])] = special[i];
|
94 |
|
92 |
|
95 |
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
|
93 |
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
|
96 |
unicign.insert(uniign[i]);
|
94 |
unicign.insert(uniign[i]);
|
97 |
}
|
95 |
}
|
98 |
unicign.insert((unsigned int)-1);
|
96 |
unicign.insert((unsigned int)-1);
|
99 |
|
97 |
|
100 |
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
|
98 |
for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
|
101 |
visiblewhite.insert(avsbwht[i]);
|
99 |
visiblewhite.insert(avsbwht[i]);
|
102 |
}
|
|
|
103 |
|
|
|
104 |
init = 1;
|
|
|
105 |
}
|
100 |
}
|
|
|
101 |
}
|
|
|
102 |
};
|
|
|
103 |
static const CharClassInit charClassInitInstance;
|
106 |
|
104 |
|
107 |
static inline int whatcc(unsigned int c)
|
105 |
static inline int whatcc(unsigned int c)
|
108 |
{
|
106 |
{
|
109 |
if (c <= 127) {
|
107 |
if (c <= 127) {
|
110 |
return charclasses[c];
|
108 |
return charclasses[c];
|
|
... |
|
... |
277 |
o_processCJK, o_CJKNgramLen,
|
275 |
o_processCJK, o_CJKNgramLen,
|
278 |
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
276 |
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
279 |
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
277 |
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
280 |
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
278 |
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
281 |
in.substr(0,50).c_str()));
|
279 |
in.substr(0,50).c_str()));
|
282 |
|
|
|
283 |
setcharclasses();
|
|
|
284 |
|
280 |
|
285 |
m_span.erase();
|
281 |
m_span.erase();
|
286 |
m_inNumber = false;
|
282 |
m_inNumber = false;
|
287 |
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
283 |
m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
|
288 |
int curspanglue = 0;
|
284 |
int curspanglue = 0;
|
|
... |
|
... |
631 |
return splitter.wcnt;
|
627 |
return splitter.wcnt;
|
632 |
}
|
628 |
}
|
633 |
|
629 |
|
634 |
bool TextSplit::hasVisibleWhite(const string &in)
|
630 |
bool TextSplit::hasVisibleWhite(const string &in)
|
635 |
{
|
631 |
{
|
636 |
setcharclasses();
|
|
|
637 |
Utf8Iter it(in);
|
632 |
Utf8Iter it(in);
|
638 |
for (; !it.eof(); it++) {
|
633 |
for (; !it.eof(); it++) {
|
639 |
unsigned int c = (unsigned char)*it;
|
634 |
unsigned int c = (unsigned char)*it;
|
640 |
LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
|
635 |
LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
|
641 |
if (c == (unsigned int)-1) {
|
636 |
if (c == (unsigned int)-1) {
|
|
... |
|
... |
648 |
return false;
|
643 |
return false;
|
649 |
}
|
644 |
}
|
650 |
|
645 |
|
651 |
template <class T> bool u8stringToStrings(const string &s, T &tokens)
|
646 |
template <class T> bool u8stringToStrings(const string &s, T &tokens)
|
652 |
{
|
647 |
{
|
653 |
setcharclasses();
|
|
|
654 |
Utf8Iter it(s);
|
648 |
Utf8Iter it(s);
|
655 |
|
649 |
|
656 |
string current;
|
650 |
string current;
|
657 |
tokens.clear();
|
651 |
tokens.clear();
|
658 |
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
|
652 |
enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
|