|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
86 |
|
86 |
|
87 |
char wild[] = "*?[]";
|
87 |
char wild[] = "*?[]";
|
88 |
for (i = 0; i < strlen(wild); i++)
|
88 |
for (i = 0; i < strlen(wild); i++)
|
89 |
charclasses[int(wild[i])] = WILD;
|
89 |
charclasses[int(wild[i])] = WILD;
|
90 |
|
90 |
|
91 |
char special[] = ".@+-,#'\n\r";
|
91 |
char special[] = ".@+-,#'_\n\r";
|
92 |
for (i = 0; i < strlen(special); i++)
|
92 |
for (i = 0; i < strlen(special); i++)
|
93 |
charclasses[int(special[i])] = special[i];
|
93 |
charclasses[int(special[i])] = special[i];
|
94 |
|
94 |
|
95 |
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
|
95 |
for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
|
96 |
unicign.insert(uniign[i]);
|
96 |
unicign.insert(uniign[i]);
|
|
... |
|
... |
136 |
// F900..FAFF; CJK Compatibility Ideographs
|
136 |
// F900..FAFF; CJK Compatibility Ideographs
|
137 |
// FE30..FE4F; CJK Compatibility Forms
|
137 |
// FE30..FE4F; CJK Compatibility Forms
|
138 |
// FF00..FFEF; Halfwidth and Fullwidth Forms
|
138 |
// FF00..FFEF; Halfwidth and Fullwidth Forms
|
139 |
// 20000..2A6DF; CJK Unified Ideographs Extension B
|
139 |
// 20000..2A6DF; CJK Unified Ideographs Extension B
|
140 |
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
140 |
// 2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
|
|
141 |
// Note: the p > 127 test is not necessary, but optimizes away the ascii case
|
141 |
#define UNICODE_IS_CJK(p) \
|
142 |
#define UNICODE_IS_CJK(p) \
|
142 |
(((p) >= 0x2E80 && (p) <= 0x2EFF) \
|
143 |
((p) > 127 && \
|
143 |
|| ((p) >= 0x3000 && (p) <= 0x9FFF) \
|
144 |
(((p) >= 0x2E80 && (p) <= 0x2EFF) || \
|
144 |
|| ((p) >= 0xA700 && (p) <= 0xA71F) \
|
145 |
((p) >= 0x3000 && (p) <= 0x9FFF) || \
|
145 |
|| ((p) >= 0xAC00 && (p) <= 0xD7AF) \
|
146 |
((p) >= 0xA700 && (p) <= 0xA71F) || \
|
146 |
|| ((p) >= 0xF900 && (p) <= 0xFAFF) \
|
147 |
((p) >= 0xAC00 && (p) <= 0xD7AF) || \
|
147 |
|| ((p) >= 0xFE30 && (p) <= 0xFE4F) \
|
148 |
((p) >= 0xF900 && (p) <= 0xFAFF) || \
|
148 |
|| ((p) >= 0xFF00 && (p) <= 0xFFEF) \
|
149 |
((p) >= 0xFE30 && (p) <= 0xFE4F) || \
|
149 |
|| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
|
150 |
((p) >= 0xFF00 && (p) <= 0xFFEF) || \
|
|
|
151 |
((p) >= 0x20000 && (p) <= 0x2A6DF) || \
|
150 |
|| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
|
152 |
((p) >= 0x2F800 && (p) <= 0x2FA1F)))
|
151 |
|
153 |
|
152 |
bool TextSplit::isCJK(int c)
|
154 |
bool TextSplit::isCJK(int c)
|
153 |
{
|
155 |
{
|
154 |
return UNICODE_IS_CJK(c);
|
156 |
return UNICODE_IS_CJK(c);
|
155 |
}
|
157 |
}
|
|
... |
|
... |
383 |
return false;
|
385 |
return false;
|
384 |
m_inNumber = false;
|
386 |
m_inNumber = false;
|
385 |
}
|
387 |
}
|
386 |
m_wordStart += it.appendchartostring(m_span);
|
388 |
m_wordStart += it.appendchartostring(m_span);
|
387 |
break;
|
389 |
break;
|
|
|
390 |
case '_':
|
|
|
391 |
if (m_wordLen) {
|
|
|
392 |
if (!doemit(false, it.getBpos()))
|
|
|
393 |
return false;
|
|
|
394 |
m_inNumber = false;
|
|
|
395 |
}
|
|
|
396 |
m_wordStart += it.appendchartostring(m_span);
|
|
|
397 |
break;
|
388 |
case '\'':
|
398 |
case '\'':
|
389 |
// If in word, potential span: o'brien, else, this is more
|
399 |
// If in word, potential span: o'brien, else, this is more
|
390 |
// whitespace
|
400 |
// whitespace
|
391 |
if (m_wordLen) {
|
401 |
if (m_wordLen) {
|
392 |
if (!doemit(false, it.getBpos()))
|
402 |
if (!doemit(false, it.getBpos()))
|