|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
48 |
// The array could be fully static, but we use a small function to fill it
|
48 |
// The array could be fully static, but we use a small function to fill it
|
49 |
// once.
|
49 |
// once.
|
50 |
// The array is actually a remnant of the original version, which did not handle UTF-8.
|
50 |
// The array is actually a remnant of the original version, which did not handle UTF-8.
|
51 |
// Only the lower 127 slots are now used, but keep it at 256
|
51 |
// Only the lower 127 slots are now used, but keep it at 256
|
52 |
// because it makes some tests in the code simpler.
|
52 |
// because it makes some tests in the code simpler.
|
|
|
53 |
const unsigned int charclasses_size = 256;
|
53 |
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
|
54 |
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
|
54 |
A_ULETTER=260, A_LLETTER=261};
|
55 |
A_ULETTER=260, A_LLETTER=261};
|
55 |
static int charclasses[256];
|
56 |
static int charclasses[charclasses_size];
|
56 |
|
57 |
|
57 |
// Real UTF-8 characters are handled with sets holding all characters
|
58 |
// Real UTF-8 characters are handled with sets holding all characters
|
58 |
// with interesting properties. This is far from full-blown management
|
59 |
// with interesting properties. This is far from full-blown management
|
59 |
// of Unicode properties, but seems to do the job well enough in most
|
60 |
// of Unicode properties, but seems to do the job well enough in most
|
60 |
// common cases
|
61 |
// common cases
|
|
... |
|
... |
452 |
// Camelcase handling.
|
453 |
// Camelcase handling.
|
453 |
// If we get uppercase ascii after lowercase ascii, emit word.
|
454 |
// If we get uppercase ascii after lowercase ascii, emit word.
|
454 |
// This emits "camel" when hitting the 'C' of camelCase
|
455 |
// This emits "camel" when hitting the 'C' of camelCase
|
455 |
case A_ULETTER:
|
456 |
case A_ULETTER:
|
456 |
if (m_span.length() &&
|
457 |
if (m_span.length() &&
|
457 |
charclasses[(unsigned int)m_span[m_span.length() - 1]] ==
|
458 |
charclasses[(unsigned char)m_span[m_span.length() - 1]] ==
|
458 |
A_LLETTER) {
|
459 |
A_LLETTER) {
|
459 |
if (m_wordLen) {
|
460 |
if (m_wordLen) {
|
460 |
if (!doemit(false, it.getBpos()))
|
461 |
if (!doemit(false, it.getBpos()))
|
461 |
return false;
|
462 |
return false;
|
462 |
}
|
463 |
}
|
|
... |
|
... |
469 |
// string of several upper-case letters: an
|
470 |
// string of several upper-case letters: an
|
470 |
// acronym (readHTML) or a single letter article (ALittleHelp).
|
471 |
// acronym (readHTML) or a single letter article (ALittleHelp).
|
471 |
// Emit the uppercase word before proceeding
|
472 |
// Emit the uppercase word before proceeding
|
472 |
case A_LLETTER:
|
473 |
case A_LLETTER:
|
473 |
if (m_span.length() &&
|
474 |
if (m_span.length() &&
|
474 |
charclasses[(unsigned int)m_span[m_span.length() - 1]] ==
|
475 |
charclasses[(unsigned char)m_span[m_span.length() - 1]] ==
|
475 |
A_ULETTER && m_wordLen > 1) {
|
476 |
A_ULETTER && m_wordLen > 1) {
|
476 |
// Multiple upper-case letters. Single letter word
|
477 |
// Multiple upper-case letters. Single letter word
|
477 |
// or acronym which we want to emit now
|
478 |
// or acronym which we want to emit now
|
478 |
m_wordLen--;
|
479 |
m_wordLen--;
|
479 |
if (!doemit(false, it.getBpos()))
|
480 |
if (!doemit(false, it.getBpos()))
|
|
... |
|
... |
609 |
bool TextSplit::hasVisibleWhite(const string &in)
|
610 |
bool TextSplit::hasVisibleWhite(const string &in)
|
610 |
{
|
611 |
{
|
611 |
setcharclasses();
|
612 |
setcharclasses();
|
612 |
Utf8Iter it(in);
|
613 |
Utf8Iter it(in);
|
613 |
for (; !it.eof(); it++) {
|
614 |
for (; !it.eof(); it++) {
|
614 |
unsigned int c = *it;
|
615 |
unsigned int c = (unsigned char)*it;
|
615 |
LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
|
616 |
LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
|
616 |
if (c == (unsigned int)-1) {
|
617 |
if (c == (unsigned int)-1) {
|
617 |
LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
|
618 |
LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
|
618 |
return false;
|
619 |
return false;
|
619 |
}
|
620 |
}
|