Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
...
...
48
// The array could be fully static, but we use a small function to fill it 
48
// The array could be fully static, but we use a small function to fill it 
49
// once.
49
// once.
50
// The array is actually a remnant of the original version, which did not handle UTF-8.
50
// The array is actually a remnant of the original version, which did not handle UTF-8.
51
// Only the lower 127 slots are now used, but keep it at 256
51
// Only the lower 127 slots are now used, but keep it at 256
52
// because it makes some tests in the code simpler.
52
// because it makes some tests in the code simpler.
53
const unsigned int charclasses_size = 256;
53
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259, 
54
enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259, 
54
                A_ULETTER=260, A_LLETTER=261};
55
                A_ULETTER=260, A_LLETTER=261};
55
static int charclasses[256];
56
static int charclasses[charclasses_size];
56
57
57
// Real UTF-8 characters are handled with sets holding all characters
58
// Real UTF-8 characters are handled with sets holding all characters
58
// with interesting properties. This is far from full-blown management
59
// with interesting properties. This is far from full-blown management
59
// of Unicode properties, but seems to do the job well enough in most
60
// of Unicode properties, but seems to do the job well enough in most
60
// common cases
61
// common cases
...
...
452
            // Camelcase handling. 
453
            // Camelcase handling. 
453
            // If we get uppercase ascii after lowercase ascii, emit word.
454
            // If we get uppercase ascii after lowercase ascii, emit word.
454
            // This emits "camel" when hitting the 'C' of camelCase
455
            // This emits "camel" when hitting the 'C' of camelCase
455
    case A_ULETTER:
456
    case A_ULETTER:
456
        if (m_span.length() && 
457
        if (m_span.length() && 
457
                charclasses[(unsigned int)m_span[m_span.length() - 1]] == 
458
                charclasses[(unsigned char)m_span[m_span.length() - 1]] == 
458
                A_LLETTER) {
459
                A_LLETTER) {
459
                if (m_wordLen) {
460
                if (m_wordLen) {
460
                    if (!doemit(false, it.getBpos()))
461
                    if (!doemit(false, it.getBpos()))
461
                        return false;
462
                        return false;
462
                }
463
                }
...
...
469
            // string of several upper-case letters:  an
470
            // string of several upper-case letters:  an
470
            // acronym (readHTML) or a single letter article (ALittleHelp).
471
            // acronym (readHTML) or a single letter article (ALittleHelp).
471
            // Emit the uppercase word before proceeding
472
            // Emit the uppercase word before proceeding
472
        case A_LLETTER:
473
        case A_LLETTER:
473
        if (m_span.length() && 
474
        if (m_span.length() && 
474
                charclasses[(unsigned int)m_span[m_span.length() - 1]] == 
475
                charclasses[(unsigned char)m_span[m_span.length() - 1]] == 
475
                A_ULETTER && m_wordLen > 1) {
476
                A_ULETTER && m_wordLen > 1) {
476
                // Multiple upper-case letters. Single letter word
477
                // Multiple upper-case letters. Single letter word
477
                // or acronym which we want to emit now
478
                // or acronym which we want to emit now
478
                m_wordLen--;
479
                m_wordLen--;
479
                if (!doemit(false, it.getBpos()))
480
                if (!doemit(false, it.getBpos()))
...
...
609
bool TextSplit::hasVisibleWhite(const string &in)
610
bool TextSplit::hasVisibleWhite(const string &in)
610
{
611
{
611
    setcharclasses();
612
    setcharclasses();
612
    Utf8Iter it(in);
613
    Utf8Iter it(in);
613
    for (; !it.eof(); it++) {
614
    for (; !it.eof(); it++) {
614
    unsigned int c = *it;
615
    unsigned int c = (unsigned char)*it;
615
    LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
616
    LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
616
    if (c == (unsigned int)-1) {
617
    if (c == (unsigned int)-1) {
617
        LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
618
        LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
618
        return false;
619
        return false;
619
    }
620
    }