a/src/common/textsplit.cpp b/src/common/textsplit.cpp
...
...
58
// of Unicode properties, but seems to do the job well enough in most
58
// of Unicode properties, but seems to do the job well enough in most
59
// common cases
59
// common cases
60
static set<unsigned int> unicign;
60
static set<unsigned int> unicign;
61
static set<unsigned int> visiblewhite;
61
static set<unsigned int> visiblewhite;
62
62
63
// Set up character classes array and the additional unicode sets
63
class CharClassInit {
64
static void setcharclasses()
64
public:
65
{
65
    CharClassInit() 
66
    static int init = 0;
66
    {
67
    if (init)
68
  return;
69
    unsigned int i;
67
  unsigned int i;
70
68
71
    // Set default value for all: SPACE
69
  // Set default value for all: SPACE
72
    for (i = 0 ; i < 256 ; i ++)
70
  for (i = 0 ; i < 256 ; i ++)
73
    charclasses[i] = SPACE;
71
        charclasses[i] = SPACE;
74
72
75
    char digits[] = "0123456789";
73
  char digits[] = "0123456789";
76
    for (i = 0; i  < strlen(digits); i++)
74
  for (i = 0; i  < strlen(digits); i++)
77
    charclasses[int(digits[i])] = DIGIT;
75
        charclasses[int(digits[i])] = DIGIT;
78
76
79
    char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
77
  char upper[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
80
    for (i = 0; i  < strlen(upper); i++)
78
  for (i = 0; i  < strlen(upper); i++)
81
    charclasses[int(upper[i])] = A_ULETTER;
79
        charclasses[int(upper[i])] = A_ULETTER;
82
80
83
    char lower[] = "abcdefghijklmnopqrstuvwxyz";
81
  char lower[] = "abcdefghijklmnopqrstuvwxyz";
84
    for (i = 0; i  < strlen(lower); i++)
82
  for (i = 0; i  < strlen(lower); i++)
85
    charclasses[int(lower[i])] = A_LLETTER;
83
        charclasses[int(lower[i])] = A_LLETTER;
86
84
87
    char wild[] = "*?[]";
85
  char wild[] = "*?[]";
88
    for (i = 0; i  < strlen(wild); i++)
86
  for (i = 0; i  < strlen(wild); i++)
89
    charclasses[int(wild[i])] = WILD;
87
        charclasses[int(wild[i])] = WILD;
90
88
91
    char special[] = ".@+-,#'_\n\r";
89
  char special[] = ".@+-,#'_\n\r";
92
    for (i = 0; i  < strlen(special); i++)
90
  for (i = 0; i  < strlen(special); i++)
93
    charclasses[int(special[i])] = special[i];
91
        charclasses[int(special[i])] = special[i];
94
92
95
    for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
93
  for (i = 0; i < sizeof(uniign) / sizeof(int); i++) {
96
    unicign.insert(uniign[i]);
94
        unicign.insert(uniign[i]);
97
    }
95
  }
98
    unicign.insert((unsigned int)-1);
96
  unicign.insert((unsigned int)-1);
99
97
100
    for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
98
  for (i = 0; i < sizeof(avsbwht) / sizeof(int); i++) {
101
    visiblewhite.insert(avsbwht[i]);
99
        visiblewhite.insert(avsbwht[i]);
102
    }
103
104
    init = 1;
105
}
100
  }
101
    }
102
};
103
static const CharClassInit charClassInitInstance;
106
104
107
static inline int whatcc(unsigned int c)
105
static inline int whatcc(unsigned int c)
108
{
106
{
109
    if (c <= 127) {
107
    if (c <= 127) {
110
    return charclasses[c]; 
108
    return charclasses[c]; 
...
...
277
         o_processCJK, o_CJKNgramLen,
275
         o_processCJK, o_CJKNgramLen,
278
         m_flags & TXTS_NOSPANS ? " nospans" : "",
276
         m_flags & TXTS_NOSPANS ? " nospans" : "",
279
         m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
277
         m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
280
         m_flags & TXTS_KEEPWILD ? " keepwild" : "",
278
         m_flags & TXTS_KEEPWILD ? " keepwild" : "",
281
         in.substr(0,50).c_str()));
279
         in.substr(0,50).c_str()));
282
283
    setcharclasses();
284
280
285
    m_span.erase();
281
    m_span.erase();
286
    m_inNumber = false;
282
    m_inNumber = false;
287
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
283
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
288
    int curspanglue = 0;
284
    int curspanglue = 0;
...
...
631
    return splitter.wcnt;
627
    return splitter.wcnt;
632
}
628
}
633
629
634
bool TextSplit::hasVisibleWhite(const string &in)
630
bool TextSplit::hasVisibleWhite(const string &in)
635
{
631
{
636
    setcharclasses();
637
    Utf8Iter it(in);
632
    Utf8Iter it(in);
638
    for (; !it.eof(); it++) {
633
    for (; !it.eof(); it++) {
639
    unsigned int c = (unsigned char)*it;
634
    unsigned int c = (unsigned char)*it;
640
    LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
635
    LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
641
    if (c == (unsigned int)-1) {
636
    if (c == (unsigned int)-1) {
...
...
648
    return false;
643
    return false;
649
}
644
}
650
645
651
template <class T> bool u8stringToStrings(const string &s, T &tokens)
646
template <class T> bool u8stringToStrings(const string &s, T &tokens)
652
{
647
{
653
    setcharclasses();
654
    Utf8Iter it(s);
648
    Utf8Iter it(s);
655
649
656
    string current;
650
    string current;
657
    tokens.clear();
651
    tokens.clear();
658
    enum states {SPACE, TOKEN, INQUOTE, ESCAPE};
652
    enum states {SPACE, TOKEN, INQUOTE, ESCAPE};