Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
30
#ifndef NO_NAMESPACES
30
#ifndef NO_NAMESPACES
31
using namespace std;
31
using namespace std;
32
#endif /* NO_NAMESPACES */
32
#endif /* NO_NAMESPACES */
33
33
34
/**
34
/**
35
 * Splitting a text into words. The code in this file will work with any 
35
 * Splitting a text into words. The code in this file works with utf-8
36
 * charset where the basic separators (.,- etc.) have their ascii values 
36
 * in a semi-clean way (see uproplist.h)
37
 * (ok for UTF-8, ascii, iso8859* and quite a few others).
38
 *
39
 * We work in a way which would make it quite difficult to handle non-ascii
40
 * separator chars (en-dash, etc.). We would then need to actually parse the 
41
 * utf-8 stream, and use a different way to classify the characters (instead 
42
 * of a 256 slot array).
43
 *
37
 *
44
 * We are also not using capitalization information.
38
 * We are also not using capitalization information.
45
 *
39
 *
46
 * How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
40
 * There are a few remnants of the initial utf8-ignorant version in this file.
47
 * Then specialcase all 'real' utf chars, by checking for the few
48
 * punctuation ones we're interested in (put them in a map). Then
49
 * classify all other non-ascii as letter, and use the current method
50
 * for chars < 127.
51
 */
41
 */
52
42
53
// Character classes: we have three main groups, and then some chars
43
// Character classes: we have three main groups, and then some chars
54
// are their own class because they want special handling.
44
// are their own class because they want special handling.
45
// 
55
// We have an array with 256 slots where we keep the character types. 
46
// We have an array with 256 slots where we keep the character types. 
56
// The array could be fully static, but we use a small function to fill it 
47
// The array could be fully static, but we use a small function to fill it 
57
// once.
48
// once.
49
// The array is actually a remnant of the original version, which was not utf8-aware.
50
// It could be reduced to 128, because real (over 128) utf8 chars are now 
51
// handled with a set holding all the separator values.
58
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
52
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
59
static int charclasses[256];
53
static int charclasses[256];
60
54
61
static set<unsigned int> unicign;
55
static set<unsigned int> unicign;
62
static void setcharclasses()
56
static void setcharclasses()
...
...
85
79
86
    char special[] = ".@+-,#'\n\r";
80
    char special[] = ".@+-,#'\n\r";
87
    for (i = 0; i  < strlen(special); i++)
81
    for (i = 0; i  < strlen(special); i++)
88
    charclasses[int(special[i])] = special[i];
82
    charclasses[int(special[i])] = special[i];
89
83
90
    init = 1;
91
    //for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
92
    for (i = 0; i < sizeof(uniign); i++) 
84
    for (i = 0; i < sizeof(uniign); i++) 
93
    unicign.insert(uniign[i]);
85
    unicign.insert(uniign[i]);
94
    unicign.insert((unsigned int)-1);
86
    unicign.insert((unsigned int)-1);
87
88
    init = 1;
95
}
89
}
96
90
97
// Do some checking (the kind which is simpler to do here than in the
91
// Do some checking (the kind which is simpler to do here than in the
98
// main loop), then send term to our client.
92
// main loop), then send term to our client.
99
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
93
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
...
...
101
{
95
{
102
    LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
96
    LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
103
97
104
    unsigned int l = w.length();
98
    unsigned int l = w.length();
105
    if (l > 0 && l < (unsigned)maxWordLength) {
99
    if (l > 0 && l < (unsigned)maxWordLength) {
100
  // 1 char word: we index single letters and digits, but
101
  // nothing else. We might want to turn this into a test for a single
102
  // utf8 character instead.
106
    if (l == 1) {
103
    if (l == 1) {
107
      // 1 char word: we index single letters and digits, but
108
      // nothing else
109
        int c = (int)w[0];
104
        int c = (int)w[0];
110
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
105
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
111
        //cerr << "ERASING single letter term " << c << endl;
106
        //cerr << "ERASING single letter term " << c << endl;
112
        return true;
107
        return true;
113
        }
108
        }
...
...
225
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
220
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
226
        return false;
221
        return false;
227
    }
222
    }
228
    int cc = whatcc(c);
223
    int cc = whatcc(c);
229
    switch (cc) {
224
    switch (cc) {
225
  case LETTER:
226
      word += it;
227
      span += it;
228
      break;
229
230
  case DIGIT:
231
      if (word.length() == 0)
232
      number = true;
233
      word += it;
234
      span += it;
235
      break;
236
230
    case SPACE:
237
    case SPACE:
231
    SPACE:
238
    SPACE:
232
        if (word.length() || span.length()) {
239
        if (word.length() || span.length()) {
233
        if (!doemit(true, it.getBpos()))
240
        if (!doemit(true, it.getBpos()))
234
            return false;
241
            return false;
...
...
324
        } else {
331
        } else {
325
        // Handle like a normal separator
332
        // Handle like a normal separator
326
        goto SPACE;
333
        goto SPACE;
327
        }
334
        }
328
        break;
335
        break;
329
  case DIGIT:
336
330
      if (word.length() == 0)
331
      number = true;
332
      /* FALLTHROUGH */
333
  case LETTER:
334
    default:
337
    default:
335
        word += it;
338
        word += it;
336
        span += it;
339
        span += it;
337
        break;
340
        break;
338
    }
341
    }