|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.20 2006-02-01 14:18:20 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.21 2006-04-11 06:49:45 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
30 |
#ifndef NO_NAMESPACES
|
30 |
#ifndef NO_NAMESPACES
|
31 |
using namespace std;
|
31 |
using namespace std;
|
32 |
#endif /* NO_NAMESPACES */
|
32 |
#endif /* NO_NAMESPACES */
|
33 |
|
33 |
|
34 |
/**
|
34 |
/**
|
35 |
* Splitting a text into words. The code in this file will work with any
|
35 |
* Splitting a text into words. The code in this file works with utf-8
|
36 |
* charset where the basic separators (.,- etc.) have their ascii values
|
36 |
* in a semi-clean way (see uproplist.h)
|
37 |
* (ok for UTF-8, ascii, iso8859* and quite a few others).
|
|
|
38 |
*
|
|
|
39 |
* We work in a way which would make it quite difficult to handle non-ascii
|
|
|
40 |
* separator chars (en-dash, etc.). We would then need to actually parse the
|
|
|
41 |
* utf-8 stream, and use a different way to classify the characters (instead
|
|
|
42 |
* of a 256 slot array).
|
|
|
43 |
*
|
37 |
*
|
44 |
* We are also not using capitalization information.
|
38 |
* We are also not using capitalization information.
|
45 |
*
|
39 |
*
|
46 |
* How to fix: use some kind of utf-8 aware iterator, or convert to UCS4 first.
|
40 |
* There are a few remnants of the initial utf8-ignorant version in this file.
|
47 |
* Then specialcase all 'real' utf chars, by checking for the few
|
|
|
48 |
* punctuation ones we're interested in (put them in a map). Then
|
|
|
49 |
* classify all other non-ascii as letter, and use the current method
|
|
|
50 |
* for chars < 127.
|
|
|
51 |
*/
|
41 |
*/
|
52 |
|
42 |
|
53 |
// Character classes: we have three main groups, and then some chars
|
43 |
// Character classes: we have three main groups, and then some chars
|
54 |
// are their own class because they want special handling.
|
44 |
// are their own class because they want special handling.
|
|
|
45 |
//
|
55 |
// We have an array with 256 slots where we keep the character types.
|
46 |
// We have an array with 256 slots where we keep the character types.
|
56 |
// The array could be fully static, but we use a small function to fill it
|
47 |
// The array could be fully static, but we use a small function to fill it
|
57 |
// once.
|
48 |
// once.
|
|
|
49 |
// The array is actually a remnant of the original version, which did not handle utf8.
|
|
|
50 |
// It could be reduced to 128, because real (over 128) utf8 chars are now
|
|
|
51 |
// handled with a set holding all the separator values.
|
58 |
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
52 |
enum CharClass {LETTER=256, SPACE=257, DIGIT=258};
|
59 |
static int charclasses[256];
|
53 |
static int charclasses[256];
|
60 |
|
54 |
|
61 |
static set<unsigned int> unicign;
|
55 |
static set<unsigned int> unicign;
|
62 |
static void setcharclasses()
|
56 |
static void setcharclasses()
|
|
... |
|
... |
85 |
|
79 |
|
86 |
char special[] = ".@+-,#'\n\r";
|
80 |
char special[] = ".@+-,#'\n\r";
|
87 |
for (i = 0; i < strlen(special); i++)
|
81 |
for (i = 0; i < strlen(special); i++)
|
88 |
charclasses[int(special[i])] = special[i];
|
82 |
charclasses[int(special[i])] = special[i];
|
89 |
|
83 |
|
90 |
init = 1;
|
|
|
91 |
//for (i=0;i<256;i++)cerr<<i<<" -> "<<charclasses[i]<<endl;
|
|
|
92 |
for (i = 0; i < sizeof(uniign); i++)
|
84 |
for (i = 0; i < sizeof(uniign); i++)
|
93 |
unicign.insert(uniign[i]);
|
85 |
unicign.insert(uniign[i]);
|
94 |
unicign.insert((unsigned int)-1);
|
86 |
unicign.insert((unsigned int)-1);
|
|
|
87 |
|
|
|
88 |
init = 1;
|
95 |
}
|
89 |
}
|
96 |
|
90 |
|
97 |
// Do some checking (the kind which is simpler to do here than in the
|
91 |
// Do some checking (the kind which is simpler to do here than in the
|
98 |
// main loop), then send term to our client.
|
92 |
// main loop), then send term to our client.
|
99 |
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
93 |
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
|
... |
|
... |
101 |
{
|
95 |
{
|
102 |
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
|
96 |
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
|
103 |
|
97 |
|
104 |
unsigned int l = w.length();
|
98 |
unsigned int l = w.length();
|
105 |
if (l > 0 && l < (unsigned)maxWordLength) {
|
99 |
if (l > 0 && l < (unsigned)maxWordLength) {
|
|
|
100 |
// 1 char word: we index single letters and digits, but
|
|
|
101 |
// nothing else. We might want to turn this into a test for a single
|
|
|
102 |
// utf8 character instead.
|
106 |
if (l == 1) {
|
103 |
if (l == 1) {
|
107 |
// 1 char word: we index single letters and digits, but
|
|
|
108 |
// nothing else
|
|
|
109 |
int c = (int)w[0];
|
104 |
int c = (int)w[0];
|
110 |
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
105 |
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
111 |
//cerr << "ERASING single letter term " << c << endl;
|
106 |
//cerr << "ERASING single letter term " << c << endl;
|
112 |
return true;
|
107 |
return true;
|
113 |
}
|
108 |
}
|
|
... |
|
... |
225 |
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
220 |
LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
|
226 |
return false;
|
221 |
return false;
|
227 |
}
|
222 |
}
|
228 |
int cc = whatcc(c);
|
223 |
int cc = whatcc(c);
|
229 |
switch (cc) {
|
224 |
switch (cc) {
|
|
|
225 |
case LETTER:
|
|
|
226 |
word += it;
|
|
|
227 |
span += it;
|
|
|
228 |
break;
|
|
|
229 |
|
|
|
230 |
case DIGIT:
|
|
|
231 |
if (word.length() == 0)
|
|
|
232 |
number = true;
|
|
|
233 |
word += it;
|
|
|
234 |
span += it;
|
|
|
235 |
break;
|
|
|
236 |
|
230 |
case SPACE:
|
237 |
case SPACE:
|
231 |
SPACE:
|
238 |
SPACE:
|
232 |
if (word.length() || span.length()) {
|
239 |
if (word.length() || span.length()) {
|
233 |
if (!doemit(true, it.getBpos()))
|
240 |
if (!doemit(true, it.getBpos()))
|
234 |
return false;
|
241 |
return false;
|
|
... |
|
... |
324 |
} else {
|
331 |
} else {
|
325 |
// Handle like a normal separator
|
332 |
// Handle like a normal separator
|
326 |
goto SPACE;
|
333 |
goto SPACE;
|
327 |
}
|
334 |
}
|
328 |
break;
|
335 |
break;
|
329 |
case DIGIT:
|
336 |
|
330 |
if (word.length() == 0)
|
|
|
331 |
number = true;
|
|
|
332 |
/* FALLTHROUGH */
|
|
|
333 |
case LETTER:
|
|
|
334 |
default:
|
337 |
default:
|
335 |
word += it;
|
338 |
word += it;
|
336 |
span += it;
|
339 |
span += it;
|
337 |
break;
|
340 |
break;
|
338 |
}
|
341 |
}
|