|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
91 |
// Do some checking (the kind which is simpler to do here than in the
|
91 |
// Do some checking (the kind which is simpler to do here than in the
|
92 |
// main loop), then send term to our client.
|
92 |
// main loop), then send term to our client.
|
93 |
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
93 |
inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
|
94 |
int btstart, int btend)
|
94 |
int btstart, int btend)
|
95 |
{
|
95 |
{
|
96 |
LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
|
96 |
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
|
97 |
|
97 |
|
98 |
unsigned int l = w.length();
|
98 |
unsigned int l = w.length();
|
99 |
if (l > 0 && l < (unsigned)maxWordLength) {
|
99 |
if (l > 0 && l < (unsigned)maxWordLength) {
|
100 |
// 1 char word: we index single letters and digits, but
|
100 |
// 1 char word: we index single letters and digits, but
|
101 |
// nothing else. We might want to turn this into a test for a single
|
101 |
// nothing else. We might want to turn this into a test for a single
|
|
... |
|
... |
105 |
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
105 |
if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
|
106 |
//cerr << "ERASING single letter term " << c << endl;
|
106 |
//cerr << "ERASING single letter term " << c << endl;
|
107 |
return true;
|
107 |
return true;
|
108 |
}
|
108 |
}
|
109 |
}
|
109 |
}
|
110 |
if (pos != prevpos || l != prevterm.length() || w != prevterm) {
|
110 |
if (pos != prevpos || l != prevlen) {
|
111 |
bool ret = cb->takeword(w, pos, btstart, btend);
|
111 |
bool ret = cb->takeword(w, pos, btstart, btend);
|
112 |
prevterm = w;
|
112 |
prevlen = w.length();
|
113 |
prevpos = pos;
|
113 |
prevpos = pos;
|
114 |
return ret;
|
114 |
return ret;
|
115 |
}
|
115 |
}
|
|
|
116 |
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
116 |
}
|
117 |
}
|
117 |
return true;
|
118 |
return true;
|
118 |
}
|
119 |
}
|
119 |
|
120 |
|
120 |
/**
|
121 |
/**
|
|
... |
|
... |
135 |
* @param spanerase Set if the current span is at its end. Reset it.
|
136 |
* @param spanerase Set if the current span is at its end. Reset it.
|
136 |
* @param bp The current BYTE position in the stream
|
137 |
* @param bp The current BYTE position in the stream
|
137 |
*/
|
138 |
*/
|
138 |
inline bool TextSplit::doemit(bool spanerase, int bp)
|
139 |
inline bool TextSplit::doemit(bool spanerase, int bp)
|
139 |
{
|
140 |
{
|
140 |
#if 0
|
141 |
LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n",
|
141 |
cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
|
142 |
word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp));
|
142 |
span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp
|
|
|
143 |
<< endl;
|
|
|
144 |
#endif
|
|
|
145 |
|
143 |
|
146 |
// Emit span. When splitting for query, we only emit final spans
|
144 |
// Emit span. When splitting for query, we only emit final spans
|
147 |
bool spanemitted = false;
|
145 |
bool spanemitted = false;
|
148 |
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
146 |
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
149 |
// Maybe trim at end These are chars that we would keep inside
|
147 |
// Maybe trim at end These are chars that we would keep inside
|
|
... |
|
... |
212 |
setcharclasses();
|
210 |
setcharclasses();
|
213 |
|
211 |
|
214 |
span.erase();
|
212 |
span.erase();
|
215 |
word.erase(); // Current word: no punctuation at all in there
|
213 |
word.erase(); // Current word: no punctuation at all in there
|
216 |
number = false;
|
214 |
number = false;
|
217 |
prevpos = wordpos = spanpos = charpos = 0;
|
215 |
prevpos = prevlen = wordpos = spanpos = charpos = 0;
|
218 |
prevterm.erase();
|
|
|
219 |
|
216 |
|
220 |
Utf8Iter it(in);
|
217 |
Utf8Iter it(in);
|
221 |
|
218 |
|
222 |
for (; !it.eof(); it++, charpos++) {
|
219 |
for (; !it.eof(); it++, charpos++) {
|
223 |
unsigned int c = *it;
|
220 |
unsigned int c = *it;
|
|
... |
|
... |
226 |
return false;
|
223 |
return false;
|
227 |
}
|
224 |
}
|
228 |
int cc = whatcc(c);
|
225 |
int cc = whatcc(c);
|
229 |
switch (cc) {
|
226 |
switch (cc) {
|
230 |
case LETTER:
|
227 |
case LETTER:
|
231 |
word += it;
|
228 |
it.appendchartostring(word);
|
232 |
span += it;
|
229 |
it.appendchartostring(span);
|
233 |
break;
|
230 |
break;
|
234 |
|
231 |
|
235 |
case DIGIT:
|
232 |
case DIGIT:
|
236 |
if (word.length() == 0)
|
233 |
if (word.length() == 0)
|
237 |
number = true;
|
234 |
number = true;
|
238 |
word += it;
|
235 |
it.appendchartostring(word);
|
239 |
span += it;
|
236 |
it.appendchartostring(span);
|
240 |
break;
|
237 |
break;
|
241 |
|
238 |
|
242 |
case SPACE:
|
239 |
case SPACE:
|
243 |
SPACE:
|
240 |
SPACE:
|
244 |
if (word.length() || span.length()) {
|
241 |
if (word.length() || span.length()) {
|
|
... |
|
... |
250 |
case '-':
|
247 |
case '-':
|
251 |
case '+':
|
248 |
case '+':
|
252 |
if (word.length() == 0) {
|
249 |
if (word.length() == 0) {
|
253 |
if (whatcc(it[charpos+1]) == DIGIT) {
|
250 |
if (whatcc(it[charpos+1]) == DIGIT) {
|
254 |
number = true;
|
251 |
number = true;
|
255 |
word += it;
|
252 |
it.appendchartostring(word);
|
256 |
span += it;
|
253 |
it.appendchartostring(span);
|
257 |
} else
|
254 |
} else
|
258 |
span += it;
|
255 |
it.appendchartostring(span);
|
259 |
} else {
|
256 |
} else {
|
260 |
if (!doemit(false, it.getBpos()))
|
257 |
if (!doemit(false, it.getBpos()))
|
261 |
return false;
|
258 |
return false;
|
262 |
number = false;
|
259 |
number = false;
|
263 |
span += it;
|
260 |
it.appendchartostring(span);
|
264 |
}
|
261 |
}
|
265 |
break;
|
262 |
break;
|
266 |
case '.':
|
263 |
case '.':
|
267 |
case ',':
|
264 |
case ',':
|
268 |
if (number) {
|
265 |
if (number) {
|
269 |
// 132.jpg ?
|
266 |
// 132.jpg ?
|
270 |
if (whatcc(it[charpos+1]) != DIGIT)
|
267 |
if (whatcc(it[charpos+1]) != DIGIT)
|
271 |
goto SPACE;
|
268 |
goto SPACE;
|
272 |
word += it;
|
269 |
it.appendchartostring(word);
|
273 |
span += it;
|
270 |
it.appendchartostring(span);
|
274 |
break;
|
271 |
break;
|
275 |
} else {
|
272 |
} else {
|
276 |
// If . inside a word, keep it, else, this is whitespace.
|
273 |
// If . inside a word, keep it, else, this is whitespace.
|
277 |
// We also keep an initial '.' for catching .net, but this adds
|
274 |
// We also keep an initial '.' for catching .net, but this adds
|
278 |
// quite a few spurious terms !
|
275 |
// quite a few spurious terms !
|
|
... |
|
... |
284 |
if (!doemit(false, it.getBpos()))
|
281 |
if (!doemit(false, it.getBpos()))
|
285 |
return false;
|
282 |
return false;
|
286 |
// span length could have been adjusted by trimming
|
283 |
// span length could have been adjusted by trimming
|
287 |
// inside doemit
|
284 |
// inside doemit
|
288 |
if (span.length())
|
285 |
if (span.length())
|
289 |
span += it;
|
286 |
it.appendchartostring(span);
|
290 |
break;
|
287 |
break;
|
291 |
} else {
|
288 |
} else {
|
292 |
span += it;
|
289 |
it.appendchartostring(span);
|
293 |
break;
|
290 |
break;
|
294 |
}
|
291 |
}
|
295 |
}
|
292 |
}
|
296 |
}
|
293 |
}
|
297 |
goto SPACE;
|
294 |
goto SPACE;
|
|
... |
|
... |
300 |
if (word.length()) {
|
297 |
if (word.length()) {
|
301 |
if (!doemit(false, it.getBpos()))
|
298 |
if (!doemit(false, it.getBpos()))
|
302 |
return false;
|
299 |
return false;
|
303 |
number = false;
|
300 |
number = false;
|
304 |
}
|
301 |
}
|
305 |
span += it;
|
302 |
it.appendchartostring(span);
|
306 |
break;
|
303 |
break;
|
307 |
case '\'':
|
304 |
case '\'':
|
308 |
// If in word, potential span: o'brien, else, this is more
|
305 |
// If in word, potential span: o'brien, else, this is more
|
309 |
// whitespace
|
306 |
// whitespace
|
310 |
if (word.length()) {
|
307 |
if (word.length()) {
|
311 |
if (!doemit(false, it.getBpos()))
|
308 |
if (!doemit(false, it.getBpos()))
|
312 |
return false;
|
309 |
return false;
|
313 |
number = false;
|
310 |
number = false;
|
314 |
span += it;
|
311 |
it.appendchartostring(span);
|
315 |
}
|
312 |
}
|
316 |
break;
|
313 |
break;
|
317 |
case '#':
|
314 |
case '#':
|
318 |
// Keep it only at end of word... Special case for c# you see...
|
315 |
// Keep it only at end of word... Special case for c# you see...
|
319 |
if (word.length() > 0) {
|
316 |
if (word.length() > 0) {
|
320 |
int w = whatcc(it[charpos+1]);
|
317 |
int w = whatcc(it[charpos+1]);
|
321 |
if (w == SPACE || w == '\n' || w == '\r') {
|
318 |
if (w == SPACE || w == '\n' || w == '\r') {
|
322 |
word += it;
|
319 |
it.appendchartostring(word);
|
323 |
span += it;
|
320 |
it.appendchartostring(span);
|
324 |
break;
|
321 |
break;
|
325 |
}
|
322 |
}
|
326 |
}
|
323 |
}
|
327 |
goto SPACE;
|
324 |
goto SPACE;
|
328 |
break;
|
325 |
break;
|
|
... |
|
... |
341 |
goto SPACE;
|
338 |
goto SPACE;
|
342 |
}
|
339 |
}
|
343 |
break;
|
340 |
break;
|
344 |
|
341 |
|
345 |
default:
|
342 |
default:
|
346 |
word += it;
|
343 |
it.appendchartostring(word);
|
347 |
span += it;
|
344 |
it.appendchartostring(span);
|
348 |
break;
|
345 |
break;
|
349 |
}
|
346 |
}
|
350 |
}
|
347 |
}
|
351 |
if (word.length() || span.length()) {
|
348 |
if (word.length() || span.length()) {
|
352 |
if (!doemit(true, it.getBpos()))
|
349 |
if (!doemit(true, it.getBpos()))
|
|
... |
|
... |
371 |
using namespace std;
|
368 |
using namespace std;
|
372 |
|
369 |
|
373 |
// A small class to hold state while splitting text
|
370 |
// A small class to hold state while splitting text
|
374 |
class mySplitterCB : public TextSplitCB {
|
371 |
class mySplitterCB : public TextSplitCB {
|
375 |
int first;
|
372 |
int first;
|
|
|
373 |
bool nooutput;
|
376 |
public:
|
374 |
public:
|
377 |
mySplitterCB() : first(1) {}
|
375 |
mySplitterCB() : first(1), nooutput(false) {}
|
378 |
|
376 |
void setNoOut(bool val) {nooutput = val;}
|
379 |
bool takeword(const std::string &term, int pos, int bs, int be) {
|
377 |
bool takeword(const std::string &term, int pos, int bs, int be) {
|
|
|
378 |
if (nooutput)
|
|
|
379 |
return true;
|
380 |
if (first) {
|
380 |
if (first) {
|
381 |
printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
381 |
printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
|
382 |
first = 0;
|
382 |
first = 0;
|
383 |
}
|
383 |
}
|
384 |
printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
384 |
printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
|
|
... |
|
... |
404 |
static string teststring1 = " 124, ";
|
404 |
static string teststring1 = " 124, ";
|
405 |
|
405 |
|
406 |
static string thisprog;
|
406 |
static string thisprog;
|
407 |
|
407 |
|
408 |
static string usage =
|
408 |
static string usage =
|
409 |
" textsplit [opts] [filename]\n"
|
409 |
" textsplit [opts] [filename]\n"
|
|
|
410 |
" -S: no output\n"
|
410 |
" -s: only spans\n"
|
411 |
" -s: only spans\n"
|
411 |
" -w: only words\n"
|
412 |
" -w: only words\n"
|
412 |
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
413 |
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
413 |
" \n\n"
|
414 |
" \n\n"
|
414 |
;
|
415 |
;
|
415 |
|
416 |
|
416 |
static void
|
417 |
static void
|
|
... |
|
... |
421 |
}
|
422 |
}
|
422 |
|
423 |
|
423 |
static int op_flags;
|
424 |
static int op_flags;
|
424 |
#define OPT_s 0x1
|
425 |
#define OPT_s 0x1
|
425 |
#define OPT_w 0x2
|
426 |
#define OPT_w 0x2
|
|
|
427 |
#define OPT_S 0x4
|
426 |
|
428 |
|
427 |
int main(int argc, char **argv)
|
429 |
int main(int argc, char **argv)
|
428 |
{
|
430 |
{
|
429 |
thisprog = argv[0];
|
431 |
thisprog = argv[0];
|
430 |
argc--; argv++;
|
432 |
argc--; argv++;
|
|
... |
|
... |
435 |
/* Cas du "adb - core" */
|
437 |
/* Cas du "adb - core" */
|
436 |
Usage();
|
438 |
Usage();
|
437 |
while (**argv)
|
439 |
while (**argv)
|
438 |
switch (*(*argv)++) {
|
440 |
switch (*(*argv)++) {
|
439 |
case 's': op_flags |= OPT_s; break;
|
441 |
case 's': op_flags |= OPT_s; break;
|
|
|
442 |
case 'S': op_flags |= OPT_S; break;
|
440 |
case 'w': op_flags |= OPT_w; break;
|
443 |
case 'w': op_flags |= OPT_w; break;
|
441 |
default: Usage(); break;
|
444 |
default: Usage(); break;
|
442 |
}
|
445 |
}
|
443 |
argc--; argv++;
|
446 |
argc--; argv++;
|
444 |
}
|
447 |
}
|
445 |
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
448 |
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
446 |
DebugLog::setfilename("stderr");
|
449 |
DebugLog::setfilename("stderr");
|
|
|
450 |
|
447 |
mySplitterCB cb;
|
451 |
mySplitterCB cb;
|
448 |
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
452 |
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
|
|
453 |
|
|
|
454 |
if (op_flags&OPT_S)
|
|
|
455 |
cb.setNoOut(true);
|
|
|
456 |
|
449 |
if (op_flags&OPT_s)
|
457 |
if (op_flags&OPT_s)
|
450 |
flags = TextSplit::TXTS_ONLYSPANS;
|
458 |
flags = TextSplit::TXTS_ONLYSPANS;
|
451 |
else if (op_flags&OPT_w)
|
459 |
else if (op_flags&OPT_w)
|
452 |
flags = TextSplit::TXTS_NOSPANS;
|
460 |
flags = TextSplit::TXTS_NOSPANS;
|
453 |
TextSplit splitter(&cb, flags);
|
461 |
TextSplit splitter(&cb, flags);
|