|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.23 2006-09-21 05:59:02 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
142 |
span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp
|
142 |
span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp
|
143 |
<< endl;
|
143 |
<< endl;
|
144 |
#endif
|
144 |
#endif
|
145 |
|
145 |
|
146 |
// Emit span. When splitting for query, we only emit final spans
|
146 |
// Emit span. When splitting for query, we only emit final spans
|
147 |
if (spanerase) {
|
147 |
bool spanemitted = false;
|
|
|
148 |
if (spanerase && !(m_flags & TXTS_NOSPANS)) {
|
148 |
// Maybe trim at end. These are chars that we would keep inside
|
149 |
// Maybe trim at end. These are chars that we would keep inside
|
149 |
// a span, but not at the end
|
150 |
// a span, but not at the end
|
150 |
while (span.length() > 0) {
|
151 |
while (span.length() > 0) {
|
151 |
switch (span[span.length()-1]) {
|
152 |
switch (span[span.length()-1]) {
|
152 |
case '.':
|
153 |
case '.':
|
|
... |
|
... |
160 |
default:
|
161 |
default:
|
161 |
goto breakloop1;
|
162 |
goto breakloop1;
|
162 |
}
|
163 |
}
|
163 |
}
|
164 |
}
|
164 |
breakloop1:
|
165 |
breakloop1:
|
|
|
166 |
spanemitted = true;
|
165 |
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
|
167 |
if (!emitterm(true, span, spanpos, bp-span.length(), bp))
|
166 |
return false;
|
168 |
return false;
|
167 |
}
|
169 |
}
|
168 |
|
170 |
|
169 |
|
171 |
|
170 |
// Emit word if different from span and not query mode
|
172 |
// Emit word if different from span and not 'no words' mode
|
|
|
173 |
if (!(m_flags & TXTS_ONLYSPANS) &&
|
171 |
if (!fq && (!spanerase || (word.length() != span.length())))
|
174 |
(!spanemitted || word.length() != span.length()))
|
172 |
if (!emitterm(false, word, wordpos, bp-word.length(), bp))
|
175 |
if (!emitterm(false, word, wordpos, bp-word.length(), bp))
|
173 |
return false;
|
176 |
return false;
|
174 |
|
177 |
|
175 |
// Adjust state
|
178 |
// Adjust state
|
176 |
wordpos++;
|
179 |
wordpos++;
|
|
... |
|
... |
402 |
|
405 |
|
403 |
static string thisprog;
|
406 |
static string thisprog;
|
404 |
|
407 |
|
405 |
static string usage =
|
408 |
static string usage =
|
406 |
" textsplit [opts] [filename]\n"
|
409 |
" textsplit [opts] [filename]\n"
|
407 |
" -q: query mode\n"
|
410 |
" -s: only spans\n"
|
|
|
411 |
" -w: only words\n"
|
408 |
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
412 |
" if filename is 'stdin', will read stdin for data (end with ^D)\n"
|
409 |
" \n\n"
|
413 |
" \n\n"
|
410 |
;
|
414 |
;
|
411 |
|
415 |
|
412 |
static void
|
416 |
static void
|
|
... |
|
... |
415 |
cerr << thisprog << ": usage:\n" << usage;
|
419 |
cerr << thisprog << ": usage:\n" << usage;
|
416 |
exit(1);
|
420 |
exit(1);
|
417 |
}
|
421 |
}
|
418 |
|
422 |
|
419 |
static int op_flags;
|
423 |
static int op_flags;
|
420 |
#define OPT_q 0x1
|
424 |
#define OPT_s 0x1
|
|
|
425 |
#define OPT_w 0x2
|
421 |
|
426 |
|
422 |
int main(int argc, char **argv)
|
427 |
int main(int argc, char **argv)
|
423 |
{
|
428 |
{
|
424 |
thisprog = argv[0];
|
429 |
thisprog = argv[0];
|
425 |
argc--; argv++;
|
430 |
argc--; argv++;
|
|
... |
|
... |
429 |
if (!(**argv))
|
434 |
if (!(**argv))
|
430 |
/* The "adb - core" case (empty option string) */
|
435 |
/* The "adb - core" case (empty option string) */
|
431 |
Usage();
|
436 |
Usage();
|
432 |
while (**argv)
|
437 |
while (**argv)
|
433 |
switch (*(*argv)++) {
|
438 |
switch (*(*argv)++) {
|
434 |
case 'q': op_flags |= OPT_q; break;
|
439 |
case 's': op_flags |= OPT_s; break;
|
|
|
440 |
case 'w': op_flags |= OPT_w; break;
|
435 |
default: Usage(); break;
|
441 |
default: Usage(); break;
|
436 |
}
|
442 |
}
|
437 |
argc--; argv++;
|
443 |
argc--; argv++;
|
438 |
}
|
444 |
}
|
439 |
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
445 |
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
440 |
DebugLog::setfilename("stderr");
|
446 |
DebugLog::setfilename("stderr");
|
441 |
mySplitterCB cb;
|
447 |
mySplitterCB cb;
|
442 |
TextSplit splitter(&cb, (op_flags&OPT_q) ? true: false);
|
448 |
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
|
|
449 |
if (op_flags&OPT_s)
|
|
|
450 |
flags = TextSplit::TXTS_ONLYSPANS;
|
|
|
451 |
else if (op_flags&OPT_w)
|
|
|
452 |
flags = TextSplit::TXTS_NOSPANS;
|
|
|
453 |
TextSplit splitter(&cb, flags);
|
443 |
if (argc == 1) {
|
454 |
if (argc == 1) {
|
444 |
string data;
|
455 |
string data;
|
445 |
const char *filename = *argv++; argc--;
|
456 |
const char *filename = *argv++; argc--;
|
446 |
if (!strcmp(filename, "stdin")) {
|
457 |
if (!strcmp(filename, "stdin")) {
|
447 |
char buf[1024];
|
458 |
char buf[1024];
|