Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.24 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.25 2006-11-19 18:37:37 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
91
// Do some checking (the kind which is simpler to do here than in the
91
// Do some checking (the kind which is simpler to do here than in the
92
// main loop), then send term to our client.
92
// main loop), then send term to our client.
93
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
93
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
94
             int btstart, int btend)
94
             int btstart, int btend)
95
{
95
{
96
    LOGDEB2(("TextSplit::emitterm: '%s' pos %d\n", w.c_str(), pos));
96
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
97
97
98
    unsigned int l = w.length();
98
    unsigned int l = w.length();
99
    if (l > 0 && l < (unsigned)maxWordLength) {
99
    if (l > 0 && l < (unsigned)maxWordLength) {
100
    // 1 char word: we index single letters and digits, but
100
    // 1 char word: we index single letters and digits, but
101
    // nothing else. We might want to turn this into a test for a single
101
    // nothing else. We might want to turn this into a test for a single
...
...
105
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
105
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
106
        //cerr << "ERASING single letter term " << c << endl;
106
        //cerr << "ERASING single letter term " << c << endl;
107
        return true;
107
        return true;
108
        }
108
        }
109
    }
109
    }
110
  if (pos != prevpos || l != prevterm.length() || w != prevterm) {
110
  if (pos != prevpos || l != prevlen) {
111
        bool ret = cb->takeword(w, pos, btstart, btend);
111
        bool ret = cb->takeword(w, pos, btstart, btend);
112
      prevterm = w;
112
      prevlen = w.length();
113
        prevpos = pos;
113
        prevpos = pos;
114
        return ret;
114
        return ret;
115
    }
115
    }
116
  LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
116
    }
117
    }
117
    return true;
118
    return true;
118
}
119
}
119
120
120
/**
121
/**
...
...
135
 * @param spanerase Set if the current span is at its end. Reset it.
136
 * @param spanerase Set if the current span is at its end. Reset it.
136
 * @param bp        The current BYTE position in the stream
137
 * @param bp        The current BYTE position in the stream
137
 */
138
 */
138
inline bool TextSplit::doemit(bool spanerase, int bp)
139
inline bool TextSplit::doemit(bool spanerase, int bp)
139
{
140
{
140
#if 0
141
    LOGDEB3(("TextSplit::doemit: wrd [%s] wp %d spn [%s] sp %d spe %d bp %d\n",
141
    cerr << "doemit: " << "w: '" << word << "' wp: "<< wordpos << " s: '" <<
142
      word.c_str(), wordpos, span.c_str(), spanpos, spanerase, bp));
142
  span << "' sp: " << spanpos << " spe: " << spanerase << " bp: " << bp 
143
   << endl;
144
#endif
145
143
146
    // Emit span. When splitting for query, we only emit final spans
144
    // Emit span. When splitting for query, we only emit final spans
147
    bool spanemitted = false;
145
    bool spanemitted = false;
148
    if (spanerase && !(m_flags & TXTS_NOSPANS)) {
146
    if (spanerase && !(m_flags & TXTS_NOSPANS)) {
149
    // Maybe trim at end These are chars that we would keep inside 
147
    // Maybe trim at end These are chars that we would keep inside 
...
...
212
    setcharclasses();
210
    setcharclasses();
213
211
214
    span.erase();
212
    span.erase();
215
    word.erase(); // Current word: no punctuation at all in there
213
    word.erase(); // Current word: no punctuation at all in there
216
    number = false;
214
    number = false;
217
    prevpos = wordpos = spanpos = charpos = 0;
215
    prevpos = prevlen = wordpos = spanpos = charpos = 0;
218
    prevterm.erase();
219
216
220
    Utf8Iter it(in);
217
    Utf8Iter it(in);
221
218
222
    for (; !it.eof(); it++, charpos++) {
219
    for (; !it.eof(); it++, charpos++) {
223
    unsigned int c = *it;
220
    unsigned int c = *it;
...
...
226
        return false;
223
        return false;
227
    }
224
    }
228
    int cc = whatcc(c);
225
    int cc = whatcc(c);
229
    switch (cc) {
226
    switch (cc) {
230
    case LETTER:
227
    case LETTER:
231
      word += it;
228
      it.appendchartostring(word);
232
      span += it;
229
      it.appendchartostring(span);
233
        break;
230
        break;
234
231
235
    case DIGIT:
232
    case DIGIT:
236
        if (word.length() == 0)
233
        if (word.length() == 0)
237
        number = true;
234
        number = true;
238
      word += it;
235
      it.appendchartostring(word);
239
      span += it;
236
      it.appendchartostring(span);
240
        break;
237
        break;
241
238
242
    case SPACE:
239
    case SPACE:
243
    SPACE:
240
    SPACE:
244
        if (word.length() || span.length()) {
241
        if (word.length() || span.length()) {
...
...
250
    case '-':
247
    case '-':
251
    case '+':
248
    case '+':
252
        if (word.length() == 0) {
249
        if (word.length() == 0) {
253
        if (whatcc(it[charpos+1]) == DIGIT) {
250
        if (whatcc(it[charpos+1]) == DIGIT) {
254
            number = true;
251
            number = true;
255
          word += it;
252
          it.appendchartostring(word);
256
          span += it;
253
          it.appendchartostring(span);
257
        } else
254
        } else
258
          span += it;
255
          it.appendchartostring(span);
259
        } else {
256
        } else {
260
        if (!doemit(false, it.getBpos()))
257
        if (!doemit(false, it.getBpos()))
261
            return false;
258
            return false;
262
        number = false;
259
        number = false;
263
      span += it;
260
      it.appendchartostring(span);
264
        }
261
        }
265
        break;
262
        break;
266
    case '.':
263
    case '.':
267
    case ',':
264
    case ',':
268
        if (number) {
265
        if (number) {
269
        // 132.jpg ?
266
        // 132.jpg ?
270
        if (whatcc(it[charpos+1]) != DIGIT)
267
        if (whatcc(it[charpos+1]) != DIGIT)
271
            goto SPACE;
268
            goto SPACE;
272
      word += it;
269
      it.appendchartostring(word);
273
      span += it;
270
      it.appendchartostring(span);
274
        break;
271
        break;
275
        } else {
272
        } else {
276
        // If . inside a word, keep it, else, this is whitespace. 
273
        // If . inside a word, keep it, else, this is whitespace. 
277
        // We also keep an initial '.' for catching .net, but this adds
274
        // We also keep an initial '.' for catching .net, but this adds
278
        // quite a few spurious terms !
275
        // quite a few spurious terms !
...
...
284
            if (!doemit(false, it.getBpos()))
281
            if (!doemit(false, it.getBpos()))
285
                return false;
282
                return false;
286
            // span length could have been adjusted by trimming
283
            // span length could have been adjusted by trimming
287
            // inside doemit
284
            // inside doemit
288
            if (span.length())
285
            if (span.length())
289
              span += it;
286
              it.appendchartostring(span);
290
            break;
287
            break;
291
            } else {
288
            } else {
292
          span += it;
289
          it.appendchartostring(span);
293
            break;
290
            break;
294
            }
291
            }
295
        }
292
        }
296
        }
293
        }
297
        goto SPACE;
294
        goto SPACE;
...
...
300
        if (word.length()) {
297
        if (word.length()) {
301
        if (!doemit(false, it.getBpos()))
298
        if (!doemit(false, it.getBpos()))
302
            return false;
299
            return false;
303
        number = false;
300
        number = false;
304
        }
301
        }
305
      span += it;
302
      it.appendchartostring(span);
306
        break;
303
        break;
307
    case '\'':
304
    case '\'':
308
        // If in word, potential span: o'brien, else, this is more 
305
        // If in word, potential span: o'brien, else, this is more 
309
        // whitespace
306
        // whitespace
310
        if (word.length()) {
307
        if (word.length()) {
311
        if (!doemit(false, it.getBpos()))
308
        if (!doemit(false, it.getBpos()))
312
            return false;
309
            return false;
313
        number = false;
310
        number = false;
314
      span += it;
311
      it.appendchartostring(span);
315
        }
312
        }
316
        break;
313
        break;
317
    case '#': 
314
    case '#': 
318
        // Keep it only at end of word... Special case for c# you see...
315
        // Keep it only at end of word... Special case for c# you see...
319
        if (word.length() > 0) {
316
        if (word.length() > 0) {
320
        int w = whatcc(it[charpos+1]);
317
        int w = whatcc(it[charpos+1]);
321
        if (w == SPACE || w == '\n' || w == '\r') {
318
        if (w == SPACE || w == '\n' || w == '\r') {
322
          word += it;
319
          it.appendchartostring(word);
323
          span += it;
320
          it.appendchartostring(span);
324
            break;
321
            break;
325
        }
322
        }
326
        }
323
        }
327
        goto SPACE;
324
        goto SPACE;
328
        break;
325
        break;
...
...
341
        goto SPACE;
338
        goto SPACE;
342
        }
339
        }
343
        break;
340
        break;
344
341
345
    default:
342
    default:
346
      word += it;
343
      it.appendchartostring(word);
347
      span += it;
344
      it.appendchartostring(span);
348
        break;
345
        break;
349
    }
346
    }
350
    }
347
    }
351
    if (word.length() || span.length()) {
348
    if (word.length() || span.length()) {
352
    if (!doemit(true, it.getBpos()))
349
    if (!doemit(true, it.getBpos()))
...
...
371
using namespace std;
368
using namespace std;
372
369
373
// A small class to hold state while splitting text
370
// A small class to hold state while splitting text
374
class mySplitterCB : public TextSplitCB {
371
class mySplitterCB : public TextSplitCB {
375
    int first;
372
    int first;
373
    bool nooutput;
376
 public:
374
 public:
377
    mySplitterCB() : first(1) {}
375
    mySplitterCB() : first(1), nooutput(false) {}
378
376
    void setNoOut(bool val) {nooutput = val;}
379
    bool takeword(const std::string &term, int pos, int bs, int be) {
377
    bool takeword(const std::string &term, int pos, int bs, int be) {
378
  if (nooutput)
379
      return true;
380
    if (first) {
380
    if (first) {
381
        printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
381
        printf("%3s %-20s %4s %4s\n", "pos", "Term", "bs", "be");
382
        first = 0;
382
        first = 0;
383
    }
383
    }
384
    printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
384
    printf("%3d %-20s %4d %4d\n", pos, term.c_str(), bs, be);
...
...
404
static string teststring1 = " 124, ";
404
static string teststring1 = " 124, ";
405
405
406
static string thisprog;
406
static string thisprog;
407
407
408
static string usage =
408
static string usage =
409
    " textsplit [opts] [filename]\n"
409
      " textsplit [opts] [filename]\n"
410
      "   -S: no output\n"
410
    "   -s:  only spans\n"
411
      "   -s:  only spans\n"
411
    "   -w:  only words\n"
412
      "   -w:  only words\n"
412
    " if filename is 'stdin', will read stdin for data (end with ^D)\n"
413
    " if filename is 'stdin', will read stdin for data (end with ^D)\n"
413
    "  \n\n"
414
    "  \n\n"
414
    ;
415
    ;
415
416
416
static void
417
static void
...
...
421
}
422
}
422
423
423
static int        op_flags;
424
static int        op_flags;
424
#define OPT_s     0x1 
425
#define OPT_s     0x1 
425
#define OPT_w     0x2
426
#define OPT_w     0x2
427
#define OPT_S   0x4
426
428
427
int main(int argc, char **argv)
429
int main(int argc, char **argv)
428
{
430
{
429
    thisprog = argv[0];
431
    thisprog = argv[0];
430
    argc--; argv++;
432
    argc--; argv++;
...
...
435
        /* Cas du "adb - core" */
437
        /* Cas du "adb - core" */
436
        Usage();
438
        Usage();
437
    while (**argv)
439
    while (**argv)
438
        switch (*(*argv)++) {
440
        switch (*(*argv)++) {
439
        case 's':   op_flags |= OPT_s; break;
441
        case 's':   op_flags |= OPT_s; break;
442
      case 'S':   op_flags |= OPT_S; break;
440
        case 'w':   op_flags |= OPT_w; break;
443
        case 'w':   op_flags |= OPT_w; break;
441
        default: Usage();   break;
444
        default: Usage();   break;
442
        }
445
        }
443
    argc--; argv++;
446
    argc--; argv++;
444
    }
447
    }
445
    DebugLog::getdbl()->setloglevel(DEBDEB1);
448
    DebugLog::getdbl()->setloglevel(DEBDEB1);
446
    DebugLog::setfilename("stderr");
449
    DebugLog::setfilename("stderr");
450
447
    mySplitterCB cb;
451
    mySplitterCB cb;
448
    TextSplit::Flags flags = TextSplit::TXTS_NONE;
452
    TextSplit::Flags flags = TextSplit::TXTS_NONE;
453
454
    if (op_flags&OPT_S)
455
  cb.setNoOut(true);
456
449
    if (op_flags&OPT_s)
457
    if (op_flags&OPT_s)
450
    flags = TextSplit::TXTS_ONLYSPANS;
458
    flags = TextSplit::TXTS_ONLYSPANS;
451
    else if (op_flags&OPT_w)
459
    else if (op_flags&OPT_w)
452
    flags = TextSplit::TXTS_NOSPANS;
460
    flags = TextSplit::TXTS_NOSPANS;
453
    TextSplit splitter(&cb,  flags);
461
    TextSplit splitter(&cb,  flags);