Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.34 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.35 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
139
     || ((p) >= 0xFE30 && (p) <= 0xFE4F)                                \
139
     || ((p) >= 0xFE30 && (p) <= 0xFE4F)                                \
140
     || ((p) >= 0xFF00 && (p) <= 0xFFEF)                                \
140
     || ((p) >= 0xFF00 && (p) <= 0xFFEF)                                \
141
     || ((p) >= 0x20000 && (p) <= 0x2A6DF)                              \
141
     || ((p) >= 0x20000 && (p) <= 0x2A6DF)                              \
142
     || ((p) >= 0x2F800 && (p) <= 0x2FA1F))
142
     || ((p) >= 0x2F800 && (p) <= 0x2FA1F))
143
143
144
bool TextSplit::t_processCJK = true;
144
bool          TextSplit::o_processCJK = true;
145
unsigned int  TextSplit::o_CJKNgramLen = 2;
145
146
146
// Do some checking (the kind which is simpler to do here than in the
147
// Do some checking (the kind which is simpler to do here than in the
147
// main loop), then send term to our client.
148
// main loop), then send term to our client.
148
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
149
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
149
             int btstart, int btend)
150
             int btstart, int btend)
...
...
244
 * handled specially so that special cases, e.g. c++ and dockes@okyz.com etc., 
245
 * handled specially so that special cases, e.g. c++ and dockes@okyz.com etc., 
245
 * are handled properly,
246
 * are handled properly,
246
 */
247
 */
247
bool TextSplit::text_to_words(const string &in)
248
bool TextSplit::text_to_words(const string &in)
248
{
249
{
249
    LOGDEB1(("TextSplit::text_to_words: docjk %d %s%s%s [%s]\n", 
250
    LOGDEB1(("TextSplit::text_to_words: docjk %d (%d) %s%s%s [%s]\n", 
250
      t_processCJK,
251
       o_processCJK, o_CJKNgramLen,
251
        m_flags & TXTS_NOSPANS ? " nospans" : "",
252
         m_flags & TXTS_NOSPANS ? " nospans" : "",
252
        m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
253
         m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
253
        m_flags & TXTS_KEEPWILD ? " keepwild" : "",
254
         m_flags & TXTS_KEEPWILD ? " keepwild" : "",
254
        in.substr(0,50).c_str()));
255
         in.substr(0,50).c_str()));
255
256
256
    setcharclasses();
257
    setcharclasses();
257
258
258
    m_span.erase();
259
    m_span.erase();
259
    m_inNumber = false;
260
    m_inNumber = false;
...
...
267
    if (c == (unsigned int)-1) {
268
    if (c == (unsigned int)-1) {
268
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
269
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
269
        return false;
270
        return false;
270
    }
271
    }
271
272
272
    if (t_processCJK && UNICODE_IS_CJK(c)) {
273
    if (o_processCJK && UNICODE_IS_CJK(c)) {
273
        // CJK character hit. 
274
        // CJK character hit. 
274
        // Do like at EOF with the current non-cjk data.
275
        // Do like at EOF with the current non-cjk data.
275
        if (m_wordLen || m_span.length()) {
276
        if (m_wordLen || m_span.length()) {
276
        if (!doemit(true, it.getBpos()))
277
        if (!doemit(true, it.getBpos()))
277
            return false;
278
            return false;
...
...
419
        return false;
420
        return false;
420
    }
421
    }
421
    return true;
422
    return true;
422
}
423
}
423
424
424
const unsigned int ngramlen = 2;
425
#define MAXNGRAMLEN 5
426
427
// Using an utf8iter pointer just to avoid needing its definition in
425
// Using an utf8iter pointer just to avoid needing its definition in
428
// textsplit.h
426
// textsplit.h
429
//
427
//
430
// We output ngrams: for example, for char input a b c and ngramlen == 2,
428
// We output ngrams: for example, for char input a b c and ngramlen == 2,
431
// we generate: a ab b bc c as words
429
// we generate: a ab b bc c as words
...
...
440
    LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
438
    LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
441
    Utf8Iter &it = *itp;
439
    Utf8Iter &it = *itp;
442
440
443
    // We use an offset buffer to remember the starts of the utf-8
441
    // We use an offset buffer to remember the starts of the utf-8
444
    // characters which we still need to use.
442
    // characters which we still need to use.
445
    // Fixed size array. ngramlen over 3 doesn't make sense.
443
    assert(o_CJKNgramLen < o_CJKMaxNgramLen);
446
    assert(ngramlen < MAXNGRAMLEN);
444
    unsigned int boffs[o_CJKMaxNgramLen+1];
447
    unsigned int boffs[MAXNGRAMLEN];
448
445
449
    // Current number of valid offsets;
446
    // Current number of valid offsets;
450
    unsigned int nchars = 0;
447
    unsigned int nchars = 0;
451
    unsigned int c = 0;
448
    unsigned int c = 0;
452
    for (; !it.eof(); it++) {
449
    for (; !it.eof(); it++) {
...
...
454
    if (!UNICODE_IS_CJK(c)) {
451
    if (!UNICODE_IS_CJK(c)) {
455
        // Return to normal handler
452
        // Return to normal handler
456
        break;
453
        break;
457
    }
454
    }
458
455
459
    if (nchars == ngramlen) {
456
    if (nchars == o_CJKNgramLen) {
460
        // Offset buffer full, shift it. Might be more efficient
457
        // Offset buffer full, shift it. Might be more efficient
461
        // to have a circular one, but things are complicated
458
        // to have a circular one, but things are complicated
462
        // enough already...
459
        // enough already...
463
        for (unsigned int i = 0; i < nchars-1; i++) {
460
        for (unsigned int i = 0; i < nchars-1; i++) {
464
        boffs[i] = boffs[i+1];
461
        boffs[i] = boffs[i+1];
...
...
471
    boffs[nchars-1] = it.getBpos();
468
    boffs[nchars-1] = it.getBpos();
472
469
473
    // Output all new ngrams: they begin at each existing position
470
    // Output all new ngrams: they begin at each existing position
474
    // and end after the new character. onlyspans->only output
471
    // and end after the new character. onlyspans->only output
475
    // maximum words, nospans=> single chars
472
    // maximum words, nospans=> single chars
476
    if (!(m_flags & TXTS_ONLYSPANS) || nchars == ngramlen) {
473
    if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
477
        unsigned int btend = it.getBpos() + it.getBlen();
474
        unsigned int btend = it.getBpos() + it.getBlen();
478
        unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
475
        unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
479
        unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
476
        unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
480
        for (unsigned int i = loopbeg; i < loopend; i++) {
477
        for (unsigned int i = loopbeg; i < loopend; i++) {
481
        if (!m_cb->takeword(it.buffer().substr(boffs[i], 
478
        if (!m_cb->takeword(it.buffer().substr(boffs[i], 
...
...
495
    m_wordpos++;
492
    m_wordpos++;
496
    }
493
    }
497
494
498
    // If onlyspans is set, there may be things to flush in the buffer
495
    // If onlyspans is set, there may be things to flush in the buffer
499
    // first
496
    // first
500
    if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != ngramlen)  {
497
    if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen)  {
501
    unsigned int btend = it.getBpos(); // Current char is out
498
    unsigned int btend = it.getBpos(); // Current char is out
502
    if (!m_cb->takeword(it.buffer().substr(boffs[0], 
499
    if (!m_cb->takeword(it.buffer().substr(boffs[0], 
503
                           btend-boffs[0]),
500
                           btend-boffs[0]),
504
                m_wordpos - nchars,
501
                m_wordpos - nchars,
505
                boffs[0], btend)) {
502
                boffs[0], btend)) {