Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.29 2007-01-25 15:40:55 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
101
             int btstart, int btend)
101
             int btstart, int btend)
102
{
102
{
103
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
103
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
104
104
105
    unsigned int l = w.length();
105
    unsigned int l = w.length();
106
    if (l > 0 && l < (unsigned)maxWordLength) {
106
    if (l > 0 && l < (unsigned)m_maxWordLength) {
107
    // 1 char word: we index single letters and digits, but
107
    // 1 char word: we index single letters and digits, but
108
    // nothing else. We might want to turn this into a test for a single
108
    // nothing else. We might want to turn this into a test for a single
109
    // utf8 character instead.
109
    // utf8 character instead.
110
    if (l == 1) {
110
    if (l == 1) {
111
        int c = (int)w[0];
111
        int c = (int)w[0];
112
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
112
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
113
        //cerr << "ERASING single letter term " << c << endl;
113
        //cerr << "ERASING single letter term " << c << endl;
114
        return true;
114
        return true;
115
        }
115
        }
116
    }
116
    }
117
    if (pos != prevpos || l != prevlen) {
117
    if (pos != m_prevpos || l != m_prevlen) {
118
        bool ret = cb->takeword(w, pos, btstart, btend);
118
        bool ret = m_cb->takeword(w, pos, btstart, btend);
119
      m_prevpos = pos;
119
        prevlen = w.length();
120
        m_prevlen = w.length();
120
      prevpos = pos;
121
        return ret;
121
        return ret;
122
    }
122
    }
123
    LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
123
    LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
124
    }
124
    }
125
    return true;
125
    return true;
...
...
144
         span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
144
         span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
145
145
146
    // Emit span. When splitting for query, we only emit final spans
146
    // Emit span. When splitting for query, we only emit final spans
147
    bool spanemitted = false;
147
    bool spanemitted = false;
148
    if (spanerase && !(m_flags & TXTS_NOSPANS)) {
148
    if (spanerase && !(m_flags & TXTS_NOSPANS)) {
149
    // Maybe trim at end These are chars that we would keep inside 
149
    // Maybe trim at end. These are chars that we would keep inside 
150
    // a span, but not at the end
150
    // a span, but not at the end
151
    while (span.length() > 0) {
151
    while (m_span.length() > 0) {
152
        switch (span[span.length()-1]) {
152
        switch (m_span[m_span.length()-1]) {
153
        case '.':
153
        case '.':
154
        case ',':
154
        case ',':
155
        case '@':
155
        case '@':
156
        case '\'':
156
        case '\'':
157
        span.resize(span.length()-1);
157
        m_span.resize(m_span.length()-1);
158
        if (--bp < 0) 
158
        if (--bp < 0) 
159
            bp=0;
159
            bp = 0;
160
        break;
160
        break;
161
        default:
161
        default:
162
        goto breakloop1;
162
        goto breakloop1;
163
        }
163
        }
164
    }
164
    }
165
    breakloop1:
165
    breakloop1:
166
    spanemitted = true;
166
    spanemitted = true;
167
    if (!emitterm(true, span, spanpos, bp-span.length(), bp))
167
    if (!emitterm(true, m_span, m_spanpos, bp - m_span.length(), bp))
168
        return false;
168
        return false;
169
    }
169
    }
170
170
171
    // Emit word if different from span and not 'no words' mode
171
    // Emit word if different from span and not 'no words' mode
172
    if (!(m_flags & TXTS_ONLYSPANS) && wordLen && 
172
    if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen && 
173
    (!spanemitted || wordLen != span.length())) {
173
    (!spanemitted || m_wordLen != m_span.length())) {
174
    string s(span.substr(wordStart, wordLen));
174
    string s(m_span.substr(m_wordStart, m_wordLen));
175
    if (!emitterm(false, s, wordpos, bp-wordLen, bp))
175
    if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
176
        return false;
176
        return false;
177
    }
177
    }
178
178
179
    // Adjust state
179
    // Adjust state
180
    wordpos++;
180
    m_wordpos++;
181
    wordLen = 0;
181
    m_wordLen = 0;
182
    if (spanerase) {
182
    if (spanerase) {
183
    span.erase();
183
    m_span.erase();
184
    spanpos = wordpos;
184
    m_spanpos = m_wordpos;
185
    wordStart = 0;
185
    m_wordStart = 0;
186
    } else {
186
    } else {
187
    wordStart = span.length();
187
    m_wordStart = m_span.length();
188
    }
188
    }
189
189
190
    return true;
190
    return true;
191
}
191
}
192
192
...
...
213
    LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb, 
213
    LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb, 
214
        in.substr(0,50).c_str()));
214
        in.substr(0,50).c_str()));
215
215
216
    setcharclasses();
216
    setcharclasses();
217
217
218
    span.erase();
218
    m_span.erase();
219
    number = false;
219
    m_inNumber = false;
220
    wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0;
220
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
221
221
222
    Utf8Iter it(in);
222
    Utf8Iter it(in);
223
223
224
    for (; !it.eof(); it++) {
224
    for (; !it.eof(); it++) {
225
    unsigned int c = *it;
225
    unsigned int c = *it;
...
...
229
        return false;
229
        return false;
230
    }
230
    }
231
    int cc = whatcc(c);
231
    int cc = whatcc(c);
232
    switch (cc) {
232
    switch (cc) {
233
    case LETTER:
233
    case LETTER:
234
        wordLen += it.appendchartostring(span);
234
        m_wordLen += it.appendchartostring(m_span);
235
        break;
235
        break;
236
236
237
    case DIGIT:
237
    case DIGIT:
238
        if (wordLen == 0)
238
        if (m_wordLen == 0)
239
        number = true;
239
        m_inNumber = true;
240
        wordLen += it.appendchartostring(span);
240
        m_wordLen += it.appendchartostring(m_span);
241
        break;
241
        break;
242
242
243
    case SPACE:
243
    case SPACE:
244
    SPACE:
244
    SPACE:
245
        if (wordLen || span.length()) {
245
        if (m_wordLen || m_span.length()) {
246
        if (!doemit(true, it.getBpos()))
246
        if (!doemit(true, it.getBpos()))
247
            return false;
247
            return false;
248
        number = false;
248
        m_inNumber = false;
249
        }
249
        }
250
        break;
250
        break;
251
    case WILD:
251
    case WILD:
252
        if (m_flags & TXTS_KEEPWILD)
252
        if (m_flags & TXTS_KEEPWILD)
253
        goto NORMALCHAR;
253
        goto NORMALCHAR;
254
        else
254
        else
255
        goto SPACE;
255
        goto SPACE;
256
        break;
256
        break;
257
    case '-':
257
    case '-':
258
    case '+':
258
    case '+':
259
        if (wordLen == 0) {
259
        if (m_wordLen == 0) {
260
        if (whatcc(it[it.getCpos()+1]) == DIGIT) {
260
        if (whatcc(it[it.getCpos()+1]) == DIGIT) {
261
            number = true;
261
            m_inNumber = true;
262
            wordLen += it.appendchartostring(span);
262
            m_wordLen += it.appendchartostring(m_span);
263
        } else {
263
        } else {
264
            wordStart += it.appendchartostring(span);
264
            m_wordStart += it.appendchartostring(m_span);
265
        }
265
        }
266
        } else {
266
        } else {
267
        if (!doemit(false, it.getBpos()))
267
        if (!doemit(false, it.getBpos()))
268
            return false;
268
            return false;
269
        number = false;
269
        m_inNumber = false;
270
        wordStart += it.appendchartostring(span);
270
        m_wordStart += it.appendchartostring(m_span);
271
        }
271
        }
272
        break;
272
        break;
273
    case '.':
273
    case '.':
274
    case ',':
274
    case ',':
275
        if (number) {
275
        if (m_inNumber) {
276
        // 132.jpg ?
276
        // 132.jpg ?
277
        if (whatcc(it[it.getCpos()+1]) != DIGIT)
277
        if (whatcc(it[it.getCpos()+1]) != DIGIT)
278
            goto SPACE;
278
            goto SPACE;
279
        wordLen += it.appendchartostring(span);
279
        m_wordLen += it.appendchartostring(m_span);
280
        break;
280
        break;
281
        } else {
281
        } else {
282
        // If . inside a word, keep it, else, this is whitespace. 
282
        // If . inside a word, keep it, else, this is whitespace. 
283
        // We also keep an initial '.' for catching .net, but this adds
283
        // We also keep an initial '.' for catching .net, but this adds
284
        // quite a few spurious terms !
284
        // quite a few spurious terms !
285
                // Another problem is that something like .x-errs 
285
                // Another problem is that something like .x-errs 
286
        // will be split as .x-errs, x, errs but not x-errs
286
        // will be split as .x-errs, x, errs but not x-errs
287
        // A final comma in a word will be removed by doemit
287
        // A final comma in a word will be removed by doemit
288
        if (cc == '.') {
288
        if (cc == '.') {
289
            if (wordLen) {
289
            if (m_wordLen) {
290
            if (!doemit(false, it.getBpos()))
290
            if (!doemit(false, it.getBpos()))
291
                return false;
291
                return false;
292
            // span length could have been adjusted by trimming
292
            // span length could have been adjusted by trimming
293
            // inside doemit
293
            // inside doemit
294
            if (span.length())
294
            if (m_span.length())
295
                wordStart += it.appendchartostring(span);
295
                m_wordStart += it.appendchartostring(m_span);
296
            break;
296
            break;
297
            } else {
297
            } else {
298
            wordStart += it.appendchartostring(span);
298
            m_wordStart += it.appendchartostring(m_span);
299
            break;
299
            break;
300
            }
300
            }
301
        }
301
        }
302
        }
302
        }
303
        goto SPACE;
303
        goto SPACE;
304
        break;
304
        break;
305
    case '@':
305
    case '@':
306
        if (wordLen) {
306
        if (m_wordLen) {
307
        if (!doemit(false, it.getBpos()))
307
        if (!doemit(false, it.getBpos()))
308
            return false;
308
            return false;
309
        number = false;
309
        m_inNumber = false;
310
        }
310
        }
311
        wordStart += it.appendchartostring(span);
311
        m_wordStart += it.appendchartostring(m_span);
312
        break;
312
        break;
313
    case '\'':
313
    case '\'':
314
        // If in word, potential span: o'brien, else, this is more 
314
        // If in word, potential span: o'brien, else, this is more 
315
        // whitespace
315
        // whitespace
316
        if (wordLen) {
316
        if (m_wordLen) {
317
        if (!doemit(false, it.getBpos()))
317
        if (!doemit(false, it.getBpos()))
318
            return false;
318
            return false;
319
        number = false;
319
        m_inNumber = false;
320
        wordStart += it.appendchartostring(span);
320
        m_wordStart += it.appendchartostring(m_span);
321
        }
321
        }
322
        break;
322
        break;
323
    case '#': 
323
    case '#': 
324
        // Keep it only at end of word ... Special case for c# you see...
324
        // Keep it only at end of word ... Special case for c# you see...
325
        if (wordLen > 0) {
325
        if (m_wordLen > 0) {
326
        int w = whatcc(it[it.getCpos()+1]);
326
        int w = whatcc(it[it.getCpos()+1]);
327
        if (w == SPACE || w == '\n' || w == '\r') {
327
        if (w == SPACE || w == '\n' || w == '\r') {
328
            wordLen += it.appendchartostring(span);
328
            m_wordLen += it.appendchartostring(m_span);
329
            break;
329
            break;
330
        }
330
        }
331
        }
331
        }
332
        goto SPACE;
332
        goto SPACE;
333
        break;
333
        break;
334
    case '\n':
334
    case '\n':
335
    case '\r':
335
    case '\r':
336
        if (span.length() && span[span.length() - 1] == '-') {
336
        if (m_span.length() && m_span[m_span.length() - 1] == '-') {
337
        // if '-' is the last char before end of line, just
337
        // if '-' is the last char before end of line, just
338
        // ignore the line change. This is the right thing to
338
        // ignore the line change. This is the right thing to
339
        // do almost always. We'd then need a way to check if
339
        // do almost always. We'd then need a way to check if
340
        // the - was added as part of the word hyphenation, or was 
340
        // the - was added as part of the word hyphenation, or was 
341
        // there in the first place, but this would need a dictionary.
341
        // there in the first place, but this would need a dictionary.
...
...
347
        }
347
        }
348
        break;
348
        break;
349
349
350
    default:
350
    default:
351
    NORMALCHAR:
351
    NORMALCHAR:
352
        wordLen += it.appendchartostring(span);
352
        m_wordLen += it.appendchartostring(m_span);
353
        break;
353
        break;
354
    }
354
    }
355
    }
355
    }
356
    if (wordLen || span.length()) {
356
    if (m_wordLen || m_span.length()) {
357
    if (!doemit(true, it.getBpos()))
357
    if (!doemit(true, it.getBpos()))
358
        return false;
358
        return false;
359
    }
359
    }
360
    return true;
360
    return true;
361
}
361
}