Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
...
...
206
 * @param bp        The current BYTE position in the stream
206
 * @param bp        The current BYTE position in the stream
207
 * @param spanemit  This is set for intermediate spans: glue char changed.
207
 * @param spanemit  This is set for intermediate spans: glue char changed.
208
 */
208
 */
209
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
209
inline bool TextSplit::doemit(bool spanerase, int bp, bool spanemit)
210
{
210
{
211
    LOGDEB3(("TextSplit::doemit:spn [%s] sp %d wrdS %d wrdL %d spe %d bp %d "
211
    LOGDEB3(("TextSplit::doemit: sper %d bp %d spem %d. spp %d wS %d wL %d "
212
             "innum %d\n", m_span.c_str(), m_spanpos, m_wordStart, 
212
      "inn %d span [%s]\n",
213
             m_wordLen, spanerase, bp, m_inNumber));
213
      spanerase, bp, spanemit, m_spanpos, m_wordStart, m_wordLen,
214
      m_inNumber, m_span.c_str()));
214
215
215
    // Emit span. When splitting for query, we only emit final spans
216
    // Emit span? When splitting for query, we only emit final spans
217
    // (spanerase)
216
    bool spanemitted = false;
218
    bool spanemitted = false;
217
    if (!(m_flags & TXTS_NOSPANS) && 
219
    if (!(m_flags & TXTS_NOSPANS) && 
218
        !((m_wordLen == m_span.length()) && 
220
        !((m_wordLen == m_span.length()) && 
219
          (o_noNumbers) && m_inNumber) &&
221
          (o_noNumbers) && m_inNumber) &&
220
    ((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
222
    ((spanemit && !(m_flags & TXTS_ONLYSPANS)) || spanerase) ) {
221
    // Maybe trim at end. These are chars that we would keep inside 
223
    // Maybe trim at end. These are chars that we would keep inside 
222
    // a span, but not at the end
224
    // a span, but not at the end
223
    while (m_span.length() > 0) {
225
    while (m_span.length() > 0) {
224
        switch (m_span[m_span.length()-1]) {
226
        switch (m_span[m_span.length()-1]) {
225
        case '.':
227
        case '.':
228
      case '-':
226
        case ',':
229
        case ',':
227
        case '@':
230
        case '@':
228
        case '\'':
231
        case '\'':
229
        m_span.resize(m_span.length()-1);
232
        m_span.resize(m_span.length()-1);
230
        if (--bp < 0) 
233
        if (--bp < 0) 
...
...
248
    if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
251
    if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
249
        return false;
252
        return false;
250
    }
253
    }
251
254
252
    // Adjust state
255
    // Adjust state
256
    if (m_wordLen) {
253
    m_wordpos++;
257
  m_wordpos++;
254
    m_wordLen = 0;
258
  m_wordLen = 0;
259
    }
255
    if (spanerase) {
260
    if (spanerase) {
256
  m_span.erase();
261
  discardspan();
257
  m_spanpos = m_wordpos;
258
  m_wordStart = 0;
259
    } else {
262
    } else {
260
    m_wordStart = m_span.length();
263
    m_wordStart = m_span.length();
261
    }
264
    }
262
265
263
    return true;
266
    return true;
267
}
268
269
/**
 * Discard the span currently being built.
 * Clears the span text, sets the span position to the current word
 * position, and zeroes both the word start offset and the word length.
 * Called from doemit() when spanerase is set, and from the '+' handling
 * in the splitting loop.
 */
void TextSplit::discardspan()
270
{
271
    m_span.erase();
272
    m_spanpos = m_wordpos;
273
    m_wordStart = 0;
274
    m_wordLen = 0;
264
}
275
}
265
276
266
/** 
277
/** 
267
 * Splitting a text into terms to be indexed.
278
 * Splitting a text into terms to be indexed.
268
 * We basically emit a word every time we see a separator, but some chars are
279
 * We basically emit a word every time we see a separator, but some chars are
...
...
281
    m_span.erase();
292
    m_span.erase();
282
    m_inNumber = false;
293
    m_inNumber = false;
283
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
294
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
284
    int curspanglue = 0;
295
    int curspanglue = 0;
285
296
297
    // Running count of consecutive non-alphanum chars: incremented for every
    // char read, and reset to 0 in the DIGIT, SPACE and NORMALCHAR cases.
298
    int nonalnumcnt = 0;
299
286
    Utf8Iter it(in);
300
    Utf8Iter it(in);
287
301
288
    for (; !it.eof(); it++) {
302
    for (; !it.eof(); it++) {
289
    unsigned int c = *it;
303
    unsigned int c = *it;
304
  nonalnumcnt++;
290
305
291
    if (c == (unsigned int)-1) {
306
    if (c == (unsigned int)-1) {
292
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
307
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
293
        return false;
308
        return false;
294
    }
309
    }
...
...
317
    switch (cc) {
332
    switch (cc) {
318
    case DIGIT:
333
    case DIGIT:
319
        if (m_wordLen == 0)
334
        if (m_wordLen == 0)
320
        m_inNumber = true;
335
        m_inNumber = true;
321
        m_wordLen += it.appendchartostring(m_span);
336
        m_wordLen += it.appendchartostring(m_span);
337
      nonalnumcnt = 0;
322
        break;
338
        break;
323
339
324
    case SPACE:
340
    case SPACE:
325
    SPACE:
341
    SPACE:
326
        curspanglue = 0;
342
        curspanglue = 0;
343
      nonalnumcnt = 0;
327
        if (m_wordLen || m_span.length()) {
344
        if (m_wordLen || m_span.length()) {
328
        if (!doemit(true, it.getBpos()))
345
        if (!doemit(true, it.getBpos()))
329
            return false;
346
            return false;
330
        m_inNumber = false;
347
        m_inNumber = false;
331
        }
348
        }
...
...
336
        else
353
        else
337
        goto SPACE;
354
        goto SPACE;
338
        break;
355
        break;
339
    case '-':
356
    case '-':
340
    case '+':
357
    case '+':
358
      curspanglue = cc;
341
        if (m_wordLen == 0 || 
359
        if (m_wordLen == 0) {
360
      if (cc == '-') {
361
          if (whatcc(it[it.getCpos()+1]) == DIGIT) {
362
          // -10
363
          m_inNumber = true;
364
          m_wordLen += it.appendchartostring(m_span);
365
          } else {
366
          goto SPACE;
367
          } 
368
      } else {
369
          if (nonalnumcnt > 2) {
370
          discardspan();
371
          } else {
372
          m_wordStart += it.appendchartostring(m_span);
373
          }
374
      }
342
                (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
375
      } else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
343
                                m_span[m_span.length() - 1] == 'E'))) {
376
                    m_span[m_span.length() - 1] == 'E')) {
344
        if (whatcc(it[it.getCpos()+1]) == DIGIT) {
377
        if (whatcc(it[it.getCpos()+1]) == DIGIT) {
345
          m_inNumber = true;
346
            m_wordLen += it.appendchartostring(m_span);
378
            m_wordLen += it.appendchartostring(m_span);
347
        } else {
379
        } else {
348
          m_wordStart += it.appendchartostring(m_span);
380
          goto SPACE;
349
        }
381
        }
350
      curspanglue = cc;
351
        } else {
382
        } else {
352
        if (!doemit(false, it.getBpos()))
383
        if (!doemit(false, it.getBpos()))
353
            return false;
384
            return false;
354
      curspanglue = cc;
355
        m_inNumber = false;
385
        m_inNumber = false;
356
        m_wordStart += it.appendchartostring(m_span);
386
        m_wordStart += it.appendchartostring(m_span);
357
        }
387
        }
358
        break;
388
        break;
359
    case '.':
389
    case '.':
...
...
365
            goto SPACE;
395
            goto SPACE;
366
        m_wordLen += it.appendchartostring(m_span);
396
        m_wordLen += it.appendchartostring(m_span);
367
        curspanglue = cc;
397
        curspanglue = cc;
368
        break;
398
        break;
369
        } else {
399
        } else {
370
        // If . inside a word, keep it, else, this is whitespace. 
400
        // If . inside a word, it's spanglue, else, it's whitespace. 
371
        // We also keep an initial '.' for catching .net, but this adds
401
        // We also keep an initial '.' for catching .net, but this adds
372
        // quite a few spurious terms !
402
        // quite a few spurious terms !
373
                // Another problem is that something like .x-errs 
403
                // Another problem is that something like .x-errs 
374
        // will be split as .x-errs, x, errs but not x-errs
404
        // will be split as .x-errs, x, errs but not x-errs
375
        // A final comma in a word will be removed by doemit
405
        // A final comma in a word will be removed by doemit
376
      if (cc == '.') {
406
      if (cc == '.' && it[it.getCpos()+1] != '.') {
377
                    // Check for number like .1
407
                    // Check for number like .1
378
                    if (m_span.length() == 0 &&
408
                    if (m_span.length() == 0 &&
379
                        whatcc(it[it.getCpos()+1]) == DIGIT) {
409
                        whatcc(it[it.getCpos()+1]) == DIGIT) {
380
                        m_inNumber = true;
410
                        m_inNumber = true;
381
                        m_wordLen += it.appendchartostring(m_span);
411
                        m_wordLen += it.appendchartostring(m_span);
...
...
384
                    }
414
                    }
385
                            
415
                            
386
            if (m_wordLen) {
416
            if (m_wordLen) {
387
            // Disputable special case: set spanemit to
417
            // Disputable special case: set spanemit to
388
            // true when encountering a '.' while spanglue
418
            // true when encountering a '.' while spanglue
389
            // is '_'. Think of a_b.c Done because to
419
            // is '_'. Think of a_b.c Done to
390
            // avoid breaking stuff after changing '_'
420
            // avoid breaking stuff after changing '_'
391
            // from wordchar to spanglue
421
            // from wordchar to spanglue
392
            if (!doemit(false, it.getBpos(), curspanglue == '_'))
422
            if (!doemit(false, it.getBpos(), curspanglue == '_'))
393
                return false;
423
                return false;
394
            curspanglue = cc;
424
            curspanglue = cc;
...
...
507
    NORMALCHAR:
537
    NORMALCHAR:
508
            if (m_inNumber && c != 'e' && c != 'E') {
538
            if (m_inNumber && c != 'e' && c != 'E') {
509
                m_inNumber = false;
539
                m_inNumber = false;
510
            }
540
            }
511
        m_wordLen += it.appendchartostring(m_span);
541
        m_wordLen += it.appendchartostring(m_span);
542
      nonalnumcnt = 0;
512
        break;
543
        break;
513
    }
544
    }
514
    }
545
    }
515
    if (m_wordLen || m_span.length()) {
546
    if (m_wordLen || m_span.length()) {
516
    if (!doemit(true, it.getBpos()))
547
    if (!doemit(true, it.getBpos()))