Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
...
...
203
                int btstart, int btend)
203
                int btstart, int btend)
204
{
204
{
205
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
205
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
206
206
207
    unsigned int l = w.length();
207
    unsigned int l = w.length();
208
209
#ifdef TEXTSPLIT_STATS
210
    // Update word length statistics. Do this before we filter out
211
    // long words because stats are used to detect bad text
212
    if (!isspan || m_wordLen == m_span.length())
213
  m_stats.newsamp(m_wordChars);
214
#endif
215
208
    if (l > 0 && l < (unsigned)m_maxWordLength) {
216
    if (l > 0 && l < (unsigned)m_maxWordLength) {
209
    // 1 byte word: we index single ascii letters and digits, but
217
    // 1 byte word: we index single ascii letters and digits, but
210
    // nothing else. We might want to turn this into a test for a
218
    // nothing else. We might want to turn this into a test for a
211
    // single utf8 character instead ?
219
    // single utf8 character instead ?
212
    if (l == 1) {
220
    if (l == 1) {
...
...
314
    }
322
    }
315
323
316
    // Adjust state
324
    // Adjust state
317
    if (m_wordLen) {
325
    if (m_wordLen) {
318
    m_wordpos++;
326
    m_wordpos++;
319
  m_wordLen = 0;
327
  m_wordLen = m_wordChars = 0;
320
    }
328
    }
321
    if (spanerase) {
329
    if (spanerase) {
322
    discardspan();
330
    discardspan();
323
    } else {
331
    } else {
324
    m_wordStart = m_span.length();
332
    m_wordStart = m_span.length();
...
...
330
void TextSplit::discardspan()
338
void TextSplit::discardspan()
331
{
339
{
332
    m_span.erase();
340
    m_span.erase();
333
    m_spanpos = m_wordpos;
341
    m_spanpos = m_wordpos;
334
    m_wordStart = 0;
342
    m_wordStart = 0;
335
    m_wordLen = 0;
343
    m_wordLen = m_wordChars = 0;
336
}
344
}
337
345
338
static inline bool isalphanum(int what, unsigned int flgs)
346
static inline bool isalphanum(int what, unsigned int flgs)
339
{
347
{
340
    return what == A_LLETTER || what == A_ULETTER ||
348
    return what == A_LLETTER || what == A_ULETTER ||
...
...
343
}
351
}
344
static inline bool isdigit(int what, unsigned int flgs)
352
static inline bool isdigit(int what, unsigned int flgs)
345
{
353
{
346
    return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
354
    return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
347
}
355
}
356
357
#ifdef TEXTSPLIT_STATS
358
#define INC_WORDCHARS ++m_wordChars
359
#else
360
#define INC_WORDCHARS
361
#endif
348
362
349
/** 
363
/** 
350
 * Splitting a text into terms to be indexed.
364
 * Splitting a text into terms to be indexed.
351
 * We basically emit a word every time we see a separator, but some chars are
365
 * We basically emit a word every time we see a separator, but some chars are
352
 * handled specially so that special cases, e.g., c++ and jfd@recoll.com etc, 
366
 * handled specially so that special cases, e.g., c++ and jfd@recoll.com etc, 
...
...
364
    if (in.empty())
378
    if (in.empty())
365
    return true;
379
    return true;
366
380
367
    m_span.erase();
381
    m_span.erase();
368
    m_inNumber = false;
382
    m_inNumber = false;
369
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
383
    m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos 
384
  = m_spanpos = 0;
370
    int curspanglue = 0;
385
    int curspanglue = 0;
371
    bool pagepending = false;
386
    bool pagepending = false;
372
    bool softhyphenpending = false;
387
    bool softhyphenpending = false;
373
388
374
    // Running count of non-alphanum chars. Reset when we see one;
389
    // Running count of non-alphanum chars. Reset when we see one;
...
...
421
        continue;
436
        continue;
422
    case DIGIT:
437
    case DIGIT:
423
        if (m_wordLen == 0)
438
        if (m_wordLen == 0)
424
        m_inNumber = true;
439
        m_inNumber = true;
425
        m_wordLen += it.appendchartostring(m_span);
440
        m_wordLen += it.appendchartostring(m_span);
441
      INC_WORDCHARS;
426
        nonalnumcnt = 0;
442
        nonalnumcnt = 0;
427
        break;
443
        break;
428
444
429
    case SPACE:
445
    case SPACE:
430
    SPACE:
446
    SPACE:
...
...
456
        // it's going to be a number
472
        // it's going to be a number
457
        if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
473
        if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
458
            // -10
474
            // -10
459
            m_inNumber = true;
475
            m_inNumber = true;
460
            m_wordLen += it.appendchartostring(m_span);
476
            m_wordLen += it.appendchartostring(m_span);
477
          INC_WORDCHARS;
461
        } else {
478
        } else {
462
            goto SPACE;
479
            goto SPACE;
463
        } 
480
        } 
464
        } else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
481
        } else if (m_inNumber && (m_span[m_span.length() - 1] == 'e' ||
465
                      m_span[m_span.length() - 1] == 'E')) {
482
                      m_span[m_span.length() - 1] == 'E')) {
466
        if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
483
        if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
467
            m_wordLen += it.appendchartostring(m_span);
484
            m_wordLen += it.appendchartostring(m_span);
485
          INC_WORDCHARS;
468
        } else {
486
        } else {
469
            goto SPACE;
487
            goto SPACE;
470
        }
488
        }
471
        } else {
489
        } else {
472
        goto SPACE;
490
        goto SPACE;
...
...
480
        int nextwhat = whatcc(nextc);
498
        int nextwhat = whatcc(nextc);
481
        if (m_inNumber) {
499
        if (m_inNumber) {
482
        if (!isdigit(nextwhat, m_flags))
500
        if (!isdigit(nextwhat, m_flags))
483
            goto SPACE;
501
            goto SPACE;
484
        m_wordLen += it.appendchartostring(m_span);
502
        m_wordLen += it.appendchartostring(m_span);
503
      INC_WORDCHARS;
485
        curspanglue = cc;
504
        curspanglue = cc;
486
        break;
505
        break;
487
        } else {
506
        } else {
488
        // If . inside a word, it's spanglue, else, it's whitespace. 
507
        // If . inside a word, it's spanglue, else, it's whitespace. 
489
        // We also keep an initial '.' for catching .net, but this adds
508
        // We also keep an initial '.' for catching .net, but this adds
...
...
499
        if (cc == '.') {
518
        if (cc == '.') {
500
                    // Check for number like .1
519
                    // Check for number like .1
501
                    if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
520
                    if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
502
                        m_inNumber = true;
521
                        m_inNumber = true;
503
                        m_wordLen += it.appendchartostring(m_span);
522
                        m_wordLen += it.appendchartostring(m_span);
523
          INC_WORDCHARS;
504
                        curspanglue = cc;
524
                        curspanglue = cc;
505
                        break;
525
                        break;
506
                    }
526
                    }
507
                            
527
                            
508
            if (m_wordLen) {
528
            if (m_wordLen) {
...
...
565
        // Keep it only at end of word ... Special case for c# you see...
585
        // Keep it only at end of word ... Special case for c# you see...
566
        if (m_wordLen > 0) {
586
        if (m_wordLen > 0) {
567
        int w = whatcc(it[it.getCpos()+1]);
587
        int w = whatcc(it[it.getCpos()+1]);
568
        if (w == SPACE || w == '\n' || w == '\r') {
588
        if (w == SPACE || w == '\n' || w == '\r') {
569
            m_wordLen += it.appendchartostring(m_span);
589
            m_wordLen += it.appendchartostring(m_span);
590
          INC_WORDCHARS;
570
            break;
591
            break;
571
        }
592
        }
572
        }
593
        }
573
        goto SPACE;
594
        goto SPACE;
574
        break;
595
        break;
...
...
637
    NORMALCHAR:
658
    NORMALCHAR:
638
            if (m_inNumber && c != 'e' && c != 'E') {
659
            if (m_inNumber && c != 'e' && c != 'E') {
639
                m_inNumber = false;
660
                m_inNumber = false;
640
            }
661
            }
641
        m_wordLen += it.appendchartostring(m_span);
662
        m_wordLen += it.appendchartostring(m_span);
663
      INC_WORDCHARS;
642
        nonalnumcnt = 0;
664
        nonalnumcnt = 0;
643
        break;
665
        break;
644
    }
666
    }
645
    softhyphenpending = false;
667
    softhyphenpending = false;
646
    }
668
    }
...
...
736
    }
758
    }
737
    }
759
    }
738
760
739
    m_span.erase();
761
    m_span.erase();
740
    m_inNumber = false;
762
    m_inNumber = false;
741
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
763
    m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
742
    m_spanpos = m_wordpos;
764
    m_spanpos = m_wordpos;
743
    *cp = c;
765
    *cp = c;
744
    return true;
766
    return true;
745
}
767
}
746
768
...
...
862
#include <stdlib.h>
884
#include <stdlib.h>
863
#include <unistd.h>
885
#include <unistd.h>
864
#include <errno.h>
886
#include <errno.h>
865
#include <fcntl.h>
887
#include <fcntl.h>
866
#include <string.h>
888
#include <string.h>
889
#include <math.h>
867
890
868
#include <iostream>
891
#include <iostream>
869
892
870
#include "textsplit.h"
893
#include "textsplit.h"
871
#include "readfile.h"
894
#include "readfile.h"
...
...
878
901
879
class myTermProc : public Rcl::TermProc {
902
class myTermProc : public Rcl::TermProc {
880
    int first;
903
    int first;
881
    bool nooutput;
904
    bool nooutput;
882
public:
905
public:
883
    myTermProc() : TermProc(0), first(1), nooutput(false)  {}
906
    myTermProc() : TermProc(0), first(1), nooutput(false) {}
884
    void setNoOut(bool val) {nooutput = val;}
907
    void setNoOut(bool val) {nooutput = val;}
885
    virtual bool takeword(const string &term, int pos, int bs, int be)
908
    virtual bool takeword(const string &term, int pos, int bs, int be)
886
    {
909
    {
887
    if (nooutput)
910
    if (nooutput)
888
        return true;
911
        return true;
...
...
1056
1079
1057
        if (op_flags & OPT_q)
1080
        if (op_flags & OPT_q)
1058
            printproc.setNoOut(true);
1081
            printproc.setNoOut(true);
1059
1082
1060
    splitter.text_to_words(data);
1083
    splitter.text_to_words(data);
1061
1084
#ifdef TEXTSPLIT_STATS
1085
  TextSplit::Stats::Values v = splitter.getStats();
1086
  cout << "Average length: " 
1087
       <<  v.avglen
1088
       << " Standard deviation: " 
1089
       << v.sigma
1090
       << " Coef of variation "
1091
       << v.sigma / v.avglen
1092
       << endl;
1093
#endif
1062
    }    
1094
    }    
1063
}
1095
}
1064
#endif // TEST
1096
#endif // TEST