Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
...
...
184
        //cerr << "ERASING single letter term " << c << endl;
184
        //cerr << "ERASING single letter term " << c << endl;
185
        return true;
185
        return true;
186
        }
186
        }
187
    }
187
    }
188
    if (pos != m_prevpos || l != m_prevlen) {
188
    if (pos != m_prevpos || l != m_prevlen) {
189
        bool ret = m_cb->takeword(w, pos, btstart, btend);
189
        bool ret = takeword(w, pos, btstart, btend);
190
        m_prevpos = pos;
190
        m_prevpos = pos;
191
        m_prevlen = w.length();
191
        m_prevlen = w.length();
192
        return ret;
192
        return ret;
193
    }
193
    }
194
    LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
194
    LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
...
...
556
    if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
556
    if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
557
        unsigned int btend = it.getBpos() + it.getBlen();
557
        unsigned int btend = it.getBpos() + it.getBlen();
558
        unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
558
        unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
559
        unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
559
        unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
560
        for (unsigned int i = loopbeg; i < loopend; i++) {
560
        for (unsigned int i = loopbeg; i < loopend; i++) {
561
        if (!m_cb->takeword(it.buffer().substr(boffs[i], 
561
        if (!takeword(it.buffer().substr(boffs[i], 
562
                               btend-boffs[i]),
562
                               btend-boffs[i]),
563
                m_wordpos - (nchars-i-1), boffs[i], btend)) {
563
                m_wordpos - (nchars-i-1), boffs[i], btend)) {
564
            return false;
564
            return false;
565
        }
565
        }
566
        }
566
        }
...
...
577
577
578
    // If onlyspans is set, there may be things to flush in the buffer
578
    // If onlyspans is set, there may be things to flush in the buffer
579
    // first
579
    // first
580
    if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen)  {
580
    if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen)  {
581
    unsigned int btend = it.getBpos(); // Current char is out
581
    unsigned int btend = it.getBpos(); // Current char is out
582
    if (!m_cb->takeword(it.buffer().substr(boffs[0], 
582
    if (!takeword(it.buffer().substr(boffs[0], 
583
                           btend-boffs[0]),
583
                           btend-boffs[0]),
584
                m_wordpos - nchars,
584
                m_wordpos - nchars,
585
                boffs[0], btend)) {
585
                boffs[0], btend)) {
586
        return false;
586
        return false;
587
    }
587
    }
...
...
593
    m_spanpos = m_wordpos;
593
    m_spanpos = m_wordpos;
594
    *cp = c;
594
    *cp = c;
595
    return true;
595
    return true;
596
}
596
}
597
597
598
// Word-counting helper used by TextSplit::countWords, shown here in both
// its old and new form (each bare number is a diff line number; the old line
// of a pair comes first, the new line second).
//  - Old form: utSplitterCB derives from TextSplitCB and is a separate
//    callback object.
//  - New form: TextSplitCW derives from TextSplit itself and passes the
//    splitting flags on to the TextSplit constructor.
// In both forms takeword() only increments wcnt and returns true.
// Callback class for countWords 
598
// Specialization for countWords 
599
class utSplitterCB : public TextSplitCB {
599
class TextSplitCW : public TextSplit {
600
 public:
600
 public:
601
    // Number of takeword() calls seen so far; starts at 0.
    int wcnt;
601
    int wcnt;
602
    utSplitterCB() : wcnt(0) {}
602
    TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
603
    // Called once per emitted term. The arguments are not used (the new
    // form leaves them unnamed); the return value is always true.
    bool takeword(const string &term, int pos, int bs, int be) {
603
    bool takeword(const string &, int, int, int) {
604
    wcnt++;
604
    wcnt++;
605
    return true;
605
    return true;
606
    }
606
    }
607
};
607
};
608
608
609
// Count the terms produced when splitting a string.
// @param s     text handed to text_to_words().
// @param flgs  splitting flags given to the splitter's constructor.
// @return      the wcnt counter after text_to_words(s) has run, i.e. the
//              number of takeword() calls made during the split.
// The old lines build a utSplitterCB and give its address to a plain
// TextSplit; the new lines build a TextSplitCW directly and read the count
// from the splitter itself.
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
609
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
610
{
610
{
611
    utSplitterCB cb;
612
    TextSplit splitter(&cb, flgs);
611
    TextSplitCW splitter(flgs);
613
    // The return value of text_to_words() is not checked here.
    splitter.text_to_words(s);
612
    splitter.text_to_words(s);
614
    return cb.wcnt;
613
    return splitter.wcnt;
615
}
614
}
616
615
617
bool TextSplit::hasVisibleWhite(const string &in)
616
bool TextSplit::hasVisibleWhite(const string &in)
618
{
617
{
619
    setcharclasses();
618
    setcharclasses();
...
...
724
#include "debuglog.h"
723
#include "debuglog.h"
725
#include "transcode.h"
724
#include "transcode.h"
726
725
727
using namespace std;
726
using namespace std;
728
727
729
// A small class to hold state while splitting text
728
class myTextSplit : public TextSplit {
730
class mySplitterCB : public TextSplitCB {
731
    int first;
729
    int first;
732
    bool nooutput;
730
    bool nooutput;
733
 public:
731
 public:
734
    mySplitterCB() : first(1), nooutput(false) {}
732
    myTextSplit(Flags flags = Flags(TXTS_NONE)) : 
733
        TextSplit(flags),first(1), nooutput(false) 
734
    {}
735
    void setNoOut(bool val) {nooutput = val;}
735
    void setNoOut(bool val) {nooutput = val;}
736
    bool takeword(const string &term, int pos, int bs, int be) {
736
    bool takeword(const string &term, int pos, int bs, int be) {
737
    if (nooutput)
737
    if (nooutput)
738
        return true;
738
        return true;
739
    FILE *fp = stdout;
739
    FILE *fp = stdout;
...
...
819
    b1: argc--; argv++;
819
    b1: argc--; argv++;
820
    }
820
    }
821
    DebugLog::getdbl()->setloglevel(DEBDEB1);
821
    DebugLog::getdbl()->setloglevel(DEBDEB1);
822
    DebugLog::setfilename("stderr");
822
    DebugLog::setfilename("stderr");
823
823
824
    mySplitterCB cb;
825
    TextSplit::Flags flags = TextSplit::TXTS_NONE;
824
    TextSplit::Flags flags = TextSplit::TXTS_NONE;
826
827
    if (op_flags&OPT_S)
828
  cb.setNoOut(true);
829
825
830
    if (op_flags&OPT_s)
826
    if (op_flags&OPT_s)
831
    flags = TextSplit::TXTS_ONLYSPANS;
827
    flags = TextSplit::TXTS_ONLYSPANS;
832
    else if (op_flags&OPT_w)
828
    else if (op_flags&OPT_w)
833
    flags = TextSplit::TXTS_NOSPANS;
829
    flags = TextSplit::TXTS_NOSPANS;
...
...
865
861
866
    if (op_flags & OPT_c) {
862
    if (op_flags & OPT_c) {
867
    int n = TextSplit::countWords(data, flags);
863
    int n = TextSplit::countWords(data, flags);
868
    cout << n << " words" << endl;
864
    cout << n << " words" << endl;
869
    } else {
865
    } else {
870
    TextSplit splitter(&cb,  flags);
866
    myTextSplit splitter(flags);
867
        if (op_flags&OPT_S)
868
            splitter.setNoOut(true);
871
    splitter.text_to_words(data);
869
    splitter.text_to_words(data);
872
    }    
870
    }    
873
}
871
}
874
#endif // TEST
872
#endif // TEST