|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
|
... |
|
... |
184 |
//cerr << "ERASING single letter term " << c << endl;
|
184 |
//cerr << "ERASING single letter term " << c << endl;
|
185 |
return true;
|
185 |
return true;
|
186 |
}
|
186 |
}
|
187 |
}
|
187 |
}
|
188 |
if (pos != m_prevpos || l != m_prevlen) {
|
188 |
if (pos != m_prevpos || l != m_prevlen) {
|
189 |
bool ret = m_cb->takeword(w, pos, btstart, btend);
|
189 |
bool ret = takeword(w, pos, btstart, btend);
|
190 |
m_prevpos = pos;
|
190 |
m_prevpos = pos;
|
191 |
m_prevlen = w.length();
|
191 |
m_prevlen = w.length();
|
192 |
return ret;
|
192 |
return ret;
|
193 |
}
|
193 |
}
|
194 |
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
194 |
LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
|
|
... |
|
... |
556 |
if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
|
556 |
if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
|
557 |
unsigned int btend = it.getBpos() + it.getBlen();
|
557 |
unsigned int btend = it.getBpos() + it.getBlen();
|
558 |
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
558 |
unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
|
559 |
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
559 |
unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
|
560 |
for (unsigned int i = loopbeg; i < loopend; i++) {
|
560 |
for (unsigned int i = loopbeg; i < loopend; i++) {
|
561 |
if (!m_cb->takeword(it.buffer().substr(boffs[i],
|
561 |
if (!takeword(it.buffer().substr(boffs[i],
|
562 |
btend-boffs[i]),
|
562 |
btend-boffs[i]),
|
563 |
m_wordpos - (nchars-i-1), boffs[i], btend)) {
|
563 |
m_wordpos - (nchars-i-1), boffs[i], btend)) {
|
564 |
return false;
|
564 |
return false;
|
565 |
}
|
565 |
}
|
566 |
}
|
566 |
}
|
|
... |
|
... |
577 |
|
577 |
|
578 |
// If onlyspans is set, there may be things to flush in the buffer
|
578 |
// If onlyspans is set, there may be things to flush in the buffer
|
579 |
// first
|
579 |
// first
|
580 |
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
580 |
if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
|
581 |
unsigned int btend = it.getBpos(); // Current char is out
|
581 |
unsigned int btend = it.getBpos(); // Current char is out
|
582 |
if (!m_cb->takeword(it.buffer().substr(boffs[0],
|
582 |
if (!takeword(it.buffer().substr(boffs[0],
|
583 |
btend-boffs[0]),
|
583 |
btend-boffs[0]),
|
584 |
m_wordpos - nchars,
|
584 |
m_wordpos - nchars,
|
585 |
boffs[0], btend)) {
|
585 |
boffs[0], btend)) {
|
586 |
return false;
|
586 |
return false;
|
587 |
}
|
587 |
}
|
|
... |
|
... |
593 |
m_spanpos = m_wordpos;
|
593 |
m_spanpos = m_wordpos;
|
594 |
*cp = c;
|
594 |
*cp = c;
|
595 |
return true;
|
595 |
return true;
|
596 |
}
|
596 |
}
|
597 |
|
597 |
|
598 |
// Callback class for countWords
// Old side of the diff: utSplitterCB is a TextSplitCB that countWords() passes
// by address to a TextSplit object. Its only job is to count takeword() calls.
|
598 |
// Specialization for countWords
// New side of the diff: TextSplitCW derives from TextSplit directly and
// supplies its own takeword(), so countWords() needs no separate callback
// object. NOTE(review): takeword() is presumably virtual in TextSplit --
// confirm in the header, which is not visible here.
|
599 |
class utSplitterCB : public TextSplitCB {
|
599 |
class TextSplitCW : public TextSplit {
|
600 |
public:
|
600 |
public:
|
601 |
// Running count of takeword() invocations; read by countWords().
int wcnt;
|
601 |
// Running count of takeword() invocations; read by countWords().
int wcnt;
|
602 |
// Start counting from zero.
utSplitterCB() : wcnt(0) {}
|
602 |
// Forward the splitting flags to the TextSplit base and start counting
// from zero.
TextSplitCW(Flags flags) : TextSplit(flags), wcnt(0) {}
|
603 |
// Count one word. The term and position arguments are not used.
bool takeword(const string &term, int pos, int bs, int be) {
|
603 |
// Count one word. The parameters are left unnamed because none is used.
bool takeword(const string &, int, int, int) {
|
604 |
wcnt++;
|
604 |
wcnt++;
|
605 |
// Always true: the visible call sites only stop when takeword() returns
// false.
return true;
|
605 |
// Always true: the visible call sites only stop when takeword() returns
// false.
return true;
|
606 |
}
|
606 |
}
|
607 |
};
|
607 |
};
|
608 |
|
608 |
|
609 |
// Count the words found in a string.
// @param s     text to split.
// @param flgs  splitting flags passed through to the splitter object.
// @return      the number of takeword() calls made while splitting s.
// The result of text_to_words() itself is not checked.
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
609 |
// Count the words found in a string.
// @param s     text to split.
// @param flgs  splitting flags passed through to the splitter object.
// @return      the number of takeword() calls made while splitting s.
// The result of text_to_words() itself is not checked.
int TextSplit::countWords(const string& s, TextSplit::Flags flgs)
|
610 |
{
|
610 |
{
|
611 |
// Old side: a separate counting callback object...
utSplitterCB cb;
|
|
|
612 |
// ...passed by address to a plain TextSplit.
TextSplit splitter(&cb, flgs);
|
611 |
// New side: the splitter object carries the counter itself.
TextSplitCW splitter(flgs);
|
613 |
splitter.text_to_words(s);
|
612 |
splitter.text_to_words(s);
|
614 |
// Old side: the count is read from the callback object.
return cb.wcnt;
|
613 |
// New side: the count is read from the splitter.
return splitter.wcnt;
|
615 |
}
|
614 |
}
|
616 |
|
615 |
|
617 |
bool TextSplit::hasVisibleWhite(const string &in)
|
616 |
bool TextSplit::hasVisibleWhite(const string &in)
|
618 |
{
|
617 |
{
|
619 |
setcharclasses();
|
618 |
setcharclasses();
|
|
... |
|
... |
724 |
#include "debuglog.h"
|
723 |
#include "debuglog.h"
|
725 |
#include "transcode.h"
|
724 |
#include "transcode.h"
|
726 |
|
725 |
|
727 |
using namespace std;
|
726 |
using namespace std;
|
728 |
|
727 |
|
729 |
// A small class to hold state while splitting text
|
728 |
class myTextSplit : public TextSplit {
|
730 |
class mySplitterCB : public TextSplitCB {
|
|
|
731 |
int first;
|
729 |
int first;
|
732 |
bool nooutput;
|
730 |
bool nooutput;
|
733 |
public:
|
731 |
public:
|
734 |
mySplitterCB() : first(1), nooutput(false) {}
|
732 |
myTextSplit(Flags flags = Flags(TXTS_NONE)) :
|
|
|
733 |
TextSplit(flags),first(1), nooutput(false)
|
|
|
734 |
{}
|
735 |
void setNoOut(bool val) {nooutput = val;}
|
735 |
void setNoOut(bool val) {nooutput = val;}
|
736 |
bool takeword(const string &term, int pos, int bs, int be) {
|
736 |
bool takeword(const string &term, int pos, int bs, int be) {
|
737 |
if (nooutput)
|
737 |
if (nooutput)
|
738 |
return true;
|
738 |
return true;
|
739 |
FILE *fp = stdout;
|
739 |
FILE *fp = stdout;
|
|
... |
|
... |
819 |
b1: argc--; argv++;
|
819 |
b1: argc--; argv++;
|
820 |
}
|
820 |
}
|
821 |
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
821 |
DebugLog::getdbl()->setloglevel(DEBDEB1);
|
822 |
DebugLog::setfilename("stderr");
|
822 |
DebugLog::setfilename("stderr");
|
823 |
|
823 |
|
824 |
mySplitterCB cb;
|
|
|
825 |
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
824 |
TextSplit::Flags flags = TextSplit::TXTS_NONE;
|
826 |
|
|
|
827 |
if (op_flags&OPT_S)
|
|
|
828 |
cb.setNoOut(true);
|
|
|
829 |
|
825 |
|
830 |
if (op_flags&OPT_s)
|
826 |
if (op_flags&OPT_s)
|
831 |
flags = TextSplit::TXTS_ONLYSPANS;
|
827 |
flags = TextSplit::TXTS_ONLYSPANS;
|
832 |
else if (op_flags&OPT_w)
|
828 |
else if (op_flags&OPT_w)
|
833 |
flags = TextSplit::TXTS_NOSPANS;
|
829 |
flags = TextSplit::TXTS_NOSPANS;
|
|
... |
|
... |
865 |
|
861 |
|
866 |
if (op_flags & OPT_c) {
|
862 |
if (op_flags & OPT_c) {
|
867 |
int n = TextSplit::countWords(data, flags);
|
863 |
int n = TextSplit::countWords(data, flags);
|
868 |
cout << n << " words" << endl;
|
864 |
cout << n << " words" << endl;
|
869 |
} else {
|
865 |
} else {
|
870 |
TextSplit splitter(&cb, flags);
|
866 |
myTextSplit splitter(flags);
|
|
|
867 |
if (op_flags&OPT_S)
|
|
|
868 |
splitter.setNoOut(true);
|
871 |
splitter.text_to_words(data);
|
869 |
splitter.text_to_words(data);
|
872 |
}
|
870 |
}
|
873 |
}
|
871 |
}
|
874 |
#endif // TEST
|
872 |
#endif // TEST
|