--- a/src/common/textsplit.cpp
+++ b/src/common/textsplit.cpp
@@ -205,6 +205,14 @@
LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
unsigned int l = w.length();
+
+#ifdef TEXTSPLIT_STATS
+ // Update word length statistics. Do this before we filter out
+ // long words because stats are used to detect bad text
+ if (!isspan || m_wordLen == m_span.length())
+ m_stats.newsamp(m_wordChars);
+#endif
+
if (l > 0 && l < (unsigned)m_maxWordLength) {
// 1 byte word: we index single ascii letters and digits, but
// nothing else. We might want to turn this into a test for a
@@ -316,7 +324,7 @@
// Adjust state
if (m_wordLen) {
m_wordpos++;
- m_wordLen = 0;
+ m_wordLen = m_wordChars = 0;
}
if (spanerase) {
discardspan();
@@ -332,7 +340,7 @@
m_span.erase();
m_spanpos = m_wordpos;
m_wordStart = 0;
- m_wordLen = 0;
+ m_wordLen = m_wordChars = 0;
}
static inline bool isalphanum(int what, unsigned int flgs)
@@ -345,6 +353,12 @@
{
return what == DIGIT || ((flgs & TextSplit::TXTS_KEEPWILD) && what == WILD);
}
+
+#ifdef TEXTSPLIT_STATS
+#define INC_WORDCHARS ++m_wordChars
+#else
+#define INC_WORDCHARS
+#endif
/**
* Splitting a text into terms to be indexed.
@@ -366,7 +380,8 @@
m_span.erase();
m_inNumber = false;
- m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
+ m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = m_wordpos
+ = m_spanpos = 0;
int curspanglue = 0;
bool pagepending = false;
bool softhyphenpending = false;
@@ -423,6 +438,7 @@
if (m_wordLen == 0)
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
+ INC_WORDCHARS;
nonalnumcnt = 0;
break;
@@ -458,6 +474,7 @@
// -10
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
+ INC_WORDCHARS;
} else {
goto SPACE;
}
@@ -465,6 +482,7 @@
m_span[m_span.length() - 1] == 'E')) {
if (isdigit(whatcc(it[it.getCpos()+1]), m_flags)) {
m_wordLen += it.appendchartostring(m_span);
+ INC_WORDCHARS;
} else {
goto SPACE;
}
@@ -482,6 +500,7 @@
if (!isdigit(nextwhat, m_flags))
goto SPACE;
m_wordLen += it.appendchartostring(m_span);
+ INC_WORDCHARS;
curspanglue = cc;
break;
} else {
@@ -501,6 +520,7 @@
if (m_span.length() == 0 && isdigit(nextwhat, m_flags)) {
m_inNumber = true;
m_wordLen += it.appendchartostring(m_span);
+ INC_WORDCHARS;
curspanglue = cc;
break;
}
@@ -567,6 +587,7 @@
int w = whatcc(it[it.getCpos()+1]);
if (w == SPACE || w == '\n' || w == '\r') {
m_wordLen += it.appendchartostring(m_span);
+ INC_WORDCHARS;
break;
}
}
@@ -639,6 +660,7 @@
m_inNumber = false;
}
m_wordLen += it.appendchartostring(m_span);
+ INC_WORDCHARS;
nonalnumcnt = 0;
break;
}
@@ -738,7 +760,7 @@
m_span.erase();
m_inNumber = false;
- m_wordStart = m_wordLen = m_prevpos = m_prevlen = 0;
+ m_wordStart = m_wordLen = m_wordChars = m_prevpos = m_prevlen = 0;
m_spanpos = m_wordpos;
*cp = c;
return true;
@@ -864,6 +886,7 @@
#include <errno.h>
#include <fcntl.h>
#include <string.h>
+#include <math.h>
#include <iostream>
@@ -880,7 +903,7 @@
int first;
bool nooutput;
public:
- myTermProc() : TermProc(0), first(1), nooutput(false) {}
+ myTermProc() : TermProc(0), first(1), nooutput(false) {}
void setNoOut(bool val) {nooutput = val;}
virtual bool takeword(const string &term, int pos, int bs, int be)
{
@@ -1058,7 +1081,16 @@
printproc.setNoOut(true);
splitter.text_to_words(data);
-
+#ifdef TEXTSPLIT_STATS
+ TextSplit::Stats::Values v = splitter.getStats();
+ cout << "Average length: "
+ << v.avglen
+ << " Standard deviation: "
+ << v.sigma
+ << " Coef of variation "
+ << v.sigma / v.avglen
+ << endl;
+#endif
}
}
#endif // TEST