recoll / Code / Diff of /src/rcldb/termproc.h

Diff of /src/rcldb/termproc.h [4a7ff3] .. [988ec0]

Switch to side-by-side view

--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@@ -66,10 +66,10 @@
 };
 
 /** 
- * Intermediary specialized texsplit class: this will probably replace the base
- * textsplit when we've converted all the code. The takeword() routine in this
- * calls a TextProc's instead of being specialized in a derived class by the
- * user module. The text_to_word() method also takes care of flushing.
+ * Specialized TextSplit class: this will probably replace the base
+ * TextSplit when we've converted all the code. The takeword() routine in this
+ * calls a TermProc's instead of being overriden in a user derived class.
+ * The text_to_word() method also takes care of flushing.
  */
 class TextSplitP : public TextSplit {
 public:
@@ -99,18 +99,39 @@
 /** Unaccent and lowercase term. This is usually the first in the pipeline */
 class TermProcPrep : public TermProc {
 public:
-    TermProcPrep(TermProc *nxt)	: TermProc(nxt) {}
+    TermProcPrep(TermProc *nxt)	
+	: TermProc(nxt), m_totalterms(0), m_unacerrors(0) {}
 
     virtual bool takeword(const string& itrm, int pos, int bs, int be)
     {
+	m_totalterms++;
 	string otrm;
 	if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
-	    LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
-	    // We don't generate a fatal error because of a bad term
+	    LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
+	    m_unacerrors++;
+	    // We don't generate a fatal error because of a bad term,
+	    // but one has to put the limit somewhere
+	    if (m_unacerrors > 500 && 
+		(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
+		// More than 1 error for every other term
+		LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
+			m_unacerrors, m_totalterms));
+		return false;
+	    }
 	    return true;
 	}
 	return TermProc::takeword(otrm, pos, bs, be);
     }
+
+    virtual bool flush()
+    {
+	m_totalterms = m_unacerrors = 0;
+	return TermProc::flush();
+    }
+
+private:
+    int m_totalterms;
+    int m_unacerrors;
 };
 
 /** Compare to stop words list and discard if match found */
@@ -119,19 +140,23 @@
     TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
 	: TermProc(nxt), m_stops(stops) {}
 
-    virtual bool takeword(const string& term, int pos, int bts, int bte)
+    virtual bool takeword(const string& term, int pos, int bs, int be)
     {
 	if (m_stops.isStop(term)) {
 	    return true;
 	}
-	return TermProc::takeword(term, pos, bts, bte);
-    }
+	return TermProc::takeword(term, pos, bs, be);
+    }
+
 private:
     const Rcl::StopList& m_stops;
 };
 
 /** Handle common-gram generation: combine frequent terms with neighbours to
  *  shorten the positions lists for phrase searches.
+ *  NOTE: This does not currently work because of bad interaction with the 
+ *  spans (ie john@domain.com) generation in textsplit. Not used, kept for
+ *  testing only
  */
 class TermProcCommongrams : public TermProc {
 public:
@@ -147,7 +172,7 @@
 
 	if (!m_prevterm.empty() && (m_prevstop || isstop)) {
 	    // create 2-gram. space unnecessary but improves
-	    // lisibility of queries
+	    // the readability of queries
 	    string twogram;
 	    twogram.swap(m_prevterm);
 	    twogram.append(1, ' ');
@@ -164,7 +189,7 @@
 	    }
 #endif
 	}
-
+	
 	m_prevterm = term;
 	m_prevstop = isstop;
 	m_prevpos = pos;
@@ -181,7 +206,7 @@
 	return true;
     }
 
-    bool flush()
+    virtual bool flush()
     {
 	if (!m_prevsent && !m_prevterm.empty())
 	    if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))