recoll / Code / Diff of /src/rcldb/termproc.h

Diff of /src/rcldb/termproc.h [7876fb] .. [94b945]

Switch to side-by-side view

--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@@ -24,19 +24,24 @@
 
 namespace Rcl {
 
-/** 
- * Termproc objects take a stream of term tokens as input and do something
+/**
+ * Termproc objects take term tokens as input and do something
  * with them: transform to lowercase, filter out stop words, generate n-grams,
- * finally index or generate search clauses, etc. They are chained and can 
+ * finally index or generate search clauses, etc. They are chained and can
  * be arranged to form different pipelines depending on the desired processing
  * steps: for example, optional stoplist or commongram processing.
  *
  * Shared processing steps are defined in this file. The first and last steps
- * (ie: adding index term) are usually defined in the specific module.
- */
-
-/** 
- * The base class takes care of chaining: all derived classes call its 
+ * are usually defined in the specific module.
+ * - The front TermProc is typically chained from a TextSplit object
+ *   which generates the original terms, and calls takeword() from its
+ *   own takeword() method.
+ * - The last TermProc does something with the finalized terms, e.g. adds
+ *   them to the index.
+ */
+
+/**
+ * The base class takes care of chaining: all derived classes call its
  * takeword() and flush() methods to ensure that terms go through the pipe.
  */
 class TermProc {
@@ -45,106 +50,110 @@
     virtual ~TermProc() {}
     virtual bool takeword(const string &term, int pos, int bs, int be)
     {
-	if (m_next)
-	    return m_next->takeword(term, pos, bs, be);
-	else
-	    return true;
-    }
+        if (m_next)
+            return m_next->takeword(term, pos, bs, be);
+        else
+            return true;
+    }
+    // newpage() is like takeword(), but for page breaks.
     virtual void newpage(int pos)
     {
-	if (m_next)
-	    m_next->newpage(pos);
+        if (m_next)
+            m_next->newpage(pos);
     }
     virtual bool flush()
     {
-	if (m_next)
-	    return m_next->flush();
-	else
-	    return true;
+        if (m_next)
+            return m_next->flush();
+        else
+            return true;
     }
 private:
     TermProc *m_next;
     /* Copy constructor and assignment are private and forbidden */
     TermProc(const TermProc &) {}
-    TermProc& operator=(const TermProc &) {return *this;};
-};
-
-/** 
- * Specialized TextSplit class: this will probably replace the base
- * TextSplit when we've converted all the code. The takeword() routine in this
- * calls a TermProc's instead of being overriden in a user derived class.
- * The text_to_words() method also takes care of flushing.
+    TermProc& operator=(const TermProc &) {
+        return *this;
+    };
+};
+
+/**
+ * Helper specialized TextSplit class, feeds the pipeline:
+ * - The takeword() method calls a TermProc->takeword().
+ * - The text_to_words() method also takes care of flushing.
+ * Both methods can be further specialized by the user (they should then call
+ * the base methods when they've done the local processing).
  */
 class TextSplitP : public TextSplit {
 public:
     TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
-	: TextSplit(flags), m_prc(prc)  {}
-
-    virtual bool text_to_words(const string &in)
-    {
-	bool ret = TextSplit::text_to_words(in);
-	if (m_prc && !m_prc->flush())
-	    return false;
-	return ret;
-    }
-
-    virtual bool takeword(const string& term, int pos, int bs, int be)
-    {
-	if (m_prc)
-	    return m_prc->takeword(term, pos, bs, be);
-	else
-	    return true;
-    }
-    virtual void newpage(int pos)
-    {
-	if (m_prc)
-	    return m_prc->newpage(pos);
+        : TextSplit(flags), m_prc(prc)  {}
+
+    virtual bool text_to_words(const string &in) {
+        bool ret = TextSplit::text_to_words(in);
+        if (m_prc && !m_prc->flush())
+            return false;
+        return ret;
+    }
+
+    virtual bool takeword(const string& term, int pos, int bs, int be) {
+        if (m_prc)
+            return m_prc->takeword(term, pos, bs, be);
+        else
+            return true;
+    }
+
+    virtual void newpage(int pos) {
+        if (m_prc)
+            return m_prc->newpage(pos);
     }
 
 private:
     TermProc *m_prc;
 };
 
-/** Unaccent and lowercase term. This is usually the first in the pipeline */
+/** Unaccent and lowercase term. If the index is
+ *  not case/diac-sensitive, this is usually the first step in the pipeline
+ */
 class TermProcPrep : public TermProc {
 public:
-    TermProcPrep(TermProc *nxt)	
-	: TermProc(nxt), m_totalterms(0), m_unacerrors(0) 
+    TermProcPrep(TermProc *nxt)
+        : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
     {
     }
 
     virtual bool takeword(const string& itrm, int pos, int bs, int be)
     {
-	m_totalterms++;
-	string otrm;
-	if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
-	    LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
-	    m_unacerrors++;
-	    // We don't generate a fatal error because of a bad term,
-	    // but one has to put the limit somewhere
-	    if (m_unacerrors > 500 && 
-		(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
-		// More than 1 error for every other term
-		LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
-			m_unacerrors, m_totalterms));
-		return false;
-	    }
-	    return true;
-	}
-	// It may happen in some weird cases that the output from unac is 
-	// empty (if the word actually consisted entirely of diacritics ...)
-	// The consequence is that a phrase search won't work without additional
-	// slack. 
-	if (otrm.empty())
-	    return true;
-	else
-	    return TermProc::takeword(otrm, pos, bs, be);
+        m_totalterms++;
+        string otrm;
+        if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
+            LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
+            m_unacerrors++;
+            // We don't generate a fatal error because of a bad term,
+            // but one has to put the limit somewhere
+            if (m_unacerrors > 500 &&
+                    (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
+                // More than 1 error for every other term
+                LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
+                        m_unacerrors, m_totalterms));
+                return false;
+            }
+            return true;
+        }
+        // It may happen in some weird cases that the output from unac is
+        // empty (if the word actually consisted entirely of diacritics ...)
+        // The consequence is that a phrase search won't work without additional
+        // slack.
+        if (otrm.empty())
+            return true;
+        else
+            return TermProc::takeword(otrm, pos, bs, be);
     }
 
     virtual bool flush()
     {
-	m_totalterms = m_unacerrors = 0;
-	return TermProc::flush();
+        m_totalterms = m_unacerrors = 0;
+        return TermProc::flush();
     }
 
 private:
@@ -156,16 +165,16 @@
 class TermProcStop : public TermProc {
 public:
     TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
-	: TermProc(nxt), m_stops(stops) 
+        : TermProc(nxt), m_stops(stops)
     {
     }
 
     virtual bool takeword(const string& term, int pos, int bs, int be)
     {
-	if (m_stops.isStop(term)) {
-	    return true;
-	}
-	return TermProc::takeword(term, pos, bs, be);
+        if (m_stops.isStop(term)) {
+            return true;
+        }
+        return TermProc::takeword(term, pos, bs, be);
     }
 
 private:
@@ -174,73 +183,73 @@
 
 /** Handle common-gram generation: combine frequent terms with neighbours to
  *  shorten the positions lists for phrase searches.
- *  NOTE: This does not currently work because of bad interaction with the 
+ *  NOTE: This does not currently work because of bad interaction with the
  *  spans (e.g. john@domain.com) generation in textsplit. Not used, kept for
  *  testing only
  */
 class TermProcCommongrams : public TermProc {
 public:
     TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
-	: TermProc(nxt), m_stops(stops), m_onlygrams(false) 
+        : TermProc(nxt), m_stops(stops), m_onlygrams(false)
     {
     }
 
     virtual bool takeword(const string& term, int pos, int bs, int be)
     {
-	LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", 
-		 pos, bs, be, term.c_str()));
-	bool isstop = m_stops.isStop(term);
-	bool twogramemit = false;
-
-	if (!m_prevterm.empty() && (m_prevstop || isstop)) {
-	    // create 2-gram. space unnecessary but improves
-	    // the readability of queries
-	    string twogram;
-	    twogram.swap(m_prevterm);
-	    twogram.append(1, ' ');
-	    twogram += term;
-	    // When emitting a complex term we set the bps to 0. This may
-	    // be used by our clients
-	    if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
-		return false;
-	    twogramemit = true;
+        LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
+                 pos, bs, be, term.c_str()));
+        bool isstop = m_stops.isStop(term);
+        bool twogramemit = false;
+
+        if (!m_prevterm.empty() && (m_prevstop || isstop)) {
+            // create 2-gram. space unnecessary but improves
+            // the readability of queries
+            string twogram;
+            twogram.swap(m_prevterm);
+            twogram.append(1, ' ');
+            twogram += term;
+            // When emitting a complex term we set the bps to 0. This may
+            // be used by our clients
+            if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
+                return false;
+            twogramemit = true;
 #if 0
-	    if (m_stops.isStop(twogram)) {
-		firstword = twogram;
-		isstop = false;
-	    }
+            if (m_stops.isStop(twogram)) {
+                firstword = twogram;
+                isstop = false;
+            }
 #endif
-	}
-	
-	m_prevterm = term;
-	m_prevstop = isstop;
-	m_prevpos = pos;
-	m_prevsent = false;
-	m_prevbs = bs;
-	m_prevbe = be;
-	// If flags allow, emit the bare term at the current pos.
-	if (!m_onlygrams || (!isstop && !twogramemit)) {
-	    if (!TermProc::takeword(term, pos, bs, be))
-		return false;
-	    m_prevsent = true;
-	} 
-
-	return true;
+        }
+
+        m_prevterm = term;
+        m_prevstop = isstop;
+        m_prevpos = pos;
+        m_prevsent = false;
+        m_prevbs = bs;
+        m_prevbe = be;
+        // If flags allow, emit the bare term at the current pos.
+        if (!m_onlygrams || (!isstop && !twogramemit)) {
+            if (!TermProc::takeword(term, pos, bs, be))
+                return false;
+            m_prevsent = true;
+        }
+
+        return true;
     }
 
     virtual bool flush()
     {
-	if (!m_prevsent && !m_prevterm.empty())
-	    if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
-		return false;
-	    
-	m_prevterm.clear();
-	m_prevsent = true;
-	return TermProc::flush();
+        if (!m_prevsent && !m_prevterm.empty())
+            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
+                return false;
+
+        m_prevterm.clear();
+        m_prevsent = true;
+        return TermProc::flush();
     }
     void onlygrams(bool on)
     {
-	m_onlygrams = on;
+        m_onlygrams = on;
     }
 private:
     // The stoplist we're using