--- a/src/rcldb/termproc.h
+++ b/src/rcldb/termproc.h
@@ -24,19 +24,24 @@
namespace Rcl {
-/**
- * Termproc objects take a stream of term tokens as input and do something
+/**
+ * Termproc objects take term tokens as input and do something
* with them: transform to lowercase, filter out stop words, generate n-grams,
- * finally index or generate search clauses, etc. They are chained and can
+ * finally index or generate search clauses, etc. They are chained and can
* be arranged to form different pipelines depending on the desired processing
* steps: for example, optional stoplist or commongram processing.
*
* Shared processing steps are defined in this file. The first and last steps
- * (ie: adding index term) are usually defined in the specific module.
- */
-
-/**
- * The base class takes care of chaining: all derived classes call its
+ * are usually defined in the specific module.
+ * - The front TermProc is typically chained from a TextSplit object
+ * which generates the original terms, and calls takeword() from its
+ * own takeword() method.
+ * - The last TermProc does something with the finalized terms, e.g. adds
+ * them to the index.
+ */
+
+/**
+ * The base class takes care of chaining: all derived classes call its
* takeword() and flush() methods to ensure that terms go through the pipe.
*/
class TermProc {
@@ -45,106 +50,110 @@
virtual ~TermProc() {}
virtual bool takeword(const string &term, int pos, int bs, int be)
{
- if (m_next)
- return m_next->takeword(term, pos, bs, be);
- else
- return true;
- }
+ if (m_next)
+ return m_next->takeword(term, pos, bs, be);
+ else
+ return true;
+ }
+ // newpage() is like takeword(), but for page breaks.
virtual void newpage(int pos)
{
- if (m_next)
- m_next->newpage(pos);
+ if (m_next)
+ m_next->newpage(pos);
}
virtual bool flush()
{
- if (m_next)
- return m_next->flush();
- else
- return true;
+ if (m_next)
+ return m_next->flush();
+ else
+ return true;
}
private:
TermProc *m_next;
/* Copyconst and assignment private and forbidden */
TermProc(const TermProc &) {}
- TermProc& operator=(const TermProc &) {return *this;};
-};
-
-/**
- * Specialized TextSplit class: this will probably replace the base
- * TextSplit when we've converted all the code. The takeword() routine in this
- * calls a TermProc's instead of being overriden in a user derived class.
- * The text_to_words() method also takes care of flushing.
+ TermProc& operator=(const TermProc &) {
+ return *this;
+ };
+};
+
+/**
+ * Helper specialized TextSplit class, feeds the pipeline:
+ * - The takeword() method calls a TermProc->takeword().
+ * - The text_to_words() method also takes care of flushing.
+ * Both methods can be further specialized by the user (they should then call
+ * the base methods when they've done the local processing).
*/
class TextSplitP : public TextSplit {
public:
TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
- : TextSplit(flags), m_prc(prc) {}
-
- virtual bool text_to_words(const string &in)
- {
- bool ret = TextSplit::text_to_words(in);
- if (m_prc && !m_prc->flush())
- return false;
- return ret;
- }
-
- virtual bool takeword(const string& term, int pos, int bs, int be)
- {
- if (m_prc)
- return m_prc->takeword(term, pos, bs, be);
- else
- return true;
- }
- virtual void newpage(int pos)
- {
- if (m_prc)
- return m_prc->newpage(pos);
+ : TextSplit(flags), m_prc(prc) {}
+
+ virtual bool text_to_words(const string &in) {
+ bool ret = TextSplit::text_to_words(in);
+ if (m_prc && !m_prc->flush())
+ return false;
+ return ret;
+ }
+
+ virtual bool takeword(const string& term, int pos, int bs, int be) {
+ if (m_prc)
+ return m_prc->takeword(term, pos, bs, be);
+ else
+ return true;
+ }
+
+ virtual void newpage(int pos) {
+ if (m_prc)
+ return m_prc->newpage(pos);
}
private:
TermProc *m_prc;
};
-/** Unaccent and lowercase term. This is usually the first in the pipeline */
+/** Unaccent and lowercase term. If the index is
+ * not case/diac-sensitive, this is usually the first step in the pipeline
+ */
class TermProcPrep : public TermProc {
public:
- TermProcPrep(TermProc *nxt)
- : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
+ TermProcPrep(TermProc *nxt)
+ : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
{
}
virtual bool takeword(const string& itrm, int pos, int bs, int be)
{
- m_totalterms++;
- string otrm;
- if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
- LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
- m_unacerrors++;
- // We don't generate a fatal error because of a bad term,
- // but one has to put the limit somewhere
- if (m_unacerrors > 500 &&
- (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
- // More than 1 error for every other term
- LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
- m_unacerrors, m_totalterms));
- return false;
- }
- return true;
- }
- // It may happen in some weird cases that the output from unac is
- // empty (if the word actually consisted entirely of diacritics ...)
- // The consequence is that a phrase search won't work without addional
- // slack.
- if (otrm.empty())
- return true;
- else
- return TermProc::takeword(otrm, pos, bs, be);
+ m_totalterms++;
+ string otrm;
+ if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
+ LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
+ m_unacerrors++;
+ // We don't generate a fatal error because of a bad term,
+ // but one has to put the limit somewhere
+ if (m_unacerrors > 500 &&
+ (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
+ // More than 1 error for every other term
+ LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
+ m_unacerrors, m_totalterms));
+ return false;
+ }
+ return true;
+ }
+ // It may happen in some weird cases that the output from unac is
+ // empty (if the word actually consisted entirely of diacritics ...)
+ // The consequence is that a phrase search won't work without addional
+ // slack.
+ if (otrm.empty())
+ return true;
+ else
+ return TermProc::takeword(otrm, pos, bs, be);
}
virtual bool flush()
{
- m_totalterms = m_unacerrors = 0;
- return TermProc::flush();
+ m_totalterms = m_unacerrors = 0;
+ return TermProc::flush();
}
private:
@@ -156,16 +165,16 @@
class TermProcStop : public TermProc {
public:
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
- : TermProc(nxt), m_stops(stops)
+ : TermProc(nxt), m_stops(stops)
{
}
virtual bool takeword(const string& term, int pos, int bs, int be)
{
- if (m_stops.isStop(term)) {
- return true;
- }
- return TermProc::takeword(term, pos, bs, be);
+ if (m_stops.isStop(term)) {
+ return true;
+ }
+ return TermProc::takeword(term, pos, bs, be);
}
private:
@@ -174,73 +183,73 @@
/** Handle common-gram generation: combine frequent terms with neighbours to
* shorten the positions lists for phrase searches.
- * NOTE: This does not currently work because of bad interaction with the
+ * NOTE: This does not currently work because of bad interaction with the
* spans (ie john@domain.com) generation in textsplit. Not used, kept for
* testing only
*/
class TermProcCommongrams : public TermProc {
public:
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
- : TermProc(nxt), m_stops(stops), m_onlygrams(false)
+ : TermProc(nxt), m_stops(stops), m_onlygrams(false)
{
}
virtual bool takeword(const string& term, int pos, int bs, int be)
{
- LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
- pos, bs, be, term.c_str()));
- bool isstop = m_stops.isStop(term);
- bool twogramemit = false;
-
- if (!m_prevterm.empty() && (m_prevstop || isstop)) {
- // create 2-gram. space unnecessary but improves
- // the readability of queries
- string twogram;
- twogram.swap(m_prevterm);
- twogram.append(1, ' ');
- twogram += term;
- // When emitting a complex term we set the bps to 0. This may
- // be used by our clients
- if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
- return false;
- twogramemit = true;
+ LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
+ pos, bs, be, term.c_str()));
+ bool isstop = m_stops.isStop(term);
+ bool twogramemit = false;
+
+ if (!m_prevterm.empty() && (m_prevstop || isstop)) {
+ // create 2-gram. space unnecessary but improves
+ // the readability of queries
+ string twogram;
+ twogram.swap(m_prevterm);
+ twogram.append(1, ' ');
+ twogram += term;
+ // When emitting a complex term we set the bps to 0. This may
+ // be used by our clients
+ if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
+ return false;
+ twogramemit = true;
#if 0
- if (m_stops.isStop(twogram)) {
- firstword = twogram;
- isstop = false;
- }
+ if (m_stops.isStop(twogram)) {
+ firstword = twogram;
+ isstop = false;
+ }
#endif
- }
-
- m_prevterm = term;
- m_prevstop = isstop;
- m_prevpos = pos;
- m_prevsent = false;
- m_prevbs = bs;
- m_prevbe = be;
- // If flags allow, emit the bare term at the current pos.
- if (!m_onlygrams || (!isstop && !twogramemit)) {
- if (!TermProc::takeword(term, pos, bs, be))
- return false;
- m_prevsent = true;
- }
-
- return true;
+ }
+
+ m_prevterm = term;
+ m_prevstop = isstop;
+ m_prevpos = pos;
+ m_prevsent = false;
+ m_prevbs = bs;
+ m_prevbe = be;
+ // If flags allow, emit the bare term at the current pos.
+ if (!m_onlygrams || (!isstop && !twogramemit)) {
+ if (!TermProc::takeword(term, pos, bs, be))
+ return false;
+ m_prevsent = true;
+ }
+
+ return true;
}
virtual bool flush()
{
- if (!m_prevsent && !m_prevterm.empty())
- if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
- return false;
-
- m_prevterm.clear();
- m_prevsent = true;
- return TermProc::flush();
+ if (!m_prevsent && !m_prevterm.empty())
+ if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
+ return false;
+
+ m_prevterm.clear();
+ m_prevsent = true;
+ return TermProc::flush();
}
void onlygrams(bool on)
{
- m_onlygrams = on;
+ m_onlygrams = on;
}
private:
// The stoplist we're using