--- a
+++ b/src/rcldb/termproc.h
@@ -0,0 +1,182 @@
+/* Copyright (C) 2011 J.F.Dockes
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+
+#ifndef _TERMPROC_H_INCLUDED_
+#define _TERMPROC_H_INCLUDED_
+
+#include "textsplit.h"
+#include "stoplist.h"
+namespace Rcl {
+// One stage of a term-processing chain. The base class only relays calls to
+// the following stage, if there is one.
+class TermProc {
+public:
+    // 'next' is the stage that receives our output; it may be null.
+    TermProc(TermProc* next) : m_next(next) {}
+    virtual ~TermProc() {}
+    // Pass a term along. With no following stage this reports success.
+    virtual bool takeword(const string &term, int pos, int bs, int be)
+    {
+        return m_next ? m_next->takeword(term, pos, bs, be) : true;
+    }
+    // Pass a flush request along, with the same convention as takeword().
+    virtual bool flush()
+    {
+        return m_next ? m_next->flush() : true;
+    }
+private:
+    // Following stage; stored as given, never deleted by this class.
+    TermProc *m_next;
+};
+
+// Text splitter which forwards takeword() calls to a TermProc chain.
+class TextSplitP : public TextSplit {
+public:
+    // 'prc' is the head of the chain (may be null); 'flags' goes to TextSplit.
+    TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
+        : TextSplit(flags), m_prc(prc) {}
+
+    // Run the base splitter on 'in', then flush the chain. The flush happens
+    // even when splitting failed; a failed flush forces a false return.
+    virtual bool text_to_words(const string &in)
+    {
+        const bool splitok = TextSplit::text_to_words(in);
+        const bool flushok = !m_prc || m_prc->flush();
+        return flushok ? splitok : false;
+    }
+
+    // Forward one term to the chain; without a chain this just succeeds.
+    virtual bool takeword(const string& term, int pos, int bs, int be)
+    {
+        return m_prc ? m_prc->takeword(term, pos, bs, be) : true;
+    }
+private:
+    // Head of the processing chain; never deleted by this class.
+    TermProc *m_prc;
+};
+
+// Chain stage running each term through unacmaybefold() before passing it on.
+class TermProcPrep : public TermProc {
+public:
+    TermProcPrep(TermProc *nxt) : TermProc(nxt) {}
+    // Forward the converted form of 'itrm'. A term which can't be converted
+    // is logged and dropped: a bad term must not cause a fatal error.
+    virtual bool takeword(const string& itrm, int pos, int bs, int be)
+    {
+        string converted;
+        if (unacmaybefold(itrm, converted, "UTF-8", true))
+            return TermProc::takeword(converted, pos, bs, be);
+        LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
+        return true;
+    }
+};
+
+// Chain stage which drops the terms found in a stop list.
+class TermProcStop : public TermProc {
+public:
+    // 'stops' is kept by reference, so it has to outlive this object.
+    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
+        : TermProc(nxt), m_stops(stops) {}
+    // Stop words are swallowed (reported as success), other terms go on.
+    virtual bool takeword(const string& term, int pos, int bts, int bte)
+    {
+        return m_stops.isStop(term) || TermProc::takeword(term, pos, bts, bte);
+    }
+private:
+    const Rcl::StopList& m_stops;
+};
+
+// Chain stage emitting "common grams": a 2-word term is generated for each
+// pair of consecutive terms of which at least one is in the stop list.
+class TermProcCommongrams : public TermProc {
+public:
+    // 'stops' is kept by reference, so it has to outlive this object.
+    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
+        : TermProc(nxt), m_stops(stops), m_prevstop(false), m_prevpos(0),
+          m_prevbs(0), m_prevbe(0), m_prevsent(true), m_onlygrams(false) { }
+
+    // Process one term: emit the 2-gram it forms with the previous term if
+    // one of them is a stop word, remember it, then emit it alone unless
+    // onlygrams forbids it. Returns false if the next stage reports failure.
+    virtual bool takeword(const string& term, int pos, int bs, int be)
+    {
+        LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
+                 pos, bs, be, term.c_str()));
+        bool isstop = m_stops.isStop(term);
+        bool twogramemit = false;
+        if (!m_prevterm.empty() && (m_prevstop || isstop)) {
+            // Create the 2-gram. The space is unnecessary but improves
+            // the readability of queries.
+            string twogram;
+            twogram.swap(m_prevterm);
+            twogram.append(1, ' ');
+            twogram += term;
+            // When emitting a complex term we set the bps to 0. This may
+            // be used by our clients. The position is the first word's.
+            if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
+                return false;
+            twogramemit = true;
+        }
+
+        m_prevterm = term;
+        m_prevstop = isstop;
+        m_prevpos = pos;
+        m_prevsent = false;
+        m_prevbs = bs;
+        m_prevbe = be;
+        // If flags allow, emit the bare term at the current pos.
+        if (!m_onlygrams || (!isstop && !twogramemit)) {
+            if (!TermProc::takeword(term, pos, bs, be))
+                return false;
+            m_prevsent = true;
+        }
+
+        return true;
+    }
+    // Emit the remembered term if it was never passed on alone, forget it,
+    // then flush the next stage.
+    virtual bool flush()
+    {
+        if (!m_prevsent && !m_prevterm.empty())
+            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
+                return false;
+        m_prevterm.clear();
+        m_prevsent = true;
+        return TermProc::flush();
+    }
+    // When on, takeword() passes a term on alone only if it is neither a
+    // stop word nor the second word of a 2-gram which was just emitted.
+    void onlygrams(bool on) { m_onlygrams = on; }
+private:
+    // The stoplist we're using
+    const Rcl::StopList& m_stops;
+    // Remembered data for the last processed term
+    string m_prevterm;
+    bool m_prevstop;
+    int m_prevpos;
+    int m_prevbs;
+    int m_prevbe;
+    // True when the remembered term went out alone, or nothing is pending
+    bool m_prevsent;
+    // If this is set, we only emit longest grams
+    bool m_onlygrams;
+};
+
+}
+
+#endif /* _TERMPROC_H_INCLUDED_ */