--- a
+++ b/src/rcldb/termproc.h
@@ -0,0 +1,182 @@
+/* Copyright (C) 2011 J.F.Dockes
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the
+ *   Free Software Foundation, Inc.,
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+
+#ifndef _TERMPROC_H_INCLUDED_
+#define _TERMPROC_H_INCLUDED_
+
+#include "textsplit.h"
+#include "stoplist.h"
+namespace Rcl {
+class TermProc {
+public:
+    TermProc(TermProc* next) : m_next(next) {}
+    virtual ~TermProc() {}
+    virtual bool takeword(const string &term, int pos, int bs, int be)
+    {
+	if (m_next)
+	    return m_next->takeword(term, pos, bs, be);
+	else
+	    return true;
+    }
+    virtual bool flush()
+    {
+	if (m_next)
+	    return m_next->flush();
+	else
+	    return true;
+    }
+private:
+    TermProc *m_next;
+};
+
+class TextSplitP : public TextSplit {
+public:
+    TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
+	: TextSplit(flags), m_prc(prc)
+    {}
+
+    virtual bool text_to_words(const string &in)
+    {
+	bool ret = TextSplit::text_to_words(in);
+	if (m_prc && !m_prc->flush())
+	    return false;
+	return ret;
+    }
+
+    virtual bool takeword(const string& term, int pos, int bs, int be)
+    {
+	if (m_prc)
+	    return m_prc->takeword(term, pos, bs, be);
+	else
+	    return true;
+    }
+
+private:
+    TermProc *m_prc;
+};
+
+class TermProcPrep : public TermProc {
+public:
+    TermProcPrep(TermProc *nxt)	: TermProc(nxt) {}
+
+    virtual bool takeword(const string& itrm, int pos, int bs, int be)
+    {
+	string otrm;
+	if (!unacmaybefold(itrm, otrm, "UTF-8", true)) {
+	    LOGINFO(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
+	    // We don't generate a fatal error because of a bad term
+	    return true;
+	}
+	return TermProc::takeword(otrm, pos, bs, be);
+    }
+};
+
+class TermProcStop : public TermProc {
+public:
+    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
+	: TermProc(nxt), m_stops(stops) { }
+    virtual bool takeword(const string& term, int pos, int bts, int bte)
+    {
+	if (m_stops.isStop(term)) {
+	    return true;
+	}
+	return TermProc::takeword(term, pos, bts, bte);
+    }
+private:
+    const Rcl::StopList& m_stops;
+};
+
+class TermProcCommongrams : public TermProc {
+public:
+    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
+	: TermProc(nxt), m_stops(stops), m_onlygrams(false) { }
+
+    virtual bool takeword(const string& term, int pos, int bs, int be)
+    {
+	LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", 
+		 pos, bs, be, term.c_str()));
+	bool isstop = m_stops.isStop(term);
+	bool twogramemit = false;
+
+	if (!m_prevterm.empty() && (m_prevstop || isstop)) {
+	    // create 2-gram. space unnecessary but improves
+	    // lisibility of queries
+	    string twogram;
+	    twogram.swap(m_prevterm);
+	    twogram.append(1, ' ');
+	    twogram += term;
+	    // When emitting a complex term we set the bps to 0. This may
+	    // be used by our clients
+	    if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
+		return false;
+	    twogramemit = true;
+#if 0
+	    if (m_stops.isStop(twogram)) {
+		firstword = twogram;
+		isstop = false;
+	    }
+#endif
+	}
+
+	m_prevterm = term;
+	m_prevstop = isstop;
+	m_prevpos = pos;
+	m_prevsent = false;
+	m_prevbs = bs;
+	m_prevbe = be;
+	// If flags allow, emit the bare term at the current pos.
+	if (!m_onlygrams || (!isstop && !twogramemit)) {
+	    if (!TermProc::takeword(term, pos, bs, be))
+		return false;
+	    m_prevsent = true;
+	} 
+
+	return true;
+    }
+
+    bool flush()
+    {
+	if (!m_prevsent && !m_prevterm.empty())
+	    if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
+		return false;
+	    
+	m_prevterm.clear();
+	m_prevsent = true;
+	return TermProc::flush();
+    }
+    void onlygrams(bool on)
+    {
+	m_onlygrams = on;
+    }
+private:
+    // The stoplist we're using
+    const Rcl::StopList& m_stops;
+    // Remembered data for the last processed term
+    string m_prevterm;
+    bool   m_prevstop;
+    int    m_prevpos;
+    int    m_prevbs;
+    int    m_prevbe;
+    bool   m_prevsent;
+    // If this is set, we only emit longest grams
+    bool   m_onlygrams;
+};
+
+}
+
+#endif /* _TERMPROC_H_INCLUDED_ */