recoll / Code / Diff of /src/rcldb/termproc.h

Diff of /src/rcldb/termproc.h [7876fb] .. [94b945]

Switch to unified view


...
#include "textsplit.h"
#include "stoplist.h"

namespace Rcl {

/**
 * Termproc objects take term tokens as input and do something
 * with them: transform to lowercase, filter out stop words, generate n-grams,
 * finally index or generate search clauses, etc. They are chained and can
 * be arranged to form different pipelines depending on the desired processing
 * steps: for example, optional stoplist or commongram processing.
 *
 * Shared processing steps are defined in this file. The first and last steps
 * are usually defined in the specific module.
 * - The front TermProc is typically chained from a TextSplit object
 *   which generates the original terms, and calls takeword() from its
 *   own takeword() method.
 * - The last TermProc does something with the finalized terms, e.g. adds
 *   them to the index.
 */

/**
 * The base class takes care of chaining: all derived classes call its
 * takeword() and flush() methods to ensure that terms go through the pipe.
 */
class TermProc {
public:
    /**
     * @param next Stage that receives whatever this stage forwards, or a
     *             null pointer if this stage ends the chain. The pointer is
     *             only stored: the destructor does not delete it.
     */
    TermProc(TermProc* next) : m_next(next) {}
    virtual ~TermProc() {}

    /**
     * Pass a term to the next stage.
     * @param term the term text.
     * @param pos, bs, be handed to the next stage unchanged (presumably the
     *        term position and its start/end offsets in the input text --
     *        to be confirmed against TextSplit).
     * @return the next stage's return value, or true if there is no next
     *         stage.
     */
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
        if (m_next)
            return m_next->takeword(term, pos, bs, be);
        return true;
    }

    // newpage() is like takeword(), but for page breaks. Nothing happens
    // when there is no next stage.
    virtual void newpage(int pos)
    {
        if (m_next)
            m_next->newpage(pos);
    }

    /**
     * Propagate an end-of-input notification down the chain.
     * @return the next stage's return value, or true if there is no next
     *         stage.
     */
    virtual bool flush()
    {
        if (m_next)
            return m_next->flush();
        return true;
    }

private:
    // Next stage in the chain, may be null. Not deleted by this class.
    TermProc *m_next;

    /* Copy construction and assignment are forbidden: both are private and
     * deliberately left without a definition, so that an accidental copy
     * from inside the class fails at link time instead of silently
     * producing an object with an unset m_next. */
    TermProc(const TermProc &);
    TermProc& operator=(const TermProc &);
};

/**
 * Helper specialized TextSplit class, feeds the pipeline:
 * - The takeword() method calls a TermProc->takeword().
 * - The text_to_words() method also takes care of flushing.
 * Both methods can be further specialized by the user (they should then call
 * the base methods when they've done the local processing).
 */
class TextSplitP : public TextSplit {
public:
    /**
     * @param prc   Front of the TermProc chain that receives the split
     *              terms. May be null, in which case terms are accepted
     *              and dropped.
     * @param flags Passed through to the TextSplit base class.
     */
    TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
        : TextSplit(flags), m_prc(prc)
    {
    }

    /**
     * Run the base class splitter on the input, then flush the chain.
     * @return false if the flush fails, else the base splitter's result.
     */
    virtual bool text_to_words(const string &in)
    {
        const bool splitok = TextSplit::text_to_words(in);
        // The flush is attempted even if splitting reported a failure.
        if (!m_prc)
            return splitok;
        return m_prc->flush() ? splitok : false;
    }

    /** Forward one term to the chain; true when there is no chain. */
    virtual bool takeword(const string& term, int pos, int bs, int be)
    {
        if (!m_prc)
            return true;
        return m_prc->takeword(term, pos, bs, be);
    }

    /** Forward a page break to the chain, if there is one. */
    virtual void newpage(int pos)
    {
        if (m_prc)
            m_prc->newpage(pos);
    }

private:
    // Front of the processing chain, may be null.
    TermProc *m_prc;
};

/** Unaccent and lowercase term. If the index is
 *  not case/diac-sensitive, this is usually the first step in the pipeline
 */
class TermProcPrep : public TermProc {
public:
    /** @param nxt Next stage in the chain; both counters start at zero. */
    TermProcPrep(TermProc *nxt)
        : TermProc(nxt),
          m_totalterms(0),
          m_unacerrors(0) {}

    /**
     * Run the term through unacmaybefold() and forward the converted term
     * to the next stage.
     * @return false only when conversion failures become too frequent (see
     *         below) or when the next stage returns false.
     */
    virtual bool takeword(const string& itrm, int pos, int bs, int be)
    {
        ++m_totalterms;

        string folded;
        if (unacmaybefold(itrm, folded, "UTF-8", UNACOP_UNACFOLD)) {
            // In some weird cases the output from unac is empty (the word
            // consisted entirely of diacritics). The term is then dropped,
            // with the consequence that a phrase search across it won't
            // work without additional slack.
            if (!folded.empty())
                return TermProc::takeword(folded, pos, bs, be);
            return true;
        }

        // Conversion failed: the term is skipped and the failure counted.
        LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
        ++m_unacerrors;

        // A bad term is not a fatal error by itself, but one has to put
        // the limit somewhere: give up once there are more than 500
        // failures and they amount to more than one term out of two.
        const bool toomanyerrors = m_unacerrors > 500 &&
            (double(m_totalterms) / double(m_unacerrors)) < 2.0;
        if (!toomanyerrors)
            return true;

        LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
                m_unacerrors, m_totalterms));
        return false;
    }

    /** Reset the term and error counters, then flush the rest of the chain. */
    virtual bool flush()
    {
        m_unacerrors = 0;
        m_totalterms = 0;
        return TermProc::flush();
    }

private:
    int m_totalterms;
    int m_unacerrors;
...

/** Compare to stop words list and discard if match found */
class TermProcStop : public TermProc {
public:
    /**
     * @param nxt   Next stage in the chain.
     * @param stops Stop word list. Only a reference is kept, so the list
     *              must outlive this object.
     */
    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
        : TermProc(nxt), m_stops(stops) {}

    /**
     * Forward the term unless it is a stop word.
     * @return true for a discarded stop word, else the next stage's result.
     */
    virtual bool takeword(const string& term, int pos, int bs, int be)
    {
        if (!m_stops.isStop(term))
            return TermProc::takeword(term, pos, bs, be);
        // Stop word: silently dropped, not an error.
        return true;
    }

private:
    // The stop word list, referenced and not copied.
    const Rcl::StopList& m_stops;
};

/** Handle common-gram generation: combine frequent terms with neighbours to
 *  shorten the positions lists for phrase searches.
 *  NOTE: This does not currently work because of bad interaction with the
 *  spans (e.g. john@domain.com) generation in textsplit. Not used, kept for
 *  testing only.
 */
class TermProcCommongrams : public TermProc {
public:
    /**
     * @param nxt   Next stage in the chain.
     * @param stops Stop word list used to decide which terms take part in
     *              2-grams. Only a reference is kept, so the list must
     *              outlive this object.
     */
    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
        : TermProc(nxt), m_stops(stops), m_onlygrams(false)
    {
        // Start with no remembered term. These members are read by
        // takeword() and flush(), and flush() can run before any term was
        // seen, so they must not be left indeterminate. They are assigned
        // here rather than in the initializer list so that the result does
        // not depend on their declaration order.
        m_prevstop = false;
        m_prevpos = 0;
        m_prevbs = 0;
        m_prevbe = 0;
        // "Sent" means that nothing is pending for flush() to emit.
        m_prevsent = true;
    }

    /**
     * Process one term: possibly emit a 2-gram made of the previous term
     * and this one, remember this term for the next call, and possibly
     * forward the bare term.
     *
     * A 2-gram is emitted when a previous term is remembered and either
     * that term or the current one is a stop word. The bare term is always
     * forwarded when the onlygrams mode is off. When it is on, the bare
     * term is forwarded only if it is not a stop word and no 2-gram was
     * emitted in this call.
     *
     * @return false as soon as the next stage returns false, else true.
     */
    virtual bool takeword(const string& term, int pos, int bs, int be)
    {
        LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
                 pos, bs, be, term.c_str()));
        bool isstop = m_stops.isStop(term);
        bool twogramemit = false;

        if (!m_prevterm.empty() && (m_prevstop || isstop)) {
            // create 2-gram. space unnecessary but improves
            // the readability of queries
            string twogram;
            // The swap moves the previous term's text into twogram and
            // leaves m_prevterm empty until it is reassigned below.
            twogram.swap(m_prevterm);
            twogram.append(1, ' ');
            twogram += term;
            // When emitting a complex term we set the bps to 0. This may
            // be used by our clients
            // The 2-gram is emitted at the position of its first word.
            // If the next stage fails here we return with m_prevterm
            // still empty.
            if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
                return false;
            twogramemit = true;
            // NOTE(review): the disabled section below uses 'firstword',
            // which is not declared anywhere in this method.
#if 0
            if (m_stops.isStop(twogram)) {
                firstword = twogram;
                isstop = false;
            }
#endif
        }

        // Remember the current term: it may become the first half of the
        // next 2-gram, or be emitted by flush() if it is not sent below.
        m_prevterm = term;
        m_prevstop = isstop;
        m_prevpos = pos;
        m_prevsent = false;
        m_prevbs = bs;
        m_prevbe = be;
        // If flags allow, emit the bare term at the current pos.
        if (!m_onlygrams || (!isstop && !twogramemit)) {
            if (!TermProc::takeword(term, pos, bs, be))
                return false;
            // Only marked as sent once the next stage has accepted it.
            m_prevsent = true;
        }

        return true;
    }

    /**
     * Emit the remembered term if it was never forwarded as a bare term,
     * reset the remembered state, then flush the rest of the chain.
     * @return false if the next stage rejects the pending term (the state
     *         is then left untouched), else the result of flushing the
     *         rest of the chain.
     */
    virtual bool flush()
    {
        // m_prevsent is only meaningful while a term is remembered, so the
        // emptiness test comes first: this way the flag is never read when
        // flush() runs before any term was seen.
        if (!m_prevterm.empty() && !m_prevsent) {
            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
                return false;
        }

        m_prevterm.clear();
        m_prevsent = true;
        return TermProc::flush();
    }
    /**
     * Turn the "only grams" mode on or off. When on, takeword() does not
     * forward the bare term for stop words, nor for a term which was just
     * emitted as the second half of a 2-gram.
     */
    void onlygrams(bool on) { m_onlygrams = on; }
private:
    // The stoplist we're using
    const Rcl::StopList& m_stops;
    // Remembered data for the last processed term

	a/src/rcldb/termproc.h		b/src/rcldb/termproc.h
	...		...
22	#include "textsplit.h"	22	#include "textsplit.h"
23	#include "stoplist.h"	23	#include "stoplist.h"
24		24
25	namespace Rcl {	25	namespace Rcl {
26		26
27	/**	27	/**
28	* Termproc objects take a stream of term tokens as input and do something	28	* Termproc objects take term tokens as input and do something
29	* with them: transform to lowercase, filter out stop words, generate n-grams,	29	* with them: transform to lowercase, filter out stop words, generate n-grams,
30	* finally index or generate search clauses, etc. They are chained and can	30	* finally index or generate search clauses, etc. They are chained and can
31	* be arranged to form different pipelines depending on the desired processing	31	* be arranged to form different pipelines depending on the desired processing
32	* steps: for example, optional stoplist or commongram processing.	32	* steps: for example, optional stoplist or commongram processing.
33	*	33	*
34	* Shared processing steps are defined in this file. The first and last steps	34	* Shared processing steps are defined in this file. The first and last steps
35	* (ie: adding index term) are usually defined in the specific module.	35	* are usually defined in the specific module.
		36	* - The front TermProc is typically chained from a TextSplit object
		37	* which generates the original terms, and calls takeword() from its
		38	* own takeword() method.
		39	* - The last TermProc does something with the finalized terms, e.g. adds
		40	* them to the index.
36	*/	41	*/
37		42
38	/**	43	/**
39	* The base class takes care of chaining: all derived classes call its	44	* The base class takes care of chaining: all derived classes call its
40	* takeword() and flush() methods to ensure that terms go through the pipe.	45	* takeword() and flush() methods to ensure that terms go through the pipe.
41	*/	46	*/
42	class TermProc {	47	class TermProc {
43	public:	48	public:
44	TermProc(TermProc* next) : m_next(next) {}	49	TermProc(TermProc* next) : m_next(next) {}
45	virtual ~TermProc() {}	50	virtual ~TermProc() {}
46	virtual bool takeword(const string &term, int pos, int bs, int be)	51	virtual bool takeword(const string &term, int pos, int bs, int be)
47	{	52	{
48	if (m_next)	53	if (m_next)
49	return m_next->takeword(term, pos, bs, be);	54	return m_next->takeword(term, pos, bs, be);
50	else	55	else
51	return true;	56	return true;
52	}	57	}
		58	// newpage() is like takeword(), but for page breaks.
53	virtual void newpage(int pos)	59	virtual void newpage(int pos)
54	{	60	{
55	if (m_next)	61	if (m_next)
56	m_next->newpage(pos);	62	m_next->newpage(pos);
57	}	63	}
58	virtual bool flush()	64	virtual bool flush()
59	{	65	{
60	if (m_next)	66	if (m_next)
61	return m_next->flush();	67	return m_next->flush();
62	else	68	else
63	return true;	69	return true;
64	}	70	}
65	private:	71	private:
66	TermProc *m_next;	72	TermProc *m_next;
67	/* Copyconst and assignment private and forbidden */	73	/* Copyconst and assignment private and forbidden */
68	TermProc(const TermProc &) {}	74	TermProc(const TermProc &) {}
69	TermProc& operator=(const TermProc &) {return *this;};	75	TermProc& operator=(const TermProc &) {
		76	return *this;
		77	};
70	};	78	};
71		79
72	/**	80	/**
73	* Specialized TextSplit class: this will probably replace the base	81	* Helper specialized TextSplit class, feeds the pipeline:
74	* TextSplit when we've converted all the code. The takeword() routine in this	82	* - The takeword() method calls a TermProc->takeword().
75	* calls a TermProc's instead of being overriden in a user derived class.
76	* The text_to_words() method also takes care of flushing.	83	* - The text_to_words() method also takes care of flushing.
		84	* Both methods can be further specialized by the user (they should then call
		85	* the base methods when they've done the local processing).
77	*/	86	*/
78	class TextSplitP : public TextSplit {	87	class TextSplitP : public TextSplit {
79	public:	88	public:
80	TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))	89	TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
81	: TextSplit(flags), m_prc(prc) {}	90	: TextSplit(flags), m_prc(prc) {}
82		91
83	virtual bool text_to_words(const string &in)	92	virtual bool text_to_words(const string &in) {
84	{
85	bool ret = TextSplit::text_to_words(in);	93	bool ret = TextSplit::text_to_words(in);
86	if (m_prc && !m_prc->flush())	94	if (m_prc && !m_prc->flush())
87	return false;	95	return false;
88	return ret;	96	return ret;
89	}	97	}
90		98
91	virtual bool takeword(const string& term, int pos, int bs, int be)	99	virtual bool takeword(const string& term, int pos, int bs, int be) {
92	{	100	if (m_prc)
93	if (m_prc)
94	return m_prc->takeword(term, pos, bs, be);	101	return m_prc->takeword(term, pos, bs, be);
95	else	102	else
96	return true;	103	return true;
97	}	104	}
		105
98	virtual void newpage(int pos)	106	virtual void newpage(int pos) {
99	{	107	if (m_prc)
100	if (m_prc)
101	return m_prc->newpage(pos);	108	return m_prc->newpage(pos);
102	}	109	}
103		110
104	private:	111	private:
105	TermProc *m_prc;	112	TermProc *m_prc;
106	};	113	};
107		114
108	/** Unaccent and lowercase term. This is usually the first in the pipeline */	115	/** Unaccent and lowercase term. If the index is
		116	* not case/diac-sensitive, this is usually the first step in the pipeline
		117	*/
109	class TermProcPrep : public TermProc {	118	class TermProcPrep : public TermProc {
110	public:	119	public:
111	TermProcPrep(TermProc *nxt)	120	TermProcPrep(TermProc *nxt)
112	: TermProc(nxt), m_totalterms(0), m_unacerrors(0)	121	: TermProc(nxt), m_totalterms(0), m_unacerrors(0)
113	{	122	{
114	}	123	}
115		124
116	virtual bool takeword(const string& itrm, int pos, int bs, int be)	125	virtual bool takeword(const string& itrm, int pos, int bs, int be)
117	{	126	{
118	m_totalterms++;	127	m_totalterms++;
119	string otrm;	128	string otrm;
120	if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {	129	if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
121	LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));	130	LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
122	m_unacerrors++;	131	m_unacerrors++;
123	// We don't generate a fatal error because of a bad term,	132	// We don't generate a fatal error because of a bad term,
124	// but one has to put the limit somewhere	133	// but one has to put the limit somewhere
125	if (m_unacerrors > 500 &&	134	if (m_unacerrors > 500 &&
126	(double(m_totalterms) / double(m_unacerrors)) < 2.0) {	135	(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
127	// More than 1 error for every other term	136	// More than 1 error for every other term
128	LOGERR(("splitter::takeword: too many unac errors %d/%d\n",	137	LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
129	m_unacerrors, m_totalterms));	138	m_unacerrors, m_totalterms));
130	return false;	139	return false;
131	}	140	}
132	return true;	141	return true;
133	}	142	}
134	// It may happen in some weird cases that the output from unac is	143	// It may happen in some weird cases that the output from unac is
135	// empty (if the word actually consisted entirely of diacritics ...)	144	// empty (if the word actually consisted entirely of diacritics ...)
136	// The consequence is that a phrase search won't work without addional	145	// The consequence is that a phrase search won't work without addional
137	// slack.	146	// slack.
138	if (otrm.empty())	147	if (otrm.empty())
139	return true;	148	return true;
140	else	149	else
141	return TermProc::takeword(otrm, pos, bs, be);	150	return TermProc::takeword(otrm, pos, bs, be);
142	}	151	}
143		152
144	virtual bool flush()	153	virtual bool flush()
145	{	154	{
146	m_totalterms = m_unacerrors = 0;	155	m_totalterms = m_unacerrors = 0;
147	return TermProc::flush();	156	return TermProc::flush();
148	}	157	}
149		158
150	private:	159	private:
151	int m_totalterms;	160	int m_totalterms;
152	int m_unacerrors;	161	int m_unacerrors;
	...		...
154		163
155	/** Compare to stop words list and discard if match found */	164	/** Compare to stop words list and discard if match found */
156	class TermProcStop : public TermProc {	165	class TermProcStop : public TermProc {
157	public:	166	public:
158	TermProcStop(TermProc *nxt, const Rcl::StopList& stops)	167	TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
159	: TermProc(nxt), m_stops(stops)	168	: TermProc(nxt), m_stops(stops)
160	{	169	{
161	}	170	}
162		171
163	virtual bool takeword(const string& term, int pos, int bs, int be)	172	virtual bool takeword(const string& term, int pos, int bs, int be)
164	{	173	{
165	if (m_stops.isStop(term)) {	174	if (m_stops.isStop(term)) {
166	return true;	175	return true;
167	}	176	}
168	return TermProc::takeword(term, pos, bs, be);	177	return TermProc::takeword(term, pos, bs, be);
169	}	178	}
170		179
171	private:	180	private:
172	const Rcl::StopList& m_stops;	181	const Rcl::StopList& m_stops;
173	};	182	};
174		183
175	/** Handle common-gram generation: combine frequent terms with neighbours to	184	/** Handle common-gram generation: combine frequent terms with neighbours to
176	* shorten the positions lists for phrase searches.	185	* shorten the positions lists for phrase searches.
177	* NOTE: This does not currently work because of bad interaction with the	186	* NOTE: This does not currently work because of bad interaction with the
178	* spans (ie john@domain.com) generation in textsplit. Not used, kept for	187	* spans (ie john@domain.com) generation in textsplit. Not used, kept for
179	* testing only	188	* testing only
180	*/	189	*/
181	class TermProcCommongrams : public TermProc {	190	class TermProcCommongrams : public TermProc {
182	public:	191	public:
183	TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)	192	TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
184	: TermProc(nxt), m_stops(stops), m_onlygrams(false)	193	: TermProc(nxt), m_stops(stops), m_onlygrams(false)
185	{	194	{
186	}	195	}
187		196
188	virtual bool takeword(const string& term, int pos, int bs, int be)	197	virtual bool takeword(const string& term, int pos, int bs, int be)
189	{	198	{
190	LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",	199	LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
191	pos, bs, be, term.c_str()));	200	pos, bs, be, term.c_str()));
192	bool isstop = m_stops.isStop(term);	201	bool isstop = m_stops.isStop(term);
193	bool twogramemit = false;	202	bool twogramemit = false;
194		203
195	if (!m_prevterm.empty() && (m_prevstop \|\| isstop)) {	204	if (!m_prevterm.empty() && (m_prevstop \|\| isstop)) {
196	// create 2-gram. space unnecessary but improves	205	// create 2-gram. space unnecessary but improves
197	// the readability of queries	206	// the readability of queries
198	string twogram;	207	string twogram;
199	twogram.swap(m_prevterm);	208	twogram.swap(m_prevterm);
200	twogram.append(1, ' ');	209	twogram.append(1, ' ');
201	twogram += term;	210	twogram += term;
202	// When emitting a complex term we set the bps to 0. This may	211	// When emitting a complex term we set the bps to 0. This may
203	// be used by our clients	212	// be used by our clients
204	if (!TermProc::takeword(twogram, m_prevpos, 0, 0))	213	if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
205	return false;	214	return false;
206	twogramemit = true;	215	twogramemit = true;
207	#if 0	216	#if 0
208	if (m_stops.isStop(twogram)) {	217	if (m_stops.isStop(twogram)) {
209	firstword = twogram;	218	firstword = twogram;
210	isstop = false;	219	isstop = false;
211	}	220	}
212	#endif	221	#endif
213	}	222	}
214		223
215	m_prevterm = term;	224	m_prevterm = term;
216	m_prevstop = isstop;	225	m_prevstop = isstop;
217	m_prevpos = pos;	226	m_prevpos = pos;
218	m_prevsent = false;	227	m_prevsent = false;
219	m_prevbs = bs;	228	m_prevbs = bs;
220	m_prevbe = be;	229	m_prevbe = be;
221	// If flags allow, emit the bare term at the current pos.	230	// If flags allow, emit the bare term at the current pos.
222	if (!m_onlygrams \|\| (!isstop && !twogramemit)) {	231	if (!m_onlygrams \|\| (!isstop && !twogramemit)) {
223	if (!TermProc::takeword(term, pos, bs, be))	232	if (!TermProc::takeword(term, pos, bs, be))
224	return false;	233	return false;
225	m_prevsent = true;	234	m_prevsent = true;
226	}	235	}
227		236
228	return true;	237	return true;
229	}	238	}
230		239
231	virtual bool flush()	240	virtual bool flush()
232	{	241	{
233	if (!m_prevsent && !m_prevterm.empty())	242	if (!m_prevsent && !m_prevterm.empty())
234	if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))	243	if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
235	return false;	244	return false;
236		245
237	m_prevterm.clear();	246	m_prevterm.clear();
238	m_prevsent = true;	247	m_prevsent = true;
239	return TermProc::flush();	248	return TermProc::flush();
240	}	249	}
241	void onlygrams(bool on)	250	void onlygrams(bool on)
242	{	251	{
243	m_onlygrams = on;	252	m_onlygrams = on;
244	}	253	}
245	private:	254	private:
246	// The stoplist we're using	255	// The stoplist we're using
247	const Rcl::StopList& m_stops;	256	const Rcl::StopList& m_stops;
248	// Remembered data for the last processed term	257	// Remembered data for the last processed term