|
a/src/rcldb/termproc.h |
|
b/src/rcldb/termproc.h |
|
... |
|
... |
22 |
#include "textsplit.h"
|
22 |
#include "textsplit.h"
|
23 |
#include "stoplist.h"
|
23 |
#include "stoplist.h"
|
24 |
|
24 |
|
25 |
namespace Rcl {
|
25 |
namespace Rcl {
|
26 |
|
26 |
|
27 |
/**
|
27 |
/**
|
28 |
* Termproc objects take a stream of term tokens as input and do something
|
28 |
* Termproc objects take term tokens as input and do something
|
29 |
* with them: transform to lowercase, filter out stop words, generate n-grams,
|
29 |
* with them: transform to lowercase, filter out stop words, generate n-grams,
|
30 |
* finally index or generate search clauses, etc. They are chained and can
|
30 |
* finally index or generate search clauses, etc. They are chained and can
|
31 |
* be arranged to form different pipelines depending on the desired processing
|
31 |
* be arranged to form different pipelines depending on the desired processing
|
32 |
* steps: for example, optional stoplist or commongram processing.
|
32 |
* steps: for example, optional stoplist or commongram processing.
|
33 |
*
|
33 |
*
|
34 |
* Shared processing steps are defined in this file. The first and last steps
|
34 |
* Shared processing steps are defined in this file. The first and last steps
|
35 |
* (ie: adding index term) are usually defined in the specific module.
|
35 |
* are usually defined in the specific module.
|
|
|
36 |
* - The front TermProc is typically chained from a TextSplit object
|
|
|
37 |
* which generates the original terms, and calls takeword() from its
|
|
|
38 |
* own takeword() method.
|
|
|
39 |
* - The last TermProc does something with the finalized terms, e.g. adds
|
|
|
40 |
* them to the index.
|
36 |
*/
|
41 |
*/
|
37 |
|
42 |
|
38 |
/**
|
43 |
/**
|
39 |
* The base class takes care of chaining: all derived classes call its
|
44 |
* The base class takes care of chaining: all derived classes call its
|
40 |
* takeword() and flush() methods to ensure that terms go through the pipe.
|
45 |
* takeword() and flush() methods to ensure that terms go through the pipe.
|
41 |
*/
|
46 |
*/
|
42 |
class TermProc {
|
47 |
class TermProc {
|
43 |
public:
|
48 |
public:
|
44 |
TermProc(TermProc* next) : m_next(next) {}
|
49 |
TermProc(TermProc* next) : m_next(next) {}
|
45 |
virtual ~TermProc() {}
|
50 |
virtual ~TermProc() {}
|
46 |
virtual bool takeword(const string &term, int pos, int bs, int be)
|
51 |
virtual bool takeword(const string &term, int pos, int bs, int be)
|
47 |
{
|
52 |
{
|
48 |
if (m_next)
|
53 |
if (m_next)
|
49 |
return m_next->takeword(term, pos, bs, be);
|
54 |
return m_next->takeword(term, pos, bs, be);
|
50 |
else
|
55 |
else
|
51 |
return true;
|
56 |
return true;
|
52 |
}
|
57 |
}
|
|
|
58 |
// newpage() is like takeword(), but for page breaks.
|
53 |
virtual void newpage(int pos)
|
59 |
virtual void newpage(int pos)
|
54 |
{
|
60 |
{
|
55 |
if (m_next)
|
61 |
if (m_next)
|
56 |
m_next->newpage(pos);
|
62 |
m_next->newpage(pos);
|
57 |
}
|
63 |
}
|
58 |
virtual bool flush()
|
64 |
virtual bool flush()
|
59 |
{
|
65 |
{
|
60 |
if (m_next)
|
66 |
if (m_next)
|
61 |
return m_next->flush();
|
67 |
return m_next->flush();
|
62 |
else
|
68 |
else
|
63 |
return true;
|
69 |
return true;
|
64 |
}
|
70 |
}
|
65 |
private:
|
71 |
private:
|
66 |
TermProc *m_next;
|
72 |
TermProc *m_next;
|
67 |
/* Copy constructor and assignment are private and forbidden */
|
73 |
/* Copy constructor and assignment are private and forbidden */
|
68 |
TermProc(const TermProc &) {}
|
74 |
TermProc(const TermProc &) {}
|
69 |
TermProc& operator=(const TermProc &) {return *this;};
|
75 |
TermProc& operator=(const TermProc &) {
|
|
|
76 |
return *this;
|
|
|
77 |
};
|
70 |
};
|
78 |
};
|
71 |
|
79 |
|
72 |
/**
|
80 |
/**
|
73 |
* Specialized TextSplit class: this will probably replace the base
|
81 |
* Helper specialized TextSplit class, feeds the pipeline:
|
74 |
* TextSplit when we've converted all the code. The takeword() routine in this
|
82 |
* - The takeword() method calls a TermProc->takeword().
|
75 |
* calls a TermProc's instead of being overridden in a user derived class.
|
|
|
76 |
* The text_to_words() method also takes care of flushing.
|
83 |
* - The text_to_words() method also takes care of flushing.
|
|
|
84 |
* Both methods can be further specialized by the user (they should then call
|
|
|
85 |
* the base methods when they've done the local processing).
|
77 |
*/
|
86 |
*/
|
78 |
class TextSplitP : public TextSplit {
|
87 |
class TextSplitP : public TextSplit {
|
79 |
public:
|
88 |
public:
|
80 |
TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
|
89 |
TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
|
81 |
: TextSplit(flags), m_prc(prc) {}
|
90 |
: TextSplit(flags), m_prc(prc) {}
|
82 |
|
91 |
|
83 |
virtual bool text_to_words(const string &in)
|
92 |
virtual bool text_to_words(const string &in) {
|
84 |
{
|
|
|
85 |
bool ret = TextSplit::text_to_words(in);
|
93 |
bool ret = TextSplit::text_to_words(in);
|
86 |
if (m_prc && !m_prc->flush())
|
94 |
if (m_prc && !m_prc->flush())
|
87 |
return false;
|
95 |
return false;
|
88 |
return ret;
|
96 |
return ret;
|
89 |
}
|
97 |
}
|
90 |
|
98 |
|
91 |
virtual bool takeword(const string& term, int pos, int bs, int be)
|
99 |
virtual bool takeword(const string& term, int pos, int bs, int be) {
|
92 |
{
|
100 |
if (m_prc)
|
93 |
if (m_prc)
|
|
|
94 |
return m_prc->takeword(term, pos, bs, be);
|
101 |
return m_prc->takeword(term, pos, bs, be);
|
95 |
else
|
102 |
else
|
96 |
return true;
|
103 |
return true;
|
97 |
}
|
104 |
}
|
|
|
105 |
|
98 |
virtual void newpage(int pos)
|
106 |
virtual void newpage(int pos) {
|
99 |
{
|
107 |
if (m_prc)
|
100 |
if (m_prc)
|
|
|
101 |
return m_prc->newpage(pos);
|
108 |
return m_prc->newpage(pos);
|
102 |
}
|
109 |
}
|
103 |
|
110 |
|
104 |
private:
|
111 |
private:
|
105 |
TermProc *m_prc;
|
112 |
TermProc *m_prc;
|
106 |
};
|
113 |
};
|
107 |
|
114 |
|
108 |
/** Unaccent and lowercase term. This is usually the first in the pipeline */
|
115 |
/** Unaccent and lowercase term. If the index is
|
|
|
116 |
* not case/diac-sensitive, this is usually the first step in the pipeline
|
|
|
117 |
*/
|
109 |
class TermProcPrep : public TermProc {
|
118 |
class TermProcPrep : public TermProc {
|
110 |
public:
|
119 |
public:
|
111 |
TermProcPrep(TermProc *nxt)
|
120 |
TermProcPrep(TermProc *nxt)
|
112 |
: TermProc(nxt), m_totalterms(0), m_unacerrors(0)
|
121 |
: TermProc(nxt), m_totalterms(0), m_unacerrors(0)
|
113 |
{
|
122 |
{
|
114 |
}
|
123 |
}
|
115 |
|
124 |
|
116 |
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
125 |
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
117 |
{
|
126 |
{
|
118 |
m_totalterms++;
|
127 |
m_totalterms++;
|
119 |
string otrm;
|
128 |
string otrm;
|
120 |
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
|
129 |
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
|
121 |
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
130 |
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
122 |
m_unacerrors++;
|
131 |
m_unacerrors++;
|
123 |
// We don't generate a fatal error because of a bad term,
|
132 |
// We don't generate a fatal error because of a bad term,
|
124 |
// but one has to put the limit somewhere
|
133 |
// but one has to put the limit somewhere
|
125 |
if (m_unacerrors > 500 &&
|
134 |
if (m_unacerrors > 500 &&
|
126 |
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
135 |
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
127 |
// More than 1 error for every other term
|
136 |
// More than 1 error for every other term
|
128 |
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
|
137 |
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
|
129 |
m_unacerrors, m_totalterms));
|
138 |
m_unacerrors, m_totalterms));
|
130 |
return false;
|
139 |
return false;
|
131 |
}
|
140 |
}
|
132 |
return true;
|
141 |
return true;
|
133 |
}
|
142 |
}
|
134 |
// It may happen in some weird cases that the output from unac is
|
143 |
// It may happen in some weird cases that the output from unac is
|
135 |
// empty (if the word actually consisted entirely of diacritics ...)
|
144 |
// empty (if the word actually consisted entirely of diacritics ...)
|
136 |
// The consequence is that a phrase search won't work without additional
|
145 |
// The consequence is that a phrase search won't work without additional
|
137 |
// slack.
|
146 |
// slack.
|
138 |
if (otrm.empty())
|
147 |
if (otrm.empty())
|
139 |
return true;
|
148 |
return true;
|
140 |
else
|
149 |
else
|
141 |
return TermProc::takeword(otrm, pos, bs, be);
|
150 |
return TermProc::takeword(otrm, pos, bs, be);
|
142 |
}
|
151 |
}
|
143 |
|
152 |
|
144 |
virtual bool flush()
|
153 |
virtual bool flush()
|
145 |
{
|
154 |
{
|
146 |
m_totalterms = m_unacerrors = 0;
|
155 |
m_totalterms = m_unacerrors = 0;
|
147 |
return TermProc::flush();
|
156 |
return TermProc::flush();
|
148 |
}
|
157 |
}
|
149 |
|
158 |
|
150 |
private:
|
159 |
private:
|
151 |
int m_totalterms;
|
160 |
int m_totalterms;
|
152 |
int m_unacerrors;
|
161 |
int m_unacerrors;
|
|
... |
|
... |
154 |
|
163 |
|
155 |
/** Compare to stop words list and discard if match found */
|
164 |
/** Compare to stop words list and discard if match found */
|
156 |
class TermProcStop : public TermProc {
|
165 |
class TermProcStop : public TermProc {
|
157 |
public:
|
166 |
public:
|
158 |
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
167 |
TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
|
159 |
: TermProc(nxt), m_stops(stops)
|
168 |
: TermProc(nxt), m_stops(stops)
|
160 |
{
|
169 |
{
|
161 |
}
|
170 |
}
|
162 |
|
171 |
|
163 |
virtual bool takeword(const string& term, int pos, int bs, int be)
|
172 |
virtual bool takeword(const string& term, int pos, int bs, int be)
|
164 |
{
|
173 |
{
|
165 |
if (m_stops.isStop(term)) {
|
174 |
if (m_stops.isStop(term)) {
|
166 |
return true;
|
175 |
return true;
|
167 |
}
|
176 |
}
|
168 |
return TermProc::takeword(term, pos, bs, be);
|
177 |
return TermProc::takeword(term, pos, bs, be);
|
169 |
}
|
178 |
}
|
170 |
|
179 |
|
171 |
private:
|
180 |
private:
|
172 |
const Rcl::StopList& m_stops;
|
181 |
const Rcl::StopList& m_stops;
|
173 |
};
|
182 |
};
|
174 |
|
183 |
|
175 |
/** Handle common-gram generation: combine frequent terms with neighbours to
|
184 |
/** Handle common-gram generation: combine frequent terms with neighbours to
|
176 |
* shorten the positions lists for phrase searches.
|
185 |
* shorten the positions lists for phrase searches.
|
177 |
* NOTE: This does not currently work because of bad interaction with the
|
186 |
* NOTE: This does not currently work because of bad interaction with the
|
178 |
* spans (ie john@domain.com) generation in textsplit. Not used, kept for
|
187 |
* spans (ie john@domain.com) generation in textsplit. Not used, kept for
|
179 |
* testing only
|
188 |
* testing only
|
180 |
*/
|
189 |
*/
|
181 |
class TermProcCommongrams : public TermProc {
|
190 |
class TermProcCommongrams : public TermProc {
|
182 |
public:
|
191 |
public:
|
183 |
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
192 |
TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
|
184 |
: TermProc(nxt), m_stops(stops), m_onlygrams(false)
|
193 |
: TermProc(nxt), m_stops(stops), m_onlygrams(false)
|
185 |
{
|
194 |
{
|
186 |
}
|
195 |
}
|
187 |
|
196 |
|
188 |
virtual bool takeword(const string& term, int pos, int bs, int be)
|
197 |
virtual bool takeword(const string& term, int pos, int bs, int be)
|
189 |
{
|
198 |
{
|
190 |
LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
|
199 |
LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
|
191 |
pos, bs, be, term.c_str()));
|
200 |
pos, bs, be, term.c_str()));
|
192 |
bool isstop = m_stops.isStop(term);
|
201 |
bool isstop = m_stops.isStop(term);
|
193 |
bool twogramemit = false;
|
202 |
bool twogramemit = false;
|
194 |
|
203 |
|
195 |
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
204 |
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
196 |
// create 2-gram. space unnecessary but improves
|
205 |
// create 2-gram. space unnecessary but improves
|
197 |
// the readability of queries
|
206 |
// the readability of queries
|
198 |
string twogram;
|
207 |
string twogram;
|
199 |
twogram.swap(m_prevterm);
|
208 |
twogram.swap(m_prevterm);
|
200 |
twogram.append(1, ' ');
|
209 |
twogram.append(1, ' ');
|
201 |
twogram += term;
|
210 |
twogram += term;
|
202 |
// When emitting a complex term we set the bps to 0. This may
|
211 |
// When emitting a complex term we set the bps to 0. This may
|
203 |
// be used by our clients
|
212 |
// be used by our clients
|
204 |
if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
|
213 |
if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
|
205 |
return false;
|
214 |
return false;
|
206 |
twogramemit = true;
|
215 |
twogramemit = true;
|
207 |
#if 0
|
216 |
#if 0
|
208 |
if (m_stops.isStop(twogram)) {
|
217 |
if (m_stops.isStop(twogram)) {
|
209 |
firstword = twogram;
|
218 |
firstword = twogram;
|
210 |
isstop = false;
|
219 |
isstop = false;
|
211 |
}
|
220 |
}
|
212 |
#endif
|
221 |
#endif
|
213 |
}
|
222 |
}
|
214 |
|
223 |
|
215 |
m_prevterm = term;
|
224 |
m_prevterm = term;
|
216 |
m_prevstop = isstop;
|
225 |
m_prevstop = isstop;
|
217 |
m_prevpos = pos;
|
226 |
m_prevpos = pos;
|
218 |
m_prevsent = false;
|
227 |
m_prevsent = false;
|
219 |
m_prevbs = bs;
|
228 |
m_prevbs = bs;
|
220 |
m_prevbe = be;
|
229 |
m_prevbe = be;
|
221 |
// If flags allow, emit the bare term at the current pos.
|
230 |
// If flags allow, emit the bare term at the current pos.
|
222 |
if (!m_onlygrams || (!isstop && !twogramemit)) {
|
231 |
if (!m_onlygrams || (!isstop && !twogramemit)) {
|
223 |
if (!TermProc::takeword(term, pos, bs, be))
|
232 |
if (!TermProc::takeword(term, pos, bs, be))
|
224 |
return false;
|
233 |
return false;
|
225 |
m_prevsent = true;
|
234 |
m_prevsent = true;
|
226 |
}
|
235 |
}
|
227 |
|
236 |
|
228 |
return true;
|
237 |
return true;
|
229 |
}
|
238 |
}
|
230 |
|
239 |
|
231 |
virtual bool flush()
|
240 |
virtual bool flush()
|
232 |
{
|
241 |
{
|
233 |
if (!m_prevsent && !m_prevterm.empty())
|
242 |
if (!m_prevsent && !m_prevterm.empty())
|
234 |
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
243 |
if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
|
235 |
return false;
|
244 |
return false;
|
236 |
|
245 |
|
237 |
m_prevterm.clear();
|
246 |
m_prevterm.clear();
|
238 |
m_prevsent = true;
|
247 |
m_prevsent = true;
|
239 |
return TermProc::flush();
|
248 |
return TermProc::flush();
|
240 |
}
|
249 |
}
|
241 |
void onlygrams(bool on)
|
250 |
void onlygrams(bool on)
|
242 |
{
|
251 |
{
|
243 |
m_onlygrams = on;
|
252 |
m_onlygrams = on;
|
244 |
}
|
253 |
}
|
245 |
private:
|
254 |
private:
|
246 |
// The stoplist we're using
|
255 |
// The stoplist we're using
|
247 |
const Rcl::StopList& m_stops;
|
256 |
const Rcl::StopList& m_stops;
|
248 |
// Remembered data for the last processed term
|
257 |
// Remembered data for the last processed term
|