Switch to unified view

a/src/rcldb/termproc.h b/src/rcldb/termproc.h
...
...
22
#include "textsplit.h"
22
#include "textsplit.h"
23
#include "stoplist.h"
23
#include "stoplist.h"
24
24
25
namespace Rcl {
25
namespace Rcl {
26
26
27
/**
 * TermProc objects take term tokens as input and do something
 * with them: transform to lowercase, filter out stop words, generate n-grams,
 * finally index or generate search clauses, etc. They are chained and can
 * be arranged to form different pipelines depending on the desired processing
 * steps: for example, optional stoplist or commongram processing.
 *
 * Shared processing steps are defined in this file. The first and last steps
 * are usually defined in the specific module.
 * - The front TermProc is typically chained from a TextSplit object
 *   which generates the original terms, and calls takeword() from its
 *   own takeword() method.
 * - The last TermProc does something with the finalized terms, e.g. adds
 *   them to the index.
 */
37
42
38
/**
 * Base class for all term processing steps. It takes care of chaining:
 * each method forwards the call to the next processor when there is one,
 * and all derived classes call its takeword(), newpage() and flush()
 * methods to ensure that terms go through the pipe.
 */
class TermProc {
public:
    /**
     * @param next the following step in the pipeline, or a null pointer if
     *   this is the last one. The pointer is only stored: the destructor
     *   does not delete it.
     */
    TermProc(TermProc* next) : m_next(next) {}
    virtual ~TermProc() {}

    /**
     * Hand a term to the next step.
     * @param term the term text.
     * @param pos the term position.
     * @param bs, be passed through unchanged (presumably the begin/end byte
     *   offsets of the term in the input -- confirm against TextSplit).
     * @return what the next step returns, or true if there is no next step.
     */
    virtual bool takeword(const string &term, int pos, int bs, int be)
    {
        if (m_next)
            return m_next->takeword(term, pos, bs, be);
        else
            return true;
    }

    // newpage() is like takeword(), but for page breaks.
    virtual void newpage(int pos)
    {
        if (m_next)
            m_next->newpage(pos);
    }

    /**
     * Propagate the end-of-input notification down the pipe.
     * @return what the next step returns, or true if there is no next step.
     */
    virtual bool flush()
    {
        if (m_next)
            return m_next->flush();
        else
            return true;
    }

private:
    TermProc *m_next;
    /* Copyconst and assignment private and forbidden. The copy constructor
     * still initializes m_next so that no object can ever hold an
     * indeterminate pointer. */
    TermProc(const TermProc &) : m_next(0) {}
    TermProc& operator=(const TermProc &) {
        return *this;
    }
};
71
79
72
/** 
80
/**
73
 * Specialized TextSplit class: this will probably replace the base
81
 * Helper specialized TextSplit class, feeds the pipeline:
74
 * TextSplit when we've converted all the code. The takeword() routine in this
82
 * - The takeword() method calls a TermProc->takeword().
75
 * calls a TermProc's instead of being overriden in a user derived class.
76
 * The text_to_words() method also takes care of flushing.
83
 * - The text_to_words() method also takes care of flushing.
84
 * Both methods can be further specialized by the user (they should then call
85
 * the base methods when they've done the local processing).
77
 */
86
 */
78
class TextSplitP : public TextSplit {
87
class TextSplitP : public TextSplit {
79
public:
88
public:
80
    TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
89
    TextSplitP(TermProc *prc, Flags flags = Flags(TXTS_NONE))
81
  : TextSplit(flags), m_prc(prc)  {}
90
        : TextSplit(flags), m_prc(prc)  {}
82
91
83
    virtual bool text_to_words(const string &in)
92
    virtual bool text_to_words(const string &in) {
84
    {
85
  bool ret = TextSplit::text_to_words(in);
93
        bool ret = TextSplit::text_to_words(in);
86
  if (m_prc && !m_prc->flush())
94
        if (m_prc && !m_prc->flush())
87
      return false;
95
            return false;
88
  return ret;
96
        return ret;
89
    }
97
    }
90
98
91
    virtual bool takeword(const string& term, int pos, int bs, int be)
99
    virtual bool takeword(const string& term, int pos, int bs, int be) {
92
    {
100
        if (m_prc)
93
  if (m_prc)
94
      return m_prc->takeword(term, pos, bs, be);
101
            return m_prc->takeword(term, pos, bs, be);
95
  else
102
        else
96
      return true;
103
            return true;
97
    }
104
    }
105
98
    virtual void newpage(int pos)
106
    virtual void newpage(int pos) {
99
    {
107
        if (m_prc)
100
  if (m_prc)
101
      return m_prc->newpage(pos);
108
            return m_prc->newpage(pos);
102
    }
109
    }
103
110
104
private:
111
private:
105
    TermProc *m_prc;
112
    TermProc *m_prc;
106
};
113
};
107
114
108
/** Unaccent and lowercase term. If the index is
 *  not case/diac-sensitive, this is usually the first step in the pipeline
 */
109
class TermProcPrep : public TermProc {
118
class TermProcPrep : public TermProc {
110
public:
119
public:
111
    /**
     * @param nxt next step in the pipeline, handed to the TermProc base.
     * Both statistics counters (terms seen, unac failures) start at zero.
     */
    TermProcPrep(TermProc *nxt)
        : TermProc(nxt), m_totalterms(0), m_unacerrors(0)
    {
    }
115
124
116
    virtual bool takeword(const string& itrm, int pos, int bs, int be)
125
    virtual bool takeword(const string& itrm, int pos, int bs, int be)
117
    {
126
    {
118
  m_totalterms++;
127
        m_totalterms++;
119
  string otrm;
128
        string otrm;
120
  if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
129
        if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
121
      LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
130
            LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
122
      m_unacerrors++;
131
            m_unacerrors++;
123
      // We don't generate a fatal error because of a bad term,
132
            // We don't generate a fatal error because of a bad term,
124
      // but one has to put the limit somewhere
133
            // but one has to put the limit somewhere
125
      if (m_unacerrors > 500 && 
134
            if (m_unacerrors > 500 &&
126
      (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
135
                    (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
127
      // More than 1 error for every other term
136
                // More than 1 error for every other term
128
      LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
137
                LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
129
          m_unacerrors, m_totalterms));
138
                        m_unacerrors, m_totalterms));
130
      return false;
139
                return false;
131
      }
140
            }
132
      return true;
141
            return true;
133
  }
142
        }
134
  // It may happen in some weird cases that the output from unac is 
143
        // It may happen in some weird cases that the output from unac is
135
  // empty (if the word actually consisted entirely of diacritics ...)
144
        // empty (if the word actually consisted entirely of diacritics ...)
136
  // The consequence is that a phrase search won't work without addional
145
        // The consequence is that a phrase search won't work without addional
137
  // slack. 
146
        // slack.
138
  if (otrm.empty())
147
        if (otrm.empty())
139
      return true;
148
            return true;
140
  else
149
        else
141
      return TermProc::takeword(otrm, pos, bs, be);
150
            return TermProc::takeword(otrm, pos, bs, be);
142
    }
151
    }
143
152
144
    virtual bool flush()
153
    virtual bool flush()
145
    {
154
    {
146
  m_totalterms = m_unacerrors = 0;
155
        m_totalterms = m_unacerrors = 0;
147
  return TermProc::flush();
156
        return TermProc::flush();
148
    }
157
    }
149
158
150
private:
159
private:
151
    int m_totalterms;
160
    int m_totalterms;
152
    int m_unacerrors;
161
    int m_unacerrors;
...
...
154
163
155
/** Compare to stop words list and discard if match found */
164
/** Compare to stop words list and discard if match found */
156
class TermProcStop : public TermProc {
165
class TermProcStop : public TermProc {
157
public:
166
public:
158
    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
167
    TermProcStop(TermProc *nxt, const Rcl::StopList& stops)
159
  : TermProc(nxt), m_stops(stops) 
168
        : TermProc(nxt), m_stops(stops)
160
    {
169
    {
161
    }
170
    }
162
171
163
    virtual bool takeword(const string& term, int pos, int bs, int be)
172
    virtual bool takeword(const string& term, int pos, int bs, int be)
164
    {
173
    {
165
  if (m_stops.isStop(term)) {
174
        if (m_stops.isStop(term)) {
166
      return true;
175
            return true;
167
  }
176
        }
168
  return TermProc::takeword(term, pos, bs, be);
177
        return TermProc::takeword(term, pos, bs, be);
169
    }
178
    }
170
179
171
private:
180
private:
172
    const Rcl::StopList& m_stops;
181
    const Rcl::StopList& m_stops;
173
};
182
};
174
183
175
/** Handle common-gram generation: combine frequent terms with neighbours to
 *  shorten the positions lists for phrase searches.
 *  NOTE: This does not currently work because of bad interaction with the
 *  spans (e.g. john@domain.com) generation in textsplit. Not used, kept for
 *  testing only
 */
181
class TermProcCommongrams : public TermProc {
190
class TermProcCommongrams : public TermProc {
182
public:
191
public:
183
    /**
     * @param nxt next step in the pipeline.
     * @param stops the list of common terms. It is kept by reference, not
     *   copied, so it must stay alive for as long as this object is used.
     *
     * The "previous term" state is explicitly set to "nothing pending":
     * flush() tests m_prevsent, and it may be called before any term was
     * ever received.
     */
    TermProcCommongrams(TermProc *nxt, const Rcl::StopList& stops)
        : TermProc(nxt), m_stops(stops), m_onlygrams(false)
    {
        // Assigned in the body rather than in the initializer list because
        // the declaration order of these members is not relied upon here.
        m_prevstop = false;
        m_prevpos = 0;
        m_prevbs = 0;
        m_prevbe = 0;
        // Same value as flush() leaves behind: nothing is waiting.
        m_prevsent = true;
    }
187
196
188
    virtual bool takeword(const string& term, int pos, int bs, int be)
197
    virtual bool takeword(const string& term, int pos, int bs, int be)
189
    {
198
    {
190
  LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n", 
199
        LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
191
       pos, bs, be, term.c_str()));
200
                 pos, bs, be, term.c_str()));
192
  bool isstop = m_stops.isStop(term);
201
        bool isstop = m_stops.isStop(term);
193
  bool twogramemit = false;
202
        bool twogramemit = false;
194
203
195
  if (!m_prevterm.empty() && (m_prevstop || isstop)) {
204
        if (!m_prevterm.empty() && (m_prevstop || isstop)) {
196
      // create 2-gram. space unnecessary but improves
205
            // create 2-gram. space unnecessary but improves
197
      // the readability of queries
206
            // the readability of queries
198
      string twogram;
207
            string twogram;
199
      twogram.swap(m_prevterm);
208
            twogram.swap(m_prevterm);
200
      twogram.append(1, ' ');
209
            twogram.append(1, ' ');
201
      twogram += term;
210
            twogram += term;
202
      // When emitting a complex term we set the bps to 0. This may
211
            // When emitting a complex term we set the bps to 0. This may
203
      // be used by our clients
212
            // be used by our clients
204
      if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
213
            if (!TermProc::takeword(twogram, m_prevpos, 0, 0))
205
      return false;
214
                return false;
206
      twogramemit = true;
215
            twogramemit = true;
207
#if 0
216
#if 0
208
      if (m_stops.isStop(twogram)) {
217
            if (m_stops.isStop(twogram)) {
209
      firstword = twogram;
218
                firstword = twogram;
210
      isstop = false;
219
                isstop = false;
211
      }
220
            }
212
#endif
221
#endif
213
  }
222
        }
214
  
223
215
  m_prevterm = term;
224
        m_prevterm = term;
216
  m_prevstop = isstop;
225
        m_prevstop = isstop;
217
  m_prevpos = pos;
226
        m_prevpos = pos;
218
  m_prevsent = false;
227
        m_prevsent = false;
219
  m_prevbs = bs;
228
        m_prevbs = bs;
220
  m_prevbe = be;
229
        m_prevbe = be;
221
  // If flags allow, emit the bare term at the current pos.
230
        // If flags allow, emit the bare term at the current pos.
222
  if (!m_onlygrams || (!isstop && !twogramemit)) {
231
        if (!m_onlygrams || (!isstop && !twogramemit)) {
223
      if (!TermProc::takeword(term, pos, bs, be))
232
            if (!TermProc::takeword(term, pos, bs, be))
224
      return false;
233
                return false;
225
      m_prevsent = true;
234
            m_prevsent = true;
226
  } 
235
        }
227
236
228
  return true;
237
        return true;
229
    }
238
    }
230
239
231
    virtual bool flush()
240
    virtual bool flush()
232
    {
241
    {
233
  if (!m_prevsent && !m_prevterm.empty())
242
        if (!m_prevsent && !m_prevterm.empty())
234
      if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
243
            if (!TermProc::takeword(m_prevterm, m_prevpos, m_prevbs, m_prevbe))
235
      return false;
244
                return false;
236
      
245
237
  m_prevterm.clear();
246
        m_prevterm.clear();
238
  m_prevsent = true;
247
        m_prevsent = true;
239
  return TermProc::flush();
248
        return TermProc::flush();
240
    }
249
    }
241
    void onlygrams(bool on)
250
    void onlygrams(bool on)
242
    {
251
    {
243
  m_onlygrams = on;
252
        m_onlygrams = on;
244
    }
253
    }
245
private:
254
private:
246
    // The stoplist we're using
255
    // The stoplist we're using
247
    const Rcl::StopList& m_stops;
256
    const Rcl::StopList& m_stops;
248
    // Remembered data for the last processed term
257
    // Remembered data for the last processed term