|
a/src/rcldb/termproc.h |
|
b/src/rcldb/termproc.h |
|
... |
|
... |
21 |
#include <string>
|
21 |
#include <string>
|
22 |
|
22 |
|
23 |
#include "textsplit.h"
|
23 |
#include "textsplit.h"
|
24 |
#include "stoplist.h"
|
24 |
#include "stoplist.h"
|
25 |
#include "smallut.h"
|
25 |
#include "smallut.h"
|
|
|
26 |
#include "utf8iter.h"
|
26 |
|
27 |
|
27 |
namespace Rcl {
|
28 |
namespace Rcl {
|
28 |
|
29 |
|
29 |
/**
|
30 |
/**
|
30 |
* Termproc objects take term tokens as input and do something
|
31 |
* Termproc objects take term tokens as input and do something
|
|
... |
|
... |
126 |
|
127 |
|
127 |
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
128 |
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
128 |
{
|
129 |
{
|
129 |
m_totalterms++;
|
130 |
m_totalterms++;
|
130 |
string otrm;
|
131 |
string otrm;
|
|
|
132 |
|
131 |
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
|
133 |
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
|
132 |
LOGDEB("splitter::takeword: unac [" << (itrm) << "] failed\n" );
|
134 |
LOGDEB("splitter::takeword: unac [" << itrm << "] failed\n");
|
133 |
m_unacerrors++;
|
135 |
m_unacerrors++;
|
134 |
// We don't generate a fatal error because of a bad term,
|
136 |
// We don't generate a fatal error because of a bad term,
|
135 |
// but one has to put the limit somewhere
|
137 |
// but one has to put the limit somewhere
|
136 |
if (m_unacerrors > 500 &&
|
138 |
if (m_unacerrors > 500 &&
|
137 |
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
139 |
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
138 |
// More than 1 error for every other term
|
140 |
// More than 1 error for every other term
|
139 |
LOGERR("splitter::takeword: too many unac errors " << (m_unacerrors) << "/" << (m_totalterms) << "\n" );
|
141 |
LOGERR("splitter::takeword: too many unac errors " <<
|
|
|
142 |
m_unacerrors << "/" << m_totalterms << "\n");
|
140 |
return false;
|
143 |
return false;
|
141 |
}
|
144 |
}
|
142 |
return true;
|
145 |
return true;
|
143 |
}
|
146 |
}
|
144 |
|
147 |
|
|
... |
|
... |
148 |
// of diacritics ...) The consequence is that a phrase
|
151 |
// of diacritics ...) The consequence is that a phrase
|
149 |
// search won't work without additional slack.
|
152 |
// search won't work without additional slack.
|
150 |
return true;
|
153 |
return true;
|
151 |
}
|
154 |
}
|
152 |
|
155 |
|
|
|
156 |
// We should have a Japanese stemmer to handle this, but for
|
|
|
157 |
// experimenting, let's do it here: remove 'prolonged sound
|
|
|
158 |
// mark' and its halfwidth variant from the end of terms.
|
|
|
159 |
if ((unsigned int)otrm[0] > 127) {
|
|
|
160 |
Utf8Iter it(otrm);
|
|
|
161 |
if (TextSplit::isKATAKANA(*it)) {
|
|
|
162 |
Utf8Iter itprev = it;
|
|
|
163 |
while (*it != (unsigned int)-1) {
|
|
|
164 |
itprev = it;
|
|
|
165 |
it++;
|
|
|
166 |
}
|
|
|
167 |
if (*itprev == 0x30fc || *itprev == 0xff70) {
|
|
|
168 |
otrm = otrm.substr(0, itprev.getBpos());
|
|
|
169 |
}
|
|
|
170 |
}
|
|
|
171 |
}
|
|
|
172 |
if (otrm.empty()) {
|
|
|
173 |
return true;
|
|
|
174 |
}
|
|
|
175 |
|
153 |
// It may also occur that unac introduces spaces in the string
|
176 |
// It may also occur that unac introduces spaces in the string
|
154 |
// (when removing isolated accents, may happen for Greek
|
177 |
// (when removing isolated accents, may happen for Greek
|
155 |
// for example). This is a pathological situation. We
|
178 |
// for example). This is a pathological situation. We
|
156 |
// index all the resulting terms at the same pos because
|
179 |
// index all the resulting terms at the same pos because
|
157 |
// the surrounding code is not designed to handle a pos
|
180 |
// the surrounding code is not designed to handle a pos
|