|
a/src/rcldb/termproc.h |
|
b/src/rcldb/termproc.h |
|
... |
|
... |
127 |
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
127 |
virtual bool takeword(const string& itrm, int pos, int bs, int be)
|
128 |
{
|
128 |
{
|
129 |
m_totalterms++;
|
129 |
m_totalterms++;
|
130 |
string otrm;
|
130 |
string otrm;
|
131 |
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
|
131 |
if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
|
132 |
LOGDEB(("splitter::takeword: unac [%s] failed\n", itrm.c_str()));
|
132 |
LOGDEB("splitter::takeword: unac [" << (itrm) << "] failed\n" );
|
133 |
m_unacerrors++;
|
133 |
m_unacerrors++;
|
134 |
// We don't generate a fatal error because of a bad term,
|
134 |
// We don't generate a fatal error because of a bad term,
|
135 |
// but one has to put the limit somewhere
|
135 |
// but one has to put the limit somewhere
|
136 |
if (m_unacerrors > 500 &&
|
136 |
if (m_unacerrors > 500 &&
|
137 |
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
137 |
(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
|
138 |
// More than 1 error for every other term
|
138 |
// More than 1 error for every other term
|
139 |
LOGERR(("splitter::takeword: too many unac errors %d/%d\n",
|
139 |
LOGERR("splitter::takeword: too many unac errors " << (m_unacerrors) << "/" << (m_totalterms) << "\n" );
|
140 |
m_unacerrors, m_totalterms));
|
|
|
141 |
return false;
|
140 |
return false;
|
142 |
}
|
141 |
}
|
143 |
return true;
|
142 |
return true;
|
144 |
}
|
143 |
}
|
145 |
|
144 |
|
|
... |
|
... |
225 |
{
|
224 |
{
|
226 |
}
|
225 |
}
|
227 |
|
226 |
|
228 |
virtual bool takeword(const string& term, int pos, int bs, int be)
|
227 |
virtual bool takeword(const string& term, int pos, int bs, int be)
|
229 |
{
|
228 |
{
|
230 |
LOGDEB1(("TermProcCom::takeword: pos %d %d %d [%s]\n",
|
229 |
LOGDEB1("TermProcCom::takeword: pos " << (pos) << " " << (bs) << " " << (be) << " [" << (term) << "]\n" );
|
231 |
pos, bs, be, term.c_str()));
|
|
|
232 |
bool isstop = m_stops.isStop(term);
|
230 |
bool isstop = m_stops.isStop(term);
|
233 |
bool twogramemit = false;
|
231 |
bool twogramemit = false;
|
234 |
|
232 |
|
235 |
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
233 |
if (!m_prevterm.empty() && (m_prevstop || isstop)) {
|
236 |
// create 2-gram. space unnecessary but improves
|
234 |
// create 2-gram. space unnecessary but improves
|
|
... |
|
... |
298 |
|
296 |
|
299 |
|
297 |
|
300 |
} // End namespace Rcl
|
298 |
} // End namespace Rcl
|
301 |
|
299 |
|
302 |
#endif /* _TERMPROC_H_INCLUDED_ */
|
300 |
#endif /* _TERMPROC_H_INCLUDED_ */
|
|
|
301 |
|