|
a/src/rcldb/stemdb.cpp |
|
b/src/rcldb/stemdb.cpp |
|
... |
|
... |
29 |
#include "stemdb.h"
|
29 |
#include "stemdb.h"
|
30 |
#include "wipedir.h"
|
30 |
#include "wipedir.h"
|
31 |
#include "pathut.h"
|
31 |
#include "pathut.h"
|
32 |
#include "debuglog.h"
|
32 |
#include "debuglog.h"
|
33 |
#include "smallut.h"
|
33 |
#include "smallut.h"
|
|
|
34 |
#include "utf8iter.h"
|
|
|
35 |
#include "textsplit.h"
|
34 |
|
36 |
|
35 |
using namespace std;
|
37 |
using namespace std;
|
36 |
|
38 |
|
37 |
namespace Rcl {
|
39 |
namespace Rcl {
|
38 |
namespace StemDb {
|
40 |
namespace StemDb {
|
|
... |
|
... |
137 |
int stemmultiple = 0; // Count of stems with multiple derivatives
|
139 |
int stemmultiple = 0; // Count of stems with multiple derivatives
|
138 |
try {
|
140 |
try {
|
139 |
Xapian::Stem stemmer(lang);
|
141 |
Xapian::Stem stemmer(lang);
|
140 |
Xapian::TermIterator it;
|
142 |
Xapian::TermIterator it;
|
141 |
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
|
143 |
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
|
142 |
// Deciding if we try to stem the term. If it has any
|
144 |
// Deciding if we try to stem the term.
|
|
|
145 |
|
|
|
146 |
// If it has any
|
143 |
// non-lowercase 7bit char (that is, numbers, capitals and
|
147 |
// non-lowercase 7bit char (that is, numbers, capitals and
|
144 |
// punctuation) don't. We're still sending all multibyte
|
148 |
// punctuation) don't.
|
145 |
// utf-8 chars to the stemmer, which is not too well
|
|
|
146 |
// defined for xapian < 1.0, but seems to work anyway. We don't
|
|
|
147 |
// try to look for multibyte non alphabetic data.
|
|
|
148 |
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
149 |
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
149 |
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
150 |
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
150 |
++nostem;
|
151 |
++nostem;
|
151 |
LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
|
152 |
LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
|
152 |
(*it).c_str(), *sit));
|
153 |
(*it).c_str(), *sit));
|
153 |
continue;
|
154 |
continue;
|
154 |
}
|
155 |
}
|
|
|
156 |
|
|
|
157 |
// Detect and skip CJK terms.
|
|
|
158 |
// We're still sending all other multibyte utf-8 chars to
|
|
|
159 |
// the stemmer, which is not too well defined for
|
|
|
160 |
// xapian<1.0 (very obsolete now), but seems to work
|
|
|
161 |
// anyway. There shouldn't be too many in any case because
|
|
|
162 |
// accents are stripped at this point. Effect of stripping
|
|
|
163 |
// accents on stemming unknown, hopefully none, there is
|
|
|
164 |
// nothing we can do about it.
|
|
|
165 |
Utf8Iter utfit(*it);
|
|
|
166 |
if (TextSplit::isCJK(*utfit)) {
|
|
|
167 |
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
|
|
168 |
continue;
|
|
|
169 |
}
|
|
|
170 |
|
155 |
string stem = stemmer(*it);
|
171 |
string stem = stemmer(*it);
|
156 |
LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
|
172 |
LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
|
157 |
stem.c_str()));
|
173 |
stem.c_str()));
|
158 |
if (stem == *it) {
|
174 |
if (stem == *it) {
|
159 |
++stemconst;
|
175 |
++stemconst;
|