Switch to unified view

a/src/rcldb/stemdb.cpp b/src/rcldb/stemdb.cpp
...
...
29
#include "stemdb.h"
29
#include "stemdb.h"
30
#include "wipedir.h"
30
#include "wipedir.h"
31
#include "pathut.h"
31
#include "pathut.h"
32
#include "debuglog.h"
32
#include "debuglog.h"
33
#include "smallut.h"
33
#include "smallut.h"
34
#include "utf8iter.h"
35
#include "textsplit.h"
34
36
35
using namespace std;
37
using namespace std;
36
38
37
namespace Rcl {
39
namespace Rcl {
38
namespace StemDb {
40
namespace StemDb {
...
...
137
    int stemmultiple = 0; // Count of stems with multiple derivatives
139
    int stemmultiple = 0; // Count of stems with multiple derivatives
138
    try {
140
    try {
139
        Xapian::Stem stemmer(lang);
141
        Xapian::Stem stemmer(lang);
140
        Xapian::TermIterator it;
142
        Xapian::TermIterator it;
141
        for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
143
        for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
142
            // Deciding if we try to stem the term. If it has any
144
            // Deciding if we try to stem the term. 
145
146
      // If it has any
143
            // non-lowercase 7bit char (that is, numbers, capitals and
147
            // non-lowercase 7bit char (that is, numbers, capitals and
144
            // punctuation) don't. We're still sending all multibyte
148
            // punctuation) don't.
145
            // utf-8 chars to the stemmer, which is not too well
146
            // defined for xapian < 1.0, but seems to work anyway. We don't
147
            // try to look for multibyte non alphabetic data.
148
            string::iterator sit = (*it).begin(), eit = sit + (*it).length();
149
            string::iterator sit = (*it).begin(), eit = sit + (*it).length();
149
            if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
150
            if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
150
                ++nostem;
151
                ++nostem;
151
                LOGDEB1(("stemskipped: [%s], because of 0x%x\n", 
152
                LOGDEB1(("stemskipped: [%s], because of 0x%x\n", 
152
                         (*it).c_str(), *sit));
153
                         (*it).c_str(), *sit));
153
                continue;
154
                continue;
154
            }
155
            }
156
157
      // Detect and skip CJK terms.
158
      // We're still sending all other multibyte utf-8 chars to
159
            // the stemmer, which is not too well defined for
160
            // xapian<1.0 (very obsolete now), but seems to work
161
            // anyway. There shouldn't be too many in any case because
162
            // accents are stripped at this point. Effect of stripping
163
            // accents on stemming unknown, hopefully none, there is
164
            // nothing we can do about it.
165
      Utf8Iter utfit(*it);
166
      if (TextSplit::isCJK(*utfit)) {
167
      // LOGDEB(("stemskipped: Skipping CJK\n"));
168
      continue;
169
      }
170
155
            string stem = stemmer(*it);
171
            string stem = stemmer(*it);
156
            LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
172
            LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
157
                     stem.c_str()));
173
                     stem.c_str()));
158
            if (stem == *it) {
174
            if (stem == *it) {
159
                ++stemconst;
175
                ++stemconst;