Switch to unified view

a/src/rcldb/termproc.h b/src/rcldb/termproc.h
...
...
21
#include <string>
21
#include <string>
22
22
23
#include "textsplit.h"
23
#include "textsplit.h"
24
#include "stoplist.h"
24
#include "stoplist.h"
25
#include "smallut.h"
25
#include "smallut.h"
26
#include "utf8iter.h"
26
27
27
namespace Rcl {
28
namespace Rcl {
28
29
29
/**
30
/**
30
 * Termproc objects take term tokens as input and do something
31
 * Termproc objects take term tokens as input and do something
...
...
126
127
127
    virtual bool takeword(const string& itrm, int pos, int bs, int be)
128
    virtual bool takeword(const string& itrm, int pos, int bs, int be)
128
    {
129
    {
129
        m_totalterms++;
130
        m_totalterms++;
130
        string otrm;
131
        string otrm;
132
131
        if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
133
        if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
132
            LOGDEB("splitter::takeword: unac ["  << (itrm) << "] failed\n" );
134
            LOGDEB("splitter::takeword: unac [" << itrm << "] failed\n");
133
            m_unacerrors++;
135
            m_unacerrors++;
134
            // We don't generate a fatal error because of a bad term,
136
            // We don't generate a fatal error because of a bad term,
135
            // but one has to put the limit somewhere
137
            // but one has to put the limit somewhere
136
            if (m_unacerrors > 500 &&
138
            if (m_unacerrors > 500 &&
137
                    (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
139
                    (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
138
                // More than 1 error for every other term
140
                // More than 1 error for every other term
139
                LOGERR("splitter::takeword: too many unac errors "  << (m_unacerrors) << "/"  << (m_totalterms) << "\n" );
141
                LOGERR("splitter::takeword: too many unac errors " <<
142
                       m_unacerrors << "/"  << m_totalterms << "\n");
140
                return false;
143
                return false;
141
            }
144
            }
142
            return true;
145
            return true;
143
        }
146
        }
144
147
...
...
148
        // of diacritics ...)  The consequence is that a phrase
151
        // of diacritics ...)  The consequence is that a phrase
149
        // search won't work without additional slack.
152
        // search won't work without additional slack.
150
            return true;
153
            return true;
151
    }
154
    }
152
155
156
        // We should have a Japanese stemmer to handle this, but for
157
        // experimenting, let's do it here: remove 'prolonged sound
158
        // mark' and its halfwidth variant from the end of terms.
159
        if ((unsigned int)otrm[0] > 127) {
160
            Utf8Iter it(otrm);
161
            if (TextSplit::isKATAKANA(*it)) {
162
                Utf8Iter itprev = it;
163
                while (*it != (unsigned int)-1) {
164
                    itprev = it;
165
                    it++;
166
                }
167
                if (*itprev == 0x30fc || *itprev == 0xff70) {
168
                    otrm = otrm.substr(0, itprev.getBpos());
169
                }
170
            }
171
        }
172
        if (otrm.empty()) {
173
            return true;
174
        }
175
        
153
    // It may also occur that unac introduces spaces in the string
176
    // It may also occur that unac introduces spaces in the string
154
    // (when removing isolated accents, may happen for Greek
177
    // (when removing isolated accents, may happen for Greek
155
    // for example). This is a pathological situation. We
178
    // for example). This is a pathological situation. We
156
    // index all the resulting terms at the same pos because
179
    // index all the resulting terms at the same pos because
157
    // the surrounding code is not designed to handle a pos
180
    // the surrounding code is not designed to handle a pos