recoll / Code / Diff of /src/rcldb/termproc.h

Diff of /src/rcldb/termproc.h [8f77b9] .. [d14ecc]

Switch to unified view


...
#include <string>

#include "textsplit.h"
#include "stoplist.h"
#include "smallut.h"
#include "utf8iter.h"

namespace Rcl {

/**
 * Termproc objects take term tokens as input and do something
...

    virtual bool takeword(const string& itrm, int pos, int bs, int be)
    {
        m_totalterms++;
        string otrm;

        if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
            LOGDEB("splitter::takeword: unac [" << itrm << "] failed\n");
            m_unacerrors++;
            // We don't generate a fatal error because of a bad term,
            // but one has to put the limit somewhere
            if (m_unacerrors > 500 &&
                    (double(m_totalterms) / double(m_unacerrors)) < 2.0) {
                // More than 1 error for every other term
                LOGERR("splitter::takeword: too many unac errors " <<
                       m_unacerrors << "/"  << m_totalterms << "\n");
                return false;
            }
            return true;
        }

...
        // of diacritics ...)  The consequence is that a phrase
        // search won't work without additional slack.
            return true;
    }

        // We should have a Japanese stemmer to handle this, but for
        // experimenting, let's do it here: remove 'prolonged sound
        // mark' and its halfwidth variant from the end of terms.
        if ((unsigned int)otrm[0] > 127) {
            Utf8Iter it(otrm);
            if (TextSplit::isKATAKANA(*it)) {
                Utf8Iter itprev = it;
                while (*it != (unsigned int)-1) {
                    itprev = it;
                    it++;
                }
                if (*itprev == 0x30fc || *itprev == 0xff70) {
                    otrm = otrm.substr(0, itprev.getBpos());
                }
            }
        }
        if (otrm.empty()) {
            return true;
        }
        
    // It may also occur that unac introduces spaces in the string
    // (when removing isolated accents, may happen for Greek
    // for example). This is a pathological situation. We
    // index all the resulting terms at the same pos because
    // the surrounding code is not designed to handle a pos

	a/src/rcldb/termproc.h		b/src/rcldb/termproc.h
	...		...
21	#include <string>	21	#include <string>
22		22
23	#include "textsplit.h"	23	#include "textsplit.h"
24	#include "stoplist.h"	24	#include "stoplist.h"
25	#include "smallut.h"	25	#include "smallut.h"
		26	#include "utf8iter.h"
26		27
27	namespace Rcl {	28	namespace Rcl {
28		29
29	/**	30	/**
30	* Termproc objects take term tokens as input and do something	31	* Termproc objects take term tokens as input and do something
	...		...
126		127
127	virtual bool takeword(const string& itrm, int pos, int bs, int be)	128	virtual bool takeword(const string& itrm, int pos, int bs, int be)
128	{	129	{
129	m_totalterms++;	130	m_totalterms++;
130	string otrm;	131	string otrm;
		132
131	if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {	133	if (!unacmaybefold(itrm, otrm, "UTF-8", UNACOP_UNACFOLD)) {
132	LOGDEB("splitter::takeword: unac [" << (itrm) << "] failed\n" );	134	LOGDEB("splitter::takeword: unac [" << itrm << "] failed\n");
133	m_unacerrors++;	135	m_unacerrors++;
134	// We don't generate a fatal error because of a bad term,	136	// We don't generate a fatal error because of a bad term,
135	// but one has to put the limit somewhere	137	// but one has to put the limit somewhere
136	if (m_unacerrors > 500 &&	138	if (m_unacerrors > 500 &&
137	(double(m_totalterms) / double(m_unacerrors)) < 2.0) {	139	(double(m_totalterms) / double(m_unacerrors)) < 2.0) {
138	// More than 1 error for every other term	140	// More than 1 error for every other term
139	LOGERR("splitter::takeword: too many unac errors " << (m_unacerrors) << "/" << (m_totalterms) << "\n" );	141	LOGERR("splitter::takeword: too many unac errors " <<
		142	m_unacerrors << "/" << m_totalterms << "\n");
140	return false;	143	return false;
141	}	144	}
142	return true;	145	return true;
143	}	146	}
144		147
	...		...
148	// of diacritics ...) The consequence is that a phrase	151	// of diacritics ...) The consequence is that a phrase
149	// search won't work without addional slack.	152	// search won't work without addional slack.
150	return true;	153	return true;
151	}	154	}
152		155
		156	// We should have a Japanese stemmer to handle this, but for
		157	// experimenting, let's do it here: remove 'prolounged sound
		158	// mark' and its halfwidth variant from the end of terms.
		159	if ((unsigned int)otrm[0] > 127) {
		160	Utf8Iter it(otrm);
		161	if (TextSplit::isKATAKANA(*it)) {
		162	Utf8Iter itprev = it;
		163	while (*it != (unsigned int)-1) {
		164	itprev = it;
		165	it++;
		166	}
		167	if (*itprev == 0x30fc || *itprev == 0xff70) {
		168	otrm = otrm.substr(0, itprev.getBpos());
		169	}
		170	}
		171	}
		172	if (otrm.empty()) {
		173	return true;
		174	}
		175
153	// It may also occur that unac introduces spaces in the string	176	// It may also occur that unac introduces spaces in the string
154	// (when removing isolated accents, may happen for Greek	177	// (when removing isolated accents, may happen for Greek
155	// for example). This is a pathological situation. We	178	// for example). This is a pathological situation. We
156	// index all the resulting terms at the same pos because	179	// index all the resulting terms at the same pos because
157	// the surrounding code is not designed to handle a pos	180	// the surrounding code is not designed to handle a pos