recoll / Code / Diff of /src/rcldb/stemdb.cpp

Diff of /src/rcldb/stemdb.cpp [420edd] .. [140425]

Switch to unified view


...
#include "stemdb.h"
#include "wipedir.h"
#include "pathut.h"
#include "debuglog.h"
#include "smallut.h"
#include "utf8iter.h"
#include "textsplit.h"

using namespace std;

namespace Rcl {
namespace StemDb {
...
    int stemmultiple = 0; // Count of stems with multiple derivatives
    try {
        Xapian::Stem stemmer(lang);
        Xapian::TermIterator it;
        for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
            // Decide whether we try to stem the term.
            //
            // If it has any non-lowercase 7-bit char (that is, numbers,
            // capitals and punctuation), don't.



            string::iterator sit = (*it).begin(), eit = sit + (*it).length();
            if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
                ++nostem;
                LOGDEB1(("stemskipped: [%s], because of 0x%x\n", 
                         (*it).c_str(), *sit));
                continue;
            }

            // Detect and skip CJK terms.
            // We're still sending all other multibyte UTF-8 chars to
            // the stemmer, which is not too well defined for
            // xapian<1.0 (very obsolete now), but seems to work
            // anyway. There shouldn't be too many in any case because
            // accents are stripped at this point. The effect of stripping
            // accents on stemming is unknown, hopefully none; there is
            // nothing we can do about it.
      Utf8Iter utfit(*it);
      if (TextSplit::isCJK(*utfit)) {
      // LOGDEB(("stemskipped: Skipping CJK\n"));
      continue;
      }

            string stem = stemmer(*it);
            LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
                     stem.c_str()));
            if (stem == *it) {
                ++stemconst;

	a/src/rcldb/stemdb.cpp		b/src/rcldb/stemdb.cpp
	...		...
29	#include "stemdb.h"	29	#include "stemdb.h"
30	#include "wipedir.h"	30	#include "wipedir.h"
31	#include "pathut.h"	31	#include "pathut.h"
32	#include "debuglog.h"	32	#include "debuglog.h"
33	#include "smallut.h"	33	#include "smallut.h"
		34	#include "utf8iter.h"
		35	#include "textsplit.h"
34		36
35	using namespace std;	37	using namespace std;
36		38
37	namespace Rcl {	39	namespace Rcl {
38	namespace StemDb {	40	namespace StemDb {
	...		...
137	int stemmultiple = 0; // Count of stems with multiple derivatives	139	int stemmultiple = 0; // Count of stems with multiple derivatives
138	try {	140	try {
139	Xapian::Stem stemmer(lang);	141	Xapian::Stem stemmer(lang);
140	Xapian::TermIterator it;	142	Xapian::TermIterator it;
141	for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {	143	for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
142	// Deciding if we try to stem the term. If it has any	144	// Deciding if we try to stem the term.
		145
		146	// If it has any
143	// non-lowercase 7bit char (that is, numbers, capitals and	147	// non-lowercase 7bit char (that is, numbers, capitals and
144	// punctuation) dont. We're still sending all multibyte	148	// punctuation) dont.
145	// utf-8 chars to the stemmer, which is not too well
146	// defined for xapian < 1.0, but seems to work anyway. We don't
147	// try to look for multibyte non alphabetic data.
148	string::iterator sit = (*it).begin(), eit = sit + (*it).length();	149	string::iterator sit = (*it).begin(), eit = sit + (*it).length();
149	if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {	150	if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
150	++nostem;	151	++nostem;
151	LOGDEB1(("stemskipped: [%s], because of 0x%x\n",	152	LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
152	(*it).c_str(), *sit));	153	(*it).c_str(), *sit));
153	continue;	154	continue;
154	}	155	}
		156
		157	// Detect and skip CJK terms.
		158	// We're still sending all other multibyte utf-8 chars to
		159	// the stemmer, which is not too well defined for
		160	// xapian<1.0 (very obsolete now), but seems to work
		161	// anyway. There shouldnt be too many in any case because
		162	// accents are stripped at this point. Effect of stripping
		163	// accents on stemming unknown, hopefuly none, there is
		164	// nothing we can do about it.
		165	Utf8Iter utfit(*it);
		166	if (TextSplit::isCJK(*utfit)) {
		167	// LOGDEB(("stemskipped: Skipping CJK\n"));
		168	continue;
		169	}
		170
155	string stem = stemmer(*it);	171	string stem = stemmer(*it);
156	LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),	172	LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
157	stem.c_str()));	173	stem.c_str()));
158	if (stem == *it) {	174	if (stem == *it) {
159	++stemconst;	175	++stemconst;