recoll / Code / Diff of /src/rcldb/stemdb.cpp

Diff of /src/rcldb/stemdb.cpp [776800] .. [d35c69]

Switch to unified view


...

/**
 * Management of the auxiliary databases listing stems and their expansion 
 * terms
 */

#include "autoconfig.h"

#include <unistd.h>

#include <algorithm>
#include <map>
#include <iostream>
using namespace std;

#include <xapian.h>

#include "stemdb.h"

#include "debuglog.h"
#include "smallut.h"




#include "synfamily.h"
#include "unacpp.h"
#include "rclconfig.h"




namespace Rcl {







































































































































/**
 * Expand for one or several languages
 */
bool StemDb::stemExpand(const std::string& langs, const std::string& term,

            vector<string>& result)
{
    vector<string> llangs;
    stringToStrings(langs, llangs);

    for (vector<string>::const_iterator it = llangs.begin();
     it != llangs.end(); it++) {
  SynTermTransStem stemmer(*it);
  XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
  (void)expander.synExpand(term, result);
    }

#ifndef RCL_INDEX_STRIPCHARS
    // Expand the unaccented stem
    if (!o_index_stripchars) {
  for (vector<string>::const_iterator it = llangs.begin();
       it != llangs.end(); it++) {
      SynTermTransStem stemmer(*it);
      XapComputableSynFamMember expander(getdb(), synFamStemUnac, 
                         *it, &stemmer);
      string unac;
      unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
      (void)expander.synExpand(unac, result);
  }
    }
#endif 

    if (result.empty())
  result.push_back(term);

    sort(result.begin(), result.end());
    vector<string>::iterator uit = unique(result.begin(), result.end());
    result.resize(uit - result.begin());
    LOGDEB0(("stemExpand:%s: %s ->  %s\n", langs.c_str(), term.c_str(),
       stringsToString(result).c_str()));
    return true;
}


}

	a/src/rcldb/stemdb.cpp		b/src/rcldb/stemdb.cpp
	...		...
17		17
18	/**	18	/**
19	* Management of the auxiliary databases listing stems and their expansion	19	* Management of the auxiliary databases listing stems and their expansion
20	* terms	20	* terms
21	*/	21	*/
		22
		23	#include "autoconfig.h"
		24
22	#include <unistd.h>	25	#include <unistd.h>
23		26
24	#include <algorithm>	27	#include <algorithm>
25	#include <map>	28	#include <map>
		29	#include <iostream>
		30	using namespace std;
26		31
27	#include <xapian.h>	32	#include <xapian.h>
28		33
29	#include "stemdb.h"	34	#include "stemdb.h"
30	#include "pathut.h"
31	#include "debuglog.h"	35	#include "debuglog.h"
32	#include "smallut.h"	36	#include "smallut.h"
33	#include "utf8iter.h"
34	#include "textsplit.h"
35	#include "rcldb.h"
36	#include "rcldb_p.h"
37	#include "synfamily.h"	37	#include "synfamily.h"
38	#include "unacpp.h"	38	#include "unacpp.h"
39		39	#include "rclconfig.h"
40	#include <iostream>
41
42	using namespace std;
43		40
44	namespace Rcl {	41	namespace Rcl {
45		42
46	// Fast raw detection of non-natural-language words: look for ascii
47	// chars which are not lowercase letters. Not too sure what islower()
48	// would do with 8 bit values, so not using it here. If we want to be
49	// more complete we'd need to go full utf-8
50	inline static bool p_notlowerascii(unsigned int c)
51	{
52	if (c < 'a' \|\| (c > 'z' && c < 128))
53	return true;
54	return false;
55	}
56
57	/**
58	* Create database of stem to parents associations for a given language.
59	*/
60	bool createExpansionDbs(Xapian::WritableDatabase& wdb,
61	const vector<string>& langs)
62	{
63	LOGDEB(("StemDb::createExpansionDbs\n"));
64	Chrono cron;
65
66	vector<XapWritableSynFamily> stemdbs;
67	for (unsigned int i = 0; i < langs.size(); i++) {
68	stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
69	stemdbs[i].deleteMember(langs[i]);
70	stemdbs[i].createMember(langs[i]);
71	stemdbs[i].setCurrentMemberName(langs[i]);
72	}
73
74	// We walk the list of all terms, and stem each. We skip terms which
75	// don't look like natural language.
76	// If the stem is not identical to the term, we add a synonym entry.
77	// Statistics
78	int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
79	int stemconst = 0; // Stem == term
80	int allsyns = 0; // Total number of entries created
81
82	string ermsg;
83	try {
84	vector<Xapian::Stem> stemmers;
85	for (unsigned int i = 0; i < langs.size(); i++) {
86	stemmers.push_back(Xapian::Stem(langs[i]));
87	}
88
89	for (Xapian::TermIterator it = wdb.allterms_begin();
90	it != wdb.allterms_end(); it++) {
91	// If the term has any non-lowercase 7bit char (that is,
92	// numbers, capitals and punctuation) dont stem.
93	string::iterator sit = (it).begin(), eit = sit + (it).length();
94	if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
95	++nostem;
96	LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
97	(it).c_str(), sit));
98	continue;
99	}
100
101	// Detect and skip CJK terms.
102	// We're still sending all other multibyte utf-8 chars to
103	// the stemmer, which is not too well defined for
104	// xapian<1.0 (very obsolete now), but seems to work
105	// anyway. There shouldn't be too many in any case because
106	// accents are stripped at this point.
107	// The effect of stripping accents on stemming is not good,
108	// (e.g: in french partimes -> partim, parti^mes -> part)
109	// but fixing the issue would be complicated.
110	Utf8Iter utfit(*it);
111	if (TextSplit::isCJK(*utfit)) {
112	// LOGDEB(("stemskipped: Skipping CJK\n"));
113	continue;
114	}
115
116	// Create stemming synonym for every lang
117	for (unsigned int i = 0; i < langs.size(); i++) {
118	string stem = stemmers[i](*it);
119	if (stem == *it) {
120	++stemconst;
121	} else {
122	stemdbs[i].addSynonym(stem, *it);
123	LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
124	(*it).c_str(), langs[i].c_str(), stem.c_str()));
125	++allsyns;
126	}
127	}
128
129	}
130	} XCATCHERROR(ermsg);
131	if (!ermsg.empty()) {
132	LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
133	return false;
134	}
135
136	LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
137	LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
138	nostem, stemconst, allsyns));
139	return true;
140	}
141
142	/**
143	* Expand term to list of all terms which stem to the same term, for one
144	* expansion language
145	*/
146	bool StemDb::expandOne(const std::string& lang,
147	const std::string& term,
148	vector<string>& result)
149	{
150	try {
151	Xapian::Stem stemmer(lang);
152	string stem = stemmer(term);
153	LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
154	lang.c_str(), term.c_str(), stem.c_str()));
155
156	if (!synExpand(lang, stem, result)) {
157	// ?
158	}
159
160	// If the user term or stem are not in the list, add them
161	if (find(result.begin(), result.end(), term) == result.end()) {
162	result.push_back(term);
163	}
164	if (find(result.begin(), result.end(), stem) == result.end()) {
165	result.push_back(stem);
166	}
167	LOGDEB0(("stemExpand:%s: %s -> %s\n", lang.c_str(), stem.c_str(),
168	stringsToString(result).c_str()));
169
170	} catch (...) {
171	LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
172	lang.c_str()));
173	result.push_back(term);
174	return false;
175	}
176
177	return true;
178	}
179
180	/**	43	/**
181	* Expand for one or several languages	44	* Expand for one or several languages
182	*/	45	*/
183	bool StemDb::stemExpand(const std::string& langs,	46	bool StemDb::stemExpand(const std::string& langs, const std::string& term,
184	const std::string& term,
185	vector<string>& result)	47	vector<string>& result)
186	{	48	{
187	vector<string> llangs;	49	vector<string> llangs;
188	stringToStrings(langs, llangs);	50	stringToStrings(langs, llangs);
		51
189	for (vector<string>::const_iterator it = llangs.begin();	52	for (vector<string>::const_iterator it = llangs.begin();
190	it != llangs.end(); it++) {	53	it != llangs.end(); it++) {
191	vector<string> oneexp;	54	SynTermTransStem stemmer(*it);
192	expandOne(*it, term, oneexp);	55	XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
193	result.insert(result.end(), oneexp.begin(), oneexp.end());	56	(void)expander.synExpand(term, result);
194	}	57	}
		58
		59	#ifndef RCL_INDEX_STRIPCHARS
		60	// Expand the unaccented stem
		61	if (!o_index_stripchars) {
		62	for (vector<string>::const_iterator it = llangs.begin();
		63	it != llangs.end(); it++) {
		64	SynTermTransStem stemmer(*it);
		65	XapComputableSynFamMember expander(getdb(), synFamStemUnac,
		66	*it, &stemmer);
		67	string unac;
		68	unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
		69	(void)expander.synExpand(unac, result);
		70	}
		71	}
		72	#endif
		73
		74	if (result.empty())
		75	result.push_back(term);
		76
195	sort(result.begin(), result.end());	77	sort(result.begin(), result.end());
196	unique(result.begin(), result.end());	78	vector<string>::iterator uit = unique(result.begin(), result.end());
		79	result.resize(uit - result.begin());
		80	LOGDEB0(("stemExpand:%s: %s -> %s\n", langs.c_str(), term.c_str(),
		81	stringsToString(result).c_str()));
197	return true;	82	return true;
198	}	83	}
199		84
200		85
201	}	86	}