|
a/src/rcldb/stemdb.cpp |
|
b/src/rcldb/stemdb.cpp |
|
... |
|
... |
25 |
#include <map>
|
25 |
#include <map>
|
26 |
|
26 |
|
27 |
#include <xapian.h>
|
27 |
#include <xapian.h>
|
28 |
|
28 |
|
29 |
#include "stemdb.h"
|
29 |
#include "stemdb.h"
|
30 |
#include "wipedir.h"
|
|
|
31 |
#include "pathut.h"
|
30 |
#include "pathut.h"
|
32 |
#include "debuglog.h"
|
31 |
#include "debuglog.h"
|
33 |
#include "smallut.h"
|
32 |
#include "smallut.h"
|
34 |
#include "utf8iter.h"
|
33 |
#include "utf8iter.h"
|
35 |
#include "textsplit.h"
|
34 |
#include "textsplit.h"
|
|
|
35 |
#include "rcldb.h"
|
|
|
36 |
#include "rcldb_p.h"
|
|
|
37 |
#include "synfamily.h"
|
|
|
38 |
|
|
|
39 |
#include <iostream>
|
36 |
|
40 |
|
37 |
using namespace std;
|
41 |
using namespace std;
|
38 |
|
42 |
|
39 |
namespace Rcl {
|
43 |
namespace Rcl {
|
40 |
namespace StemDb {
|
44 |
namespace StemDb {
|
41 |
|
45 |
|
42 |
|
46 |
|
43 |
static const string cstr_stemdirstem = "stem_";
|
47 |
vector<string> getLangs(Xapian::Database& xdb)
|
44 |
|
|
|
45 |
/// Compute name of stem db for given base database and language
|
|
|
46 |
static string stemdbname(const string& dbdir, const string& lang)
|
|
|
47 |
{
|
48 |
{
|
48 |
return path_cat(dbdir, cstr_stemdirstem + lang);
|
49 |
XapSynFamily fam(xdb, synprefStem);
|
49 |
}
|
50 |
vector<string> langs;
|
50 |
|
51 |
(void)fam.getMembers(langs);
|
51 |
vector<string> getLangs(const string& dbdir)
|
|
|
52 |
{
|
|
|
53 |
string pattern = cstr_stemdirstem + "*";
|
|
|
54 |
vector<string> dirs = path_dirglob(dbdir, pattern);
|
|
|
55 |
for (vector<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
|
|
|
56 |
*it = path_basename(*it);
|
|
|
57 |
*it = it->substr(cstr_stemdirstem.length(), string::npos);
|
|
|
58 |
}
|
|
|
59 |
return dirs;
|
|
|
60 |
}
|
|
|
61 |
|
|
|
62 |
bool deleteDb(const string& dbdir, const string& lang)
|
|
|
63 |
{
|
|
|
64 |
string dir = stemdbname(dbdir, lang);
|
|
|
65 |
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
|
|
|
66 |
return true;
|
|
|
67 |
return false;
|
52 |
return langs;
|
68 |
}
|
|
|
69 |
|
|
|
70 |
// Autoclean/delete directory
|
|
|
71 |
class DirWiper {
|
|
|
72 |
public:
|
|
|
73 |
string dir;
|
|
|
74 |
bool do_it;
|
|
|
75 |
DirWiper(string d) : dir(d), do_it(true) {}
|
|
|
76 |
~DirWiper() {
|
|
|
77 |
if (do_it) {
|
|
|
78 |
wipedir(dir);
|
|
|
79 |
rmdir(dir.c_str());
|
|
|
80 |
}
|
53 |
}
|
81 |
}
|
54 |
|
|
|
55 |
bool deleteDb(Xapian::WritableDatabase& xdb, const string& lang)
|
|
|
56 |
{
|
|
|
57 |
XapWritableSynFamily fam(xdb, synprefStem);
|
|
|
58 |
return fam.deleteMember(lang);
|
82 |
};
|
59 |
}
|
83 |
|
60 |
|
84 |
inline static bool
|
61 |
inline static bool
|
85 |
p_notlowerascii(unsigned int c)
|
62 |
p_notlowerascii(unsigned int c)
|
86 |
{
|
63 |
{
|
87 |
if (c < 'a' || (c > 'z' && c < 128))
|
64 |
if (c < 'a' || (c > 'z' && c < 128))
|
88 |
return true;
|
65 |
return true;
|
89 |
return false;
|
66 |
return false;
|
90 |
}
|
67 |
}
|
91 |
|
|
|
92 |
static bool addAssoc(Xapian::WritableDatabase &sdb, const string& stem,
|
|
|
93 |
const vector<string>& derivs)
|
|
|
94 |
{
|
|
|
95 |
Xapian::Document newdocument;
|
|
|
96 |
newdocument.add_term(stem);
|
|
|
97 |
// The doc data is just parents=blank-separated-list
|
|
|
98 |
string record = "parents=";
|
|
|
99 |
for (vector<string>::const_iterator it = derivs.begin();
|
|
|
100 |
it != derivs.end(); it++) {
|
|
|
101 |
record += *it + " ";
|
|
|
102 |
}
|
|
|
103 |
record += "\n";
|
|
|
104 |
LOGDEB2(("createStemDb: stmdoc data: [%s]\n", record.c_str()));
|
|
|
105 |
newdocument.set_data(record);
|
|
|
106 |
try {
|
|
|
107 |
sdb.replace_document(stem, newdocument);
|
|
|
108 |
} catch (...) {
|
|
|
109 |
LOGERR(("Db::createstemdb(addAssoc): replace failed\n"));
|
|
|
110 |
return false;
|
|
|
111 |
}
|
|
|
112 |
return true;
|
|
|
113 |
}
|
|
|
114 |
|
|
|
115 |
|
68 |
|
116 |
/**
|
69 |
/**
|
117 |
* Create database of stem to parents associations for a given language.
|
70 |
* Create database of stem to parents associations for a given language.
|
118 |
* We walk the list of all terms, stem them, and create another Xapian db
|
71 |
* We walk the list of all terms, stem them, and create another Xapian db
|
119 |
* with documents indexed by a single term (the stem), and with the list of
|
72 |
* with documents indexed by a single term (the stem), and with the list of
|
120 |
* parent terms in the document data.
|
73 |
* parent terms in the document data.
|
121 |
*/
|
74 |
*/
|
122 |
bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
|
75 |
bool createDb(Xapian::WritableDatabase& xdb, const string& lang)
|
123 |
{
|
76 |
{
|
124 |
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
77 |
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
125 |
Chrono cron;
|
78 |
Chrono cron;
|
126 |
|
79 |
|
127 |
// First build the in-memory stem database:
|
80 |
// First build the in-memory stem database:
|
|
... |
|
... |
134 |
map<string, vector<string> > assocs;
|
87 |
map<string, vector<string> > assocs;
|
135 |
// Statistics
|
88 |
// Statistics
|
136 |
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
89 |
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
|
137 |
int stemconst=0; // Stem == term
|
90 |
int stemconst=0; // Stem == term
|
138 |
int stemmultiple = 0; // Count of stems with multiple derivatives
|
91 |
int stemmultiple = 0; // Count of stems with multiple derivatives
|
|
|
92 |
string ermsg;
|
139 |
try {
|
93 |
try {
|
140 |
Xapian::Stem stemmer(lang);
|
94 |
Xapian::Stem stemmer(lang);
|
141 |
Xapian::TermIterator it;
|
95 |
Xapian::TermIterator it;
|
142 |
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
|
96 |
for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
|
143 |
// If the term has any non-lowercase 7bit char (that is,
|
97 |
// If the term has any non-lowercase 7bit char (that is,
|
|
... |
|
... |
172 |
++stemconst;
|
126 |
++stemconst;
|
173 |
continue;
|
127 |
continue;
|
174 |
}
|
128 |
}
|
175 |
assocs[stem].push_back(*it);
|
129 |
assocs[stem].push_back(*it);
|
176 |
}
|
130 |
}
|
177 |
} catch (const Xapian::Error &e) {
|
131 |
} XCATCHERROR(ermsg);
|
|
|
132 |
if (!ermsg.empty()) {
|
178 |
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
|
133 |
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
179 |
return false;
|
134 |
return false;
|
180 |
} catch (...) {
|
|
|
181 |
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
|
|
|
182 |
lang.c_str()));
|
|
|
183 |
return false;
|
|
|
184 |
}
|
135 |
}
|
|
|
136 |
|
185 |
LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n",
|
137 |
LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n",
|
186 |
lang.c_str(), cron.secs()));
|
138 |
lang.c_str(), cron.secs()));
|
187 |
|
139 |
|
188 |
// Create xapian database for stem relations
|
140 |
XapWritableSynFamily fam(xdb, synprefStem);
|
189 |
string stemdbdir = stemdbname(dbdir, lang);
|
141 |
fam.createMember(lang);
|
190 |
// We want to get rid of the db dir in case of error. This gets disarmed
|
|
|
191 |
// just before success return.
|
|
|
192 |
DirWiper wiper(stemdbdir);
|
|
|
193 |
string ermsg;
|
|
|
194 |
Xapian::WritableDatabase sdb;
|
|
|
195 |
try {
|
|
|
196 |
sdb = Xapian::WritableDatabase(stemdbdir,
|
|
|
197 |
Xapian::DB_CREATE_OR_OVERWRITE);
|
|
|
198 |
} catch (const Xapian::Error &e) {
|
|
|
199 |
ermsg = e.get_msg();
|
|
|
200 |
} catch (const string &s) {
|
|
|
201 |
ermsg = s;
|
|
|
202 |
} catch (const char *s) {
|
|
|
203 |
ermsg = s;
|
|
|
204 |
} catch (...) {
|
|
|
205 |
ermsg = "Caught unknown exception";
|
|
|
206 |
}
|
|
|
207 |
if (!ermsg.empty()) {
|
|
|
208 |
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
|
|
|
209 |
stemdbdir.c_str(), ermsg.c_str()));
|
|
|
210 |
return false;
|
|
|
211 |
}
|
|
|
212 |
|
142 |
|
213 |
// Enter pseud-docs in db by walking the map.
|
|
|
214 |
for (map<string, vector<string> >::const_iterator it = assocs.begin();
|
143 |
for (map<string, vector<string> >::const_iterator it = assocs.begin();
|
215 |
it != assocs.end(); it++) {
|
144 |
it != assocs.end(); it++) {
|
216 |
LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
|
145 |
LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
|
217 |
// We need an entry even if there is only one derivative
|
146 |
// We need an entry even if there is only one derivative
|
218 |
// so that it is possible to search by entering the stem
|
147 |
// so that it is possible to search by entering the stem
|
219 |
// even if it doesnt exist as a term
|
148 |
// even if it doesnt exist as a term
|
220 |
if (it->second.size() > 1)
|
149 |
if (it->second.size() > 1)
|
221 |
++stemmultiple;
|
150 |
++stemmultiple;
|
222 |
|
|
|
223 |
if (!addAssoc(sdb, it->first, it->second)) {
|
151 |
if (!fam.addSynonyms(lang, it->first, it->second)) {
|
224 |
return false;
|
152 |
return false;
|
225 |
}
|
153 |
}
|
226 |
}
|
154 |
}
|
227 |
|
155 |
|
228 |
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
|
156 |
LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n",
|
229 |
lang.c_str(), cron.secs()));
|
157 |
lang.c_str(), cron.secs()));
|
230 |
LOGDEB(("Stem map size: %d mult %d const %d no %d \n",
|
158 |
LOGDEB(("Stem map size: %d mult %d const %d no %d \n",
|
231 |
assocs.size(), stemmultiple, stemconst, nostem));
|
159 |
assocs.size(), stemmultiple, stemconst, nostem));
|
232 |
wiper.do_it = false;
|
160 |
fam.listMap(lang);
|
233 |
return true;
|
161 |
return true;
|
234 |
}
|
162 |
}
|
235 |
|
163 |
|
236 |
static string stringlistdisp(const vector<string>& sl)
|
164 |
static string stringlistdisp(const vector<string>& sl)
|
237 |
{
|
165 |
{
|
|
... |
|
... |
245 |
|
173 |
|
246 |
/**
|
174 |
/**
|
247 |
* Expand term to list of all terms which stem to the same term, for one
|
175 |
* Expand term to list of all terms which stem to the same term, for one
|
248 |
* expansion language
|
176 |
* expansion language
|
249 |
*/
|
177 |
*/
|
250 |
static bool stemExpandOne(const std::string& dbdir,
|
178 |
static bool stemExpandOne(Xapian::Database& xdb,
|
251 |
const std::string& lang,
|
179 |
const std::string& lang,
|
252 |
const std::string& term,
|
180 |
const std::string& term,
|
253 |
vector<string>& result)
|
181 |
vector<string>& result)
|
254 |
{
|
182 |
{
|
255 |
try {
|
183 |
try {
|
256 |
Xapian::Stem stemmer(lang);
|
184 |
Xapian::Stem stemmer(lang);
|
257 |
string stem = stemmer(term);
|
185 |
string stem = stemmer(term);
|
258 |
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
|
186 |
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
|
259 |
lang.c_str(), term.c_str(), stem.c_str()));
|
187 |
lang.c_str(), term.c_str(), stem.c_str()));
|
260 |
|
188 |
|
261 |
// Open stem database
|
189 |
XapSynFamily fam(xdb, synprefStem);
|
262 |
string stemdbdir = stemdbname(dbdir, lang);
|
190 |
if (!fam.synExpand(lang, stem, result)) {
|
263 |
Xapian::Database sdb(stemdbdir);
|
191 |
// ?
|
264 |
LOGDEB0(("stemExpand: %s lastdocid: %d\n",
|
|
|
265 |
stemdbdir.c_str(), sdb.get_lastdocid()));
|
|
|
266 |
|
|
|
267 |
// Try to fetch the doc from the stem db
|
|
|
268 |
if (!sdb.term_exists(stem)) {
|
|
|
269 |
LOGDEB0(("Db::stemExpand: no term for %s\n", stem.c_str()));
|
|
|
270 |
} else {
|
|
|
271 |
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
|
|
272 |
if (did == sdb.postlist_end(stem)) {
|
|
|
273 |
LOGDEB0(("stemExpand: no term(1) for %s\n",stem.c_str()));
|
|
|
274 |
} else {
|
|
|
275 |
Xapian::Document doc = sdb.get_document(*did);
|
|
|
276 |
string data = doc.get_data();
|
|
|
277 |
|
|
|
278 |
// Build expansion list from database data No need for
|
|
|
279 |
// a conftree, but we need to massage the data a
|
|
|
280 |
// little
|
|
|
281 |
string::size_type pos = data.find('=');
|
|
|
282 |
string::size_type pos1 = data.rfind('\n');
|
|
|
283 |
if (pos == string::npos || pos1 == string::npos ||
|
|
|
284 |
pos1 <= pos+1) {
|
|
|
285 |
LOGERR(("stemExpand: bad data in db: [%s]\n",
|
|
|
286 |
data.c_str()));
|
|
|
287 |
} else {
|
|
|
288 |
++pos;
|
|
|
289 |
stringToStrings(data.substr(pos, pos1-pos), result);
|
|
|
290 |
}
|
|
|
291 |
}
|
|
|
292 |
}
|
192 |
}
|
293 |
|
193 |
|
294 |
// If the user term or stem are not in the list, add them
|
194 |
// If the user term or stem are not in the list, add them
|
295 |
if (find(result.begin(), result.end(), term) == result.end()) {
|
195 |
if (find(result.begin(), result.end(), term) == result.end()) {
|
296 |
result.push_back(term);
|
196 |
result.push_back(term);
|
|
... |
|
... |
300 |
}
|
200 |
}
|
301 |
LOGDEB0(("stemExpand:%s: %s -> %s\n", lang.c_str(), stem.c_str(),
|
201 |
LOGDEB0(("stemExpand:%s: %s -> %s\n", lang.c_str(), stem.c_str(),
|
302 |
stringlistdisp(result).c_str()));
|
202 |
stringlistdisp(result).c_str()));
|
303 |
|
203 |
|
304 |
} catch (...) {
|
204 |
} catch (...) {
|
305 |
LOGERR(("stemExpand: error accessing stem db. dbdir [%s] lang [%s]\n",
|
205 |
LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
|
306 |
dbdir.c_str(), lang.c_str()));
|
206 |
lang.c_str()));
|
307 |
result.push_back(term);
|
207 |
result.push_back(term);
|
308 |
return false;
|
208 |
return false;
|
309 |
}
|
209 |
}
|
310 |
|
210 |
|
311 |
return true;
|
211 |
return true;
|
|
... |
|
... |
313 |
|
213 |
|
314 |
/**
|
214 |
/**
|
315 |
* Expand term to list of all terms which stem to the same term, add the
|
215 |
* Expand term to list of all terms which stem to the same term, add the
|
316 |
* expansion sets for possibly multiple expansion languages
|
216 |
* expansion sets for possibly multiple expansion languages
|
317 |
*/
|
217 |
*/
|
318 |
bool stemExpand(const std::string& dbdir,
|
218 |
bool stemExpand(Xapian::Database& xdb,
|
319 |
const std::string& langs,
|
219 |
const std::string& langs,
|
320 |
const std::string& term,
|
220 |
const std::string& term,
|
321 |
vector<string>& result)
|
221 |
vector<string>& result)
|
322 |
{
|
222 |
{
|
323 |
|
|
|
324 |
vector<string> llangs;
|
223 |
vector<string> llangs;
|
325 |
stringToStrings(langs, llangs);
|
224 |
stringToStrings(langs, llangs);
|
326 |
for (vector<string>::const_iterator it = llangs.begin();
|
225 |
for (vector<string>::const_iterator it = llangs.begin();
|
327 |
it != llangs.end(); it++) {
|
226 |
it != llangs.end(); it++) {
|
328 |
vector<string> oneexp;
|
227 |
vector<string> oneexp;
|
329 |
stemExpandOne(dbdir, *it, term, oneexp);
|
228 |
stemExpandOne(xdb, *it, term, oneexp);
|
330 |
result.insert(result.end(), oneexp.begin(), oneexp.end());
|
229 |
result.insert(result.end(), oneexp.begin(), oneexp.end());
|
331 |
}
|
230 |
}
|
332 |
sort(result.begin(), result.end());
|
231 |
sort(result.begin(), result.end());
|
333 |
unique(result.begin(), result.end());
|
232 |
unique(result.begin(), result.end());
|
334 |
return true;
|
233 |
return true;
|