Parent: [8283ca] (diff)

Child: [6b3945] (diff)

Download this file

stemdb.cpp    278 lines (255 with data), 8.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
#ifndef lint
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.6 2006-12-19 12:11:21 dockes Exp $ (C) 2005 J.F.Dockes";
#endif
/**
* Management of the auxiliary databases listing stems and their expansion
* terms
*/
#include <unistd.h>
#include <algorithm>
#include <map>
#include <xapian.h>
#include <xapian/stem.h>
#include "stemdb.h"
#include "wipedir.h"
#include "pathut.h"
#include "debuglog.h"
#include "smallut.h"
using namespace std;
namespace Rcl {
namespace StemDb {
const static string stemdirstem = "stem_";
/// Compute name of stem db for given base database and language
static string stemdbname(const string& dbdir, const string& lang)
{
return path_cat(dbdir, stemdirstem + lang);
}
list<string> getLangs(const string& dbdir)
{
string pattern = stemdirstem + "*";
list<string> dirs = path_dirglob(dbdir, pattern);
for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
*it = path_basename(*it);
*it = it->substr(stemdirstem.length(), string::npos);
}
return dirs;
}
bool deleteDb(const string& dbdir, const string& lang)
{
string dir = stemdbname(dbdir, lang);
if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
return true;
return false;
}
// Autoclean/delete directory
class DirWiper {
public:
string dir;
bool do_it;
DirWiper(string d) : dir(d), do_it(true) {}
~DirWiper() {
if (do_it) {
wipedir(dir);
rmdir(dir.c_str());
}
}
};
inline static bool
p_notlowerascii(unsigned int c)
{
if (c < 'a' || (c > 'z' && c < 128))
return true;
return false;
}
/**
* Create database of stem to parents associations for a given language.
* We walk the list of all terms, stem them, and create another Xapian db
* with documents indexed by a single term (the stem), and with the list of
* parent terms in the document data.
*/
bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
{
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
// First build the in-memory stem database:
// We walk the list of all terms, and stem each.
// If the stem is identical to the term, no need to create an entry
// Else, we add an entry to the multimap.
// At the end, we only save stem-terms associations with several terms, the
// others are not useful
// Note: a map<string, list<string> > would probably be more efficient
multimap<string, string> assocs;
// Statistics
int nostem=0; // Dont even try: not-alphanum (incomplete for now)
int stemconst=0; // Stem == term
int stemdiff=0; // Count of all different stems
int stemmultiple = 0; // Count of stems with multiple derivatives
try {
Xapian::Stem stemmer(lang);
Xapian::TermIterator it;
for (it = xdb.allterms_begin();
it != xdb.allterms_end(); it++) {
// Deciding if we try to stem the term. If it has any
// non-lowercase 7bit char, dont. Note that
// as we are dealing with unaccented data, we are still
// processing most of western european languages (where
// most unaccented letters are ascii)
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
++nostem;
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
// (*it).c_str(), *sit));
continue;
}
string stem = stemmer.stem_word(*it);
//cerr << "word " << *it << " stem " << stem << endl;
if (stem == *it) {
++stemconst;
continue;
}
assocs.insert(pair<string,string>(stem, *it));
}
} catch (const Xapian::Error &e) {
LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
return false;
} catch (...) {
LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n",
lang.c_str()));
return false;
}
// Create xapian database for stem relations
string stemdbdir = stemdbname(dbdir, lang);
// We want to get rid of the db dir in case of error. This gets disarmed
// just before success return.
DirWiper wiper(stemdbdir);
const char *ermsg = "NOERROR";
Xapian::WritableDatabase sdb;
try {
sdb = Xapian::WritableDatabase(stemdbdir,
Xapian::DB_CREATE_OR_OVERWRITE);
} catch (const Xapian::Error &e) {
ermsg = e.get_msg().c_str();
} catch (const string &s) {
ermsg = s.c_str();
} catch (const char *s) {
ermsg = s;
} catch (...) {
ermsg = "Caught unknown exception";
}
if (ermsg != "NOERROR") {
LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n",
stemdbdir.c_str(), ermsg));
return false;
}
// Enter pseud-docs in db. Walk the multimap, only enter
// associations where there are several parent terms
string stem;
list<string> derivs;
for (multimap<string,string>::const_iterator it = assocs.begin();
it != assocs.end(); it++) {
if (stem == it->first) {
// Staying with same stem
derivs.push_back(it->second);
// cerr << " " << it->second << endl;
} else {
// Changing stems
++stemdiff;
// We need an entry even if there is only one derivative
// so that it is possible to search by entering the stem
// even if it doesnt exist as a term
if (derivs.size() >= 1) {
// Previous stem has multiple derivatives. Enter in db
++stemmultiple;
Xapian::Document newdocument;
newdocument.add_term(stem);
// The doc data is just parents=blank-separated-list
string record = "parents=";
for (list<string>::const_iterator it = derivs.begin();
it != derivs.end(); it++) {
record += *it + " ";
}
record += "\n";
LOGDEB1(("stemdocument data: %s\n", record.c_str()));
newdocument.set_data(record);
try {
sdb.replace_document(stem, newdocument);
} catch (...) {
LOGERR(("Db::createstemdb: replace failed\n"));
return false;
}
}
derivs.clear();
stem = it->first;
derivs.push_back(it->second);
// cerr << "\n" << stem << " " << it->second;
}
}
LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
wiper.do_it = false;
return true;
}
static string stringlistdisp(const list<string>& sl)
{
string s;
for (list<string>::const_iterator it = sl.begin(); it!= sl.end(); it++)
s += "[" + *it + "] ";
if (!s.empty())
s.erase(s.length()-1);
return s;
}
/**
* Expand term to list of all terms which stem to the same term.
*/
bool stemExpand(const std::string& dbdir,
const std::string& lang,
const std::string& term,
list<string>& result)
{
try {
Xapian::Stem stemmer(lang);
string stem = stemmer.stem_word(term);
LOGDEB(("stemExpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
// Try to fetch the doc from the stem db
string stemdbdir = stemdbname(dbdir, lang);
Xapian::Database sdb(stemdbdir);
LOGDEB1(("stemExpand: %s lastdocid: %d\n",
stemdbdir.c_str(), sdb.get_lastdocid()));
if (!sdb.term_exists(stem)) {
LOGDEB1(("Db::stemExpand: no term for %s\n", stem.c_str()));
result.push_back(term);
return true;
}
Xapian::PostingIterator did = sdb.postlist_begin(stem);
if (did == sdb.postlist_end(stem)) {
LOGDEB1(("stemExpand: no term(1) for %s\n",stem.c_str()));
result.push_back(term);
return true;
}
Xapian::Document doc = sdb.get_document(*did);
string data = doc.get_data();
// Build expansion list from database data
// No need for a conftree, but we need to massage the data a little
string::size_type pos = data.find_first_of("=");
++pos;
string::size_type pos1 = data.find_last_of("\n");
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
result.push_back(term);
return true;
}
stringToStrings(data.substr(pos, pos1-pos), result);
// If the user term itself is not in the list, add it.
if (find(result.begin(), result.end(), term) == result.end()) {
result.push_back(term);
}
LOGDEB(("stemExpand: %s -> %s\n", stem.c_str(),
stringlistdisp(result).c_str()));
} catch (...) {
LOGERR(("stemExpand: error accessing stem db. dbdir [%s] lang [%s]\n",
dbdir.c_str(), lang.c_str()));
result.push_back(term);
return false;
}
return true;
}
}
}