|
a/src/rcldb/stemdb.cpp |
|
b/src/rcldb/stemdb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.5 2006-10-09 16:37:08 dockes Exp $ (C) 2005 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.6 2006-12-19 12:11:21 dockes Exp $ (C) 2005 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
|
4 |
|
5 |
/**
|
5 |
/**
|
6 |
* Management of the auxiliary databases listing stems and their expansion
|
6 |
* Management of the auxiliary databases listing stems and their expansion
|
7 |
* terms
|
7 |
* terms
|
|
... |
|
... |
204 |
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
204 |
assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
|
205 |
wiper.do_it = false;
|
205 |
wiper.do_it = false;
|
206 |
return true;
|
206 |
return true;
|
207 |
}
|
207 |
}
|
208 |
|
208 |
|
|
|
209 |
/**
 * Build a single printable string from a list of strings.
 *
 * Each element is emitted as "[element] "; the space after the last
 * element is then removed, so the result looks like "[a] [b] [c]".
 * An empty input list yields an empty string.
 *
 * @param sl the strings to display
 * @return the bracketed, space-separated concatenation of the elements
 */
static string stringlistdisp(const list<string>& sl)
|
|
|
210 |
{
|
|
|
211 |
string s;
|
|
|
212 |
// Walk the list in order, appending one bracketed entry per element.
for (list<string>::const_iterator it = sl.begin(); it!= sl.end(); it++)
|
|
|
213 |
// The trailing space acts as the separator between entries.
s += "[" + *it + "] ";
|
|
|
214 |
// Only a non-empty result carries a trailing separator to strip; the
// check also keeps length()-1 from being computed on an empty string.
if (!s.empty())
|
|
|
215 |
// erase(pos) removes everything from pos to the end, here the last char.
s.erase(s.length()-1);
|
|
|
216 |
return s;
|
|
|
217 |
}
|
|
|
218 |
|
209 |
/**
|
219 |
/**
|
210 |
* Expand term to list of all terms which stem to the same term.
|
220 |
* Expand term to list of all terms which stem to the same term.
|
211 |
*/
|
221 |
*/
|
212 |
list<string> stemExpand(const string& dbdir, const string& lang,
|
222 |
bool stemExpand(const std::string& dbdir,
|
|
|
223 |
const std::string& lang,
|
213 |
const string& term)
|
224 |
const std::string& term,
|
|
|
225 |
list<string>& result)
|
214 |
{
|
226 |
{
|
215 |
list<string> explist;
|
|
|
216 |
try {
|
227 |
try {
|
217 |
Xapian::Stem stemmer(lang);
|
228 |
Xapian::Stem stemmer(lang);
|
218 |
string stem = stemmer.stem_word(term);
|
229 |
string stem = stemmer.stem_word(term);
|
219 |
LOGDEB(("stemExpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
|
230 |
LOGDEB(("stemExpand: [%s] stem-> [%s]\n", term.c_str(), stem.c_str()));
|
220 |
// Try to fetch the doc from the stem db
|
231 |
// Try to fetch the doc from the stem db
|
|
... |
|
... |
222 |
Xapian::Database sdb(stemdbdir);
|
233 |
Xapian::Database sdb(stemdbdir);
|
223 |
LOGDEB1(("stemExpand: %s lastdocid: %d\n",
|
234 |
LOGDEB1(("stemExpand: %s lastdocid: %d\n",
|
224 |
stemdbdir.c_str(), sdb.get_lastdocid()));
|
235 |
stemdbdir.c_str(), sdb.get_lastdocid()));
|
225 |
if (!sdb.term_exists(stem)) {
|
236 |
if (!sdb.term_exists(stem)) {
|
226 |
LOGDEB1(("Db::stemExpand: no term for %s\n", stem.c_str()));
|
237 |
LOGDEB1(("Db::stemExpand: no term for %s\n", stem.c_str()));
|
227 |
explist.push_back(term);
|
238 |
result.push_back(term);
|
228 |
return explist;
|
239 |
return true;
|
229 |
}
|
240 |
}
|
230 |
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
241 |
Xapian::PostingIterator did = sdb.postlist_begin(stem);
|
231 |
if (did == sdb.postlist_end(stem)) {
|
242 |
if (did == sdb.postlist_end(stem)) {
|
232 |
LOGDEB1(("stemExpand: no term(1) for %s\n",stem.c_str()));
|
243 |
LOGDEB1(("stemExpand: no term(1) for %s\n",stem.c_str()));
|
233 |
explist.push_back(term);
|
244 |
result.push_back(term);
|
234 |
return explist;
|
245 |
return true;
|
235 |
}
|
246 |
}
|
236 |
Xapian::Document doc = sdb.get_document(*did);
|
247 |
Xapian::Document doc = sdb.get_document(*did);
|
237 |
string data = doc.get_data();
|
248 |
string data = doc.get_data();
|
238 |
|
249 |
|
239 |
// Build expansion list from database data
|
250 |
// Build expansion list from database data
|
240 |
// No need for a conftree, but we need to massage the data a little
|
251 |
// No need for a conftree, but we need to massage the data a little
|
241 |
string::size_type pos = data.find_first_of("=");
|
252 |
string::size_type pos = data.find_first_of("=");
|
242 |
++pos;
|
253 |
++pos;
|
243 |
string::size_type pos1 = data.find_last_of("\n");
|
254 |
string::size_type pos1 = data.find_last_of("\n");
|
244 |
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
|
255 |
if (pos == string::npos || pos1 == string::npos ||pos1 <= pos) { // ??
|
245 |
explist.push_back(term);
|
256 |
result.push_back(term);
|
246 |
return explist;
|
257 |
return true;
|
247 |
}
|
258 |
}
|
248 |
stringToStrings(data.substr(pos, pos1-pos), explist);
|
259 |
stringToStrings(data.substr(pos, pos1-pos), result);
|
249 |
|
260 |
|
250 |
// If the user term itself is not in the list, add it.
|
261 |
// If the user term itself is not in the list, add it.
|
251 |
if (find(explist.begin(), explist.end(), term) == explist.end()) {
|
262 |
if (find(result.begin(), result.end(), term) == result.end()) {
|
252 |
explist.push_back(term);
|
263 |
result.push_back(term);
|
253 |
}
|
264 |
}
|
254 |
LOGDEB(("stemExpand: %s -> %s\n", stem.c_str(),
|
265 |
LOGDEB(("stemExpand: %s -> %s\n", stem.c_str(),
|
255 |
stringlistdisp(explist).c_str()));
|
266 |
stringlistdisp(result).c_str()));
|
256 |
} catch (...) {
|
267 |
} catch (...) {
|
257 |
LOGERR(("stemExpand: error accessing stem db. dbdir [%s] lang [%s]\n",
|
268 |
LOGERR(("stemExpand: error accessing stem db. dbdir [%s] lang [%s]\n",
|
258 |
dbdir.c_str(), lang.c_str()));
|
269 |
dbdir.c_str(), lang.c_str()));
|
259 |
explist.push_back(term);
|
270 |
result.push_back(term);
|
260 |
return explist;
|
271 |
return false;
|
261 |
}
|
272 |
}
|
262 |
return explist;
|
273 |
return true;
|
263 |
}
|
274 |
}
|
264 |
|
275 |
|
265 |
}
|
276 |
}
|
266 |
}
|
277 |
}
|