|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.100 2006-12-07 13:24:19 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.101 2006-12-19 12:11:21 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
1283 |
sdata->setDescription(d);
|
1283 |
sdata->setDescription(d);
|
1284 |
LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
|
1284 |
LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
|
1285 |
return true;
|
1285 |
return true;
|
1286 |
}
|
1286 |
}
|
1287 |
|
1287 |
|
|
|
1288 |
class TermMatchCmpByWcf {
|
|
|
1289 |
public:
|
|
|
1290 |
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
|
1291 |
return r.wcf - l.wcf < 0;
|
|
|
1292 |
}
|
|
|
1293 |
};
|
|
|
1294 |
class TermMatchCmpByTerm {
|
|
|
1295 |
public:
|
|
|
1296 |
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
|
1297 |
return l.term.compare(r.term) > 0;
|
|
|
1298 |
}
|
|
|
1299 |
};
|
|
|
1300 |
class TermMatchTermEqual {
|
|
|
1301 |
public:
|
|
|
1302 |
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
|
1303 |
return !l.term.compare(r.term);
|
|
|
1304 |
}
|
|
|
1305 |
};
|
|
|
1306 |
|
|
|
1307 |
bool Db::stemExpand(const string &lang, const string &term,
|
|
|
1308 |
list<TermMatchEntry>& result, int max)
|
|
|
1309 |
{
|
|
|
1310 |
list<string> dirs = m_extraDbs;
|
|
|
1311 |
dirs.push_front(m_basedir);
|
|
|
1312 |
for (list<string>::iterator it = dirs.begin();
|
|
|
1313 |
it != dirs.end(); it++) {
|
|
|
1314 |
list<string> more;
|
|
|
1315 |
StemDb::stemExpand(*it, lang, term, more);
|
|
|
1316 |
LOGDEB1(("Db::stemExpand: Got %d from %s\n",
|
|
|
1317 |
more.size(), it->c_str()));
|
|
|
1318 |
result.insert(result.end(), more.begin(), more.end());
|
|
|
1319 |
}
|
|
|
1320 |
LOGDEB1(("Db:::stemExpand: final count %d \n", result.size()));
|
|
|
1321 |
return true;
|
|
|
1322 |
}
|
|
|
1323 |
|
1288 |
// Characters that can begin a wildcard or regexp expression. termMatch()
// takes the part of the input string that precedes the first of these
// characters as a literal prefix, and uses skip_to() to begin the allterms
// walk at the terms starting with that prefix.
// NOTE(review): regSpecChars does not list the other regexp metacharacters
// ('*', '+', '?', '|', '^', '$', '\'). A pattern using one of them before
// any listed character keeps it inside the "literal" prefix - confirm
// whether this is intended.
const string wildSpecChars = "*?[";
const string regSpecChars = "(.[{";
|
1293 |
|
1329 |
|
1294 |
// Find all index terms that match a wildcard or regular expression
|
1330 |
// Find all index terms that match a wildcard or regular expression
|
1295 |
bool Db::termMatch(MatchType typ, const string &root, list<string>& res,
|
1331 |
bool Db::termMatch(MatchType typ, const string &lang,
|
1296 |
const string &lang, int max)
|
1332 |
const string &root,
|
|
|
1333 |
list<TermMatchEntry>& res,
|
|
|
1334 |
int max)
|
1297 |
{
|
1335 |
{
|
1298 |
if (!m_ndb || !m_ndb->m_isopen)
|
1336 |
if (!m_ndb || !m_ndb->m_isopen)
|
1299 |
return false;
|
1337 |
return false;
|
|
|
1338 |
|
1300 |
Xapian::Database db = m_ndb->m_iswritable ? m_ndb->wdb: m_ndb->db;
|
1339 |
Xapian::Database db = m_ndb->m_iswritable ? m_ndb->wdb: m_ndb->db;
|
|
|
1340 |
|
1301 |
res.clear();
|
1341 |
res.clear();
|
|
|
1342 |
|
1302 |
// Get rid of capitals and accents
|
1343 |
// Get rid of capitals and accents
|
1303 |
string droot;
|
1344 |
string droot;
|
1304 |
dumb_string(root, droot);
|
1345 |
dumb_string(root, droot);
|
1305 |
string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
|
1346 |
string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
|
1306 |
|
1347 |
|
|
|
1348 |
if (typ == ET_STEM) {
|
|
|
1349 |
if (!stemExpand(lang, root, res, max))
|
|
|
1350 |
return false;
|
|
|
1351 |
for (list<TermMatchEntry>::iterator it = res.begin();
|
|
|
1352 |
it != res.end(); it++) {
|
|
|
1353 |
it->wcf = db.get_collection_freq(it->term);
|
|
|
1354 |
LOGDEB(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
|
|
|
1355 |
}
|
|
|
1356 |
} else {
|
1307 |
regex_t reg;
|
1357 |
regex_t reg;
|
1308 |
int errcode;
|
1358 |
int errcode;
|
|
|
1359 |
if (typ == ET_REGEXP) {
|
1309 |
// Compile regexp. We anchor the input by enclosing it in ^ and $
|
1360 |
// Compile regexp. We anchor the input by enclosing it in ^ and $
|
1310 |
if (typ == ET_REGEXP) {
|
|
|
1311 |
string mroot = droot;
|
1361 |
string mroot = droot;
|
1312 |
if (mroot.at(0) != '^')
|
1362 |
if (mroot.at(0) != '^')
|
1313 |
mroot = string("^") + mroot;
|
1363 |
mroot = string("^") + mroot;
|
1314 |
if (mroot.at(mroot.length()-1) != '$')
|
1364 |
if (mroot.at(mroot.length()-1) != '$')
|
1315 |
mroot += "$";
|
1365 |
mroot += "$";
|
1316 |
if ((errcode = regcomp(®, mroot.c_str(), REG_EXTENDED|REG_NOSUB))) {
|
1366 |
if ((errcode = regcomp(®, mroot.c_str(),
|
|
|
1367 |
REG_EXTENDED|REG_NOSUB))) {
|
1317 |
char errbuf[200];
|
1368 |
char errbuf[200];
|
1318 |
regerror(errcode, ®, errbuf, 199);
|
1369 |
regerror(errcode, ®, errbuf, 199);
|
1319 |
LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
|
1370 |
LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
|
1320 |
res.push_back(errbuf);
|
1371 |
res.push_back(string(errbuf));
|
|
|
1372 |
regfree(®);
|
|
|
1373 |
return false;
|
|
|
1374 |
}
|
|
|
1375 |
}
|
|
|
1376 |
|
|
|
1377 |
// Find the initial section before any special char
|
|
|
1378 |
string::size_type es = droot.find_first_of(nochars);
|
|
|
1379 |
string is;
|
|
|
1380 |
switch (es) {
|
|
|
1381 |
case string::npos: is = droot;break;
|
|
|
1382 |
case 0: break;
|
|
|
1383 |
default: is = droot.substr(0, es);break;
|
|
|
1384 |
}
|
|
|
1385 |
LOGDEB(("termMatch: initsec: [%s]\n", is.c_str()));
|
|
|
1386 |
|
|
|
1387 |
Xapian::TermIterator it = db.allterms_begin();
|
|
|
1388 |
if (!is.empty())
|
|
|
1389 |
it.skip_to(is.c_str());
|
|
|
1390 |
for (int n = 0;it != db.allterms_end(); it++) {
|
|
|
1391 |
// If we're beyond the terms matching the initial string, end
|
|
|
1392 |
if (!is.empty() && (*it).find(is) != 0)
|
|
|
1393 |
break;
|
|
|
1394 |
// Don't match special internal terms beginning with uppercase ascii
|
|
|
1395 |
if ((*it).at(0) >= 'A' && (*it).at(0) <= 'Z')
|
|
|
1396 |
continue;
|
|
|
1397 |
if (typ == ET_WILD) {
|
|
|
1398 |
if (fnmatch(droot.c_str(), (*it).c_str(), 0) == FNM_NOMATCH)
|
|
|
1399 |
continue;
|
|
|
1400 |
} else {
|
|
|
1401 |
if (regexec(®, (*it).c_str(), 0, 0, 0))
|
|
|
1402 |
continue;
|
|
|
1403 |
}
|
|
|
1404 |
// Do we want stem expansion here? We don't do it for now
|
|
|
1405 |
res.push_back(TermMatchEntry(*it, it.get_termfreq()));
|
|
|
1406 |
++n;
|
|
|
1407 |
}
|
|
|
1408 |
if (typ == ET_REGEXP) {
|
1321 |
regfree(®);
|
1409 |
regfree(®);
|
1322 |
return false;
|
|
|
1323 |
}
|
1410 |
}
|
1324 |
}
|
|
|
1325 |
|
1411 |
|
1326 |
// Find the initial section before any special char
|
|
|
1327 |
string::size_type es = droot.find_first_of(nochars);
|
|
|
1328 |
string is;
|
|
|
1329 |
switch (es) {
|
|
|
1330 |
case string::npos: is = droot;break;
|
|
|
1331 |
case 0: break;
|
|
|
1332 |
default: is = droot.substr(0, es);break;
|
|
|
1333 |
}
|
|
|
1334 |
LOGDEB(("termMatch: initsec: [%s]\n", is.c_str()));
|
|
|
1335 |
|
|
|
1336 |
Xapian::TermIterator it = db.allterms_begin();
|
|
|
1337 |
if (!is.empty())
|
|
|
1338 |
it.skip_to(is.c_str());
|
|
|
1339 |
for (int n = 0;it != db.allterms_end(); it++) {
|
|
|
1340 |
// If we're beyond the terms matching the initial string, end
|
|
|
1341 |
if (!is.empty() && (*it).find(is) != 0)
|
|
|
1342 |
break;
|
|
|
1343 |
// Don't match special internal terms beginning with uppercase ascii
|
|
|
1344 |
if ((*it).at(0) >= 'A' && (*it).at(0) <= 'Z')
|
|
|
1345 |
continue;
|
|
|
1346 |
if (typ == ET_WILD) {
|
|
|
1347 |
if (fnmatch(droot.c_str(), (*it).c_str(), 0) == FNM_NOMATCH)
|
|
|
1348 |
continue;
|
|
|
1349 |
} else {
|
|
|
1350 |
if (regexec(®, (*it).c_str(), 0, 0, 0))
|
|
|
1351 |
continue;
|
|
|
1352 |
}
|
|
|
1353 |
// Do we want stem expansion here? We don't do it for now
|
|
|
1354 |
if (1 || lang.empty()) {
|
|
|
1355 |
res.push_back(*it);
|
|
|
1356 |
++n;
|
|
|
1357 |
} else {
|
|
|
1358 |
list<string> stemexps = stemExpand(lang, *it);
|
|
|
1359 |
unsigned int cnt =
|
|
|
1360 |
(int)stemexps.size() > max - n ? max - n : stemexps.size();
|
|
|
1361 |
list<string>::iterator sit = stemexps.begin();
|
|
|
1362 |
while (cnt--) {
|
|
|
1363 |
res.push_back(*sit++);
|
|
|
1364 |
n++;
|
|
|
1365 |
}
|
1412 |
}
|
1366 |
}
|
1413 |
|
1367 |
if (n >= max)
|
1414 |
TermMatchCmpByTerm tcmp;
|
1368 |
break;
|
|
|
1369 |
}
|
|
|
1370 |
res.sort();
|
1415 |
res.sort(tcmp);
|
|
|
1416 |
TermMatchTermEqual teq;
|
1371 |
res.unique();
|
1417 |
res.unique(teq);
|
1372 |
if (typ == ET_REGEXP) {
|
1418 |
TermMatchCmpByWcf wcmp;
|
1373 |
regfree(®);
|
1419 |
res.sort(wcmp);
|
|
|
1420 |
if (max > 0) {
|
|
|
1421 |
res.resize(MIN(res.size(), (unsigned int)max));
|
1374 |
}
|
1422 |
}
|
1375 |
return true;
|
1423 |
return true;
|
1376 |
}
|
1424 |
}
|
1377 |
|
1425 |
|
1378 |
/** Term list walking. */
|
1426 |
/** Term list walking. */
|
|
... |
|
... |
1415 |
if (!db.term_exists(word))
|
1463 |
if (!db.term_exists(word))
|
1416 |
return false;
|
1464 |
return false;
|
1417 |
return true;
|
1465 |
return true;
|
1418 |
}
|
1466 |
}
|
1419 |
|
1467 |
|
1420 |
/**
 * Return the stem expansions of a term, gathered from every configured
 * index.
 *
 * The main index directory (m_basedir) is queried first, then each entry of
 * m_extraDbs. The merged list is sorted and stripped of duplicates before
 * being returned.
 *
 * @param lang stemming language, handed through to StemDb::stemExpand()
 * @param term the term to expand
 * @return sorted list of distinct expansions
 */
list<string> Db::stemExpand(const string& lang, const string& term)
{
    list<string> dirs = m_extraDbs;
    dirs.push_front(m_basedir);

    list<string> expansions;
    for (list<string>::iterator it = dirs.begin(); it != dirs.end(); ++it) {
        list<string> more = StemDb::stemExpand(*it, lang, term);
        // size() is a size_t: cast so that the argument matches %d
        LOGDEB1(("Db::stemExpand: Got %d from %s\n",
                 (int)more.size(), it->c_str()));
        // splice() moves the nodes over, nothing is copied
        expansions.splice(expansions.end(), more);
    }
    expansions.sort();
    expansions.unique();
    LOGDEB1(("Db:::stemExpand: final count %d \n", (int)expansions.size()));
    return expansions;
}
|
|
|
1437 |
|
1468 |
|
1438 |
bool Db::stemDiffers(const string& lang, const string& word,
|
1469 |
bool Db::stemDiffers(const string& lang, const string& word,
|
1439 |
const string& base)
|
1470 |
const string& base)
|
1440 |
{
|
1471 |
{
|
1441 |
Xapian::Stem stemmer(lang);
|
1472 |
Xapian::Stem stemmer(lang);
|