Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.100 2006-12-07 13:24:19 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.101 2006-12-19 12:11:21 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
1283
    sdata->setDescription(d);
1283
    sdata->setDescription(d);
1284
    LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
1284
    LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
1285
    return true;
1285
    return true;
1286
}
1286
}
1287
1287
1288
class TermMatchCmpByWcf {
1289
public:
1290
    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
1291
  return r.wcf - l.wcf < 0;
1292
    }
1293
};
1294
class TermMatchCmpByTerm {
1295
public:
1296
    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
1297
  return l.term.compare(r.term) > 0;
1298
    }
1299
};
1300
class TermMatchTermEqual {
1301
public:
1302
    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
1303
  return !l.term.compare(r.term);
1304
    }
1305
};
1306
1307
bool Db::stemExpand(const string &lang, const string &term, 
1308
          list<TermMatchEntry>& result, int max)
1309
{
1310
    list<string> dirs = m_extraDbs;
1311
    dirs.push_front(m_basedir);
1312
    for (list<string>::iterator it = dirs.begin();
1313
   it != dirs.end(); it++) {
1314
  list<string> more;
1315
  StemDb::stemExpand(*it, lang, term, more);
1316
  LOGDEB1(("Db::stemExpand: Got %d from %s\n", 
1317
       more.size(), it->c_str()));
1318
  result.insert(result.end(), more.begin(), more.end());
1319
    }
1320
    LOGDEB1(("Db:::stemExpand: final count %d \n", result.size()));
1321
    return true;
1322
}
1323
1288
// Characters that can begin a wildcard or regexp expression. We use skipto
1324
// Characters that can begin a wildcard or regexp expression. We use skipto
1289
// to begin the allterms search with terms that begin with the portion of
1325
// to begin the allterms search with terms that begin with the portion of
1290
// the input string prior to these chars.
1326
// the input string prior to these chars.
1291
const string wildSpecChars = "*?[";
1327
const string wildSpecChars = "*?[";
1292
const string regSpecChars = "(.[{";
1328
const string regSpecChars = "(.[{";
1293
1329
1294
// Find all index terms that match a wildcard or regular expression
1330
// Find all index terms that match a wildcard or regular expression
1295
bool Db::termMatch(MatchType typ, const string &root, list<string>& res,
1331
bool Db::termMatch(MatchType typ, const string &lang,
1296
           const string &lang, int max)
1332
         const string &root, 
1333
         list<TermMatchEntry>& res,
1334
         int max)
1297
{
1335
{
1298
    if (!m_ndb || !m_ndb->m_isopen)
1336
    if (!m_ndb || !m_ndb->m_isopen)
1299
    return false;
1337
    return false;
1338
1300
    Xapian::Database db = m_ndb->m_iswritable ? m_ndb->wdb: m_ndb->db;
1339
    Xapian::Database db = m_ndb->m_iswritable ? m_ndb->wdb: m_ndb->db;
1340
1301
    res.clear();
1341
    res.clear();
1342
1302
    // Get rid of capitals and accents
1343
    // Get rid of capitals and accents
1303
    string droot;
1344
    string droot;
1304
    dumb_string(root, droot);
1345
    dumb_string(root, droot);
1305
    string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
1346
    string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
1306
1347
1348
    if (typ == ET_STEM) {
1349
  if (!stemExpand(lang, root, res, max))
1350
      return false;
1351
  for (list<TermMatchEntry>::iterator it = res.begin(); 
1352
       it != res.end(); it++) {
1353
      it->wcf = db.get_collection_freq(it->term);
1354
      LOGDEB(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
1355
  }
1356
    } else {
1307
    regex_t reg;
1357
  regex_t reg;
1308
    int errcode;
1358
  int errcode;
1359
  if (typ == ET_REGEXP) {
1309
    // Compile regexp. We anchor the input by enclosing it in ^ and $
1360
      // Compile regexp. We anchor the input by enclosing it in ^ and $
1310
    if (typ == ET_REGEXP) {
1311
    string mroot = droot;
1361
        string mroot = droot;
1312
    if (mroot.at(0) != '^')
1362
        if (mroot.at(0) != '^')
1313
        mroot = string("^") + mroot;
1363
      mroot = string("^") + mroot;
1314
    if (mroot.at(mroot.length()-1) != '$')
1364
        if (mroot.at(mroot.length()-1) != '$')
1315
        mroot += "$";
1365
      mroot += "$";
1316
  if ((errcode = regcomp(&reg, mroot.c_str(), REG_EXTENDED|REG_NOSUB))) {
1366
      if ((errcode = regcomp(&reg, mroot.c_str(), 
1367
                 REG_EXTENDED|REG_NOSUB))) {
1317
        char errbuf[200];
1368
      char errbuf[200];
1318
        regerror(errcode, &reg, errbuf, 199);
1369
      regerror(errcode, &reg, errbuf, 199);
1319
        LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
1370
      LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
1320
        res.push_back(errbuf);
1371
      res.push_back(string(errbuf));
1372
      regfree(&reg);
1373
      return false;
1374
      }
1375
  }
1376
1377
  // Find the initial section before any special char
1378
  string::size_type es = droot.find_first_of(nochars);
1379
  string is;
1380
  switch (es) {
1381
  case string::npos: is = droot;break;
1382
  case 0: break;
1383
  default: is = droot.substr(0, es);break;
1384
  }
1385
  LOGDEB(("termMatch: initsec: [%s]\n", is.c_str()));
1386
1387
  Xapian::TermIterator it = db.allterms_begin(); 
1388
  if (!is.empty())
1389
      it.skip_to(is.c_str());
1390
  for (int n = 0;it != db.allterms_end(); it++) {
1391
      // If we're beyond the terms matching the initial string, end
1392
      if (!is.empty() && (*it).find(is) != 0)
1393
      break;
1394
      // Don't match special internal terms beginning with uppercase ascii
1395
      if ((*it).at(0) >= 'A' && (*it).at(0) <= 'Z')
1396
      continue;
1397
      if (typ == ET_WILD) {
1398
      if (fnmatch(droot.c_str(), (*it).c_str(), 0) == FNM_NOMATCH)
1399
          continue;
1400
      } else {
1401
      if (regexec(&reg, (*it).c_str(), 0, 0, 0))
1402
          continue;
1403
      }
1404
      // Do we want stem expansion here? We don't do it for now
1405
      res.push_back(TermMatchEntry(*it, it.get_termfreq()));
1406
      ++n;
1407
  }
1408
  if (typ == ET_REGEXP) {
1321
        regfree(&reg);
1409
        regfree(&reg);
1322
      return false;
1323
    }
1410
    }
1324
    }
1325
1411
1326
    // Find the initial section before any special char
1327
    string::size_type es = droot.find_first_of(nochars);
1328
    string is;
1329
    switch (es) {
1330
    case string::npos: is = droot;break;
1331
    case 0: break;
1332
    default: is = droot.substr(0, es);break;
1333
    }
1334
    LOGDEB(("termMatch: initsec: [%s]\n", is.c_str()));
1335
1336
    Xapian::TermIterator it = db.allterms_begin(); 
1337
    if (!is.empty())
1338
  it.skip_to(is.c_str());
1339
    for (int n = 0;it != db.allterms_end(); it++) {
1340
        // If we're beyond the terms matching the initial string, end
1341
  if (!is.empty() && (*it).find(is) != 0)
1342
      break;
1343
  // Don't match special internal terms beginning with uppercase ascii
1344
  if ((*it).at(0) >= 'A' && (*it).at(0) <= 'Z')
1345
      continue;
1346
  if (typ == ET_WILD) {
1347
      if (fnmatch(droot.c_str(), (*it).c_str(), 0) == FNM_NOMATCH)
1348
      continue;
1349
  } else {
1350
      if (regexec(&reg, (*it).c_str(), 0, 0, 0))
1351
      continue;
1352
  }
1353
  // Do we want stem expansion here? We don't do it for now
1354
  if (1 || lang.empty()) {
1355
      res.push_back(*it);
1356
      ++n;
1357
  } else {
1358
      list<string> stemexps = stemExpand(lang, *it);
1359
      unsigned int cnt = 
1360
      (int)stemexps.size() > max - n ? max - n : stemexps.size();
1361
      list<string>::iterator sit = stemexps.begin();
1362
      while (cnt--) {
1363
      res.push_back(*sit++);
1364
      n++;
1365
      }
1412
    }
1366
  }
1413
1367
  if (n >= max)
1414
    TermMatchCmpByTerm tcmp;
1368
      break;
1369
    }
1370
    res.sort();
1415
    res.sort(tcmp);
1416
    TermMatchTermEqual teq;
1371
    res.unique();
1417
    res.unique(teq);
1372
    if (typ == ET_REGEXP) {
1418
    TermMatchCmpByWcf wcmp;
1373
  regfree(&reg);
1419
    res.sort(wcmp);
1420
    if (max > 0) {
1421
  res.resize(MIN(res.size(), (unsigned int)max));
1374
    }
1422
    }
1375
    return true;
1423
    return true;
1376
}
1424
}
1377
1425
1378
/** Term list walking. */
1426
/** Term list walking. */
...
...
1415
    if (!db.term_exists(word))
1463
    if (!db.term_exists(word))
1416
    return false;
1464
    return false;
1417
    return true;
1465
    return true;
1418
}
1466
}
1419
1467
1420
/**
 * Return the stem expansions of a term gathered from every configured
 * location.
 *
 * StemDb::stemExpand() is called once for m_basedir and once for each entry
 * of m_extraDbs (in that order). The lists it returns are merged, sorted,
 * and stripped of duplicates.
 *
 * @param lang language name, passed through to StemDb::stemExpand()
 * @param term term to expand, passed through to StemDb::stemExpand()
 * @return sorted list of distinct expansions
 */
list<string> Db::stemExpand(const string& lang, const string& term)
{
    list<string> dirs = m_extraDbs;
    dirs.push_front(m_basedir);

    list<string> exp;
    for (list<string>::const_iterator dir = dirs.begin();
         dir != dirs.end(); ++dir) {
        list<string> more = StemDb::stemExpand(*dir, lang, term);
        // size() is not an int: convert explicitly to match the %d format
        LOGDEB1(("Db::stemExpand: Got %d from %s\n",
                 static_cast<int>(more.size()), dir->c_str()));
        // splice() moves the nodes over, leaving more empty
        exp.splice(exp.end(), more);
    }

    exp.sort();
    exp.unique();
    LOGDEB1(("Db::stemExpand: final count %d \n",
             static_cast<int>(exp.size())));
    return exp;
}
1437
1468
1438
bool Db::stemDiffers(const string& lang, const string& word, 
1469
bool Db::stemDiffers(const string& lang, const string& word, 
1439
             const string& base)
1470
             const string& base)
1440
{
1471
{
1441
    Xapian::Stem stemmer(lang);
1472
    Xapian::Stem stemmer(lang);