recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [c5463d] .. [be05ea]

Switch to unified view

-a/src/rcldb/rcldb.cpp
+b/src/rcldb/rcldb.cpp
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.100 2006-12-07 13:24:19 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.101 2006-12-19 12:11:21 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation; either version 2 of the License, or
 ...
     sdata->setDescription(d);
     LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
     return true;
+}
+// Ordering predicate for list<TermMatchEntry>::sort(): an entry goes first
+// when its wcf is the larger one, i.e. the list ends up in decreasing wcf
+// order. wcf is the frequency value that termMatch() stores in each entry.
+class TermMatchCmpByWcf {
+public:
+    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
+        // Compare the two values directly rather than testing the sign of
+        // r.wcf - l.wcf: the subtraction can overflow for signed values and
+        // can never be negative if wcf is an unsigned count.
+        return l.wcf > r.wcf;
+    }
+};
+// Ordering predicate on the term text: an entry goes first when its term
+// compares greater than the other one, so sorting with it yields reverse
+// string order and groups identical terms together.
+class TermMatchCmpByTerm {
+public:
+    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
+        const bool leftIsGreater = r.term < l.term;
+        return leftIsGreater;
+    }
+};
+// Equality predicate on the term text only (wcf is ignored). Returns 1 when
+// both entries hold the same term, 0 otherwise.
+class TermMatchTermEqual {
+public:
+    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
+        return l.term == r.term ? 1 : 0;
+    }
+};
+// Stem-expand 'term' for language 'lang' against the main index directory
+// (m_basedir) first, then against each directory in m_extraDbs, and append
+// everything that StemDb::stemExpand() produced to 'result'.
+//
+// 'result' is not cleared and the appended entries are neither sorted nor
+// deduplicated here. 'max' is currently unused by this function.
+// Always returns true.
+bool Db::stemExpand(const string &lang, const string &term,
+                    list<TermMatchEntry>& result, int max)
+{
+    list<string> dirs = m_extraDbs;
+    dirs.push_front(m_basedir);
+    for (list<string>::iterator it = dirs.begin();
+         it != dirs.end(); ++it) {
+        list<string> more;
+        StemDb::stemExpand(*it, lang, term, more);
+        // size() returns an unsigned size type: cast it so that the value
+        // passed matches the %d conversion.
+        LOGDEB1(("Db::stemExpand: Got %d from %s\n",
+                 (int)more.size(), it->c_str()));
+        // Each string is converted to a TermMatchEntry on insertion
+        result.insert(result.end(), more.begin(), more.end());
+    }
+    LOGDEB1(("Db:::stemExpand: final count %d \n", (int)result.size()));
+    return true;
+}
 // Characters that can begin a wildcard or regexp expression. We use
 // skip_to() to begin the allterms search at the terms that begin with the
 // portion of the input string prior to these chars.
 const string wildSpecChars = "*?[";
 const string regSpecChars = "(.[{";
 // Find all index terms that match the input, and return them in 'res'
 // (which is cleared first). What "match" means depends on 'typ':
 //  - ET_STEM: 'root' is stem-expanded for language 'lang' through
 //    stemExpand(), then the wcf of each entry is set from
 //    db.get_collection_freq().
 //  - ET_WILD: the index term list is walked and the terms matching the
 //    unaccented/lowercased root are kept (fnmatch()).
 //  - ET_REGEXP: same walk, but terms are matched with regexec() against the
 //    root, anchored with ^ and $ if it is not already. If the expression
 //    does not compile, the error text is pushed to 'res' and false is
 //    returned.
 // For the two walking modes, wcf is set from it.get_termfreq().
 // The result is then sorted by term, deduplicated on the term, re-sorted
 // with TermMatchCmpByWcf and, if max > 0, truncated to at most 'max' entries.
 // Returns false if the index is not open or if the expansion/compilation
 // step failed, else true.
-bool Db::termMatch(MatchType typ, const string &root, list<string>& res,
+bool Db::termMatch(MatchType typ, const string &lang,
-           const string &lang, int max)
+         const string &root,
+         list<TermMatchEntry>& res,
+         int max)
+{
     if (!m_ndb || !m_ndb->m_isopen)
     return false;
     Xapian::Database db = m_ndb->m_iswritable ? m_ndb->wdb: m_ndb->db;
     res.clear();
     // Get rid of capitals and accents
     string droot;
     dumb_string(root, droot);
     string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
 // Stem expansion works from 'root' as received, not from droot
+    if (typ == ET_STEM) {
+  if (!stemExpand(lang, root, res, max))
+      return false;
+  for (list<TermMatchEntry>::iterator it = res.begin();
+       it != res.end(); it++) {
+      it->wcf = db.get_collection_freq(it->term);
+      LOGDEB(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
+  }
+    } else {
-    regex_t reg;
+  regex_t reg;
-    int errcode;
+  int errcode;
 // NOTE(review): 'reg' is only compiled when typ == ET_REGEXP, but the walk
 // below calls regexec() for any typ other than ET_WILD -- confirm that no
 // other MatchType value can reach this branch.
+  if (typ == ET_REGEXP) {
-    // Compile regexp. We anchor the input by enclosing it in ^ and $
+      // Compile regexp. We anchor the input by enclosing it in ^ and $
-    if (typ == ET_REGEXP) {
-    string mroot = droot;
+        string mroot = droot;
 // NOTE(review): at(0) throws std::out_of_range if droot is empty -- confirm
 // that callers never pass an empty expression.
-    if (mroot.at(0) != '^')
+        if (mroot.at(0) != '^')
-        mroot = string("^") + mroot;
+      mroot = string("^") + mroot;
-    if (mroot.at(mroot.length()-1) != '$')
+        if (mroot.at(mroot.length()-1) != '$')
-        mroot += "$";
+      mroot += "$";
-  if ((errcode = regcomp(&reg, mroot.c_str(), REG_EXTENDED|REG_NOSUB))) {
+      if ((errcode = regcomp(&reg, mroot.c_str(),
+                 REG_EXTENDED|REG_NOSUB))) {
-        char errbuf[200];
+      char errbuf[200];
-        regerror(errcode, &reg, errbuf, 199);
+      regerror(errcode, &reg, errbuf, 199);
-        LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
+      LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
-        res.push_back(errbuf);
+      res.push_back(string(errbuf));
+      regfree(&reg);
+      return false;
+      }
+  }
+  // Find the initial section before any special char
+  string::size_type es = droot.find_first_of(nochars);
+  string is;
+  switch (es) {
+  case string::npos: is = droot;break;
+  case 0: break;
+  default: is = droot.substr(0, es);break;
+  }
+  LOGDEB(("termMatch: initsec: [%s]\n", is.c_str()));
+  Xapian::TermIterator it = db.allterms_begin();
+  if (!is.empty())
+      it.skip_to(is.c_str());
 // 'n' counts the matches; in the new revision nothing reads it any more,
 // the 'max' limit being applied after the final sort.
+  for (int n = 0;it != db.allterms_end(); it++) {
+      // If we're beyond the terms matching the initial string, end
+      if (!is.empty() && (*it).find(is) != 0)
+      break;
+      // Don't match special internal terms beginning with uppercase ascii
+      if ((*it).at(0) >= 'A' && (*it).at(0) <= 'Z')
+      continue;
+      if (typ == ET_WILD) {
+      if (fnmatch(droot.c_str(), (*it).c_str(), 0) == FNM_NOMATCH)
+          continue;
+      } else {
+      if (regexec(&reg, (*it).c_str(), 0, 0, 0))
+          continue;
+      }
+      // Do we want stem expansion here? We don't do it for now
+      res.push_back(TermMatchEntry(*it, it.get_termfreq()));
+      ++n;
+  }
+  if (typ == ET_REGEXP) {
         regfree(&reg);
-      return false;
+    }
-    // Find the initial section before any special char
-    string::size_type es = droot.find_first_of(nochars);
-    string is;
-    switch (es) {
-    case string::npos: is = droot;break;
-    case 0: break;
-    default: is = droot.substr(0, es);break;
-    LOGDEB(("termMatch: initsec: [%s]\n", is.c_str()));
-    Xapian::TermIterator it = db.allterms_begin();
-    if (!is.empty())
-  it.skip_to(is.c_str());
-    for (int n = 0;it != db.allterms_end(); it++) {
-        // If we're beyond the terms matching the initial string, end
-  if (!is.empty() && (*it).find(is) != 0)
-      break;
-  // Don't match special internal terms beginning with uppercase ascii
-  if ((*it).at(0) >= 'A' && (*it).at(0) <= 'Z')
-      continue;
-  if (typ == ET_WILD) {
-      if (fnmatch(droot.c_str(), (*it).c_str(), 0) == FNM_NOMATCH)
-      continue;
-  } else {
-      if (regexec(&reg, (*it).c_str(), 0, 0, 0))
-      continue;
-  // Do we want stem expansion here? We don't do it for now
-  if (1 || lang.empty()) {
-      res.push_back(*it);
-      ++n;
-  } else {
-      list<string> stemexps = stemExpand(lang, *it);
-      unsigned int cnt =
-      (int)stemexps.size() > max - n ? max - n : stemexps.size();
-      list<string>::iterator sit = stemexps.begin();
-      while (cnt--) {
-      res.push_back(*sit++);
-      n++;
+    }
 // Sort by term first so that unique() sees identical terms as neighbours,
 // then order the survivors with TermMatchCmpByWcf before truncating.
-  if (n >= max)
+    TermMatchCmpByTerm tcmp;
-      break;
-    res.sort();
+    res.sort(tcmp);
+    TermMatchTermEqual teq;
-    res.unique();
+    res.unique(teq);
-    if (typ == ET_REGEXP) {
+    TermMatchCmpByWcf wcmp;
-  regfree(&reg);
+    res.sort(wcmp);
+    if (max > 0) {
+  res.resize(MIN(res.size(), (unsigned int)max));
+    }
     return true;
+}
 /** Term list walking. */
 ...
     if (!db.term_exists(word))
     return false;
     return true;
+}
-list<string> Db::stemExpand(const string& lang, const string& term)
-    list<string> dirs = m_extraDbs;
-    dirs.push_front(m_basedir);
-    list<string> exp;
-    for (list<string>::iterator it = dirs.begin();
-   it != dirs.end(); it++) {
-  list<string> more = StemDb::stemExpand(*it, lang, term);
-  LOGDEB1(("Db::stemExpand: Got %d from %s\n",
-       more.size(), it->c_str()));
-  exp.splice(exp.end(), more);
-    exp.sort();
-    exp.unique();
-    LOGDEB1(("Db:::stemExpand: final count %d \n", exp.size()));
-    return exp;
 bool Db::stemDiffers(const string& lang, const string& word,
              const string& base)
+{
     Xapian::Stem stemmer(lang);