--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
#ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.23 2005-02-08 14:45:54 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.24 2005-02-10 15:21:12 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@@ -16,8 +16,11 @@
#include "unacpp.h"
#include "conftree.h"
#include "debuglog.h"
+#include "pathut.h"
+#include "smallut.h"
#include "xapian.h"
+#include <xapian/stem.h>
// Data for a xapian database. There could actually be 2 different
// ones for indexing or query as there is not much in common.
@@ -25,6 +28,8 @@
public:
bool isopen;
bool iswritable;
+ string basedir;
+
// Indexing
Xapian::WritableDatabase wdb;
vector<bool> updated;
@@ -102,9 +107,6 @@
ndb->iswritable = true;
break;
case DbTrunc:
- ndb->wdb =
- Xapian::WritableDatabase(dir, Xapian::DB_CREATE_OR_OVERWRITE);
- ndb->iswritable = true;
break;
case DbRO:
default:
@@ -113,6 +115,7 @@
break;
}
ndb->isopen = true;
+ ndb->basedir = dir;
return true;
} catch (const Xapian::Error &e) {
ermsg = e.get_msg();
@@ -399,9 +402,160 @@
return true;
}
+/// Stem db directory for lang: path_cat() of basename and "stem_<lang>"
+static string stemdbname(const string& basename, string lang)
+{
+    string stemdir(basename);
+    path_cat(stemdir, lang.insert(0, "stem_"));
+    return stemdir;
+}
+
+// True for 7-bit chars outside 'a'..'z'; false for a-z and values >= 128
+inline static bool
+p_notlowerorutf(unsigned int c)
+{
+    const bool belowlower = c < 'a';
+    const bool asciiabovelower = c > 'z' && c < 128;
+    return belowlower || asciiabovelower;
+}
+
+/**
+ * Create database of stem to parents associations for a given language.
+ * We walk the list of all terms, stem them, and create another Xapian db
+ * with documents indexed by a single term (the stem), and with the list of
+ * parent terms in the document data.
+ * Only stems which have at least two parent terms get a document, as a
+ * stem with a single parent brings nothing to query expansion.
+ *
+ * @param lang stemming language name, passed to Xapian::Stem and used to
+ *        name the stem database directory (see stemdbname()).
+ * @return true if the stem database was built, false if the main index is
+ *         not open for writing or if any step failed (details are logged).
+ */
+bool Rcl::Db::createStemDb(const string& lang)
+{
+    LOGDEB(("Rcl::Db::createStemDb(%s)\n", lang.c_str()));
+    if (pdata == 0)
+        return false;
+    Native *ndb = (Native *)pdata;
+    if (ndb->isopen == false || ndb->iswritable == false)
+        return false;
+
+    // First build the in-memory stem database:
+    // We walk the list of all terms, and stem each.
+    // If the stem is identical to the term, no need to create an entry
+    // Else, we add an entry to the multimap.
+    // At the end, we only save stem-terms associations with several terms, the
+    // others are not useful
+    multimap<string, string> assocs;
+    // Statistics
+    int nostem = 0;       // Not even tried: has a non-lowercase 7bit char
+    int stemconst = 0;    // Stem == term
+    int stemdiff = 0;     // Count of all different stems
+    int stemmultiple = 0; // Count of stems with multiple derivatives
+    try {
+        // Throws if lang is not a language known to Xapian
+        Xapian::Stem stemmer(lang);
+        for (Xapian::TermIterator it = ndb->wdb.allterms_begin();
+             it != ndb->wdb.allterms_end(); it++) {
+            // The iterator yields the term by value: keep a named copy so
+            // that the string iterators used below stay valid.
+            const string term = *it;
+            // If it has any non-lowercase 7bit char, cant be stemmable
+            if (find_if(term.begin(), term.end(), p_notlowerorutf) !=
+                term.end()) {
+                ++nostem;
+                continue;
+            }
+            string stem = stemmer.stem_word(term);
+            if (stem == term) {
+                ++stemconst;
+                continue;
+            }
+            assocs.insert(pair<string,string>(stem, term));
+        }
+    } catch (...) {
+        LOGERR(("Stem database build failed: no stemmer for %s ? \n",
+                lang.c_str()));
+        return false;
+    }
+
+    // Create xapian database for stem relations
+    string stemdbdir = stemdbname(ndb->basedir, lang);
+    string ermsg = "NOERROR";
+    Xapian::WritableDatabase sdb;
+    try {
+        sdb = Xapian::WritableDatabase(stemdbdir,
+                                       Xapian::DB_CREATE_OR_OVERWRITE);
+    } catch (const Xapian::Error &e) {
+        ermsg = e.get_msg();
+    } catch (const string &s) {
+        ermsg = s;
+    } catch (const char *s) {
+        ermsg = s;
+    } catch (...) {
+        ermsg = "Caught unknown exception";
+    }
+    if (ermsg != "NOERROR") {
+        LOGERR(("Rcl::Db::createstemdb: exception while opening '%s': %s\n",
+                stemdbdir.c_str(), ermsg.c_str()));
+        return false;
+    }
+
+    // Enter pseudo-docs in db. The multimap is sorted by stem, so each stem's
+    // derivatives form one contiguous run. Every run is handled inside the
+    // loop body, which guarantees that the last stem is not left out.
+    multimap<string,string>::const_iterator it = assocs.begin();
+    while (it != assocs.end()) {
+        const string& stem = it->first;
+        const multimap<string,string>::const_iterator runend =
+            assocs.upper_bound(stem);
+        ++stemdiff;
+        // The doc data is just parents=blank-separated-list
+        string record = "parents=";
+        int nderivs = 0;
+        for (; it != runend; ++it, ++nderivs) {
+            record += it->second + " ";
+        }
+        record += "\n";
+        // Only stems with several parent terms are useful for expansion
+        if (nderivs < 2)
+            continue;
+        ++stemmultiple;
+        LOGDEB1(("stemdocument data: %s\n", record.c_str()));
+        Xapian::Document newdocument;
+        newdocument.add_term(stem);
+        newdocument.set_data(record);
+        try {
+            // The stem is both the document's only term and its unique id
+            sdb.replace_document(stem, newdocument);
+        } catch (...) {
+            LOGERR(("Rcl::Db::createstemdb: replace failed\n"));
+            return false;
+        }
+    }
+    // size() is cast to int to match the %d conversion
+    LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n",
+            int(assocs.size()), stemdiff, stemmultiple, nostem, stemconst));
+    return true;
+}
+
+/**
+ * This is called at the end of an indexing session, to delete the
+ * documents for files that are no longer there. We also build the
+ * stem database while we are at it.
+ */
bool Rcl::Db::purge()
{
LOGDEB(("Rcl::Db::purge\n"));
+ if (pdata == 0)
+ return false;
+ Native *ndb = (Native *)pdata;
+ LOGDEB(("Rcl::Db::purge: isopen %d iswritable %d\n", ndb->isopen,
+ ndb->iswritable));
+ if (ndb->isopen == false || ndb->iswritable == false)
+ return false;
+
// There seems to be problems with the document delete code, when
// we do this, the database is not actually updated. Especially,
// if we delete a bunch of docs, so that there is a hole in the
@@ -409,15 +563,6 @@
// and does nothing). Maybe related to the exceptions below when
// trying to delete an unexistant document ?
// Flushing before trying the deletes seeems to work around the problem
-
- if (pdata == 0)
- return false;
- Native *ndb = (Native *)pdata;
- LOGDEB(("Rcl::Db::purge: isopen %d iswritable %d\n", ndb->isopen,
- ndb->iswritable));
- if (ndb->isopen == false || ndb->iswritable == false)
- return false;
-
ndb->wdb.flush();
for (Xapian::docid did = 1; did < ndb->updated.size(); ++did) {
if (!ndb->updated[did]) {
@@ -429,6 +574,7 @@
}
}
}
+ ndb->wdb.flush();
return true;
}
@@ -446,46 +592,57 @@
return s;
}
bool takeword(const std::string &term, int , int, int) {
- LOGDEB(("Takeword: %s\n", term.c_str()));
+ LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
terms.push_back(term);
return true;
}
};
-#include <xapian/stem.h>
-
-// Expand term to list of all terms which expand to the same term.
-// This is currently awfully inefficient as we actually stem the whole
-// db term list ! Need to build an efficient structure when finishing
-// indexing, but good enough for testing
+
+// Expand term to the list of terms sharing its stem, using the stem db.
static list<string> stemexpand(Native *ndb, string term, const string& lang)
{
list<string> explist;
try {
Xapian::Stem stemmer(lang);
string stem = stemmer.stem_word(term);
- LOGDEB(("stemexpand: term '%s' stem '%s'\n",
- term.c_str(), stem.c_str()));
- Xapian::TermIterator it;
- for (it = ndb->db.allterms_begin();
- it != ndb->db.allterms_end(); it++) {
- string stem1 = stemmer.stem_word(*it);
- if (stem == stem1)
- explist.push_back(*it);
- }
- if (explist.size() == 0)
+ LOGDEB(("stemexpand: '%s' -> '%s'\n", term.c_str(), stem.c_str()));
+ // Try to fetch the doc from the stem db
+ string stemdbdir = stemdbname(ndb->basedir, lang);
+ Xapian::Database sdb(stemdbdir);
+ LOGDEB1(("Rcl::Db::stemexpand: %s lastdocid: %d\n",
+ stemdbdir.c_str(), sdb.get_lastdocid()));
+ if (!sdb.term_exists(stem)) {
+ LOGDEB1(("Rcl::Db::stemexpand: no term for %s\n", stem.c_str()));
explist.push_back(term);
- if (1) {
- string expanded;
- for (list<string>::const_iterator it = explist.begin();
- it != explist.end(); it++) {
- expanded += *it + " ";
- }
- LOGDEB(("stemexpand: expanded list: %s\n", expanded.c_str()));
- }
- } catch (...) {
- LOGERR(("Stemming failed: no stemmer for %s ? \n", lang.c_str()));
+ return explist;
+ }
+ Xapian::PostingIterator did = sdb.postlist_begin(stem);
+ if (did == sdb.postlist_end(stem)) {
+ LOGDEB1(("Rcl::Db::stemexpand: no term(1) for %s\n",stem.c_str()));
+ explist.push_back(term);
+ return explist;
+ }
+ Xapian::Document doc = sdb.get_document(*did);
+ string data = doc.get_data();
+ // No need for a conftree, but we need to massage the data a little.
+ // Test for npos before stepping over the '=': npos + 1 would wrap to 0.
+ string::size_type pos = data.find_first_of("=");
+ string::size_type pos1 = data.find_last_of("\n");
+ if (pos == string::npos || pos1 == string::npos || pos1 <= pos + 1) {
+ explist.push_back(term);
+ return explist;
+ }
+ ConfTree::stringToStrings(data.substr(pos + 1, pos1 - pos - 1), explist);
+ if (find(explist.begin(), explist.end(), term) == explist.end()) {
+ explist.push_back(term);
+ }
+ LOGDEB(("Rcl::Db::stemexpand: %s -> %s\n", stem.c_str(),
+ stringlistdisp(explist).c_str()));
+ } catch (...) {
+ LOGERR(("stemexpand: error accessing stem db\n"));
explist.push_back(term);
+ return explist;
}
return explist;
}
@@ -519,7 +676,8 @@
wsQData splitData;
TextSplit splitter(&splitData, true);
splitter.text_to_words(*it);
- LOGDEB(("Splitter term count: %d\n", splitData.terms.size()));
+ LOGDEB1(("Rcl::Db::setquery: splitter term count: %d\n",
+ splitData.terms.size()));
switch(splitData.terms.size()) {
case 0: continue;// ??
case 1: {
@@ -578,7 +736,7 @@
bool Rcl::Db::getDoc(int i, Doc &doc, int *percent)
{
- LOGDEB(("Rcl::Db::getDoc: %d\n", i));
+ LOGDEB1(("Rcl::Db::getDoc: %d\n", i));
Native *ndb = (Native *)pdata;
if (!ndb || !ndb->enquire) {
LOGERR(("Rcl::Db::getDoc: no query opened\n"));