Switch to side-by-side view

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.132 2008-05-20 10:09:54 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.133 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -36,6 +36,7 @@
 
 #include "rclconfig.h"
 #include "rcldb.h"
+#include "rcldb_p.h"
 #include "stemdb.h"
 #include "textsplit.h"
 #include "transcode.h"
@@ -47,8 +48,9 @@
 #include "pathhash.h"
 #include "utf8iter.h"
 #include "searchdata.h"
-
-#include "xapian.h"
+#include "rclquery.h"
+#include "rclquery_p.h"
+
 
 #ifndef MAX
 #define MAX(A,B) (A>B?A:B)
@@ -88,125 +90,8 @@
 const static string rclSyntAbs = "?!#@";
 const static string emptystring;
 
-// A class for data and methods that would have to expose
-// Xapian-specific stuff if they were in Rcl::Db. There could actually be
-// 2 different ones for indexing or query as there is not much in
-// common.
-class Native {
- public:
-    Db *m_db;
-    bool m_isopen;
-    bool m_iswritable;
-
-    // Indexing
-    Xapian::WritableDatabase wdb;
-
-    // Querying
-    Xapian::Database db;
-    Xapian::Query    query; // query descriptor: terms and subqueries
-			    // joined by operators (or/and etc...)
-
-    // Filtering results on location. There are 2 possible approaches
-    // for this:
-    //   - Set a "MatchDecider" to be used by Xapian during the query
-    //   - Filter the results out of Xapian (this also uses a
-    //     Xapian::MatchDecider object, but applied to the results by Recoll.
-    // 
-    // The result filtering approach was the first implemented. 
-    //
-    // The efficiency of both methods depend on the searches, so the code
-    // for both has been kept.  A nice point for the Xapian approach is that
-    // the result count estimate are correct (they are wrong with
-    // the postfilter approach). It is also faster in some worst case scenarios
-    // so this now the default (but the post-filtering is faster in many common
-    // cases).
-    // 
-    // Which is used is decided in SetQuery(), by setting either of
-    // the two following members. This in turn is controlled by a
-    // preprocessor directive.
-
-#define XAPIAN_FILTERING 1
-
-    Xapian::MatchDecider *decider;   // Xapian does the filtering
-    Xapian::MatchDecider *postfilter; // Result filtering done by Recoll
-
-    Xapian::Enquire      *enquire; // Open query descriptor.
-    Xapian::MSet          mset;    // Partial result set
-
-    // Term frequencies for current query. See makeAbstract, setQuery
-    map<string, double>  m_termfreqs; 
-    
-    Native(Db *db) 
-	: m_db(db),
-	  m_isopen(false), m_iswritable(false), decider(0), postfilter(0),
-	  enquire(0)
-    { }
-
-    ~Native() {
-	delete decider;
-	delete postfilter;
-	delete enquire;
-    }
-
-    string makeAbstract(Xapian::docid id, const list<string>& terms);
-
-    bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
-
-    /** Compute list of subdocuments for a given path (given by hash) 
-     *  We look for all Q terms beginning with the path/hash
-     *  As suggested by James Aylett, a better method would be to add 
-     *  a single term (ie: XP/path/to/file) to all subdocs, then finding
-     *  them would be a simple matter of retrieving the posting list for the
-     *  term. There would still be a need for the current Qterm though, as a
-     *  unique term for replace_document, and for retrieving by
-     *  path/ipath (history)
-     */
-    bool subDocs(const string &hash, vector<Xapian::docid>& docids);
-
-};
-
-class FilterMatcher : public Xapian::MatchDecider {
-public:
-    FilterMatcher(const string &topdir)
-	: m_topdir(topdir)
-    {}
-    virtual ~FilterMatcher() {}
-
-    virtual 
-#if XAPIAN_MAJOR_VERSION < 1
-    int 
-#else
-    bool
-#endif
-    operator()(const Xapian::Document &xdoc) const 
-    {
-	m_cnt++;
-	// Parse xapian document's data and populate doc fields
-	string data = xdoc.get_data();
-	ConfSimple parms(&data);
-
-	// The only filtering for now is on file path (subtree)
-	string url;
-	parms.get(string("url"), url);
-	LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
-		 m_topdir.c_str(), url.c_str()));
-	if (url.find(m_topdir, 7) == 7) {
-	    LOGDEB2(("FilterMatcher: MATCH    %d\n", m_cnt));
-	    return true; 
-	} else {
-	    LOGDEB2(("FilterMatcher: NO MATCH %d\n", m_cnt));
-	    return false;
-	}
-    }
-    static int m_cnt;
-    
-private:
-    string m_topdir;
-};
-int FilterMatcher::m_cnt;
-
 /* See comment in class declaration */
-bool Native::subDocs(const string &hash, vector<Xapian::docid>& docids) 
+bool Db::Native::subDocs(const string &hash, vector<Xapian::docid>& docids) 
 {
     docids.clear();
     string qterm = "Q"+ hash + "|";
@@ -250,7 +135,7 @@
 }
 
 // Turn data record from db into document fields
-bool Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
+bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
 {
     LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str()));
     ConfSimple parms(&data);
@@ -306,26 +191,29 @@
 
 // Build a document abstract by extracting text chunks around the query terms
 // This uses the db termlists, not the original document.
-string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
+string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
 {
     Chrono chron;
     LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
 	     m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
 
+    list<string> iterms;
+    query->getQueryTerms(iterms);
+
     list<string> terms = noPrefixList(iterms);
     if (terms.empty()) {
 	return "";
     }
 
     // Retrieve db-wide frequencies for the query terms
-    if (m_termfreqs.empty()) {
+    if (query->m_nq->termfreqs.empty()) {
 	double doccnt = db.get_doccount();
 	if (doccnt == 0) doccnt = 1;
 	for (list<string>::const_iterator qit = terms.begin(); 
 	     qit != terms.end(); qit++) {
-	    m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
+	    query->m_nq->termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
 	    LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), 
-		     m_termfreqs[*qit]));
+		    query->m_nq->termfreqs[*qit]));
 	}
 	LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
     }
@@ -343,7 +231,7 @@
 	Xapian::TermIterator term = db.termlist_begin(docid);
 	term.skip_to(*qit);
 	if (term != db.termlist_end(docid) && *term == *qit) {
-	    double q = (term.get_wdf() / doclen) * m_termfreqs[*qit];
+	    double q = (term.get_wdf() / doclen) * query->m_nq->termfreqs[*qit];
 	    q = -log10(q);
 	    if (q < 3) {
 		q = 0.05;
@@ -556,7 +444,7 @@
 /* Rcl::Db methods ///////////////////////////////// */
 
 Db::Db() 
-    : m_ndb(0), m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250),
+    : m_ndb(0), m_idxAbsTruncLen(250), m_synthAbsLen(250),
       m_synthAbsWordCtxLen(4), m_flushMb(-1), 
       m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0),
       m_maxFsOccupPc(0), m_mode(Db::DbRO)
@@ -586,28 +474,9 @@
     return res;
 }
 
-// Generic Xapian exception catching code. We do this quite often,
-// and I have no idea how to do this except for a macro
-#define XCATCHERROR(MSG) \
- catch (const Xapian::Error &e) {		   \
-    MSG = e.get_msg();				   \
-    if (MSG.empty()) MSG = "Empty error message";  \
- } catch (const string &s) {			   \
-    MSG = s;					   \
-    if (MSG.empty()) MSG = "Empty error message";  \
- } catch (const char *s) {			   \
-    MSG = s;					   \
-    if (MSG.empty()) MSG = "Empty error message";  \
- } catch (...) {				   \
-    MSG = "Caught unknown xapian exception";	   \
- } 
-
-
-bool Db::open(const string& dir, const string &stops, OpenMode mode, int qops)
-{
-    bool keep_updated = (qops & QO_KEEP_UPDATED) != 0;
-    qops &= ~QO_KEEP_UPDATED;
-
+bool Db::open(const string& dir, const string &stops, OpenMode mode, 
+	      bool keep_updated)
+{
     if (m_ndb == 0)
 	return false;
     LOGDEB(("Db::open: m_isopen %d m_iswritable %d\n", m_ndb->m_isopen, 
@@ -724,7 +593,7 @@
     if (m_ndb && m_ndb->m_isopen) {
 	if (!close())
 	    return false;
-	if (!open(m_basedir, "", m_mode, m_qOpts | QO_KEEP_UPDATED)) {
+	if (!open(m_basedir, "", m_mode, true)) {
 	    return false;
 	}
     }
@@ -1467,64 +1336,6 @@
     return true;
 }
 
-// Prepare query out of user search data
-bool Db::setQuery(RefCntr<SearchData> sdata, int opts, 
-		  const string& stemlang)
-{
-    if (!m_ndb) {
-	LOGERR(("Db::setQuery: no db!\n"));
-	return false;
-    }
-    m_reason.erase();
-    LOGDEB(("Db::setQuery:\n"));
-
-    m_filterTopDir = sdata->getTopdir();
-    deleteZ(m_ndb->decider);
-    deleteZ(m_ndb->postfilter);
-    if (!m_filterTopDir.empty()) {
-#if XAPIAN_FILTERING
-	m_ndb->decider = 
-#else
-        m_ndb->postfilter =
-#endif
-	    new FilterMatcher(m_filterTopDir);
-    }
-    m_dbindices.clear();
-    m_qOpts = opts;
-    m_ndb->m_termfreqs.clear();
-    FilterMatcher::m_cnt = 0;
-    Xapian::Query xq;
-    if (!sdata->toNativeQuery(*this, &xq, 
-			      (opts & Db::QO_STEM) ? stemlang : "")) {
-	m_reason += sdata->getReason();
-	return false;
-    }
-    m_ndb->query = xq;
-    string ermsg;
-    string d;
-    try {
-	delete m_ndb->enquire;
-	m_ndb->enquire = new Xapian::Enquire(m_ndb->db);
-	m_ndb->enquire->set_query(m_ndb->query);
-	m_ndb->mset = Xapian::MSet();
-	// Get the query description and trim the "Xapian::Query"
-	d = m_ndb->query.get_description();
-    } XCATCHERROR(ermsg);
-    if (!ermsg.empty()) {
-	LOGDEB(("Db::SetQuery: xapian error %s\n", ermsg.c_str()));
-	return false;
-    }
-	
-    if (d.find("Xapian::Query") == 0)
-	d.erase(0, strlen("Xapian::Query"));
-    if (!m_filterTopDir.empty()) {
-	d += string(" [dir: ") + m_filterTopDir + "]";
-    }
-    sdata->setDescription(d);
-    LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
-    return true;
-}
-
 class TermMatchCmpByWcf {
 public:
     int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
@@ -1735,195 +1546,15 @@
     return true;
 }
 
-bool Db::getQueryTerms(list<string>& terms)
-{
-    if (!m_ndb)
-	return false;
-
-    terms.clear();
-    Xapian::TermIterator it;
-    string ermsg;
-    try {
-	for (it = m_ndb->query.get_terms_begin(); 
-	     it != m_ndb->query.get_terms_end(); it++) {
-	    terms.push_back(*it);
-	}
-    } XCATCHERROR(ermsg);
-    if (!ermsg.empty()) {
-	LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str()));
-	return false;
-    }
-    return true;
-}
-
-bool Db::getMatchTerms(const Doc& doc, list<string>& terms)
-{
-    if (!m_ndb || !m_ndb->enquire) {
-	LOGERR(("Db::getMatchTerms: no query opened\n"));
-	return -1;
-    }
-
-    terms.clear();
-    Xapian::TermIterator it;
-    Xapian::docid id = Xapian::docid(doc.xdocid);
-    string ermsg;
-    try {
-	for (it=m_ndb->enquire->get_matching_terms_begin(id);
-	     it != m_ndb->enquire->get_matching_terms_end(id); it++) {
-	    terms.push_back(*it);
-	}
-    } XCATCHERROR(ermsg);
-    if (!ermsg.empty()) {
-	LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str()));
-	return false;
-    }
-
-    return true;
-}
-
-// Mset size
-static const int qquantum = 30;
-
-int Db::getResCnt()
-{
-    if (!m_ndb || !m_ndb->enquire) {
-	LOGERR(("Db::getResCnt: no query opened\n"));
-	return -1;
-    }
-    string ermsg;
-    if (m_ndb->mset.size() <= 0) {
-	try {
-	    m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum, 
-						   0, m_ndb->decider);
-	} catch (const Xapian::DatabaseModifiedError &error) {
-	    m_ndb->db.reopen();
-	    m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum,
-						   0, m_ndb->decider);
-	} XCATCHERROR(ermsg);
-	if (!ermsg.empty()) {
-	    LOGERR(("enquire->get_mset: exception: %s\n", ermsg.c_str()));
-	    return -1;
-	}
-    }
-    int ret = -1;
-    try {
-    ret = m_ndb->mset.get_matches_lower_bound();
-    } catch (...) {}
-    return ret;
-}
-
-
-// Get document at rank i in query (i is the index in the whole result
-// set, as in the enquire class. We check if the current mset has the
-// doc, else ask for an other one. We use msets of 10 documents. Don't
-// know if the whole thing makes sense at all but it seems to work.
-//
-// If there is a postquery filter (ie: file names), we have to
-// maintain a correspondance from the sequential external index
-// sequence to the internal Xapian hole-y one (the holes being the documents 
-// that dont match the filter).
-bool Db::getDoc(int exti, Doc &doc, int *percent)
-{
-    LOGDEB1(("Db::getDoc: exti %d\n", exti));
-    if (!m_ndb || !m_ndb->enquire) {
-	LOGERR(("Db::getDoc: no query opened\n"));
-	return false;
-    }
-
-    int xapi;
-    if (m_ndb->postfilter) {
-	// There is a postquery filter, does this fall in already known area ?
-	if (exti >= (int)m_dbindices.size()) {
-	    // Have to fetch xapian docs and filter until we get
-	    // enough or fail
-	    m_dbindices.reserve(exti+1);
-	    // First xapian doc we fetch is the one after last stored 
-	    int first = m_dbindices.size() > 0 ? m_dbindices.back() + 1 : 0;
-	    // Loop until we get enough docs
-	    while (exti >= (int)m_dbindices.size()) {
-		LOGDEB(("Db::getDoc: fetching %d starting at %d\n",
-			qquantum, first));
-		try {
-		    m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum);
-		} catch (const Xapian::DatabaseModifiedError &error) {
-		    m_ndb->db.reopen();
-		    m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum);
-		} catch (const Xapian::Error & error) {
-		  LOGERR(("enquire->get_mset: exception: %s\n", 
-			  error.get_msg().c_str()));
-		  abort();
-		}
-
-		if (m_ndb->mset.empty()) {
-		    LOGDEB(("Db::getDoc: got empty mset\n"));
-		    return false;
-		}
-		first = m_ndb->mset.get_firstitem();
-		for (unsigned int i = 0; i < m_ndb->mset.size() ; i++) {
-		    LOGDEB(("Db::getDoc: [%d]\n", i));
-		    Xapian::Document xdoc = m_ndb->mset[i].get_document();
-		    if ((*m_ndb->postfilter)(xdoc)) {
-			m_dbindices.push_back(first + i);
-		    }
-		}
-		first = first + m_ndb->mset.size();
-	    }
-	}
-	xapi = m_dbindices[exti];
-    } else {
-	xapi = exti;
-    }
-
-    // From there on, we work with a xapian enquire item number. Fetch it
-    int first = m_ndb->mset.get_firstitem();
-    int last = first + m_ndb->mset.size() -1;
-
-    if (!(xapi >= first && xapi <= last)) {
-	LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
-	try {
-	    m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum,
-						   0, m_ndb->decider);
-	} catch (const Xapian::DatabaseModifiedError &error) {
-	    m_ndb->db.reopen();
-	    m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum,
-						   0, m_ndb->decider);
-
-	} catch (const Xapian::Error & error) {
-	  LOGERR(("enquire->get_mset: exception: %s\n", 
-		  error.get_msg().c_str()));
-	  abort();
-	}
-	if (m_ndb->mset.empty())
-	    return false;
-	first = m_ndb->mset.get_firstitem();
-	last = first + m_ndb->mset.size() -1;
-    }
-
-    LOGDEB1(("Db::getDoc: Qry [%s] win [%d-%d] Estimated results: %d",
-	     m_ndb->query.get_description().c_str(), 
-	     first, last,
-	     m_ndb->mset.get_matches_lower_bound()));
-
-    Xapian::Document xdoc = m_ndb->mset[xapi-first].get_document();
-    Xapian::docid docid = *(m_ndb->mset[xapi-first]);
-    if (percent)
-	*percent = m_ndb->mset.convert_to_percent(m_ndb->mset[xapi-first]);
-
-    // Parse xapian document's data and populate doc fields
-    string data = xdoc.get_data();
-    return m_ndb->dbDataToRclDoc(docid, data, doc);
-}
-
-bool Db::makeDocAbstract(Doc &doc, string& abstract)
+
+bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
 {
     LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
-    if (!m_ndb || !m_ndb->enquire) {
-	LOGERR(("Db::makeDocAbstract: no query opened\n"));
-	return false;
-    }
-    list<string> terms;
-    getQueryTerms(terms);
-    abstract = m_ndb->makeAbstract(doc.xdocid, terms);
+    if (!m_ndb) {
+	LOGERR(("Db::makeDocAbstract: no db\n"));
+	return false;
+    }
+    abstract = m_ndb->makeAbstract(doc.xdocid, query);
     return true;
 }
 
@@ -1969,45 +1600,6 @@
     return false;
 }
 
-list<string> Db::expand(const Doc &doc)
-{
-    list<string> res;
-    if (!m_ndb || !m_ndb->enquire) {
-	LOGERR(("Db::expand: no query opened\n"));
-	return res;
-    }
-    string ermsg;
-    for (int tries = 0; tries < 2; tries++) {
-	try {
-	    Xapian::RSet rset;
-	    rset.add_document(Xapian::docid(doc.xdocid));
-	    // We don't exclude the original query terms.
-	    Xapian::ESet eset = m_ndb->enquire->get_eset(20, rset, false);
-	    LOGDEB(("ESet terms:\n"));
-	    // We filter out the special terms
-	    for (Xapian::ESetIterator it = eset.begin(); 
-		 it != eset.end(); it++) {
-		LOGDEB((" [%s]\n", (*it).c_str()));
-		if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z'))
-		    continue;
-		res.push_back(*it);
-		if (res.size() >= 10)
-		    break;
-	    }
-	} catch (const Xapian::DatabaseModifiedError &error) {
-	    continue;
-	} XCATCHERROR(ermsg);
-	if (!ermsg.empty()) {
-	    LOGERR(("Db::expand: xapian error %s\n", ermsg.c_str()));
-	    res.clear();
-	}
-	break;
-    }
-
-    return res;
-}
-
-
 #ifndef NO_NAMESPACES
 }
 #endif