Switch to side-by-side view

--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.3 2006-11-14 17:41:12 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.4 2006-11-17 10:06:34 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -21,10 +21,7 @@
 // Handle translation from rcl's SearchData structures to Xapian Queries
 
 #include <string>
-#include <list>
-#ifndef NO_NAMESPACES
-using namespace std;
-#endif
+#include <vector>
 
 #include "xapian.h"
 
@@ -36,9 +33,13 @@
 #include "unacpp.h"
 #include "utf8iter.h"
 
+#ifndef NO_NAMESPACES
+using namespace std;
 namespace Rcl {
-
-typedef  list<SearchDataClause *>::iterator qlist_it_t;
+#endif
+
+typedef  vector<SearchDataClause *>::iterator qlist_it_t;
+typedef  vector<SearchDataClause *>::const_iterator qlist_cit_t;
 
 bool SearchData::toNativeQuery(Rcl::Db &db, void *d, const string& stemlang)
 {
@@ -71,7 +72,7 @@
     if (!m_filetypes.empty()) {
 	list<Xapian::Query> pqueries;
 	Xapian::Query tq;
-	for (list<string>::iterator it = m_filetypes.begin(); 
+	for (vector<string>::iterator it = m_filetypes.begin(); 
 	     it != m_filetypes.end(); it++) {
 	    string term = "T" + *it;
 	    LOGDEB(("Adding file type term: [%s]\n", term.c_str()));
@@ -90,6 +91,7 @@
 {
     if (m_tp == SCLT_OR && (cl->m_tp == SCLT_EXCL)) {
 	LOGERR(("SearchData::addClause: cant add EXCL to OR list\n"));
+	m_reason = "No Negative (AND_NOT) clauses allowed in OR queries";
 	return false;
     }
     m_query.push_back(cl);
@@ -98,33 +100,46 @@
 
 // Make me all new
 void SearchData::erase() {
+    LOGDEB(("SearchData::erase\n"));
+    m_tp = SCLT_AND;
     for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
 	delete *it;
     m_query.clear();
     m_filetypes.clear();
     m_topdir.erase();
     m_description.erase();
+    m_reason.erase();
 }
 
 // Am I a file name only search ? This is to turn off term highlighting
-bool SearchData::fileNameOnly() {
+bool SearchData::fileNameOnly() 
+{
     for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
 	if (!(*it)->isFileName())
 	    return false;
     return true;
 }
 
+// Extract all terms and term groups: hand the output vectors to each clause's getTerms() in m_query order. Always returns true.
+bool SearchData::getTerms(vector<string>& terms, 
+			  vector<vector<string> >& groups,
+			  vector<int>& gslks) const
+{
+    for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++)
+	(*it)->getTerms(terms, groups, gslks);
+    return true;
+}
+
 // Splitter callback for breaking a user query string into simple
-// terms and phrases
+// terms and phrases. 
 class wsQData : public TextSplitCB {
  public:
     vector<string> terms;
     // Debug
     string catterms() {
 	string s;
-	for (unsigned int i = 0; i < terms.size(); i++) {
+	for (unsigned int i = 0; i < terms.size(); i++)
 	    s += "[" + terms[i] + "] ";
-	}
 	return s;
     }
     bool takeword(const std::string &term, int , int, int) {
@@ -132,71 +147,97 @@
 	terms.push_back(term);
 	return true;
     }
-    // Decapital + deaccent all terms 
-    void dumball() {
-	for (vector<string>::iterator it=terms.begin(); it !=terms.end();it++){
-	    string dumb;
-	    dumb_string(*it, dumb);
-	    *it = dumb;
-	}
-    }
 };
 
-/** Possibly expand term into its stem siblings, make them dumb strings */
-static void maybeStemExp(Db& db, const string& stemlang, const string& term, 
-			 list<string>& exp)
-{
-    LOGDEB(("maybeStemExp: [%s]\n", term.c_str()));
+// Translates a user query string into Xapian queries and remembers the terms and groups it produced.
+// This used to be a static function, but we couldn't just keep adding parameters to the interface!
+class StringToXapianQ {
+public:
+    StringToXapianQ(Db& db) : m_db(db) { }
+    bool translate(const string &iq,
+		   const string& stemlang,
+		   string &ermsg,
+		   list<Xapian::Query> &pqueries,
+		   int slack = 0, bool useNear = false);
+    bool getTerms(vector<string>& terms, 
+		  vector<vector<string> >& groups) 
+    {
+	terms.insert(terms.end(), m_terms.begin(), m_terms.end());
+	groups.insert(groups.end(), m_groups.begin(), m_groups.end());
+	return true;
+    }
+private:
+    void maybeStemExp(const string& stemlang, const string& term, 
+		      list<string>& exp);
+
+    Db& m_db;
+    // Single terms (m_terms) and multi-word term groups (m_groups) recorded by the last translate() call; getTerms() appends them to the caller's vectors
+    vector<string>          m_terms;
+    vector<vector<string> > m_groups; 
+};
+
+/** Make term dumb (dumb_string) and possibly expand it into its stem siblings via m_db; an empty term yields an empty exp. No expansion if stemlang is empty or the first letter is a majuscule. */
+void StringToXapianQ::maybeStemExp(const string& stemlang, 
+				   const string& term, 
+				   list<string>& exp)
+{
+    LOGDEB2(("maybeStemExp: [%s]\n", term.c_str()));
+    if (term.empty()) {
+	exp.clear();
+	return;
+    }
+
     string term1;
     dumb_string(term, term1);
-    if (!stemlang.empty()) {
-	bool nostemexp = false;
+
+    bool nostemexp = stemlang.empty() ? true : false;
+    if (!nostemexp) {
 	// Check if the first letter is a majuscule in which
 	// case we do not want to do stem expansion. Note that
 	// the test is convoluted and possibly problematic
-	if (term.length() > 0) {
-	    string noacterm,noaclowterm;
-	    if (unacmaybefold(term, noacterm, "UTF-8", false) &&
-		unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
-		Utf8Iter it1(noacterm);
-		Utf8Iter it2(noaclowterm);
-		if (*it1 != *it2)
-		    nostemexp = true;
-	    }
+
+	string noacterm,noaclowterm;
+	if (unacmaybefold(term, noacterm, "UTF-8", false) &&
+	    unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
+	    Utf8Iter it1(noacterm);
+	    Utf8Iter it2(noaclowterm);
+	    if (*it1 != *it2)
+		nostemexp = true;
 	}
-	LOGDEB1(("Term: %s stem expansion: %s\n", 
-		 term.c_str(), nostemexp?"no":"yes"));
-	if (!nostemexp) {
-	    exp = db.stemExpand(stemlang, term1);
-	    return;
-	}
-    }
-
-    exp.push_back(term1);
-}
-
-/** Turn string into list of xapian queries. There is little
+	LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str(), nostemexp?"no":"yes"));
+    }
+
+    if (nostemexp) {
+	exp = list<string>(1, term1);
+    } else {
+	exp = m_db.stemExpand(stemlang, term1);
+    }
+}
+
+/** 
+ * Turn string into list of xapian queries. There is little
  * interpretation done on the string (no +term -term or filename:term
  * stuff). We just separate words and phrases, and interpret
  * capitalized terms as wanting no stem expansion. 
  * The final list contains one query for each term or phrase
  *   - Elements corresponding to a stem-expanded part are an OP_OR
- *    composition of the stem-expanded terms (or a single term query).
+ *     composition of the stem-expanded terms (or a single term query).
  *   - Elements corresponding to a phrase are an OP_PHRASE composition of the
  *     phrase terms (no stem expansion in this case)
  * @return the subquery count (either or'd stem-expanded terms or phrase word
  *   count)
  */
-static bool stringToXapianQueries(const string &iq,
-				  const string& stemlang,
-				  Db& db,
-				  string &ermsg,
-				  list<Xapian::Query> &pqueries,
-				  int slack = 0, bool useNear = false)
+bool StringToXapianQ::translate(const string &iq,
+				const string& stemlang,
+				string &ermsg,
+				list<Xapian::Query> &pqueries,
+				int slack, bool useNear)
 {
     string qstring = iq;
     bool opt_stemexp = !stemlang.empty();
     ermsg.erase();
+    m_terms.clear();
+    m_groups.clear();
 
     // Split into words and phrases (word1 word2 "this is a phrase"):
     list<string> phrases;
@@ -231,10 +272,11 @@
 		{
 		    string term = splitData.terms.front();
 		    list<string> exp;  
-		    maybeStemExp(db, stemlang, term, exp);
+		    maybeStemExp(stemlang, term, exp);
 		    // Push either term or OR of stem-expanded set
 		    pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
 						     exp.begin(), exp.end()));
+		    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
 		}
 		break;
 
@@ -245,14 +287,18 @@
 		list<Xapian::Query> orqueries;
 		bool hadmultiple = false;
 		string nolang, lang;
+		vector<string> dumbterms;
 		for (vector<string>::iterator it = splitData.terms.begin();
 		     it != splitData.terms.end(); it++) {
 		    list<string>exp;
 		    lang = (op == Xapian::Query::OP_PHRASE || hadmultiple) ?
 			nolang : stemlang;
-		    maybeStemExp(db, lang, *it, exp);
-		    if (exp.size() > 1)
+		    maybeStemExp(lang, *it, exp);
+		    dumbterms.insert(dumbterms.end(), exp.begin(), exp.end());
+#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
+		    if (exp.size() > 1) 
 			hadmultiple = true;
+#endif
 		    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
 						      exp.begin(), exp.end()));
 		}
@@ -260,6 +306,7 @@
 						 orqueries.begin(),
 						 orqueries.end(),
 					 splitData.terms.size() + slack));
+		m_groups.push_back(dumbterms);
 	    }
 	}
     } catch (const Xapian::Error &e) {
@@ -282,12 +329,15 @@
 bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, 
 					   const string& stemlang)
 {
+    m_terms.clear();
+    m_groups.clear();
     Xapian::Query *qp = (Xapian::Query *)p;
     *qp = Xapian::Query();
 
     Xapian::Query::op op;
     switch (m_tp) {
     case SCLT_AND: op = Xapian::Query::OP_AND; break;
+	// EXCL will be set with AND_NOT in the list. So it's an OR list here
     case SCLT_OR: 
     case SCLT_EXCL: op = Xapian::Query::OP_OR; break;
     default:
@@ -295,12 +345,14 @@
 	return false;
     }
     list<Xapian::Query> pqueries;
-    if (!stringToXapianQueries(m_text, stemlang, db, m_reason, pqueries))
+    StringToXapianQ tr(db);
+    if (!tr.translate(m_text, stemlang, m_reason, pqueries))
 	return false;
     if (pqueries.empty()) {
 	LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
 	return true;
     }
+    tr.getTerms(m_terms, m_groups);
     *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
     return true;
 }
@@ -319,28 +371,31 @@
     return true;
 }
 
-// Translate NEAR or PHRASE clause. We're not handling the distance parameter
-// yet.
+// Translate NEAR or PHRASE clause: m_text is double-quoted and passed to StringToXapianQ::translate together with m_slack.
 bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, 
 					 const string& stemlang)
 {
+    m_terms.clear();
+    m_groups.clear();
+
     Xapian::Query *qp = (Xapian::Query *)p;
     *qp = Xapian::Query();
 
     list<Xapian::Query> pqueries;
     Xapian::Query nq;
+
+    // Use StringToXapianQ::translate to lowercase and simplify the phrase
+    // terms etc. The result should be a single element list
     string s = string("\"") + m_text + string("\"");
     bool useNear = m_tp == SCLT_NEAR;
-
-    // Use stringToXapianQueries anyway to lowercase and simplify the
-    // phrase terms etc. The result should be a single element list
-    if (!stringToXapianQueries(s, stemlang, db, m_reason, pqueries,
-			       m_slack, useNear))
+    StringToXapianQ tr(db);
+    if (!tr.translate(s, stemlang, m_reason, pqueries, m_slack, useNear))
 	return false;
     if (pqueries.empty()) {
 	LOGERR(("SearchDataClauseDist: resolved to null query\n"));
 	return true;
     }
+    tr.getTerms(m_terms, m_groups);
     *qp = *pqueries.begin();
     return true;
 }