recoll / Code / Diff of /src/rcldb/searchdata.cpp

Diff of /src/rcldb/searchdata.cpp [5463ea] .. [0821f0]

Switch to side-by-side view

--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.30 2008-12-17 14:26:09 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -229,7 +229,7 @@
     }
 
 private:
-    void stripExpandTerm(bool dont, const string& term, list<string>& exp, 
+    void expandTerm(bool dont, const string& term, list<string>& exp, 
 		      string& sterm);
     // After splitting entry on whitespace: process non-phrase element
     void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
@@ -247,7 +247,7 @@
     vector<vector<string> > m_groups; 
 };
 
-/** Unaccent and lowercase term, possibly expand stem and wildcards
+/** Expand stem and wildcards
  *
  * @param nostemexp don't perform stem expansion. This is mainly used to
  *   prevent stem expansion inside phrases (because the user probably
@@ -257,24 +257,20 @@
  *   capitalized term, or wildcard(s)
  * @param term input single word
  * @param exp output expansion list
- * @param sterm output lower-cased+unaccented version of the input term 
- *              (only for stem expansion, not wildcards)
+ * @param sterm output: the original input term, set only when it has no wildcards
  */
-void StringToXapianQ::stripExpandTerm(bool nostemexp, 
+void StringToXapianQ::expandTerm(bool nostemexp, 
 				      const string& term, 
 				      list<string>& exp,
 				      string &sterm)
 {
-    LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n", 
+    LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n", 
 	     term.c_str(), m_stemlang.c_str(), nostemexp));
     sterm.erase();
     exp.clear();
     if (term.empty()) {
 	return;
     }
-    // term1 is lowercase and without diacritics
-    string term1;
-    dumb_string(term, term1);
 
     bool haswild = term.find_first_of("*?[") != string::npos;
 
@@ -299,16 +295,16 @@
 
     if (nostemexp && !haswild) {
 	// Neither stemming nor wildcard expansion: just the word
-	sterm = term1;
-	exp.push_front(term1);
+	sterm = term;
+	exp.push_front(term);
 	exp.resize(1);
     } else {
 	list<TermMatchEntry> l;
 	if (haswild) {
-	    m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
+	    m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
 	} else {
-	    sterm = term1;
-	    m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
+	    sterm = term;
+	    m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
 	}
 	for (list<TermMatchEntry>::const_iterator it = l.begin(); 
 	     it != l.end(); it++) {
@@ -365,7 +361,7 @@
 {
     list<string> exp;  
     string sterm; // dumb version of user term
-    stripExpandTerm(false, span, exp, sterm);
+    expandTerm(false, span, exp, sterm);
     m_terms.insert(m_terms.end(), exp.begin(), exp.end());
     addPrefix(exp, m_prefix);
     // Push either term or OR of stem-expanded set
@@ -409,7 +405,7 @@
 
 	string sterm;
 	list<string>exp;
-	stripExpandTerm(nostemexp, *it, exp, sterm);
+	expandTerm(nostemexp, *it, exp, sterm);
 	groups.push_back(vector<string>(exp.begin(), exp.end()));
 	addPrefix(exp, m_prefix);
 	orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
@@ -448,7 +444,7 @@
  * @return the subquery count (either or'd stem-expanded terms or phrase word
  *   count)
  */
-bool StringToXapianQ::processUserString(const string &iq,
+bool StringToXapianQ::processUserString(const string &_iq,
 					string &ermsg,
 					list<Xapian::Query> &pqueries,
 					const StopList& stops,
@@ -456,10 +452,18 @@
 					bool useNear
 					)
 {
-    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
+    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
     ermsg.erase();
     m_terms.clear();
     m_groups.clear();
+
+    // Unaccent/normalize the whole input before splitting it, so that
+    // the steps happen in the same order as when indexing: unac, then
+    // split. Normalisation can change the character count, so this is
+    // especially important for CJK text, where the artificial split
+    // is based on character counts.
+    string iq;
+    dumb_string(_iq, iq);
 
     // Simple whitespace-split input into user-level words and
     // double-quoted phrases: word1 word2 "this is a phrase". The text