--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -183,17 +183,39 @@
wsQData(const StopList &_stops)
: stops(_stops), alltermcount(0)
{}
+ bool takeword(const std::string &interm, int , int, int) {
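+        // The unnamed int arguments are unused here.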
+ alltermcount++;
+ LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
+
+        // Check whether the first letter is a majuscule, in which
+        // case we do not want to do stem expansion. Note that
+        // the test is convoluted and possibly problematic.
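+        // noacterm: accents stripped only; noaclowterm: accents stripped and case-folded.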
+ string noacterm, noaclowterm;
+ if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
+ LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
+ return true;
+ }
+ if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
+ LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
+ return true;
+ }
+ bool nostemexp = false;
+ Utf8Iter it1(noacterm);
+ Utf8Iter it2(noaclowterm);
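+        // The first characters differ only if the original first letter was uppercase.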
+ if (*it1 != *it2)
+ nostemexp = true;
+
+ if (stops.hasStops() && stops.isStop(noaclowterm)) {
+ LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
+ return true;
+ }
+ terms.push_back(noaclowterm);
+ nostemexps.push_back(nostemexp);
+ return true;
+ }
+
vector<string> terms;
- bool takeword(const std::string &term, int , int, int) {
- alltermcount++;
- LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
- if (stops.hasStops() && stops.isStop(term)) {
- LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
- return true;
- }
- terms.push_back(term);
- return true;
- }
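+    // Parallel to 'terms': true when stem expansion should be skipped for that term.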
+ vector<bool> nostemexps;
const StopList &stops;
// Count of terms including stopwords: this is for adjusting
// phrase/near slack
@@ -232,7 +254,7 @@
void expandTerm(bool dont, const string& term, list<string>& exp,
string& sterm);
// After splitting entry on whitespace: process non-phrase element
- void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
+ void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
// Process phrase/near element
void processPhraseOrNear(wsQData *splitData,
list<Xapian::Query> &pqueries,
@@ -279,18 +301,6 @@
nostemexp = true;
if (!nostemexp) {
- // Check if the first letter is a majuscule in which
- // case we do not want to do stem expansion. Note that
- // the test is convoluted and possibly problematic
-
- string noacterm, noaclowterm;
- if (unacmaybefold(term, noacterm, "UTF-8", false) &&
- unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
- Utf8Iter it1(noacterm);
- Utf8Iter it2(noaclowterm);
- if (*it1 != *it2)
- nostemexp = true;
- }
}
if (nostemexp && !haswild) {
@@ -356,12 +366,12 @@
it->insert(0, prefix);
}
-void StringToXapianQ::processSimpleSpan(const string& span,
+void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
list<Xapian::Query> &pqueries)
{
list<string> exp;
string sterm; // dumb version of user term
- expandTerm(false, span, exp, sterm);
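+    // Stem expansion is now controlled by the caller-supplied flag.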
+ expandTerm(nostemexp, span, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
addPrefix(exp, m_prefix);
// Push either term or OR of stem-expanded set
@@ -396,12 +406,13 @@
vector<vector<string> >groups;
// Go through the list and perform stem/wildcard expansion for each element
+ vector<bool>::iterator nxit = splitData->nostemexps.begin();
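+    // nxit stays in step with the terms iterator in the loop below.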
for (vector<string>::iterator it = splitData->terms.begin();
- it != splitData->terms.end(); it++) {
+ it != splitData->terms.end(); it++, nxit++) {
// Adjust when we do stem expansion. Not inside phrases, and
// some versions of xapian will accept only one OR clause
// inside NEAR, all others must be leafs.
- bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
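+        // Also honor the per-term flag set by the splitter (e.g. for capitalized words).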
+ bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
string sterm;
list<string>exp;
@@ -434,7 +445,10 @@
/**
* Turn user entry string (NOT query language) into a list of xapian queries.
- * We just separate words and phrases, and do wildcard and stemp expansion,
+ * We just separate words and phrases, and do wildcard and stem expansion.
+ *
+ * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
+ * the GUI.
*
* The final list contains one query for each term or phrase
* - Elements corresponding to a stem-expanded part are an OP_OR
@@ -444,7 +458,7 @@
* @return the subquery count (either or'd stem-expanded terms or phrase word
* count)
*/
-bool StringToXapianQ::processUserString(const string &_iq,
+bool StringToXapianQ::processUserString(const string &iq,
string &ermsg,
list<Xapian::Query> &pqueries,
const StopList& stops,
@@ -452,25 +466,18 @@
bool useNear
)
{
- LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
+ LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
ermsg.erase();
m_terms.clear();
m_groups.clear();
- // First unaccent/normalize the input: do it first so that it
- // happens in the same order as when indexing: unac then split. As
- // the character count can change during normalisation, this is
- // specially important for cjk because the artificial cjk split is
- // based on character counts
- string iq;
- dumb_string(_iq, iq);
-
// Simple whitespace-split input into user-level words and
- // double-quoted phrases: word1 word2 "this is a phrase". The text
- // splitter may further still decide that the resulting "words"
- // are really phrases, this depends on separators: [paul@dom.net]
- // would still be a word (span), but [about:me] will probably be
- // handled as a phrase.
+ // double-quoted phrases: word1 word2 "this is a phrase".
+ //
+    // The text splitter may still decide that the resulting
+    // "words" are really phrases; this depends on separators:
+ // [paul@dom.net] would still be a word (span), but [about:me]
+ // will probably be handled as a phrase.
list<string> phrases;
TextSplit::stringToStrings(iq, phrases);
@@ -516,7 +523,7 @@
case 0:
continue;// ??
case 1:
- processSimpleSpan(splitData->terms.front(), pqueries);
+ processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
break;
default:
processPhraseOrNear(splitData, pqueries, useNear, slack);