--- a/src/rcldb/searchdata.cpp
+++ b/src/rcldb/searchdata.cpp
@@ -1,5 +1,5 @@
#ifndef lint
-static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.30 2008-12-17 14:26:09 dockes Exp $ (C) 2006 J.F.Dockes";
+static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@@ -229,7 +229,7 @@
}
private:
- void stripExpandTerm(bool dont, const string& term, list<string>& exp,
+ void expandTerm(bool dont, const string& term, list<string>& exp,
string& sterm);
// After splitting entry on whitespace: process non-phrase element
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
@@ -247,7 +247,7 @@
vector<vector<string> > m_groups;
};
-/** Unaccent and lowercase term, possibly expand stem and wildcards
+/** Expand stem and wildcards
*
* @param nostemexp don't perform stem expansion. This is mainly used to
* prevent stem expansion inside phrases (because the user probably
@@ -257,24 +257,20 @@
* capitalized term, or wildcard(s)
* @param term input single word
* @param exp output expansion list
- * @param sterm output lower-cased+unaccented version of the input term
- * (only for stem expansion, not wildcards)
+ * @param sterm output original input term if there were no wildcards
*/
-void StringToXapianQ::stripExpandTerm(bool nostemexp,
+void StringToXapianQ::expandTerm(bool nostemexp,
const string& term,
list<string>& exp,
string &sterm)
{
- LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n",
+ LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n",
term.c_str(), m_stemlang.c_str(), nostemexp));
sterm.erase();
exp.clear();
if (term.empty()) {
return;
}
- // term1 is lowercase and without diacritics
- string term1;
- dumb_string(term, term1);
bool haswild = term.find_first_of("*?[") != string::npos;
@@ -299,16 +295,16 @@
if (nostemexp && !haswild) {
// Neither stemming nor wildcard expansion: just the word
- sterm = term1;
- exp.push_front(term1);
+ sterm = term;
+ exp.push_front(term);
exp.resize(1);
} else {
list<TermMatchEntry> l;
if (haswild) {
- m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
+ m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
} else {
- sterm = term1;
- m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
+ sterm = term;
+ m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
}
for (list<TermMatchEntry>::const_iterator it = l.begin();
it != l.end(); it++) {
@@ -365,7 +361,7 @@
{
list<string> exp;
string sterm; // dumb version of user term
- stripExpandTerm(false, span, exp, sterm);
+ expandTerm(false, span, exp, sterm);
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
addPrefix(exp, m_prefix);
// Push either term or OR of stem-expanded set
@@ -409,7 +405,7 @@
string sterm;
list<string>exp;
- stripExpandTerm(nostemexp, *it, exp, sterm);
+ expandTerm(nostemexp, *it, exp, sterm);
groups.push_back(vector<string>(exp.begin(), exp.end()));
addPrefix(exp, m_prefix);
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
@@ -448,7 +444,7 @@
* @return the subquery count (either or'd stem-expanded terms or phrase word
* count)
*/
-bool StringToXapianQ::processUserString(const string &iq,
+bool StringToXapianQ::processUserString(const string &_iq,
string &ermsg,
list<Xapian::Query> &pqueries,
const StopList& stops,
@@ -456,10 +452,18 @@
bool useNear
)
{
- LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
+ LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
ermsg.erase();
m_terms.clear();
m_groups.clear();
+
+ // First unaccent/normalize the input: do it first so that it
+ // happens in the same order as when indexing: unac then split. As
+ // the character count can change during normalisation, this is
+ // specially important for cjk because the artificial cjk split is
+ // based on character counts
+ string iq;
+ dumb_string(_iq, iq);
// Simple whitespace-split input into user-level words and
// double-quoted phrases: word1 word2 "this is a phrase". The text