--- a/src/rcldb/searchdatatox.cpp
+++ b/src/rcldb/searchdatatox.cpp
@@ -288,48 +288,50 @@
return true;
}
-// Splitter callback for breaking a user string into simple terms and
+// Splitter for breaking a user string into simple terms and
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
-// actually multiple terms to rcldb (ie term1,term2)
+// actually multiple terms to rcldb (i.e. term1,term2). Still, most of
+// the time, the result of our splitting will be a single term.
class TextSplitQ : public TextSplitP {
public:
- TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
- : TextSplitP(prc, flags),
- curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
- {}
-
- bool takeword(const std::string &term, int pos, int bs, int be)
- {
+ TextSplitQ(Flags flags, TermProc *prc)
+ : TextSplitP(prc, flags), m_nostemexp(false) {
+ }
+
+ bool takeword(const std::string &term, int pos, int bs, int be) {
// Check if the first letter is a majuscule in which
// case we do not want to do stem expansion. Need to do this
// before unac of course...
- curnostemexp = unaciscapital(term);
+ m_nostemexp = unaciscapital(term);
return TextSplitP::takeword(term, pos, bs, be);
}
- bool curnostemexp;
- vector<string> terms;
- vector<bool> nostemexps;
- const StopList &stops;
- // Count of terms including stopwords: this is for adjusting
- // phrase/near slack
- int alltermcount;
- int lastpos;
+ bool nostemexp() const {
+ return m_nostemexp;
+ }
+private:
+ bool m_nostemexp;
};
class TermProcQ : public TermProc {
public:
- TermProcQ() : TermProc(0), m_ts(0) {}
- void setTSQ(TextSplitQ *ts) {m_ts = ts;}
+ TermProcQ() : TermProc(0), m_alltermcount(0), m_lastpos(0), m_ts(0) {}
+
+ // We need a ref to the splitter (only it knows about orig term
+    // capitalization, for controlling stemming). The ref can't be set
+ // in the constructor because the splitter is not built yet when
+ // we are born (chicken and egg).
+ void setTSQ(const TextSplitQ *ts) {
+ m_ts = ts;
+ }
- bool takeword(const std::string &term, int pos, int bs, int be)
- {
- m_ts->alltermcount++;
- if (m_ts->lastpos < pos)
- m_ts->lastpos = pos;
- bool noexpand = be ? m_ts->curnostemexp : true;
+ bool takeword(const std::string &term, int pos, int bs, int be) {
+ m_alltermcount++;
+ if (m_lastpos < pos)
+ m_lastpos = pos;
+ bool noexpand = be ? m_ts->nostemexp() : true;
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
term.c_str(), pos, noexpand));
if (m_terms[pos].size() < term.size()) {
@@ -338,17 +340,36 @@
}
return true;
}
- bool flush()
- {
+
+ bool flush() {
for (map<int, string>::const_iterator it = m_terms.begin();
it != m_terms.end(); it++) {
- m_ts->terms.push_back(it->second);
- m_ts->nostemexps.push_back(m_nste[it->first]);
+ m_vterms.push_back(it->second);
+ m_vnostemexps.push_back(m_nste[it->first]);
}
return true;
}
+
+ int alltermcount() const {
+ return m_alltermcount;
+ }
+ int lastpos() const {
+ return m_lastpos;
+ }
+ const vector<string>& terms() {
+ return m_vterms;
+ }
+ const vector<bool>& nostemexps() {
+ return m_vnostemexps;
+ }
private:
- TextSplitQ *m_ts;
+ // Count of terms including stopwords: this is for adjusting
+ // phrase/near slack
+ int m_alltermcount;
+ int m_lastpos;
+ const TextSplitQ *m_ts;
+ vector<string> m_vterms;
+ vector<bool> m_vnostemexps;
map<int, string> m_terms;
map<int, bool> m_nste;
};
@@ -588,7 +609,7 @@
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
- TextSplitQ *splitData,
+ TermProcQ *splitData,
int mods, void *pq,
bool useNear, int slack)
{
@@ -613,9 +634,9 @@
}
// Go through the list and perform stem/wildcard expansion for each element
- vector<bool>::iterator nxit = splitData->nostemexps.begin();
- for (vector<string>::iterator it = splitData->terms.begin();
- it != splitData->terms.end(); it++, nxit++) {
+ vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
+ for (vector<string>::const_iterator it = splitData->terms().begin();
+ it != splitData->terms().end(); it++, nxit++) {
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
// Adjust when we do stem expansion. Not if disabled by
// caller, not inside phrases, and some versions of xapian
@@ -660,9 +681,9 @@
// Generate an appropriate PHRASE/NEAR query with adjusted slack
// For phrases, give a relevance boost like we do for original terms
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
- splitData->alltermcount, splitData->lastpos));
+ splitData->alltermcount(), splitData->lastpos()));
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
- splitData->lastpos + 1 + slack);
+ splitData->lastpos() + 1 + slack);
if (op == Xapian::Query::OP_PHRASE)
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
original_term_wqf_booster);
@@ -772,6 +793,7 @@
// and the last position
// The term processing pipeline:
+ // split -> [unac/case ->] stops -> store terms
TermProcQ tpq;
TermProc *nxt = &tpq;
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
@@ -783,28 +805,28 @@
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
TextSplit::TXTS_KEEPWILD),
- stops, nxt);
+ nxt);
tpq.setTSQ(&splitter);
splitter.text_to_words(*it);
- slack += splitter.lastpos - splitter.terms.size() + 1;
-
- LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
- switch (splitter.terms.size() + terminc) {
+ slack += tpq.lastpos() - tpq.terms().size() + 1;
+
+ LOGDEB0(("strToXapianQ: termcount: %d\n", tpq.terms().size()));
+ switch (tpq.terms().size() + terminc) {
case 0:
continue;// ??
case 1: {
int lmods = mods;
- if (splitter.nostemexps.front())
+ if (tpq.nostemexps().front())
lmods |= SearchDataClause::SDCM_NOSTEMMING;
- m_hldata.ugroups.push_back(splitter.terms);
- processSimpleSpan(db, ermsg, splitter.terms.front(),
+ m_hldata.ugroups.push_back(tpq.terms());
+ processSimpleSpan(db, ermsg, tpq.terms().front(),
lmods, &pqueries);
}
break;
default:
- m_hldata.ugroups.push_back(splitter.terms);
- processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
+ m_hldata.ugroups.push_back(tpq.terms());
+ processPhraseOrNear(db, ermsg, &tpq, mods, &pqueries,
useNear, slack);
}
if (m_curcl >= getMaxCl()) {
@@ -846,6 +868,7 @@
case SCLT_OR: op = Xapian::Query::OP_OR; break;
default:
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
+ m_reason = "Internal error";
return false;
}
@@ -854,7 +877,9 @@
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
- return true;
+ m_reason = string("Resolved to null query. Term too long ? : [" +
+ m_text + string("]"));
+ return false;
}
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
@@ -970,7 +995,9 @@
return false;
if (pqueries.empty()) {
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
- return true;
+ m_reason = string("Resolved to null query. Term too long ? : [" +
+ m_text + string("]"));
+ return false;
}
*qp = *pqueries.begin();