a/src/rcldb/searchdata.cpp
b/src/rcldb/searchdata.cpp

...
namespace Rcl {

typedef vector<SearchDataClause *>::iterator qlist_it_t;
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;

static const int original_term_wqf_booster = 10;

void SearchData::commoninit()
{
    m_haveDates = false;
    m_maxSize = size_t(-1);
    m_minSize = size_t(-1);
...
SearchData::~SearchData()
{
    LOGDEB0(("SearchData::~SearchData\n"));
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
        delete *it;
}

// Expand categories and mime type wildcard expressions. Categories are
// expanded against the configuration, mime types against the index
// (for wildcards).
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
{
    const RclConfig *cfg = db.getConf();
    if (!cfg) {
        LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
        return false;
    }
    vector<string> exptps;

    for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
        if (cfg->isMimeCategory(*it)) {
            vector<string> tps;
            cfg->getMimeCatTypes(*it, tps);
            exptps.insert(exptps.end(), tps.begin(), tps.end());
        } else {
            TermMatchResult res;
            string mt = stringtolower((const string&)*it);
            // We set casesens|diacsens to get an equivalent of ixTermMatch()
            db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
                         mt, res, -1, "mtype");
            if (res.entries.empty()) {
                exptps.push_back(it->c_str());
            } else {
                for (vector<TermMatchEntry>::const_iterator rit =
                         res.entries.begin(); rit != res.entries.end(); rit++) {
                    exptps.push_back(strip_prefix(rit->term));
                }
            }
        }
    }
    sort(exptps.begin(), exptps.end());
    exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());

    tps = exptps;
    return true;
}
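
// Editor's note: the usage sketch below is illustrative and not part of the
// original source. "sdata" stands for a SearchData object, the category name
// is made up, and the resulting MIME list depends on the local mimeconf and
// on what is actually indexed.
//
//     vector<string> tps;
//     tps.push_back("presentation");   // hypothetical category name
//     tps.push_back("video/*");        // wildcard expanded against "mtype" index terms
//     sdata.expandFileTypes(db, tps);
//     // tps now holds a sorted, deduplicated list of concrete MIME types,
//     // e.g. {"application/vnd.ms-powerpoint", "video/mp4", ...}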

static const char *maxXapClauseMsg =
    "Maximum Xapian query size exceeded. Increase maxXapianClauses "
    "in the configuration. ";
static const char *maxXapClauseCaseDiacMsg =
    "Or try to use case (C) or diacritics (D) sensitivity qualifiers, or less "
    "wildcards ?"
    ;

bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
                                vector<SearchDataClause*>& query,
                                string& reason, void *d)
{
    Xapian::Query xq;
    for (qlist_it_t it = query.begin(); it != query.end(); it++) {
        Xapian::Query nq;
        if (!(*it)->toNativeQuery(db, &nq)) {
            LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
                    (*it)->getReason().c_str()));
            reason += (*it)->getReason() + " ";
            return false;
        }
        if (nq.empty()) {
            LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
            continue;
        }
        // If this structure is an AND list, must use AND_NOT for excl clauses.
        // Else this is an OR list, and there can't be excl clauses (checked by
        // addClause())
        Xapian::Query::op op;
        if (tp == SCLT_AND) {
            if ((*it)->getexclude()) {
                op = Xapian::Query::OP_AND_NOT;
            } else {
                op = Xapian::Query::OP_AND;
            }
        } else {
            op = Xapian::Query::OP_OR;
        }
        if (xq.empty()) {
            if (op == Xapian::Query::OP_AND_NOT)
                xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
            else
                xq = nq;
        } else {
            xq = Xapian::Query(op, xq, nq);
        }
        if (int(xq.get_length()) >= getMaxCl()) {
            LOGERR(("%s\n", maxXapClauseMsg));
            m_reason += maxXapClauseMsg;
            if (!o_index_stripchars)
                m_reason += maxXapClauseCaseDiacMsg;
            return false;
        }
    }

    LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length()));

    if (xq.empty())
        xq = Xapian::Query::MatchAll;

    *((Xapian::Query *)d) = xq;
    return true;
}

bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
    LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
    m_reason.erase();

    db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
    db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);

    // Walk the clause list translating each in turn and building the
    // Xapian query tree
    Xapian::Query xq;
    if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
        LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
                m_reason.c_str()));
        return false;
    }

    if (m_haveDates) {
        // If one of the extremities is unset, compute db extremas
        if (m_dates.y1 == 0 || m_dates.y2 == 0) {
            int minyear = 1970, maxyear = 2100;
            if (!db.maxYearSpan(&minyear, &maxyear)) {
                LOGERR(("Can't retrieve index min/max dates\n"));
                //whatever, go on.
            }

            if (m_dates.y1 == 0) {
                m_dates.y1 = minyear;
                m_dates.m1 = 1;
                m_dates.d1 = 1;
            }
            if (m_dates.y2 == 0) {
                m_dates.y2 = maxyear;
                m_dates.m2 = 12;
                m_dates.d2 = 31;
            }
        }
        LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
                m_dates.y1, m_dates.m1, m_dates.d1,
                m_dates.y2, m_dates.m2, m_dates.d2));
        Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
                                             m_dates.y2, m_dates.m2, m_dates.d2);
        if (dq.empty()) {
            LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
        }
        // If no probabilistic query is provided then promote the daterange
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
            xq = dq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
        }
    }

    if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
        Xapian::Query sq;
        char min[50], max[50];
        sprintf(min, "%lld", (long long)m_minSize);
        sprintf(max, "%lld", (long long)m_maxSize);
        if (m_minSize == size_t(-1)) {
            string value(max);
            leftzeropad(value, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
        } else if (m_maxSize == size_t(-1)) {
            string value(min);
            leftzeropad(value, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
        } else {
            string minvalue(min);
            leftzeropad(minvalue, 12);
            string maxvalue(max);
            leftzeropad(maxvalue, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE,
                               minvalue, maxvalue);
        }

        // If no probabilistic query is provided then promote the
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
            xq = sq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
        }
    }
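
    // Editor's note (illustrative, not in the original source): the
    // leftzeropad() calls above make the size values compare correctly as
    // strings. Assuming m_minSize == 1024 and m_maxSize == 2048, the
    // OP_VALUE_RANGE query above is built with the padded bounds
    // "000000001024" and "000000002048", so that lexicographic comparison
    // on VALUE_SIZE matches numeric order.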

    // Add the autophrase if any
    if (m_autophrase.isNotNull()) {
        Xapian::Query apq;
        if (m_autophrase->toNativeQuery(db, &apq)) {
            xq = xq.empty() ? apq :
                Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
        }
    }

    // Add the file type filtering clause if any
    if (!m_filetypes.empty()) {
        expandFileTypes(db, m_filetypes);

        Xapian::Query tq;
        for (vector<string>::iterator it = m_filetypes.begin();
             it != m_filetypes.end(); it++) {
            string term = wrap_prefix(mimetype_prefix) + *it;
            LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) :
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
    }

    // Add the neg file type filtering clause if any
    if (!m_nfiletypes.empty()) {
        expandFileTypes(db, m_nfiletypes);

        Xapian::Query tq;
        for (vector<string>::iterator it = m_nfiletypes.begin();
             it != m_nfiletypes.end(); it++) {
            string term = wrap_prefix(mimetype_prefix) + *it;
            LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) :
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
    }

    *((Xapian::Query *)d) = xq;
    return true;
}

// This is called by the GUI simple search if the option is set: add
// (OR) phrase to a query (if it is simple enough) so that results
// where the search terms are close and in order will come up on top.
...
    for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++)
        (*it)->getTerms(hld);
    return;
}

// Splitter callback for breaking a user string into simple terms and
// phrases. This is for parts of the user entry which would appear as
// a single word because there is no white space inside, but are
// actually multiple terms to rcldb (ie term1,term2)
class TextSplitQ : public TextSplitP {
public:
    TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
        : TextSplitP(prc, flags),
          curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
    {}

    bool takeword(const std::string &term, int pos, int bs, int be)
    {
        // Check if the first letter is a majuscule in which
        // case we do not want to do stem expansion. Need to do this
        // before unac of course...
        curnostemexp = unaciscapital(term);

        return TextSplitP::takeword(term, pos, bs, be);
    }

    bool curnostemexp;
    vector<string> terms;
    vector<bool> nostemexps;
    const StopList &stops;
    // Count of terms including stopwords: this is for adjusting
    // phrase/near slack
    int alltermcount;
    int lastpos;
};

class TermProcQ : public TermProc {
public:
    TermProcQ() : TermProc(0), m_ts(0) {}
    void setTSQ(TextSplitQ *ts) {m_ts = ts;}

    bool takeword(const std::string &term, int pos, int bs, int be)
    {
        m_ts->alltermcount++;
        if (m_ts->lastpos < pos)
            m_ts->lastpos = pos;
        bool noexpand = be ? m_ts->curnostemexp : true;
        LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
                 term.c_str(), pos, noexpand));
        if (m_terms[pos].size() < term.size()) {
            m_terms[pos] = term;
            m_nste[pos] = noexpand;
        }
        return true;
    }
    bool flush()
    {
        for (map<int, string>::const_iterator it = m_terms.begin();
             it != m_terms.end(); it++) {
            m_ts->terms.push_back(it->second);
            m_ts->nostemexps.push_back(m_nste[it->first]);
        }
        return true;
    }
private:
    TextSplitQ *m_ts;
    map<int, string> m_terms;
    map<int, bool> m_nste;
};

#if 1
static void listVector(const string& what, const vector<string>& l)
{
    string a;
    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
        a = a + *it + " ";
    }
    LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
}
#endif

/** Expand term into term list, using appropriate mode: stem, wildcards,
 *  diacritics...
 *
 * @param mods stem expansion, case and diacritics sensitivity control.
 * @param term input single word
 * @param oexp output expansion list
 * @param sterm output original input term if there were no wildcards
 * @param prefix field prefix in index. We could recompute it, but the caller
 *   has it already. Used in the simple case where there is nothing to expand,
 *   and we just return the prefixed term (else Db::termMatch deals with it).
 */
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
                                        string& ermsg, int mods,
                                        const string& term,
                                        vector<string>& oexp, string &sterm,
                                        const string& prefix)
{
    LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
             mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
    sterm.clear();
    oexp.clear();
    if (term.empty())
        return true;

    bool maxexpissoft = false;
    int maxexpand = getSoftMaxExp();
    if (maxexpand != -1) {
        maxexpissoft = true;
    } else {
        maxexpand = getMaxExp();
    }

    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;

    // If there are no wildcards, add term to the list of user-entered terms
    if (!haswild) {
        m_hldata.uterms.insert(term);
        sterm = term;
    }
    // No stem expansion if there are wildcards or if prevented by caller
    bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
    if (haswild || getStemLang().empty()) {
        LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
        nostemexp = true;
    }

    // noexpansion can be modified further down by possible case/diac expansion
    bool noexpansion = nostemexp && !haswild;

    int termmatchsens = 0;

    bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
    bool case_sensitive = (mods & SDCM_CASESENS) != 0;

    if (o_index_stripchars) {
        diac_sensitive = case_sensitive = false;
    } else {
        // If we are working with a raw index, apply the rules for case and
        // diacritics sensitivity.

        // If any character has a diacritic, we become
        // diacritic-sensitive. Note that the way that the test is
        // performed (conversion+comparison) will automatically ignore
        // accented characters which are actually a separate letter
        if (getAutoDiac() && unachasaccents(term)) {
            LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
            diac_sensitive = true;
        }

        // If any character apart from the first is uppercase, we become
        // case-sensitive. The first character is reserved for
        // turning off stemming. You need to use a query language
        // modifier to search for Floor in a case-sensitive way.
        Utf8Iter it(term);
        it++;
        if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
            LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
            case_sensitive = true;
        }

        // If we are sensitive to case or diacritics turn stemming off
        if (diac_sensitive || case_sensitive) {
            LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
            nostemexp = true;
        }

        if (!case_sensitive || !diac_sensitive)
            noexpansion = false;
    }

    if (case_sensitive)
        termmatchsens |= Db::ET_CASESENS;
    if (diac_sensitive)
        termmatchsens |= Db::ET_DIACSENS;

    if (noexpansion) {
        oexp.push_back(prefix + term);
        m_hldata.terms[term] = term;
        LOGDEB(("ExpandTerm: noexpansion: final: %s\n",
                stringsToString(oexp).c_str()));
        return true;
    }

    Db::MatchType mtyp = haswild ? Db::ET_WILD :
        nostemexp ? Db::ET_NONE : Db::ET_STEM;
    TermMatchResult res;
    if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
                      m_field)) {
        // Let it go through
    }

    // Term match entries to vector of terms
    if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
        ermsg = "Maximum term expansion size exceeded."
            " Maybe use case/diacritics sensitivity or increase maxTermExpand.";
        return false;
    }
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
         it != res.entries.end(); it++) {
        oexp.push_back(it->term);
    }
    // If the term does not exist at all in the db, the return from
    // termMatch() is going to be empty, which is not what we want (we
    // would then compute an empty Xapian query)
    if (oexp.empty())
        oexp.push_back(prefix + term);

    // Remember the uterm-to-expansion links
    for (vector<string>::const_iterator it = oexp.begin();
         it != oexp.end(); it++) {
        m_hldata.terms[strip_prefix(*it)] = term;
    }
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return true;
}
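
// Editor's note: hedged usage sketch, not part of the original source. The
// expansion lists shown are assumptions; the real ones depend on the index
// contents and on the stemming language configured for the clause.
//
//     vector<string> oexp;
//     string sterm, err;
//     // Plain term, stemming allowed: sterm is set to "accent" and oexp
//     // might come back as {"accent", "accents", "accented"}.
//     expandTerm(db, err, 0, "accent", oexp, sterm, "");
//     // Wildcard term: sterm stays empty, stem expansion is disabled and
//     // oexp is filled from matching index terms (up to maxTermExpand).
//     expandTerm(db, err, 0, "acc*", oexp, sterm, "");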

// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
void multiply_groups(vector<vector<string> >::const_iterator vvit,
                     vector<vector<string> >::const_iterator vvend,
                     vector<string>& comb,
                     vector<vector<string> >& allcombs)
{
    // Remember my string vector and compute next, for recursive calls.
    vector<vector<string> >::const_iterator myvit = vvit++;

    // Walk the string vector I'm called upon and, for each string,
    // add it to the current result, and call myself recursively on the
    // next string vector. The last call (last element of the vector of
    // vectors) adds the elementary result to the output.

    // Walk my string vector
    for (vector<string>::const_iterator strit = (*myvit).begin();
         strit != (*myvit).end(); strit++) {

        // Add my current value to the string vector we're building
        comb.push_back(*strit);

        if (vvit == vvend) {
            // Last call: store current result
            allcombs.push_back(comb);
        } else {
            // Call recursively on next string vector
            multiply_groups(vvit, vvend, comb, allcombs);
        }
        // Pop the value I just added (make room for the next element in my
        // vector)
        comb.pop_back();
    }
}
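
// Editor's note (illustrative, not part of the original source): with
// groups = {{"a","b"}, {"c","d"}}, the call
//     vector<string> comb;
//     vector<vector<string> > allcombs;
//     multiply_groups(groups.begin(), groups.end(), comb, allcombs);
// leaves allcombs == {{"a","c"}, {"a","d"}, {"b","c"}, {"b","d"}},
// i.e. the distribution described in the comment above.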

void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
                                               const string& span,
                                               int mods, void *pq)
{
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
             span.c_str(), (unsigned int)mods));
    vector<string> exp;
    string sterm; // dumb version of user term

    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
        prefix = wrap_prefix(ftp->pfx);
    }

    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
        return;

    // Set up the highlight data. No prefix should go in there
    for (vector<string>::const_iterator it = exp.begin();
         it != exp.end(); it++) {
        m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
        m_hldata.slacks.push_back(0);
        m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
    }

    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
    m_curcl += exp.size();

    // If sterm (simplified original user term) is not null, give it a
    // relevance boost. We do this even if no expansion occurred (else
    // the non-expanded terms in a term list would end up with even
    // less wqf). This does not happen if there are wildcards anywhere
    // in the search.
    // We normally boost the original term in the stem expansion list. Don't
    // do it if there are wildcards anywhere, this would skew the results.
    bool doBoostUserTerm =
        (m_parentSearch && !m_parentSearch->haveWildCards()) ||
        (m_parentSearch == 0 && !m_haveWildCards);
    if (doBoostUserTerm && !sterm.empty()) {
        xq = Xapian::Query(Xapian::Query::OP_OR, xq,
                           Xapian::Query(prefix + sterm,
                                         original_term_wqf_booster));
    }
    pqueries.push_back(xq);
}

// User entry element had several terms: transform into a PHRASE or
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
                                                 TextSplitQ *splitData,
                                                 int mods, void *pq,
                                                 bool useNear, int slack)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
        Xapian::Query::OP_PHRASE;
    vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
    bool hadmultiple = false;
#endif
    vector<vector<string> > groups;

    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
        prefix = wrap_prefix(ftp->pfx);
    }

    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
        orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
        slack++;
    }

    // Go through the list and perform stem/wildcard expansion for each element
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
    for (vector<string>::iterator it = splitData->terms.begin();
         it != splitData->terms.end(); it++, nxit++) {
        LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
        // Adjust when we do stem expansion. Not if disabled by
        // caller, not inside phrases, and some versions of xapian
        // will accept only one OR clause inside NEAR.
        bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
            || hadmultiple
#endif // single OR inside NEAR
            ;
        int lmods = mods;
        if (nostemexp)
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
        string sterm;
        vector<string> exp;
        if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
            return;
        LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
        listVector("", exp);
        // groups is used for highlighting, we don't want prefixes in there.
        vector<string> noprefs;
        for (vector<string>::const_iterator it = exp.begin();
             it != exp.end(); it++) {
            noprefs.push_back(it->substr(prefix.size()));
        }
        groups.push_back(noprefs);
        orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
                                          exp.begin(), exp.end()));
        m_curcl += exp.size();
        if (m_curcl >= getMaxCl())
            return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
        if (exp.size() > 1)
            hadmultiple = true;
#endif
    }

    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
        orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
        slack++;
    }

    // Generate an appropriate PHRASE/NEAR query with adjusted slack
    // For phrases, give a relevance boost like we do for original terms
    LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
             splitData->alltermcount, splitData->lastpos));
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
                     splitData->lastpos + 1 + slack);
    if (op == Xapian::Query::OP_PHRASE)
        xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
                           original_term_wqf_booster);
    pqueries.push_back(xq);

    // Add all combinations of NEAR/PHRASE groups to the highlighting data.
    vector<vector<string> > allcombs;
    vector<string> comb;
    multiply_groups(groups.begin(), groups.end(), comb, allcombs);

    // Insert the search groups and slacks in the highlight data, with
    // a reference to the user entry that generated them:
    m_hldata.groups.insert(m_hldata.groups.end(),
                           allcombs.begin(), allcombs.end());
    m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
    m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
                              m_hldata.ugroups.size() - 1);
}

// Trim string beginning with ^ or ending with $ and convert to flags
static int stringToMods(string& s)
{
    int mods = 0;
    // Check for an anchored search
    trimstring(s);
    if (s.length() > 0 && s[0] == '^') {
        mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
        s.erase(0, 1);
    }
    if (s.length() > 0 && s[s.length()-1] == '$') {
        mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
        s.erase(s.length()-1);
    }
    return mods;
}

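
// Editor's note (illustrative, not part of the original source):
//     string s("^floor$");
//     int mods = stringToMods(s);
//     // mods == SDCM_ANCHORSTART | SDCM_ANCHOREND, and s is now "floor".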
/**
 * Turn user entry string (NOT query language) into a list of xapian queries.
 * We just separate words and phrases, and do wildcard and stem expansion.
 *
 * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
 * the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
 * entry).
 *
 * This appears awful, and it would seem that the split into
 * terms/phrases should be performed in the upper layer so that we
 * only receive pure term or near/phrase pure elements here, but in
 * fact there are things that would appear like terms to naive code,
 * and which may actually be turned into phrases (ie: tom:jerry),
 * in a manner which intimately depends on the index implementation,
 * so that it makes sense to process this here.
 *
 * The final list contains one query for each term or phrase
 *   - Elements corresponding to a stem-expanded part are an OP_OR
 *     composition of the stem-expanded terms (or a single term query).
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
 *     composition of the phrase terms (no stem expansion in this case)
 * @return the subquery count (either or'd stem-expanded terms or phrase word
 *   count)
 */
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
                                               string &ermsg, void *pq,
                                               int slack, bool useNear)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    int mods = m_modifiers;

    LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
            "slack %d near %d\n",
            iq.c_str(), m_field.c_str(), mods, slack, useNear));
    ermsg.erase();
    m_curcl = 0;
    const StopList stops = db.getStopList();

    // Simple whitespace-split input into user-level words and
    // double-quoted phrases: word1 word2 "this is a phrase".
    //
    // The text splitter may further still decide that the resulting
    // "words" are really phrases, this depends on separators:
    // [paul@dom.net] would still be a word (span), but [about:me]
    // will probably be handled as a phrase.
    vector<string> phrases;
    TextSplit::stringToStrings(iq, phrases);

    // Process each element: textsplit into terms, handle stem/wildcard
    // expansion and transform into an appropriate Xapian::Query
    try {
        for (vector<string>::iterator it = phrases.begin();
             it != phrases.end(); it++) {
            LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
            // Anchoring modifiers
            int amods = stringToMods(*it);
            int terminc = amods != 0 ? 1 : 0;
            mods |= amods;
            // If there are multiple spans in this element, including
            // at least one composite, we have to increase the slack
            // else a phrase query including a span would fail.
            // Ex: "term0@term1 term2" is onlyspans-split as:
            //   0 term0@term1   0 12
            //   2 term2        13 18
            // The position of term2 is 2, not 1, so a phrase search
            // would fail.
            // We used to do word split, searching for
            // "term0 term1 term2" instead, which may have worse
            // performance, but will succeed.
            // We now adjust the phrase/near slack by comparing the term count
            // and the last position

            // The term processing pipeline:
            TermProcQ tpq;
            TermProc *nxt = &tpq;
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
            //tpcommon.onlygrams(true);
            TermProcPrep tpprep(nxt);
            if (o_index_stripchars)
                nxt = &tpprep;

            TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
                                                 TextSplit::TXTS_KEEPWILD),
                                stops, nxt);
            tpq.setTSQ(&splitter);
            splitter.text_to_words(*it);

            slack += splitter.lastpos - splitter.terms.size() + 1;

            LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
            switch (splitter.terms.size() + terminc) {
            case 0:
                continue; // ??
            case 1: {
                int lmods = mods;
                if (splitter.nostemexps.front())
                    lmods |= SearchDataClause::SDCM_NOSTEMMING;
                m_hldata.ugroups.push_back(splitter.terms);
                processSimpleSpan(db, ermsg, splitter.terms.front(),
                                  lmods, &pqueries);
            }
                break;
            default:
                m_hldata.ugroups.push_back(splitter.terms);
                processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
                                    useNear, slack);
            }
            if (m_curcl >= getMaxCl()) {
                ermsg = maxXapClauseMsg;
                if (!o_index_stripchars)
                    ermsg += maxXapClauseCaseDiacMsg;
                break;
            }
        }
    } catch (const Xapian::Error &e) {
        ermsg = e.get_msg();
    } catch (const string &s) {
        ermsg = s;
    } catch (const char *s) {
        ermsg = s;
    } catch (...) {
        ermsg = "Caught unknown exception";
    }
    if (!ermsg.empty()) {
        LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
        return false;
    }
    return true;
}

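
// Editor's note (illustrative, not part of the original source): worked
// example of the slack adjustment in processUserString() above. For the
// entry [term0@term1 term2], the ONLYSPANS split yields two terms at
// positions 0 and 2 (lastpos == 2, terms.size() == 2), so slack grows by
// 2 - 2 + 1 == 1, which lets the PHRASE/NEAR query tolerate the position
// gap left by the composite span.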
// Translate a simple OR or AND search clause.
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
             getStemLang().c_str()));

    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    Xapian::Query::op op;
    switch (m_tp) {
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
    case SCLT_OR: op = Xapian::Query::OP_OR; break;
    default:
        LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
        return false;
    }

    vector<Xapian::Query> pqueries;
    if (!processUserString(db, m_text, m_reason, &pqueries))
        return false;
    if (pqueries.empty()) {
        LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
        return true;
    }

    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}

// Translate a FILENAME search clause. This always comes from a
// "filename" search from the GUI or recollq. A query language
// "filename:"-prefixed field will not go through here, but through
// the generic field-processing code.
//
// We do not split the entry any more (we used to do some crazy thing
// about expanding multiple fragments in the past). We just take the
// value, blanks and all, and expand it against the indexed unsplit
// file names.
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
{
    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    int maxexp = getSoftMaxExp();
    if (maxexp == -1)
        maxexp = getMaxExp();

    vector<string> names;
    db.filenameWildExp(m_text, names, maxexp);
    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());

    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}

// Translate a dir: path filtering clause. See comments in .h
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    if (m_text.empty()) {
        LOGERR(("SearchDataClausePath: empty path??\n"));
        m_reason = "Empty path ?";
        return false;
    }

    vector<Xapian::Query> orqueries;

    if (m_text[0] == '/')
        orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
    else
        m_text = path_tildexpand(m_text);

    vector<string> vpath;
    stringToTokens(m_text, vpath, "/");

    for (vector<string>::const_iterator pit = vpath.begin();
         pit != vpath.end(); pit++) {

        string sterm;
        vector<string> exp;
        if (!expandTerm(db, m_reason,
                        SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
                        *pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
            return false;
        }
        LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
        listVector("", exp);
        if (exp.size() == 1)
            orqueries.push_back(Xapian::Query(exp[0]));
        else
            orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
                                              exp.begin(), exp.end()));
        m_curcl += exp.size();
        if (m_curcl >= getMaxCl())
            return false;
    }

    *qp = Xapian::Query(Xapian::Query::OP_PHRASE,
                        orqueries.begin(), orqueries.end());

    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}

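
// Editor's note (illustrative, not part of the original source): for a
// clause built from dir:/home/me/docs, m_text is split on "/" into
// {"home", "me", "docs"}; each element is expanded with the pathelt
// prefix and the resulting per-element OR queries are chained with
// OP_PHRASE, so only documents whose path contains these elements in
// this order can match.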
// Translate NEAR or PHRASE clause.
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
{
    LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));

    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    vector<Xapian::Query> pqueries;
    Xapian::Query nq;

    // We produce a single phrase out of the user entry then use
    // stringToXapianQueries() to lowercase and simplify the phrase
    // terms etc. This will result into a single (complex)
    // Xapian::Query.
    if (m_text.find('\"') != string::npos) {
        m_text = neutchars(m_text, "\"");
    }
    string s = cstr_dquote + m_text + cstr_dquote;
    bool useNear = (m_tp == SCLT_NEAR);
    if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
        return false;
    if (pqueries.empty()) {
        LOGERR(("SearchDataClauseDist: resolved to null query\n"));
        return true;
    }

    *qp = *pqueries.begin();
    if (m_weight != 1.0) {
        *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
    }
    return true;
}

} // Namespace Rcl