|
a |
|
b/src/rcldb/searchdatatox.cpp |
|
|
1 |
/* Copyright (C) 2006 J.F.Dockes
|
|
|
2 |
* This program is free software; you can redistribute it and/or modify
|
|
|
3 |
* it under the terms of the GNU General Public License as published by
|
|
|
4 |
* the Free Software Foundation; either version 2 of the License, or
|
|
|
5 |
* (at your option) any later version.
|
|
|
6 |
*
|
|
|
7 |
* This program is distributed in the hope that it will be useful,
|
|
|
8 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
9 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
10 |
* GNU General Public License for more details.
|
|
|
11 |
*
|
|
|
12 |
* You should have received a copy of the GNU General Public License
|
|
|
13 |
* along with this program; if not, write to the
|
|
|
14 |
* Free Software Foundation, Inc.,
|
|
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
|
16 |
*/
|
|
|
17 |
|
|
|
18 |
// Handle translation from rcl's SearchData structures to Xapian Queries
|
|
|
19 |
|
|
|
20 |
#include "autoconfig.h"
|
|
|
21 |
|
|
|
22 |
#include <stdio.h>
|
|
|
23 |
|
|
|
24 |
#include <string>
|
|
|
25 |
#include <vector>
|
|
|
26 |
#include <algorithm>
|
|
|
27 |
#include <sstream>
|
|
|
28 |
using namespace std;
|
|
|
29 |
|
|
|
30 |
#include "xapian.h"
|
|
|
31 |
|
|
|
32 |
#include "cstr.h"
|
|
|
33 |
#include "rcldb.h"
|
|
|
34 |
#include "rcldb_p.h"
|
|
|
35 |
#include "searchdata.h"
|
|
|
36 |
#include "debuglog.h"
|
|
|
37 |
#include "smallut.h"
|
|
|
38 |
#include "textsplit.h"
|
|
|
39 |
#include "unacpp.h"
|
|
|
40 |
#include "utf8iter.h"
|
|
|
41 |
#include "stoplist.h"
|
|
|
42 |
#include "rclconfig.h"
|
|
|
43 |
#include "termproc.h"
|
|
|
44 |
#include "synfamily.h"
|
|
|
45 |
#include "stemdb.h"
|
|
|
46 |
#include "expansiondbs.h"
|
|
|
47 |
#include "base64.h"
|
|
|
48 |
#include "daterange.h"
|
|
|
49 |
|
|
|
50 |
namespace Rcl {
|
|
|
51 |
|
|
|
52 |
// Factor used in this file both as the wqf of the original user term and as
// the OP_SCALE_WEIGHT factor applied to phrase queries.
static const int original_term_wqf_booster = 10;

// Iterator over a list of search clause pointers.
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
|
|
55 |
|
|
|
56 |
// Expand categories and mime type wild card exps Categories are
|
|
|
57 |
// expanded against the configuration, mimetypes against the index
|
|
|
58 |
// (for wildcards).
|
|
|
59 |
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
|
|
|
60 |
{
|
|
|
61 |
const RclConfig *cfg = db.getConf();
|
|
|
62 |
if (!cfg) {
|
|
|
63 |
LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
|
|
|
64 |
return false;
|
|
|
65 |
}
|
|
|
66 |
vector<string> exptps;
|
|
|
67 |
|
|
|
68 |
for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
|
|
|
69 |
if (cfg->isMimeCategory(*it)) {
|
|
|
70 |
vector<string>tps;
|
|
|
71 |
cfg->getMimeCatTypes(*it, tps);
|
|
|
72 |
exptps.insert(exptps.end(), tps.begin(), tps.end());
|
|
|
73 |
} else {
|
|
|
74 |
TermMatchResult res;
|
|
|
75 |
string mt = stringtolower((const string&)*it);
|
|
|
76 |
// We set casesens|diacsens to get an equivalent of ixTermMatch()
|
|
|
77 |
db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
|
|
|
78 |
mt, res, -1, "mtype");
|
|
|
79 |
if (res.entries.empty()) {
|
|
|
80 |
exptps.push_back(it->c_str());
|
|
|
81 |
} else {
|
|
|
82 |
for (vector<TermMatchEntry>::const_iterator rit =
|
|
|
83 |
res.entries.begin(); rit != res.entries.end(); rit++) {
|
|
|
84 |
exptps.push_back(strip_prefix(rit->term));
|
|
|
85 |
}
|
|
|
86 |
}
|
|
|
87 |
}
|
|
|
88 |
}
|
|
|
89 |
sort(exptps.begin(), exptps.end());
|
|
|
90 |
exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());
|
|
|
91 |
|
|
|
92 |
tps = exptps;
|
|
|
93 |
return true;
|
|
|
94 |
}
|
|
|
95 |
|
|
|
96 |
// Error text used when the built query reaches the clause limit.
static const char *maxXapClauseMsg =
    "Maximum Xapian query size exceeded. "
    "Increase maxXapianClauses in the configuration. ";
// Additional hint appended to the above when the index is not stripped of
// case and diacritics (o_index_stripchars is false).
static const char *maxXapClauseCaseDiacMsg =
    "Or try to use case (C) or diacritics (D) sensitivity qualifiers, "
    "or less wildcards ?";
|
|
|
103 |
|
|
|
104 |
// Translate a list of clauses into a single Xapian query, stored in *d
// (which must point to a Xapian::Query).
//
// @param db index to translate against
// @param tp SCLT_AND to AND the clauses together (AND_NOT for excluding
//    ones), anything else to OR them
// @param query the clauses
// @param reason receives the failing clause's reason when a clause
//    translation fails. Note that the "query too big" message goes to
//    m_reason instead.
// @return false if a clause could not be translated or if the query size
//    reaches getMaxCl(). *d is only written on success.
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
                                vector<SearchDataClause*>& query,
                                string& reason, void *d)
{
    Xapian::Query combined;
    for (qlist_it_t clit = query.begin(); clit != query.end(); ++clit) {
        SearchDataClause *clause = *clit;

        Xapian::Query nq;
        if (!clause->toNativeQuery(db, &nq)) {
            LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
                    clause->getReason().c_str()));
            reason += clause->getReason() + " ";
            return false;
        }
        if (nq.empty()) {
            LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
            continue;
        }

        // If this structure is an AND list, must use AND_NOT for excl clauses.
        // Else this is an OR list, and there can't be excl clauses (checked by
        // addClause())
        Xapian::Query::op op = Xapian::Query::OP_OR;
        if (tp == SCLT_AND) {
            op = clause->getexclude() ? Xapian::Query::OP_AND_NOT :
                Xapian::Query::OP_AND;
        }

        if (!combined.empty()) {
            combined = Xapian::Query(op, combined, nq);
        } else if (op == Xapian::Query::OP_AND_NOT) {
            // Nothing yet to exclude from: exclude from everything
            combined = Xapian::Query(op, Xapian::Query::MatchAll, nq);
        } else {
            combined = nq;
        }

        if (int(combined.get_length()) >= getMaxCl()) {
            LOGERR(("%s\n", maxXapClauseMsg));
            m_reason += maxXapClauseMsg;
            if (!o_index_stripchars)
                m_reason += maxXapClauseCaseDiacMsg;
            return false;
        }
    }

    LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", combined.get_length()));

    // No usable clause at all: match everything
    if (combined.empty())
        combined = Xapian::Query::MatchAll;

    *((Xapian::Query *)d) = combined;
    return true;
}
|
|
|
159 |
|
|
|
160 |
// Translate the whole search into a Xapian query, stored in *d (which must
// point to a Xapian::Query).
//
// The clause list is translated first, then the optional parts are applied
// in order: date range filter, document size filter, autophrase, file type
// filter and negative file type filter.
//
// @return false, with m_reason set, if the db has no configuration or if
//   the clause list translation fails. *d is only written on success.
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
    LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
    m_reason.erase();

    // expandFileTypes() treats a missing configuration as an error: do the
    // same here rather than dereferencing a null pointer.
    if (db.getConf() == 0) {
        LOGFATAL(("SearchData::toNativeQuery: null configuration!!\n"));
        m_reason = "Null configuration";
        return false;
    }
    db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
    db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);

    // Walk the clause list translating each in turn and building the
    // Xapian query tree
    Xapian::Query xq;
    if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
        LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
                m_reason.c_str()));
        return false;
    }

    if (m_haveDates) {
        // If one of the extremities is unset, compute db extremas
        if (m_dates.y1 == 0 || m_dates.y2 == 0) {
            int minyear = 1970, maxyear = 2100;
            if (!db.maxYearSpan(&minyear, &maxyear)) {
                LOGERR(("Can't retrieve index min/max dates\n"));
                //whatever, go on.
            }

            if (m_dates.y1 == 0) {
                m_dates.y1 = minyear;
                m_dates.m1 = 1;
                m_dates.d1 = 1;
            }
            if (m_dates.y2 == 0) {
                m_dates.y2 = maxyear;
                m_dates.m2 = 12;
                m_dates.d2 = 31;
            }
        }
        LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
                m_dates.y1, m_dates.m1, m_dates.d1,
                m_dates.y2, m_dates.m2, m_dates.d2));
        Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
                                             m_dates.y2, m_dates.m2, m_dates.d2);
        if (dq.empty()) {
            LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
        }
        // If no probabilistic query is provided then promote the daterange
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
            xq = dq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
        }
    }

    // Document size filter. size_t(-1) means "limit not set".
    const bool haveMinSize = m_minSize != size_t(-1);
    const bool haveMaxSize = m_maxSize != size_t(-1);
    if (haveMinSize || haveMaxSize) {
        // Sizes are compared as strings: left-pad with zeroes to 12 chars
        char buf[50];
        string minvalue, maxvalue;
        if (haveMinSize) {
            snprintf(buf, sizeof(buf), "%lld", (long long)m_minSize);
            minvalue = buf;
            leftzeropad(minvalue, 12);
        }
        if (haveMaxSize) {
            snprintf(buf, sizeof(buf), "%lld", (long long)m_maxSize);
            maxvalue = buf;
            leftzeropad(maxvalue, 12);
        }

        Xapian::Query sq;
        if (!haveMinSize) {
            sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE,
                               maxvalue);
        } else if (!haveMaxSize) {
            sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE,
                               minvalue);
        } else {
            sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE,
                               minvalue, maxvalue);
        }

        // If no probabilistic query is provided then promote the
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
            xq = sq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
        }
    }

    // Add the autophrase if any
    if (m_autophrase.isNotNull()) {
        Xapian::Query apq;
        if (m_autophrase->toNativeQuery(db, &apq)) {
            xq = xq.empty() ? apq :
                Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
        }
    }

    // Add the file type filtering clause if any
    if (!m_filetypes.empty()) {
        expandFileTypes(db, m_filetypes);

        Xapian::Query tq;
        for (vector<string>::iterator it = m_filetypes.begin();
             it != m_filetypes.end(); ++it) {
            string term = wrap_prefix(mimetype_prefix) + *it;
            LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) :
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
    }

    // Add the neg file type filtering clause if any
    if (!m_nfiletypes.empty()) {
        expandFileTypes(db, m_nfiletypes);

        Xapian::Query tq;
        for (vector<string>::iterator it = m_nfiletypes.begin();
             it != m_nfiletypes.end(); ++it) {
            string term = wrap_prefix(mimetype_prefix) + *it;
            LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) :
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
    }

    *((Xapian::Query *)d) = xq;
    return true;
}
|
|
|
290 |
|
|
|
291 |
// Splitter callback for breaking a user string into simple terms and
|
|
|
292 |
// phrases. This is for parts of the user entry which would appear as
|
|
|
293 |
// a single word because there is no white space inside, but are
|
|
|
294 |
// actually multiple terms to rcldb (ie term1,term2)
|
|
|
295 |
class TextSplitQ : public TextSplitP {
|
|
|
296 |
public:
|
|
|
297 |
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
|
|
|
298 |
: TextSplitP(prc, flags),
|
|
|
299 |
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
|
|
|
300 |
{}
|
|
|
301 |
|
|
|
302 |
bool takeword(const std::string &term, int pos, int bs, int be)
|
|
|
303 |
{
|
|
|
304 |
// Check if the first letter is a majuscule in which
|
|
|
305 |
// case we do not want to do stem expansion. Need to do this
|
|
|
306 |
// before unac of course...
|
|
|
307 |
curnostemexp = unaciscapital(term);
|
|
|
308 |
|
|
|
309 |
return TextSplitP::takeword(term, pos, bs, be);
|
|
|
310 |
}
|
|
|
311 |
|
|
|
312 |
bool curnostemexp;
|
|
|
313 |
vector<string> terms;
|
|
|
314 |
vector<bool> nostemexps;
|
|
|
315 |
const StopList &stops;
|
|
|
316 |
// Count of terms including stopwords: this is for adjusting
|
|
|
317 |
// phrase/near slack
|
|
|
318 |
int alltermcount;
|
|
|
319 |
int lastpos;
|
|
|
320 |
};
|
|
|
321 |
|
|
|
322 |
class TermProcQ : public TermProc {
|
|
|
323 |
public:
|
|
|
324 |
TermProcQ() : TermProc(0), m_ts(0) {}
|
|
|
325 |
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
|
|
|
326 |
|
|
|
327 |
bool takeword(const std::string &term, int pos, int bs, int be)
|
|
|
328 |
{
|
|
|
329 |
m_ts->alltermcount++;
|
|
|
330 |
if (m_ts->lastpos < pos)
|
|
|
331 |
m_ts->lastpos = pos;
|
|
|
332 |
bool noexpand = be ? m_ts->curnostemexp : true;
|
|
|
333 |
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
|
|
|
334 |
term.c_str(), pos, noexpand));
|
|
|
335 |
if (m_terms[pos].size() < term.size()) {
|
|
|
336 |
m_terms[pos] = term;
|
|
|
337 |
m_nste[pos] = noexpand;
|
|
|
338 |
}
|
|
|
339 |
return true;
|
|
|
340 |
}
|
|
|
341 |
bool flush()
|
|
|
342 |
{
|
|
|
343 |
for (map<int, string>::const_iterator it = m_terms.begin();
|
|
|
344 |
it != m_terms.end(); it++) {
|
|
|
345 |
m_ts->terms.push_back(it->second);
|
|
|
346 |
m_ts->nostemexps.push_back(m_nste[it->first]);
|
|
|
347 |
}
|
|
|
348 |
return true;
|
|
|
349 |
}
|
|
|
350 |
private:
|
|
|
351 |
TextSplitQ *m_ts;
|
|
|
352 |
map<int, string> m_terms;
|
|
|
353 |
map<int, bool> m_nste;
|
|
|
354 |
};
|
|
|
355 |
|
|
|
356 |
|
|
|
357 |
#if 1
|
|
|
358 |
static void listVector(const string& what, const vector<string>&l)
|
|
|
359 |
{
|
|
|
360 |
string a;
|
|
|
361 |
for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
|
|
362 |
a = a + *it + " ";
|
|
|
363 |
}
|
|
|
364 |
LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
|
|
|
365 |
}
|
|
|
366 |
#endif
|
|
|
367 |
|
|
|
368 |
/** Expand term into term list, using appropriate mode: stem, wildcards,
|
|
|
369 |
* diacritics...
|
|
|
370 |
*
|
|
|
371 |
* @param mods stem expansion, case and diacritics sensitivity control.
|
|
|
372 |
* @param term input single word
|
|
|
373 |
* @param oexp output expansion list
|
|
|
374 |
* @param sterm output original input term if there were no wildcards
|
|
|
375 |
* @param prefix field prefix in index. We could recompute it, but the caller
|
|
|
376 |
* has it already. Used in the simple case where there is nothing to expand,
|
|
|
377 |
* and we just return the prefixed term (else Db::termMatch deals with it).
|
|
|
378 |
*/
|
|
|
379 |
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
|
|
380 |
string& ermsg, int mods,
|
|
|
381 |
const string& term,
|
|
|
382 |
vector<string>& oexp, string &sterm,
|
|
|
383 |
const string& prefix)
|
|
|
384 |
{
|
|
|
385 |
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
|
|
|
386 |
mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
|
|
|
387 |
sterm.clear();
|
|
|
388 |
oexp.clear();
|
|
|
389 |
if (term.empty())
|
|
|
390 |
return true;
|
|
|
391 |
|
|
|
392 |
bool maxexpissoft = false;
|
|
|
393 |
int maxexpand = getSoftMaxExp();
|
|
|
394 |
if (maxexpand != -1) {
|
|
|
395 |
maxexpissoft = true;
|
|
|
396 |
} else {
|
|
|
397 |
maxexpand = getMaxExp();
|
|
|
398 |
}
|
|
|
399 |
|
|
|
400 |
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
|
|
401 |
|
|
|
402 |
// If there are no wildcards, add term to the list of user-entered terms
|
|
|
403 |
if (!haswild) {
|
|
|
404 |
m_hldata.uterms.insert(term);
|
|
|
405 |
sterm = term;
|
|
|
406 |
}
|
|
|
407 |
// No stem expansion if there are wildcards or if prevented by caller
|
|
|
408 |
bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
|
|
|
409 |
if (haswild || getStemLang().empty()) {
|
|
|
410 |
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
|
|
411 |
nostemexp = true;
|
|
|
412 |
}
|
|
|
413 |
|
|
|
414 |
// noexpansion can be modified further down by possible case/diac expansion
|
|
|
415 |
bool noexpansion = nostemexp && !haswild;
|
|
|
416 |
|
|
|
417 |
int termmatchsens = 0;
|
|
|
418 |
|
|
|
419 |
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
|
|
|
420 |
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
|
|
|
421 |
|
|
|
422 |
if (o_index_stripchars) {
|
|
|
423 |
diac_sensitive = case_sensitive = false;
|
|
|
424 |
} else {
|
|
|
425 |
// If we are working with a raw index, apply the rules for case and
|
|
|
426 |
// diacritics sensitivity.
|
|
|
427 |
|
|
|
428 |
// If any character has a diacritic, we become
|
|
|
429 |
// diacritic-sensitive. Note that the way that the test is
|
|
|
430 |
// performed (conversion+comparison) will automatically ignore
|
|
|
431 |
// accented characters which are actually a separate letter
|
|
|
432 |
if (getAutoDiac() && unachasaccents(term)) {
|
|
|
433 |
LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
|
|
|
434 |
diac_sensitive = true;
|
|
|
435 |
}
|
|
|
436 |
|
|
|
437 |
// If any character apart the first is uppercase, we become
|
|
|
438 |
// case-sensitive. The first character is reserved for
|
|
|
439 |
// turning off stemming. You need to use a query language
|
|
|
440 |
// modifier to search for Floor in a case-sensitive way.
|
|
|
441 |
Utf8Iter it(term);
|
|
|
442 |
it++;
|
|
|
443 |
if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
|
|
|
444 |
LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
|
|
|
445 |
case_sensitive = true;
|
|
|
446 |
}
|
|
|
447 |
|
|
|
448 |
// If we are sensitive to case or diacritics turn stemming off
|
|
|
449 |
if (diac_sensitive || case_sensitive) {
|
|
|
450 |
LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
|
|
|
451 |
nostemexp = true;
|
|
|
452 |
}
|
|
|
453 |
|
|
|
454 |
if (!case_sensitive || !diac_sensitive)
|
|
|
455 |
noexpansion = false;
|
|
|
456 |
}
|
|
|
457 |
|
|
|
458 |
if (case_sensitive)
|
|
|
459 |
termmatchsens |= Db::ET_CASESENS;
|
|
|
460 |
if (diac_sensitive)
|
|
|
461 |
termmatchsens |= Db::ET_DIACSENS;
|
|
|
462 |
|
|
|
463 |
if (noexpansion) {
|
|
|
464 |
oexp.push_back(prefix + term);
|
|
|
465 |
m_hldata.terms[term] = term;
|
|
|
466 |
LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
|
|
|
467 |
return true;
|
|
|
468 |
}
|
|
|
469 |
|
|
|
470 |
Db::MatchType mtyp = haswild ? Db::ET_WILD :
|
|
|
471 |
nostemexp ? Db::ET_NONE : Db::ET_STEM;
|
|
|
472 |
TermMatchResult res;
|
|
|
473 |
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
|
|
|
474 |
m_field)) {
|
|
|
475 |
// Let it go through
|
|
|
476 |
}
|
|
|
477 |
|
|
|
478 |
// Term match entries to vector of terms
|
|
|
479 |
if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
|
|
|
480 |
ermsg = "Maximum term expansion size exceeded."
|
|
|
481 |
" Maybe use case/diacritics sensitivity or increase maxTermExpand.";
|
|
|
482 |
return false;
|
|
|
483 |
}
|
|
|
484 |
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
|
|
485 |
it != res.entries.end(); it++) {
|
|
|
486 |
oexp.push_back(it->term);
|
|
|
487 |
}
|
|
|
488 |
// If the term does not exist at all in the db, the return from
|
|
|
489 |
// termMatch() is going to be empty, which is not what we want (we
|
|
|
490 |
// would then compute an empty Xapian query)
|
|
|
491 |
if (oexp.empty())
|
|
|
492 |
oexp.push_back(prefix + term);
|
|
|
493 |
|
|
|
494 |
// Remember the uterm-to-expansion links
|
|
|
495 |
for (vector<string>::const_iterator it = oexp.begin();
|
|
|
496 |
it != oexp.end(); it++) {
|
|
|
497 |
m_hldata.terms[strip_prefix(*it)] = term;
|
|
|
498 |
}
|
|
|
499 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
|
|
500 |
return true;
|
|
|
501 |
}
|
|
|
502 |
|
|
|
503 |
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
//
// @param vvit first string vector to process
// @param vvend end of the string vector list
// @param comb combination being built. Used as a stack by the recursive
//   calls, and back to its input state on return.
// @param allcombs output: the complete combinations are appended to it.
//
// Nothing is output if the list is empty or if any of the string vectors
// is empty.
void multiply_groups(std::vector<std::vector<std::string> >::const_iterator vvit,
                     std::vector<std::vector<std::string> >::const_iterator vvend,
                     std::vector<std::string>& comb,
                     std::vector<std::vector<std::string> >& allcombs)
{
    // Empty list: there is no vector to walk, and vvit can't be dereferenced
    if (vvit == vvend)
        return;

    // Remember my string vector and compute next, for recursive calls.
    const std::vector<std::string>& mine = *vvit;
    ++vvit;
    const bool islast = (vvit == vvend);

    // Walk my string vector and, for each string, add it to the current
    // result, and call myself recursively on the next string vector. The
    // last call (last element of the vector of vectors) adds the
    // elementary result to the output.
    for (std::vector<std::string>::const_iterator strit = mine.begin();
         strit != mine.end(); ++strit) {
        // Add my current value to the string vector we're building
        comb.push_back(*strit);

        if (islast) {
            // Last call: store current result
            allcombs.push_back(comb);
        } else {
            // Call recursively on next string vector
            multiply_groups(vvit, vvend, comb, allcombs);
        }

        // Pop the value I just added (make room for the next element in my
        // vector)
        comb.pop_back();
    }
}
|
|
|
536 |
|
|
|
537 |
// Translate a single span (no phrase involved) into a Xapian query: an OR
// of the span expansions, which is appended to the query list.
//
// @param db index to expand against
// @param ermsg error message output, set by expandTerm() on failure, in
//   which case nothing is appended.
// @param span the user term
// @param mods modifiers, passed on to expandTerm()
// @param pq actually a vector<Xapian::Query>*: the output list
//
// Also adds one highlight group per expansion, all linked to the last
// element of m_hldata.ugroups.
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
                                               const string& span,
                                               int mods, void * pq)
{
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
             span.c_str(), (unsigned int)mods));

    // Index prefix for the field, if any
    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
        prefix = wrap_prefix(ftp->pfx);
    }

    vector<string> expansions;
    string sterm; // dumb version of user term
    if (!expandTerm(db, ermsg, mods, span, expansions, sterm, prefix))
        return;

    // Set up the highlight data. No prefix should go in there
    for (vector<string>::const_iterator xit = expansions.begin();
         xit != expansions.end(); ++xit) {
        m_hldata.groups.push_back(
            vector<string>(1, xit->substr(prefix.size())));
        m_hldata.slacks.push_back(0);
        m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
    }

    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR,
                     expansions.begin(), expansions.end());
    m_curcl += expansions.size();

    // If sterm (simplified original user term) is not null, give it a
    // relevance boost. We do this even if no expansion occurred (else
    // the non-expanded terms in a term list would end-up with even
    // less wqf).
    // Don't do it if there are wildcards anywhere in the search, this
    // would skew the results. The parent search knows about this if we
    // have one, else we only have our own flag.
    const bool wildsAnywhere = m_parentSearch ?
        m_parentSearch->haveWildCards() : m_haveWildCards;
    if (!wildsAnywhere && !sterm.empty()) {
        xq = Xapian::Query(Xapian::Query::OP_OR, xq,
                           Xapian::Query(prefix+sterm,
                                         original_term_wqf_booster));
    }
    pqueries.push_back(xq);
}
|
|
|
585 |
|
|
|
586 |
// User entry element had several terms: transform into a PHRASE or
|
|
|
587 |
// NEAR xapian query, the elements of which can themselves be OR
|
|
|
588 |
// queries if the terms get expanded by stemming or wildcards (we
|
|
|
589 |
// don't do stemming for PHRASE though)
|
|
|
590 |
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
|
|
591 |
TextSplitQ *splitData,
|
|
|
592 |
int mods, void *pq,
|
|
|
593 |
bool useNear, int slack)
|
|
|
594 |
{
|
|
|
595 |
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
|
|
|
596 |
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
|
|
597 |
Xapian::Query::OP_PHRASE;
|
|
|
598 |
vector<Xapian::Query> orqueries;
|
|
|
599 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
|
|
600 |
bool hadmultiple = false;
|
|
|
601 |
#endif
|
|
|
602 |
vector<vector<string> >groups;
|
|
|
603 |
|
|
|
604 |
string prefix;
|
|
|
605 |
const FieldTraits *ftp;
|
|
|
606 |
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
|
|
|
607 |
prefix = wrap_prefix(ftp->pfx);
|
|
|
608 |
}
|
|
|
609 |
|
|
|
610 |
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
|
|
|
611 |
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
|
|
|
612 |
slack++;
|
|
|
613 |
}
|
|
|
614 |
|
|
|
615 |
// Go through the list and perform stem/wildcard expansion for each element
|
|
|
616 |
vector<bool>::iterator nxit = splitData->nostemexps.begin();
|
|
|
617 |
for (vector<string>::iterator it = splitData->terms.begin();
|
|
|
618 |
it != splitData->terms.end(); it++, nxit++) {
|
|
|
619 |
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
|
|
|
620 |
// Adjust when we do stem expansion. Not if disabled by
|
|
|
621 |
// caller, not inside phrases, and some versions of xapian
|
|
|
622 |
// will accept only one OR clause inside NEAR.
|
|
|
623 |
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
|
|
|
624 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
|
|
625 |
|| hadmultiple
|
|
|
626 |
#endif // single OR inside NEAR
|
|
|
627 |
;
|
|
|
628 |
int lmods = mods;
|
|
|
629 |
if (nostemexp)
|
|
|
630 |
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
|
|
631 |
string sterm;
|
|
|
632 |
vector<string> exp;
|
|
|
633 |
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
|
|
|
634 |
return;
|
|
|
635 |
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
|
|
|
636 |
listVector("", exp);
|
|
|
637 |
// groups is used for highlighting, we don't want prefixes in there.
|
|
|
638 |
vector<string> noprefs;
|
|
|
639 |
for (vector<string>::const_iterator it = exp.begin();
|
|
|
640 |
it != exp.end(); it++) {
|
|
|
641 |
noprefs.push_back(it->substr(prefix.size()));
|
|
|
642 |
}
|
|
|
643 |
groups.push_back(noprefs);
|
|
|
644 |
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
|
|
645 |
exp.begin(), exp.end()));
|
|
|
646 |
m_curcl += exp.size();
|
|
|
647 |
if (m_curcl >= getMaxCl())
|
|
|
648 |
return;
|
|
|
649 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
|
|
650 |
if (exp.size() > 1)
|
|
|
651 |
hadmultiple = true;
|
|
|
652 |
#endif
|
|
|
653 |
}
|
|
|
654 |
|
|
|
655 |
if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
|
|
|
656 |
orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
|
|
|
657 |
slack++;
|
|
|
658 |
}
|
|
|
659 |
|
|
|
660 |
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
|
|
661 |
// For phrases, give a relevance boost like we do for original terms
|
|
|
662 |
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
|
|
663 |
splitData->alltermcount, splitData->lastpos));
|
|
|
664 |
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
|
|
665 |
splitData->lastpos + 1 + slack);
|
|
|
666 |
if (op == Xapian::Query::OP_PHRASE)
|
|
|
667 |
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
|
|
|
668 |
original_term_wqf_booster);
|
|
|
669 |
pqueries.push_back(xq);
|
|
|
670 |
|
|
|
671 |
// Add all combinations of NEAR/PHRASE groups to the highlighting data.
|
|
|
672 |
vector<vector<string> > allcombs;
|
|
|
673 |
vector<string> comb;
|
|
|
674 |
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
|
|
|
675 |
|
|
|
676 |
// Insert the search groups and slacks in the highlight data, with
|
|
|
677 |
// a reference to the user entry that generated them:
|
|
|
678 |
m_hldata.groups.insert(m_hldata.groups.end(),
|
|
|
679 |
allcombs.begin(), allcombs.end());
|
|
|
680 |
m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
|
|
|
681 |
m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
|
|
|
682 |
m_hldata.ugroups.size() - 1);
|
|
|
683 |
}
|
|
|
684 |
|
|
|
685 |
// Trim string beginning with ^ or ending with $ and convert to flags
|
|
|
686 |
static int stringToMods(string& s)
|
|
|
687 |
{
|
|
|
688 |
int mods = 0;
|
|
|
689 |
// Check for an anchored search
|
|
|
690 |
trimstring(s);
|
|
|
691 |
if (s.length() > 0 && s[0] == '^') {
|
|
|
692 |
mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
|
|
|
693 |
s.erase(0, 1);
|
|
|
694 |
}
|
|
|
695 |
if (s.length() > 0 && s[s.length()-1] == '$') {
|
|
|
696 |
mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
|
|
|
697 |
s.erase(s.length()-1);
|
|
|
698 |
}
|
|
|
699 |
return mods;
|
|
|
700 |
}
|
|
|
701 |
|
|
|
702 |
/**
|
|
|
703 |
* Turn user entry string (NOT query language) into a list of xapian queries.
|
|
|
704 |
* We just separate words and phrases, and do wildcard and stem expansion,
|
|
|
705 |
*
|
|
|
706 |
* This is used to process data entered into an OR/AND/NEAR/PHRASE field of
|
|
|
707 |
* the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
|
|
|
708 |
* entry).
|
|
|
709 |
*
|
|
|
710 |
* This appears awful, and it would seem that the split into
|
|
|
711 |
* terms/phrases should be performed in the upper layer so that we
|
|
|
712 |
* only receive pure term or near/phrase pure elements here, but in
|
|
|
713 |
* fact there are things that would appear like terms to naive code,
|
|
|
714 |
 * and which may actually be turned into phrases (ie: tom:jerry),
|
|
|
715 |
* in a manner which intimately depends on the index implementation,
|
|
|
716 |
* so that it makes sense to process this here.
|
|
|
717 |
*
|
|
|
718 |
* The final list contains one query for each term or phrase
|
|
|
719 |
* - Elements corresponding to a stem-expanded part are an OP_OR
|
|
|
720 |
* composition of the stem-expanded terms (or a single term query).
|
|
|
721 |
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
|
|
722 |
* composition of the phrase terms (no stem expansion in this case)
|
|
|
723 |
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
|
|
724 |
* count)
|
|
|
725 |
*/
|
|
|
726 |
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
                                               string &ermsg, void *pq,
                                               int slack, bool useNear)
{
    // pq is an opaque pointer to the vector<Xapian::Query> receiving the
    // output: the span/phrase processing appends its queries to it.
    // ermsg is cleared on entry; the function returns false if it is
    // non-empty when the processing ends.
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    int mods = m_modifiers;

    LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
            "slack %d near %d\n",
            iq.c_str(), m_field.c_str(), mods, slack, useNear));
    ermsg.erase();
    m_curcl = 0;
    // Bind by reference: the stop list is only read here, no need for
    // a private copy.
    const StopList &stops = db.getStopList();

    // Simple whitespace-split input into user-level words and
    // double-quoted phrases: word1 word2 "this is a phrase".
    //
    // The text splitter may further still decide that the resulting
    // "words" are really phrases, this depends on separators:
    // [paul@dom.net] would still be a word (span), but [about:me]
    // will probably be handled as a phrase.
    vector<string> phrases;
    TextSplit::stringToStrings(iq, phrases);

    // Process each element: textsplit into terms, handle stem/wildcard
    // expansion and transform into an appropriate Xapian::Query
    try {
        for (vector<string>::iterator it = phrases.begin();
             it != phrases.end(); ++it) {
            LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
            // Anchoring modifiers. An anchored element counts for one
            // more term, so that an anchored single word goes through
            // the phrase/near processing.
            int amods = stringToMods(*it);
            int terminc = amods != 0 ? 1 : 0;
            mods |= amods;
            // If there are multiple spans in this element, including
            // at least one composite, we have to increase the slack
            // else a phrase query including a span would fail.
            // Ex: "term0@term1 term2" is onlyspans-split as:
            //   0 term0@term1             0   12
            //   2 term2                  13   18
            // The position of term2 is 2, not 1, so a phrase search
            // would fail.
            // We used to do word split, searching for
            // "term0 term1 term2" instead, which may have worse
            // performance, but will succeed.
            // We now adjust the phrase/near slack by comparing the term count
            // and the last position

            // The term processing pipeline:
            TermProcQ tpq;
            TermProc *nxt = &tpq;
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
            //tpcommon.onlygrams(true);
            TermProcPrep tpprep(nxt);
            if (o_index_stripchars)
                nxt = &tpprep;

            TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
                                                 TextSplit::TXTS_KEEPWILD),
                                stops, nxt);
            tpq.setTSQ(&splitter);
            splitter.text_to_words(*it);

            // Explicit int arithmetic: avoid mixing the signed position
            // with the unsigned container size.
            const int termcount = int(splitter.terms.size());
            slack += splitter.lastpos - termcount + 1;

            LOGDEB0(("strToXapianQ: termcount: %d\n", termcount));

            // Nothing left after splitting. This must be tested on the
            // actual term count, not on termcount + terminc: an anchored
            // element with no terms would otherwise reach the single-span
            // code below and call front() on empty vectors.
            if (splitter.terms.empty())
                continue;

            if (termcount + terminc == 1) {
                // Single unanchored span
                int lmods = mods;
                if (!splitter.nostemexps.empty() &&
                    splitter.nostemexps.front())
                    lmods |= SearchDataClause::SDCM_NOSTEMMING;
                m_hldata.ugroups.push_back(splitter.terms);
                processSimpleSpan(db, ermsg, splitter.terms.front(),
                                  lmods, &pqueries);
            } else {
                // Multiple terms, or single anchored term
                m_hldata.ugroups.push_back(splitter.terms);
                processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
                                    useNear, slack);
            }

            // Stop as soon as the accumulated clause count hits the limit
            if (m_curcl >= getMaxCl()) {
                ermsg = maxXapClauseMsg;
                if (!o_index_stripchars)
                    ermsg += maxXapClauseCaseDiacMsg;
                break;
            }
        }
    } catch (const Xapian::Error &e) {
        ermsg = e.get_msg();
    } catch (const string &s) {
        ermsg = s;
    } catch (const char *s) {
        ermsg = s;
    } catch (...) {
        ermsg = "Caught unknown exception";
    }
    if (!ermsg.empty()) {
        LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
        return false;
    }
    return true;
}
|
|
|
832 |
|
|
|
833 |
// Translate a simple OR or AND search clause.
|
|
|
834 |
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
|
|
835 |
{
|
|
|
836 |
LOGDEB(("SearchDataClauseSimple::toNativeQuery: fld [%s] val [%s] "
|
|
|
837 |
"stemlang [%s]\n", m_field.c_str(), m_text.c_str(),
|
|
|
838 |
getStemLang().c_str()));
|
|
|
839 |
|
|
|
840 |
Xapian::Query *qp = (Xapian::Query *)p;
|
|
|
841 |
*qp = Xapian::Query();
|
|
|
842 |
|
|
|
843 |
Xapian::Query::op op;
|
|
|
844 |
switch (m_tp) {
|
|
|
845 |
case SCLT_AND: op = Xapian::Query::OP_AND; break;
|
|
|
846 |
case SCLT_OR: op = Xapian::Query::OP_OR; break;
|
|
|
847 |
default:
|
|
|
848 |
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
|
|
849 |
return false;
|
|
|
850 |
}
|
|
|
851 |
|
|
|
852 |
vector<Xapian::Query> pqueries;
|
|
|
853 |
if (!processUserString(db, m_text, m_reason, &pqueries))
|
|
|
854 |
return false;
|
|
|
855 |
if (pqueries.empty()) {
|
|
|
856 |
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
|
|
857 |
return true;
|
|
|
858 |
}
|
|
|
859 |
|
|
|
860 |
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
|
|
|
861 |
if (m_weight != 1.0) {
|
|
|
862 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
|
|
863 |
}
|
|
|
864 |
return true;
|
|
|
865 |
}
|
|
|
866 |
|
|
|
867 |
// Translate a FILENAME search clause. This always comes
|
|
|
868 |
// from a "filename" search from the gui or recollq. A query language
|
|
|
869 |
// "filename:"-prefixed field will not go through here, but through
|
|
|
870 |
// the generic field-processing code.
|
|
|
871 |
//
|
|
|
872 |
// We do not split the entry any more (used to do some crazy thing
|
|
|
873 |
// about expanding multiple fragments in the past). We just take the
|
|
|
874 |
// value blanks and all and expand this against the indexed unsplit
|
|
|
875 |
// file names
|
|
|
876 |
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
|
|
|
877 |
{
|
|
|
878 |
Xapian::Query *qp = (Xapian::Query *)p;
|
|
|
879 |
*qp = Xapian::Query();
|
|
|
880 |
|
|
|
881 |
int maxexp = getSoftMaxExp();
|
|
|
882 |
if (maxexp == -1)
|
|
|
883 |
maxexp = getMaxExp();
|
|
|
884 |
|
|
|
885 |
vector<string> names;
|
|
|
886 |
db.filenameWildExp(m_text, names, maxexp);
|
|
|
887 |
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
|
|
888 |
|
|
|
889 |
if (m_weight != 1.0) {
|
|
|
890 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
|
|
891 |
}
|
|
|
892 |
return true;
|
|
|
893 |
}
|
|
|
894 |
|
|
|
895 |
// Translate a dir: path filtering clause. See comments in .h
|
|
|
896 |
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
|
|
|
897 |
{
|
|
|
898 |
LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
|
|
|
899 |
Xapian::Query *qp = (Xapian::Query *)p;
|
|
|
900 |
*qp = Xapian::Query();
|
|
|
901 |
|
|
|
902 |
if (m_text.empty()) {
|
|
|
903 |
LOGERR(("SearchDataClausePath: empty path??\n"));
|
|
|
904 |
m_reason = "Empty path ?";
|
|
|
905 |
return false;
|
|
|
906 |
}
|
|
|
907 |
|
|
|
908 |
vector<Xapian::Query> orqueries;
|
|
|
909 |
|
|
|
910 |
if (m_text[0] == '/')
|
|
|
911 |
orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
|
|
|
912 |
else
|
|
|
913 |
m_text = path_tildexpand(m_text);
|
|
|
914 |
|
|
|
915 |
vector<string> vpath;
|
|
|
916 |
stringToTokens(m_text, vpath, "/");
|
|
|
917 |
|
|
|
918 |
for (vector<string>::const_iterator pit = vpath.begin();
|
|
|
919 |
pit != vpath.end(); pit++){
|
|
|
920 |
|
|
|
921 |
string sterm;
|
|
|
922 |
vector<string> exp;
|
|
|
923 |
if (!expandTerm(db, m_reason,
|
|
|
924 |
SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
|
|
|
925 |
*pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
|
|
|
926 |
return false;
|
|
|
927 |
}
|
|
|
928 |
LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
|
|
|
929 |
listVector("", exp);
|
|
|
930 |
if (exp.size() == 1)
|
|
|
931 |
orqueries.push_back(Xapian::Query(exp[0]));
|
|
|
932 |
else
|
|
|
933 |
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
|
|
934 |
exp.begin(), exp.end()));
|
|
|
935 |
m_curcl += exp.size();
|
|
|
936 |
if (m_curcl >= getMaxCl())
|
|
|
937 |
return false;
|
|
|
938 |
}
|
|
|
939 |
|
|
|
940 |
*qp = Xapian::Query(Xapian::Query::OP_PHRASE,
|
|
|
941 |
orqueries.begin(), orqueries.end());
|
|
|
942 |
|
|
|
943 |
if (m_weight != 1.0) {
|
|
|
944 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
|
|
945 |
}
|
|
|
946 |
return true;
|
|
|
947 |
}
|
|
|
948 |
|
|
|
949 |
// Translate NEAR or PHRASE clause.
|
|
|
950 |
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
|
|
|
951 |
{
|
|
|
952 |
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
|
|
953 |
|
|
|
954 |
Xapian::Query *qp = (Xapian::Query *)p;
|
|
|
955 |
*qp = Xapian::Query();
|
|
|
956 |
|
|
|
957 |
vector<Xapian::Query> pqueries;
|
|
|
958 |
Xapian::Query nq;
|
|
|
959 |
|
|
|
960 |
// We produce a single phrase out of the user entry then use
|
|
|
961 |
// stringToXapianQueries() to lowercase and simplify the phrase
|
|
|
962 |
// terms etc. This will result into a single (complex)
|
|
|
963 |
// Xapian::Query.
|
|
|
964 |
if (m_text.find('\"') != string::npos) {
|
|
|
965 |
m_text = neutchars(m_text, "\"");
|
|
|
966 |
}
|
|
|
967 |
string s = cstr_dquote + m_text + cstr_dquote;
|
|
|
968 |
bool useNear = (m_tp == SCLT_NEAR);
|
|
|
969 |
if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
|
|
|
970 |
return false;
|
|
|
971 |
if (pqueries.empty()) {
|
|
|
972 |
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
|
|
973 |
return true;
|
|
|
974 |
}
|
|
|
975 |
|
|
|
976 |
*qp = *pqueries.begin();
|
|
|
977 |
if (m_weight != 1.0) {
|
|
|
978 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
|
|
979 |
}
|
|
|
980 |
return true;
|
|
|
981 |
}
|
|
|
982 |
|
|
|
983 |
} // Namespace Rcl
|