|
a/src/rcldb/searchdata.cpp |
|
b/src/rcldb/searchdata.cpp |
|
... |
|
... |
44 |
#include "termproc.h"
|
44 |
#include "termproc.h"
|
45 |
#include "synfamily.h"
|
45 |
#include "synfamily.h"
|
46 |
#include "stemdb.h"
|
46 |
#include "stemdb.h"
|
47 |
#include "expansiondbs.h"
|
47 |
#include "expansiondbs.h"
|
48 |
#include "base64.h"
|
48 |
#include "base64.h"
|
|
|
49 |
#include "daterange.h"
|
49 |
|
50 |
|
50 |
namespace Rcl {
|
51 |
namespace Rcl {
|
51 |
|
52 |
|
52 |
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
53 |
typedef vector<SearchDataClause *>::iterator qlist_it_t;
|
53 |
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
|
54 |
typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
|
54 |
|
55 |
|
55 |
static const int original_term_wqf_booster = 10;
|
56 |
static const int original_term_wqf_booster = 10;
|
56 |
|
57 |
|
57 |
/* The dates-to-query routine is lifted quasi-verbatim but
|
58 |
void SearchData::commoninit()
|
58 |
* modified from xapian-omega:date.cc. Copyright info:
|
|
|
59 |
*
|
|
|
60 |
* Copyright 1999,2000,2001 BrightStation PLC
|
|
|
61 |
* Copyright 2001 James Aylett
|
|
|
62 |
* Copyright 2001,2002 Ananova Ltd
|
|
|
63 |
* Copyright 2002 Intercede 1749 Ltd
|
|
|
64 |
* Copyright 2002,2003,2006 Olly Betts
|
|
|
65 |
*
|
|
|
66 |
* This program is free software; you can redistribute it and/or
|
|
|
67 |
* modify it under the terms of the GNU General Public License as
|
|
|
68 |
* published by the Free Software Foundation; either version 2 of the
|
|
|
69 |
* License, or (at your option) any later version.
|
|
|
70 |
*
|
|
|
71 |
* This program is distributed in the hope that it will be useful,
|
|
|
72 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
73 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
74 |
* GNU General Public License for more details.
|
|
|
75 |
*
|
|
|
76 |
* You should have received a copy of the GNU General Public License
|
|
|
77 |
* along with this program; if not, write to the Free Software
|
|
|
78 |
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
|
|
|
79 |
* USA
|
|
|
80 |
*/
|
|
|
81 |
|
|
|
82 |
#ifdef RCL_INDEX_STRIPCHARS
|
|
|
83 |
#define bufprefix(BUF, L) {(BUF)[0] = L;}
|
|
|
84 |
#define bpoffs() 1
|
|
|
85 |
#else
|
|
|
86 |
static inline void bufprefix(char *buf, char c)
|
|
|
87 |
{
|
59 |
{
|
88 |
if (o_index_stripchars) {
|
60 |
m_haveDates = false;
|
89 |
buf[0] = c;
|
61 |
m_maxSize = size_t(-1);
|
90 |
} else {
|
62 |
m_minSize = size_t(-1);
|
91 |
buf[0] = ':';
|
63 |
m_haveWildCards = false;
|
92 |
buf[1] = c;
|
64 |
m_softmaxexpand = -1;
|
93 |
buf[2] = ':';
|
65 |
m_autodiacsens = false;
|
94 |
}
|
66 |
m_autocasesens = true;
|
95 |
}
|
67 |
m_maxexp = 10000;
|
96 |
static inline int bpoffs()
|
68 |
m_maxcl = 100000;
|
97 |
{
|
|
|
98 |
return o_index_stripchars ? 1 : 3;
|
|
|
99 |
}
|
|
|
100 |
#endif
|
|
|
101 |
|
|
|
102 |
/**
 * Build a Xapian query matching the inclusive date range
 * [y1/m1/d1, y2/m2/d2].
 *
 * The range is covered by OR-ing together:
 *  - day terms ('D' + YYYYMMDD) for a partial first and/or last month,
 *  - month terms ('M' + YYYYMM) for months that are entirely covered,
 *  - year terms ('Y' + YYYY) for the years strictly between y1 and y2.
 *
 * The term prefix is stored by bufprefix() and the date digits start
 * bpoffs() bytes into the buffer. Relative to that start, the year uses
 * offsets 0-3, the month 4-5 and the day 6-7.
 *
 * NOTE(review): the code presumes the start date is not later than the
 * end date; nothing here checks it.
 *
 * @param y1,m1,d1 first day of the range (year, month, day of month)
 * @param y2,m2,d2 last day of the range (year, month, day of month)
 * @return OR query of all the generated terms
 */
static Xapian::Query
date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
{
    // Positions of the month and day digits after the 4-digit year.
    const int month_offs = 4;
    const int day_offs = 6;

    // Xapian uses a smallbuf and snprintf. Can't be bothered, we're
    // only doing %d's !
    char buf[200];
    bufprefix(buf, 'D');
    sprintf(buf + bpoffs(), "%04d%02d", y1, m1);
    vector<Xapian::Query> v;

    int d_last = monthdays(m1, y1);
    int d_end = d_last;
    if (y1 == y2 && m1 == m2 && d2 < d_last) {
        d_end = d2;
    }

    // Deal with any initial partial month
    if (d1 > 1 || d_end < d_last) {
        for ( ; d1 <= d_end; d1++) {
            sprintf(buf + day_offs + bpoffs(), "%02d", d1);
            v.push_back(Xapian::Query(buf));
        }
    } else {
        // The whole first month is covered: a single month term is enough
        bufprefix(buf, 'M');
        v.push_back(Xapian::Query(buf));
    }

    if (y1 == y2 && m1 == m2) {
        return Xapian::Query(Xapian::Query::OP_OR, v.begin(), v.end());
    }

    // Whole months following the first one: up to December if the range
    // spans several years, else up to the month before the last one.
    int m_last = (y1 < y2) ? 12 : m2 - 1;
    while (++m1 <= m_last) {
        sprintf(buf + month_offs + bpoffs(), "%02d", m1);
        bufprefix(buf, 'M');
        v.push_back(Xapian::Query(buf));
    }

    if (y1 < y2) {
        // Whole years strictly between the first and last ones
        while (++y1 < y2) {
            sprintf(buf + bpoffs(), "%04d", y1);
            bufprefix(buf, 'Y');
            v.push_back(Xapian::Query(buf));
        }
        // Whole months of the last year, before the last month
        sprintf(buf + bpoffs(), "%04d", y2);
        bufprefix(buf, 'M');
        for (m1 = 1; m1 < m2; m1++) {
            sprintf(buf + month_offs + bpoffs(), "%02d", m1);
            v.push_back(Xapian::Query(buf));
        }
    }

    // Store the last month right after the year digits. The previous
    // offset (2) overwrote the last two digits of the year and truncated
    // the term, so the final month never matched.
    sprintf(buf + month_offs + bpoffs(), "%02d", m2);

    // Deal with any final partial month
    if (d2 < monthdays(m2, y2)) {
        bufprefix(buf, 'D');
        for (d1 = 1; d1 <= d2; d1++) {
            sprintf(buf + day_offs + bpoffs(), "%02d", d1);
            v.push_back(Xapian::Query(buf));
        }
    } else {
        bufprefix(buf, 'M');
        v.push_back(Xapian::Query(buf));
    }

    return Xapian::Query(Xapian::Query::OP_OR, v.begin(), v.end());
}
|
69 |
}
|
169 |
|
70 |
|
170 |
// Expand categories and mime type wild card exps
|
71 |
// Expand categories and mime type wild card exps
|
171 |
// Actually, using getAllMimeTypes() here is a bit problematic because
|
72 |
// Actually, using getAllMimeTypes() here is a bit problematic because
|
172 |
// there may be other types in the index, not indexed by content, but
|
73 |
// there may be other types in the index, not indexed by content, but
|
|
... |
|
... |
186 |
if (cfg->isMimeCategory(*it)) {
|
87 |
if (cfg->isMimeCategory(*it)) {
|
187 |
vector<string>tps;
|
88 |
vector<string>tps;
|
188 |
cfg->getMimeCatTypes(*it, tps);
|
89 |
cfg->getMimeCatTypes(*it, tps);
|
189 |
exptps.insert(exptps.end(), tps.begin(), tps.end());
|
90 |
exptps.insert(exptps.end(), tps.begin(), tps.end());
|
190 |
} else {
|
91 |
} else {
|
|
|
92 |
bool matched = false;
|
191 |
for (vector<string>::const_iterator ait = alltypes.begin();
|
93 |
for (vector<string>::const_iterator ait = alltypes.begin();
|
192 |
ait != alltypes.end(); ait++) {
|
94 |
ait != alltypes.end(); ait++) {
|
193 |
if (fnmatch(it->c_str(), ait->c_str(), FNM_CASEFOLD)
|
95 |
if (fnmatch(it->c_str(), ait->c_str(), FNM_CASEFOLD)
|
194 |
!= FNM_NOMATCH) {
|
96 |
!= FNM_NOMATCH) {
|
195 |
exptps.push_back(*ait);
|
97 |
exptps.push_back(*ait);
|
|
|
98 |
matched = true;
|
196 |
}
|
99 |
}
|
197 |
}
|
100 |
}
|
|
|
101 |
if (!matched)
|
|
|
102 |
exptps.push_back(it->c_str());
|
198 |
}
|
103 |
}
|
199 |
}
|
104 |
}
|
200 |
tps = exptps;
|
105 |
tps = exptps;
|
201 |
return true;
|
106 |
return true;
|
202 |
}
|
107 |
}
|
203 |
|
108 |
|
204 |
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
|
109 |
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
|
205 |
vector<SearchDataClause*>& query,
|
110 |
vector<SearchDataClause*>& query,
|
206 |
string& reason, void *d,
|
111 |
string& reason, void *d)
|
207 |
int maxexp, int maxcl)
|
|
|
208 |
{
|
112 |
{
|
209 |
Xapian::Query xq;
|
113 |
Xapian::Query xq;
|
210 |
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
|
114 |
for (qlist_it_t it = query.begin(); it != query.end(); it++) {
|
211 |
Xapian::Query nq;
|
115 |
Xapian::Query nq;
|
212 |
if (!(*it)->toNativeQuery(db, &nq, maxexp, maxcl)) {
|
116 |
if (!(*it)->toNativeQuery(db, &nq)) {
|
213 |
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
|
117 |
LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
|
214 |
(*it)->getReason().c_str()));
|
118 |
(*it)->getReason().c_str()));
|
215 |
reason += (*it)->getReason() + " ";
|
119 |
reason += (*it)->getReason() + " ";
|
216 |
return false;
|
120 |
return false;
|
217 |
}
|
121 |
}
|
|
... |
|
... |
238 |
else
|
142 |
else
|
239 |
xq = nq;
|
143 |
xq = nq;
|
240 |
} else {
|
144 |
} else {
|
241 |
xq = Xapian::Query(op, xq, nq);
|
145 |
xq = Xapian::Query(op, xq, nq);
|
242 |
}
|
146 |
}
|
243 |
if (int(xq.get_length()) >= maxcl) {
|
147 |
if (int(xq.get_length()) >= getMaxCl()) {
|
244 |
LOGERR(("Maximum Xapian query size exceeded."
|
148 |
LOGERR(("Maximum Xapian query size exceeded."
|
245 |
" Maybe increase maxXapianClauses."));
|
149 |
" Maybe increase maxXapianClauses."));
|
246 |
m_reason += "Maximum Xapian query size exceeded."
|
150 |
m_reason += "Maximum Xapian query size exceeded."
|
247 |
" Maybe increase maxXapianClauses.";
|
151 |
" Maybe increase maxXapianClauses.";
|
248 |
return false;
|
152 |
return false;
|
|
... |
|
... |
253 |
|
157 |
|
254 |
*((Xapian::Query *)d) = xq;
|
158 |
*((Xapian::Query *)d) = xq;
|
255 |
return true;
|
159 |
return true;
|
256 |
}
|
160 |
}
|
257 |
|
161 |
|
258 |
// Map a clause type to the short tag stored in the XML form of a search.
// Any type without a dedicated tag yields "UN".
static string tpToString(SClType tp)
{
    const char *tag = "UN";
    if (tp == SCLT_AND) {
        tag = "AND";
    } else if (tp == SCLT_OR) {
        tag = "OR";
    } else if (tp == SCLT_EXCL) {
        tag = "EX";
    } else if (tp == SCLT_FILENAME) {
        tag = "FN";
    } else if (tp == SCLT_PHRASE) {
        tag = "PH";
    } else if (tp == SCLT_NEAR) {
        tag = "NE";
    } else if (tp == SCLT_SUB) {
        // Unsupported actually
        tag = "SU";
    }
    return tag;
}
|
|
|
271 |
|
|
|
272 |
/**
 * Serialize this search to an XML-like text form.
 *
 * Layout, inside an enclosing <SD> element:
 *  - <CL>: the clause list. <CLT> holds the list type when it is not AND.
 *    Each clause is a <C> element with an optional <CT> (type, omitted
 *    for AND), an optional <F> (base64 field name, omitted for file name
 *    clauses and empty fields), <T> (base64 text) and, for NEAR/PHRASE
 *    clauses, <S> (slack).
 *  - <DMI>/<DMA>: minimum/maximum dates, only when dates are set and the
 *    corresponding year is positive.
 *  - <MIS>/<MAS>: minimum/maximum sizes, only when different from
 *    size_t(-1).
 *  - <ST>/<IT>: space-separated file type lists (m_filetypes and
 *    m_nfiletypes respectively).
 *  - <YD>/<ND>: base64 directory for each non-excluding/excluding
 *    directory specification.
 *
 * Subclauses (SCLT_SUB) cannot be represented: they are logged and
 * skipped, as are clauses which are not SearchDataClauseSimple objects.
 *
 * @return the serialized search
 */
string SearchData::asXML()
{
    LOGDEB(("SearchData::asXML\n"));
    ostringstream os;

    // Searchdata
    os << "<SD>" << endl;

    // Clause list
    os << "<CL>" << endl;
    if (m_tp != SCLT_AND)
        os << "<CLT>" << tpToString(m_tp) << "</CLT>" << endl;
    for (unsigned int i = 0; i < m_query.size(); i++) {
        SearchDataClause *c = m_query[i];
        if (c->getTp() == SCLT_SUB) {
            LOGERR(("SearchData::asXML: can't do subclauses !\n"));
            continue;
        }
        SearchDataClauseSimple *cl =
            dynamic_cast<SearchDataClauseSimple*>(c);
        if (!cl) {
            // The cast yields null for any other clause class: skip the
            // clause instead of dereferencing a null pointer.
            LOGERR(("SearchData::asXML: not a simple clause, skipped\n"));
            continue;
        }
        os << "<C>" << endl;
        if (cl->getTp() != SCLT_AND) {
            os << "<CT>" << tpToString(cl->getTp()) << "</CT>" << endl;
        }
        if (cl->getTp() != SCLT_FILENAME && !cl->getfield().empty()) {
            os << "<F>" << base64_encode(cl->getfield()) << "</F>" << endl;
        }
        os << "<T>" << base64_encode(cl->gettext()) << "</T>" << endl;
        if (cl->getTp() == SCLT_NEAR || cl->getTp() == SCLT_PHRASE) {
            SearchDataClauseDist *cld =
                dynamic_cast<SearchDataClauseDist*>(cl);
            // Only emit the slack if the clause really is a distance
            // clause (same null-cast protection as above).
            if (cld) {
                os << "<S>" << cld->getslack() << "</S>" << endl;
            }
        }
        os << "</C>" << endl;
    }
    os << "</CL>" << endl;

    if (m_haveDates) {
        if (m_dates.y1 > 0) {
            os << "<DMI>" <<
                "<D>" << m_dates.d1 << "</D>" <<
                "<M>" << m_dates.m1 << "</M>" <<
                "<Y>" << m_dates.y1 << "</Y>"
               << "</DMI>" << endl;
        }
        if (m_dates.y2 > 0) {
            os << "<DMA>" <<
                "<D>" << m_dates.d2 << "</D>" <<
                "<M>" << m_dates.m2 << "</M>" <<
                "<Y>" << m_dates.y2 << "</Y>"
               << "</DMA>" << endl;
        }
    }

    // size_t(-1) marks an unset size limit
    if (m_minSize != size_t(-1)) {
        os << "<MIS>" << m_minSize << "</MIS>" << endl;
    }
    if (m_maxSize != size_t(-1)) {
        os << "<MAS>" << m_maxSize << "</MAS>" << endl;
    }

    // NOTE(review): the type lists are written as-is, unlike the
    // base64-encoded fields above; presumably they never contain XML
    // metacharacters -- confirm.
    if (!m_filetypes.empty()) {
        os << "<ST>";
        for (vector<string>::iterator it = m_filetypes.begin();
             it != m_filetypes.end(); it++) {
            os << *it << " ";
        }
        os << "</ST>" << endl;
    }

    if (!m_nfiletypes.empty()) {
        os << "<IT>";
        for (vector<string>::iterator it = m_nfiletypes.begin();
             it != m_nfiletypes.end(); it++) {
            os << *it << " ";
        }
        os << "</IT>" << endl;
    }

    for (vector<DirSpec>::const_iterator dit = m_dirspecs.begin();
         dit != m_dirspecs.end(); dit++) {
        if (dit->exclude) {
            os << "<ND>" << base64_encode(dit->dir) << "</ND>" << endl;
        } else {
            os << "<YD>" << base64_encode(dit->dir) << "</YD>" << endl;
        }
    }
    os << "</SD>";
    return os.str();
}
|
|
|
363 |
|
|
|
364 |
bool SearchData::toNativeQuery(Rcl::Db &db, void *d, int maxexp, int maxcl)
|
162 |
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
|
365 |
{
|
163 |
{
|
366 |
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
|
164 |
LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
|
367 |
m_reason.erase();
|
165 |
m_reason.erase();
|
368 |
|
166 |
|
369 |
// Walk the clause list translating each in turn and building the
|
167 |
// Walk the clause list translating each in turn and building the
|
370 |
// Xapian query tree
|
168 |
// Xapian query tree
|
371 |
Xapian::Query xq;
|
169 |
Xapian::Query xq;
|
372 |
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq, maxexp, maxcl)) {
|
170 |
if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
|
373 |
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
|
171 |
LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
|
374 |
m_reason.c_str()));
|
172 |
m_reason.c_str()));
|
375 |
return false;
|
173 |
return false;
|
376 |
}
|
174 |
}
|
377 |
|
175 |
|
|
... |
|
... |
630 |
m_query.push_back(cl);
|
428 |
m_query.push_back(cl);
|
631 |
return true;
|
429 |
return true;
|
632 |
}
|
430 |
}
|
633 |
|
431 |
|
634 |
// Make me all new
|
432 |
// Make me all new
|
635 |
void SearchData::erase() {
|
433 |
void SearchData::erase()
|
|
|
434 |
{
|
636 |
LOGDEB0(("SearchData::erase\n"));
|
435 |
LOGDEB0(("SearchData::erase\n"));
|
637 |
m_tp = SCLT_AND;
|
436 |
m_tp = SCLT_AND;
|
638 |
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
437 |
for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
|
639 |
delete *it;
|
438 |
delete *it;
|
640 |
m_query.clear();
|
439 |
m_query.clear();
|
|
... |
|
... |
727 |
TextSplitQ *m_ts;
|
526 |
TextSplitQ *m_ts;
|
728 |
map<int, string> m_terms;
|
527 |
map<int, string> m_terms;
|
729 |
map<int, bool> m_nste;
|
528 |
map<int, bool> m_nste;
|
730 |
};
|
529 |
};
|
731 |
|
530 |
|
732 |
// A class used to translate a user compound string (*not* a query
|
|
|
733 |
// language string) as may be entered in any_terms/all_terms search
|
|
|
734 |
// entry fields, ex: [term1 "a phrase" term3] into a xapian query
|
|
|
735 |
// tree.
|
|
|
736 |
// The object keeps track of the query terms and term groups while
|
|
|
737 |
// translating.
|
|
|
738 |
class StringToXapianQ {
|
|
|
739 |
public:
|
|
|
740 |
StringToXapianQ(Db& db, HighlightData& hld, const string& field,
|
|
|
741 |
const string &stmlng, bool boostUser, int maxexp, int maxcl)
|
|
|
742 |
: m_db(db), m_field(field), m_stemlang(stmlng),
|
|
|
743 |
m_doBoostUserTerms(boostUser), m_hld(hld), m_autodiacsens(false),
|
|
|
744 |
m_autocasesens(true), m_maxexp(maxexp), m_maxcl(maxcl), m_curcl(0)
|
|
|
745 |
{
|
|
|
746 |
m_db.getConf()->getConfParam("autodiacsens", &m_autodiacsens);
|
|
|
747 |
m_db.getConf()->getConfParam("autocasesens", &m_autocasesens);
|
|
|
748 |
}
|
|
|
749 |
|
|
|
750 |
bool processUserString(const string &iq,
|
|
|
751 |
int mods,
|
|
|
752 |
string &ermsg,
|
|
|
753 |
vector<Xapian::Query> &pqueries,
|
|
|
754 |
int slack = 0, bool useNear = false);
|
|
|
755 |
private:
|
|
|
756 |
bool expandTerm(string& ermsg, int mods,
|
|
|
757 |
const string& term, vector<string>& exp,
|
|
|
758 |
string& sterm, const string& prefix);
|
|
|
759 |
// After splitting entry on whitespace: process non-phrase element
|
|
|
760 |
void processSimpleSpan(string& ermsg, const string& span,
|
|
|
761 |
int mods,
|
|
|
762 |
vector<Xapian::Query> &pqueries);
|
|
|
763 |
// Process phrase/near element
|
|
|
764 |
void processPhraseOrNear(string& ermsg, TextSplitQ *splitData,
|
|
|
765 |
int mods,
|
|
|
766 |
vector<Xapian::Query> &pqueries,
|
|
|
767 |
bool useNear, int slack);
|
|
|
768 |
|
|
|
769 |
Db& m_db;
|
|
|
770 |
const string& m_field;
|
|
|
771 |
const string& m_stemlang;
|
|
|
772 |
const bool m_doBoostUserTerms;
|
|
|
773 |
HighlightData& m_hld;
|
|
|
774 |
bool m_autodiacsens;
|
|
|
775 |
bool m_autocasesens;
|
|
|
776 |
int m_maxexp;
|
|
|
777 |
int m_maxcl;
|
|
|
778 |
int m_curcl;
|
|
|
779 |
};
|
|
|
780 |
|
531 |
|
781 |
#if 1
|
532 |
#if 1
|
782 |
static void listVector(const string& what, const vector<string>&l)
|
533 |
static void listVector(const string& what, const vector<string>&l)
|
783 |
{
|
534 |
{
|
784 |
string a;
|
535 |
string a;
|
|
... |
|
... |
798 |
* @param sterm output original input term if there were no wildcards
|
549 |
* @param sterm output original input term if there were no wildcards
|
799 |
* @param prefix field prefix in index. We could recompute it, but the caller
|
550 |
* @param prefix field prefix in index. We could recompute it, but the caller
|
800 |
* has it already. Used in the simple case where there is nothing to expand,
|
551 |
* has it already. Used in the simple case where there is nothing to expand,
|
801 |
* and we just return the prefixed term (else Db::termMatch deals with it).
|
552 |
* and we just return the prefixed term (else Db::termMatch deals with it).
|
802 |
*/
|
553 |
*/
|
803 |
bool StringToXapianQ::expandTerm(string& ermsg, int mods,
|
554 |
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
|
|
|
555 |
string& ermsg, int mods,
|
804 |
const string& term,
|
556 |
const string& term,
|
805 |
vector<string>& oexp, string &sterm,
|
557 |
vector<string>& oexp, string &sterm,
|
806 |
const string& prefix)
|
558 |
const string& prefix)
|
807 |
{
|
559 |
{
|
808 |
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
|
560 |
LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
|
809 |
mods, m_field.c_str(), term.c_str(), m_stemlang.c_str()));
|
561 |
mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
|
810 |
sterm.clear();
|
562 |
sterm.clear();
|
811 |
oexp.clear();
|
563 |
oexp.clear();
|
812 |
if (term.empty())
|
564 |
if (term.empty())
|
813 |
return true;
|
565 |
return true;
|
814 |
|
566 |
|
815 |
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
567 |
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
816 |
|
568 |
|
817 |
// If there are no wildcards, add term to the list of user-entered terms
|
569 |
// If there are no wildcards, add term to the list of user-entered terms
|
818 |
if (!haswild)
|
570 |
if (!haswild)
|
819 |
m_hld.uterms.insert(term);
|
571 |
m_hldata.uterms.insert(term);
|
820 |
|
572 |
|
821 |
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
573 |
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
822 |
|
574 |
|
823 |
// No stem expansion if there are wildcards or if prevented by caller
|
575 |
// No stem expansion if there are wildcards or if prevented by caller
|
824 |
if (haswild || m_stemlang.empty()) {
|
576 |
if (haswild || getStemLang().empty()) {
|
825 |
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
577 |
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
826 |
nostemexp = true;
|
578 |
nostemexp = true;
|
827 |
}
|
579 |
}
|
828 |
|
580 |
|
829 |
bool noexpansion = nostemexp && !haswild;
|
581 |
bool noexpansion = nostemexp && !haswild;
|
|
... |
|
... |
840 |
|
592 |
|
841 |
// If any character has a diacritic, we become
|
593 |
// If any character has a diacritic, we become
|
842 |
// diacritic-sensitive. Note that the way that the test is
|
594 |
// diacritic-sensitive. Note that the way that the test is
|
843 |
// performed (conversion+comparison) will automatically ignore
|
595 |
// performed (conversion+comparison) will automatically ignore
|
844 |
// accented characters which are actually a separate letter
|
596 |
// accented characters which are actually a separate letter
|
845 |
if (m_autodiacsens && unachasaccents(term)) {
|
597 |
if (getAutoDiac() && unachasaccents(term)) {
|
846 |
LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
|
598 |
LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
|
847 |
diac_sensitive = true;
|
599 |
diac_sensitive = true;
|
848 |
}
|
600 |
}
|
849 |
|
601 |
|
850 |
// If any character apart the first is uppercase, we become
|
602 |
// If any character apart the first is uppercase, we become
|
851 |
// case-sensitive. The first character is reserved for
|
603 |
// case-sensitive. The first character is reserved for
|
852 |
// turning off stemming. You need to use a query language
|
604 |
// turning off stemming. You need to use a query language
|
853 |
// modifier to search for Floor in a case-sensitive way.
|
605 |
// modifier to search for Floor in a case-sensitive way.
|
854 |
Utf8Iter it(term);
|
606 |
Utf8Iter it(term);
|
855 |
it++;
|
607 |
it++;
|
856 |
if (m_autocasesens && unachasuppercase(term.substr(it.getBpos()))) {
|
608 |
if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
|
857 |
LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
|
609 |
LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
|
858 |
case_sensitive = true;
|
610 |
case_sensitive = true;
|
859 |
}
|
611 |
}
|
860 |
|
612 |
|
861 |
// If we are sensitive to case or diacritics turn stemming off
|
613 |
// If we are sensitive to case or diacritics turn stemming off
|
|
... |
|
... |
870 |
#endif
|
622 |
#endif
|
871 |
|
623 |
|
872 |
if (noexpansion) {
|
624 |
if (noexpansion) {
|
873 |
sterm = term;
|
625 |
sterm = term;
|
874 |
oexp.push_back(prefix + term);
|
626 |
oexp.push_back(prefix + term);
|
875 |
m_hld.terms[term] = m_hld.uterms.size() - 1;
|
627 |
m_hldata.terms[term] = m_hldata.uterms.size() - 1;
|
876 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
628 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
877 |
return true;
|
629 |
return true;
|
878 |
}
|
630 |
}
|
879 |
|
631 |
|
880 |
// Make objects before the goto jungle to avoid compiler complaints
|
632 |
// Make objects before the goto jungle to avoid compiler complaints
|
881 |
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
633 |
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
882 |
XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, "all",
|
634 |
XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
|
883 |
&unacfoldtrans);
|
635 |
&unacfoldtrans);
|
884 |
// This will hold the result of case and diacritics expansion as input
|
636 |
// This will hold the result of case and diacritics expansion as input
|
885 |
// to stem expansion.
|
637 |
// to stem expansion.
|
886 |
vector<string> lexp;
|
638 |
vector<string> lexp;
|
887 |
|
639 |
|
|
... |
|
... |
889 |
if (haswild) {
|
641 |
if (haswild) {
|
890 |
// Note that if there are wildcards, we do a direct from-index
|
642 |
// Note that if there are wildcards, we do a direct from-index
|
891 |
// expansion, which means that we are casediac-sensitive. There
|
643 |
// expansion, which means that we are casediac-sensitive. There
|
892 |
// would be nothing to prevent us to expand from the casediac
|
644 |
// would be nothing to prevent us to expand from the casediac
|
893 |
// synonyms first. To be done later
|
645 |
// synonyms first. To be done later
|
894 |
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang,term,res,m_maxexp,m_field);
|
646 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
|
|
|
647 |
getMaxExp(), m_field);
|
895 |
goto termmatchtoresult;
|
648 |
goto termmatchtoresult;
|
896 |
}
|
649 |
}
|
897 |
|
650 |
|
898 |
sterm = term;
|
651 |
sterm = term;
|
899 |
|
652 |
|
900 |
#ifdef RCL_INDEX_STRIPCHARS
|
653 |
#ifdef RCL_INDEX_STRIPCHARS
|
901 |
|
654 |
|
902 |
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, m_maxexp, m_field);
|
655 |
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
|
|
656 |
getMaxExp(), m_field);
|
903 |
|
657 |
|
904 |
#else
|
658 |
#else
|
905 |
|
659 |
|
906 |
if (o_index_stripchars) {
|
660 |
if (o_index_stripchars) {
|
907 |
// If the index is raw, we can only come here if nostemexp is unset
|
661 |
// If the index is raw, we can only come here if nostemexp is unset
|
908 |
// and we just need stem expansion.
|
662 |
// and we just need stem expansion.
|
909 |
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang,term,res,m_maxexp,m_field);
|
663 |
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
|
|
664 |
getMaxExp(), m_field);
|
910 |
goto termmatchtoresult;
|
665 |
goto termmatchtoresult;
|
911 |
}
|
666 |
}
|
912 |
|
667 |
|
913 |
// No stem expansion when diacritic or case sensitivity is set, it
|
668 |
// No stem expansion when diacritic or case sensitivity is set, it
|
914 |
// makes no sense (it would mess with the diacritics anyway if
|
669 |
// makes no sense (it would mess with the diacritics anyway if
|
|
... |
|
... |
948 |
}
|
703 |
}
|
949 |
sort(lexp.begin(), lexp.end());
|
704 |
sort(lexp.begin(), lexp.end());
|
950 |
{
|
705 |
{
|
951 |
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
|
706 |
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
|
952 |
lexp.resize(uit - lexp.begin());
|
707 |
lexp.resize(uit - lexp.begin());
|
953 |
StemDb db(m_db.m_ndb->xrdb);
|
708 |
StemDb sdb(db.m_ndb->xrdb);
|
954 |
vector<string> exp1;
|
709 |
vector<string> exp1;
|
955 |
for (vector<string>::const_iterator it = lexp.begin();
|
710 |
for (vector<string>::const_iterator it = lexp.begin();
|
956 |
it != lexp.end(); it++) {
|
711 |
it != lexp.end(); it++) {
|
957 |
db.stemExpand(m_stemlang, *it, exp1);
|
712 |
sdb.stemExpand(getStemLang(), *it, exp1);
|
958 |
}
|
713 |
}
|
959 |
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
714 |
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
960 |
|
715 |
|
961 |
// Expand the resulting list for case (all stemdb content
|
716 |
// Expand the resulting list for case (all stemdb content
|
962 |
// is lowercase)
|
717 |
// is lowercase)
|
|
... |
|
... |
973 |
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
728 |
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
974 |
exptotermatch:
|
729 |
exptotermatch:
|
975 |
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
730 |
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
976 |
for (vector<string>::const_iterator it = lexp.begin();
|
731 |
for (vector<string>::const_iterator it = lexp.begin();
|
977 |
it != lexp.end(); it++) {
|
732 |
it != lexp.end(); it++) {
|
978 |
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, res,m_maxexp,m_field);
|
733 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
|
|
734 |
getMaxExp(), m_field);
|
979 |
}
|
735 |
}
|
980 |
#endif
|
736 |
#endif
|
981 |
|
737 |
|
982 |
// Term match entries to vector of terms
|
738 |
// Term match entries to vector of terms
|
983 |
termmatchtoresult:
|
739 |
termmatchtoresult:
|
984 |
if (int(res.entries.size()) >= m_maxexp) {
|
740 |
if (int(res.entries.size()) >= getMaxExp()) {
|
985 |
ermsg = "Maximum term expansion size exceeded."
|
741 |
ermsg = "Maximum term expansion size exceeded."
|
986 |
" Maybe increase maxTermExpand.";
|
742 |
" Maybe increase maxTermExpand.";
|
987 |
return false;
|
743 |
return false;
|
988 |
}
|
744 |
}
|
989 |
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
745 |
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
|
... |
|
... |
997 |
oexp.push_back(prefix + term);
|
753 |
oexp.push_back(prefix + term);
|
998 |
|
754 |
|
999 |
// Remember the uterm-to-expansion links
|
755 |
// Remember the uterm-to-expansion links
|
1000 |
for (vector<string>::const_iterator it = oexp.begin();
|
756 |
for (vector<string>::const_iterator it = oexp.begin();
|
1001 |
it != oexp.end(); it++) {
|
757 |
it != oexp.end(); it++) {
|
1002 |
m_hld.terms[strip_prefix(*it)] = term;
|
758 |
m_hldata.terms[strip_prefix(*it)] = term;
|
1003 |
}
|
759 |
}
|
1004 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
760 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
1005 |
return true;
|
761 |
return true;
|
1006 |
}
|
762 |
}
|
1007 |
|
763 |
|
|
... |
|
... |
1037 |
// vector)
|
793 |
// vector)
|
1038 |
comb.pop_back();
|
794 |
comb.pop_back();
|
1039 |
}
|
795 |
}
|
1040 |
}
|
796 |
}
|
1041 |
|
797 |
|
1042 |
void StringToXapianQ::processSimpleSpan(string& ermsg, const string& span,
|
798 |
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
|
1043 |
int mods,
|
799 |
const string& span,
|
1044 |
vector<Xapian::Query> &pqueries)
|
800 |
int mods, void * pq)
|
1045 |
{
|
801 |
{
|
|
|
802 |
vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
|
1046 |
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
|
803 |
LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
|
1047 |
span.c_str(), (unsigned int)mods));
|
804 |
span.c_str(), (unsigned int)mods));
|
1048 |
vector<string> exp;
|
805 |
vector<string> exp;
|
1049 |
string sterm; // dumb version of user term
|
806 |
string sterm; // dumb version of user term
|
1050 |
|
807 |
|
1051 |
string prefix;
|
808 |
string prefix;
|
1052 |
const FieldTraits *ftp;
|
809 |
const FieldTraits *ftp;
|
1053 |
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
|
810 |
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp)) {
|
1054 |
prefix = wrap_prefix(ftp->pfx);
|
811 |
prefix = wrap_prefix(ftp->pfx);
|
1055 |
}
|
812 |
}
|
1056 |
|
813 |
|
1057 |
if (!expandTerm(ermsg, mods, span, exp, sterm, prefix))
|
814 |
if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
|
1058 |
return;
|
815 |
return;
|
1059 |
|
816 |
|
1060 |
// Set up the highlight data. No prefix should go in there
|
817 |
// Set up the highlight data. No prefix should go in there
|
1061 |
for (vector<string>::const_iterator it = exp.begin();
|
818 |
for (vector<string>::const_iterator it = exp.begin();
|
1062 |
it != exp.end(); it++) {
|
819 |
it != exp.end(); it++) {
|
1063 |
m_hld.groups.push_back(vector<string>(1, it->substr(prefix.size())));
|
820 |
m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
|
1064 |
m_hld.slacks.push_back(0);
|
821 |
m_hldata.slacks.push_back(0);
|
1065 |
m_hld.grpsugidx.push_back(m_hld.ugroups.size() - 1);
|
822 |
m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
|
1066 |
}
|
823 |
}
|
1067 |
|
824 |
|
1068 |
// Push either term or OR of stem-expanded set
|
825 |
// Push either term or OR of stem-expanded set
|
1069 |
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
|
826 |
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
|
1070 |
m_curcl += exp.size();
|
827 |
m_curcl += exp.size();
|
|
... |
|
... |
1072 |
// If sterm (simplified original user term) is not null, give it a
|
829 |
// If sterm (simplified original user term) is not null, give it a
|
1073 |
// relevance boost. We do this even if no expansion occurred (else
|
830 |
// relevance boost. We do this even if no expansion occurred (else
|
1074 |
// the non-expanded terms in a term list would end-up with even
|
831 |
// the non-expanded terms in a term list would end-up with even
|
1075 |
// less wqf). This does not happen if there are wildcards anywhere
|
832 |
// less wqf). This does not happen if there are wildcards anywhere
|
1076 |
// in the search.
|
833 |
// in the search.
|
|
|
834 |
// We normally boost the original term in the stem expansion list. Don't
|
|
|
835 |
// do it if there are wildcards anywhere, this would skew the results.
|
|
|
836 |
bool doBoostUserTerm =
|
|
|
837 |
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
|
|
838 |
(m_parentSearch == 0 && !m_haveWildCards);
|
1077 |
if (m_doBoostUserTerms && !sterm.empty()) {
|
839 |
if (doBoostUserTerm && !sterm.empty()) {
|
1078 |
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
|
840 |
xq = Xapian::Query(Xapian::Query::OP_OR, xq,
|
1079 |
Xapian::Query(prefix+sterm,
|
841 |
Xapian::Query(prefix+sterm,
|
1080 |
original_term_wqf_booster));
|
842 |
original_term_wqf_booster));
|
1081 |
}
|
843 |
}
|
1082 |
pqueries.push_back(xq);
|
844 |
pqueries.push_back(xq);
|
|
... |
|
... |
1084 |
|
846 |
|
1085 |
// User entry element had several terms: transform into a PHRASE or
|
847 |
// User entry element had several terms: transform into a PHRASE or
|
1086 |
// NEAR xapian query, the elements of which can themselves be OR
|
848 |
// NEAR xapian query, the elements of which can themselves be OR
|
1087 |
// queries if the terms get expanded by stemming or wildcards (we
|
849 |
// queries if the terms get expanded by stemming or wildcards (we
|
1088 |
// don't do stemming for PHRASE though)
|
850 |
// don't do stemming for PHRASE though)
|
1089 |
void StringToXapianQ::processPhraseOrNear(string& ermsg, TextSplitQ *splitData,
|
851 |
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
1090 |
int mods,
|
852 |
TextSplitQ *splitData,
|
1091 |
vector<Xapian::Query> &pqueries,
|
853 |
int mods, void *pq,
|
1092 |
bool useNear, int slack)
|
854 |
bool useNear, int slack)
|
1093 |
{
|
855 |
{
|
|
|
856 |
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
|
1094 |
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
857 |
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
1095 |
Xapian::Query::OP_PHRASE;
|
858 |
Xapian::Query::OP_PHRASE;
|
1096 |
vector<Xapian::Query> orqueries;
|
859 |
vector<Xapian::Query> orqueries;
|
1097 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
860 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
1098 |
bool hadmultiple = false;
|
861 |
bool hadmultiple = false;
|
1099 |
#endif
|
862 |
#endif
|
1100 |
vector<vector<string> >groups;
|
863 |
vector<vector<string> >groups;
|
1101 |
|
864 |
|
1102 |
string prefix;
|
865 |
string prefix;
|
1103 |
const FieldTraits *ftp;
|
866 |
const FieldTraits *ftp;
|
1104 |
if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
|
867 |
if (!m_field.empty() && db.fieldToTraits(m_field, &ftp)) {
|
1105 |
prefix = wrap_prefix(ftp->pfx);
|
868 |
prefix = wrap_prefix(ftp->pfx);
|
1106 |
}
|
869 |
}
|
1107 |
|
870 |
|
1108 |
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
|
871 |
if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
|
1109 |
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
|
872 |
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
|
|
... |
|
... |
1126 |
int lmods = mods;
|
889 |
int lmods = mods;
|
1127 |
if (nostemexp)
|
890 |
if (nostemexp)
|
1128 |
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
891 |
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
1129 |
string sterm;
|
892 |
string sterm;
|
1130 |
vector<string> exp;
|
893 |
vector<string> exp;
|
1131 |
if (!expandTerm(ermsg, lmods, *it, exp, sterm, prefix))
|
894 |
if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
|
1132 |
return;
|
895 |
return;
|
1133 |
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
|
896 |
LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
|
1134 |
listVector("", exp);
|
897 |
listVector("", exp);
|
1135 |
// groups is used for highlighting, we don't want prefixes in there.
|
898 |
// groups is used for highlighting, we don't want prefixes in there.
|
1136 |
vector<string> noprefs;
|
899 |
vector<string> noprefs;
|
|
... |
|
... |
1140 |
}
|
903 |
}
|
1141 |
groups.push_back(noprefs);
|
904 |
groups.push_back(noprefs);
|
1142 |
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
905 |
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
1143 |
exp.begin(), exp.end()));
|
906 |
exp.begin(), exp.end()));
|
1144 |
m_curcl += exp.size();
|
907 |
m_curcl += exp.size();
|
1145 |
if (m_curcl >= m_maxcl)
|
908 |
if (m_curcl >= getMaxCl())
|
1146 |
return;
|
909 |
return;
|
1147 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
910 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
1148 |
if (exp.size() > 1)
|
911 |
if (exp.size() > 1)
|
1149 |
hadmultiple = true;
|
912 |
hadmultiple = true;
|
1150 |
#endif
|
913 |
#endif
|
|
... |
|
... |
1171 |
vector<string> comb;
|
934 |
vector<string> comb;
|
1172 |
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
|
935 |
multiply_groups(groups.begin(), groups.end(), comb, allcombs);
|
1173 |
|
936 |
|
1174 |
// Insert the search groups and slacks in the highlight data, with
|
937 |
// Insert the search groups and slacks in the highlight data, with
|
1175 |
// a reference to the user entry that generated them:
|
938 |
// a reference to the user entry that generated them:
|
1176 |
m_hld.groups.insert(m_hld.groups.end(), allcombs.begin(), allcombs.end());
|
939 |
m_hldata.groups.insert(m_hldata.groups.end(),
|
|
|
940 |
allcombs.begin(), allcombs.end());
|
1177 |
m_hld.slacks.insert(m_hld.slacks.end(), allcombs.size(), slack);
|
941 |
m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
|
1178 |
m_hld.grpsugidx.insert(m_hld.grpsugidx.end(), allcombs.size(),
|
942 |
m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(),
|
1179 |
m_hld.ugroups.size() - 1);
|
943 |
m_hldata.ugroups.size() - 1);
|
1180 |
}
|
944 |
}
|
1181 |
|
945 |
|
1182 |
// Trim string beginning with ^ or ending with $ and convert to flags
|
946 |
// Trim string beginning with ^ or ending with $ and convert to flags
|
1183 |
static int stringToMods(string& s)
|
947 |
static int stringToMods(string& s)
|
1184 |
{
|
948 |
{
|
|
... |
|
... |
1218 |
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
982 |
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
1219 |
* composition of the phrase terms (no stem expansion in this case)
|
983 |
* composition of the phrase terms (no stem expansion in this case)
|
1220 |
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
984 |
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
1221 |
* count)
|
985 |
* count)
|
1222 |
*/
|
986 |
*/
|
1223 |
bool StringToXapianQ::processUserString(const string &iq,
|
987 |
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
|
1224 |
int mods,
|
|
|
1225 |
string &ermsg,
|
988 |
int mods, string &ermsg,
|
1226 |
vector<Xapian::Query> &pqueries,
|
989 |
void *pq, int slack, bool useNear)
|
1227 |
int slack,
|
|
|
1228 |
bool useNear
|
|
|
1229 |
)
|
|
|
1230 |
{
|
990 |
{
|
|
|
991 |
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
|
|
|
992 |
|
1231 |
LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
|
993 |
LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
|
1232 |
"slack %d near %d\n",
|
994 |
"slack %d near %d\n",
|
1233 |
iq.c_str(), m_field.c_str(), mods, slack, useNear));
|
995 |
iq.c_str(), m_field.c_str(), mods, slack, useNear));
|
1234 |
ermsg.erase();
|
996 |
ermsg.erase();
|
1235 |
m_curcl = 0;
|
997 |
m_curcl = 0;
|
1236 |
const StopList stops = m_db.getStopList();
|
998 |
const StopList stops = db.getStopList();
|
1237 |
|
999 |
|
1238 |
// Simple whitespace-split input into user-level words and
|
1000 |
// Simple whitespace-split input into user-level words and
|
1239 |
// double-quoted phrases: word1 word2 "this is a phrase".
|
1001 |
// double-quoted phrases: word1 word2 "this is a phrase".
|
1240 |
//
|
1002 |
//
|
1241 |
// The text splitter may further still decide that the resulting
|
1003 |
// The text splitter may further still decide that the resulting
|
|
... |
|
... |
1295 |
continue;// ??
|
1057 |
continue;// ??
|
1296 |
case 1: {
|
1058 |
case 1: {
|
1297 |
int lmods = mods;
|
1059 |
int lmods = mods;
|
1298 |
if (splitter.nostemexps.front())
|
1060 |
if (splitter.nostemexps.front())
|
1299 |
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
1061 |
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
1300 |
m_hld.ugroups.push_back(vector<string>(1, *it));
|
1062 |
m_hldata.ugroups.push_back(vector<string>(1, *it));
|
1301 |
processSimpleSpan(ermsg,splitter.terms.front(),lmods, pqueries);
|
1063 |
processSimpleSpan(db, ermsg, splitter.terms.front(),
|
|
|
1064 |
lmods, &pqueries);
|
1302 |
}
|
1065 |
}
|
1303 |
break;
|
1066 |
break;
|
1304 |
default:
|
1067 |
default:
|
1305 |
m_hld.ugroups.push_back(vector<string>(1, *it));
|
1068 |
m_hldata.ugroups.push_back(vector<string>(1, *it));
|
1306 |
processPhraseOrNear(ermsg, &splitter, mods, pqueries,
|
1069 |
processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
|
1307 |
useNear, slack);
|
1070 |
useNear, slack);
|
1308 |
}
|
1071 |
}
|
1309 |
if (m_curcl >= m_maxcl) {
|
1072 |
if (m_curcl >= getMaxCl()) {
|
1310 |
ermsg = "Maximum Xapian query size exceeded."
|
1073 |
ermsg = "Maximum Xapian query size exceeded."
|
1311 |
" Maybe increase maxXapianClauses.";
|
1074 |
" Maybe increase maxXapianClauses.";
|
1312 |
break;
|
1075 |
break;
|
1313 |
}
|
1076 |
}
|
1314 |
}
|
1077 |
}
|
|
... |
|
... |
1327 |
}
|
1090 |
}
|
1328 |
return true;
|
1091 |
return true;
|
1329 |
}
|
1092 |
}
|
1330 |
|
1093 |
|
1331 |
// Translate a simple OR, AND, or EXCL search clause.
|
1094 |
// Translate a simple OR, AND, or EXCL search clause.
|
1332 |
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p,
|
1095 |
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
|
1333 |
int maxexp, int maxcl)
|
|
|
1334 |
{
|
1096 |
{
|
1335 |
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
|
1097 |
LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
|
1336 |
getStemLang().c_str()));
|
1098 |
getStemLang().c_str()));
|
1337 |
|
1099 |
|
1338 |
Xapian::Query *qp = (Xapian::Query *)p;
|
1100 |
Xapian::Query *qp = (Xapian::Query *)p;
|
|
... |
|
... |
1346 |
case SCLT_EXCL: op = Xapian::Query::OP_OR; break;
|
1108 |
case SCLT_EXCL: op = Xapian::Query::OP_OR; break;
|
1347 |
default:
|
1109 |
default:
|
1348 |
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
1110 |
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
1349 |
return false;
|
1111 |
return false;
|
1350 |
}
|
1112 |
}
|
|
|
1113 |
|
1351 |
vector<Xapian::Query> pqueries;
|
1114 |
vector<Xapian::Query> pqueries;
|
1352 |
|
|
|
1353 |
// We normally boost the original term in the stem expansion list. Don't
|
|
|
1354 |
// do it if there are wildcards anywhere, this would skew the results.
|
|
|
1355 |
bool doBoostUserTerm =
|
|
|
1356 |
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
|
|
1357 |
(m_parentSearch == 0 && !m_haveWildCards);
|
|
|
1358 |
|
|
|
1359 |
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm,
|
|
|
1360 |
maxexp, maxcl);
|
|
|
1361 |
if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries))
|
1115 |
if (!processUserString(db, m_text, getModifiers(), m_reason, &pqueries))
|
1362 |
return false;
|
1116 |
return false;
|
1363 |
if (pqueries.empty()) {
|
1117 |
if (pqueries.empty()) {
|
1364 |
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
1118 |
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
1365 |
return true;
|
1119 |
return true;
|
1366 |
}
|
1120 |
}
|
|
... |
|
... |
1379 |
//
|
1133 |
//
|
1380 |
// We do not split the entry any more (used to do some crazy thing
|
1134 |
// We do not split the entry any more (used to do some crazy thing
|
1381 |
// about expanding multiple fragments in the past). We just take the
|
1135 |
// about expanding multiple fragments in the past). We just take the
|
1382 |
// value blanks and all and expand this against the indexed unsplit
|
1136 |
// value blanks and all and expand this against the indexed unsplit
|
1383 |
// file names
|
1137 |
// file names
|
1384 |
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p,
|
1138 |
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
|
1385 |
int maxexp, int)
|
|
|
1386 |
{
|
1139 |
{
|
1387 |
Xapian::Query *qp = (Xapian::Query *)p;
|
1140 |
Xapian::Query *qp = (Xapian::Query *)p;
|
1388 |
*qp = Xapian::Query();
|
1141 |
*qp = Xapian::Query();
|
1389 |
|
1142 |
|
1390 |
vector<string> names;
|
1143 |
vector<string> names;
|
|
|
1144 |
int maxexp = 10000;
|
|
|
1145 |
db.getConf()->getConfParam("maxTermExpand", &maxexp);
|
1391 |
db.filenameWildExp(m_text, names, maxexp);
|
1146 |
db.filenameWildExp(m_text, names, maxexp);
|
1392 |
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
1147 |
*qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
|
1393 |
|
1148 |
|
1394 |
if (m_weight != 1.0) {
|
1149 |
if (m_weight != 1.0) {
|
1395 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
1150 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
1396 |
}
|
1151 |
}
|
1397 |
return true;
|
1152 |
return true;
|
1398 |
}
|
1153 |
}
|
1399 |
|
1154 |
|
1400 |
// Translate NEAR or PHRASE clause.
|
1155 |
// Translate NEAR or PHRASE clause.
|
1401 |
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p,
|
1156 |
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
|
1402 |
int maxexp, int maxcl)
|
|
|
1403 |
{
|
1157 |
{
|
1404 |
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
1158 |
LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
|
1405 |
|
1159 |
|
1406 |
Xapian::Query *qp = (Xapian::Query *)p;
|
1160 |
Xapian::Query *qp = (Xapian::Query *)p;
|
1407 |
*qp = Xapian::Query();
|
1161 |
*qp = Xapian::Query();
|
1408 |
|
1162 |
|
1409 |
vector<Xapian::Query> pqueries;
|
1163 |
vector<Xapian::Query> pqueries;
|
1410 |
Xapian::Query nq;
|
1164 |
Xapian::Query nq;
|
1411 |
|
|
|
1412 |
// We normally boost the original term in the stem expansion list. Don't
|
|
|
1413 |
// do it if there are wildcards anywhere, this would skew the results.
|
|
|
1414 |
bool doBoostUserTerm =
|
|
|
1415 |
(m_parentSearch && !m_parentSearch->haveWildCards()) ||
|
|
|
1416 |
(m_parentSearch == 0 && !m_haveWildCards);
|
|
|
1417 |
|
1165 |
|
1418 |
// We produce a single phrase out of the user entry then use
|
1166 |
// We produce a single phrase out of the user entry then use
|
1419 |
// stringToXapianQueries() to lowercase and simplify the phrase
|
1167 |
// stringToXapianQueries() to lowercase and simplify the phrase
|
1420 |
// terms etc. This will result into a single (complex)
|
1168 |
// terms etc. This will result into a single (complex)
|
1421 |
// Xapian::Query.
|
1169 |
// Xapian::Query.
|
1422 |
if (m_text.find('\"') != string::npos) {
|
1170 |
if (m_text.find('\"') != string::npos) {
|
1423 |
m_text = neutchars(m_text, "\"");
|
1171 |
m_text = neutchars(m_text, "\"");
|
1424 |
}
|
1172 |
}
|
1425 |
string s = cstr_dquote + m_text + cstr_dquote;
|
1173 |
string s = cstr_dquote + m_text + cstr_dquote;
|
1426 |
bool useNear = (m_tp == SCLT_NEAR);
|
1174 |
bool useNear = (m_tp == SCLT_NEAR);
|
1427 |
StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm,
|
|
|
1428 |
maxexp, maxcl);
|
|
|
1429 |
if (!tr.processUserString(s, getModifiers(), m_reason, pqueries,
|
1175 |
if (!processUserString(db, s, getModifiers(), m_reason, &pqueries,
|
1430 |
m_slack, useNear))
|
1176 |
m_slack, useNear))
|
1431 |
return false;
|
1177 |
return false;
|
1432 |
if (pqueries.empty()) {
|
1178 |
if (pqueries.empty()) {
|
1433 |
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
1179 |
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
1434 |
return true;
|
1180 |
return true;
|
1435 |
}
|
1181 |
}
|