|
a/src/rcldb/searchdata.cpp |
|
b/src/rcldb/searchdata.cpp |
|
... |
|
... |
181 |
class wsQData : public TextSplitCB {
|
181 |
class wsQData : public TextSplitCB {
|
182 |
public:
|
182 |
public:
|
183 |
wsQData(const StopList &_stops)
|
183 |
wsQData(const StopList &_stops)
|
184 |
: stops(_stops), alltermcount(0)
|
184 |
: stops(_stops), alltermcount(0)
|
185 |
{}
|
185 |
{}
|
|
|
186 |
bool takeword(const std::string &interm, int , int, int) {
|
|
|
187 |
alltermcount++;
|
|
|
188 |
LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
|
|
|
189 |
|
|
|
190 |
// Check if the first letter is a majuscule in which
|
|
|
191 |
// case we do not want to do stem expansion. Note that
|
|
|
192 |
// the test is convoluted and possibly problematic
|
|
|
193 |
string noacterm, noaclowterm;
|
|
|
194 |
if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
|
|
|
195 |
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
|
|
|
196 |
return true;
|
|
|
197 |
}
|
|
|
198 |
if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
|
|
199 |
LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
|
|
|
200 |
return true;
|
|
|
201 |
}
|
|
|
202 |
bool nostemexp = false;
|
|
|
203 |
Utf8Iter it1(noacterm);
|
|
|
204 |
Utf8Iter it2(noaclowterm);
|
|
|
205 |
if (*it1 != *it2)
|
|
|
206 |
nostemexp = true;
|
|
|
207 |
|
|
|
208 |
if (stops.hasStops() && stops.isStop(noaclowterm)) {
|
|
|
209 |
LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
|
|
|
210 |
return true;
|
|
|
211 |
}
|
|
|
212 |
terms.push_back(noaclowterm);
|
|
|
213 |
nostemexps.push_back(nostemexp);
|
|
|
214 |
return true;
|
|
|
215 |
}
|
|
|
216 |
|
186 |
vector<string> terms;
|
217 |
vector<string> terms;
|
187 |
bool takeword(const std::string &term, int , int, int) {
|
218 |
vector<bool> nostemexps;
|
188 |
alltermcount++;
|
|
|
189 |
LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
|
|
|
190 |
if (stops.hasStops() && stops.isStop(term)) {
|
|
|
191 |
LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
|
|
|
192 |
return true;
|
|
|
193 |
}
|
|
|
194 |
terms.push_back(term);
|
|
|
195 |
return true;
|
|
|
196 |
}
|
|
|
197 |
const StopList &stops;
|
219 |
const StopList &stops;
|
198 |
// Count of terms including stopwords: this is for adjusting
|
220 |
// Count of terms including stopwords: this is for adjusting
|
199 |
// phrase/near slack
|
221 |
// phrase/near slack
|
200 |
int alltermcount;
|
222 |
int alltermcount;
|
201 |
};
|
223 |
};
|
|
... |
|
... |
230 |
|
252 |
|
231 |
private:
|
253 |
private:
|
232 |
void expandTerm(bool dont, const string& term, list<string>& exp,
|
254 |
void expandTerm(bool dont, const string& term, list<string>& exp,
|
233 |
string& sterm);
|
255 |
string& sterm);
|
234 |
// After splitting entry on whitespace: process non-phrase element
|
256 |
// After splitting entry on whitespace: process non-phrase element
|
235 |
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
|
257 |
void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
|
236 |
// Process phrase/near element
|
258 |
// Process phrase/near element
|
237 |
void processPhraseOrNear(wsQData *splitData,
|
259 |
void processPhraseOrNear(wsQData *splitData,
|
238 |
list<Xapian::Query> &pqueries,
|
260 |
list<Xapian::Query> &pqueries,
|
239 |
bool useNear, int slack);
|
261 |
bool useNear, int slack);
|
240 |
|
262 |
|
|
... |
|
... |
277 |
// No stemming if there are wildcards or prevented globally.
|
299 |
// No stemming if there are wildcards or prevented globally.
|
278 |
if (haswild || m_stemlang.empty())
|
300 |
if (haswild || m_stemlang.empty())
|
279 |
nostemexp = true;
|
301 |
nostemexp = true;
|
280 |
|
302 |
|
281 |
if (!nostemexp) {
|
303 |
if (!nostemexp) {
|
282 |
// Check if the first letter is a majuscule in which
|
|
|
283 |
// case we do not want to do stem expansion. Note that
|
|
|
284 |
// the test is convoluted and possibly problematic
|
|
|
285 |
|
|
|
286 |
string noacterm, noaclowterm;
|
|
|
287 |
if (unacmaybefold(term, noacterm, "UTF-8", false) &&
|
|
|
288 |
unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
|
|
|
289 |
Utf8Iter it1(noacterm);
|
|
|
290 |
Utf8Iter it2(noaclowterm);
|
|
|
291 |
if (*it1 != *it2)
|
|
|
292 |
nostemexp = true;
|
|
|
293 |
}
|
|
|
294 |
}
|
304 |
}
|
295 |
|
305 |
|
296 |
if (nostemexp && !haswild) {
|
306 |
if (nostemexp && !haswild) {
|
297 |
// Neither stemming nor wildcard expansion: just the word
|
307 |
// Neither stemming nor wildcard expansion: just the word
|
298 |
sterm = term;
|
308 |
sterm = term;
|
|
... |
|
... |
354 |
return;
|
364 |
return;
|
355 |
for (list<string>::iterator it = terms.begin(); it != terms.end(); it++)
|
365 |
for (list<string>::iterator it = terms.begin(); it != terms.end(); it++)
|
356 |
it->insert(0, prefix);
|
366 |
it->insert(0, prefix);
|
357 |
}
|
367 |
}
|
358 |
|
368 |
|
359 |
void StringToXapianQ::processSimpleSpan(const string& span,
|
369 |
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
|
360 |
list<Xapian::Query> &pqueries)
|
370 |
list<Xapian::Query> &pqueries)
|
361 |
{
|
371 |
{
|
362 |
list<string> exp;
|
372 |
list<string> exp;
|
363 |
string sterm; // dumb version of user term
|
373 |
string sterm; // dumb version of user term
|
364 |
expandTerm(false, span, exp, sterm);
|
374 |
expandTerm(nostemexp, span, exp, sterm);
|
365 |
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
375 |
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
366 |
addPrefix(exp, m_prefix);
|
376 |
addPrefix(exp, m_prefix);
|
367 |
// Push either term or OR of stem-expanded set
|
377 |
// Push either term or OR of stem-expanded set
|
368 |
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
|
378 |
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
|
369 |
|
379 |
|
|
... |
|
... |
394 |
list<Xapian::Query> orqueries;
|
404 |
list<Xapian::Query> orqueries;
|
395 |
bool hadmultiple = false;
|
405 |
bool hadmultiple = false;
|
396 |
vector<vector<string> >groups;
|
406 |
vector<vector<string> >groups;
|
397 |
|
407 |
|
398 |
// Go through the list and perform stem/wildcard expansion for each element
|
408 |
// Go through the list and perform stem/wildcard expansion for each element
|
|
|
409 |
vector<bool>::iterator nxit = splitData->nostemexps.begin();
|
399 |
for (vector<string>::iterator it = splitData->terms.begin();
|
410 |
for (vector<string>::iterator it = splitData->terms.begin();
|
400 |
it != splitData->terms.end(); it++) {
|
411 |
it != splitData->terms.end(); it++, nxit++) {
|
401 |
// Adjust when we do stem expansion. Not inside phrases, and
|
412 |
// Adjust when we do stem expansion. Not inside phrases, and
|
402 |
// some versions of xapian will accept only one OR clause
|
413 |
// some versions of xapian will accept only one OR clause
|
403 |
// inside NEAR, all others must be leafs.
|
414 |
// inside NEAR, all others must be leafs.
|
404 |
bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
|
415 |
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
|
405 |
|
416 |
|
406 |
string sterm;
|
417 |
string sterm;
|
407 |
list<string>exp;
|
418 |
list<string>exp;
|
408 |
expandTerm(nostemexp, *it, exp, sterm);
|
419 |
expandTerm(nostemexp, *it, exp, sterm);
|
409 |
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
420 |
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
|
... |
|
... |
432 |
m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
|
443 |
m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
|
433 |
}
|
444 |
}
|
434 |
|
445 |
|
435 |
/**
|
446 |
/**
|
436 |
* Turn user entry string (NOT query language) into a list of xapian queries.
|
447 |
* Turn user entry string (NOT query language) into a list of xapian queries.
|
437 |
* We just separate words and phrases, and do wildcard and stemp expansion,
|
448 |
* We just separate words and phrases, and do wildcard and stem expansion,
|
|
|
449 |
*
|
|
|
450 |
* This is used to process data entered into an OR/AND/NEAR/PHRASE field of
|
|
|
451 |
* the GUI.
|
438 |
*
|
452 |
*
|
439 |
* The final list contains one query for each term or phrase
|
453 |
* The final list contains one query for each term or phrase
|
440 |
* - Elements corresponding to a stem-expanded part are an OP_OR
|
454 |
* - Elements corresponding to a stem-expanded part are an OP_OR
|
441 |
* composition of the stem-expanded terms (or a single term query).
|
455 |
* composition of the stem-expanded terms (or a single term query).
|
442 |
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
456 |
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
443 |
* composition of the phrase terms (no stem expansion in this case)
|
457 |
* composition of the phrase terms (no stem expansion in this case)
|
444 |
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
458 |
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
445 |
* count)
|
459 |
* count)
|
446 |
*/
|
460 |
*/
|
447 |
bool StringToXapianQ::processUserString(const string &_iq,
|
461 |
bool StringToXapianQ::processUserString(const string &iq,
|
448 |
string &ermsg,
|
462 |
string &ermsg,
|
449 |
list<Xapian::Query> &pqueries,
|
463 |
list<Xapian::Query> &pqueries,
|
450 |
const StopList& stops,
|
464 |
const StopList& stops,
|
451 |
int slack,
|
465 |
int slack,
|
452 |
bool useNear
|
466 |
bool useNear
|
453 |
)
|
467 |
)
|
454 |
{
|
468 |
{
|
455 |
LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
|
469 |
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
456 |
ermsg.erase();
|
470 |
ermsg.erase();
|
457 |
m_terms.clear();
|
471 |
m_terms.clear();
|
458 |
m_groups.clear();
|
472 |
m_groups.clear();
|
459 |
|
473 |
|
460 |
// First unaccent/normalize the input: do it first so that it
|
|
|
461 |
// happens in the same order as when indexing: unac then split. As
|
|
|
462 |
// the character count can change during normalisation, this is
|
|
|
463 |
// specially important for cjk because the artificial cjk split is
|
|
|
464 |
// based on character counts
|
|
|
465 |
string iq;
|
|
|
466 |
dumb_string(_iq, iq);
|
|
|
467 |
|
|
|
468 |
// Simple whitespace-split input into user-level words and
|
474 |
// Simple whitespace-split input into user-level words and
|
469 |
// double-quoted phrases: word1 word2 "this is a phrase". The text
|
475 |
// double-quoted phrases: word1 word2 "this is a phrase".
|
|
|
476 |
//
|
470 |
// splitter may further still decide that the resulting "words"
|
477 |
// The text splitter may further still decide that the resulting
|
471 |
// are really phrases, this depends on separators: [paul@dom.net]
|
478 |
// "words" are really phrases, this depends on separators:
|
472 |
// would still be a word (span), but [about:me] will probably be
|
479 |
// [paul@dom.net] would still be a word (span), but [about:me]
|
473 |
// handled as a phrase.
|
480 |
// will probably be handled as a phrase.
|
474 |
list<string> phrases;
|
481 |
list<string> phrases;
|
475 |
TextSplit::stringToStrings(iq, phrases);
|
482 |
TextSplit::stringToStrings(iq, phrases);
|
476 |
|
483 |
|
477 |
// Process each element: textsplit into terms, handle stem/wildcard
|
484 |
// Process each element: textsplit into terms, handle stem/wildcard
|
478 |
// expansion and transform into an appropriate Xapian::Query
|
485 |
// expansion and transform into an appropriate Xapian::Query
|
|
... |
|
... |
514 |
LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
|
521 |
LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
|
515 |
switch (splitData->terms.size()) {
|
522 |
switch (splitData->terms.size()) {
|
516 |
case 0:
|
523 |
case 0:
|
517 |
continue;// ??
|
524 |
continue;// ??
|
518 |
case 1:
|
525 |
case 1:
|
519 |
processSimpleSpan(splitData->terms.front(), pqueries);
|
526 |
processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
|
520 |
break;
|
527 |
break;
|
521 |
default:
|
528 |
default:
|
522 |
processPhraseOrNear(splitData, pqueries, useNear, slack);
|
529 |
processPhraseOrNear(splitData, pqueries, useNear, slack);
|
523 |
}
|
530 |
}
|
524 |
}
|
531 |
}
|