|
a/src/rcldb/searchdata.cpp |
|
b/src/rcldb/searchdata.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.17 2007-06-22 06:14:04 dockes Exp $ (C) 2006 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.18 2007-09-20 08:43:12 dockes Exp $ (C) 2006 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
364 |
try {
|
364 |
try {
|
365 |
for (list<string>::iterator it = phrases.begin();
|
365 |
for (list<string>::iterator it = phrases.begin();
|
366 |
it != phrases.end(); it++) {
|
366 |
it != phrases.end(); it++) {
|
367 |
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
|
367 |
LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
|
368 |
|
368 |
|
369 |
// If there are both spans and single words in this element,
|
369 |
// If there are multiple spans in this element, including
|
370 |
// we need to use a word split, else a phrase query including
|
370 |
// at least one composite, we need to use a word split,
|
371 |
// a span would fail if we didn't adjust the proximity to
|
371 |
// else a phrase query including a span would fail.
|
372 |
// account for the additional span term which is complicated.
|
372 |
// (other possible solution: adjust slack to account for the
|
|
|
373 |
// additional position increase?)
|
|
|
374 |
// Ex: "term0@term01 term1" is onlyspans-split as:
|
|
|
375 |
// 0 term0@term01 0 12
|
|
|
376 |
// 2 term1 13 18
|
|
|
377 |
// The position of term1 is 2, not 1, so the phrase search would
|
|
|
378 |
// fail. We search for "term0 term01 term1" instead, which may
|
|
|
379 |
// have worse performance, but will succeed.
|
373 |
wsQData splitDataS(stops), splitDataW(stops);
|
380 |
wsQData splitDataS(stops), splitDataW(stops);
|
374 |
TextSplit splitterS(&splitDataS, (TextSplit::Flags)
|
381 |
TextSplit splitterS(&splitDataS,
|
375 |
(TextSplit::TXTS_ONLYSPANS |
|
382 |
TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
376 |
TextSplit::TXTS_KEEPWILD));
|
383 |
TextSplit::TXTS_KEEPWILD));
|
377 |
splitterS.text_to_words(*it);
|
384 |
splitterS.text_to_words(*it);
|
378 |
TextSplit splitterW(&splitDataW, (TextSplit::Flags)
|
385 |
TextSplit splitterW(&splitDataW,
|
379 |
(TextSplit::TXTS_NOSPANS |
|
386 |
TextSplit::Flags(TextSplit::TXTS_NOSPANS |
|
380 |
TextSplit::TXTS_KEEPWILD));
|
387 |
TextSplit::TXTS_KEEPWILD));
|
381 |
splitterW.text_to_words(*it);
|
388 |
splitterW.text_to_words(*it);
|
382 |
wsQData *splitData = &splitDataS;
|
389 |
wsQData *splitData = &splitDataS;
|
383 |
if (splitDataS.terms.size() > 1 &&
|
390 |
if (splitDataS.terms.size() > 1 &&
|
384 |
splitDataS.terms.size() != splitDataW.terms.size())
|
391 |
splitDataS.terms.size() != splitDataW.terms.size())
|
385 |
splitData = &splitDataW;
|
392 |
splitData = &splitDataW;
|
|
... |
|
... |
387 |
LOGDEB(("strToXapianQ: splitter term count: %d\n",
|
394 |
LOGDEB(("strToXapianQ: splitter term count: %d\n",
|
388 |
splitData->terms.size()));
|
395 |
splitData->terms.size()));
|
389 |
switch (splitData->terms.size()) {
|
396 |
switch (splitData->terms.size()) {
|
390 |
case 0: continue;// ??
|
397 |
case 0: continue;// ??
|
391 |
case 1:
|
398 |
case 1:
|
392 |
// Not a real phrase: one term. Still may be expanded
|
399 |
// Just a term. Still may be expanded (by stem or
|
393 |
// (stem or wildcard)
|
400 |
// wildcard) to an OR list.
|
394 |
{
|
401 |
{
|
395 |
string term = splitData->terms.front();
|
402 |
string term = splitData->terms.front();
|
396 |
list<string> exp;
|
403 |
list<string> exp;
|
397 |
string sterm;
|
404 |
string sterm; // dumb version of user term
|
398 |
stripExpandTerm(false, term, exp, sterm);
|
405 |
stripExpandTerm(false, term, exp, sterm);
|
399 |
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
406 |
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
400 |
// Push either term or OR of stem-expanded set
|
407 |
// Push either term or OR of stem-expanded set
|
401 |
addPrefix(exp, prefix);
|
408 |
addPrefix(exp, prefix);
|
402 |
Xapian::Query xq(Xapian::Query::OP_OR,
|
409 |
Xapian::Query xq(Xapian::Query::OP_OR,
|
|
... |
|
... |
415 |
pqueries.push_back(xq);
|
422 |
pqueries.push_back(xq);
|
416 |
}
|
423 |
}
|
417 |
break;
|
424 |
break;
|
418 |
|
425 |
|
419 |
default:
|
426 |
default:
|
420 |
// Phrase/near: transform into a PHRASE or NEAR xapian
|
427 |
// Element had several terms: transform into a PHRASE
|
421 |
// query, the element of which can themselves be OR
|
428 |
// or NEAR xapian query, the elements of which can
|
422 |
// queries if the terms get expanded by stemming or
|
429 |
// themselves be OR queries if the terms get expanded
|
423 |
// wildcards (we don't do stemming for PHRASE though)
|
430 |
// by stemming or wildcards (we don't do stemming for
|
|
|
431 |
// PHRASE though)
|
424 |
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
432 |
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
425 |
Xapian::Query::OP_PHRASE;
|
433 |
Xapian::Query::OP_PHRASE;
|
426 |
list<Xapian::Query> orqueries;
|
434 |
list<Xapian::Query> orqueries;
|
427 |
bool hadmultiple = false;
|
435 |
bool hadmultiple = false;
|
428 |
vector<vector<string> >groups;
|
436 |
vector<vector<string> >groups;
|