|
a/src/rcldb/searchdata.cpp |
|
b/src/rcldb/searchdata.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.30 2008-12-17 14:26:09 dockes Exp $ (C) 2006 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
227 |
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
|
227 |
groups.insert(groups.end(), m_groups.begin(), m_groups.end());
|
228 |
return true;
|
228 |
return true;
|
229 |
}
|
229 |
}
|
230 |
|
230 |
|
231 |
private:
|
231 |
private:
|
232 |
void stripExpandTerm(bool dont, const string& term, list<string>& exp,
|
232 |
void expandTerm(bool dont, const string& term, list<string>& exp,
|
233 |
string& sterm);
|
233 |
string& sterm);
|
234 |
// After splitting entry on whitespace: process non-phrase element
|
234 |
// After splitting entry on whitespace: process non-phrase element
|
235 |
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
|
235 |
void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
|
236 |
// Process phrase/near element
|
236 |
// Process phrase/near element
|
237 |
void processPhraseOrNear(wsQData *splitData,
|
237 |
void processPhraseOrNear(wsQData *splitData,
|
|
... |
|
... |
245 |
// Single terms and phrases resulting from breaking up text;
|
245 |
// Single terms and phrases resulting from breaking up text;
|
246 |
vector<string> m_terms;
|
246 |
vector<string> m_terms;
|
247 |
vector<vector<string> > m_groups;
|
247 |
vector<vector<string> > m_groups;
|
248 |
};
|
248 |
};
|
249 |
|
249 |
|
250 |
/** Unaccent and lowercase term, possibly expand stem and wildcards
|
250 |
/** Expand stem and wildcards
|
251 |
*
|
251 |
*
|
252 |
* @param nostemexp don't perform stem expansion. This is mainly used to
|
252 |
* @param nostemexp don't perform stem expansion. This is mainly used to
|
253 |
* prevent stem expansion inside phrases (because the user probably
|
253 |
* prevent stem expansion inside phrases (because the user probably
|
254 |
* does not expect it). This does NOT prevent wild card expansion.
|
254 |
* does not expect it). This does NOT prevent wild card expansion.
|
255 |
* Other factors than nostemexp can prevent stem expansion:
|
255 |
* Other factors than nostemexp can prevent stem expansion:
|
256 |
* a null stemlang, resulting from a global user preference, a
|
256 |
* a null stemlang, resulting from a global user preference, a
|
257 |
* capitalized term, or wildcard(s)
|
257 |
* capitalized term, or wildcard(s)
|
258 |
* @param term input single word
|
258 |
* @param term input single word
|
259 |
* @param exp output expansion list
|
259 |
* @param exp output expansion list
|
260 |
* @param sterm output lower-cased+unaccented version of the input term
|
260 |
* @param sterm output: the input term, unchanged, when it contains no wildcards
|
261 |
* (only for stem expansion, not wildcards)
|
|
|
262 |
*/
|
261 |
*/
|
263 |
void StringToXapianQ::stripExpandTerm(bool nostemexp,
|
262 |
void StringToXapianQ::expandTerm(bool nostemexp,
|
264 |
const string& term,
|
263 |
const string& term,
|
265 |
list<string>& exp,
|
264 |
list<string>& exp,
|
266 |
string &sterm)
|
265 |
string &sterm)
|
267 |
{
|
266 |
{
|
268 |
LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n",
|
267 |
LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n",
|
269 |
term.c_str(), m_stemlang.c_str(), nostemexp));
|
268 |
term.c_str(), m_stemlang.c_str(), nostemexp));
|
270 |
sterm.erase();
|
269 |
sterm.erase();
|
271 |
exp.clear();
|
270 |
exp.clear();
|
272 |
if (term.empty()) {
|
271 |
if (term.empty()) {
|
273 |
return;
|
272 |
return;
|
274 |
}
|
273 |
}
|
275 |
// term1 is lowercase and without diacritics
|
|
|
276 |
string term1;
|
|
|
277 |
dumb_string(term, term1);
|
|
|
278 |
|
274 |
|
279 |
bool haswild = term.find_first_of("*?[") != string::npos;
|
275 |
bool haswild = term.find_first_of("*?[") != string::npos;
|
280 |
|
276 |
|
281 |
// No stemming if there are wildcards or prevented globally.
|
277 |
// No stemming if there are wildcards or prevented globally.
|
282 |
if (haswild || m_stemlang.empty())
|
278 |
if (haswild || m_stemlang.empty())
|
|
... |
|
... |
297 |
}
|
293 |
}
|
298 |
}
|
294 |
}
|
299 |
|
295 |
|
300 |
if (nostemexp && !haswild) {
|
296 |
if (nostemexp && !haswild) {
|
301 |
// Neither stemming nor wildcard expansion: just the word
|
297 |
// Neither stemming nor wildcard expansion: just the word
|
302 |
sterm = term1;
|
298 |
sterm = term;
|
303 |
exp.push_front(term1);
|
299 |
exp.push_front(term);
|
304 |
exp.resize(1);
|
300 |
exp.resize(1);
|
305 |
} else {
|
301 |
} else {
|
306 |
list<TermMatchEntry> l;
|
302 |
list<TermMatchEntry> l;
|
307 |
if (haswild) {
|
303 |
if (haswild) {
|
308 |
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
|
304 |
m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
|
309 |
} else {
|
305 |
} else {
|
310 |
sterm = term1;
|
306 |
sterm = term;
|
311 |
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
|
307 |
m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
|
312 |
}
|
308 |
}
|
313 |
for (list<TermMatchEntry>::const_iterator it = l.begin();
|
309 |
for (list<TermMatchEntry>::const_iterator it = l.begin();
|
314 |
it != l.end(); it++) {
|
310 |
it != l.end(); it++) {
|
315 |
exp.push_back(it->term);
|
311 |
exp.push_back(it->term);
|
316 |
}
|
312 |
}
|
|
... |
|
... |
363 |
void StringToXapianQ::processSimpleSpan(const string& span,
|
359 |
void StringToXapianQ::processSimpleSpan(const string& span,
|
364 |
list<Xapian::Query> &pqueries)
|
360 |
list<Xapian::Query> &pqueries)
|
365 |
{
|
361 |
{
|
366 |
list<string> exp;
|
362 |
list<string> exp;
|
367 |
string sterm; // dumb version of user term
|
363 |
string sterm; // dumb version of user term
|
368 |
stripExpandTerm(false, span, exp, sterm);
|
364 |
expandTerm(false, span, exp, sterm);
|
369 |
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
365 |
m_terms.insert(m_terms.end(), exp.begin(), exp.end());
|
370 |
addPrefix(exp, m_prefix);
|
366 |
addPrefix(exp, m_prefix);
|
371 |
// Push either term or OR of stem-expanded set
|
367 |
// Push either term or OR of stem-expanded set
|
372 |
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
|
368 |
Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
|
373 |
|
369 |
|
|
... |
|
... |
407 |
// inside NEAR, all others must be leafs.
|
403 |
// inside NEAR, all others must be leafs.
|
408 |
bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
|
404 |
bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
|
409 |
|
405 |
|
410 |
string sterm;
|
406 |
string sterm;
|
411 |
list<string>exp;
|
407 |
list<string>exp;
|
412 |
stripExpandTerm(nostemexp, *it, exp, sterm);
|
408 |
expandTerm(nostemexp, *it, exp, sterm);
|
413 |
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
409 |
groups.push_back(vector<string>(exp.begin(), exp.end()));
|
414 |
addPrefix(exp, m_prefix);
|
410 |
addPrefix(exp, m_prefix);
|
415 |
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
411 |
orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
|
416 |
exp.begin(), exp.end()));
|
412 |
exp.begin(), exp.end()));
|
417 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
413 |
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
|
|
... |
|
... |
446 |
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
442 |
* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
|
447 |
* composition of the phrase terms (no stem expansion in this case)
|
443 |
* composition of the phrase terms (no stem expansion in this case)
|
448 |
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
444 |
* @return the subquery count (either or'd stem-expanded terms or phrase word
|
449 |
* count)
|
445 |
* count)
|
450 |
*/
|
446 |
*/
|
451 |
bool StringToXapianQ::processUserString(const string &iq,
|
447 |
bool StringToXapianQ::processUserString(const string &_iq,
|
452 |
string &ermsg,
|
448 |
string &ermsg,
|
453 |
list<Xapian::Query> &pqueries,
|
449 |
list<Xapian::Query> &pqueries,
|
454 |
const StopList& stops,
|
450 |
const StopList& stops,
|
455 |
int slack,
|
451 |
int slack,
|
456 |
bool useNear
|
452 |
bool useNear
|
457 |
)
|
453 |
)
|
458 |
{
|
454 |
{
|
459 |
LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
|
455 |
LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
|
460 |
ermsg.erase();
|
456 |
ermsg.erase();
|
461 |
m_terms.clear();
|
457 |
m_terms.clear();
|
462 |
m_groups.clear();
|
458 |
m_groups.clear();
|
|
|
459 |
|
|
|
460 |
// First unaccent/normalize the input: do it first so that it
|
|
|
461 |
// happens in the same order as when indexing: unac then split. As
|
|
|
462 |
// the character count can change during normalisation, this is
|
|
|
463 |
// especially important for CJK because the artificial CJK split is
|
|
|
464 |
// based on character counts
|
|
|
465 |
string iq;
|
|
|
466 |
dumb_string(_iq, iq);
|
463 |
|
467 |
|
464 |
// Simple whitespace-split input into user-level words and
|
468 |
// Simple whitespace-split input into user-level words and
|
465 |
// double-quoted phrases: word1 word2 "this is a phrase". The text
|
469 |
// double-quoted phrases: word1 word2 "this is a phrase". The text
|
466 |
// splitter may further still decide that the resulting "words"
|
470 |
// splitter may further still decide that the resulting "words"
|
467 |
// are really phrases, this depends on separators: [paul@dom.net]
|
471 |
// are really phrases, this depends on separators: [paul@dom.net]
|