Switch to unified view

a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.30 2008-12-17 14:26:09 dockes Exp $ (C) 2006 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
227
    groups.insert(groups.end(), m_groups.begin(), m_groups.end());
227
    groups.insert(groups.end(), m_groups.begin(), m_groups.end());
228
    return true;
228
    return true;
229
    }
229
    }
230
230
231
private:
231
private:
232
    void stripExpandTerm(bool dont, const string& term, list<string>& exp, 
232
    void expandTerm(bool dont, const string& term, list<string>& exp, 
233
              string& sterm);
233
              string& sterm);
234
    // After splitting entry on whitespace: process non-phrase element
234
    // After splitting entry on whitespace: process non-phrase element
235
    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
235
    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
236
    // Process phrase/near element
236
    // Process phrase/near element
237
    void processPhraseOrNear(wsQData *splitData, 
237
    void processPhraseOrNear(wsQData *splitData, 
...
...
245
    // Single terms and phrases resulting from breaking up text;
245
    // Single terms and phrases resulting from breaking up text;
246
    vector<string>          m_terms;
246
    vector<string>          m_terms;
247
    vector<vector<string> > m_groups; 
247
    vector<vector<string> > m_groups; 
248
};
248
};
249
249
250
/** Unaccent and lowercase term, possibly expand stem and wildcards
250
/** Expand stem and wildcards
251
 *
251
 *
252
 * @param nostemexp don't perform stem expansion. This is mainly used to
252
 * @param nostemexp don't perform stem expansion. This is mainly used to
253
 *   prevent stem expansion inside phrases (because the user probably
253
 *   prevent stem expansion inside phrases (because the user probably
254
 *   does not expect it). This does NOT prevent wild card expansion.
254
 *   does not expect it). This does NOT prevent wild card expansion.
255
 *   Factors other than nostemexp can prevent stem expansion: 
255
 *   Factors other than nostemexp can prevent stem expansion: 
256
 *   a null stemlang, resulting from a global user preference, a
256
 *   a null stemlang, resulting from a global user preference, a
257
 *   capitalized term, or wildcard(s)
257
 *   capitalized term, or wildcard(s)
258
 * @param term input single word
258
 * @param term input single word
259
 * @param exp output expansion list
259
 * @param exp output expansion list
260
 * @param sterm output lower-cased+unaccented version of the input term 
260
 * @param sterm output original input term if there were no wildcards
261
 *              (only for stem expansion, not wildcards)
262
 */
261
 */
263
void StringToXapianQ::stripExpandTerm(bool nostemexp, 
262
void StringToXapianQ::expandTerm(bool nostemexp, 
264
                      const string& term, 
263
                      const string& term, 
265
                      list<string>& exp,
264
                      list<string>& exp,
266
                      string &sterm)
265
                      string &sterm)
267
{
266
{
268
    LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n", 
267
    LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n", 
269
         term.c_str(), m_stemlang.c_str(), nostemexp));
268
         term.c_str(), m_stemlang.c_str(), nostemexp));
270
    sterm.erase();
269
    sterm.erase();
271
    exp.clear();
270
    exp.clear();
272
    if (term.empty()) {
271
    if (term.empty()) {
273
    return;
272
    return;
274
    }
273
    }
275
    // term1 is lowercase and without diacritics
276
    string term1;
277
    dumb_string(term, term1);
278
274
279
    bool haswild = term.find_first_of("*?[") != string::npos;
275
    bool haswild = term.find_first_of("*?[") != string::npos;
280
276
281
    // No stemming if there are wildcards or prevented globally.
277
    // No stemming if there are wildcards or prevented globally.
282
    if (haswild || m_stemlang.empty())
278
    if (haswild || m_stemlang.empty())
...
...
297
    }
293
    }
298
    }
294
    }
299
295
300
    if (nostemexp && !haswild) {
296
    if (nostemexp && !haswild) {
301
    // Neither stemming nor wildcard expansion: just the word
297
    // Neither stemming nor wildcard expansion: just the word
302
    sterm = term1;
298
    sterm = term;
303
    exp.push_front(term1);
299
    exp.push_front(term);
304
    exp.resize(1);
300
    exp.resize(1);
305
    } else {
301
    } else {
306
    list<TermMatchEntry> l;
302
    list<TermMatchEntry> l;
307
    if (haswild) {
303
    if (haswild) {
308
        m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);
304
        m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
309
    } else {
305
    } else {
310
        sterm = term1;
306
        sterm = term;
311
        m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);
307
        m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
312
    }
308
    }
313
    for (list<TermMatchEntry>::const_iterator it = l.begin(); 
309
    for (list<TermMatchEntry>::const_iterator it = l.begin(); 
314
         it != l.end(); it++) {
310
         it != l.end(); it++) {
315
        exp.push_back(it->term);
311
        exp.push_back(it->term);
316
    }
312
    }
...
...
363
void StringToXapianQ::processSimpleSpan(const string& span, 
359
void StringToXapianQ::processSimpleSpan(const string& span, 
364
                    list<Xapian::Query> &pqueries)
360
                    list<Xapian::Query> &pqueries)
365
{
361
{
366
    list<string> exp;  
362
    list<string> exp;  
367
    string sterm; // dumb version of user term
363
    string sterm; // dumb version of user term
368
    stripExpandTerm(false, span, exp, sterm);
364
    expandTerm(false, span, exp, sterm);
369
    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
365
    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
370
    addPrefix(exp, m_prefix);
366
    addPrefix(exp, m_prefix);
371
    // Push either term or OR of stem-expanded set
367
    // Push either term or OR of stem-expanded set
372
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
368
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
373
369
...
...
407
    // inside NEAR, all others must be leafs.
403
    // inside NEAR, all others must be leafs.
408
    bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
404
    bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
409
405
410
    string sterm;
406
    string sterm;
411
    list<string>exp;
407
    list<string>exp;
412
    stripExpandTerm(nostemexp, *it, exp, sterm);
408
    expandTerm(nostemexp, *it, exp, sterm);
413
    groups.push_back(vector<string>(exp.begin(), exp.end()));
409
    groups.push_back(vector<string>(exp.begin(), exp.end()));
414
    addPrefix(exp, m_prefix);
410
    addPrefix(exp, m_prefix);
415
    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
411
    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
416
                      exp.begin(), exp.end()));
412
                      exp.begin(), exp.end()));
417
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
413
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
...
...
446
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
442
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
447
 *     composition of the phrase terms (no stem expansion in this case)
443
 *     composition of the phrase terms (no stem expansion in this case)
448
 * @return the subquery count (either or'd stem-expanded terms or phrase word
444
 * @return the subquery count (either or'd stem-expanded terms or phrase word
449
 *   count)
445
 *   count)
450
 */
446
 */
451
bool StringToXapianQ::processUserString(const string &iq,
447
bool StringToXapianQ::processUserString(const string &_iq,
452
                    string &ermsg,
448
                    string &ermsg,
453
                    list<Xapian::Query> &pqueries,
449
                    list<Xapian::Query> &pqueries,
454
                    const StopList& stops,
450
                    const StopList& stops,
455
                    int slack, 
451
                    int slack, 
456
                    bool useNear
452
                    bool useNear
457
                    )
453
                    )
458
{
454
{
459
    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
455
    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
460
    ermsg.erase();
456
    ermsg.erase();
461
    m_terms.clear();
457
    m_terms.clear();
462
    m_groups.clear();
458
    m_groups.clear();
459
460
    // First unaccent/normalize the input: do it first so that it
461
    // happens in the same order as when indexing: unac then split. As
462
    // the character count can change during normalisation, this is
463
    // especially important for CJK because the artificial CJK split is
464
    // based on character counts
465
    string iq;
466
    dumb_string(_iq, iq);
463
467
464
    // Simple whitespace-split input into user-level words and
468
    // Simple whitespace-split input into user-level words and
465
    // double-quoted phrases: word1 word2 "this is a phrase". The text
469
    // double-quoted phrases: word1 word2 "this is a phrase". The text
466
    // splitter may further still decide that the resulting "words"
470
    // splitter may further still decide that the resulting "words"
467
    // are really phrases, this depends on separators: [paul@dom.net]
471
    // are really phrases, this depends on separators: [paul@dom.net]