Switch to unified view

a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
...
...
181
class wsQData : public TextSplitCB {
181
class wsQData : public TextSplitCB {
182
 public:
182
 public:
183
    wsQData(const StopList &_stops) 
183
    wsQData(const StopList &_stops) 
184
    : stops(_stops), alltermcount(0)
184
    : stops(_stops), alltermcount(0)
185
    {}
185
    {}
186
    bool takeword(const std::string &interm, int , int, int) {
187
  alltermcount++;
188
  LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
189
190
  // Check if the first letter is a majuscule in which
191
  // case we do not want to do stem expansion. Note that
192
  // the test is convoluted and possibly problematic
193
  string noacterm, noaclowterm;
194
  if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
195
      LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
196
      return true;
197
  } 
198
  if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
199
      LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
200
      return true;
201
  }
202
  bool nostemexp = false;
203
  Utf8Iter it1(noacterm);
204
  Utf8Iter it2(noaclowterm);
205
  if (*it1 != *it2)
206
      nostemexp = true;
207
208
  if (stops.hasStops() && stops.isStop(noaclowterm)) {
209
      LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
210
      return true;
211
  }
212
  terms.push_back(noaclowterm);
213
  nostemexps.push_back(nostemexp);
214
  return true;
215
    }
216
186
    vector<string> terms;
217
    vector<string> terms;
187
    bool takeword(const std::string &term, int , int, int) {
218
    vector<bool>   nostemexps;
188
  alltermcount++;
189
  LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
190
  if (stops.hasStops() && stops.isStop(term)) {
191
      LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
192
      return true;
193
  }
194
  terms.push_back(term);
195
  return true;
196
    }
197
    const StopList &stops;
219
    const StopList &stops;
198
    // Count of terms including stopwords: this is for adjusting
220
    // Count of terms including stopwords: this is for adjusting
199
    // phrase/near slack
221
    // phrase/near slack
200
    int alltermcount; 
222
    int alltermcount; 
201
};
223
};
...
...
230
252
231
private:
253
private:
232
    void expandTerm(bool dont, const string& term, list<string>& exp, 
254
    void expandTerm(bool dont, const string& term, list<string>& exp, 
233
              string& sterm);
255
              string& sterm);
234
    // After splitting entry on whitespace: process non-phrase element
256
    // After splitting entry on whitespace: process non-phrase element
235
    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
257
    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
236
    // Process phrase/near element
258
    // Process phrase/near element
237
    void processPhraseOrNear(wsQData *splitData, 
259
    void processPhraseOrNear(wsQData *splitData, 
238
                 list<Xapian::Query> &pqueries,
260
                 list<Xapian::Query> &pqueries,
239
                 bool useNear, int slack);
261
                 bool useNear, int slack);
240
262
...
...
277
    // No stemming if there are wildcards or prevented globally.
299
    // No stemming if there are wildcards or prevented globally.
278
    if (haswild || m_stemlang.empty())
300
    if (haswild || m_stemlang.empty())
279
    nostemexp = true;
301
    nostemexp = true;
280
302
281
    if (!nostemexp) {
303
    if (!nostemexp) {
282
  // Check if the first letter is a majuscule in which
283
  // case we do not want to do stem expansion. Note that
284
  // the test is convoluted and possibly problematic
285
286
  string noacterm, noaclowterm;
287
  if (unacmaybefold(term, noacterm, "UTF-8", false) &&
288
      unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
289
      Utf8Iter it1(noacterm);
290
      Utf8Iter it2(noaclowterm);
291
      if (*it1 != *it2)
292
      nostemexp = true;
293
  }
294
    }
304
    }
295
305
296
    if (nostemexp && !haswild) {
306
    if (nostemexp && !haswild) {
297
    // Neither stemming nor wildcard expansion: just the word
307
    // Neither stemming nor wildcard expansion: just the word
298
    sterm = term;
308
    sterm = term;
...
...
354
    return;
364
    return;
355
    for (list<string>::iterator it = terms.begin(); it != terms.end(); it++)
365
    for (list<string>::iterator it = terms.begin(); it != terms.end(); it++)
356
    it->insert(0, prefix);
366
    it->insert(0, prefix);
357
}
367
}
358
368
359
void StringToXapianQ::processSimpleSpan(const string& span, 
369
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
360
                    list<Xapian::Query> &pqueries)
370
                    list<Xapian::Query> &pqueries)
361
{
371
{
362
    list<string> exp;  
372
    list<string> exp;  
363
    string sterm; // dumb version of user term
373
    string sterm; // dumb version of user term
364
    expandTerm(false, span, exp, sterm);
374
    expandTerm(nostemexp, span, exp, sterm);
365
    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
375
    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
366
    addPrefix(exp, m_prefix);
376
    addPrefix(exp, m_prefix);
367
    // Push either term or OR of stem-expanded set
377
    // Push either term or OR of stem-expanded set
368
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
378
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
369
379
...
...
394
    list<Xapian::Query> orqueries;
404
    list<Xapian::Query> orqueries;
395
    bool hadmultiple = false;
405
    bool hadmultiple = false;
396
    vector<vector<string> >groups;
406
    vector<vector<string> >groups;
397
407
398
    // Go through the list and perform stem/wildcard expansion for each element
408
    // Go through the list and perform stem/wildcard expansion for each element
409
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
399
    for (vector<string>::iterator it = splitData->terms.begin();
410
    for (vector<string>::iterator it = splitData->terms.begin();
400
     it != splitData->terms.end(); it++) {
411
     it != splitData->terms.end(); it++, nxit++) {
401
    // Adjust when we do stem expansion. Not inside phrases, and
412
    // Adjust when we do stem expansion. Not inside phrases, and
402
    // some versions of xapian will accept only one OR clause
413
    // some versions of xapian will accept only one OR clause
403
    // inside NEAR, all others must be leafs.
414
    // inside NEAR, all others must be leafs.
404
    bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;
415
    bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;
405
416
406
    string sterm;
417
    string sterm;
407
    list<string>exp;
418
    list<string>exp;
408
    expandTerm(nostemexp, *it, exp, sterm);
419
    expandTerm(nostemexp, *it, exp, sterm);
409
    groups.push_back(vector<string>(exp.begin(), exp.end()));
420
    groups.push_back(vector<string>(exp.begin(), exp.end()));
...
...
432
    m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
443
    m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
433
}
444
}
434
445
435
/** 
446
/** 
436
 * Turn user entry string (NOT query language) into a list of xapian queries.
447
 * Turn user entry string (NOT query language) into a list of xapian queries.
437
 * We just separate words and phrases, and do wildcard and stemp expansion,
448
 * We just separate words and phrases, and do wildcard and stem expansion,
449
 *
450
 * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
451
 * the GUI.
438
 *
452
 *
439
 * The final list contains one query for each term or phrase
453
 * The final list contains one query for each term or phrase
440
 *   - Elements corresponding to a stem-expanded part are an OP_OR
454
 *   - Elements corresponding to a stem-expanded part are an OP_OR
441
 *     composition of the stem-expanded terms (or a single term query).
455
 *     composition of the stem-expanded terms (or a single term query).
442
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
456
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
443
 *     composition of the phrase terms (no stem expansion in this case)
457
 *     composition of the phrase terms (no stem expansion in this case)
444
 * @return the subquery count (either or'd stem-expanded terms or phrase word
458
 * @return the subquery count (either or'd stem-expanded terms or phrase word
445
 *   count)
459
 *   count)
446
 */
460
 */
447
bool StringToXapianQ::processUserString(const string &_iq,
461
bool StringToXapianQ::processUserString(const string &iq,
448
                    string &ermsg,
462
                    string &ermsg,
449
                    list<Xapian::Query> &pqueries,
463
                    list<Xapian::Query> &pqueries,
450
                    const StopList& stops,
464
                    const StopList& stops,
451
                    int slack, 
465
                    int slack, 
452
                    bool useNear
466
                    bool useNear
453
                    )
467
                    )
454
{
468
{
455
    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
469
    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
456
    ermsg.erase();
470
    ermsg.erase();
457
    m_terms.clear();
471
    m_terms.clear();
458
    m_groups.clear();
472
    m_groups.clear();
459
473
460
    // First unaccent/normalize the input: do it first so that it
461
    // happens in the same order as when indexing: unac then split. As
462
    // the character count can change during normalisation, this is
463
    // specially important for cjk because the artificial cjk split is
464
    // based on character counts
465
    string iq;
466
    dumb_string(_iq, iq);
467
468
    // Simple whitespace-split input into user-level words and
474
    // Simple whitespace-split input into user-level words and
469
    // double-quoted phrases: word1 word2 "this is a phrase". The text
475
    // double-quoted phrases: word1 word2 "this is a phrase". 
476
    //
470
    // splitter may further still decide that the resulting "words"
477
    // The text splitter may further still decide that the resulting
471
    // are really phrases, this depends on separators: [paul@dom.net]
478
    // "words" are really phrases, this depends on separators:
472
    // would still be a word (span), but [about:me] will probably be
479
    // [paul@dom.net] would still be a word (span), but [about:me]
473
    // handled as a phrase.
480
    // will probably be handled as a phrase.
474
    list<string> phrases;
481
    list<string> phrases;
475
    TextSplit::stringToStrings(iq, phrases);
482
    TextSplit::stringToStrings(iq, phrases);
476
483
477
    // Process each element: textsplit into terms, handle stem/wildcard 
484
    // Process each element: textsplit into terms, handle stem/wildcard 
478
    // expansion and transform into an appropriate Xapian::Query
485
    // expansion and transform into an appropriate Xapian::Query
...
...
514
        LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
521
        LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
515
        switch (splitData->terms.size()) {
522
        switch (splitData->terms.size()) {
516
        case 0: 
523
        case 0: 
517
        continue;// ??
524
        continue;// ??
518
        case 1: 
525
        case 1: 
519
        processSimpleSpan(splitData->terms.front(), pqueries);
526
        processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
520
        break;
527
        break;
521
        default:
528
        default:
522
        processPhraseOrNear(splitData, pqueries, useNear, slack);
529
        processPhraseOrNear(splitData, pqueries, useNear, slack);
523
        }
530
        }
524
    }
531
    }