a b/src/rcldb/searchdatatox.cpp
1
/* Copyright (C) 2006 J.F.Dockes
2
 *   This program is free software; you can redistribute it and/or modify
3
 *   it under the terms of the GNU General Public License as published by
4
 *   the Free Software Foundation; either version 2 of the License, or
5
 *   (at your option) any later version.
6
 *
7
 *   This program is distributed in the hope that it will be useful,
8
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
 *   GNU General Public License for more details.
11
 *
12
 *   You should have received a copy of the GNU General Public License
13
 *   along with this program; if not, write to the
14
 *   Free Software Foundation, Inc.,
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16
 */
17
18
// Handle translation from rcl's SearchData structures to Xapian Queries
19
20
#include "autoconfig.h"
21
22
#include <stdio.h>
23
24
#include <string>
25
#include <vector>
26
#include <algorithm>
27
#include <sstream>
28
using namespace std;
29
30
#include "xapian.h"
31
32
#include "cstr.h"
33
#include "rcldb.h"
34
#include "rcldb_p.h"
35
#include "searchdata.h"
36
#include "debuglog.h"
37
#include "smallut.h"
38
#include "textsplit.h"
39
#include "unacpp.h"
40
#include "utf8iter.h"
41
#include "stoplist.h"
42
#include "rclconfig.h"
43
#include "termproc.h"
44
#include "synfamily.h"
45
#include "stemdb.h"
46
#include "expansiondbs.h"
47
#include "base64.h"
48
#include "daterange.h"
49
50
namespace Rcl {
51
52
// Iterator over a list of search clause pointers (see clausesToQuery()).
typedef vector<SearchDataClause*>::iterator qlist_it_t;

// Boost factor for what the user actually typed: passed as the wqf of the
// original term when it is OR'ed with its expansions, and as the
// OP_SCALE_WEIGHT factor applied to phrase queries.
static const int original_term_wqf_booster = 10;
55
56
// Expand categories and mime type wild card exps Categories are
57
// expanded against the configuration, mimetypes against the index
58
// (for wildcards).
59
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
60
{
61
    const RclConfig *cfg = db.getConf();
62
    if (!cfg) {
63
  LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
64
  return false;
65
    }
66
    vector<string> exptps;
67
68
    for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
69
  if (cfg->isMimeCategory(*it)) {
70
      vector<string>tps;
71
      cfg->getMimeCatTypes(*it, tps);
72
      exptps.insert(exptps.end(), tps.begin(), tps.end());
73
  } else {
74
      TermMatchResult res;
75
      string mt = stringtolower((const string&)*it);
76
      // We set casesens|diacsens to get an equivalent of ixTermMatch()
77
      db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
78
           mt, res, -1, "mtype");
79
      if (res.entries.empty()) {
80
      exptps.push_back(it->c_str());
81
      } else {
82
      for (vector<TermMatchEntry>::const_iterator rit = 
83
           res.entries.begin(); rit != res.entries.end(); rit++) {
84
          exptps.push_back(strip_prefix(rit->term));
85
      }
86
      }
87
  }
88
    }
89
    sort(exptps.begin(), exptps.end());
90
    exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());
91
92
    tps = exptps;
93
    return true;
94
}
95
96
// Error message used when a query reaches the getMaxCl() clause limit.
static const char *maxXapClauseMsg =
    "Maximum Xapian query size exceeded. "
    "Increase maxXapianClauses in the configuration. ";
// Additional hint, appended to the above only when o_index_stripchars is
// not set.
static const char *maxXapClauseCaseDiacMsg =
    "Or try to use case (C) or diacritics (D) sensitivity qualifiers, "
    "or less wildcards ?";
103
104
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, 
105
              vector<SearchDataClause*>& query, 
106
              string& reason, void *d)
107
{
108
    Xapian::Query xq;
109
    for (qlist_it_t it = query.begin(); it != query.end(); it++) {
110
  Xapian::Query nq;
111
  if (!(*it)->toNativeQuery(db, &nq)) {
112
      LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
113
          (*it)->getReason().c_str()));
114
      reason += (*it)->getReason() + " ";
115
      return false;
116
  }       
117
        if (nq.empty()) {
118
            LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
119
            continue;
120
        }
121
  // If this structure is an AND list, must use AND_NOT for excl clauses.
122
  // Else this is an OR list, and there can't be excl clauses (checked by
123
  // addClause())
124
  Xapian::Query::op op;
125
  if (tp == SCLT_AND) {
126
            if ((*it)->getexclude()) {
127
                op =  Xapian::Query::OP_AND_NOT;
128
            } else {
129
                op =  Xapian::Query::OP_AND;
130
            }
131
  } else {
132
      op = Xapian::Query::OP_OR;
133
  }
134
        if (xq.empty()) {
135
            if (op == Xapian::Query::OP_AND_NOT)
136
                xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
137
            else 
138
                xq = nq;
139
        } else {
140
            xq = Xapian::Query(op, xq, nq);
141
        }
142
  if (int(xq.get_length()) >= getMaxCl()) {
143
      LOGERR(("%s\n", maxXapClauseMsg));
144
      m_reason += maxXapClauseMsg;
145
      if (!o_index_stripchars)
146
      m_reason += maxXapClauseCaseDiacMsg;
147
      return false;
148
  }
149
    }
150
151
    LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length()));
152
153
    if (xq.empty())
154
  xq = Xapian::Query::MatchAll;
155
156
   *((Xapian::Query *)d) = xq;
157
    return true;
158
}
159
160
// Translate the whole search into a Xapian query, stored into *d (which
// must point to a Xapian::Query).
//
// The clause list is translated first (clausesToQuery()), then the
// optional parts are applied in this order: date range filter, document
// size filter, autophrase (OP_AND_MAYBE), file type filter (OP_FILTER),
// negative file type filter (OP_AND_NOT).
//
// Side effects: resets m_reason, reloads m_maxexp and m_maxcl from the
// configuration, fills in unset date bounds in m_dates, and replaces
// m_filetypes / m_nfiletypes with their expanded versions.
//
// @return false, with an explanation in m_reason, if the db has no
//   configuration or if the clause translation failed.
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
    LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
    m_reason.erase();

    // expandFileTypes() treats a missing configuration as possible, so do
    // not dereference the pointer blindly here either.
    if (!db.getConf()) {
        LOGERR(("SearchData::toNativeQuery: null configuration\n"));
        m_reason = "No configuration available for the index";
        return false;
    }
    db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
    db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);

    // Walk the clause list translating each in turn and building the 
    // Xapian query tree
    Xapian::Query xq;
    if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
        LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n", 
                m_reason.c_str()));
        return false;
    }

    if (m_haveDates) {
        // If one of the extremities is unset, compute db extremas
        if (m_dates.y1 == 0 || m_dates.y2 == 0) {
            int minyear = 1970, maxyear = 2100;
            if (!db.maxYearSpan(&minyear, &maxyear)) {
                LOGERR(("Can't retrieve index min/max dates\n"));
                //whatever, go on.
            }

            if (m_dates.y1 == 0) {
                m_dates.y1 = minyear;
                m_dates.m1 = 1;
                m_dates.d1 = 1;
            }
            if (m_dates.y2 == 0) {
                m_dates.y2 = maxyear;
                m_dates.m2 = 12;
                m_dates.d2 = 31;
            }
        }
        LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
                m_dates.y1, m_dates.m1, m_dates.d1,
                m_dates.y2, m_dates.m2, m_dates.d2));
        Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
                                             m_dates.y2, m_dates.m2, m_dates.d2);
        if (dq.empty()) {
            LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
        }
        // If no probabilistic query is provided then promote the daterange
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
            xq = dq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
        }
    }

    // Size filtering. size_t(-1) means that the bound is unset.
    if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
        Xapian::Query sq;
        // Bounds are compared as strings, left-padded with zeroes to 12
        // characters by leftzeropad().
        char min[50], max[50];
        snprintf(min, sizeof(min), "%lld", (long long)m_minSize);
        snprintf(max, sizeof(max), "%lld", (long long)m_maxSize);
        if (m_minSize == size_t(-1)) {
            string value(max);
            leftzeropad(value, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
        } else if (m_maxSize == size_t(-1)) {
            string value(min);
            leftzeropad(value, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
        } else {
            string minvalue(min);
            leftzeropad(minvalue, 12);
            string maxvalue(max);
            leftzeropad(maxvalue, 12);
            sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE, 
                               minvalue, maxvalue);
        }

        // If no probabilistic query is provided then promote the
        // filter to be THE query instead of filtering an empty query.
        if (xq.empty()) {
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
            xq = sq;
        } else {
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
        }
    }

    // Add the autophrase if any. A failed autophrase translation is
    // ignored.
    if (m_autophrase.isNotNull()) {
        Xapian::Query apq;
        if (m_autophrase->toNativeQuery(db, &apq)) {
            xq = xq.empty() ? apq : 
                Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
        }
    }

    // Add the file type filtering clause if any
    if (!m_filetypes.empty()) {
        expandFileTypes(db, m_filetypes);

        Xapian::Query tq;
        for (vector<string>::iterator it = m_filetypes.begin(); 
             it != m_filetypes.end(); it++) {
            string term = wrap_prefix(mimetype_prefix) + *it;
            LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) : 
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
    }

    // Add the neg file type filtering clause if any
    if (!m_nfiletypes.empty()) {
        expandFileTypes(db, m_nfiletypes);

        Xapian::Query tq;
        for (vector<string>::iterator it = m_nfiletypes.begin(); 
             it != m_nfiletypes.end(); it++) {
            string term = wrap_prefix(mimetype_prefix) + *it;
            LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
            tq = tq.empty() ? Xapian::Query(term) : 
                Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
        }
        xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
    }

    *((Xapian::Query *)d) = xq;
    return true;
}
290
291
// Splitter callback for breaking a user string into simple terms and
292
// phrases. This is for parts of the user entry which would appear as
293
// a single word because there is no white space inside, but are
294
// actually multiple terms to rcldb (ie term1,term2)
295
class TextSplitQ : public TextSplitP {
296
 public:
297
    TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
298
  : TextSplitP(prc, flags), 
299
    curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
300
    {}
301
302
    bool takeword(const std::string &term, int pos, int bs, int be) 
303
    {
304
  // Check if the first letter is a majuscule in which
305
  // case we do not want to do stem expansion. Need to do this
306
  // before unac of course...
307
  curnostemexp = unaciscapital(term);
308
309
  return TextSplitP::takeword(term, pos, bs, be);
310
    }
311
312
    bool           curnostemexp;
313
    vector<string> terms;
314
    vector<bool>   nostemexps;
315
    const StopList &stops;
316
    // Count of terms including stopwords: this is for adjusting
317
    // phrase/near slack
318
    int alltermcount; 
319
    int lastpos;
320
};
321
322
class TermProcQ : public TermProc {
323
public:
324
    TermProcQ() : TermProc(0), m_ts(0) {}
325
    void setTSQ(TextSplitQ *ts) {m_ts = ts;}
326
    
327
    bool takeword(const std::string &term, int pos, int bs, int be) 
328
    {
329
  m_ts->alltermcount++;
330
  if (m_ts->lastpos < pos)
331
      m_ts->lastpos = pos;
332
  bool noexpand = be ? m_ts->curnostemexp : true;
333
  LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", 
334
       term.c_str(), pos, noexpand));
335
  if (m_terms[pos].size() < term.size()) {
336
      m_terms[pos] = term;
337
      m_nste[pos] = noexpand;
338
  }
339
  return true;
340
    }
341
    bool flush()
342
    {
343
  for (map<int, string>::const_iterator it = m_terms.begin();
344
       it != m_terms.end(); it++) {
345
      m_ts->terms.push_back(it->second);
346
      m_ts->nostemexps.push_back(m_nste[it->first]);
347
  }
348
  return true;
349
    }
350
private:
351
    TextSplitQ *m_ts;
352
    map<int, string> m_terms;
353
    map<int, bool> m_nste;
354
};
355
356
357
#if 1
358
static void listVector(const string& what, const vector<string>&l)
359
{
360
    string a;
361
    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
362
        a = a + *it + " ";
363
    }
364
    LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
365
}
366
#endif
367
368
/** Expand term into term list, using appropriate mode: stem, wildcards, 
369
 *  diacritics... 
370
 *
371
 * @param mods stem expansion, case and diacritics sensitivity control.
372
 * @param term input single word
373
 * @param oexp output expansion list
374
 * @param sterm output original input term if there were no wildcards
375
 * @param prefix field prefix in index. We could recompute it, but the caller
376
 *  has it already. Used in the simple case where there is nothing to expand, 
377
 *  and we just return the prefixed term (else Db::termMatch deals with it).
378
 */
379
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, 
380
                  string& ermsg, int mods, 
381
                  const string& term, 
382
                  vector<string>& oexp, string &sterm,
383
                  const string& prefix)
384
{
385
    LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
386
       mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
387
    sterm.clear();
388
    oexp.clear();
389
    if (term.empty())
390
  return true;
391
392
    bool maxexpissoft = false;
393
    int maxexpand = getSoftMaxExp();
394
    if (maxexpand != -1) {
395
  maxexpissoft = true;
396
    } else {
397
  maxexpand = getMaxExp();
398
    }
399
400
    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
401
402
    // If there are no wildcards, add term to the list of user-entered terms
403
    if (!haswild) {
404
  m_hldata.uterms.insert(term);
405
        sterm = term;
406
    }
407
    // No stem expansion if there are wildcards or if prevented by caller
408
    bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
409
    if (haswild || getStemLang().empty()) {
410
  LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
411
  nostemexp = true;
412
    }
413
414
    // noexpansion can be modified further down by possible case/diac expansion
415
    bool noexpansion = nostemexp && !haswild; 
416
417
    int termmatchsens = 0;
418
419
    bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
420
    bool case_sensitive = (mods & SDCM_CASESENS) != 0;
421
422
    if (o_index_stripchars) {
423
  diac_sensitive = case_sensitive = false;
424
    } else {
425
  // If we are working with a raw index, apply the rules for case and 
426
  // diacritics sensitivity.
427
428
  // If any character has a diacritic, we become
429
  // diacritic-sensitive. Note that the way that the test is
430
  // performed (conversion+comparison) will automatically ignore
431
  // accented characters which are actually a separate letter
432
  if (getAutoDiac() && unachasaccents(term)) {
433
      LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
434
      diac_sensitive = true;
435
  }
436
437
  // If any character apart the first is uppercase, we become
438
  // case-sensitive.  The first character is reserved for
439
  // turning off stemming. You need to use a query language
440
  // modifier to search for Floor in a case-sensitive way.
441
  Utf8Iter it(term);
442
  it++;
443
  if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
444
      LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
445
      case_sensitive = true;
446
  }
447
448
  // If we are sensitive to case or diacritics turn stemming off
449
  if (diac_sensitive || case_sensitive) {
450
      LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
451
      nostemexp = true;
452
  }
453
454
  if (!case_sensitive || !diac_sensitive)
455
      noexpansion = false;
456
    }
457
458
    if (case_sensitive)
459
  termmatchsens |= Db::ET_CASESENS;
460
    if (diac_sensitive)
461
  termmatchsens |= Db::ET_DIACSENS;
462
463
    if (noexpansion) {
464
  oexp.push_back(prefix + term);
465
  m_hldata.terms[term] = term;
466
  LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
467
  return true;
468
    } 
469
470
    Db::MatchType mtyp = haswild ? Db::ET_WILD : 
471
  nostemexp ? Db::ET_NONE : Db::ET_STEM;
472
    TermMatchResult res;
473
    if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
474
            m_field)) {
475
  // Let it go through
476
    }
477
478
    // Term match entries to vector of terms
479
    if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
480
  ermsg = "Maximum term expansion size exceeded."
481
      " Maybe use case/diacritics sensitivity or increase maxTermExpand.";
482
  return false;
483
    }
484
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); 
485
   it != res.entries.end(); it++) {
486
  oexp.push_back(it->term);
487
    }
488
    // If the term does not exist at all in the db, the return from
489
    // termMatch() is going to be empty, which is not what we want (we
490
    // would then compute an empty Xapian query)
491
    if (oexp.empty())
492
  oexp.push_back(prefix + term);
493
494
    // Remember the uterm-to-expansion links
495
    for (vector<string>::const_iterator it = oexp.begin(); 
496
   it != oexp.end(); it++) {
497
  m_hldata.terms[strip_prefix(*it)] = term;
498
    }
499
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
500
    return true;
501
}
502
503
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
//
// @param vvit first string vector to process
// @param vvend end of the sequence of string vectors
// @param comb work area holding the combination being built. It is
//   restored to its input state before returning.
// @param allcombs output: every complete combination is appended to it.
//
// An empty input sequence, or a sequence containing an empty string
// vector, produces no combination.
void multiply_groups(std::vector<std::vector<std::string> >::const_iterator vvit,
                     std::vector<std::vector<std::string> >::const_iterator vvend,
                     std::vector<std::string>& comb,
                     std::vector<std::vector<std::string> >& allcombs)
{
    // Nothing to distribute. Without this check, the code below would
    // dereference the end iterator.
    if (vvit == vvend)
        return;

    // Remember my string vector and compute next, for recursive calls.
    std::vector<std::vector<std::string> >::const_iterator myvit = vvit++;

    // Walk the string vector I'm called upon and, for each string,
    // add it to current result, and call myself recursively on the
    // next string vector. The last call (last element of the vector of
    // vectors), adds the elementary result to the output
    for (std::vector<std::string>::const_iterator strit = myvit->begin();
         strit != myvit->end(); strit++) {

        // Add my current value to the string vector we're building
        comb.push_back(*strit);

        if (vvit == vvend) {
            // Last call: store current result
            allcombs.push_back(comb);
        } else {
            // Call recursively on next string vector
            multiply_groups(vvit, vvend, comb, allcombs);
        }
        // Pop the value I just added (make room for the next element in my
        // vector)
        comb.pop_back();
    }
}
536
537
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
538
                         const string& span, 
539
                         int mods, void * pq)
540
{
541
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
542
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
543
      span.c_str(), (unsigned int)mods));
544
    vector<string> exp;  
545
    string sterm; // dumb version of user term
546
547
    string prefix;
548
    const FieldTraits *ftp;
549
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
550
  prefix = wrap_prefix(ftp->pfx);
551
    }
552
553
    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
554
  return;
555
    
556
    // Set up the highlight data. No prefix should go in there
557
    for (vector<string>::const_iterator it = exp.begin(); 
558
   it != exp.end(); it++) {
559
  m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
560
  m_hldata.slacks.push_back(0);
561
  m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
562
    }
563
564
    // Push either term or OR of stem-expanded set
565
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
566
    m_curcl += exp.size();
567
568
    // If sterm (simplified original user term) is not null, give it a
569
    // relevance boost. We do this even if no expansion occurred (else
570
    // the non-expanded terms in a term list would end-up with even
571
    // less wqf). This does not happen if there are wildcards anywhere
572
    // in the search.
573
    // We normally boost the original term in the stem expansion list. Don't
574
    // do it if there are wildcards anywhere, this would skew the results.
575
    bool doBoostUserTerm = 
576
  (m_parentSearch && !m_parentSearch->haveWildCards()) || 
577
  (m_parentSearch == 0 && !m_haveWildCards);
578
    if (doBoostUserTerm && !sterm.empty()) {
579
        xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
580
             Xapian::Query(prefix+sterm, 
581
                   original_term_wqf_booster));
582
    }
583
    pqueries.push_back(xq);
584
}
585
586
// User entry element had several terms: transform into a PHRASE or
587
// NEAR xapian query, the elements of which can themselves be OR
588
// queries if the terms get expanded by stemming or wildcards (we
589
// don't do stemming for PHRASE though)
590
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
591
                       TextSplitQ *splitData, 
592
                       int mods, void *pq,
593
                       bool useNear, int slack)
594
{
595
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
596
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
597
  Xapian::Query::OP_PHRASE;
598
    vector<Xapian::Query> orqueries;
599
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
600
    bool hadmultiple = false;
601
#endif
602
    vector<vector<string> >groups;
603
604
    string prefix;
605
    const FieldTraits *ftp;
606
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
607
  prefix = wrap_prefix(ftp->pfx);
608
    }
609
610
    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
611
  orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
612
  slack++;
613
    }
614
615
    // Go through the list and perform stem/wildcard expansion for each element
616
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
617
    for (vector<string>::iterator it = splitData->terms.begin();
618
   it != splitData->terms.end(); it++, nxit++) {
619
  LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
620
  // Adjust when we do stem expansion. Not if disabled by
621
  // caller, not inside phrases, and some versions of xapian
622
  // will accept only one OR clause inside NEAR.
623
  bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
624
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
625
      || hadmultiple
626
#endif // single OR inside NEAR
627
      ;
628
  int lmods = mods;
629
  if (nostemexp)
630
      lmods |= SearchDataClause::SDCM_NOSTEMMING;
631
  string sterm;
632
  vector<string> exp;
633
  if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
634
      return;
635
  LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
636
  listVector("", exp);
637
  // groups is used for highlighting, we don't want prefixes in there.
638
  vector<string> noprefs;
639
  for (vector<string>::const_iterator it = exp.begin(); 
640
       it != exp.end(); it++) {
641
      noprefs.push_back(it->substr(prefix.size()));
642
  }
643
  groups.push_back(noprefs);
644
  orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
645
                    exp.begin(), exp.end()));
646
  m_curcl += exp.size();
647
  if (m_curcl >= getMaxCl())
648
      return;
649
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
650
  if (exp.size() > 1) 
651
      hadmultiple = true;
652
#endif
653
    }
654
655
    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
656
  orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
657
  slack++;
658
    }
659
660
    // Generate an appropriate PHRASE/NEAR query with adjusted slack
661
    // For phrases, give a relevance boost like we do for original terms
662
    LOGDEB2(("PHRASE/NEAR:  alltermcount %d lastpos %d\n", 
663
             splitData->alltermcount, splitData->lastpos));
664
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
665
           splitData->lastpos + 1 + slack);
666
    if (op == Xapian::Query::OP_PHRASE)
667
  xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, 
668
             original_term_wqf_booster);
669
    pqueries.push_back(xq);
670
671
    // Add all combinations of NEAR/PHRASE groups to the highlighting data. 
672
    vector<vector<string> > allcombs;
673
    vector<string> comb;
674
    multiply_groups(groups.begin(), groups.end(), comb, allcombs);
675
    
676
    // Insert the search groups and slacks in the highlight data, with
677
    // a reference to the user entry that generated them:
678
    m_hldata.groups.insert(m_hldata.groups.end(), 
679
             allcombs.begin(), allcombs.end());
680
    m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
681
    m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(), 
682
                m_hldata.ugroups.size() - 1);
683
}
684
685
// Trim string beginning with ^ or ending with $ and convert to flags
686
static int stringToMods(string& s)
687
{
688
    int mods = 0;
689
    // Check for an anchored search
690
    trimstring(s);
691
    if (s.length() > 0 && s[0] == '^') {
692
  mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
693
  s.erase(0, 1);
694
    }
695
    if (s.length() > 0 && s[s.length()-1] == '$') {
696
  mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
697
  s.erase(s.length()-1);
698
    }
699
    return mods;
700
}
701
702
/** 
703
 * Turn user entry string (NOT query language) into a list of xapian queries.
704
 * We just separate words and phrases, and do wildcard and stem expansion,
705
 *
706
 * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
707
 * the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
708
 * entry).
709
 *
710
 * This appears awful, and it would seem that the split into
711
 * terms/phrases should be performed in the upper layer so that we
712
 * only receive pure term or near/phrase pure elements here, but in
713
 * fact there are things that would appear like terms to naive code,
714
 * and which will actually may be turned into phrases (ie: tom:jerry),
715
 * in a manner which intimately depends on the index implementation,
716
 * so that it makes sense to process this here.
717
 *
718
 * The final list contains one query for each term or phrase
719
 *   - Elements corresponding to a stem-expanded part are an OP_OR
720
 *     composition of the stem-expanded terms (or a single term query).
721
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
722
 *     composition of the phrase terms (no stem expansion in this case)
723
 * @return the subquery count (either or'd stem-expanded terms or phrase word
724
 *   count)
725
 */
726
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
727
                         string &ermsg, void *pq, 
728
                         int slack, bool useNear)
729
{
730
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
731
    int mods = m_modifiers;
732
733
    LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
734
      "slack %d near %d\n", 
735
      iq.c_str(), m_field.c_str(), mods, slack, useNear));
736
    ermsg.erase();
737
    m_curcl = 0;
738
    const StopList stops = db.getStopList();
739
740
    // Simple whitespace-split input into user-level words and
741
    // double-quoted phrases: word1 word2 "this is a phrase". 
742
    //
743
    // The text splitter may further still decide that the resulting
744
    // "words" are really phrases, this depends on separators:
745
    // [paul@dom.net] would still be a word (span), but [about:me]
746
    // will probably be handled as a phrase.
747
    vector<string> phrases;
748
    TextSplit::stringToStrings(iq, phrases);
749
750
    // Process each element: textsplit into terms, handle stem/wildcard 
751
    // expansion and transform into an appropriate Xapian::Query
752
    try {
753
  for (vector<string>::iterator it = phrases.begin(); 
754
       it != phrases.end(); it++) {
755
      LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
756
      // Anchoring modifiers
757
      int amods = stringToMods(*it);
758
      int terminc = amods != 0 ? 1 : 0;
759
      mods |= amods;
760
      // If there are multiple spans in this element, including
761
      // at least one composite, we have to increase the slack
762
      // else a phrase query including a span would fail. 
763
      // Ex: "term0@term1 term2" is onlyspans-split as:
764
      //   0 term0@term1             0   12
765
      //   2 term2                  13   18
766
      // The position of term2 is 2, not 1, so a phrase search
767
      // would fail.
768
      // We used to do  word split, searching for 
769
      // "term0 term1 term2" instead, which may have worse 
770
      // performance, but will succeed.
771
      // We now adjust the phrase/near slack by comparing the term count
772
      // and the last position
773
774
      // The term processing pipeline:
775
      TermProcQ tpq;
776
      TermProc *nxt = &tpq;
777
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
778
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
779
            //tpcommon.onlygrams(true);
780
      TermProcPrep tpprep(nxt);
781
      if (o_index_stripchars)
782
      nxt = &tpprep;
783
784
      TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
785
                       TextSplit::TXTS_KEEPWILD), 
786
              stops, nxt);
787
      tpq.setTSQ(&splitter);
788
      splitter.text_to_words(*it);
789
790
      slack += splitter.lastpos - splitter.terms.size() + 1;
791
792
      LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
793
      switch (splitter.terms.size() + terminc) {
794
      case 0: 
795
      continue;// ??
796
      case 1: {
797
      int lmods = mods;
798
      if (splitter.nostemexps.front())
799
          lmods |= SearchDataClause::SDCM_NOSTEMMING;
800
      m_hldata.ugroups.push_back(splitter.terms);
801
      processSimpleSpan(db, ermsg, splitter.terms.front(),
802
                lmods, &pqueries);
803
      }
804
      break;
805
      default:
806
      m_hldata.ugroups.push_back(splitter.terms);
807
      processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
808
                  useNear, slack);
809
      }
810
      if (m_curcl >= getMaxCl()) {
811
      ermsg = maxXapClauseMsg;
812
      if (!o_index_stripchars)
813
          ermsg += maxXapClauseCaseDiacMsg;
814
      break;
815
      }
816
  }
817
    } catch (const Xapian::Error &e) {
818
  ermsg = e.get_msg();
819
    } catch (const string &s) {
820
  ermsg = s;
821
    } catch (const char *s) {
822
  ermsg = s;
823
    } catch (...) {
824
  ermsg = "Caught unknown exception";
825
    }
826
    if (!ermsg.empty()) {
827
  LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
828
  return false;
829
    }
830
    return true;
831
}
832
833
// Translate a simple OR or AND search clause. 
834
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
835
{
836
    LOGDEB(("SearchDataClauseSimple::toNativeQuery: fld [%s] val [%s] "
837
            "stemlang [%s]\n", m_field.c_str(), m_text.c_str(),
838
            getStemLang().c_str()));
839
840
    Xapian::Query *qp = (Xapian::Query *)p;
841
    *qp = Xapian::Query();
842
843
    Xapian::Query::op op;
844
    switch (m_tp) {
845
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
846
    case SCLT_OR: op = Xapian::Query::OP_OR; break;
847
    default:
848
  LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
849
  return false;
850
    }
851
852
    vector<Xapian::Query> pqueries;
853
    if (!processUserString(db, m_text, m_reason, &pqueries))
854
  return false;
855
    if (pqueries.empty()) {
856
  LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
857
  return true;
858
    }
859
860
    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
861
    if  (m_weight != 1.0) {
862
  *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
863
    }
864
    return true;
865
}
866
867
// Translate a FILENAME search clause. This always comes
868
// from a "filename" search from the gui or recollq. A query language
869
// "filename:"-prefixed field will not go through here, but through
870
// the generic field-processing code.
871
//
872
// We do not split the entry any more (used to do some crazy thing
873
// about expanding multiple fragments in the past). We just take the
874
// value blanks and all and expand this against the indexed unsplit
875
// file names
876
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
877
{
878
    Xapian::Query *qp = (Xapian::Query *)p;
879
    *qp = Xapian::Query();
880
881
    int maxexp = getSoftMaxExp();
882
    if (maxexp == -1)
883
  maxexp = getMaxExp();
884
885
    vector<string> names;
886
    db.filenameWildExp(m_text, names, maxexp);
887
    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
888
889
    if (m_weight != 1.0) {
890
  *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
891
    }
892
    return true;
893
}
894
895
// Translate a dir: path filtering clause. See comments in .h
896
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
897
{
898
    LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
899
    Xapian::Query *qp = (Xapian::Query *)p;
900
    *qp = Xapian::Query();
901
902
    if (m_text.empty()) {
903
  LOGERR(("SearchDataClausePath: empty path??\n"));
904
  m_reason = "Empty path ?";
905
  return false;
906
    }
907
908
    vector<Xapian::Query> orqueries;
909
910
    if (m_text[0] == '/')
911
  orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
912
    else
913
        m_text = path_tildexpand(m_text);
914
915
    vector<string> vpath;
916
    stringToTokens(m_text, vpath, "/");
917
918
    for (vector<string>::const_iterator pit = vpath.begin(); 
919
   pit != vpath.end(); pit++){
920
921
  string sterm;
922
  vector<string> exp;
923
  if (!expandTerm(db, m_reason, 
924
          SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
925
          *pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
926
      return false;
927
  }
928
  LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
929
  listVector("", exp);
930
  if (exp.size() == 1)
931
      orqueries.push_back(Xapian::Query(exp[0]));
932
  else 
933
      orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
934
                        exp.begin(), exp.end()));
935
  m_curcl += exp.size();
936
  if (m_curcl >= getMaxCl())
937
      return false;
938
    }
939
940
    *qp = Xapian::Query(Xapian::Query::OP_PHRASE, 
941
          orqueries.begin(), orqueries.end());
942
943
    if (m_weight != 1.0) {
944
  *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
945
    }
946
    return true;
947
}
948
949
// Translate NEAR or PHRASE clause. 
950
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
951
{
952
    LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
953
954
    Xapian::Query *qp = (Xapian::Query *)p;
955
    *qp = Xapian::Query();
956
957
    vector<Xapian::Query> pqueries;
958
    Xapian::Query nq;
959
960
    // We produce a single phrase out of the user entry then use
961
    // stringToXapianQueries() to lowercase and simplify the phrase
962
    // terms etc. This will result into a single (complex)
963
    // Xapian::Query.
964
    if (m_text.find('\"') != string::npos) {
965
  m_text = neutchars(m_text, "\"");
966
    }
967
    string s = cstr_dquote + m_text + cstr_dquote;
968
    bool useNear = (m_tp == SCLT_NEAR);
969
    if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
970
  return false;
971
    if (pqueries.empty()) {
972
  LOGERR(("SearchDataClauseDist: resolved to null query\n"));
973
  return true;
974
    }
975
976
    *qp = *pqueries.begin();
977
    if (m_weight != 1.0) {
978
  *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
979
    }
980
    return true;
981
}
982
983
} // Namespace Rcl