Switch to unified view

a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
...
...
50
namespace Rcl {
50
namespace Rcl {
51
51
52
typedef  vector<SearchDataClause *>::iterator qlist_it_t;
52
typedef  vector<SearchDataClause *>::iterator qlist_it_t;
53
typedef  vector<SearchDataClause *>::const_iterator qlist_cit_t;
53
typedef  vector<SearchDataClause *>::const_iterator qlist_cit_t;
54
54
55
// Relevance boost for what the user actually typed: passed as the second
// argument when building the query for the original (unexpanded) user term,
// and as the OP_SCALE_WEIGHT factor applied to PHRASE queries.
static const int original_term_wqf_booster = 10;
56
57
void SearchData::commoninit()
55
void SearchData::commoninit()
58
{
56
{
59
    m_haveDates = false;
57
    m_haveDates = false;
60
    m_maxSize = size_t(-1);
58
    m_maxSize = size_t(-1);
61
    m_minSize = size_t(-1);
59
    m_minSize = size_t(-1);
...
...
70
SearchData::~SearchData() 
68
SearchData::~SearchData() 
71
{
69
{
72
    LOGDEB0(("SearchData::~SearchData\n"));
70
    LOGDEB0(("SearchData::~SearchData\n"));
73
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
71
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
74
    delete *it;
72
    delete *it;
75
}
76
77
// Expand categories and mime type wild card exps Categories are
78
// expanded against the configuration, mimetypes against the index
79
// (for wildcards).
80
bool SearchData::expandFileTypes(Db &db, vector<string>& tps)
81
{
82
    const RclConfig *cfg = db.getConf();
83
    if (!cfg) {
84
  LOGFATAL(("Db::expandFileTypes: null configuration!!\n"));
85
  return false;
86
    }
87
    vector<string> exptps;
88
89
    for (vector<string>::iterator it = tps.begin(); it != tps.end(); it++) {
90
  if (cfg->isMimeCategory(*it)) {
91
      vector<string>tps;
92
      cfg->getMimeCatTypes(*it, tps);
93
      exptps.insert(exptps.end(), tps.begin(), tps.end());
94
  } else {
95
      TermMatchResult res;
96
      string mt = stringtolower((const string&)*it);
97
      // We set casesens|diacsens to get an equivalent of ixTermMatch()
98
      db.termMatch(Db::ET_WILD|Db::ET_CASESENS|Db::ET_DIACSENS, string(),
99
           mt, res, -1, "mtype");
100
      if (res.entries.empty()) {
101
      exptps.push_back(it->c_str());
102
      } else {
103
      for (vector<TermMatchEntry>::const_iterator rit = 
104
           res.entries.begin(); rit != res.entries.end(); rit++) {
105
          exptps.push_back(strip_prefix(rit->term));
106
      }
107
      }
108
  }
109
    }
110
    sort(exptps.begin(), exptps.end());
111
    exptps.erase(unique(exptps.begin(), exptps.end()), exptps.end());
112
113
    tps = exptps;
114
    return true;
115
}
116
117
// Message used when the query being built reaches the configured clause
// limit (see the getMaxCl() checks).
static const char *maxXapClauseMsg = 
    "Maximum Xapian query size exceeded. "
    "Increase maxXapianClauses in the configuration. ";
// Additional hint appended to the above when the index is not stripped of
// case and diacritics (!o_index_stripchars).
static const char *maxXapClauseCaseDiacMsg = 
    "Or try to use case (C) or diacritics (D) sensitivity qualifiers, "
    "or less wildcards ?";
124
125
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, 
126
              vector<SearchDataClause*>& query, 
127
              string& reason, void *d)
128
{
129
    Xapian::Query xq;
130
    for (qlist_it_t it = query.begin(); it != query.end(); it++) {
131
  Xapian::Query nq;
132
  if (!(*it)->toNativeQuery(db, &nq)) {
133
      LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
134
          (*it)->getReason().c_str()));
135
      reason += (*it)->getReason() + " ";
136
      return false;
137
  }       
138
        if (nq.empty()) {
139
            LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
140
            continue;
141
        }
142
  // If this structure is an AND list, must use AND_NOT for excl clauses.
143
  // Else this is an OR list, and there can't be excl clauses (checked by
144
  // addClause())
145
  Xapian::Query::op op;
146
  if (tp == SCLT_AND) {
147
            if ((*it)->getexclude()) {
148
                op =  Xapian::Query::OP_AND_NOT;
149
            } else {
150
                op =  Xapian::Query::OP_AND;
151
            }
152
  } else {
153
      op = Xapian::Query::OP_OR;
154
  }
155
        if (xq.empty()) {
156
            if (op == Xapian::Query::OP_AND_NOT)
157
                xq = Xapian::Query(op, Xapian::Query::MatchAll, nq);
158
            else 
159
                xq = nq;
160
        } else {
161
            xq = Xapian::Query(op, xq, nq);
162
        }
163
  if (int(xq.get_length()) >= getMaxCl()) {
164
      LOGERR(("%s\n", maxXapClauseMsg));
165
      m_reason += maxXapClauseMsg;
166
      if (!o_index_stripchars)
167
      m_reason += maxXapClauseCaseDiacMsg;
168
      return false;
169
  }
170
    }
171
172
    LOGDEB0(("SearchData::clausesToQuery: got %d clauses\n", xq.get_length()));
173
174
    if (xq.empty())
175
  xq = Xapian::Query::MatchAll;
176
177
   *((Xapian::Query *)d) = xq;
178
    return true;
179
}
180
181
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
182
{
183
    LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
184
    m_reason.erase();
185
186
    db.getConf()->getConfParam("maxTermExpand", &m_maxexp);
187
    db.getConf()->getConfParam("maxXapianClauses", &m_maxcl);
188
189
    // Walk the clause list translating each in turn and building the 
190
    // Xapian query tree
191
    Xapian::Query xq;
192
    if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
193
  LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n", 
194
      m_reason.c_str()));
195
  return false;
196
    }
197
198
    if (m_haveDates) {
199
        // If one of the extremities is unset, compute db extremas
200
        if (m_dates.y1 == 0 || m_dates.y2 == 0) {
201
            int minyear = 1970, maxyear = 2100;
202
            if (!db.maxYearSpan(&minyear, &maxyear)) {
203
                LOGERR(("Can't retrieve index min/max dates\n"));
204
                //whatever, go on.
205
            }
206
207
            if (m_dates.y1 == 0) {
208
                m_dates.y1 = minyear;
209
                m_dates.m1 = 1;
210
                m_dates.d1 = 1;
211
            }
212
            if (m_dates.y2 == 0) {
213
                m_dates.y2 = maxyear;
214
                m_dates.m2 = 12;
215
                m_dates.d2 = 31;
216
            }
217
        }
218
        LOGDEB(("Db::toNativeQuery: date interval: %d-%d-%d/%d-%d-%d\n",
219
                m_dates.y1, m_dates.m1, m_dates.d1,
220
                m_dates.y2, m_dates.m2, m_dates.d2));
221
        Xapian::Query dq = date_range_filter(m_dates.y1, m_dates.m1, m_dates.d1,
222
                m_dates.y2, m_dates.m2, m_dates.d2);
223
        if (dq.empty()) {
224
            LOGINFO(("Db::toNativeQuery: date filter is empty\n"));
225
        }
226
        // If no probabilistic query is provided then promote the daterange
227
        // filter to be THE query instead of filtering an empty query.
228
        if (xq.empty()) {
229
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
230
            xq = dq;
231
        } else {
232
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, dq);
233
        }
234
    }
235
236
237
    if (m_minSize != size_t(-1) || m_maxSize != size_t(-1)) {
238
        Xapian::Query sq;
239
  char min[50], max[50];
240
  sprintf(min, "%lld", (long long)m_minSize);
241
  sprintf(max, "%lld", (long long)m_maxSize);
242
  if (m_minSize == size_t(-1)) {
243
      string value(max);
244
      leftzeropad(value, 12);
245
      sq = Xapian::Query(Xapian::Query::OP_VALUE_LE, VALUE_SIZE, value);
246
  } else if (m_maxSize == size_t(-1)) {
247
      string value(min);
248
      leftzeropad(value, 12);
249
      sq = Xapian::Query(Xapian::Query::OP_VALUE_GE, VALUE_SIZE, value);
250
  } else {
251
      string minvalue(min);
252
      leftzeropad(minvalue, 12);
253
      string maxvalue(max);
254
      leftzeropad(maxvalue, 12);
255
      sq = Xapian::Query(Xapian::Query::OP_VALUE_RANGE, VALUE_SIZE, 
256
                 minvalue, maxvalue);
257
  }
258
      
259
        // If no probabilistic query is provided then promote the
260
        // filter to be THE query instead of filtering an empty query.
261
        if (xq.empty()) {
262
            LOGINFO(("Db::toNativeQuery: proba query is empty\n"));
263
            xq = sq;
264
        } else {
265
            xq = Xapian::Query(Xapian::Query::OP_FILTER, xq, sq);
266
        }
267
    }
268
269
    // Add the autophrase if any
270
    if (m_autophrase.isNotNull()) {
271
  Xapian::Query apq;
272
  if (m_autophrase->toNativeQuery(db, &apq)) {
273
      xq = xq.empty() ? apq : 
274
      Xapian::Query(Xapian::Query::OP_AND_MAYBE, xq, apq);
275
  }
276
    }
277
278
    // Add the file type filtering clause if any
279
    if (!m_filetypes.empty()) {
280
  expandFileTypes(db, m_filetypes);
281
      
282
  Xapian::Query tq;
283
  for (vector<string>::iterator it = m_filetypes.begin(); 
284
       it != m_filetypes.end(); it++) {
285
      string term = wrap_prefix(mimetype_prefix) + *it;
286
      LOGDEB0(("Adding file type term: [%s]\n", term.c_str()));
287
      tq = tq.empty() ? Xapian::Query(term) : 
288
      Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
289
  }
290
  xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_FILTER, xq, tq);
291
    }
292
293
    // Add the neg file type filtering clause if any
294
    if (!m_nfiletypes.empty()) {
295
  expandFileTypes(db, m_nfiletypes);
296
      
297
  Xapian::Query tq;
298
  for (vector<string>::iterator it = m_nfiletypes.begin(); 
299
       it != m_nfiletypes.end(); it++) {
300
      string term = wrap_prefix(mimetype_prefix) + *it;
301
      LOGDEB0(("Adding negative file type term: [%s]\n", term.c_str()));
302
      tq = tq.empty() ? Xapian::Query(term) : 
303
      Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
304
  }
305
  xq = xq.empty() ? tq : Xapian::Query(Xapian::Query::OP_AND_NOT, xq, tq);
306
    }
307
308
    *((Xapian::Query *)d) = xq;
309
    return true;
310
}
73
}
311
74
312
// This is called by the GUI simple search if the option is set: add
75
// This is called by the GUI simple search if the option is set: add
313
// (OR) phrase to a query (if it is simple enough) so that results
76
// (OR) phrase to a query (if it is simple enough) so that results
314
// where the search terms are close and in order will come up on top.
77
// where the search terms are close and in order will come up on top.
...
...
426
    for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++)
189
    for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++)
427
    (*it)->getTerms(hld);
190
    (*it)->getTerms(hld);
428
    return;
191
    return;
429
}
192
}
430
193
431
// Splitter callback for breaking a user string into simple terms and
432
// phrases. This is for parts of the user entry which would appear as
433
// a single word because there is no white space inside, but are
434
// actually multiple terms to rcldb (ie term1,term2)
435
class TextSplitQ : public TextSplitP {
436
 public:
437
    TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
438
  : TextSplitP(prc, flags), 
439
    curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
440
    {}
441
442
    bool takeword(const std::string &term, int pos, int bs, int be) 
443
    {
444
  // Check if the first letter is a majuscule in which
445
  // case we do not want to do stem expansion. Need to do this
446
  // before unac of course...
447
  curnostemexp = unaciscapital(term);
448
449
  return TextSplitP::takeword(term, pos, bs, be);
450
    }
451
452
    bool           curnostemexp;
453
    vector<string> terms;
454
    vector<bool>   nostemexps;
455
    const StopList &stops;
456
    // Count of terms including stopwords: this is for adjusting
457
    // phrase/near slack
458
    int alltermcount; 
459
    int lastpos;
460
};
461
462
class TermProcQ : public TermProc {
463
public:
464
    TermProcQ() : TermProc(0), m_ts(0) {}
465
    void setTSQ(TextSplitQ *ts) {m_ts = ts;}
466
    
467
    bool takeword(const std::string &term, int pos, int bs, int be) 
468
    {
469
  m_ts->alltermcount++;
470
  if (m_ts->lastpos < pos)
471
      m_ts->lastpos = pos;
472
  bool noexpand = be ? m_ts->curnostemexp : true;
473
  LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", 
474
       term.c_str(), pos, noexpand));
475
  if (m_terms[pos].size() < term.size()) {
476
      m_terms[pos] = term;
477
      m_nste[pos] = noexpand;
478
  }
479
  return true;
480
    }
481
    bool flush()
482
    {
483
  for (map<int, string>::const_iterator it = m_terms.begin();
484
       it != m_terms.end(); it++) {
485
      m_ts->terms.push_back(it->second);
486
      m_ts->nostemexps.push_back(m_nste[it->first]);
487
  }
488
  return true;
489
    }
490
private:
491
    TextSplitQ *m_ts;
492
    map<int, string> m_terms;
493
    map<int, bool> m_nste;
494
};
495
496
497
#if 1
498
static void listVector(const string& what, const vector<string>&l)
499
{
500
    string a;
501
    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
502
        a = a + *it + " ";
503
    }
504
    LOGDEB0(("%s: %s\n", what.c_str(), a.c_str()));
505
}
506
#endif
507
508
/** Expand term into term list, using appropriate mode: stem, wildcards, 
509
 *  diacritics... 
510
 *
511
 * @param mods stem expansion, case and diacritics sensitivity control.
512
 * @param term input single word
513
 * @param oexp output expansion list
514
 * @param sterm output original input term if there were no wildcards
515
 * @param prefix field prefix in index. We could recompute it, but the caller
516
 *  has it already. Used in the simple case where there is nothing to expand, 
517
 *  and we just return the prefixed term (else Db::termMatch deals with it).
518
 */
519
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, 
520
                  string& ermsg, int mods, 
521
                  const string& term, 
522
                  vector<string>& oexp, string &sterm,
523
                  const string& prefix)
524
{
525
    LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
526
       mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
527
    sterm.clear();
528
    oexp.clear();
529
    if (term.empty())
530
  return true;
531
532
    bool maxexpissoft = false;
533
    int maxexpand = getSoftMaxExp();
534
    if (maxexpand != -1) {
535
  maxexpissoft = true;
536
    } else {
537
  maxexpand = getMaxExp();
538
    }
539
540
    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
541
542
    // If there are no wildcards, add term to the list of user-entered terms
543
    if (!haswild) {
544
  m_hldata.uterms.insert(term);
545
        sterm = term;
546
    }
547
    // No stem expansion if there are wildcards or if prevented by caller
548
    bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
549
    if (haswild || getStemLang().empty()) {
550
  LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
551
  nostemexp = true;
552
    }
553
554
    // noexpansion can be modified further down by possible case/diac expansion
555
    bool noexpansion = nostemexp && !haswild; 
556
557
    int termmatchsens = 0;
558
559
    bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
560
    bool case_sensitive = (mods & SDCM_CASESENS) != 0;
561
562
    if (o_index_stripchars) {
563
  diac_sensitive = case_sensitive = false;
564
    } else {
565
  // If we are working with a raw index, apply the rules for case and 
566
  // diacritics sensitivity.
567
568
  // If any character has a diacritic, we become
569
  // diacritic-sensitive. Note that the way that the test is
570
  // performed (conversion+comparison) will automatically ignore
571
  // accented characters which are actually a separate letter
572
  if (getAutoDiac() && unachasaccents(term)) {
573
      LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
574
      diac_sensitive = true;
575
  }
576
577
  // If any character apart the first is uppercase, we become
578
  // case-sensitive.  The first character is reserved for
579
  // turning off stemming. You need to use a query language
580
  // modifier to search for Floor in a case-sensitive way.
581
  Utf8Iter it(term);
582
  it++;
583
  if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
584
      LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
585
      case_sensitive = true;
586
  }
587
588
  // If we are sensitive to case or diacritics turn stemming off
589
  if (diac_sensitive || case_sensitive) {
590
      LOGDEB0(("expandTerm: diac or case sens set -> stemexpand off\n"));
591
      nostemexp = true;
592
  }
593
594
  if (!case_sensitive || !diac_sensitive)
595
      noexpansion = false;
596
    }
597
598
    if (case_sensitive)
599
  termmatchsens |= Db::ET_CASESENS;
600
    if (diac_sensitive)
601
  termmatchsens |= Db::ET_DIACSENS;
602
603
    if (noexpansion) {
604
  oexp.push_back(prefix + term);
605
  m_hldata.terms[term] = term;
606
  LOGDEB(("ExpandTerm: noexpansion: final: %s\n", stringsToString(oexp).c_str()));
607
  return true;
608
    } 
609
610
    Db::MatchType mtyp = haswild ? Db::ET_WILD : 
611
  nostemexp ? Db::ET_NONE : Db::ET_STEM;
612
    TermMatchResult res;
613
    if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
614
            m_field)) {
615
  // Let it go through
616
    }
617
618
    // Term match entries to vector of terms
619
    if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
620
  ermsg = "Maximum term expansion size exceeded."
621
      " Maybe use case/diacritics sensitivity or increase maxTermExpand.";
622
  return false;
623
    }
624
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); 
625
   it != res.entries.end(); it++) {
626
  oexp.push_back(it->term);
627
    }
628
    // If the term does not exist at all in the db, the return from
629
    // termMatch() is going to be empty, which is not what we want (we
630
    // would then compute an empty Xapian query)
631
    if (oexp.empty())
632
  oexp.push_back(prefix + term);
633
634
    // Remember the uterm-to-expansion links
635
    for (vector<string>::const_iterator it = oexp.begin(); 
636
   it != oexp.end(); it++) {
637
  m_hldata.terms[strip_prefix(*it)] = term;
638
    }
639
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
640
    return true;
641
}
642
643
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
//
// @param vvit  first string vector to process.
// @param vvend end of the string vectors. If vvit == vvend, there is
//              nothing to combine and the call does nothing.
// @param comb  combination being built. Used as a stack by the recursion:
//              it holds the same elements on return as on entry.
// @param allcombs output: one entry is appended for each combination. If
//              any of the string vectors is empty, nothing is appended.
void multiply_groups(std::vector<std::vector<std::string> >::const_iterator vvit,
                     std::vector<std::vector<std::string> >::const_iterator vvend, 
                     std::vector<std::string>& comb,
                     std::vector<std::vector<std::string> >& allcombs)
{
    // Empty input: dereferencing vvit would be undefined behaviour.
    if (vvit == vvend) {
        return;
    }

    // Remember my string vector and compute next, for recursive calls.
    const std::vector<std::string>& mine = *vvit;
    ++vvit;

    // Walk the string vector I'm called upon and, for each string,
    // add it to current result, and call myself recursively on the
    // next string vector. The last call (last element of the vector of
    // vectors), adds the elementary result to the output
    for (std::vector<std::string>::const_iterator strit = mine.begin();
         strit != mine.end(); ++strit) {

        // Add my current value to the string vector we're building
        comb.push_back(*strit);

        if (vvit == vvend) {
            // Last call: store current result
            allcombs.push_back(comb);
        } else {
            // Call recursively on next string vector
            multiply_groups(vvit, vvend, comb, allcombs);
        }

        // Pop the value I just added (make room for the next element in my
        // vector)
        comb.pop_back();
    }
}
676
677
void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
678
                         const string& span, 
679
                         int mods, void * pq)
680
{
681
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
682
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
683
      span.c_str(), (unsigned int)mods));
684
    vector<string> exp;  
685
    string sterm; // dumb version of user term
686
687
    string prefix;
688
    const FieldTraits *ftp;
689
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
690
  prefix = wrap_prefix(ftp->pfx);
691
    }
692
693
    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
694
  return;
695
    
696
    // Set up the highlight data. No prefix should go in there
697
    for (vector<string>::const_iterator it = exp.begin(); 
698
   it != exp.end(); it++) {
699
  m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
700
  m_hldata.slacks.push_back(0);
701
  m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
702
    }
703
704
    // Push either term or OR of stem-expanded set
705
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
706
    m_curcl += exp.size();
707
708
    // If sterm (simplified original user term) is not null, give it a
709
    // relevance boost. We do this even if no expansion occurred (else
710
    // the non-expanded terms in a term list would end-up with even
711
    // less wqf). This does not happen if there are wildcards anywhere
712
    // in the search.
713
    // We normally boost the original term in the stem expansion list. Don't
714
    // do it if there are wildcards anywhere, this would skew the results.
715
    bool doBoostUserTerm = 
716
  (m_parentSearch && !m_parentSearch->haveWildCards()) || 
717
  (m_parentSearch == 0 && !m_haveWildCards);
718
    if (doBoostUserTerm && !sterm.empty()) {
719
        xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
720
             Xapian::Query(prefix+sterm, 
721
                   original_term_wqf_booster));
722
    }
723
    pqueries.push_back(xq);
724
}
725
726
// User entry element had several terms: transform into a PHRASE or
727
// NEAR xapian query, the elements of which can themselves be OR
728
// queries if the terms get expanded by stemming or wildcards (we
729
// don't do stemming for PHRASE though)
730
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
731
                       TextSplitQ *splitData, 
732
                       int mods, void *pq,
733
                       bool useNear, int slack)
734
{
735
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
736
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
737
  Xapian::Query::OP_PHRASE;
738
    vector<Xapian::Query> orqueries;
739
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
740
    bool hadmultiple = false;
741
#endif
742
    vector<vector<string> >groups;
743
744
    string prefix;
745
    const FieldTraits *ftp;
746
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp, true)) {
747
  prefix = wrap_prefix(ftp->pfx);
748
    }
749
750
    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
751
  orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
752
  slack++;
753
    }
754
755
    // Go through the list and perform stem/wildcard expansion for each element
756
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
757
    for (vector<string>::iterator it = splitData->terms.begin();
758
   it != splitData->terms.end(); it++, nxit++) {
759
  LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
760
  // Adjust when we do stem expansion. Not if disabled by
761
  // caller, not inside phrases, and some versions of xapian
762
  // will accept only one OR clause inside NEAR.
763
  bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
764
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
765
      || hadmultiple
766
#endif // single OR inside NEAR
767
      ;
768
  int lmods = mods;
769
  if (nostemexp)
770
      lmods |= SearchDataClause::SDCM_NOSTEMMING;
771
  string sterm;
772
  vector<string> exp;
773
  if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
774
      return;
775
  LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
776
  listVector("", exp);
777
  // groups is used for highlighting, we don't want prefixes in there.
778
  vector<string> noprefs;
779
  for (vector<string>::const_iterator it = exp.begin(); 
780
       it != exp.end(); it++) {
781
      noprefs.push_back(it->substr(prefix.size()));
782
  }
783
  groups.push_back(noprefs);
784
  orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
785
                    exp.begin(), exp.end()));
786
  m_curcl += exp.size();
787
  if (m_curcl >= getMaxCl())
788
      return;
789
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
790
  if (exp.size() > 1) 
791
      hadmultiple = true;
792
#endif
793
    }
794
795
    if (mods & Rcl::SearchDataClause::SDCM_ANCHOREND) {
796
  orqueries.push_back(Xapian::Query(prefix + end_of_field_term));
797
  slack++;
798
    }
799
800
    // Generate an appropriate PHRASE/NEAR query with adjusted slack
801
    // For phrases, give a relevance boost like we do for original terms
802
    LOGDEB2(("PHRASE/NEAR:  alltermcount %d lastpos %d\n", 
803
             splitData->alltermcount, splitData->lastpos));
804
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
805
           splitData->lastpos + 1 + slack);
806
    if (op == Xapian::Query::OP_PHRASE)
807
  xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, 
808
             original_term_wqf_booster);
809
    pqueries.push_back(xq);
810
811
    // Add all combinations of NEAR/PHRASE groups to the highlighting data. 
812
    vector<vector<string> > allcombs;
813
    vector<string> comb;
814
    multiply_groups(groups.begin(), groups.end(), comb, allcombs);
815
    
816
    // Insert the search groups and slacks in the highlight data, with
817
    // a reference to the user entry that generated them:
818
    m_hldata.groups.insert(m_hldata.groups.end(), 
819
             allcombs.begin(), allcombs.end());
820
    m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
821
    m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(), 
822
                m_hldata.ugroups.size() - 1);
823
}
824
825
// Trim string beginning with ^ or ending with $ and convert to flags
826
static int stringToMods(string& s)
827
{
828
    int mods = 0;
829
    // Check for an anchored search
830
    trimstring(s);
831
    if (s.length() > 0 && s[0] == '^') {
832
  mods |= Rcl::SearchDataClause::SDCM_ANCHORSTART;
833
  s.erase(0, 1);
834
    }
835
    if (s.length() > 0 && s[s.length()-1] == '$') {
836
  mods |= Rcl::SearchDataClause::SDCM_ANCHOREND;
837
  s.erase(s.length()-1);
838
    }
839
    return mods;
840
}
841
842
/** 
843
 * Turn user entry string (NOT query language) into a list of xapian queries.
844
 * We just separate words and phrases, and do wildcard and stem expansion,
845
 *
846
 * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
847
 * the GUI (in the case of NEAR/PHRASE, clausedist adds dquotes to the user
848
 * entry).
849
 *
850
 * This appears awful, and it would seem that the split into
851
 * terms/phrases should be performed in the upper layer so that we
852
 * only receive pure term or near/phrase pure elements here, but in
853
 * fact there are things that would appear like terms to naive code,
854
 * and which will actually may be turned into phrases (ie: tom:jerry),
855
 * in a manner which intimately depends on the index implementation,
856
 * so that it makes sense to process this here.
857
 *
858
 * The final list contains one query for each term or phrase
859
 *   - Elements corresponding to a stem-expanded part are an OP_OR
860
 *     composition of the stem-expanded terms (or a single term query).
861
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
862
 *     composition of the phrase terms (no stem expansion in this case)
863
 * @return the subquery count (either or'd stem-expanded terms or phrase word
864
 *   count)
865
 */
866
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,
867
                         string &ermsg, void *pq, 
868
                         int slack, bool useNear)
869
{
870
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
871
    int mods = m_modifiers;
872
873
    LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
874
      "slack %d near %d\n", 
875
      iq.c_str(), m_field.c_str(), mods, slack, useNear));
876
    ermsg.erase();
877
    m_curcl = 0;
878
    const StopList stops = db.getStopList();
879
880
    // Simple whitespace-split input into user-level words and
881
    // double-quoted phrases: word1 word2 "this is a phrase". 
882
    //
883
    // The text splitter may further still decide that the resulting
884
    // "words" are really phrases, this depends on separators:
885
    // [paul@dom.net] would still be a word (span), but [about:me]
886
    // will probably be handled as a phrase.
887
    vector<string> phrases;
888
    TextSplit::stringToStrings(iq, phrases);
889
890
    // Process each element: textsplit into terms, handle stem/wildcard 
891
    // expansion and transform into an appropriate Xapian::Query
892
    try {
893
  for (vector<string>::iterator it = phrases.begin(); 
894
       it != phrases.end(); it++) {
895
      LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
896
      // Anchoring modifiers
897
      int amods = stringToMods(*it);
898
      int terminc = amods != 0 ? 1 : 0;
899
      mods |= amods;
900
      // If there are multiple spans in this element, including
901
      // at least one composite, we have to increase the slack
902
      // else a phrase query including a span would fail. 
903
      // Ex: "term0@term1 term2" is onlyspans-split as:
904
      //   0 term0@term1             0   12
905
      //   2 term2                  13   18
906
      // The position of term2 is 2, not 1, so a phrase search
907
      // would fail.
908
      // We used to do  word split, searching for 
909
      // "term0 term1 term2" instead, which may have worse 
910
      // performance, but will succeed.
911
      // We now adjust the phrase/near slack by comparing the term count
912
      // and the last position
913
914
      // The term processing pipeline:
915
      TermProcQ tpq;
916
      TermProc *nxt = &tpq;
917
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
918
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
919
            //tpcommon.onlygrams(true);
920
      TermProcPrep tpprep(nxt);
921
      if (o_index_stripchars)
922
      nxt = &tpprep;
923
924
      TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
925
                       TextSplit::TXTS_KEEPWILD), 
926
              stops, nxt);
927
      tpq.setTSQ(&splitter);
928
      splitter.text_to_words(*it);
929
930
      slack += splitter.lastpos - splitter.terms.size() + 1;
931
932
      LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
933
      switch (splitter.terms.size() + terminc) {
934
      case 0: 
935
      continue;// ??
936
      case 1: {
937
      int lmods = mods;
938
      if (splitter.nostemexps.front())
939
          lmods |= SearchDataClause::SDCM_NOSTEMMING;
940
      m_hldata.ugroups.push_back(splitter.terms);
941
      processSimpleSpan(db, ermsg, splitter.terms.front(),
942
                lmods, &pqueries);
943
      }
944
      break;
945
      default:
946
      m_hldata.ugroups.push_back(splitter.terms);
947
      processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
948
                  useNear, slack);
949
      }
950
      if (m_curcl >= getMaxCl()) {
951
      ermsg = maxXapClauseMsg;
952
      if (!o_index_stripchars)
953
          ermsg += maxXapClauseCaseDiacMsg;
954
      break;
955
      }
956
  }
957
    } catch (const Xapian::Error &e) {
958
  ermsg = e.get_msg();
959
    } catch (const string &s) {
960
  ermsg = s;
961
    } catch (const char *s) {
962
  ermsg = s;
963
    } catch (...) {
964
  ermsg = "Caught unknown exception";
965
    }
966
    if (!ermsg.empty()) {
967
  LOGERR(("stringToXapianQueries: %s\n", ermsg.c_str()));
968
  return false;
969
    }
970
    return true;
971
}
972
973
// Translate a simple OR or AND search clause. 
974
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
975
{
976
    LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
977
       getStemLang().c_str()));
978
979
    Xapian::Query *qp = (Xapian::Query *)p;
980
    *qp = Xapian::Query();
981
982
    Xapian::Query::op op;
983
    switch (m_tp) {
984
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
985
    case SCLT_OR: op = Xapian::Query::OP_OR; break;
986
    default:
987
  LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
988
  return false;
989
    }
990
991
    vector<Xapian::Query> pqueries;
992
    if (!processUserString(db, m_text, m_reason, &pqueries))
993
  return false;
994
    if (pqueries.empty()) {
995
  LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
996
  return true;
997
    }
998
999
    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
1000
    if  (m_weight != 1.0) {
1001
  *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
1002
    }
1003
    return true;
1004
}
1005
1006
// Translate a FILENAME search clause. This always comes
1007
// from a "filename" search from the gui or recollq. A query language
1008
// "filename:"-prefixed field will not go through here, but through
1009
// the generic field-processing code.
1010
//
1011
// We do not split the entry any more (used to do some crazy thing
1012
// about expanding multiple fragments in the past). We just take the
1013
// value blanks and all and expand this against the indexed unsplit
1014
// file names
1015
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
1016
{
1017
    Xapian::Query *qp = (Xapian::Query *)p;
1018
    *qp = Xapian::Query();
1019
1020
    int maxexp = getSoftMaxExp();
1021
    if (maxexp == -1)
1022
  maxexp = getMaxExp();
1023
1024
    vector<string> names;
1025
    db.filenameWildExp(m_text, names, maxexp);
1026
    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
1027
1028
    if (m_weight != 1.0) {
1029
  *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
1030
    }
1031
    return true;
1032
}
1033
1034
// Translate a dir: path filtering clause. See comments in .h
1035
bool SearchDataClausePath::toNativeQuery(Rcl::Db &db, void *p)
1036
{
1037
    LOGDEB(("SearchDataClausePath::toNativeQuery: [%s]\n", m_text.c_str()));
1038
    Xapian::Query *qp = (Xapian::Query *)p;
1039
    *qp = Xapian::Query();
1040
1041
    if (m_text.empty()) {
1042
  LOGERR(("SearchDataClausePath: empty path??\n"));
1043
  m_reason = "Empty path ?";
1044
  return false;
1045
    }
1046
1047
    vector<Xapian::Query> orqueries;
1048
1049
    if (m_text[0] == '/')
1050
  orqueries.push_back(Xapian::Query(wrap_prefix(pathelt_prefix)));
1051
    else
1052
        m_text = path_tildexpand(m_text);
1053
1054
    vector<string> vpath;
1055
    stringToTokens(m_text, vpath, "/");
1056
1057
    for (vector<string>::const_iterator pit = vpath.begin(); 
1058
   pit != vpath.end(); pit++){
1059
1060
  string sterm;
1061
  vector<string> exp;
1062
  if (!expandTerm(db, m_reason, 
1063
          SDCM_NOSTEMMING|SDCM_CASESENS|SDCM_DIACSENS,
1064
          *pit, exp, sterm, wrap_prefix(pathelt_prefix))) {
1065
      return false;
1066
  }
1067
  LOGDEB0(("SDataPath::toNative: exp size %d\n", exp.size()));
1068
  listVector("", exp);
1069
  if (exp.size() == 1)
1070
      orqueries.push_back(Xapian::Query(exp[0]));
1071
  else 
1072
      orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
1073
                        exp.begin(), exp.end()));
1074
  m_curcl += exp.size();
1075
  if (m_curcl >= getMaxCl())
1076
      return false;
1077
    }
1078
1079
    *qp = Xapian::Query(Xapian::Query::OP_PHRASE, 
1080
          orqueries.begin(), orqueries.end());
1081
1082
    if (m_weight != 1.0) {
1083
  *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
1084
    }
1085
    return true;
1086
}
1087
1088
// Translate NEAR or PHRASE clause. 
1089
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
1090
{
1091
    LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
1092
1093
    Xapian::Query *qp = (Xapian::Query *)p;
1094
    *qp = Xapian::Query();
1095
1096
    vector<Xapian::Query> pqueries;
1097
    Xapian::Query nq;
1098
1099
    // We produce a single phrase out of the user entry then use
1100
    // stringToXapianQueries() to lowercase and simplify the phrase
1101
    // terms etc. This will result into a single (complex)
1102
    // Xapian::Query.
1103
    if (m_text.find('\"') != string::npos) {
1104
  m_text = neutchars(m_text, "\"");
1105
    }
1106
    string s = cstr_dquote + m_text + cstr_dquote;
1107
    bool useNear = (m_tp == SCLT_NEAR);
1108
    if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
1109
  return false;
1110
    if (pqueries.empty()) {
1111
  LOGERR(("SearchDataClauseDist: resolved to null query\n"));
1112
  return true;
1113
    }
1114
1115
    *qp = *pqueries.begin();
1116
    if (m_weight != 1.0) {
1117
  *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
1118
    }
1119
    return true;
1120
}
1121
1122
} // Namespace Rcl
194
} // Namespace Rcl