Switch to unified view

a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
...
...
14
 *   Free Software Foundation, Inc.,
14
 *   Free Software Foundation, Inc.,
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16
 */
16
 */
17
17
18
// Handle translation from rcl's SearchData structures to Xapian Queries
18
// Handle translation from rcl's SearchData structures to Xapian Queries
19
20
#include "autoconfig.h"
21
19
#include <stdio.h>
22
#include <stdio.h>
20
#include <fnmatch.h>
23
#include <fnmatch.h>
21
24
22
#include <string>
25
#include <string>
23
#include <vector>
26
#include <vector>
24
#include <algorithm>
27
#include <algorithm>
28
using namespace std;
25
29
26
#include "xapian.h"
30
#include "xapian.h"
27
31
28
#include "cstr.h"
32
#include "cstr.h"
29
#include "rcldb.h"
33
#include "rcldb.h"
34
#include "rcldb_p.h"
30
#include "searchdata.h"
35
#include "searchdata.h"
31
#include "debuglog.h"
36
#include "debuglog.h"
32
#include "smallut.h"
37
#include "smallut.h"
33
#include "textsplit.h"
38
#include "textsplit.h"
34
#include "unacpp.h"
39
#include "unacpp.h"
35
#include "utf8iter.h"
40
#include "utf8iter.h"
36
#include "stoplist.h"
41
#include "stoplist.h"
37
#include "rclconfig.h"
42
#include "rclconfig.h"
38
#include "termproc.h"
43
#include "termproc.h"
44
#include "synfamily.h"
45
#include "stemdb.h"
46
#include "expansiondbs.h"
39
47
40
#ifndef NO_NAMESPACES
41
using namespace std;
42
namespace Rcl {
48
namespace Rcl {
43
#endif
44
49
45
typedef  vector<SearchDataClause *>::iterator qlist_it_t;
50
typedef  vector<SearchDataClause *>::iterator qlist_it_t;
46
typedef  vector<SearchDataClause *>::const_iterator qlist_cit_t;
51
typedef  vector<SearchDataClause *>::const_iterator qlist_cit_t;
47
52
48
static const int original_term_wqf_booster = 10;
53
static const int original_term_wqf_booster = 10;
...
...
69
 * You should have received a copy of the GNU General Public License
74
 * You should have received a copy of the GNU General Public License
70
 * along with this program; if not, write to the Free Software
75
 * along with this program; if not, write to the Free Software
71
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
76
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
72
 * USA
77
 * USA
73
 */
78
 */
79
80
#ifdef RCL_INDEX_STRIPCHARS
81
#define bufprefix(BUF, L) {(BUF)[0] = L;}
82
#define bpoffs() 1
83
#else
84
static inline void bufprefix(char *buf, char c)
85
{
86
    if (o_index_stripchars) {
87
  buf[0] = c;
88
    } else {
89
  buf[0] = ':'; 
90
  buf[1] = c; 
91
  buf[2] = ':';
92
    }
93
}
94
static inline int bpoffs() 
95
{
96
    return o_index_stripchars ? 1 : 3;
97
}
98
#endif
99
74
/**
 * Build a Xapian query matching every date in [y1/m1/d1, y2/m2/d2].
 *
 * The range is covered with as few terms as possible: one day term per day
 * of a partial first/last month, one month term per whole month, and one
 * year term per whole intermediate year. All the terms are OR'ed together.
 *
 * Term layout in buf (P = bpoffs(), the length of the prefix written by
 * bufprefix()):
 *   [0, P)        prefix for 'D' (day), 'M' (month) or 'Y' (year)
 *   [P, P+4)      4-digit year
 *   [P+4, P+6)    2-digit month
 *   [P+6, P+8)    2-digit day
 * Every sprintf() NUL-terminates what it writes, so rewriting the year or
 * the month also truncates whatever followed it from a previous term.
 *
 * NOTE(review): the code assumes the start date is not later than the end
 * date; nothing here checks it.
 *
 * @param y1,m1,d1 first day of the range (year, month, day of month)
 * @param y2,m2,d2 last day of the range (year, month, day of month)
 * @return OP_OR query over the generated day/month/year terms
 */
static Xapian::Query
date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
{
    // Xapian uses a smallbuf and snprintf. Can't be bothered, we're
    // only doing %d's !
    char buf[200];
    bufprefix(buf, 'D');
    sprintf(buf + bpoffs(), "%04d%02d", y1, m1);
    vector<Xapian::Query> v;

    int d_last = monthdays(m1, y1);
    int d_end = d_last;
    if (y1 == y2 && m1 == m2 && d2 < d_last) {
        d_end = d2;
    }
    // Deal with any initial partial month
    if (d1 > 1 || d_end < d_last) {
        for ( ; d1 <= d_end ; d1++) {
            sprintf(buf + 6 + bpoffs(), "%02d", d1);
            v.push_back(Xapian::Query(buf));
        }
    } else {
        // Whole first month: a single month term is enough
        bufprefix(buf, 'M');
        v.push_back(Xapian::Query(buf));
    }

    if (y1 == y2 && m1 == m2) {
        return Xapian::Query(Xapian::Query::OP_OR, v.begin(), v.end());
    }

    // Whole months following the first one: up to the end of the first
    // year, or up to the month before the last one if there is a single year.
    int m_last = (y1 < y2) ? 12 : m2 - 1;
    while (++m1 <= m_last) {
        sprintf(buf + 4 + bpoffs(), "%02d", m1);
        bufprefix(buf, 'M');
        v.push_back(Xapian::Query(buf));
    }

    if (y1 < y2) {
        // Whole intermediate years
        while (++y1 < y2) {
            sprintf(buf + bpoffs(), "%04d", y1);
            bufprefix(buf, 'Y');
            v.push_back(Xapian::Query(buf));
        }
        // Whole months of the last year, before the last month
        sprintf(buf + bpoffs(), "%04d", y2);
        bufprefix(buf, 'M');
        for (m1 = 1; m1 < m2; m1++) {
            sprintf(buf + 4 + bpoffs(), "%02d", m1);
            v.push_back(Xapian::Query(buf));
        }
    }

    // Last month. The month field lives 4 bytes (the year digits) after the
    // prefix, same as in the loops above.
    sprintf(buf + 4 + bpoffs(), "%02d", m2);

    // Deal with any final partial month
    if (d2 < monthdays(m2, y2)) {
        bufprefix(buf, 'D');
        for (d1 = 1 ; d1 <= d2; d1++) {
            sprintf(buf + 6 + bpoffs(), "%02d", d1);
            v.push_back(Xapian::Query(buf));
        }
    } else {
        bufprefix(buf, 'M');
        v.push_back(Xapian::Query(buf));
    }

    return Xapian::Query(Xapian::Query::OP_OR, v.begin(), v.end());
}
...
...
170
    }
197
    }
171
    tps = exptps;
198
    tps = exptps;
172
    return true;
199
    return true;
173
}
200
}
174
201
175
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
202
bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, 
203
              vector<SearchDataClause*>& query, 
204
              string& reason, void *d)
176
{
205
{
177
    LOGDEB2(("SearchData::toNativeQuery: stemlang [%s]\n", 
178
      m_stemlang.c_str()));
179
    Xapian::Query xq;
206
    Xapian::Query xq;
180
    m_reason.erase();
181
182
    // Walk the clause list translating each in turn and building the 
183
    // Xapian query tree
184
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++) {
207
    for (qlist_it_t it = query.begin(); it != query.end(); it++) {
185
    Xapian::Query nq;
208
    Xapian::Query nq;
186
    if (!(*it)->toNativeQuery(db, &nq, m_stemlang)) {
209
    if (!(*it)->toNativeQuery(db, &nq)) {
187
        LOGERR(("SearchData::toNativeQuery: failed\n"));
210
        LOGERR(("SearchData::clausesToQuery: toNativeQuery failed\n"));
188
        m_reason = (*it)->getReason();
211
        reason = (*it)->getReason();
189
        return false;
212
        return false;
190
    }       
213
    }       
191
        if (nq.empty()) {
214
        if (nq.empty()) {
192
            LOGDEB(("SearchData::toNativeQuery: skipping empty clause\n"));
215
            LOGDEB(("SearchData::clausesToQuery: skipping empty clause\n"));
193
            continue;
216
            continue;
194
        }
217
        }
195
    // If this structure is an AND list, must use AND_NOT for excl clauses.
218
    // If this structure is an AND list, must use AND_NOT for excl clauses.
196
    // Else this is an OR list, and there can't be excl clauses (checked by
219
    // Else this is an OR list, and there can't be excl clauses (checked by
197
    // addClause())
220
    // addClause())
198
    Xapian::Query::op op;
221
    Xapian::Query::op op;
199
    if (m_tp == SCLT_AND) {
222
    if (tp == SCLT_AND) {
200
            if ((*it)->m_tp == SCLT_EXCL) {
223
            if ((*it)->m_tp == SCLT_EXCL) {
201
                op =  Xapian::Query::OP_AND_NOT;
224
                op =  Xapian::Query::OP_AND_NOT;
202
            } else {
225
            } else {
203
                op =  Xapian::Query::OP_AND;
226
                op =  Xapian::Query::OP_AND;
204
            }
227
            }
...
...
214
            xq = Xapian::Query(op, xq, nq);
237
            xq = Xapian::Query(op, xq, nq);
215
        }
238
        }
216
    }
239
    }
217
    if (xq.empty())
240
    if (xq.empty())
218
    xq = Xapian::Query::MatchAll;
241
    xq = Xapian::Query::MatchAll;
242
243
   *((Xapian::Query *)d) = xq;
244
    return true;
245
}
246
247
bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
248
{
249
    LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
250
    m_reason.erase();
251
252
    // Walk the clause list translating each in turn and building the 
253
    // Xapian query tree
254
    Xapian::Query xq;
255
    if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
256
  LOGERR(("SearchData::toNativeQuery: clausesToQuery failed\n"));
257
  return false;
258
    }
219
259
220
    if (m_haveDates) {
260
    if (m_haveDates) {
221
        // If one of the extremities is unset, compute db extremas
261
        // If one of the extremities is unset, compute db extremas
222
        if (m_dates.y1 == 0 || m_dates.y2 == 0) {
262
        if (m_dates.y1 == 0 || m_dates.y2 == 0) {
223
            int minyear = 1970, maxyear = 2100;
263
            int minyear = 1970, maxyear = 2100;
...
...
324
     dit != m_dirspecs.end(); dit++) {
364
     dit != m_dirspecs.end(); dit++) {
325
    vector<string> vpath;
365
    vector<string> vpath;
326
    stringToTokens(dit->dir, vpath, "/");
366
    stringToTokens(dit->dir, vpath, "/");
327
    vector<string> pvpath;
367
    vector<string> pvpath;
328
    if (dit->dir[0] == '/')
368
    if (dit->dir[0] == '/')
329
        pvpath.push_back(pathelt_prefix);
369
        pvpath.push_back(wrap_prefix(pathelt_prefix));
330
    for (vector<string>::const_iterator pit = vpath.begin(); 
370
    for (vector<string>::const_iterator pit = vpath.begin(); 
331
         pit != vpath.end(); pit++){
371
         pit != vpath.end(); pit++){
332
        pvpath.push_back(pathelt_prefix + *pit);
372
        pvpath.push_back(wrap_prefix(pathelt_prefix) + *pit);
333
    }
373
    }
334
    Xapian::Query::op tdop;
374
    Xapian::Query::op tdop;
335
    if (dit->weight == 1.0) {
375
    if (dit->weight == 1.0) {
336
        tdop = dit->exclude ? 
376
        tdop = dit->exclude ? 
337
        Xapian::Query::OP_AND_NOT : Xapian::Query::OP_FILTER;
377
        Xapian::Query::OP_AND_NOT : Xapian::Query::OP_FILTER;
...
...
444
    addClause(nclp);
484
    addClause(nclp);
445
    } else {
485
    } else {
446
    // My type is AND. Change it to OR and insert two queries, one
486
    // My type is AND. Change it to OR and insert two queries, one
447
    // being the original query as a subquery, the other the
487
    // being the original query as a subquery, the other the
448
    // phrase.
488
    // phrase.
449
    SearchData *sd = new SearchData(m_tp);
489
    SearchData *sd = new SearchData(m_tp, m_stemlang);
450
    sd->m_query = m_query;
490
    sd->m_query = m_query;
451
    sd->m_stemlang = m_stemlang;
491
    sd->m_stemlang = m_stemlang;
452
    m_tp = SCLT_OR;
492
    m_tp = SCLT_OR;
453
    m_query.clear();
493
    m_query.clear();
454
    SearchDataClauseSub *oq = 
494
    SearchDataClauseSub *oq = 
...
...
584
    : m_db(db), m_field(field), m_stemlang(stmlng),
624
    : m_db(db), m_field(field), m_stemlang(stmlng),
585
      m_doBoostUserTerms(boostUser), m_hld(hld)
625
      m_doBoostUserTerms(boostUser), m_hld(hld)
586
    { }
626
    { }
587
627
588
    bool processUserString(const string &iq,
628
    bool processUserString(const string &iq,
629
             int mods, 
589
               string &ermsg,
630
               string &ermsg,
590
               vector<Xapian::Query> &pqueries, 
631
               vector<Xapian::Query> &pqueries, 
591
             const StopList &stops,
592
               int slack = 0, bool useNear = false);
632
               int slack = 0, bool useNear = false);
593
private:
633
private:
594
    void expandTerm(bool dont, const string& term, vector<string>& exp, 
634
    void expandTerm(int mods, 
635
          const string& term, vector<string>& exp, 
595
                    string& sterm, const string& prefix);
636
                    string& sterm, const string& prefix);
596
    // After splitting entry on whitespace: process non-phrase element
637
    // After splitting entry on whitespace: process non-phrase element
597
    void processSimpleSpan(const string& span, bool nostemexp, 
638
    void processSimpleSpan(const string& span, 
639
             int mods,
598
               vector<Xapian::Query> &pqueries);
640
               vector<Xapian::Query> &pqueries);
599
    // Process phrase/near element
641
    // Process phrase/near element
600
    void processPhraseOrNear(TextSplitQ *splitData, 
642
    void processPhraseOrNear(TextSplitQ *splitData, 
643
               int mods,
601
                 vector<Xapian::Query> &pqueries,
644
                 vector<Xapian::Query> &pqueries,
602
                 bool useNear, int slack, int mods);
645
                 bool useNear, int slack);
603
646
604
    Db&           m_db;
647
    Db&           m_db;
605
    const string& m_field;
648
    const string& m_field;
606
    const string& m_stemlang;
649
    const string& m_stemlang;
607
    bool          m_doBoostUserTerms;
650
    const bool    m_doBoostUserTerms;
608
    HighlightData& m_hld;
651
    HighlightData& m_hld;
609
};
652
};
610
653
611
#if 1
654
#if 1
612
static void listVector(const string& what, const vector<string>&l)
655
static void listVector(const string& what, const vector<string>&l)
...
...
617
    }
660
    }
618
    LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
661
    LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
619
}
662
}
620
#endif
663
#endif
621
664
622
/** Take simple term and expand stem and wildcards
665
/** Expand term into term list, using appropriate mode: stem, wildcards, 
666
 *  diacritics... 
623
 *
667
 *
624
 * @param nostemexp don't perform stem expansion. This is mainly used to
668
 * @param mods stem expansion, case and diacritics sensitivity control.
625
 *   prevent stem expansion inside phrases (because the user probably
626
 *   does not expect it). This does NOT prevent wild card expansion.
627
 *   Other factors than nostemexp can prevent stem expansion: 
628
 *   a null stemlang, resulting from a global user preference, a
629
 *   capitalized term, or wildcard(s)
630
 * @param term input single word
669
 * @param term input single word
631
 * @param exp output expansion list
670
 * @param exp output expansion list
632
 * @param sterm output original input term if there were no wildcards
671
 * @param sterm output original input term if there were no wildcards
672
 * @param prefix field prefix in index. We could recompute it, but the caller
673
 *  has it already. Used in the simple case where there is nothing to expand, 
674
 *  and we just return the prefixed term (else Db::termMatch deals with it).
633
 */
675
 */
634
void StringToXapianQ::expandTerm(bool nostemexp, 
676
void StringToXapianQ::expandTerm(int mods, 
635
                                 const string& term, 
677
               const string& term, 
636
                                 vector<string>& exp,
678
                                 vector<string>& oexp, string &sterm,
637
                                 string &sterm, const string& prefix)
679
               const string& prefix)
638
{
680
{
639
    LOGDEB2(("expandTerm: field [%s] term [%s] stemlang [%s] nostemexp %d\n",
681
    LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
640
         m_field.c_str(), term.c_str(), m_stemlang.c_str(), nostemexp));
682
         mods, m_field.c_str(), term.c_str(), m_stemlang.c_str()));
641
    sterm.erase();
683
    sterm.clear();
642
    exp.clear();
684
    oexp.clear();
643
    if (term.empty()) {
685
    if (term.empty())
644
    return;
686
    return;
645
    }
646
687
647
    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
688
    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
648
689
690
    // If there are no wildcards, add term to the list of user-entered terms
691
    if (!haswild)
692
  m_hld.uterms.insert(term);
693
694
    bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
695
649
    // No stemming if there are wildcards or prevented globally.
696
    // No stem expansion if there are wildcards or if prevented by caller
650
    if (haswild || m_stemlang.empty()) {
697
    if (haswild || m_stemlang.empty()) {
651
    LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
698
    LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
652
    nostemexp = true;
699
    nostemexp = true;
653
    }
700
    }
654
701
655
    if (!haswild)
702
    bool noexpansion = nostemexp && !haswild;
656
  m_hld.uterms.insert(term);
657
703
658
    if (nostemexp && !haswild) {
704
#ifndef RCL_INDEX_STRIPCHARS
705
    bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
706
    bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
707
708
    if (o_index_stripchars) {
709
  diac_sensitive = case_sensitive = false;
710
    } else {
711
  // If we are working with a raw index, apply the rules for case and 
712
  // diacritics sensitivity.
713
714
  // If any character has a diacritic, we become
715
  // diacritic-sensitive. Note that the way that the test is
716
  // performed (conversion+comparison) will automatically ignore
717
  // accented characters which are actually a separate letter
718
  if (unachasaccents(term))
719
      diac_sensitive = true;
720
721
  // If any character apart the first is uppercase, we become
722
  // case-sensitive.  The first character is reserved for
723
  // turning off stemming. You need to use a query language
724
  // modifier to search for Floor in a case-sensitive way.
725
  Utf8Iter it(term);
726
  it++;
727
  if (unachasuppercase(term.substr(it.getBpos())))
728
      case_sensitive = true;
729
730
  // If we are sensitive to case or diacritics turn stemming off
731
  if (diac_sensitive || case_sensitive)
732
      nostemexp = true;
733
734
  if (!case_sensitive || !diac_sensitive)
735
      noexpansion = false;
736
    }
737
#endif
738
739
    if (noexpansion) {
659
    sterm = term;
740
    sterm = term;
660
  exp.resize(1);
741
  oexp.push_back(prefix + term);
661
  exp[0] = prefix + term;
742
  LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
662
    } else {
743
  return;
744
    } 
745
746
    SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
747
    XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, "all", 
748
                  &unacfoldtrans);
749
    vector<string> lexp;
750
663
  TermMatchResult res;
751
    TermMatchResult res;
664
  if (haswild) {
752
    if (haswild) {
753
  // Note that if there are wildcards, we do a direct from-index
754
  // expansion, which means that we are casediac-sensitive. There
755
  // would be nothing to prevent us to expand from the casediac
756
  // synonyms first. To be done later
665
        m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, 
757
    m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, 
666
                           m_field);
758
             m_field);
667
  } else {
759
  goto termmatchtoresult;
760
    }
761
668
      sterm = term;
762
    sterm = term;
763
764
#ifdef RCL_INDEX_STRIPCHARS
765
669
      m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, 
766
    m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
670
             m_field);
767
768
#else
769
770
    if (o_index_stripchars) {
771
  // If the index is raw, we can only come here if nostemexp is unset
772
  // and we just need stem expansion.
773
  m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, -1, m_field);
774
  goto termmatchtoresult;
775
    } 
776
777
    // No stem expansion when diacritic or case sensitivity is set, it
778
    // makes no sense (it would mess with the diacritics anyway if
779
    // they are not in the stem part).  In these 3 cases, perform
780
    // appropriate expansion from the charstripping db, and do a bogus
781
    // wildcard expansion (there is no wild card) to generate the
782
    // result:
783
784
    if (diac_sensitive && case_sensitive) {
785
  // No expansion whatsoever
786
  m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, res, -1, m_field);
787
  goto termmatchtoresult;
788
    }
789
790
    if (diac_sensitive) {
791
  // Expand for accents and case, filtering for same accents,
792
  // then bogus wildcard expansion for generating result
793
  SynTermTransUnac foldtrans(UNACOP_FOLD);
794
  synac.synExpand(term, lexp, &foldtrans);
795
  goto exptotermatch;
796
    } 
797
798
    if (case_sensitive) {
799
  // Expand for accents and case, filtering for same case, then
800
  // bogus wildcard expansion for generating result
801
  SynTermTransUnac unactrans(UNACOP_UNAC);
802
  synac.synExpand(term, lexp, &unactrans);
803
  goto exptotermatch;
804
    }
805
806
    // We are neither accent- nor case- sensitive and may need stem
807
    // expansion or not.
808
809
    // Expand for accents and case
810
    synac.synExpand(term, lexp);
811
    LOGDEB(("ExpTerm: casediac: %s\n", stringsToString(lexp).c_str()));
812
    if (nostemexp)
813
  goto exptotermatch;
814
815
    // Need stem expansion. Lowercase the result of accent and case
816
    // expansion for input to stemdb.
817
    for (unsigned int i = 0; i < lexp.size(); i++) {
818
  string lower;
819
  unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
820
  lexp[i] = lower;
821
    }
822
    sort(lexp.begin(), lexp.end());
823
    {
824
  vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
825
  lexp.resize(uit - lexp.begin());
826
  StemDb db(m_db.m_ndb->xrdb);
827
  vector<string> exp1;
828
  for (vector<string>::const_iterator it = lexp.begin(); 
829
       it != lexp.end(); it++) {
830
      db.stemExpand(m_stemlang, *it, exp1);
671
    }
831
    }
832
  LOGDEB(("ExpTerm: stem: %s\n", stringsToString(exp1).c_str()));
833
834
  // Expand the resulting list for case (all stemdb content
835
  // is lowercase)
836
  lexp.clear();
837
  for (vector<string>::const_iterator it = exp1.begin(); 
838
       it != exp1.end(); it++) {
839
      synac.synExpand(*it, lexp);
840
  }
841
  sort(lexp.begin(), lexp.end());
842
  uit = unique(lexp.begin(), lexp.end());
843
  lexp.resize(uit - lexp.begin());
844
    }
845
    LOGDEB(("ExpTerm: case exp of stem: %s\n", stringsToString(lexp).c_str()));
846
847
    // Bogus wildcard expand to generate the result
848
exptotermatch:
849
    for (vector<string>::const_iterator it = lexp.begin();
850
   it != lexp.end(); it++) {
851
  m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, *it, 
852
             res, -1, m_field);
853
    }
854
#endif
855
856
    // Term match entries to vector of terms
857
termmatchtoresult:
672
  for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); 
858
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); 
673
         it != res.entries.end(); it++) {
859
     it != res.entries.end(); it++) {
674
        exp.push_back(it->term);
860
    oexp.push_back(it->term);
675
  }
676
    }
861
    }
862
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
677
}
863
}
678
864
679
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
865
// Do distribution of string vectors: a,b c,d -> a,c a,d b,c b,d
680
void multiply_groups(vector<vector<string> >::const_iterator vvit,
866
void multiply_groups(vector<vector<string> >::const_iterator vvit,
681
             vector<vector<string> >::const_iterator vvend, 
867
             vector<vector<string> >::const_iterator vvend, 
...
...
708
    // vector)
894
    // vector)
709
    comb.pop_back();
895
    comb.pop_back();
710
    }
896
    }
711
}
897
}
712
898
713
void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
899
void StringToXapianQ::processSimpleSpan(const string& span, 
900
                  int mods,
714
                    vector<Xapian::Query> &pqueries)
901
                    vector<Xapian::Query> &pqueries)
715
{
902
{
716
    LOGDEB2(("StringToXapianQ::processSimpleSpan: [%s] nostemexp %d\n",
903
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
717
       span.c_str(), int(nostemexp)));
904
      span.c_str(), (unsigned int)mods));
718
    vector<string> exp;  
905
    vector<string> exp;  
719
    string sterm; // dumb version of user term
906
    string sterm; // dumb version of user term
720
907
721
    string prefix;
908
    string prefix;
722
    const FieldTraits *ftp;
909
    const FieldTraits *ftp;
723
    if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
910
    if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
724
    prefix = ftp->pfx;
911
    prefix = wrap_prefix(ftp->pfx);
725
    }
912
    }
726
913
727
    expandTerm(nostemexp, span, exp, sterm, prefix);
914
    expandTerm(mods, span, exp, sterm, prefix);
728
    
915
    
729
    // Set up the highlight data. No prefix should go in there
916
    // Set up the highlight data. No prefix should go in there
730
    for (vector<string>::const_iterator it = exp.begin(); 
917
    for (vector<string>::const_iterator it = exp.begin(); 
731
     it != exp.end(); it++) {
918
     it != exp.end(); it++) {
732
    m_hld.groups.push_back(vector<string>(1, it->substr(prefix.size())));
919
    m_hld.groups.push_back(vector<string>(1, it->substr(prefix.size())));
...
...
753
// User entry element had several terms: transform into a PHRASE or
940
// User entry element had several terms: transform into a PHRASE or
754
// NEAR xapian query, the elements of which can themselves be OR
941
// NEAR xapian query, the elements of which can themselves be OR
755
// queries if the terms get expanded by stemming or wildcards (we
942
// queries if the terms get expanded by stemming or wildcards (we
756
// don't do stemming for PHRASE though)
943
// don't do stemming for PHRASE though)
757
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, 
944
void StringToXapianQ::processPhraseOrNear(TextSplitQ *splitData, 
945
                    int mods,
758
                      vector<Xapian::Query> &pqueries,
946
                      vector<Xapian::Query> &pqueries,
759
                      bool useNear, int slack, int mods)
947
                      bool useNear, int slack)
760
{
948
{
761
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
949
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
762
    Xapian::Query::OP_PHRASE;
950
    Xapian::Query::OP_PHRASE;
763
    vector<Xapian::Query> orqueries;
951
    vector<Xapian::Query> orqueries;
764
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
952
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
...
...
767
    vector<vector<string> >groups;
955
    vector<vector<string> >groups;
768
956
769
    string prefix;
957
    string prefix;
770
    const FieldTraits *ftp;
958
    const FieldTraits *ftp;
771
    if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
959
    if (!m_field.empty() && m_db.fieldToTraits(m_field, &ftp)) {
772
    prefix = ftp->pfx;
960
    prefix = wrap_prefix(ftp->pfx);
773
    }
961
    }
774
962
775
    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
963
    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
776
    orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
964
    orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
777
    slack++;
965
    slack++;
...
...
788
    bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
976
    bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
789
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
977
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
790
        || hadmultiple
978
        || hadmultiple
791
#endif // single OR inside NEAR
979
#endif // single OR inside NEAR
792
        ;
980
        ;
793
981
  int lmods = mods;
982
  if (nostemexp)
983
      lmods |= SearchDataClause::SDCM_NOSTEMMING;
794
    string sterm;
984
    string sterm;
795
    vector<string> exp;
985
    vector<string> exp;
796
    expandTerm(nostemexp, *it, exp, sterm, prefix);
986
    expandTerm(lmods, *it, exp, sterm, prefix);
797
    LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
987
    LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
798
    listVector("", exp);
988
    listVector("", exp);
799
    // groups is used for highlighting, we don't want prefixes in there.
989
    // groups is used for highlighting, we don't want prefixes in there.
800
    vector<string> noprefs;
990
    vector<string> noprefs;
801
    for (vector<string>::const_iterator it = exp.begin(); 
991
    for (vector<string>::const_iterator it = exp.begin(); 
...
...
880
 *     composition of the phrase terms (no stem expansion in this case)
1070
 *     composition of the phrase terms (no stem expansion in this case)
881
 * @return the subquery count (either or'd stem-expanded terms or phrase word
1071
 * @return the subquery count (either or'd stem-expanded terms or phrase word
882
 *   count)
1072
 *   count)
883
 */
1073
 */
884
bool StringToXapianQ::processUserString(const string &iq,
1074
bool StringToXapianQ::processUserString(const string &iq,
1075
                  int mods, 
885
                    string &ermsg,
1076
                    string &ermsg,
886
                    vector<Xapian::Query> &pqueries,
1077
                    vector<Xapian::Query> &pqueries,
887
                  const StopList& stops,
888
                    int slack, 
1078
                    int slack, 
889
                    bool useNear
1079
                    bool useNear
890
                    )
1080
                    )
891
{
1081
{
892
    LOGDEB(("StringToXapianQ:: query string: [%s], slack %d, near %d\n", iq.c_str(), slack, useNear));
1082
    LOGDEB(("StringToXapianQ:: qstr [%s] mods 0x%x slack %d near %d\n", 
1083
      iq.c_str(), mods, slack, useNear));
893
    ermsg.erase();
1084
    ermsg.erase();
1085
1086
    const StopList stops = m_db.getStopList();
894
1087
895
    // Simple whitespace-split input into user-level words and
1088
    // Simple whitespace-split input into user-level words and
896
    // double-quoted phrases: word1 word2 "this is a phrase". 
1089
    // double-quoted phrases: word1 word2 "this is a phrase". 
897
    //
1090
    //
898
    // The text splitter may further still decide that the resulting
1091
    // The text splitter may further still decide that the resulting
...
...
906
    // expansion and transform into an appropriate Xapian::Query
1099
    // expansion and transform into an appropriate Xapian::Query
907
    try {
1100
    try {
908
    for (vector<string>::iterator it = phrases.begin(); 
1101
    for (vector<string>::iterator it = phrases.begin(); 
909
         it != phrases.end(); it++) {
1102
         it != phrases.end(); it++) {
910
        LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
1103
        LOGDEB0(("strToXapianQ: phrase/word: [%s]\n", it->c_str()));
1104
      // Anchoring modifiers
911
        int mods = stringToMods(*it);
1105
        int amods = stringToMods(*it);
912
        int terminc = mods != 0 ? 1 : 0;
1106
        int terminc = amods != 0 ? 1 : 0;
1107
      mods |= amods;
913
        // If there are multiple spans in this element, including
1108
        // If there are multiple spans in this element, including
914
        // at least one composite, we have to increase the slack
1109
        // at least one composite, we have to increase the slack
915
        // else a phrase query including a span would fail. 
1110
        // else a phrase query including a span would fail. 
916
        // Ex: "term0@term1 term2" is onlyspans-split as:
1111
        // Ex: "term0@term1 term2" is onlyspans-split as:
917
        //   0 term0@term1             0   12
1112
        //   0 term0@term1             0   12
...
...
928
        TermProcQ tpq;
1123
        TermProcQ tpq;
929
        TermProc *nxt = &tpq;
1124
        TermProc *nxt = &tpq;
930
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
1125
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
931
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
1126
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
932
            //tpcommon.onlygrams(true);
1127
            //tpcommon.onlygrams(true);
933
        TermProcPrep tpprep(nxt); nxt = &tpprep;
1128
        TermProcPrep tpprep(nxt);
1129
#ifndef RCL_INDEX_STRIPCHARS
1130
      if (o_index_stripchars)
1131
#endif
1132
      nxt = &tpprep;
934
1133
935
        TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
1134
        TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
936
                          TextSplit::TXTS_KEEPWILD), 
1135
                         TextSplit::TXTS_KEEPWILD), 
937
                                 stops, nxt);
1136
              stops, nxt);
938
        tpq.setTSQ(&splitter);
1137
        tpq.setTSQ(&splitter);
939
        splitter.text_to_words(*it);
1138
        splitter.text_to_words(*it);
940
1139
941
        slack += splitter.lastpos - splitter.terms.size() + 1;
1140
        slack += splitter.lastpos - splitter.terms.size() + 1;
942
1141
943
        LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
1142
        LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
944
        switch (splitter.terms.size() + terminc) {
1143
        switch (splitter.terms.size() + terminc) {
945
        case 0: 
1144
        case 0: 
946
        continue;// ??
1145
        continue;// ??
947
        case 1: 
1146
        case 1: {
1147
      int lmods = mods;
1148
      if (splitter.nostemexps.front())
1149
          lmods |= SearchDataClause::SDCM_NOSTEMMING;
948
        m_hld.ugroups.push_back(vector<string>(1, *it));
1150
        m_hld.ugroups.push_back(vector<string>(1, *it));
949
        processSimpleSpan(splitter.terms.front(), 
1151
        processSimpleSpan(splitter.terms.front(), lmods, pqueries);
950
                                  splitter.nostemexps.front(), pqueries);
1152
      }
951
        break;
1153
        break;
952
        default:
1154
        default:
953
        m_hld.ugroups.push_back(vector<string>(1, *it));
1155
        m_hld.ugroups.push_back(vector<string>(1, *it));
954
        processPhraseOrNear(&splitter, pqueries, useNear, slack, mods);
1156
        processPhraseOrNear(&splitter, mods, pqueries, useNear, slack);
955
        }
1157
        }
956
    }
1158
    }
957
    } catch (const Xapian::Error &e) {
1159
    } catch (const Xapian::Error &e) {
958
    ermsg = e.get_msg();
1160
    ermsg = e.get_msg();
959
    } catch (const string &s) {
1161
    } catch (const string &s) {
...
...
969
    }
1171
    }
970
    return true;
1172
    return true;
971
}
1173
}
972
1174
973
// Translate a simple OR, AND, or EXCL search clause. 
1175
// Translate a simple OR, AND, or EXCL search clause. 
974
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, 
1176
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)
975
                     const string& stemlang)
976
{
1177
{
977
    const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
978
  stemlang;
979
    LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
1178
    LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
980
         stemlang.c_str()));
1179
         getStemLang().c_str()));
981
1180
982
    Xapian::Query *qp = (Xapian::Query *)p;
1181
    Xapian::Query *qp = (Xapian::Query *)p;
983
    *qp = Xapian::Query();
1182
    *qp = Xapian::Query();
984
1183
985
    Xapian::Query::op op;
1184
    Xapian::Query::op op;
...
...
998
    // do it if there are wildcards anywhere, this would skew the results.
1197
    // do it if there are wildcards anywhere, this would skew the results.
999
    bool doBoostUserTerm = 
1198
    bool doBoostUserTerm = 
1000
    (m_parentSearch && !m_parentSearch->haveWildCards()) || 
1199
    (m_parentSearch && !m_parentSearch->haveWildCards()) || 
1001
    (m_parentSearch == 0 && !m_haveWildCards);
1200
    (m_parentSearch == 0 && !m_haveWildCards);
1002
1201
1003
    StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm);
1202
    StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
1004
    if (!tr.processUserString(m_text, m_reason, pqueries, db.getStopList()))
1203
    if (!tr.processUserString(m_text, getModifiers(), m_reason, pqueries))
1005
    return false;
1204
    return false;
1006
    if (pqueries.empty()) {
1205
    if (pqueries.empty()) {
1007
    LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
1206
    LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
1008
    return true;
1207
    return true;
1009
    }
1208
    }
...
...
1022
//
1221
//
1023
// We do not split the entry any more (used to do some crazy thing
1222
// We do not split the entry any more (used to do some crazy thing
1024
// about expanding multiple fragments in the past). We just take the
1223
// about expanding multiple fragments in the past). We just take the
1025
// value blanks and all and expand this against the indexed unsplit
1224
// value blanks and all and expand this against the indexed unsplit
1026
// file names
1225
// file names
1027
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p, 
1226
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
1028
                       const string&)
1029
{
1227
{
1030
    Xapian::Query *qp = (Xapian::Query *)p;
1228
    Xapian::Query *qp = (Xapian::Query *)p;
1031
    *qp = Xapian::Query();
1229
    *qp = Xapian::Query();
1032
1230
1033
    vector<string> names;
1231
    vector<string> names;
...
...
1039
    }
1237
    }
1040
    return true;
1238
    return true;
1041
}
1239
}
1042
1240
1043
// Translate NEAR or PHRASE clause. 
1241
// Translate NEAR or PHRASE clause. 
1044
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, 
1242
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)
1045
                   const string& stemlang)
1046
{
1243
{
1047
    const string& l_stemlang = (m_modifiers&SDCM_NOSTEMMING)? cstr_null:
1048
  stemlang;
1049
    LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
1244
    LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));
1050
1245
1051
    Xapian::Query *qp = (Xapian::Query *)p;
1246
    Xapian::Query *qp = (Xapian::Query *)p;
1052
    *qp = Xapian::Query();
1247
    *qp = Xapian::Query();
1053
1248
...
...
1067
    if (m_text.find('\"') != string::npos) {
1262
    if (m_text.find('\"') != string::npos) {
1068
    m_text = neutchars(m_text, "\"");
1263
    m_text = neutchars(m_text, "\"");
1069
    }
1264
    }
1070
    string s = cstr_dquote + m_text + cstr_dquote;
1265
    string s = cstr_dquote + m_text + cstr_dquote;
1071
    bool useNear = (m_tp == SCLT_NEAR);
1266
    bool useNear = (m_tp == SCLT_NEAR);
1072
    StringToXapianQ tr(db, m_hldata, m_field, l_stemlang, doBoostUserTerm);
1267
    StringToXapianQ tr(db, m_hldata, m_field, getStemLang(), doBoostUserTerm);
1073
    if (!tr.processUserString(s, m_reason, pqueries, db.getStopList(),
1268
    if (!tr.processUserString(s, getModifiers(), m_reason, pqueries, 
1074
                  m_slack, useNear))
1269
                  m_slack, useNear))
1075
    return false;
1270
    return false;
1076
    if (pqueries.empty()) {
1271
    if (pqueries.empty()) {
1077
    LOGERR(("SearchDataClauseDist: resolved to null query\n"));
1272
    LOGERR(("SearchDataClauseDist: resolved to null query\n"));
1078
    return true;
1273
    return true;