Switch to unified view

a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.3 2006-11-14 17:41:12 dockes Exp $ (C) 2006 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.4 2006-11-17 10:06:34 dockes Exp $ (C) 2006 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
19
 */
19
 */
20
20
21
// Handle translation from rcl's SearchData structures to Xapian Queries
21
// Handle translation from rcl's SearchData structures to Xapian Queries
22
22
23
#include <string>
23
#include <string>
24
#include <list>
24
#include <vector>
25
#ifndef NO_NAMESPACES
26
using namespace std;
27
#endif
28
25
29
#include "xapian.h"
26
#include "xapian.h"
30
27
31
#include "rcldb.h"
28
#include "rcldb.h"
32
#include "searchdata.h"
29
#include "searchdata.h"
...
...
34
#include "smallut.h"
31
#include "smallut.h"
35
#include "textsplit.h"
32
#include "textsplit.h"
36
#include "unacpp.h"
33
#include "unacpp.h"
37
#include "utf8iter.h"
34
#include "utf8iter.h"
38
35
36
#ifndef NO_NAMESPACES
37
using namespace std;
39
namespace Rcl {
38
namespace Rcl {
39
#endif
40
40
41
typedef  list<SearchDataClause *>::iterator qlist_it_t;
41
typedef  vector<SearchDataClause *>::iterator qlist_it_t;
42
typedef  vector<SearchDataClause *>::const_iterator qlist_cit_t;
42
43
43
bool SearchData::toNativeQuery(Rcl::Db &db, void *d, const string& stemlang)
44
bool SearchData::toNativeQuery(Rcl::Db &db, void *d, const string& stemlang)
44
{
45
{
45
    Xapian::Query xq;
46
    Xapian::Query xq;
46
    m_reason.erase();
47
    m_reason.erase();
...
...
69
70
70
    // Add the file type filtering clause if any
71
    // Add the file type filtering clause if any
71
    if (!m_filetypes.empty()) {
72
    if (!m_filetypes.empty()) {
72
    list<Xapian::Query> pqueries;
73
    list<Xapian::Query> pqueries;
73
    Xapian::Query tq;
74
    Xapian::Query tq;
74
    for (list<string>::iterator it = m_filetypes.begin(); 
75
    for (vector<string>::iterator it = m_filetypes.begin(); 
75
         it != m_filetypes.end(); it++) {
76
         it != m_filetypes.end(); it++) {
76
        string term = "T" + *it;
77
        string term = "T" + *it;
77
        LOGDEB(("Adding file type term: [%s]\n", term.c_str()));
78
        LOGDEB(("Adding file type term: [%s]\n", term.c_str()));
78
        tq = tq.empty() ? Xapian::Query(term) : 
79
        tq = tq.empty() ? Xapian::Query(term) : 
79
        Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
80
        Xapian::Query(Xapian::Query::OP_OR, tq, Xapian::Query(term));
...
...
88
// SearchData::addClause -- append one clause pointer to m_query.
// NOTE(review): side-by-side diff rendering -- old-revision and new-revision
// lines alternate, each preceded by its bare line number.
// An SCLT_EXCL clause is refused when this SearchData is of type SCLT_OR:
// an error is logged and false is returned (the newer revision additionally
// stores an explanation in m_reason). Otherwise the pointer is pushed onto
// m_query and true is returned. `cl` is dereferenced without a null check.
// The pointers held in m_query are deleted by SearchData::erase().
// Add clause to current list. OR lists cannot have EXCL clauses.
89
// Add clause to current list. OR lists cannot have EXCL clauses.
89
bool SearchData::addClause(SearchDataClause* cl)
90
bool SearchData::addClause(SearchDataClause* cl)
90
{
91
{
91
    if (m_tp == SCLT_OR && (cl->m_tp == SCLT_EXCL)) {
92
    if (m_tp == SCLT_OR && (cl->m_tp == SCLT_EXCL)) {
92
    LOGERR(("SearchData::addClause: cant add EXCL to OR list\n"));
93
    LOGERR(("SearchData::addClause: cant add EXCL to OR list\n"));
94
  // Newer revision only: record the reason for the refusal in m_reason
  m_reason = "No Negative (AND_NOT) clauses allowed in OR queries";
93
    return false;
95
    return false;
94
    }
96
    }
95
    m_query.push_back(cl);
97
    m_query.push_back(cl);
96
    return true;
98
    return true;
97
}
99
}
98
100
99
// SearchData::erase -- return this object to an empty state.
// NOTE(review): side-by-side diff rendering -- old-revision and new-revision
// lines alternate, each preceded by its bare line number.
// Deletes every clause pointer stored in m_query, then clears m_query,
// m_filetypes, m_topdir and m_description. The newer revision also logs the
// call, sets m_tp to SCLT_AND and clears m_reason.
// Make me all new
101
// Make me all new
100
void SearchData::erase() {
102
void SearchData::erase() {
103
    LOGDEB(("SearchData::erase\n"));
104
    m_tp = SCLT_AND;
101
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
105
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
102
    delete *it;
106
    delete *it;
103
    m_query.clear();
107
    m_query.clear();
104
    m_filetypes.clear();
108
    m_filetypes.clear();
105
    m_topdir.erase();
109
    m_topdir.erase();
106
    m_description.erase();
110
    m_description.erase();
111
    m_reason.erase();
107
}
112
}
108
113
109
// SearchData::fileNameOnly -- true when every clause in m_query reports
// isFileName(); returns false at the first clause that does not. An empty
// m_query therefore yields true.
// NOTE(review): side-by-side diff rendering -- old-revision and new-revision
// lines alternate, each preceded by its bare line number. The only change
// between revisions here is the placement of the opening brace.
// Am I a file name only search ? This is to turn off term highlighting
114
// Am I a file name only search ? This is to turn off term highlighting
110
bool SearchData::fileNameOnly() {
115
bool SearchData::fileNameOnly() 
116
{
111
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
117
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
112
    if (!(*it)->isFileName())
118
    if (!(*it)->isFileName())
113
        return false;
119
        return false;
114
    return true;
120
    return true;
115
}
121
}
116
122
123
// SearchData::getTerms -- forward the three output vectors to getTerms() of
// every clause in m_query, in list order, and return true unconditionally
// (the per-clause return values are not checked).
// NOTE(review): this function exists only in the newer revision of the diff;
// the bare numbers between the lines are its new-side line numbers.
// NOTE(review): what `gslks` holds is not visible in this block -- confirm
// against SearchDataClause::getTerms.
// Extract all terms and term groups
124
bool SearchData::getTerms(vector<string>& terms, 
125
            vector<vector<string> >& groups,
126
            vector<int>& gslks) const
127
{
128
    for (qlist_cit_t it = m_query.begin(); it != m_query.end(); it++)
129
  (*it)->getTerms(terms, groups, gslks);
130
    return true;
131
}
132
117
// wsQData -- TextSplitCB implementation that accumulates every word passed
// to takeword() into the public `terms` vector.
// NOTE(review): side-by-side diff rendering -- old-revision and new-revision
// lines alternate, each preceded by its bare line number. dumball() below
// carries old-side numbers only, i.e. it was removed in the newer revision.
// Splitter callback for breaking a user query string into simple
133
// Splitter callback for breaking a user query string into simple
118
// terms and phrases
134
// terms and phrases. 
119
class wsQData : public TextSplitCB {
135
class wsQData : public TextSplitCB {
120
 public:
136
 public:
121
    vector<string> terms;
137
    vector<string> terms;
122
    // Debug
138
    // Debug
123
    // Returns all collected terms as one string, each formatted as "[term] "
    string catterms() {
139
    string catterms() {
124
    string s;
140
    string s;
125
    for (unsigned int i = 0; i < terms.size(); i++) {
141
    for (unsigned int i = 0; i < terms.size(); i++)
126
        s += "[" + terms[i] + "] ";
142
        s += "[" + terms[i] + "] ";
127
  }
128
    return s;
143
    return s;
129
    }
144
    }
130
    // Stores the word; the three int arguments are ignored. Always returns true.
    bool takeword(const std::string &term, int , int, int) {
145
    bool takeword(const std::string &term, int , int, int) {
131
    LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
146
    LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
132
    terms.push_back(term);
147
    terms.push_back(term);
133
    return true;
148
    return true;
134
    }
149
    }
135
    // Decapital + deaccent all terms 
136
    // (old revision only) replaces each term in place with the output of dumb_string()
    void dumball() {
137
  for (vector<string>::iterator it=terms.begin(); it !=terms.end();it++){
138
      string dumb;
139
      dumb_string(*it, dumb);
140
      *it = dumb;
141
  }
142
    }
143
};
150
};
144
151
145
/** Possibly expand term into its stem siblings, make them dumb strings */
152
// StringToXapianQ -- helper object that turns a user query string into a list
// of Xapian queries (translate) and remembers the terms and term groups
// produced while doing so (m_terms / m_groups, read back through getTerms).
// It holds a reference to the Db passed to the constructor, so the Db must
// remain valid while the object is in use.
// NOTE(review): side-by-side diff rendering. The class exists only in the
// newer revision; the single `static void maybeStemExp(Db& db, ...` line
// below is the old revision's free-function signature, interleaved by the
// diff view.
// This used to be a static function, but we couldn't just keep adding
153
// parameters to the interface!
154
class StringToXapianQ {
155
public:
156
    StringToXapianQ(Db& db) : m_db(db) { }
157
    // slack defaults to 0 and useNear to false; defined out of line
    bool translate(const string &iq,
158
         const string& stemlang,
159
         string &ermsg,
160
         list<Xapian::Query> &pqueries,
161
         int slack = 0, bool useNear = false);
162
    // Appends (does not replace) m_terms and m_groups to the caller's
    // vectors; always returns true
    bool getTerms(vector<string>& terms, 
163
        vector<vector<string> >& groups) 
164
    {
165
  terms.insert(terms.end(), m_terms.begin(), m_terms.end());
166
  groups.insert(groups.end(), m_groups.begin(), m_groups.end());
167
  return true;
168
    }
169
private:
146
static void maybeStemExp(Db& db, const string& stemlang, const string& term, 
170
    void maybeStemExp(const string& stemlang, const string& term, 
171
            list<string>& exp);
172
173
    Db& m_db;
174
    // Single terms and phrases resulting from breaking up text;
175
    vector<string>          m_terms;
176
    vector<vector<string> > m_groups; 
177
};
178
179
// StringToXapianQ::maybeStemExp -- fill `exp` with the search terms to use
// for `term`.
// NOTE(review): side-by-side diff rendering -- old-revision and new-revision
// lines alternate, each preceded by its bare line number. The description
// below is for the newer revision:
//  - an empty term clears `exp` and returns;
//  - term1 is the dumb_string() form of term;
//  - stem expansion is disabled when stemlang is empty, or when the first
//    character of the unacmaybefold(..., false) form of term differs from
//    the first character of its unacmaybefold(..., true) form (the
//    "first letter is a majuscule" test described in the comment below);
//  - when disabled, exp becomes the single-element list {term1}; otherwise
//    exp is m_db.stemExpand(stemlang, term1).
/** Make term dumb and possibly expand it into its stem siblings */
180
void StringToXapianQ::maybeStemExp(const string& stemlang, 
181
                 const string& term, 
147
             list<string>& exp)
182
                 list<string>& exp)
148
{
183
{
149
    LOGDEB(("maybeStemExp: [%s]\n", term.c_str()));
184
    LOGDEB2(("maybeStemExp: [%s]\n", term.c_str()));
185
    if (term.empty()) {
186
  exp.clear();
187
  return;
188
    }
189
150
    string term1;
190
    string term1;
151
    dumb_string(term, term1);
191
    dumb_string(term, term1);
152
    if (!stemlang.empty()) {
192
153
  bool nostemexp = false;
193
    bool nostemexp = stemlang.empty() ? true : false;
194
    if (!nostemexp) {
154
    // Check if the first letter is a majuscule in which
195
    // Check if the first letter is a majuscule in which
155
    // case we do not want to do stem expansion. Note that
196
    // case we do not want to do stem expansion. Note that
156
    // the test is convoluted and possibly problematic
197
    // the test is convoluted and possibly problematic
157
  if (term.length() > 0) {
198
158
        string noacterm,noaclowterm;
199
    string noacterm,noaclowterm;
159
        if (unacmaybefold(term, noacterm, "UTF-8", false) &&
200
    if (unacmaybefold(term, noacterm, "UTF-8", false) &&
160
      unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
201
        unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
161
      Utf8Iter it1(noacterm);
202
        Utf8Iter it1(noacterm);
162
      Utf8Iter it2(noaclowterm);
203
        Utf8Iter it2(noaclowterm);
163
      if (*it1 != *it2)
204
        if (*it1 != *it2)
164
            nostemexp = true;
205
        nostemexp = true;
165
      }
166
    }
206
    }
167
    LOGDEB1(("Term: %s stem expansion: %s\n", 
207
    // Fix: the format has two %s conversions, so the "no"/"yes" argument
    // that the old revision passed must be supplied here as well.
    LOGDEB1(("Term: %s stem expansion: %s\n", term.c_str(), nostemexp?"no":"yes"));
168
       term.c_str(), nostemexp?"no":"yes"));
208
    }
209
169
  if (!nostemexp) {
210
    if (nostemexp) {
211
  exp = list<string>(1, term1);
212
    } else {
170
        exp = db.stemExpand(stemlang, term1);
213
    exp = m_db.stemExpand(stemlang, term1);
171
      return;
214
    }
172
  }
215
}
173
    }
174
216
175
    exp.push_back(term1);
217
/** 
176
}
177
178
/** Turn string into list of xapian queries. There is little
218
 * Turn string into list of xapian queries. There is little
179
 * interpretation done on the string (no +term -term or filename:term
219
 * interpretation done on the string (no +term -term or filename:term
180
 * stuff). We just separate words and phrases, and interpret
220
 * stuff). We just separate words and phrases, and interpret
181
 * capitalized terms as wanting no stem expansion. 
221
 * capitalized terms as wanting no stem expansion. 
182
 * The final list contains one query for each term or phrase
222
 * The final list contains one query for each term or phrase
183
 *   - Elements corresponding to a stem-expanded part are an OP_OR
223
 *   - Elements corresponding to a stem-expanded part are an OP_OR
184
 *    composition of the stem-expanded terms (or a single term query).
224
 *     composition of the stem-expanded terms (or a single term query).
185
 *   - Elements corresponding to a phrase are an OP_PHRASE composition of the
225
 *   - Elements corresponding to a phrase are an OP_PHRASE composition of the
186
 *     phrase terms (no stem expansion in this case)
226
 *     phrase terms (no stem expansion in this case)
187
 * @return the subquery count (either or'd stem-expanded terms or phrase word
227
 * @return the subquery count (either or'd stem-expanded terms or phrase word
188
 *   count)
228
 *   count)
189
 */
229
 */
190
static bool stringToXapianQueries(const string &iq,
230
bool StringToXapianQ::translate(const string &iq,
191
                  const string& stemlang,
231
                const string& stemlang,
192
                Db& db,
193
                  string &ermsg,
232
                string &ermsg,
194
                  list<Xapian::Query> &pqueries,
233
                list<Xapian::Query> &pqueries,
195
                  int slack = 0, bool useNear = false)
234
                int slack, bool useNear)
196
{
235
{
197
    string qstring = iq;
236
    string qstring = iq;
198
    bool opt_stemexp = !stemlang.empty();
237
    bool opt_stemexp = !stemlang.empty();
199
    ermsg.erase();
238
    ermsg.erase();
239
    m_terms.clear();
240
    m_groups.clear();
200
241
201
    // Split into words and phrases (word1 word2 "this is a phrase"):
242
    // Split into words and phrases (word1 word2 "this is a phrase"):
202
    list<string> phrases;
243
    list<string> phrases;
203
    stringToStrings(qstring, phrases);
244
    stringToStrings(qstring, phrases);
204
245
...
...
229
        case 0: continue;// ??
270
        case 0: continue;// ??
230
        case 1: // Not a real phrase: one term
271
        case 1: // Not a real phrase: one term
231
        {
272
        {
232
            string term = splitData.terms.front();
273
            string term = splitData.terms.front();
233
            list<string> exp;  
274
            list<string> exp;  
234
            maybeStemExp(db, stemlang, term, exp);
275
            maybeStemExp(stemlang, term, exp);
235
            // Push either term or OR of stem-expanded set
276
            // Push either term or OR of stem-expanded set
236
            pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
277
            pqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
237
                             exp.begin(), exp.end()));
278
                             exp.begin(), exp.end()));
279
          m_terms.insert(m_terms.end(), exp.begin(), exp.end());
238
        }
280
        }
239
        break;
281
        break;
240
282
241
        default:
283
        default:
242
        // Phrase/near
284
        // Phrase/near
243
        Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
285
        Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
244
        Xapian::Query::OP_PHRASE;
286
        Xapian::Query::OP_PHRASE;
245
        list<Xapian::Query> orqueries;
287
        list<Xapian::Query> orqueries;
246
        bool hadmultiple = false;
288
        bool hadmultiple = false;
247
        string nolang, lang;
289
        string nolang, lang;
290
      vector<string> dumbterms;
248
        for (vector<string>::iterator it = splitData.terms.begin();
291
        for (vector<string>::iterator it = splitData.terms.begin();
249
             it != splitData.terms.end(); it++) {
292
             it != splitData.terms.end(); it++) {
250
            list<string>exp;
293
            list<string>exp;
251
            lang = (op == Xapian::Query::OP_PHRASE || hadmultiple) ?
294
            lang = (op == Xapian::Query::OP_PHRASE || hadmultiple) ?
252
            nolang : stemlang;
295
            nolang : stemlang;
253
            maybeStemExp(db, lang, *it, exp);
296
            maybeStemExp(lang, *it, exp);
297
          dumbterms.insert(dumbterms.end(), exp.begin(), exp.end());
298
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
254
            if (exp.size() > 1)
299
            if (exp.size() > 1) 
255
            hadmultiple = true;
300
            hadmultiple = true;
301
#endif
256
            orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
302
            orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
257
                              exp.begin(), exp.end()));
303
                              exp.begin(), exp.end()));
258
        }
304
        }
259
        pqueries.push_back(Xapian::Query(op,
305
        pqueries.push_back(Xapian::Query(op,
260
                         orqueries.begin(),
306
                         orqueries.begin(),
261
                         orqueries.end(),
307
                         orqueries.end(),
262
                     splitData.terms.size() + slack));
308
                     splitData.terms.size() + slack));
309
      m_groups.push_back(dumbterms);
263
        }
310
        }
264
    }
311
    }
265
    } catch (const Xapian::Error &e) {
312
    } catch (const Xapian::Error &e) {
266
    ermsg = e.get_msg();
313
    ermsg = e.get_msg();
267
    } catch (const string &s) {
314
    } catch (const string &s) {
...
...
280
327
281
// SearchDataClauseSimple::toNativeQuery -- build the Xapian query for a
// simple AND / OR / EXCL clause from m_text.
// NOTE(review): side-by-side diff rendering -- old-revision and new-revision
// lines alternate, each preceded by its bare line number.
//  p        : treated as a Xapian::Query*; the pointee is first reset to an
//             empty query and, on success, receives the combined query.
//  stemlang : passed through to the string-to-query translation.
// Returns false when m_tp is not AND/OR/EXCL or when the translation fails
// (m_reason is the error-message argument of that call). Returns true
// otherwise, including when the translation yields no sub-query, in which
// case *p is left as the empty query.
// The newer revision also clears m_terms/m_groups on entry and refills them
// from the translator on success.
// Translate a simple OR, AND, or EXCL search clause. 
328
// Translate a simple OR, AND, or EXCL search clause. 
282
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, 
329
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p, 
283
                       const string& stemlang)
330
                       const string& stemlang)
284
{
331
{
332
    m_terms.clear();
333
    m_groups.clear();
285
    Xapian::Query *qp = (Xapian::Query *)p;
334
    Xapian::Query *qp = (Xapian::Query *)p;
286
    *qp = Xapian::Query();
335
    *qp = Xapian::Query();
287
336
288
    Xapian::Query::op op;
337
    Xapian::Query::op op;
289
    switch (m_tp) {
338
    switch (m_tp) {
290
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
339
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
340
  // EXCL will be set with AND_NOT in the list. So it's an OR list here
291
    case SCLT_OR: 
341
    case SCLT_OR: 
292
    case SCLT_EXCL: op = Xapian::Query::OP_OR; break;
342
    case SCLT_EXCL: op = Xapian::Query::OP_OR; break;
293
    default:
343
    default:
294
    LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
344
    LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
295
    return false;
345
    return false;
296
    }
346
    }
297
    list<Xapian::Query> pqueries;
347
    list<Xapian::Query> pqueries;
348
    StringToXapianQ tr(db);
298
    if (!stringToXapianQueries(m_text, stemlang, db, m_reason, pqueries))
349
    if (!tr.translate(m_text, stemlang, m_reason, pqueries))
299
    return false;
350
    return false;
300
    if (pqueries.empty()) {
351
    if (pqueries.empty()) {
301
    LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
352
    LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
302
    return true;
353
    return true;
303
    }
354
    }
355
    tr.getTerms(m_terms, m_groups);
304
    // Combine the per-term / per-phrase sub-queries with the operator chosen above
    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
356
    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
305
    return true;
357
    return true;
306
}
358
}
307
359
308
// Translate a FILENAME search clause. 
360
// Translate a FILENAME search clause. 
...
...
317
    // Build a query out of the matching file name terms.
369
    // Build a query out of the matching file name terms.
318
    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
370
    *qp = Xapian::Query(Xapian::Query::OP_OR, names.begin(), names.end());
319
    return true;
371
    return true;
320
}
372
}
321
373
322
// SearchDataClauseDist::toNativeQuery -- build the Xapian query for a NEAR or
// PHRASE clause from m_text.
// NOTE(review): side-by-side diff rendering -- old-revision and new-revision
// lines alternate, each preceded by its bare line number.
//  p        : treated as a Xapian::Query*; the pointee is first reset to an
//             empty query and, on success, receives the first translated
//             sub-query.
//  stemlang : passed through to the string-to-query translation.
// m_text is wrapped in double quotes before translation, and m_slack and
// useNear (true when m_tp == SCLT_NEAR) are forwarded to the translator.
// Returns false when the translation fails. Returns true otherwise, including
// when no sub-query results (in which case *p stays empty).
// NOTE(review): the local `nq` is declared but never used in this function.
// NOTE(review): the existing comments name stringToXapianQueries; in the
// newer revision the call is StringToXapianQ::translate.
// Translate NEAR or PHRASE clause. We're not handling the distance parameter
374
// Translate NEAR or PHRASE clause. 
323
// yet.
324
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, 
375
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p, 
325
                     const string& stemlang)
376
                     const string& stemlang)
326
{
377
{
378
    m_terms.clear();
379
    m_groups.clear();
380
327
    Xapian::Query *qp = (Xapian::Query *)p;
381
    Xapian::Query *qp = (Xapian::Query *)p;
328
    *qp = Xapian::Query();
382
    *qp = Xapian::Query();
329
383
330
    list<Xapian::Query> pqueries;
384
    list<Xapian::Query> pqueries;
331
    Xapian::Query nq;
385
    Xapian::Query nq;
386
387
    // Use stringToXapianQueries to lowercase and simplify the phrase
388
    // terms etc. The result should be a single element list
332
    string s = string("\"") + m_text + string("\"");
389
    string s = string("\"") + m_text + string("\"");
333
    bool useNear = m_tp == SCLT_NEAR;
390
    bool useNear = m_tp == SCLT_NEAR;
334
391
    StringToXapianQ tr(db);
335
    // Use stringToXapianQueries anyway to lowercase and simplify the
392
    if (!tr.translate(s, stemlang, m_reason, pqueries, m_slack, useNear))
336
    // phrase terms etc. The result should be a single element list
337
    if (!stringToXapianQueries(s, stemlang, db, m_reason, pqueries,
338
                 m_slack, useNear))
339
    return false;
393
    return false;
340
    if (pqueries.empty()) {
394
    if (pqueries.empty()) {
341
    LOGERR(("SearchDataClauseDist: resolved to null query\n"));
395
    LOGERR(("SearchDataClauseDist: resolved to null query\n"));
342
    return true;
396
    return true;
343
    }
397
    }
398
    tr.getTerms(m_terms, m_groups);
344
    // Only the first element of pqueries is used
    *qp = *pqueries.begin();
399
    *qp = *pqueries.begin();
345
    return true;
400
    return true;
346
}
401
}
347
402
348
} // Namespace Rcl
403
} // Namespace Rcl