Parent: [533068] (diff)

Download this file

searchdata.h    524 lines (458 with data), 16.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _SEARCHDATA_H_INCLUDED_
#define _SEARCHDATA_H_INCLUDED_
/**
* Structures to hold data coming almost directly from the gui
* and handle its translation to Xapian queries.
* This is not generic code, it reflects the choices made for the user
* interface, and it also knows some specifics of recoll's usage of Xapian
* (i.e. term prefixes)
*/
#include <string>
#include <vector>
#include <ostream>
#include "rcldb.h"
#include MEMORY_INCLUDE
#include "smallut.h"
#include "cstr.h"
#include "hldata.h"
class RclConfig;
class AdvSearch;
namespace Rcl {
/**
 * Search clause types.
 *
 * The enumerators are implicitly numbered from 0 in declaration order;
 * do not reorder them without checking every user of the numeric values.
 */
enum SClType {
    SCLT_AND,       ///< Conjunction (also usable as a SearchData combine type)
    SCLT_OR,        ///< Disjunction (also usable as a SearchData combine type)
    SCLT_FILENAME,  ///< File name search
    SCLT_PHRASE,    ///< Phrase search
    SCLT_NEAR,      ///< Proximity (NEAR) search
    SCLT_PATH,      ///< Path name filtering
    SCLT_SUB        ///< Subquery
};
class SearchDataClause;
class SearchDataClauseDist;
/**
 * A SearchData object represents a Recoll user query, for translation
 * into a Xapian query tree. This could probably be better called a
 * 'question'.
 *
 * This is a list of SearchDataClause objects combined through either
 * OR or AND.
 *
 * Clauses either reflect user entry in a query field: some text, a
 * clause type (AND/OR/NEAR etc.), possibly a distance, or are the
 * result of parsing query language input. A clause can also point to
 * another SearchData representing a subquery.
 *
 * The content of each clause when added may not be fully parsed yet
 * (may come directly from a gui field). It will be parsed and may be
 * translated to several queries in the Xapian sense, for example
 * several terms and phrases as would result from
 * ["this is a phrase" term1 term2] .
 *
 * This is why the clauses also have an AND/OR/... type. They are an
 * intermediate form between the primary user input and
 * the final Xapian::Query tree.
 *
 * For example, a phrase clause could be added either explicitly or
 * using double quotes: {SCLT_PHRASE, [this is a phrase]} or as
 * {SCLT_XXX, ["this is a phrase"]}
 */
class SearchData {
public:
    /**
     * Create an empty query.
     * @param tp how the clauses are combined. Any value other than
     *   SCLT_OR / SCLT_AND is replaced by SCLT_OR.
     * @param stemlang stemming language, later returned by getStemLang().
     */
    SearchData(SClType tp, const std::string& stemlang)
        : m_tp(tp), m_stemlang(stemlang)
    {
        if (m_tp != SCLT_OR && m_tp != SCLT_AND)
            m_tp = SCLT_OR;
        commoninit();
    }

    /** Create an empty AND query with an empty stemming language. */
    SearchData()
        : m_tp(SCLT_AND)
    {
        commoninit();
    }

    ~SearchData();

    /** Is there anything but a file name search in here ? */
    bool fileNameOnly();

    /** Do we have wildcards anywhere apart from filename searches ? */
    bool haveWildCards() const {return m_haveWildCards;}

    /** Translate to Xapian query. rcldb knows about the void* */
    bool toNativeQuery(Rcl::Db &db, void *);

    /** We become the owner of cl and will delete it */
    bool addClause(SearchDataClause* cl);

    /** If this is a simple query (one field only, no distance clauses),
     * add phrase made of query terms to query, so that docs containing the
     * user terms in order will have higher relevance. This must be called
     * before toNativeQuery().
     * @param threshold: don't use terms more frequent than the value
     *    (proportion of docs where they occur)
     */
    bool maybeAddAutoPhrase(Rcl::Db &db, double threshold);

    /** Stemming language given to the constructor (empty if none). */
    const std::string& getStemLang() const {return m_stemlang;}

    void setMinSize(size_t size) {m_minSize = size;}
    void setMaxSize(size_t size) {m_maxSize = size;}

    /** Set date span for filtering results.
     * The interval is copied. A null pointer is ignored and leaves the
     * current date filtering state untouched.
     */
    void setDateSpan(DateInterval *dip)
    {
        if (dip == 0)
            return;
        m_dates = *dip;
        m_haveDates = true;
    }

    /** Add file type for filtering results */
    void addFiletype(const std::string& ft) {m_filetypes.push_back(ft);}
    /** Add file type to not wanted list */
    void remFiletype(const std::string& ft) {m_nfiletypes.push_back(ft);}

    /** Retrieve error description */
    std::string getReason() const {return m_reason;}

    /** Return term expansion data. Mostly used by caller for highlighting
     */
    void getTerms(HighlightData& hldata) const;

    /**
     * Get/set the description field which is retrieved from xapian after
     * initializing the query. It is stored here for usage in the GUI.
     */
    std::string getDescription() const {return m_description;}
    void setDescription(const std::string& d) {m_description = d;}

    /** Return an XML version of the contents, for storage in search history
        by the GUI */
    std::string asXML();

    /** Set the combine type. No validation is performed here, unlike in
     * the constructor. */
    void setTp(SClType tp)
    {
        m_tp = tp;
    }

    SClType getTp() const
    {
        return m_tp;
    }

    /** Set the soft limit for term expansion (see m_softmaxexpand). */
    void setMaxExpand(int max)
    {
        m_softmaxexpand = max;
    }

    bool getAutoDiac() const {return m_autodiacsens;}
    bool getAutoCase() const {return m_autocasesens;}
    int getMaxExp() const {return m_maxexp;}
    int getMaxCl() const {return m_maxcl;}
    int getSoftMaxExp() const {return m_softmaxexpand;}

    void dump(std::ostream& o) const;

    friend class ::AdvSearch;

private:
    // Combine type. Only SCLT_AND or SCLT_OR here
    SClType m_tp;
    // The clauses
    std::vector<SearchDataClause*> m_query;
    // Restricted set of filetypes if not empty.
    std::vector<std::string> m_filetypes;
    // Excluded set of file types if not empty
    std::vector<std::string> m_nfiletypes;
    // Autophrase if set. Can't be part of the normal chain because
    // it uses OP_AND_MAYBE
    STD_SHARED_PTR<SearchDataClauseDist> m_autophrase;

    // Special stuff produced by input which looks like a clause but means
    // something else (date and size specs)
    bool m_haveDates;
    DateInterval m_dates; // Restrict to date interval
    size_t m_maxSize;
    size_t m_minSize;

    // Printable expanded version of the complete query, retrieved/set
    // from rcldb after the Xapian::setQuery() call
    std::string m_description;
    // Error diag
    std::string m_reason;
    bool m_haveWildCards;
    std::string m_stemlang;

    // Parameters set at the start of ToNativeQuery because they need
    // an rclconfig. Actually this does not make sense and it would be
    // simpler to just pass an rclconfig to the constructor;
    bool m_autodiacsens;
    bool m_autocasesens;
    int m_maxexp;
    int m_maxcl;

    // Parameters which are not part of the main query data but may influence
    // translation in special cases.
    // Maximum TermMatch (e.g. wildcard) expansion. This is normally set
    // from the configuration with a high default, but may be set to a lower
    // value during "find-as-you-type" operations from the GUI
    int m_softmaxexpand;

    // Collapse bogus subqueries generated by the query parser, mostly
    // so that we can check if this is an autophrase candidate (else
    // Xapian will do it anyway)
    void simplify();

    bool expandFileTypes(Rcl::Db &db, std::vector<std::string>& exptps);
    bool clausesToQuery(Rcl::Db &db, SClType tp,
                        std::vector<SearchDataClause*>& query,
                        std::string& reason, void *d);
    // Defined out of line; called by both constructors.
    void commoninit();

    /* Copy construction and assignment are private and forbidden. They are
       deliberately declared but NOT defined, so that an accidental copy
       from a member or friend fails at link time instead of silently
       producing an object with uninitialized members. */
    SearchData(const SearchData &);
    SearchData& operator=(const SearchData&);
};
class SearchDataClause {
public:
enum Modifier {SDCM_NONE=0, SDCM_NOSTEMMING=0x1, SDCM_ANCHORSTART=0x2,
SDCM_ANCHOREND=0x4, SDCM_CASESENS=0x8, SDCM_DIACSENS=0x10,
SDCM_NOTERMS=0x20, // Don't include terms for highlighting
SDCM_NOSYNS = 0x40, // Don't perform synonym expansion
};
enum Relation {REL_CONTAINS, REL_EQUALS, REL_LT, REL_LTE, REL_GT, REL_GTE};
SearchDataClause(SClType tp)
: m_tp(tp), m_parentSearch(0), m_haveWildCards(0),
m_modifiers(SDCM_NONE), m_weight(1.0), m_exclude(false),
m_rel(REL_CONTAINS)
{}
virtual ~SearchDataClause() {}
virtual bool toNativeQuery(Rcl::Db &db, void *) = 0;
bool isFileName() const {return m_tp == SCLT_FILENAME ? true: false;}
virtual std::string getReason() const {return m_reason;}
virtual void getTerms(HighlightData&) const {}
SClType getTp() const
{
return m_tp;
}
void setTp(SClType tp) {
m_tp = tp;
}
void setParent(SearchData *p)
{
m_parentSearch = p;
}
string getStemLang()
{
return (m_modifiers & SDCM_NOSTEMMING) || m_parentSearch == 0 ?
cstr_null : m_parentSearch->getStemLang();
}
bool getAutoDiac()
{
return m_parentSearch ? m_parentSearch->getAutoDiac() : false;
}
bool getAutoCase()
{
return m_parentSearch ? m_parentSearch->getAutoCase() : true;
}
int getMaxExp()
{
return m_parentSearch ? m_parentSearch->getMaxExp() : 10000;
}
size_t getMaxCl()
{
return m_parentSearch ? m_parentSearch->getMaxCl() : 100000;
}
int getSoftMaxExp()
{
return m_parentSearch ? m_parentSearch->getSoftMaxExp() : -1;
}
virtual void addModifier(Modifier mod)
{
m_modifiers = m_modifiers | mod;
}
virtual unsigned int getmodifiers() {
return m_modifiers;
}
virtual void setWeight(float w)
{
m_weight = w;
}
virtual bool getexclude() const
{
return m_exclude;
}
virtual void setexclude(bool onoff)
{
m_exclude = onoff;
}
virtual void setrel(Relation rel) {
m_rel = rel;
}
virtual Relation getrel() {
return m_rel;
}
virtual void dump(ostream& o) const;
friend class SearchData;
protected:
std::string m_reason;
SClType m_tp;
SearchData *m_parentSearch;
bool m_haveWildCards;
unsigned int m_modifiers;
float m_weight;
bool m_exclude;
Relation m_rel;
private:
SearchDataClause(const SearchDataClause&)
{
}
SearchDataClause& operator=(const SearchDataClause&)
{
return *this;
}
};
class TermProcQ;

/**
 * "Simple" data clause holding raw user-entered query text. The text may
 * contain several words and phrases, but carries no specified distance.
 */
class SearchDataClauseSimple : public SearchDataClause {
public:
    /**
     * @param tp clause type
     * @param txt raw user text
     * @param fld optional field specification (empty by default)
     */
    SearchDataClauseSimple(SClType tp, const std::string& txt,
                           const std::string& fld = std::string())
        : SearchDataClause(tp), m_text(txt), m_field(fld), m_curcl(0)
    {
        // Flag the clause if the text holds any of the cstr_minwilds
        // characters.
        const std::string::size_type wildpos =
            m_text.find_first_of(cstr_minwilds);
        m_haveWildCards = wildpos != std::string::npos;
    }

    /** Same as above with the arguments swapped and no field. */
    SearchDataClauseSimple(const std::string& txt, SClType tp)
        : SearchDataClause(tp), m_text(txt), m_curcl(0)
    {
        const std::string::size_type wildpos =
            m_text.find_first_of(cstr_minwilds);
        m_haveWildCards = wildpos != std::string::npos;
    }

    virtual ~SearchDataClauseSimple() {}

    /** Translate to Xapian query */
    virtual bool toNativeQuery(Rcl::Db &, void *);

    /** Append the highlighting data stored in this clause to hldata. */
    virtual void getTerms(HighlightData& hldata) const
    {
        hldata.append(m_hldata);
    }

    /** Raw user text, as given to the constructor. */
    virtual const std::string& gettext()
    {
        return m_text;
    }

    /** Field specification, possibly empty. */
    virtual const std::string& getfield()
    {
        return m_field;
    }

    virtual void setfield(const string& field)
    {
        m_field = field;
    }

    virtual void dump(ostream& o) const;

protected:
    // Raw user entry text.
    std::string m_text;
    // Field specification if any
    std::string m_field;
    // Highlighting data handed out by getTerms()
    HighlightData m_hldata;
    // Current count of Xapian clauses, to check against expansion limit
    size_t m_curcl;

    // The helpers below are defined out of line.
    bool processUserString(Rcl::Db &db, const string &iq,
                           std::string &ermsg,
                           void* pq, int slack = 0, bool useNear = false);

    bool expandTerm(Rcl::Db &db, std::string& ermsg, int mods,
                    const std::string& term,
                    std::vector<std::string>& exp,
                    std::string& sterm, const std::string& prefix,
                    std::vector<std::string>* multiwords = 0);

    // After splitting entry on whitespace: process non-phrase element
    void processSimpleSpan(Rcl::Db &db, string& ermsg, const string& span,
                           int mods, void *pq);

    // Process phrase/near element
    void processPhraseOrNear(Rcl::Db &db, string& ermsg, TermProcQ *splitData,
                             int mods, void *pq, bool useNear, int slack);
};
/**
 * File name search clause.
 *
 * This is a special case because term expansion is only performed
 * against the unsplit file name terms. Expanding against the field alone
 * is a big win, especially for file names: searches such as "*xx" become
 * much faster since the whole main index need not be scanned.
 */
class SearchDataClauseFilename : public SearchDataClauseSimple {
public:
    /** @param txt raw user text for the file name search */
    SearchDataClauseFilename(const std::string& txt)
        : SearchDataClauseSimple(txt, SCLT_FILENAME)
    {
        // Wild cards in file name searches are not counted: override
        // whatever the base class constructor detected.
        m_haveWildCards = false;
    }

    virtual ~SearchDataClauseFilename() {}

    virtual bool toNativeQuery(Rcl::Db &, void *);

    virtual void dump(ostream& o) const;
};
/**
 * Path name filtering clause. This one is special for historical reasons:
 *  - Path name filtering was first performed as a post-processing step
 *    using the url fields of the doc data records.
 *  - It was then implemented as special phrase searches on path elements
 *    prefixed with XP.
 * Until then, the directory filtering data was stored in the SearchData
 * object itself and not in the SearchDataClause tree. Only one clause,
 * later a list of clauses, were stored, and they were always ANDed
 * together.
 *
 * To make OR searching possible, directory clauses now live in a
 * dedicated SearchDataClause. This remains a special case because the
 * field uses non-standard phrase-like processing, reflected in index
 * storage by an empty element representing / (as "XP").
 *
 * A future version should use a standard phrase anchored to the start
 * when the path begins with /. This implies an index format change, and
 * is not important enough to warrant one by itself, so it has to wait
 * for the next format change.
 */
class SearchDataClausePath : public SearchDataClauseSimple {
public:
    /**
     * @param txt path text; stored with the "dir" field
     * @param excl initial value of the exclusion flag
     */
    SearchDataClausePath(const std::string& txt, bool excl = false)
        : SearchDataClauseSimple(SCLT_PATH, txt, "dir")
    {
        m_haveWildCards = false;
        m_exclude = excl;
    }

    virtual ~SearchDataClausePath() {}

    virtual bool toNativeQuery(Rcl::Db &, void *);

    virtual void dump(ostream& o) const;
};
/**
 * Clause built from a NEAR or PHRASE entry field: a single group of
 * text, plus the distance (slack) which applies to it.
 */
class SearchDataClauseDist : public SearchDataClauseSimple {
public:
    /**
     * @param tp clause type
     * @param txt raw user text
     * @param slack distance applying to the text group
     * @param fld optional field specification (empty by default)
     */
    SearchDataClauseDist(SClType tp, const std::string& txt, int slack,
                         const std::string& fld = std::string())
        : SearchDataClauseSimple(tp, txt, fld), m_slack(slack)
    {}

    virtual ~SearchDataClauseDist() {}

    virtual bool toNativeQuery(Rcl::Db &, void *);

    /** Current distance value. */
    virtual int getslack() const
    {
        return m_slack;
    }

    /** Replace the distance value. */
    virtual void setslack(int slack)
    {
        m_slack = slack;
    }

    virtual void dump(ostream& o) const;

private:
    // Distance applying to the text group
    int m_slack;
};
/**
 * Subquery clause: wraps another SearchData, shared with the caller,
 * and forwards query translation and term retrieval to it.
 */
class SearchDataClauseSub : public SearchDataClause {
public:
    /** @param sub the subquery. May be empty, in which case translation
     *   fails and no terms are reported. */
    SearchDataClauseSub(STD_SHARED_PTR<SearchData> sub)
        : SearchDataClause(SCLT_SUB), m_sub(sub)
    {
    }

    /**
     * Translate the subquery to a Xapian query.
     * @return the subquery's result. On failure the reason is stored for
     *   retrieval through getReason().
     */
    virtual bool toNativeQuery(Rcl::Db &db, void *p)
    {
        // Guard against an empty pointer instead of dereferencing it.
        if (!m_sub) {
            m_reason = "SearchDataClauseSub: no subquery";
            return false;
        }
        bool ret = m_sub->toNativeQuery(db, p);
        if (!ret)
            m_reason = m_sub->getReason();
        return ret;
    }

    /** Add the subquery's highlighting data, if there is a subquery. */
    virtual void getTerms(HighlightData& hldata) const
    {
        if (m_sub)
            m_sub->getTerms(hldata);
    }

    /** Shared pointer to the subquery (possibly empty). */
    virtual STD_SHARED_PTR<SearchData> getSub() {
        return m_sub;
    }

    virtual void dump(std::ostream& o) const;

protected:
    STD_SHARED_PTR<SearchData> m_sub;
};
} // Namespace Rcl
#endif /* _SEARCHDATA_H_INCLUDED_ */