Parent: [18b357] (diff)

Child: [2777fe] (diff)

Download this file

rcldb.h    256 lines (197 with data), 8.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _DB_H_INCLUDED_
#define _DB_H_INCLUDED_
/* @(#$Id: rcldb.h,v 1.52 2007-06-19 08:36:24 dockes Exp $ (C) 2004 J.F.Dockes */
#include <string>
#include <list>
#include <vector>
#include "refcntr.h"
#include "rcldoc.h"
#include "stoplist.h"
#ifndef NO_NAMESPACES
using std::string;
using std::list;
using std::vector;
#endif
// rcldb defines an interface for a 'real' text database. The current
// implementation uses xapian only, and xapian-related code is in rcldb.cpp
// If support was added for other backend, the xapian code would be moved in
// rclxapian.cpp, another file would be created for the new backend, and the
// configuration/compile/link code would be adjusted to allow choosing. There
// is no plan for supporting multiple different backends.
//
// In no case does this try to implement a useful virtualized text-db interface
// The main goal is simplicity and good matching to usage inside the recoll
// user interface. In other words, this is not exhaustive or well-designed or
// reusable.
struct stat;
#ifndef NO_NAMESPACES
namespace Rcl {
#endif
class SearchData;
class Native;
class TermIter;
class TermMatchEntry {
public:
TermMatchEntry() : wcf(0) {}
TermMatchEntry(const string&t, int f) : term(t), wcf(f) {}
TermMatchEntry(const string&t) : term(t), wcf(0) {}
bool operator==(const TermMatchEntry &o) { return term == o.term;}
bool operator<(const TermMatchEntry &o) { return term < o.term;}
string term;
int wcf; // Within collection frequency
};
/**
* Wrapper class for the native database.
*/
class Db {
public:
/* General stuff (valid for query or update) ****************************/
Db();
~Db();
enum OpenMode {DbRO, DbUpd, DbTrunc};
// KEEP_UPDATED is internal use by reOpen() only
enum QueryOpts {QO_NONE=0, QO_STEM = 1, QO_KEEP_UPDATED = 8};
bool open(const string &dbdir, const string &stoplistfn,
OpenMode mode, int qops = QO_NONE);
bool close();
bool isopen();
/** Retrieve main database directory */
string getDbDir();
/** Get explanation about last error */
string getReason() const {return m_reason;}
/** Return list of configured stop words */
const StopList& getStopList() const {return m_stops;}
/** Field name to prefix translation (ie: author -> 'A') */
bool fieldToPrefix(const string& fldname, string &pfx);
/* Update-related methods ******************************************/
/** Add document. The Doc class should have been filled as much as
possible depending on the document type */
bool add(const string &filename, const Doc &doc, const struct stat *stp);
/** Test if the db entry for the given filename/stat is up to date */
bool needUpdate(const string &filename, const struct stat *stp);
/** Remove documents that no longer exist in the file system. This
depends on the update map, which is built during
indexation. This should only be called after a full walk of
the file system, else the update map will not be complete, and
many documents will be deleted that shouldn't */
bool purge();
/** Delete document(s) for given filename */
bool purgeFile(const string &filename);
/** Create stem expansion database for given language. */
bool createStemDb(const string &lang);
/** Delete stem expansion database for given language. */
bool deleteStemDb(const string &lang);
/* Query-related methods ************************************/
/** Return total docs in db */
int docCnt();
// Parse query string and initialize query
bool setQuery(RefCntr<SearchData> q, int opts = QO_NONE,
const string& stemlang = "english");
bool getQueryTerms(list<string>& terms);
bool getMatchTerms(const Doc& doc, list<string>& terms);
/** Return a list of index terms that match the input string
* Expansion is performed either with either wildcard or regexp processing
* Stem expansion is performed if lang is not empty */
enum MatchType {ET_WILD, ET_REGEXP, ET_STEM};
bool termMatch(MatchType typ, const string &lang, const string &s,
list<TermMatchEntry>& result, int max = -1);
/** Add extra database for querying */
bool addQueryDb(const string &dir);
/** Remove extra database. if dir == "", remove all. */
bool rmQueryDb(const string &dir);
/** Tell if directory seems to hold xapian db */
static bool testDbDir(const string &dir);
/** Get document at rank i in current query.
This is probably vastly inferior to the type of interface in
Xapian, but we have to start with something simple to
experiment with the GUI. i is sequential from 0 to some value.
*/
bool getDoc(int i, Doc &doc, int *percent = 0);
/* Build synthetic abstract out of query terms and term position data */
bool makeDocAbstract(Doc &doc, string& abstract);
/** Get document for given filename and ipath */
bool getDoc(const string &fn, const string &ipath, Doc &doc, int *percent);
/** Expand query */
list<string> expand(const Doc &doc);
/** Get results count for current query */
int getResCnt();
/** Get a list of existing stemming databases */
std::list<std::string> getStemLangs();
/** Set parameters for synthetic abstract generation */
void setAbstractParams(int idxTrunc, int synthLen, int syntCtxLen);
/** Whole term list walking. */
TermIter *termWalkOpen();
bool termWalkNext(TermIter *, string &term);
void termWalkClose(TermIter *);
/** Test term existence */
bool termExists(const string& term);
/** Test if terms stem to different roots. */
bool stemDiffers(const string& lang, const string& term,
const string& base);
/** Filename wildcard expansion */
bool filenameWildExp(const string& exp, list<string>& names);
private:
string m_filterTopDir; // Current query filter on subtree top directory
vector<int> m_dbindices; // In case there is a postq filter: sequence of
// db indices that match
string m_reason; // Error explanation
// A place for things we don't want visible here.
friend class Native;
Native *m_ndb;
unsigned int m_qOpts;
// This is how long an abstract we keep or build from beginning of
// text when indexing. It only has an influence on the size of the
// db as we are free to shorten it again when displaying
int m_idxAbsTruncLen;
// This is the size of the abstract that we synthetize out of query
// term contexts at *query time*
int m_synthAbsLen;
// This is how many words (context size) we keep around query terms
// when building the abstract
int m_synthAbsWordCtxLen;
// Flush threshold. Megabytes of text indexed before we flush.
int m_flushMb;
// Text bytes indexed since beginning
long long m_curtxtsz;
// Text bytes at last flush
long long m_flushtxtsz;
// Text bytes at last fsoccup check
long long m_occtxtsz;
// Maximum file system occupation percentage
int m_maxFsOccupPc;
// Database directory
string m_basedir;
// List of directories for additional databases to query
list<string> m_extraDbs;
OpenMode m_mode;
vector<bool> updated;
StopList m_stops;
bool reOpen(); // Close/open, same mode/opts
bool stemExpand(const string &lang, const string &s,
list<TermMatchEntry>& result, int max = -1);
/* Copyconst and assignemt private and forbidden */
Db(const Db &) {}
Db & operator=(const Db &) {return *this;};
};
// Unaccent and lowercase data.
extern bool dumb_string(const string &in, string &out);
#ifndef NO_NAMESPACES
}
#endif // NO_NAMESPACES
#endif /* _DB_H_INCLUDED_ */