Parent: [3d4fd3] (diff)

Child: [1835e9] (diff)

Download this file

rcldb_p.h    217 lines (184 with data), 7.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/* Copyright (C) 2007 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _rcldb_p_h_included_
#define _rcldb_p_h_included_
#include "autoconfig.h"
#include <map>
#include <mutex>
#include <xapian.h>
#ifdef IDX_THREADS
#include "workqueue.h"
#endif // IDX_THREADS
#include "xmacros.h"
namespace Rcl {
class Query;
#ifdef IDX_THREADS
// Task for the index update thread. This can be
// - add/update for a new / update documment
// - delete for a deleted document
// - purgeOrphans when a multidoc file is updated during a partial pass (no
// general purge). We want to remove subDocs that possibly don't
// exist anymore. We find them by their different sig
// txtlen and doc are only valid for add/update else, len is (size_t)-1 and doc
// is empty
class DbUpdTask {
public:
enum Op {AddOrUpdate, Delete, PurgeOrphans};
// Note that udi and uniterm are strictly equivalent and are
// passed both just to avoid recomputing uniterm which is
// available on the caller site.
// Take some care to avoid sharing string data (if string impl is cow)
DbUpdTask(Op _op, const string& ud, const string& un,
Xapian::Document *d, size_t tl, string& rztxt
) : op(_op), udi(ud.begin(), ud.end()), uniterm(un.begin(), un.end()),
doc(d), txtlen(tl) {
rawztext.swap(rztxt);
}
// Udi and uniterm equivalently designate the doc
Op op;
string udi;
string uniterm;
Xapian::Document *doc;
// txtlen is used to update the flush interval. It's -1 for a
// purge because we actually don't know it, and the code fakes a
// text length based on the term count.
size_t txtlen;
string rawztext; // Compressed doc text
};
#endif // IDX_THREADS
class TextSplitDb;
// A class for data and methods that would have to expose
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
// 2 different ones for indexing or query as there is not much in
// common.
class Db::Native {
public:
Db *m_rcldb; // Parent
bool m_isopen;
bool m_iswritable;
bool m_noversionwrite; //Set if open failed because of version mismatch!
bool m_storetext{false};
#ifdef IDX_THREADS
WorkQueue<DbUpdTask*> m_wqueue;
std::mutex m_mutex;
long long m_totalworkns;
bool m_havewriteq;
void maybeStartThreads();
#endif // IDX_THREADS
// Indexing
Xapian::WritableDatabase xwdb;
// Querying (active even if the wdb is too)
Xapian::Database xrdb;
Native(Db *db);
~Native();
#ifdef IDX_THREADS
friend void *DbUpdWorker(void*);
#endif // IDX_THREADS
void openWrite(const std::string& dir, Db::OpenMode mode);
void openRead(const string& dir);
// Final steps of doc update, part which need to be single-threaded
bool addOrUpdateWrite(const string& udi, const string& uniterm,
Xapian::Document *doc, size_t txtlen
, const string& rawztext);
/** Delete all documents which are contained in the input document,
* which must be a file-level one.
*
* @param onlyOrphans if true, only delete documents which have
* not the same signature as the input. This is used to delete docs
* which do not exist any more in the file after an update, for
* example the tail messages after a folder truncation). If false,
* delete all.
* @param udi the parent document identifier.
* @param uniterm equivalent to udi, passed just to avoid recomputing.
*/
bool purgeFileWrite(bool onlyOrphans, const string& udi,
const string& uniterm);
bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
int getPageNumberForPosition(const vector<int>& pbreaks, int pos);
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc,
bool fetchtext = false);
size_t whatDbIdx(Xapian::docid id);
/** Retrieve Xapian::docid, given unique document identifier,
* using the posting list for the derived term.
*
* @param udi the unique document identifier (opaque hashed path+ipath).
* @param idxi the database index, at query time, when using external
* databases.
* @param[out] xdoc the xapian document.
* @return 0 if not found
*/
Xapian::docid getDoc(const string& udi, int idxi, Xapian::Document& xdoc);
/** Retrieve unique document identifier for given Xapian document,
* using the document termlist
*/
bool xdocToUdi(Xapian::Document& xdoc, string &udi);
/** Check if doc is indexed by term */
bool hasTerm(const string& udi, int idxi, const string& term);
/** Update existing Xapian document for pure extended attrs change */
bool docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
Doc &doc, Xapian::Document& xdoc);
/** Remove all terms currently indexed for field defined by idx prefix */
bool clearField(Xapian::Document& xdoc, const string& pfx,
Xapian::termcount wdfdec);
/** Check if term wdf is 0 and remove term if so */
bool clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term);
/** Compute list of subdocuments for a given udi. We look for documents
* indexed by a parent term matching the udi, the posting list for the
* parentterm(udi) (As suggested by James Aylett)
*
* Note that this is not currently recursive: all subdocs are supposed
* to be children of the file doc.
* Ie: in a mail folder, all messages, attachments, attachments of
* attached messages etc. must have the folder file document as
* parent.
*
* Finer grain parent-child relationships are defined by the
* indexer (rcldb user), using the ipath.
*
*/
bool subDocs(const string &udi, int idxi, vector<Xapian::docid>& docids);
/** Check if a page position list is defined */
bool hasPages(Xapian::docid id);
std::string rawtextMetaKey(Xapian::docid did) {
// Xapian's Olly Betts avises to use a key which will
// sort the same as the docid (which we do), and to
// use Xapian's pack.h:pack_uint_preserving_sort() which is
// efficient but hard to read. I'd wager that this
// does not make much of a difference. 10 ascii bytes
// gives us 10 billion docs, which is enough (says I).
char buf[30];
sprintf(buf, "%010d", did);
return buf;
}
bool getRawText(Xapian::docid docid, string& rawtext);
void deleteDocument(Xapian::docid docid) {
string metareason;
XAPTRY(xwdb.set_metadata(rawtextMetaKey(docid), string()),
xwdb, metareason);
if (!metareason.empty()) {
LOGERR("deleteDocument: set_metadata error: " <<
metareason << "\n");
// not fatal
}
xwdb.delete_document(docid);
}
};
// This is the word position offset at which we index the body text
// (abstract, keywords, etc.. are stored before this)
static const unsigned int baseTextPosition = 100000;
}
#endif /* _rcldb_p_h_included_ */