Child: [e6402e] (diff)

Download this file

rcldb_p.h    161 lines (138 with data), 5.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
/* Copyright (C) 2007 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _rcldb_p_h_included_
#define _rcldb_p_h_included_
#include "autoconfig.h"
#include <map>
#include <xapian.h>
#ifdef IDX_THREADS
#include "workqueue.h"
#endif // IDX_THREADS
#include "debuglog.h"
#include "xmacros.h"
#include "ptmutex.h"
namespace Rcl {
class Query;
#ifdef IDX_THREADS
// Task for the index update thread. This can be
// - add/update for a new / update documment
// - delete for a deleted document
// - purgeOrphans when a multidoc file is updated during a partial pass (no
// general purge). We want to remove subDocs that possibly don't
// exist anymore. We find them by their different sig
// txtlen and doc are only valid for add/update else, len is (size_t)-1 and doc
// is empty
class DbUpdTask {
public:
enum Op {AddOrUpdate, Delete, PurgeOrphans};
// Note that udi and uniterm are strictly equivalent and are
// passed both just to avoid recomputing uniterm which is
// available on the caller site.
DbUpdTask(Op _op, const string& ud, const string& un,
const Xapian::Document &d, size_t tl)
: op(_op), udi(ud), uniterm(un), doc(d), txtlen(tl)
{}
// Udi and uniterm equivalently designate the doc
Op op;
string udi;
string uniterm;
Xapian::Document doc;
// txtlen is used to update the flush interval. It's -1 for a
// purge because we actually don't know it, and the code fakes a
// text length based on the term count.
size_t txtlen;
};
#endif // IDX_THREADS
// A class for data and methods that would have to expose
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
// 2 different ones for indexing or query as there is not much in
// common.
class Db::Native {
public:
Db *m_rcldb; // Parent
bool m_isopen;
bool m_iswritable;
bool m_noversionwrite; //Set if open failed because of version mismatch!
#ifdef IDX_THREADS
WorkQueue<DbUpdTask*> m_wqueue;
int m_loglevel;
PTMutexInit m_mutex;
long long m_totalworkns;
bool m_havewriteq;
void maybeStartThreads();
#endif // IDX_THREADS
// Indexing
Xapian::WritableDatabase xwdb;
// Querying (active even if the wdb is too)
Xapian::Database xrdb;
Native(Db *db);
~Native();
#ifdef IDX_THREADS
friend void *DbUpdWorker(void*);
#endif // IDX_THREADS
// Final steps of doc update, part which need to be single-threaded
bool addOrUpdateWrite(const string& udi, const string& uniterm,
Xapian::Document& doc, size_t txtlen);
/** Delete all documents which are contained in the input document,
* which must be a file-level one.
*
* @param onlyOrphans if true, only delete documents which have
* not the same signature as the input. This is used to delete docs
* which do not exist any more in the file after an update, for
* example the tail messages after a folder truncation). If false,
* delete all.
* @param udi the parent document identifier.
* @param uniterm equivalent to udi, passed just to avoid recomputing.
*/
bool purgeFileWrite(bool onlyOrphans, const string& udi,
const string& uniterm);
bool getPagePositions(Xapian::docid docid, vector<int>& vpos);
int getPageNumberForPosition(const vector<int>& pbreaks, unsigned int pos);
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
/** Retrieve Xapian::docid, given unique document identifier,
* using the posting list for the derived term.
*
* @return 0 if not found
*/
Xapian::docid getDoc(const string& udi, Xapian::Document& xdoc);
/** Retrieve unique document identifier for given Xapian document,
* using the document termlist
*/
bool xdocToUdi(Xapian::Document& xdoc, string &udi);
/** Check if doc is indexed by term */
bool hasTerm(const string& udi, const string& term);
/** Compute list of subdocuments for a given udi. We look for documents
* indexed by a parent term matching the udi, the posting list for the
* parentterm(udi) (As suggested by James Aylett)
*
* Note that this is not currently recursive: all subdocs are supposed
* to be children of the file doc.
* Ie: in a mail folder, all messages, attachments, attachments of
* attached messages etc. must have the folder file document as
* parent.
*
* Finer grain parent-child relationships are defined by the
* indexer (rcldb user), using the ipath.
*
*/
bool subDocs(const string &udi, vector<Xapian::docid>& docids);
};
// This is the word position offset at which we index the body text
// (abstract, keywords, etc.. are stored before this)
static const unsigned int baseTextPosition = 100000;
}
#endif /* _rcldb_p_h_included_ */