|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.132 2008-05-20 10:09:54 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.133 2008-06-13 18:22:46 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
34 |
using namespace std;
|
34 |
using namespace std;
|
35 |
#endif /* NO_NAMESPACES */
|
35 |
#endif /* NO_NAMESPACES */
|
36 |
|
36 |
|
37 |
#include "rclconfig.h"
|
37 |
#include "rclconfig.h"
|
38 |
#include "rcldb.h"
|
38 |
#include "rcldb.h"
|
|
|
39 |
#include "rcldb_p.h"
|
39 |
#include "stemdb.h"
|
40 |
#include "stemdb.h"
|
40 |
#include "textsplit.h"
|
41 |
#include "textsplit.h"
|
41 |
#include "transcode.h"
|
42 |
#include "transcode.h"
|
42 |
#include "unacpp.h"
|
43 |
#include "unacpp.h"
|
43 |
#include "conftree.h"
|
44 |
#include "conftree.h"
|
|
... |
|
... |
45 |
#include "pathut.h"
|
46 |
#include "pathut.h"
|
46 |
#include "smallut.h"
|
47 |
#include "smallut.h"
|
47 |
#include "pathhash.h"
|
48 |
#include "pathhash.h"
|
48 |
#include "utf8iter.h"
|
49 |
#include "utf8iter.h"
|
49 |
#include "searchdata.h"
|
50 |
#include "searchdata.h"
|
|
|
51 |
#include "rclquery.h"
|
|
|
52 |
#include "rclquery_p.h"
|
50 |
|
53 |
|
51 |
#include "xapian.h"
|
|
|
52 |
|
54 |
|
53 |
#ifndef MAX
|
55 |
#ifndef MAX
|
54 |
#define MAX(A,B) (A>B?A:B)
|
56 |
#define MAX(A,B) (A>B?A:B)
|
55 |
#endif
|
57 |
#endif
|
56 |
#ifndef MIN
|
58 |
#ifndef MIN
|
|
... |
|
... |
86 |
// Synthetic abstract marker (to discriminate from abstract actually
|
88 |
// Synthetic abstract marker (to discriminate from abstract actually
|
87 |
// found in doc)
|
89 |
// found in doc)
|
88 |
const static string rclSyntAbs = "?!#@";
|
90 |
const static string rclSyntAbs = "?!#@";
|
89 |
const static string emptystring;
|
91 |
const static string emptystring;
|
90 |
|
92 |
|
91 |
// A class for data and methods that would have to expose
|
|
|
92 |
// Xapian-specific stuff if they were in Rcl::Db. There could actually be
|
|
|
93 |
// 2 different ones for indexing or query as there is not much in
|
|
|
94 |
// common.
|
|
|
95 |
class Native {
|
|
|
96 |
public:
|
|
|
97 |
Db *m_db;
|
|
|
98 |
bool m_isopen;
|
|
|
99 |
bool m_iswritable;
|
|
|
100 |
|
|
|
101 |
// Indexing
|
|
|
102 |
Xapian::WritableDatabase wdb;
|
|
|
103 |
|
|
|
104 |
// Querying
|
|
|
105 |
Xapian::Database db;
|
|
|
106 |
Xapian::Query query; // query descriptor: terms and subqueries
|
|
|
107 |
// joined by operators (or/and etc...)
|
|
|
108 |
|
|
|
109 |
// Filtering results on location. There are 2 possible approaches
|
|
|
110 |
// for this:
|
|
|
111 |
// - Set a "MatchDecider" to be used by Xapian during the query
|
|
|
112 |
// - Filter the results out of Xapian (this also uses a
|
|
|
113 |
// Xapian::MatchDecider object, but applied to the results by Recoll.
|
|
|
114 |
//
|
|
|
115 |
// The result filtering approach was the first implemented.
|
|
|
116 |
//
|
|
|
117 |
// The efficiency of both methods depend on the searches, so the code
|
|
|
118 |
// for both has been kept. A nice point for the Xapian approach is that
|
|
|
119 |
// the result count estimate are correct (they are wrong with
|
|
|
120 |
// the postfilter approach). It is also faster in some worst case scenarios
|
|
|
121 |
// so this now the default (but the post-filtering is faster in many common
|
|
|
122 |
// cases).
|
|
|
123 |
//
|
|
|
124 |
// Which is used is decided in SetQuery(), by setting either of
|
|
|
125 |
// the two following members. This in turn is controlled by a
|
|
|
126 |
// preprocessor directive.
|
|
|
127 |
|
|
|
128 |
#define XAPIAN_FILTERING 1
|
|
|
129 |
|
|
|
130 |
Xapian::MatchDecider *decider; // Xapian does the filtering
|
|
|
131 |
Xapian::MatchDecider *postfilter; // Result filtering done by Recoll
|
|
|
132 |
|
|
|
133 |
Xapian::Enquire *enquire; // Open query descriptor.
|
|
|
134 |
Xapian::MSet mset; // Partial result set
|
|
|
135 |
|
|
|
136 |
// Term frequencies for current query. See makeAbstract, setQuery
|
|
|
137 |
map<string, double> m_termfreqs;
|
|
|
138 |
|
|
|
139 |
Native(Db *db)
|
|
|
140 |
: m_db(db),
|
|
|
141 |
m_isopen(false), m_iswritable(false), decider(0), postfilter(0),
|
|
|
142 |
enquire(0)
|
|
|
143 |
{ }
|
|
|
144 |
|
|
|
145 |
~Native() {
|
|
|
146 |
delete decider;
|
|
|
147 |
delete postfilter;
|
|
|
148 |
delete enquire;
|
|
|
149 |
}
|
|
|
150 |
|
|
|
151 |
string makeAbstract(Xapian::docid id, const list<string>& terms);
|
|
|
152 |
|
|
|
153 |
bool dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc);
|
|
|
154 |
|
|
|
155 |
/** Compute list of subdocuments for a given path (given by hash)
|
|
|
156 |
* We look for all Q terms beginning with the path/hash
|
|
|
157 |
* As suggested by James Aylett, a better method would be to add
|
|
|
158 |
* a single term (ie: XP/path/to/file) to all subdocs, then finding
|
|
|
159 |
* them would be a simple matter of retrieving the posting list for the
|
|
|
160 |
* term. There would still be a need for the current Qterm though, as a
|
|
|
161 |
* unique term for replace_document, and for retrieving by
|
|
|
162 |
* path/ipath (history)
|
|
|
163 |
*/
|
|
|
164 |
bool subDocs(const string &hash, vector<Xapian::docid>& docids);
|
|
|
165 |
|
|
|
166 |
};
|
|
|
167 |
|
|
|
168 |
class FilterMatcher : public Xapian::MatchDecider {
|
|
|
169 |
public:
|
|
|
170 |
FilterMatcher(const string &topdir)
|
|
|
171 |
: m_topdir(topdir)
|
|
|
172 |
{}
|
|
|
173 |
virtual ~FilterMatcher() {}
|
|
|
174 |
|
|
|
175 |
virtual
|
|
|
176 |
#if XAPIAN_MAJOR_VERSION < 1
|
|
|
177 |
int
|
|
|
178 |
#else
|
|
|
179 |
bool
|
|
|
180 |
#endif
|
|
|
181 |
operator()(const Xapian::Document &xdoc) const
|
|
|
182 |
{
|
|
|
183 |
m_cnt++;
|
|
|
184 |
// Parse xapian document's data and populate doc fields
|
|
|
185 |
string data = xdoc.get_data();
|
|
|
186 |
ConfSimple parms(&data);
|
|
|
187 |
|
|
|
188 |
// The only filtering for now is on file path (subtree)
|
|
|
189 |
string url;
|
|
|
190 |
parms.get(string("url"), url);
|
|
|
191 |
LOGDEB2(("FilterMatcher topdir [%s] url [%s]\n",
|
|
|
192 |
m_topdir.c_str(), url.c_str()));
|
|
|
193 |
if (url.find(m_topdir, 7) == 7) {
|
|
|
194 |
LOGDEB2(("FilterMatcher: MATCH %d\n", m_cnt));
|
|
|
195 |
return true;
|
|
|
196 |
} else {
|
|
|
197 |
LOGDEB2(("FilterMatcher: NO MATCH %d\n", m_cnt));
|
|
|
198 |
return false;
|
|
|
199 |
}
|
|
|
200 |
}
|
|
|
201 |
static int m_cnt;
|
|
|
202 |
|
|
|
203 |
private:
|
|
|
204 |
string m_topdir;
|
|
|
205 |
};
|
|
|
206 |
int FilterMatcher::m_cnt;
|
|
|
207 |
|
|
|
208 |
/* See comment in class declaration */
|
93 |
/* See comment in class declaration */
|
209 |
bool Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
|
94 |
bool Db::Native::subDocs(const string &hash, vector<Xapian::docid>& docids)
|
210 |
{
|
95 |
{
|
211 |
docids.clear();
|
96 |
docids.clear();
|
212 |
string qterm = "Q"+ hash + "|";
|
97 |
string qterm = "Q"+ hash + "|";
|
213 |
string ermsg;
|
98 |
string ermsg;
|
214 |
|
99 |
|
|
... |
|
... |
248 |
LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str()));
|
133 |
LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str()));
|
249 |
return false;
|
134 |
return false;
|
250 |
}
|
135 |
}
|
251 |
|
136 |
|
252 |
// Turn data record from db into document fields
|
137 |
// Turn data record from db into document fields
|
253 |
bool Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
|
138 |
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, Doc &doc)
|
254 |
{
|
139 |
{
|
255 |
LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str()));
|
140 |
LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str()));
|
256 |
ConfSimple parms(&data);
|
141 |
ConfSimple parms(&data);
|
257 |
if (!parms.ok())
|
142 |
if (!parms.ok())
|
258 |
return false;
|
143 |
return false;
|
|
... |
|
... |
304 |
#define LOGABS LOGDEB2
|
189 |
#define LOGABS LOGDEB2
|
305 |
#endif
|
190 |
#endif
|
306 |
|
191 |
|
307 |
// Build a document abstract by extracting text chunks around the query terms
|
192 |
// Build a document abstract by extracting text chunks around the query terms
|
308 |
// This uses the db termlists, not the original document.
|
193 |
// This uses the db termlists, not the original document.
|
309 |
string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
|
194 |
string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
310 |
{
|
195 |
{
|
311 |
Chrono chron;
|
196 |
Chrono chron;
|
312 |
LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
197 |
LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
313 |
m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
|
198 |
m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
|
314 |
|
199 |
|
|
|
200 |
list<string> iterms;
|
|
|
201 |
query->getQueryTerms(iterms);
|
|
|
202 |
|
315 |
list<string> terms = noPrefixList(iterms);
|
203 |
list<string> terms = noPrefixList(iterms);
|
316 |
if (terms.empty()) {
|
204 |
if (terms.empty()) {
|
317 |
return "";
|
205 |
return "";
|
318 |
}
|
206 |
}
|
319 |
|
207 |
|
320 |
// Retrieve db-wide frequencies for the query terms
|
208 |
// Retrieve db-wide frequencies for the query terms
|
321 |
if (m_termfreqs.empty()) {
|
209 |
if (query->m_nq->termfreqs.empty()) {
|
322 |
double doccnt = db.get_doccount();
|
210 |
double doccnt = db.get_doccount();
|
323 |
if (doccnt == 0) doccnt = 1;
|
211 |
if (doccnt == 0) doccnt = 1;
|
324 |
for (list<string>::const_iterator qit = terms.begin();
|
212 |
for (list<string>::const_iterator qit = terms.begin();
|
325 |
qit != terms.end(); qit++) {
|
213 |
qit != terms.end(); qit++) {
|
326 |
m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
|
214 |
query->m_nq->termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
|
327 |
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
|
215 |
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
|
328 |
m_termfreqs[*qit]));
|
216 |
query->m_nq->termfreqs[*qit]));
|
329 |
}
|
217 |
}
|
330 |
LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
|
218 |
LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
|
331 |
}
|
219 |
}
|
332 |
|
220 |
|
333 |
// Compute a term quality coefficient by retrieving the term
|
221 |
// Compute a term quality coefficient by retrieving the term
|
|
... |
|
... |
341 |
for (list<string>::const_iterator qit = terms.begin();
|
229 |
for (list<string>::const_iterator qit = terms.begin();
|
342 |
qit != terms.end(); qit++) {
|
230 |
qit != terms.end(); qit++) {
|
343 |
Xapian::TermIterator term = db.termlist_begin(docid);
|
231 |
Xapian::TermIterator term = db.termlist_begin(docid);
|
344 |
term.skip_to(*qit);
|
232 |
term.skip_to(*qit);
|
345 |
if (term != db.termlist_end(docid) && *term == *qit) {
|
233 |
if (term != db.termlist_end(docid) && *term == *qit) {
|
346 |
double q = (term.get_wdf() / doclen) * m_termfreqs[*qit];
|
234 |
double q = (term.get_wdf() / doclen) * query->m_nq->termfreqs[*qit];
|
347 |
q = -log10(q);
|
235 |
q = -log10(q);
|
348 |
if (q < 3) {
|
236 |
if (q < 3) {
|
349 |
q = 0.05;
|
237 |
q = 0.05;
|
350 |
} else if (q < 4) {
|
238 |
} else if (q < 4) {
|
351 |
q = 0.3;
|
239 |
q = 0.3;
|
|
... |
|
... |
554 |
}
|
442 |
}
|
555 |
|
443 |
|
556 |
/* Rcl::Db methods ///////////////////////////////// */
|
444 |
/* Rcl::Db methods ///////////////////////////////// */
|
557 |
|
445 |
|
558 |
Db::Db()
|
446 |
Db::Db()
|
559 |
: m_ndb(0), m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250),
|
447 |
: m_ndb(0), m_idxAbsTruncLen(250), m_synthAbsLen(250),
|
560 |
m_synthAbsWordCtxLen(4), m_flushMb(-1),
|
448 |
m_synthAbsWordCtxLen(4), m_flushMb(-1),
|
561 |
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0),
|
449 |
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0),
|
562 |
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
450 |
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
563 |
{
|
451 |
{
|
564 |
m_ndb = new Native(this);
|
452 |
m_ndb = new Native(this);
|
|
... |
|
... |
584 |
list<string> res;
|
472 |
list<string> res;
|
585 |
stringToStrings(Xapian::Stem::get_available_languages(), res);
|
473 |
stringToStrings(Xapian::Stem::get_available_languages(), res);
|
586 |
return res;
|
474 |
return res;
|
587 |
}
|
475 |
}
|
588 |
|
476 |
|
589 |
// Generic Xapian exception catching code. We do this quite often,
// and I have no idea how to do this except for a macro.
//
// Usage: try { ... } XCATCHERROR(msg);
// MSG must be a std::string lvalue. It is left untouched when no exception
// is thrown, and is always set to a non-empty text when one is caught, so
// that callers can test MSG.empty() to detect failure.
// A null 'const char *' exception value is treated as an empty message
// (assigning a null pointer to a std::string is undefined behaviour).
#define XCATCHERROR(MSG)                            \
    catch (const Xapian::Error &e) {                \
        MSG = e.get_msg();                          \
        if (MSG.empty()) MSG = "Empty error message"; \
    } catch (const string &s) {                     \
        MSG = s;                                    \
        if (MSG.empty()) MSG = "Empty error message"; \
    } catch (const char *s) {                       \
        MSG = s ? s : "";                           \
        if (MSG.empty()) MSG = "Empty error message"; \
    } catch (...) {                                 \
        MSG = "Caught unknown xapian exception";    \
    }
|
|
|
604 |
|
|
|
605 |
|
|
|
606 |
bool Db::open(const string& dir, const string &stops, OpenMode mode, int qops)
|
477 |
bool Db::open(const string& dir, const string &stops, OpenMode mode,
|
|
|
478 |
bool keep_updated)
|
607 |
{
|
479 |
{
|
608 |
bool keep_updated = (qops & QO_KEEP_UPDATED) != 0;
|
|
|
609 |
qops &= ~QO_KEEP_UPDATED;
|
|
|
610 |
|
|
|
611 |
if (m_ndb == 0)
|
480 |
if (m_ndb == 0)
|
612 |
return false;
|
481 |
return false;
|
613 |
LOGDEB(("Db::open: m_isopen %d m_iswritable %d\n", m_ndb->m_isopen,
|
482 |
LOGDEB(("Db::open: m_isopen %d m_iswritable %d\n", m_ndb->m_isopen,
|
614 |
m_ndb->m_iswritable));
|
483 |
m_ndb->m_iswritable));
|
615 |
|
484 |
|
|
... |
|
... |
722 |
bool Db::reOpen()
|
591 |
bool Db::reOpen()
|
723 |
{
|
592 |
{
|
724 |
if (m_ndb && m_ndb->m_isopen) {
|
593 |
if (m_ndb && m_ndb->m_isopen) {
|
725 |
if (!close())
|
594 |
if (!close())
|
726 |
return false;
|
595 |
return false;
|
727 |
if (!open(m_basedir, "", m_mode, m_qOpts | QO_KEEP_UPDATED)) {
|
596 |
if (!open(m_basedir, "", m_mode, true)) {
|
728 |
return false;
|
597 |
return false;
|
729 |
}
|
598 |
}
|
730 |
}
|
599 |
}
|
731 |
return true;
|
600 |
return true;
|
732 |
}
|
601 |
}
|
|
... |
|
... |
1465 |
names.push_back("XIMPOSSIBLE");
|
1334 |
names.push_back("XIMPOSSIBLE");
|
1466 |
}
|
1335 |
}
|
1467 |
return true;
|
1336 |
return true;
|
1468 |
}
|
1337 |
}
|
1469 |
|
1338 |
|
1470 |
// Prepare query out of user search data
|
|
|
1471 |
bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
|
|
|
1472 |
const string& stemlang)
|
|
|
1473 |
{
|
|
|
1474 |
if (!m_ndb) {
|
|
|
1475 |
LOGERR(("Db::setQuery: no db!\n"));
|
|
|
1476 |
return false;
|
|
|
1477 |
}
|
|
|
1478 |
m_reason.erase();
|
|
|
1479 |
LOGDEB(("Db::setQuery:\n"));
|
|
|
1480 |
|
|
|
1481 |
m_filterTopDir = sdata->getTopdir();
|
|
|
1482 |
deleteZ(m_ndb->decider);
|
|
|
1483 |
deleteZ(m_ndb->postfilter);
|
|
|
1484 |
if (!m_filterTopDir.empty()) {
|
|
|
1485 |
#if XAPIAN_FILTERING
|
|
|
1486 |
m_ndb->decider =
|
|
|
1487 |
#else
|
|
|
1488 |
m_ndb->postfilter =
|
|
|
1489 |
#endif
|
|
|
1490 |
new FilterMatcher(m_filterTopDir);
|
|
|
1491 |
}
|
|
|
1492 |
m_dbindices.clear();
|
|
|
1493 |
m_qOpts = opts;
|
|
|
1494 |
m_ndb->m_termfreqs.clear();
|
|
|
1495 |
FilterMatcher::m_cnt = 0;
|
|
|
1496 |
Xapian::Query xq;
|
|
|
1497 |
if (!sdata->toNativeQuery(*this, &xq,
|
|
|
1498 |
(opts & Db::QO_STEM) ? stemlang : "")) {
|
|
|
1499 |
m_reason += sdata->getReason();
|
|
|
1500 |
return false;
|
|
|
1501 |
}
|
|
|
1502 |
m_ndb->query = xq;
|
|
|
1503 |
string ermsg;
|
|
|
1504 |
string d;
|
|
|
1505 |
try {
|
|
|
1506 |
delete m_ndb->enquire;
|
|
|
1507 |
m_ndb->enquire = new Xapian::Enquire(m_ndb->db);
|
|
|
1508 |
m_ndb->enquire->set_query(m_ndb->query);
|
|
|
1509 |
m_ndb->mset = Xapian::MSet();
|
|
|
1510 |
// Get the query description and trim the "Xapian::Query"
|
|
|
1511 |
d = m_ndb->query.get_description();
|
|
|
1512 |
} XCATCHERROR(ermsg);
|
|
|
1513 |
if (!ermsg.empty()) {
|
|
|
1514 |
LOGDEB(("Db::SetQuery: xapian error %s\n", ermsg.c_str()));
|
|
|
1515 |
return false;
|
|
|
1516 |
}
|
|
|
1517 |
|
|
|
1518 |
if (d.find("Xapian::Query") == 0)
|
|
|
1519 |
d.erase(0, strlen("Xapian::Query"));
|
|
|
1520 |
if (!m_filterTopDir.empty()) {
|
|
|
1521 |
d += string(" [dir: ") + m_filterTopDir + "]";
|
|
|
1522 |
}
|
|
|
1523 |
sdata->setDescription(d);
|
|
|
1524 |
LOGDEB(("Db::SetQuery: Q: %s\n", sdata->getDescription().c_str()));
|
|
|
1525 |
return true;
|
|
|
1526 |
}
|
|
|
1527 |
|
|
|
1528 |
class TermMatchCmpByWcf {
|
1339 |
class TermMatchCmpByWcf {
|
1529 |
public:
|
1340 |
public:
|
1530 |
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
1341 |
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
1531 |
return r.wcf - l.wcf < 0;
|
1342 |
return r.wcf - l.wcf < 0;
|
1532 |
}
|
1343 |
}
|
|
... |
|
... |
1733 |
return false;
|
1544 |
return false;
|
1734 |
}
|
1545 |
}
|
1735 |
return true;
|
1546 |
return true;
|
1736 |
}
|
1547 |
}
|
1737 |
|
1548 |
|
1738 |
// Return the list of terms used by the current query.
// The output list is emptied first. Returns false if there is no native
// db object or if Xapian reports an error while walking the terms.
bool Db::getQueryTerms(list<string>& terms)
{
    if (m_ndb == 0)
        return false;

    terms.clear();
    string xerr;
    try {
        Xapian::TermIterator cur = m_ndb->query.get_terms_begin();
        while (cur != m_ndb->query.get_terms_end()) {
            terms.push_back(*cur);
            cur++;
        }
    } XCATCHERROR(xerr);

    if (xerr.empty())
        return true;

    LOGERR(("getQueryTerms: xapian error: %s\n", xerr.c_str()));
    return false;
}
|
|
|
1758 |
|
|
|
1759 |
bool Db::getMatchTerms(const Doc& doc, list<string>& terms)
|
|
|
1760 |
{
|
|
|
1761 |
if (!m_ndb || !m_ndb->enquire) {
|
|
|
1762 |
LOGERR(("Db::getMatchTerms: no query opened\n"));
|
|
|
1763 |
return -1;
|
|
|
1764 |
}
|
|
|
1765 |
|
|
|
1766 |
terms.clear();
|
|
|
1767 |
Xapian::TermIterator it;
|
|
|
1768 |
Xapian::docid id = Xapian::docid(doc.xdocid);
|
|
|
1769 |
string ermsg;
|
|
|
1770 |
try {
|
|
|
1771 |
for (it=m_ndb->enquire->get_matching_terms_begin(id);
|
|
|
1772 |
it != m_ndb->enquire->get_matching_terms_end(id); it++) {
|
|
|
1773 |
terms.push_back(*it);
|
|
|
1774 |
}
|
|
|
1775 |
} XCATCHERROR(ermsg);
|
|
|
1776 |
if (!ermsg.empty()) {
|
|
|
1777 |
LOGERR(("getQueryTerms: xapian error: %s\n", ermsg.c_str()));
|
|
|
1778 |
return false;
|
|
|
1779 |
}
|
|
|
1780 |
|
|
|
1781 |
return true;
|
|
|
1782 |
}
|
|
|
1783 |
|
|
|
1784 |
// Mset size
|
|
|
1785 |
static const int qquantum = 30;
|
|
|
1786 |
|
|
|
1787 |
int Db::getResCnt()
|
|
|
1788 |
{
|
|
|
1789 |
if (!m_ndb || !m_ndb->enquire) {
|
|
|
1790 |
LOGERR(("Db::getResCnt: no query opened\n"));
|
|
|
1791 |
return -1;
|
|
|
1792 |
}
|
|
|
1793 |
string ermsg;
|
|
|
1794 |
if (m_ndb->mset.size() <= 0) {
|
|
|
1795 |
try {
|
|
|
1796 |
m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum,
|
|
|
1797 |
0, m_ndb->decider);
|
|
|
1798 |
} catch (const Xapian::DatabaseModifiedError &error) {
|
|
|
1799 |
m_ndb->db.reopen();
|
|
|
1800 |
m_ndb->mset = m_ndb->enquire->get_mset(0, qquantum,
|
|
|
1801 |
0, m_ndb->decider);
|
|
|
1802 |
} XCATCHERROR(ermsg);
|
|
|
1803 |
if (!ermsg.empty()) {
|
|
|
1804 |
LOGERR(("enquire->get_mset: exception: %s\n", ermsg.c_str()));
|
|
|
1805 |
return -1;
|
|
|
1806 |
}
|
|
|
1807 |
}
|
|
|
1808 |
int ret = -1;
|
|
|
1809 |
try {
|
|
|
1810 |
ret = m_ndb->mset.get_matches_lower_bound();
|
|
|
1811 |
} catch (...) {}
|
|
|
1812 |
return ret;
|
|
|
1813 |
}
|
|
|
1814 |
|
|
|
1815 |
|
|
|
1816 |
// Get document at rank i in query (i is the index in the whole result
|
|
|
1817 |
// set, as in the enquire class. We check if the current mset has the
|
|
|
1818 |
// doc, else ask for an other one. We use msets of 10 documents. Don't
|
|
|
1819 |
// know if the whole thing makes sense at all but it seems to work.
|
|
|
1820 |
//
|
|
|
1821 |
// If there is a postquery filter (ie: file names), we have to
|
|
|
1822 |
// maintain a correspondance from the sequential external index
|
|
|
1823 |
// sequence to the internal Xapian hole-y one (the holes being the documents
|
|
|
1824 |
// that dont match the filter).
|
|
|
1825 |
bool Db::getDoc(int exti, Doc &doc, int *percent)
|
|
|
1826 |
{
|
|
|
1827 |
LOGDEB1(("Db::getDoc: exti %d\n", exti));
|
|
|
1828 |
if (!m_ndb || !m_ndb->enquire) {
|
|
|
1829 |
LOGERR(("Db::getDoc: no query opened\n"));
|
|
|
1830 |
return false;
|
|
|
1831 |
}
|
|
|
1832 |
|
|
|
1833 |
int xapi;
|
|
|
1834 |
if (m_ndb->postfilter) {
|
|
|
1835 |
// There is a postquery filter, does this fall in already known area ?
|
|
|
1836 |
if (exti >= (int)m_dbindices.size()) {
|
|
|
1837 |
// Have to fetch xapian docs and filter until we get
|
|
|
1838 |
// enough or fail
|
|
|
1839 |
m_dbindices.reserve(exti+1);
|
|
|
1840 |
// First xapian doc we fetch is the one after last stored
|
|
|
1841 |
int first = m_dbindices.size() > 0 ? m_dbindices.back() + 1 : 0;
|
|
|
1842 |
// Loop until we get enough docs
|
|
|
1843 |
while (exti >= (int)m_dbindices.size()) {
|
|
|
1844 |
LOGDEB(("Db::getDoc: fetching %d starting at %d\n",
|
|
|
1845 |
qquantum, first));
|
|
|
1846 |
try {
|
|
|
1847 |
m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum);
|
|
|
1848 |
} catch (const Xapian::DatabaseModifiedError &error) {
|
|
|
1849 |
m_ndb->db.reopen();
|
|
|
1850 |
m_ndb->mset = m_ndb->enquire->get_mset(first, qquantum);
|
|
|
1851 |
} catch (const Xapian::Error & error) {
|
|
|
1852 |
LOGERR(("enquire->get_mset: exception: %s\n",
|
|
|
1853 |
error.get_msg().c_str()));
|
|
|
1854 |
abort();
|
|
|
1855 |
}
|
|
|
1856 |
|
|
|
1857 |
if (m_ndb->mset.empty()) {
|
|
|
1858 |
LOGDEB(("Db::getDoc: got empty mset\n"));
|
|
|
1859 |
return false;
|
|
|
1860 |
}
|
|
|
1861 |
first = m_ndb->mset.get_firstitem();
|
|
|
1862 |
for (unsigned int i = 0; i < m_ndb->mset.size() ; i++) {
|
|
|
1863 |
LOGDEB(("Db::getDoc: [%d]\n", i));
|
|
|
1864 |
Xapian::Document xdoc = m_ndb->mset[i].get_document();
|
|
|
1865 |
if ((*m_ndb->postfilter)(xdoc)) {
|
|
|
1866 |
m_dbindices.push_back(first + i);
|
|
|
1867 |
}
|
|
|
1868 |
}
|
|
|
1869 |
first = first + m_ndb->mset.size();
|
|
|
1870 |
}
|
|
|
1871 |
}
|
|
|
1872 |
xapi = m_dbindices[exti];
|
|
|
1873 |
} else {
|
|
|
1874 |
xapi = exti;
|
|
|
1875 |
}
|
|
|
1876 |
|
|
|
1877 |
// From there on, we work with a xapian enquire item number. Fetch it
|
|
|
1878 |
int first = m_ndb->mset.get_firstitem();
|
|
|
1879 |
int last = first + m_ndb->mset.size() -1;
|
|
|
1880 |
|
|
|
1881 |
if (!(xapi >= first && xapi <= last)) {
|
|
|
1882 |
LOGDEB(("Fetching for first %d, count %d\n", xapi, qquantum));
|
|
|
1883 |
try {
|
|
|
1884 |
m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum,
|
|
|
1885 |
0, m_ndb->decider);
|
|
|
1886 |
} catch (const Xapian::DatabaseModifiedError &error) {
|
|
|
1887 |
m_ndb->db.reopen();
|
|
|
1888 |
m_ndb->mset = m_ndb->enquire->get_mset(xapi, qquantum,
|
|
|
1889 |
0, m_ndb->decider);
|
|
|
1890 |
|
|
|
1891 |
} catch (const Xapian::Error & error) {
|
|
|
1892 |
LOGERR(("enquire->get_mset: exception: %s\n",
|
|
|
1893 |
error.get_msg().c_str()));
|
|
|
1894 |
abort();
|
|
|
1895 |
}
|
|
|
1896 |
if (m_ndb->mset.empty())
|
|
|
1897 |
return false;
|
|
|
1898 |
first = m_ndb->mset.get_firstitem();
|
|
|
1899 |
last = first + m_ndb->mset.size() -1;
|
|
|
1900 |
}
|
|
|
1901 |
|
|
|
1902 |
LOGDEB1(("Db::getDoc: Qry [%s] win [%d-%d] Estimated results: %d",
|
|
|
1903 |
m_ndb->query.get_description().c_str(),
|
|
|
1904 |
first, last,
|
|
|
1905 |
m_ndb->mset.get_matches_lower_bound()));
|
|
|
1906 |
|
|
|
1907 |
Xapian::Document xdoc = m_ndb->mset[xapi-first].get_document();
|
|
|
1908 |
Xapian::docid docid = *(m_ndb->mset[xapi-first]);
|
|
|
1909 |
if (percent)
|
|
|
1910 |
*percent = m_ndb->mset.convert_to_percent(m_ndb->mset[xapi-first]);
|
|
|
1911 |
|
|
|
1912 |
// Parse xapian document's data and populate doc fields
|
|
|
1913 |
string data = xdoc.get_data();
|
|
|
1914 |
return m_ndb->dbDataToRclDoc(docid, data, doc);
|
|
|
1915 |
}
|
|
|
1916 |
|
|
|
1917 |
bool Db::makeDocAbstract(Doc &doc, string& abstract)
|
1550 |
bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
|
1918 |
{
|
1551 |
{
|
1919 |
LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
|
1552 |
LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
|
1920 |
if (!m_ndb || !m_ndb->enquire) {
|
1553 |
if (!m_ndb) {
|
1921 |
LOGERR(("Db::makeDocAbstract: no query opened\n"));
|
1554 |
LOGERR(("Db::makeDocAbstract: no db\n"));
|
1922 |
return false;
|
1555 |
return false;
|
1923 |
}
|
1556 |
}
|
1924 |
list<string> terms;
|
|
|
1925 |
getQueryTerms(terms);
|
|
|
1926 |
abstract = m_ndb->makeAbstract(doc.xdocid, terms);
|
1557 |
abstract = m_ndb->makeAbstract(doc.xdocid, query);
|
1927 |
return true;
|
1558 |
return true;
|
1928 |
}
|
1559 |
}
|
1929 |
|
1560 |
|
1930 |
// Retrieve document defined by file name and internal path.
|
1561 |
// Retrieve document defined by file name and internal path.
|
1931 |
bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
|
1562 |
bool Db::getDoc(const string &fn, const string &ipath, Doc &doc, int *pc)
|
|
... |
|
... |
1967 |
LOGERR(("Db::getDoc: %s\n", ermsg.c_str()));
|
1598 |
LOGERR(("Db::getDoc: %s\n", ermsg.c_str()));
|
1968 |
}
|
1599 |
}
|
1969 |
return false;
|
1600 |
return false;
|
1970 |
}
|
1601 |
}
|
1971 |
|
1602 |
|
1972 |
// Query expansion: return up to 10 terms suggested by Xapian from the
// given document, taken as the relevant set for the current query.
// Terms beginning with an upper-case ASCII letter (the special/prefixed
// terms) are left out.
// Returns an empty list if no query is open or on error.
list<string> Db::expand(const Doc &doc)
{
    list<string> res;
    if (!m_ndb || !m_ndb->enquire) {
        LOGERR(("Db::expand: no query opened\n"));
        return res;
    }
    string ermsg;
    // Two attempts: if the index is modified under us, reopen the db and
    // start again from scratch.
    for (int tries = 0; tries < 2; tries++) {
        ermsg.erase();
        // Drop what a failed first attempt may have stored, else the
        // retry would return duplicates
        res.clear();
        try {
            if (tries > 0) {
                m_ndb->db.reopen();
            }
            Xapian::RSet rset;
            rset.add_document(Xapian::docid(doc.xdocid));
            // We don't exclude the original query terms.
            Xapian::ESet eset = m_ndb->enquire->get_eset(20, rset, false);
            LOGDEB(("ESet terms:\n"));
            // We filter out the special terms
            for (Xapian::ESetIterator it = eset.begin();
                 it != eset.end(); it++) {
                LOGDEB((" [%s]\n", (*it).c_str()));
                if ((*it).empty() || ((*it).at(0)>='A' && (*it).at(0)<='Z'))
                    continue;
                res.push_back(*it);
                if (res.size() >= 10)
                    break;
            }
        } catch (const Xapian::DatabaseModifiedError &e) {
            // Remember the error: if the second attempt fails the same
            // way, it must be reported instead of being silently dropped.
            ermsg = e.get_msg();
            if (ermsg.empty())
                ermsg = "Database modified";
            continue;
        } XCATCHERROR(ermsg);
        break;
    }

    if (!ermsg.empty()) {
        LOGERR(("Db::expand: xapian error %s\n", ermsg.c_str()));
        res.clear();
    }
    return res;
}
|
|
|
2009 |
|
|
|
2010 |
|
|
|
2011 |
#ifndef NO_NAMESPACES
|
1603 |
#ifndef NO_NAMESPACES
|
2012 |
}
|
1604 |
}
|
2013 |
#endif
|
1605 |
#endif
|