|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.5 2004-12-17 15:50:48 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.6 2005-01-24 13:17:58 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
|
4 |
|
5 |
#include <sys/stat.h>
|
5 |
#include <sys/stat.h>
|
6 |
|
6 |
|
7 |
#include <iostream>
|
7 |
#include <iostream>
|
|
... |
|
... |
12 |
|
12 |
|
13 |
#include "rcldb.h"
|
13 |
#include "rcldb.h"
|
14 |
#include "textsplit.h"
|
14 |
#include "textsplit.h"
|
15 |
#include "transcode.h"
|
15 |
#include "transcode.h"
|
16 |
#include "unacpp.h"
|
16 |
#include "unacpp.h"
|
|
|
17 |
#include "conftree.h"
|
17 |
|
18 |
|
18 |
#include "xapian.h"
|
19 |
#include "xapian.h"
|
19 |
|
20 |
|
20 |
// Data for a xapian database
|
21 |
// Data for a xapian database. There could actually be 2 different ones for
|
|
|
22 |
// indexing or query as there is not much in common.
|
21 |
class Native {
|
23 |
class Native {
|
22 |
public:
|
24 |
public:
|
23 |
bool isopen;
|
25 |
bool isopen;
|
24 |
bool iswritable;
|
26 |
bool iswritable;
|
25 |
class Xapian::Database db;
|
27 |
// Indexing
|
26 |
class Xapian::WritableDatabase wdb;
|
28 |
Xapian::WritableDatabase wdb;
|
27 |
vector<bool> updated;
|
29 |
vector<bool> updated;
|
28 |
|
30 |
|
|
|
31 |
// Querying
|
|
|
32 |
Xapian::Database db;
|
|
|
33 |
Xapian::Query query;
|
29 |
Native() : isopen(false), iswritable(false) {}
|
34 |
Native() : isopen(false), iswritable(false) {}
|
30 |
|
35 |
|
31 |
};
|
36 |
};
|
32 |
|
37 |
|
33 |
Rcl::Db::Db()
|
38 |
Rcl::Db::Db()
|
|
... |
|
... |
35 |
pdata = new Native;
|
40 |
pdata = new Native;
|
36 |
}
|
41 |
}
|
37 |
|
42 |
|
38 |
Rcl::Db::~Db()
|
43 |
Rcl::Db::~Db()
|
39 |
{
|
44 |
{
|
|
|
45 |
cerr << "Rcl::Db::~Db" << endl;
|
40 |
if (pdata == 0)
|
46 |
if (pdata == 0)
|
41 |
return;
|
47 |
return;
|
42 |
Native *ndb = (Native *)pdata;
|
48 |
Native *ndb = (Native *)pdata;
|
43 |
cerr << "Db::~Db: isopen " << ndb->isopen << " iswritable " <<
|
49 |
cerr << "Db::~Db: isopen " << ndb->isopen << " iswritable " <<
|
44 |
ndb->iswritable << endl;
|
50 |
ndb->iswritable << endl;
|
45 |
try {
|
51 |
try {
|
46 |
// There is nothing to do for an ro db.
|
52 |
// There is nothing to do for an ro db.
|
47 |
if (ndb->isopen == false || ndb->iswritable == false) {
|
53 |
if (ndb->isopen == false || ndb->iswritable == false) {
|
|
|
54 |
cerr << "Deleting native database" << endl;
|
48 |
delete ndb;
|
55 |
delete ndb;
|
49 |
return;
|
56 |
return;
|
50 |
}
|
57 |
}
|
51 |
ndb->wdb.flush();
|
58 |
ndb->wdb.flush();
|
52 |
delete ndb;
|
59 |
delete ndb;
|
|
... |
|
... |
80 |
ndb->iswritable = true;
|
87 |
ndb->iswritable = true;
|
81 |
break;
|
88 |
break;
|
82 |
case DbRO:
|
89 |
case DbRO:
|
83 |
default:
|
90 |
default:
|
84 |
ndb->iswritable = false;
|
91 |
ndb->iswritable = false;
|
85 |
cerr << "Not ready to open RO yet" << endl;
|
92 |
ndb->db = Xapian::Auto::open(dir, Xapian::DB_OPEN);
|
86 |
exit(1);
|
93 |
break;
|
87 |
}
|
94 |
}
|
88 |
ndb->isopen = true;
|
95 |
ndb->isopen = true;
|
89 |
return true;
|
96 |
return true;
|
90 |
} catch (const Xapian::Error &e) {
|
97 |
} catch (const Xapian::Error &e) {
|
91 |
cout << "Exception: " << e.get_msg() << endl;
|
98 |
cout << "Exception: " << e.get_msg() << endl;
|
|
... |
|
... |
140 |
Xapian::termpos curpos; // Last position sent to callback
|
147 |
Xapian::termpos curpos; // Last position sent to callback
|
141 |
wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
|
148 |
wsData(Xapian::Document &d) : doc(d), basepos(1), curpos(0)
|
142 |
{}
|
149 |
{}
|
143 |
};
|
150 |
};
|
144 |
|
151 |
|
|
|
152 |
// Callback for the document to word splitting class during indexation
|
145 |
bool splitCb(void *cdata, const std::string &term, int pos)
|
153 |
static bool splitCb(void *cdata, const std::string &term, int pos)
|
146 |
{
|
154 |
{
|
147 |
wsData *data = (wsData*)cdata;
|
155 |
wsData *data = (wsData*)cdata;
|
148 |
|
156 |
|
149 |
// cerr << "splitCb: term " << term << endl;
|
157 |
// cerr << "splitCb: term " << term << endl;
|
150 |
//string printable;
|
158 |
//string printable;
|
|
... |
|
... |
170 |
{
|
178 |
{
|
171 |
string inter;
|
179 |
string inter;
|
172 |
out.erase();
|
180 |
out.erase();
|
173 |
if (!unac_cpp(in, inter))
|
181 |
if (!unac_cpp(in, inter))
|
174 |
return false;
|
182 |
return false;
|
175 |
out.resize(inter.length());
|
183 |
out.reserve(inter.length());
|
176 |
for (unsigned int i = 0; i < inter.length(); i++) {
|
184 |
for (unsigned int i = 0; i < inter.length(); i++) {
|
177 |
if (inter[i] >= 'A' && inter[i] <= 'Z')
|
185 |
if (inter[i] >= 'A' && inter[i] <= 'Z')
|
178 |
out += inter[i] + 'a' - 'A';
|
186 |
out += inter[i] + 'a' - 'A';
|
179 |
else
|
187 |
else
|
180 |
out += inter[i];
|
188 |
out += inter[i];
|
|
... |
|
... |
237 |
|
245 |
|
238 |
if (1 /*dupes == DUPE_replace*/) {
|
246 |
if (1 /*dupes == DUPE_replace*/) {
|
239 |
// If this document has already been indexed, update the existing
|
247 |
// If this document has already been indexed, update the existing
|
240 |
// entry.
|
248 |
// entry.
|
241 |
try {
|
249 |
try {
|
242 |
Xapian::docid did = ndb->wdb.replace_document(pathterm,
|
250 |
#if 0
|
243 |
newdocument);
|
251 |
Xapian::docid did =
|
|
|
252 |
#endif
|
|
|
253 |
ndb->wdb.replace_document(pathterm, newdocument);
|
244 |
#if 0
|
254 |
#if 0
|
245 |
if (did < updated.size()) {
|
255 |
if (did < updated.size()) {
|
246 |
updated[did] = true;
|
256 |
updated[did] = true;
|
247 |
//cout << "updated." << endl;
|
257 |
//cout << "updated." << endl;
|
248 |
} else {
|
258 |
} else {
|
|
... |
|
... |
295 |
return true;
|
305 |
return true;
|
296 |
}
|
306 |
}
|
297 |
|
307 |
|
298 |
return true;
|
308 |
return true;
|
299 |
}
|
309 |
}
|
|
|
310 |
|
|
|
311 |
#include <vector>
|
|
|
312 |
|
|
|
313 |
// Accumulator handed to the query word-splitting callback: receives the
// terms extracted from a query string, in the order they are reported.
struct wsQData {
    std::vector<std::string> terms;
};
|
|
|
317 |
|
|
|
318 |
// Callback for the document to word splitting class during indexation
|
|
|
319 |
static bool splitQCb(void *cdata, const std::string &term, int )
|
|
|
320 |
{
|
|
|
321 |
wsQData *data = (wsQData*)cdata;
|
|
|
322 |
|
|
|
323 |
cerr << "splitQCb: term '" << term << "'" << endl;
|
|
|
324 |
cerr << "splitQCb: term length: " << term.length() << endl;
|
|
|
325 |
//string printable;
|
|
|
326 |
//transcode(term, printable, "UTF-8", "ISO8859-1");
|
|
|
327 |
//cerr << "Adding " << printable << endl;
|
|
|
328 |
|
|
|
329 |
data->terms.push_back(term);
|
|
|
330 |
return true;
|
|
|
331 |
}
|
|
|
332 |
|
|
|
333 |
bool Rcl::Db::setQuery(const std::string &querystring)
|
|
|
334 |
{
|
|
|
335 |
wsQData splitData;
|
|
|
336 |
TextSplit splitter(splitQCb, &splitData);
|
|
|
337 |
|
|
|
338 |
string noacc;
|
|
|
339 |
if (!dumb_string(querystring, noacc)) {
|
|
|
340 |
return false;
|
|
|
341 |
}
|
|
|
342 |
// noacc = querystring;
|
|
|
343 |
splitter.text_to_words(noacc);
|
|
|
344 |
|
|
|
345 |
Native *ndb = (Native *)pdata;
|
|
|
346 |
|
|
|
347 |
// splitData.terms.resize(0);
|
|
|
348 |
// splitData.terms.push_back(string("le"));
|
|
|
349 |
ndb->query = Xapian::Query(Xapian::Query::OP_OR, splitData.terms.begin(),
|
|
|
350 |
splitData.terms.end());
|
|
|
351 |
|
|
|
352 |
return true;
|
|
|
353 |
}
|
|
|
354 |
|
|
|
355 |
bool Rcl::Db::getDoc(int i, Doc &doc)
|
|
|
356 |
{
|
|
|
357 |
// cerr << "Rcl::Db::getDoc: " << i << endl;
|
|
|
358 |
Native *ndb = (Native *)pdata;
|
|
|
359 |
|
|
|
360 |
Xapian::Enquire enquire(ndb->db);
|
|
|
361 |
enquire.set_query(ndb->query);
|
|
|
362 |
Xapian::MSet matches = enquire.get_mset(i, 1);
|
|
|
363 |
|
|
|
364 |
// cerr << "Query `" << ndb->query.get_description() << "'" <<
|
|
|
365 |
// "Estimated results: " << matches.get_matches_lower_bound() << endl;
|
|
|
366 |
|
|
|
367 |
if (matches.empty())
|
|
|
368 |
return false;
|
|
|
369 |
|
|
|
370 |
Xapian::Document xdoc = matches.begin().get_document();
|
|
|
371 |
|
|
|
372 |
// Parse xapian document's data and populate doc fields
|
|
|
373 |
string data = xdoc.get_data();
|
|
|
374 |
ConfSimple parms(&data);
|
|
|
375 |
parms.get(string("mtype"), doc.mimetype);
|
|
|
376 |
parms.get(string("mtime"), doc.mtime);
|
|
|
377 |
parms.get(string("url"), doc.url);
|
|
|
378 |
return true;
|
|
|
379 |
}
|