|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.75 2006-05-09 10:15:14 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.76 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
54 |
// Smaller of A and B. Both arguments are fully parenthesized so that
// expressions using operators binding looser than '<' (|, &, ?:, ...) expand
// correctly. Each argument may be evaluated twice: avoid side effects.
#define MIN(A,B) ((A)<(B)?(A):(B))
|
54 |
// Smaller of A and B. Both arguments are fully parenthesized so that
// expressions using operators binding looser than '<' (|, &, ?:, ...) expand
// correctly. Each argument may be evaluated twice: avoid side effects.
#define MIN(A,B) ((A)<(B)?(A):(B))
|
55 |
#endif
|
55 |
#endif
|
56 |
#ifndef NO_NAMESPACES
|
56 |
#ifndef NO_NAMESPACES
|
57 |
namespace Rcl {
|
57 |
namespace Rcl {
|
58 |
#endif
|
58 |
#endif
|
59 |
// This is how long an abstract we keep or build from beginning of text when
|
|
|
60 |
// indexing. It only has an influence on the size of the db as we are free
|
|
|
61 |
// to shorten it again when displaying
|
|
|
62 |
#define INDEX_ABSTRACT_SIZE 250
|
|
|
63 |
|
|
|
64 |
// This is the size of the abstract that we synthesize out of query
|
|
|
65 |
// term contexts at query time
|
|
|
66 |
#define MA_ABSTRACT_SIZE 250
|
|
|
67 |
// This is how many words (context size) we keep around query terms
|
|
|
68 |
// when building the abstract
|
|
|
69 |
#define MA_EXTRACT_WIDTH 4
|
|
|
70 |
|
59 |
|
71 |
// Truncate longer paths and uniquize them with a hash. The goal of this is
|
60 |
// Truncate longer paths and uniquize them with a hash. The goal of this is
|
72 |
// to avoid xapian max term length limitations, not to gain space (we
|
61 |
// to avoid xapian max term length limitations, not to gain space (we
|
73 |
// gain very little even with very short maxlens like 30)
|
62 |
// gain very little even with very short maxlens like 30)
|
74 |
#define PATHHASHLEN 150
|
63 |
#define PATHHASHLEN 150
|
|
... |
|
... |
79 |
|
68 |
|
80 |
// Data for a xapian database. There could actually be 2 different
|
69 |
// Data for a xapian database. There could actually be 2 different
|
81 |
// ones for indexing or query as there is not much in common.
|
70 |
// ones for indexing or query as there is not much in common.
|
82 |
class Native {
|
71 |
class Native {
|
83 |
public:
|
72 |
public:
|
|
|
73 |
Db *m_db;
|
84 |
bool m_isopen;
|
74 |
bool m_isopen;
|
85 |
bool m_iswritable;
|
75 |
bool m_iswritable;
|
86 |
Db::OpenMode m_mode;
|
76 |
Db::OpenMode m_mode;
|
87 |
string m_basedir;
|
77 |
string m_basedir;
|
88 |
|
78 |
|
|
... |
|
... |
104 |
bool dbDataToRclDoc(std::string &data, Doc &doc,
|
94 |
bool dbDataToRclDoc(std::string &data, Doc &doc,
|
105 |
int qopts,
|
95 |
int qopts,
|
106 |
Xapian::docid docid,
|
96 |
Xapian::docid docid,
|
107 |
const list<string>& terms);
|
97 |
const list<string>& terms);
|
108 |
|
98 |
|
109 |
Native()
|
99 |
Native(Db *db)
|
|
|
100 |
: m_db(db),
|
110 |
: m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0)
|
101 |
m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0)
|
111 |
{ }
|
102 |
{ }
|
112 |
~Native() {
|
103 |
~Native() {
|
113 |
delete enquire;
|
104 |
delete enquire;
|
114 |
}
|
105 |
}
|
115 |
bool filterMatch(Db *rdb, Xapian::Document &xdoc) {
|
106 |
bool filterMatch(Db *rdb, Xapian::Document &xdoc) {
|
|
... |
|
... |
147 |
}
|
138 |
}
|
148 |
|
139 |
|
149 |
};
|
140 |
};
|
150 |
|
141 |
|
151 |
Db::Db()
|
142 |
Db::Db()
|
152 |
: m_qOpts(QO_NONE)
|
143 |
: m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250),
|
|
|
144 |
m_synthAbsWordCtxLen(4)
|
153 |
{
|
145 |
{
|
154 |
m_ndb = new Native;
|
146 |
m_ndb = new Native(this);
|
155 |
}
|
147 |
}
|
156 |
|
148 |
|
157 |
Db::~Db()
|
149 |
Db::~Db()
|
158 |
{
|
150 |
{
|
159 |
LOGDEB1(("Db::~Db\n"));
|
151 |
LOGDEB1(("Db::~Db\n"));
|
|
... |
|
... |
280 |
if (m_ndb->m_iswritable == true) {
|
272 |
if (m_ndb->m_iswritable == true) {
|
281 |
m_ndb->wdb.flush();
|
273 |
m_ndb->wdb.flush();
|
282 |
LOGDEB(("Rcl:Db: Called xapian flush\n"));
|
274 |
LOGDEB(("Rcl:Db: Called xapian flush\n"));
|
283 |
}
|
275 |
}
|
284 |
delete m_ndb;
|
276 |
delete m_ndb;
|
285 |
m_ndb = new Native;
|
277 |
m_ndb = new Native(this);
|
286 |
if (m_ndb)
|
278 |
if (m_ndb)
|
287 |
return true;
|
279 |
return true;
|
288 |
} catch (const Xapian::Error &e) {
|
280 |
} catch (const Xapian::Error &e) {
|
289 |
ermsg = e.get_msg().c_str();
|
281 |
ermsg = e.get_msg().c_str();
|
290 |
} catch (const string &s) {
|
282 |
} catch (const string &s) {
|
|
... |
|
... |
440 |
return true;
|
432 |
return true;
|
441 |
}
|
433 |
}
|
442 |
return true;
|
434 |
return true;
|
443 |
}
|
435 |
}
|
444 |
|
436 |
|
|
|
437 |
// Let our user set the parameters for abstract processing
|
|
|
438 |
void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
|
|
|
439 |
{
|
|
|
440 |
LOGDEB(("Db::setAbstractParams: trunc %d syntlen %d ctxlen %d\n",
|
|
|
441 |
idxtrunc, syntlen, syntctxlen));
|
|
|
442 |
if (idxtrunc > 0 && idxtrunc < 2000)
|
|
|
443 |
m_idxAbsTruncLen = idxtrunc;
|
|
|
444 |
if (syntlen > 0 && syntlen < 2000)
|
|
|
445 |
m_synthAbsLen = syntlen;
|
|
|
446 |
if (syntctxlen > 0 && syntctxlen < 20)
|
|
|
447 |
m_synthAbsWordCtxLen = syntctxlen;
|
|
|
448 |
}
|
|
|
449 |
|
445 |
// Add document in internal form to the database: index the terms in
|
450 |
// Add document in internal form to the database: index the terms in
|
446 |
// the title abstract and body and add special terms for file name,
|
451 |
// the title abstract and body and add special terms for file name,
|
447 |
// date, mime type ... , create the document data record (more
|
452 |
// date, mime type ... , create the document data record (more
|
448 |
// metadata), and update database
|
453 |
// metadata), and update database
|
449 |
bool Db::add(const string &fn, const Doc &idoc,
|
454 |
bool Db::add(const string &fn, const Doc &idoc,
|
|
... |
|
... |
455 |
|
460 |
|
456 |
Doc doc = idoc;
|
461 |
Doc doc = idoc;
|
457 |
|
462 |
|
458 |
// Truncate abstract, title and keywords to reasonable lengths. If
|
463 |
// Truncate abstract, title and keywords to reasonable lengths. If
|
459 |
// abstract is currently empty, we make up one with the beginning
|
464 |
// abstract is currently empty, we make up one with the beginning
|
460 |
// of the document.
|
465 |
// of the document. This is then not indexed, but part of the doc
|
|
|
466 |
// data so that we can return it to a query without having to
|
|
|
467 |
// decode the original file.
|
461 |
bool syntabs = false;
|
468 |
bool syntabs = false;
|
462 |
if (doc.abstract.empty()) {
|
469 |
if (doc.abstract.empty()) {
|
463 |
syntabs = true;
|
470 |
syntabs = true;
|
464 |
doc.abstract = rclSyntAbs +
|
471 |
doc.abstract = rclSyntAbs +
|
465 |
truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE);
|
472 |
truncate_to_word(doc.text, m_idxAbsTruncLen);
|
466 |
} else {
|
473 |
} else {
|
467 |
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
|
474 |
doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
|
468 |
}
|
475 |
}
|
469 |
doc.abstract = neutchars(doc.abstract, "\n\r");
|
476 |
doc.abstract = neutchars(doc.abstract, "\n\r");
|
470 |
doc.title = truncate_to_word(doc.title, 100);
|
477 |
doc.title = truncate_to_word(doc.title, 100);
|
471 |
doc.keywords = truncate_to_word(doc.keywords, 300);
|
478 |
doc.keywords = truncate_to_word(doc.keywords, 300);
|
472 |
|
479 |
|
|
... |
|
... |
511 |
return false;
|
518 |
return false;
|
512 |
}
|
519 |
}
|
513 |
splitter.text_to_words(noacc);
|
520 |
splitter.text_to_words(noacc);
|
514 |
splitData.basepos += splitData.curpos + 100;
|
521 |
splitData.basepos += splitData.curpos + 100;
|
515 |
|
522 |
|
516 |
// Split and index abstract
|
523 |
// Split and index abstract. We don't do this if it is synthetic
|
|
|
524 |
// any more (this used to give a relevance boost to the beginning
|
|
|
525 |
// of text, why ?)
|
517 |
LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
|
526 |
LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
|
|
|
527 |
if (!syntabs) {
|
|
|
528 |
// syntabs indicator test kept here in case we want to go back
|
|
|
529 |
// to indexing synthetic abstracts one day
|
518 |
if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) :
|
530 |
if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) :
|
519 |
doc.abstract, noacc)) {
|
531 |
doc.abstract, noacc)) {
|
520 |
LOGERR(("Db::add: dumb_string failed\n"));
|
532 |
LOGERR(("Db::add: dumb_string failed\n"));
|
521 |
return false;
|
533 |
return false;
|
522 |
}
|
534 |
}
|
523 |
splitter.text_to_words(noacc);
|
535 |
splitter.text_to_words(noacc);
|
|
|
536 |
}
|
524 |
splitData.basepos += splitData.curpos + 100;
|
537 |
splitData.basepos += splitData.curpos + 100;
|
525 |
|
538 |
|
526 |
////// Special terms for metadata
|
539 |
////// Special terms for metadata
|
527 |
// Mime type
|
540 |
// Mime type
|
528 |
newdocument.add_term("T" + doc.mimetype);
|
541 |
newdocument.add_term("T" + doc.mimetype);
|
|
... |
|
... |
1180 |
parms.get(string("dmtime"), doc.dmtime);
|
1193 |
parms.get(string("dmtime"), doc.dmtime);
|
1181 |
parms.get(string("origcharset"), doc.origcharset);
|
1194 |
parms.get(string("origcharset"), doc.origcharset);
|
1182 |
parms.get(string("caption"), doc.title);
|
1195 |
parms.get(string("caption"), doc.title);
|
1183 |
parms.get(string("keywords"), doc.keywords);
|
1196 |
parms.get(string("keywords"), doc.keywords);
|
1184 |
parms.get(string("abstract"), doc.abstract);
|
1197 |
parms.get(string("abstract"), doc.abstract);
|
|
|
1198 |
// Possibly remove synthetic abstract indicator (if it's there, we
|
|
|
1199 |
// used to index the beginning of the text as abstract).
|
1185 |
bool syntabs = false;
|
1200 |
bool syntabs = false;
|
1186 |
if (doc.abstract.find(rclSyntAbs) == 0) {
|
1201 |
if (doc.abstract.find(rclSyntAbs) == 0) {
|
1187 |
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
1202 |
doc.abstract = doc.abstract.substr(rclSyntAbs.length());
|
1188 |
syntabs = true;
|
1203 |
syntabs = true;
|
1189 |
}
|
1204 |
}
|
|
|
1205 |
// If the option is set and the abstract is synthetic or empty, build
|
|
|
1206 |
// abstract from position data.
|
1190 |
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
|
1207 |
if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
|
1191 |
LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n"));
|
1208 |
LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
|
1192 |
if (doc.abstract.empty() || syntabs ||
|
1209 |
if (doc.abstract.empty() || syntabs ||
|
1193 |
(qopts & Db::QO_REPLACE_ABSTRACT))
|
1210 |
(qopts & Db::QO_REPLACE_ABSTRACT))
|
1194 |
doc.abstract = makeAbstract(docid, terms);
|
1211 |
doc.abstract = makeAbstract(docid, terms);
|
1195 |
}
|
1212 |
}
|
1196 |
parms.get(string("ipath"), doc.ipath);
|
1213 |
parms.get(string("ipath"), doc.ipath);
|
1197 |
parms.get(string("fbytes"), doc.fbytes);
|
1214 |
parms.get(string("fbytes"), doc.fbytes);
|
1198 |
parms.get(string("dbytes"), doc.dbytes);
|
1215 |
parms.get(string("dbytes"), doc.dbytes);
|
1199 |
doc.xdocid = docid;
|
1216 |
doc.xdocid = docid;
|
1200 |
return true;
|
1217 |
return true;
|
|
... |
|
... |
1395 |
// Go through the list of query terms. For each entry in each
|
1412 |
// Go through the list of query terms. For each entry in each
|
1396 |
// position list, populate the slot in the document buffer, and
|
1413 |
// position list, populate the slot in the document buffer, and
|
1397 |
// remember the position and its neighbours
|
1414 |
// remember the position and its neighbours
|
1398 |
vector<unsigned int> qtermposs; // The term positions
|
1415 |
vector<unsigned int> qtermposs; // The term positions
|
1399 |
set<unsigned int> chunkposs; // All the positions we shall populate
|
1416 |
set<unsigned int> chunkposs; // All the positions we shall populate
|
|
|
1417 |
int totaloccs = 0;
|
1400 |
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
|
1418 |
for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
|
1401 |
qit++) {
|
1419 |
qit++) {
|
1402 |
Xapian::PositionIterator pos;
|
1420 |
Xapian::PositionIterator pos;
|
1403 |
// There may be query terms not in this doc. This raises an
|
1421 |
// There may be query terms not in this doc. This raises an
|
1404 |
// exception when requesting the position list, we just catch it.
|
1422 |
// exception when requesting the position list, we just catch it.
|
|
... |
|
... |
1407 |
for (pos = db.positionlist_begin(docid, *qit);
|
1425 |
for (pos = db.positionlist_begin(docid, *qit);
|
1408 |
pos != db.positionlist_end(docid, *qit); pos++) {
|
1426 |
pos != db.positionlist_end(docid, *qit); pos++) {
|
1409 |
unsigned int ipos = *pos;
|
1427 |
unsigned int ipos = *pos;
|
1410 |
LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
1428 |
LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
1411 |
// Possibly extend the array. Do it in big chunks
|
1429 |
// Possibly extend the array. Do it in big chunks
|
1412 |
if (ipos + MA_EXTRACT_WIDTH >= buf.size()) {
|
1430 |
if (ipos + m_db->m_synthAbsWordCtxLen >= buf.size()) {
|
1413 |
buf.resize(ipos + MA_EXTRACT_WIDTH + 1000);
|
1431 |
buf.resize(ipos + m_db->m_synthAbsWordCtxLen + 1000);
|
1414 |
}
|
1432 |
}
|
1415 |
buf[ipos] = *qit;
|
1433 |
buf[ipos] = *qit;
|
1416 |
// Remember the term position
|
1434 |
// Remember the term position
|
1417 |
qtermposs.push_back(ipos);
|
1435 |
qtermposs.push_back(ipos);
|
1418 |
// Add adjacent slots to the set to populate at next step
|
1436 |
// Add adjacent slots to the set to populate at next step
|
1419 |
for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH);
|
1437 |
for (unsigned int ii = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
1420 |
ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) {
|
1438 |
ii <= MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1); ii++) {
|
1421 |
chunkposs.insert(ii);
|
1439 |
chunkposs.insert(ii);
|
1422 |
}
|
1440 |
}
|
1423 |
// Limit the number of occurrences we keep for each
|
1441 |
// Limit the number of occurrences we keep for each
|
1424 |
// term. The abstract has a finite length anyway !
|
1442 |
// term. The abstract has a finite length anyway !
|
1425 |
if (occurrences++ > 10)
|
1443 |
if (occurrences++ > 10)
|
1426 |
break;
|
1444 |
break;
|
1427 |
}
|
1445 |
}
|
1428 |
} catch (...) {
|
1446 |
} catch (...) {
|
1429 |
}
|
1447 |
}
|
|
|
1448 |
// Limit total size
|
|
|
1449 |
if (totaloccs++ > 100)
|
|
|
1450 |
break;
|
1430 |
}
|
1451 |
}
|
1431 |
|
1452 |
|
1432 |
LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n",
|
1453 |
LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n",
|
1433 |
chron.millis(), qtermposs.size()));
|
1454 |
chron.millis(), qtermposs.size()));
|
1434 |
|
1455 |
|
|
... |
|
... |
1468 |
// Extract data around the first (in random order) term positions,
|
1489 |
// Extract data around the first (in random order) term positions,
|
1469 |
// and store the chunks in the map
|
1490 |
// and store the chunks in the map
|
1470 |
for (vector<unsigned int>::const_iterator it = qtermposs.begin();
|
1491 |
for (vector<unsigned int>::const_iterator it = qtermposs.begin();
|
1471 |
it != qtermposs.end(); it++) {
|
1492 |
it != qtermposs.end(); it++) {
|
1472 |
unsigned int ipos = *it;
|
1493 |
unsigned int ipos = *it;
|
1473 |
unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH);
|
1494 |
unsigned int start = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
|
1474 |
unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1);
|
1495 |
unsigned int end = MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1);
|
1475 |
string chunk;
|
1496 |
string chunk;
|
1476 |
for (unsigned int ii = start; ii <= end; ii++) {
|
1497 |
for (unsigned int ii = start; ii <= end; ii++) {
|
1477 |
if (!buf[ii].empty()) {
|
1498 |
if (!buf[ii].empty()) {
|
1478 |
chunk += buf[ii] + " ";
|
1499 |
chunk += buf[ii] + " ";
|
1479 |
abslen += buf[ii].length();
|
1500 |
abslen += buf[ii].length();
|
1480 |
}
|
1501 |
}
|
1481 |
if (abslen > MA_ABSTRACT_SIZE)
|
1502 |
if (int(abslen) > m_db->m_synthAbsLen)
|
1482 |
break;
|
1503 |
break;
|
1483 |
}
|
1504 |
}
|
1484 |
if (end != buf.size()-1)
|
1505 |
if (end != buf.size()-1)
|
1485 |
chunk += "... ";
|
1506 |
chunk += "... ";
|
1486 |
mabs[ipos] = chunk;
|
1507 |
mabs[ipos] = chunk;
|
1487 |
if (abslen > MA_ABSTRACT_SIZE)
|
1508 |
if (int(abslen) > m_db->m_synthAbsLen)
|
1488 |
break;
|
1509 |
break;
|
1489 |
}
|
1510 |
}
|
1490 |
|
1511 |
|
1491 |
// Build the abstract by walking the map (in order of position)
|
1512 |
// Build the abstract by walking the map (in order of position)
|
1492 |
string abstract;
|
1513 |
string abstract;
|