|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
1 |
/* Copyright (C) 2004 J.F.Dockes
|
1 |
/* Copyright (C) 2004-2018 J.F.Dockes
|
2 |
* This program is free software; you can redistribute it and/or modify
|
2 |
* This program is free software; you can redistribute it and/or modify
|
3 |
* it under the terms of the GNU General Public License as published by
|
3 |
* it under the terms of the GNU General Public License as published by
|
4 |
* the Free Software Foundation; either version 2 of the License, or
|
4 |
* the Free Software Foundation; either version 2 of the License, or
|
5 |
* (at your option) any later version.
|
5 |
* (at your option) any later version.
|
6 |
*
|
6 |
*
|
|
... |
|
... |
48 |
#include "chrono.h"
|
48 |
#include "chrono.h"
|
49 |
#include "utf8iter.h"
|
49 |
#include "utf8iter.h"
|
50 |
#include "searchdata.h"
|
50 |
#include "searchdata.h"
|
51 |
#include "rclquery.h"
|
51 |
#include "rclquery.h"
|
52 |
#include "rclquery_p.h"
|
52 |
#include "rclquery_p.h"
|
|
|
53 |
#include "rclvalues.h"
|
53 |
#include "md5ut.h"
|
54 |
#include "md5ut.h"
|
54 |
#include "rclversion.h"
|
55 |
#include "rclversion.h"
|
55 |
#include "cancelcheck.h"
|
56 |
#include "cancelcheck.h"
|
56 |
#include "termproc.h"
|
57 |
#include "termproc.h"
|
57 |
#include "expansiondbs.h"
|
58 |
#include "expansiondbs.h"
|
|
... |
|
... |
60 |
#include "utf8fn.h"
|
61 |
#include "utf8fn.h"
|
61 |
#include "wipedir.h"
|
62 |
#include "wipedir.h"
|
62 |
#ifdef RCL_USE_ASPELL
|
63 |
#ifdef RCL_USE_ASPELL
|
63 |
#include "rclaspell.h"
|
64 |
#include "rclaspell.h"
|
64 |
#endif
|
65 |
#endif
|
|
|
66 |
#include "zlibut.h"
|
|
|
67 |
|
|
|
68 |
#ifndef XAPIAN_AT_LEAST
|
|
|
69 |
// Added in Xapian 1.4.2. Define it here for older versions
|
|
|
70 |
#define XAPIAN_AT_LEAST(A,B,C) \
|
|
|
71 |
(XAPIAN_MAJOR_VERSION > (A) || \
|
|
|
72 |
(XAPIAN_MAJOR_VERSION == (A) && \
|
|
|
73 |
(XAPIAN_MINOR_VERSION > (B) || \
|
|
|
74 |
(XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
|
|
|
75 |
#endif
|
|
|
76 |
|
65 |
|
77 |
|
66 |
// Recoll index format version is stored in user metadata. When this change,
|
78 |
// Recoll index format version is stored in user metadata. When this change,
|
67 |
// we can't open the db and will have to reindex.
|
79 |
// we can't open the db and will have to reindex.
|
68 |
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
80 |
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
69 |
static const string cstr_RCL_IDX_VERSION("1");
|
81 |
static const string cstr_RCL_IDX_VERSION("1");
|
|
|
82 |
static const string cstr_RCL_IDX_DESCRIPTOR_KEY("RCL_IDX_DESCRIPTOR_KEY");
|
70 |
|
83 |
|
71 |
static const string cstr_mbreaks("rclmbreaks");
|
84 |
static const string cstr_mbreaks("rclmbreaks");
|
72 |
|
85 |
|
73 |
namespace Rcl {
|
86 |
namespace Rcl {
|
74 |
|
87 |
|
|
... |
|
... |
188 |
}
|
201 |
}
|
189 |
bool status = false;
|
202 |
bool status = false;
|
190 |
switch (tsk->op) {
|
203 |
switch (tsk->op) {
|
191 |
case DbUpdTask::AddOrUpdate:
|
204 |
case DbUpdTask::AddOrUpdate:
|
192 |
LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
|
205 |
LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
|
193 |
status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm,
|
206 |
status = ndbp->addOrUpdateWrite(
|
194 |
tsk->doc, tsk->txtlen);
|
207 |
tsk->udi, tsk->uniterm, tsk->doc, tsk->txtlen, tsk->rawztext);
|
195 |
break;
|
208 |
break;
|
196 |
case DbUpdTask::Delete:
|
209 |
case DbUpdTask::Delete:
|
197 |
LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
|
210 |
LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
|
198 |
status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm);
|
211 |
status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm);
|
199 |
break;
|
212 |
break;
|
|
... |
|
... |
236 |
writeqlen << " wqts " << writethreads << "\n");
|
249 |
writeqlen << " wqts " << writethreads << "\n");
|
237 |
}
|
250 |
}
|
238 |
|
251 |
|
239 |
#endif // IDX_THREADS
|
252 |
#endif // IDX_THREADS
|
240 |
|
253 |
|
|
|
254 |
void Db::Native::openWrite(const string& dir, Db::OpenMode mode)
|
|
|
255 |
{
|
|
|
256 |
int action = (mode == Db::DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
|
|
|
257 |
Xapian::DB_CREATE_OR_OVERWRITE;
|
|
|
258 |
|
|
|
259 |
#ifdef _WIN32
|
|
|
260 |
// Xapian is quite bad at erasing partial db which can
|
|
|
261 |
// occur because of open file deletion errors on
|
|
|
262 |
// Windows.
|
|
|
263 |
if (mode == DbTrunc) {
|
|
|
264 |
if (path_exists(path_cat(dir, "iamchert"))) {
|
|
|
265 |
wipedir(dir);
|
|
|
266 |
unlink(dir.c_str());
|
|
|
267 |
}
|
|
|
268 |
}
|
|
|
269 |
#endif
|
|
|
270 |
|
|
|
271 |
if (::access(dir.c_str(), 0) == 0) {
|
|
|
272 |
// Existing index
|
|
|
273 |
xwdb = Xapian::WritableDatabase(dir, action);
|
|
|
274 |
} else {
|
|
|
275 |
// New index. If possible, and depending on config, use a stub
|
|
|
276 |
// to force using Chert. No sense in doing this if we are
|
|
|
277 |
// storing the text anyway.
|
|
|
278 |
#if XAPIAN_AT_LEAST(1,3,0) && XAPIAN_HAS_CHERT_BACKEND
|
|
|
279 |
// Xapian with Glass and Chert support. If storedoctext is
|
|
|
280 |
// specified in the configuration, use the default backend
|
|
|
281 |
// (Glass), else force Chert. There might be reasons why
|
|
|
282 |
// someone would want to use Chert and store text anyway, but
|
|
|
283 |
// it's an exotic case, and things are complicated enough
|
|
|
284 |
// already.
|
|
|
285 |
if (o_index_storedoctext) {
|
|
|
286 |
xwdb = Xapian::WritableDatabase(dir, action);
|
|
|
287 |
m_storetext = true;
|
|
|
288 |
} else {
|
|
|
289 |
// Force Chert format, don't store the text.
|
|
|
290 |
string stub = path_cat(m_rcldb->m_config->getConfDir(),
|
|
|
291 |
"xapian.stub");
|
|
|
292 |
FILE *fp = fopen(stub.c_str(), "w");
|
|
|
293 |
if (nullptr == fp) {
|
|
|
294 |
throw(string("Can't create ") + stub);
|
|
|
295 |
}
|
|
|
296 |
fprintf(fp, "chert %s\n", dir.c_str());
|
|
|
297 |
fclose(fp);
|
|
|
298 |
xwdb = Xapian::WritableDatabase(stub, action);
|
|
|
299 |
m_storetext = false;
|
|
|
300 |
}
|
|
|
301 |
#elif (! XAPIAN_AT_LEAST(1,3,0)) || XAPIAN_AT_LEAST(1,5,0)
|
|
|
302 |
// Old Xapian (chert only) or newer (no chert). Use the
|
|
|
303 |
// default index backend and let the user decide of the
|
|
|
304 |
// abstract generation method. The configured default is to
|
|
|
305 |
// store the text.
|
|
|
306 |
xwdb = Xapian::WritableDatabase(dir, action);
|
|
|
307 |
m_storetext = o_index_storedoctext;
|
|
|
308 |
#endif
|
|
|
309 |
// Set the storetext value inside the index descriptor (new
|
|
|
310 |
// with recoll 1.24, maybe we'll have other stuff to store in
|
|
|
311 |
// there in the future).
|
|
|
312 |
string desc = string("storetext=") + (m_storetext ? "1" : "0") + "\n";
|
|
|
313 |
xwdb.set_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY, desc);
|
|
|
314 |
}
|
|
|
315 |
|
|
|
316 |
// If the index is empty, write the data format version at once
|
|
|
317 |
// to avoid stupid error messages:
|
|
|
318 |
if (xwdb.get_doccount() == 0) {
|
|
|
319 |
xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
|
|
|
320 |
}
|
|
|
321 |
|
|
|
322 |
m_iswritable = true;
|
|
|
323 |
|
|
|
324 |
#ifdef IDX_THREADS
|
|
|
325 |
maybeStartThreads();
|
|
|
326 |
#endif
|
|
|
327 |
}
|
|
|
328 |
|
|
|
329 |
void Db::Native::openRead(const string& dir)
|
|
|
330 |
{
|
|
|
331 |
m_iswritable = false;
|
|
|
332 |
xrdb = Xapian::Database(dir);
|
|
|
333 |
string desc = xrdb.get_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY);
|
|
|
334 |
ConfSimple cf(desc, 1);
|
|
|
335 |
string val;
|
|
|
336 |
m_storetext = false;
|
|
|
337 |
if (cf.get("storetext", val) && stringToBool(val)) {
|
|
|
338 |
m_storetext = true;
|
|
|
339 |
}
|
|
|
340 |
LOGDEB("Db::openRead: index " << (m_storetext?"stores":"does not store") <<
|
|
|
341 |
" document text\n");
|
|
|
342 |
}
|
|
|
343 |
|
241 |
/* See comment in class declaration: return all subdocuments of a
|
344 |
/* See comment in class declaration: return all subdocuments of a
|
242 |
* document given by its unique id.
|
345 |
* document given by its unique id. */
|
243 |
*/
|
|
|
244 |
bool Db::Native::subDocs(const string &udi, int idxi,
|
346 |
bool Db::Native::subDocs(const string &udi, int idxi,
|
245 |
vector<Xapian::docid>& docids)
|
347 |
vector<Xapian::docid>& docids)
|
246 |
{
|
348 |
{
|
247 |
LOGDEB2("subDocs: [" << uniterm << "]\n");
|
349 |
LOGDEB2("subDocs: [" << uniterm << "]\n");
|
248 |
string pterm = make_parentterm(udi);
|
350 |
string pterm = make_parentterm(udi);
|
|
... |
|
... |
439 |
return 0;
|
541 |
return 0;
|
440 |
}
|
542 |
}
|
441 |
|
543 |
|
442 |
// Turn data record from db into document fields
|
544 |
// Turn data record from db into document fields
|
443 |
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
545 |
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
444 |
Doc &doc)
|
546 |
Doc &doc, bool fetchtext)
|
445 |
{
|
547 |
{
|
446 |
LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n");
|
548 |
LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n");
|
447 |
ConfSimple parms(data);
|
549 |
ConfSimple parms(data);
|
448 |
if (!parms.ok())
|
550 |
if (!parms.ok())
|
449 |
return false;
|
551 |
return false;
|
|
... |
|
... |
499 |
if (doc.meta.find(*it) == doc.meta.end())
|
601 |
if (doc.meta.find(*it) == doc.meta.end())
|
500 |
parms.get(*it, doc.meta[*it]);
|
602 |
parms.get(*it, doc.meta[*it]);
|
501 |
}
|
603 |
}
|
502 |
doc.meta[Doc::keyurl] = doc.url;
|
604 |
doc.meta[Doc::keyurl] = doc.url;
|
503 |
doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
|
605 |
doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
|
|
|
606 |
if (fetchtext) {
|
|
|
607 |
getRawText(docid, doc.text);
|
|
|
608 |
}
|
504 |
return true;
|
609 |
return true;
|
505 |
}
|
610 |
}
|
506 |
|
611 |
|
507 |
bool Db::Native::hasPages(Xapian::docid docid)
|
612 |
bool Db::Native::hasPages(Xapian::docid docid)
|
508 |
{
|
613 |
{
|
|
... |
|
... |
578 |
vector<int>::const_iterator it =
|
683 |
vector<int>::const_iterator it =
|
579 |
upper_bound(pbreaks.begin(), pbreaks.end(), pos);
|
684 |
upper_bound(pbreaks.begin(), pbreaks.end(), pos);
|
580 |
return int(it - pbreaks.begin() + 1);
|
685 |
return int(it - pbreaks.begin() + 1);
|
581 |
}
|
686 |
}
|
582 |
|
687 |
|
|
|
688 |
// Retrieve the text stored in the index for a document. The text is
// kept compressed in a metadata record keyed by rawtextMetaKey(docid).
//
// @param docid    Xapian document id.
// @param rawtext  output: the uncompressed text. Left empty if there
//                 is no record for the document or if decompression
//                 fails.
// @return false if the index does not store text, or if the record
//   could not be read or uncompressed. true otherwise (including when
//   the record is empty).
bool Db::Native::getRawText(Xapian::docid docid, string& rawtext)
{
    if (!m_storetext) {
        LOGDEB("Db::Native::getRawText: document text not stored in index\n");
        return false;
    }
    string reason;
    XAPTRY(rawtext = xrdb.get_metadata(rawtextMetaKey(docid)), xrdb, reason);
    if (!reason.empty()) {
        LOGERR("Rcl::Db::getRawText: could not get value: " << reason << endl);
        return false;
    }
    if (rawtext.empty()) {
        return true;
    }
    ZLibUtBuf cbuf;
    // NOTE(review): assumes inflateToBuf() returns false on failure;
    // confirm against its declaration in zlibut.h.
    if (!inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf)) {
        LOGERR("Rcl::Db::getRawText: could not uncompress text for docid " <<
               docid << "\n");
        // Don't hand the compressed bytes back as if they were text.
        rawtext.clear();
        return false;
    }
    rawtext.assign(cbuf.getBuf(), cbuf.getCnt());
    return true;
}
|
|
|
708 |
|
583 |
// Note: we're passed a Xapian::Document* because Xapian
|
709 |
// Note: we're passed a Xapian::Document* because Xapian
|
584 |
// reference-counting is not mt-safe. We take ownership and need
|
710 |
// reference-counting is not mt-safe. We take ownership and need
|
585 |
// to delete it before returning.
|
711 |
// to delete it before returning.
|
586 |
bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
|
712 |
bool Db::Native::addOrUpdateWrite(
|
587 |
Xapian::Document *newdocument_ptr,
|
713 |
const string& udi, const string& uniterm, Xapian::Document *newdocument_ptr,
|
588 |
size_t textlen)
|
714 |
size_t textlen, const string& rawztext)
|
589 |
{
|
715 |
{
|
590 |
#ifdef IDX_THREADS
|
716 |
#ifdef IDX_THREADS
|
591 |
Chrono chron;
|
717 |
Chrono chron;
|
592 |
std::unique_lock<std::mutex> lock(m_mutex);
|
718 |
std::unique_lock<std::mutex> lock(m_mutex);
|
593 |
#endif
|
719 |
#endif
|
594 |
std::shared_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
|
720 |
std::unique_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
|
595 |
|
721 |
|
596 |
// Check file system full every mbyte of indexed text. It's a bit wasteful
|
722 |
// Check file system full every mbyte of indexed text. It's a bit wasteful
|
597 |
// to do this after having prepared the document, but it needs to be in
|
723 |
// to do this after having prepared the document, but it needs to be in
|
598 |
// the single-threaded section.
|
724 |
// the single-threaded section.
|
599 |
if (m_rcldb->m_maxFsOccupPc > 0 &&
|
725 |
if (m_rcldb->m_maxFsOccupPc > 0 &&
|
|
... |
|
... |
612 |
|
738 |
|
613 |
const char *fnc = udi.c_str();
|
739 |
const char *fnc = udi.c_str();
|
614 |
string ermsg;
|
740 |
string ermsg;
|
615 |
|
741 |
|
616 |
// Add db entry or update existing entry:
|
742 |
// Add db entry or update existing entry:
|
|
|
743 |
Xapian::docid did = 0;
|
617 |
try {
|
744 |
try {
|
618 |
Xapian::docid did =
|
|
|
619 |
xwdb.replace_document(uniterm, *newdocument_ptr);
|
745 |
did = xwdb.replace_document(uniterm, *newdocument_ptr);
|
620 |
if (did < m_rcldb->updated.size()) {
|
746 |
if (did < m_rcldb->updated.size()) {
|
621 |
// This is necessary because only the file-level docs are tested
|
747 |
// This is necessary because only the file-level docs are tested
|
622 |
// by needUpdate(), so the subdocs existence flags are only set
|
748 |
// by needUpdate(), so the subdocs existence flags are only set
|
623 |
// here.
|
749 |
// here.
|
624 |
m_rcldb->updated[did] = true;
|
750 |
m_rcldb->updated[did] = true;
|
625 |
LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n");
|
751 |
LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n");
|
626 |
} else {
|
752 |
} else {
|
627 |
LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
|
753 |
LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
|
628 |
}
|
754 |
}
|
629 |
} XCATCHERROR(ermsg);
|
755 |
} XCATCHERROR(ermsg);
|
630 |
|
|
|
631 |
if (!ermsg.empty()) {
|
756 |
if (!ermsg.empty()) {
|
632 |
LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
|
757 |
LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
|
633 |
ermsg.erase();
|
758 |
ermsg.erase();
|
634 |
// FIXME: is this ever actually needed?
|
759 |
// FIXME: is this ever actually needed?
|
635 |
try {
|
760 |
try {
|
|
... |
|
... |
641 |
LOGERR("Db::add: add_document failed: " << ermsg << "\n");
|
766 |
LOGERR("Db::add: add_document failed: " << ermsg << "\n");
|
642 |
return false;
|
767 |
return false;
|
643 |
}
|
768 |
}
|
644 |
}
|
769 |
}
|
645 |
|
770 |
|
|
|
771 |
XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext),
|
|
|
772 |
xwdb, m_rcldb->m_reason);
|
|
|
773 |
if (!m_rcldb->m_reason.empty()) {
|
|
|
774 |
LOGERR("Db::addOrUpdate: set_metadata error: " <<
|
|
|
775 |
m_rcldb->m_reason << "\n");
|
|
|
776 |
// This only affects snippets, so let's say not fatal
|
|
|
777 |
}
|
|
|
778 |
|
646 |
// Test if we're over the flush threshold (limit memory usage):
|
779 |
// Test if we're over the flush threshold (limit memory usage):
|
647 |
bool ret = m_rcldb->maybeflush(textlen);
|
780 |
bool ret = m_rcldb->maybeflush(textlen);
|
648 |
#ifdef IDX_THREADS
|
781 |
#ifdef IDX_THREADS
|
649 |
m_totalworkns += chron.nanos();
|
782 |
m_totalworkns += chron.nanos();
|
650 |
#endif
|
783 |
#endif
|
|
... |
|
... |
680 |
LOGINFO("purgeFileWrite: got empty sig\n");
|
813 |
LOGINFO("purgeFileWrite: got empty sig\n");
|
681 |
return false;
|
814 |
return false;
|
682 |
}
|
815 |
}
|
683 |
} else {
|
816 |
} else {
|
684 |
LOGDEB("purgeFile: delete docid " << *docid << "\n");
|
817 |
LOGDEB("purgeFile: delete docid " << *docid << "\n");
|
685 |
xwdb.delete_document(*docid);
|
818 |
deleteDocument(*docid);
|
686 |
}
|
819 |
}
|
687 |
vector<Xapian::docid> docids;
|
820 |
vector<Xapian::docid> docids;
|
688 |
subDocs(udi, 0, docids);
|
821 |
subDocs(udi, 0, docids);
|
689 |
LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n");
|
822 |
LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n");
|
690 |
for (vector<Xapian::docid>::iterator it = docids.begin();
|
823 |
for (vector<Xapian::docid>::iterator it = docids.begin();
|
|
... |
|
... |
703 |
}
|
836 |
}
|
704 |
}
|
837 |
}
|
705 |
|
838 |
|
706 |
if (!orphansOnly || sig != subdocsig) {
|
839 |
if (!orphansOnly || sig != subdocsig) {
|
707 |
LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
|
840 |
LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
|
708 |
xwdb.delete_document(*it);
|
841 |
deleteDocument(*it);
|
709 |
}
|
842 |
}
|
710 |
}
|
843 |
}
|
711 |
return true;
|
844 |
return true;
|
712 |
} XCATCHERROR(ermsg);
|
845 |
} XCATCHERROR(ermsg);
|
713 |
if (!ermsg.empty()) {
|
846 |
if (!ermsg.empty()) {
|
|
... |
|
... |
765 |
vector<string> res;
|
898 |
vector<string> res;
|
766 |
stringToStrings(Xapian::Stem::get_available_languages(), res);
|
899 |
stringToStrings(Xapian::Stem::get_available_languages(), res);
|
767 |
return res;
|
900 |
return res;
|
768 |
}
|
901 |
}
|
769 |
|
902 |
|
|
|
903 |
|
770 |
bool Db::open(OpenMode mode, OpenError *error)
|
904 |
bool Db::open(OpenMode mode, OpenError *error)
|
771 |
{
|
905 |
{
|
772 |
if (error)
|
906 |
if (error)
|
773 |
*error = DbOpenMainDb;
|
907 |
*error = DbOpenMainDb;
|
774 |
|
908 |
|
|
... |
|
... |
791 |
string ermsg;
|
925 |
string ermsg;
|
792 |
try {
|
926 |
try {
|
793 |
switch (mode) {
|
927 |
switch (mode) {
|
794 |
case DbUpd:
|
928 |
case DbUpd:
|
795 |
case DbTrunc:
|
929 |
case DbTrunc:
|
796 |
{
|
930 |
m_ndb->openWrite(dir, mode);
|
797 |
// Xapian is quite bad at erasing partial db which can
|
931 |
updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
|
798 |
// occur because of open file deletion errors on
|
|
|
799 |
// Windows.
|
|
|
800 |
if (mode == DbTrunc) {
|
|
|
801 |
if (path_exists(path_cat(dir, "iamchert"))) {
|
|
|
802 |
wipedir(dir);
|
|
|
803 |
unlink(dir.c_str());
|
|
|
804 |
}
|
|
|
805 |
}
|
|
|
806 |
int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
|
|
|
807 |
Xapian::DB_CREATE_OR_OVERWRITE;
|
|
|
808 |
if (::access(dir.c_str(), 0) != 0) {
|
|
|
809 |
// New index. use a stub to force using Chert
|
|
|
810 |
string stub = path_cat(m_config->getConfDir(),
|
|
|
811 |
"xapian.stub");
|
|
|
812 |
FILE *fp = fopen(stub.c_str(), "w");
|
|
|
813 |
if (nullptr == fp) {
|
|
|
814 |
throw(string("Can't create ") + stub);
|
|
|
815 |
}
|
|
|
816 |
fprintf(fp, "chert %s\n", dir.c_str());
|
|
|
817 |
fclose(fp);
|
|
|
818 |
m_ndb->xwdb = Xapian::WritableDatabase(stub, action);
|
|
|
819 |
} else {
|
|
|
820 |
m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
|
|
|
821 |
}
|
|
|
822 |
// If db is empty, write the data format version at once
|
|
|
823 |
// to avoid stupid error messages:
|
|
|
824 |
if (m_ndb->xwdb.get_doccount() == 0)
|
|
|
825 |
m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY,
|
|
|
826 |
cstr_RCL_IDX_VERSION);
|
|
|
827 |
m_ndb->m_iswritable = true;
|
|
|
828 |
#ifdef IDX_THREADS
|
|
|
829 |
m_ndb->maybeStartThreads();
|
|
|
830 |
#endif
|
|
|
831 |
// We used to open a readonly object in addition to
|
932 |
// We used to open a readonly object in addition to the
|
832 |
// the r/w one because some operations were faster
|
933 |
// r/w one because some operations were faster when
|
833 |
// when performed through a Database: no forced
|
934 |
// performed through a Database: no forced flushes on
|
834 |
// flushes on allterms_begin(), used in
|
935 |
// allterms_begin(), used in subDocs(). This issue has
|
835 |
// subDocs(). This issue has been gone for a long time
|
936 |
// been gone for a long time (now: Xapian 1.2) and the
|
836 |
// (now: Xapian 1.2) and the separate objects seem to
|
937 |
// separate objects seem to trigger other Xapian issues,
|
837 |
// trigger other Xapian issues, so the query db is now
|
|
|
838 |
// a clone of the update one.
|
938 |
// so the query db is now a clone of the update one.
|
839 |
m_ndb->xrdb = m_ndb->xwdb;
|
939 |
m_ndb->xrdb = m_ndb->xwdb;
|
840 |
LOGDEB("Db::open: lastdocid: " << m_ndb->xwdb.get_lastdocid() <<
|
940 |
LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
|
841 |
"\n");
|
|
|
842 |
LOGDEB2("Db::open: resetting updated\n");
|
|
|
843 |
updated.resize(m_ndb->xwdb.get_lastdocid() + 1);
|
|
|
844 |
for (unsigned int i = 0; i < updated.size(); i++)
|
|
|
845 |
updated[i] = false;
|
|
|
846 |
}
|
|
|
847 |
break;
|
941 |
break;
|
848 |
case DbRO:
|
942 |
case DbRO:
|
849 |
default:
|
943 |
default:
|
850 |
m_ndb->m_iswritable = false;
|
944 |
m_ndb->openRead(dir);
|
851 |
m_ndb->xrdb = Xapian::Database(dir);
|
945 |
for (auto& db : m_extraDbs) {
|
852 |
for (vector<string>::iterator it = m_extraDbs.begin();
|
|
|
853 |
it != m_extraDbs.end(); it++) {
|
|
|
854 |
if (error)
|
946 |
if (error)
|
855 |
*error = DbOpenExtraDb;
|
947 |
*error = DbOpenExtraDb;
|
856 |
LOGDEB("Db::Open: adding query db [" << &(*it) << "]\n");
|
948 |
LOGDEB("Db::Open: adding query db [" << &db << "]\n");
|
857 |
// An error here used to be non-fatal (1.13 and older)
|
949 |
// An error here used to be non-fatal (1.13 and older)
|
858 |
// but I can't see why
|
950 |
// but I can't see why
|
859 |
m_ndb->xrdb.add_database(Xapian::Database(*it));
|
951 |
m_ndb->xrdb.add_database(Xapian::Database(db));
|
860 |
}
|
952 |
}
|
861 |
break;
|
953 |
break;
|
862 |
}
|
954 |
}
|
863 |
if (error)
|
955 |
if (error)
|
864 |
*error = DbOpenMainDb;
|
956 |
*error = DbOpenMainDb;
|
|
... |
|
... |
1049 |
string aerr;
|
1141 |
string aerr;
|
1050 |
bool mstripped = true;
|
1142 |
bool mstripped = true;
|
1051 |
LOGDEB("Db::testDbDir: [" << dir << "]\n");
|
1143 |
LOGDEB("Db::testDbDir: [" << dir << "]\n");
|
1052 |
try {
|
1144 |
try {
|
1053 |
Xapian::Database db(dir);
|
1145 |
Xapian::Database db(dir);
|
1054 |
// If we have terms with a leading ':' it's an
|
1146 |
// If the prefix for mimetype is wrapped, it's an unstripped
|
1055 |
// unstripped index
|
1147 |
// index. T has been in use in recoll since the beginning and
|
|
|
1148 |
// all documents have a T field (possibly empty).
|
1056 |
Xapian::TermIterator term = db.allterms_begin(":");
|
1149 |
Xapian::TermIterator term = db.allterms_begin(":T:");
|
1057 |
if (term == db.allterms_end())
|
1150 |
if (term == db.allterms_end()) {
|
1058 |
mstripped = true;
|
1151 |
mstripped = true;
|
1059 |
else
|
1152 |
} else {
|
1060 |
mstripped = false;
|
1153 |
mstripped = false;
|
|
|
1154 |
}
|
|
|
1155 |
LOGDEB("testDbDir: " << dir << " is a " <<
|
|
|
1156 |
(mstripped ? "stripped" : "raw") << " index\n");
|
1061 |
} XCATCHERROR(aerr);
|
1157 |
} XCATCHERROR(aerr);
|
1062 |
if (!aerr.empty()) {
|
1158 |
if (!aerr.empty()) {
|
1063 |
LOGERR("Db::Open: error while trying to open database from [" <<
|
1159 |
LOGERR("Db::Open: error while trying to open database from [" <<
|
1064 |
dir << "]: " << aerr << "\n");
|
1160 |
dir << "]: " << aerr << "\n");
|
1065 |
return false;
|
1161 |
return false;
|
|
... |
|
... |
1368 |
tpidx.setTSD(&splitter);
|
1464 |
tpidx.setTSD(&splitter);
|
1369 |
|
1465 |
|
1370 |
// Udi unique term: this is used for file existence/uptodate
|
1466 |
// Udi unique term: this is used for file existence/uptodate
|
1371 |
// checks, and unique id for the replace_document() call.
|
1467 |
// checks, and unique id for the replace_document() call.
|
1372 |
string uniterm = make_uniterm(udi);
|
1468 |
string uniterm = make_uniterm(udi);
|
|
|
1469 |
string rawztext; // Doc compressed text
|
1373 |
|
1470 |
|
1374 |
if (doc.onlyxattr) {
|
1471 |
if (doc.onlyxattr) {
|
1375 |
// Only updating an existing doc with new extended attributes
|
1472 |
// Only updating an existing doc with new extended attributes
|
1376 |
// data. Need to read the old doc and its data record
|
1473 |
// data. Need to read the old doc and its data record
|
1377 |
// first. This is so different from the normal processing that
|
1474 |
// first. This is so different from the normal processing that
|
|
... |
|
... |
1419 |
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
1516 |
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
1420 |
splitter.basepos + splitter.curpos++);
|
1517 |
splitter.basepos + splitter.curpos++);
|
1421 |
for (vector<string>::iterator it = vpath.begin();
|
1518 |
for (vector<string>::iterator it = vpath.begin();
|
1422 |
it != vpath.end(); it++){
|
1519 |
it != vpath.end(); it++){
|
1423 |
if (it->length() > 230) {
|
1520 |
if (it->length() > 230) {
|
1424 |
// Just truncate it. May still be useful because of wildcards
|
1521 |
// Just truncate it. May still be useful because
|
|
|
1522 |
// of wildcards
|
1425 |
*it = it->substr(0, 230);
|
1523 |
*it = it->substr(0, 230);
|
1426 |
}
|
1524 |
}
|
1427 |
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
1525 |
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
1428 |
splitter.basepos + splitter.curpos++);
|
1526 |
splitter.basepos + splitter.curpos++);
|
1429 |
}
|
1527 |
}
|
|
... |
|
... |
1434 |
// positions, as we may want to do phrase searches with them (this
|
1532 |
// positions, as we may want to do phrase searches with them (this
|
1435 |
// makes no sense for keywords by the way).
|
1533 |
// makes no sense for keywords by the way).
|
1436 |
//
|
1534 |
//
|
1437 |
// The order has no importance, and we set a position gap of 100
|
1535 |
// The order has no importance, and we set a position gap of 100
|
1438 |
// between fields to avoid false proximity matches.
|
1536 |
// between fields to avoid false proximity matches.
|
1439 |
map<string, string>::iterator meta_it;
|
1537 |
for (const auto& entry: doc.meta) {
|
1440 |
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
|
|
|
1441 |
if (!meta_it->second.empty()) {
|
1538 |
if (entry.second.empty()) {
|
1442 |
const FieldTraits *ftp;
|
1539 |
continue;
|
1443 |
// We don't test for an empty prefix here. Some fields are part
|
1540 |
}
|
1444 |
// of the internal conf with an empty prefix (ie: abstract).
|
1541 |
const FieldTraits *ftp{nullptr};
|
1445 |
if (!fieldToTraits(meta_it->first, &ftp)) {
|
1542 |
fieldToTraits(entry.first, &ftp);
|
1446 |
LOGDEB0("Db::add: no prefix for field [" <<
|
1543 |
if (ftp && ftp->valueslot) {
|
1447 |
meta_it->first << "], no indexing\n");
|
1544 |
LOGDEB("Adding value: for field " << entry.first << " slot "
|
1448 |
continue;
|
1545 |
<< ftp->valueslot << endl);
|
1449 |
}
|
1546 |
add_field_value(newdocument, *ftp, entry.second);
|
|
|
1547 |
}
|
|
|
1548 |
|
|
|
1549 |
// There was an old comment here about not testing for
|
|
|
1550 |
// empty prefix, and we indeed did not test. I don't think
|
|
|
1551 |
// that it makes sense any more (and was in disagreement
|
|
|
1552 |
// with the LOG message. Really now: no prefix: no
|
|
|
1553 |
// indexing.
|
|
|
1554 |
if (ftp && !ftp->pfx.empty()) {
|
1450 |
LOGDEB0("Db::add: field [" << meta_it->first << "] pfx [" <<
|
1555 |
LOGDEB0("Db::add: field [" << entry.first << "] pfx [" <<
|
1451 |
ftp->pfx << "] inc " << ftp->wdfinc << ": [" <<
|
1556 |
ftp->pfx << "] inc " << ftp->wdfinc << ": [" <<
|
1452 |
meta_it->second << "]\n");
|
1557 |
entry.second << "]\n");
|
1453 |
splitter.setTraits(*ftp);
|
1558 |
splitter.setTraits(*ftp);
|
1454 |
if (!splitter.text_to_words(meta_it->second)) {
|
1559 |
if (!splitter.text_to_words(entry.second)) {
|
1455 |
LOGDEB("Db::addOrUpdate: split failed for " <<
|
1560 |
LOGDEB("Db::addOrUpdate: split failed for " <<
|
1456 |
meta_it->first << "\n");
|
1561 |
entry.first << "\n");
|
1457 |
}
|
1562 |
}
|
1458 |
}
|
1563 |
} else {
|
|
|
1564 |
LOGDEB0("Db::add: no prefix for field [" <<
|
|
|
1565 |
entry.first << "], no indexing\n");
|
|
|
1566 |
}
|
1459 |
}
|
1567 |
}
|
1460 |
|
1568 |
|
1461 |
// Reset to no prefix and default params
|
1569 |
// Reset to no prefix and default params
|
1462 |
splitter.setTraits(FieldTraits());
|
1570 |
splitter.setTraits(FieldTraits());
|
1463 |
|
1571 |
|
|
... |
|
... |
1468 |
LOGDEB2("Db::add: split body: [" << doc.text << "]\n");
|
1576 |
LOGDEB2("Db::add: split body: [" << doc.text << "]\n");
|
1469 |
|
1577 |
|
1470 |
#ifdef TEXTSPLIT_STATS
|
1578 |
#ifdef TEXTSPLIT_STATS
|
1471 |
splitter.resetStats();
|
1579 |
splitter.resetStats();
|
1472 |
#endif
|
1580 |
#endif
|
1473 |
if (!splitter.text_to_words(doc.text))
|
1581 |
if (!splitter.text_to_words(doc.text)) {
|
1474 |
LOGDEB("Db::addOrUpdate: split failed for main text\n");
|
1582 |
LOGDEB("Db::addOrUpdate: split failed for main text\n");
|
|
|
1583 |
} else {
|
|
|
1584 |
if (m_ndb->m_storetext) {
|
|
|
1585 |
ZLibUtBuf buf;
|
|
|
1586 |
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
|
|
|
1587 |
rawztext.assign(buf.getBuf(), buf.getCnt());
|
|
|
1588 |
}
|
|
|
1589 |
}
|
1475 |
|
1590 |
|
1476 |
#ifdef TEXTSPLIT_STATS
|
1591 |
#ifdef TEXTSPLIT_STATS
|
1477 |
// Reject bad data. unrecognized base64 text is characterized by
|
1592 |
// Reject bad data. unrecognized base64 text is characterized by
|
1478 |
// high avg word length and high variation (because there are
|
1593 |
// high avg word length and high variation (because there are
|
1479 |
// word-splitters like +/ inside the data).
|
1594 |
// word-splitters like +/ inside the data).
|
|
... |
|
... |
1499 |
// We also add a term for the filename extension if any.
|
1614 |
// We also add a term for the filename extension if any.
|
1500 |
string utf8fn;
|
1615 |
string utf8fn;
|
1501 |
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
1616 |
if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
|
1502 |
string fn;
|
1617 |
string fn;
|
1503 |
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
|
1618 |
if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
|
1504 |
// We should truncate after extracting the extension, but this is
|
1619 |
// We should truncate after extracting the extension,
|
1505 |
// a pathological case anyway
|
1620 |
// but this is a pathological case anyway
|
1506 |
if (fn.size() > 230)
|
1621 |
if (fn.size() > 230)
|
1507 |
utf8truncate(fn, 230);
|
1622 |
utf8truncate(fn, 230);
|
1508 |
string::size_type pos = fn.rfind('.');
|
1623 |
string::size_type pos = fn.rfind('.');
|
1509 |
if (pos != string::npos && pos != fn.length() - 1) {
|
1624 |
if (pos != string::npos && pos != fn.length() - 1) {
|
1510 |
newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
|
1625 |
newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
|
1511 |
fn.substr(pos + 1));
|
1626 |
fn.substr(pos + 1));
|
1512 |
}
|
1627 |
}
|
1513 |
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
|
1628 |
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn,0);
|
1514 |
}
|
1629 |
}
|
1515 |
}
|
1630 |
}
|
1516 |
|
1631 |
|
1517 |
newdocument.add_boolean_term(uniterm);
|
1632 |
newdocument.add_boolean_term(uniterm);
|
1518 |
// Parent term. This is used to find all descendents, mostly
|
1633 |
// Parent term. This is used to find all descendents, mostly
|
|
... |
|
... |
1685 |
LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
|
1800 |
LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
|
1686 |
newdocument.set_data(record);
|
1801 |
newdocument.set_data(record);
|
1687 |
}
|
1802 |
}
|
1688 |
#ifdef IDX_THREADS
|
1803 |
#ifdef IDX_THREADS
|
1689 |
if (m_ndb->m_havewriteq) {
|
1804 |
if (m_ndb->m_havewriteq) {
|
1690 |
DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
|
1805 |
DbUpdTask *tp = new DbUpdTask(
|
1691 |
newdocument_ptr, doc.text.length());
|
1806 |
DbUpdTask::AddOrUpdate, udi, uniterm, newdocument_ptr,
|
|
|
1807 |
doc.text.length(), rawztext);
|
1692 |
if (!m_ndb->m_wqueue.put(tp)) {
|
1808 |
if (!m_ndb->m_wqueue.put(tp)) {
|
1693 |
LOGERR("Db::addOrUpdate:Cant queue task\n");
|
1809 |
LOGERR("Db::addOrUpdate:Cant queue task\n");
|
1694 |
delete newdocument_ptr;
|
1810 |
delete newdocument_ptr;
|
1695 |
return false;
|
1811 |
return false;
|
1696 |
} else {
|
1812 |
} else {
|
|
... |
|
... |
1698 |
}
|
1814 |
}
|
1699 |
}
|
1815 |
}
|
1700 |
#endif
|
1816 |
#endif
|
1701 |
|
1817 |
|
1702 |
return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
|
1818 |
return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
|
1703 |
doc.text.length());
|
1819 |
doc.text.length(), rawztext);
|
1704 |
}
|
1820 |
}
|
1705 |
|
1821 |
|
1706 |
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
|
1822 |
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
|
1707 |
Doc &doc, Xapian::Document& xdoc)
|
1823 |
Doc &doc, Xapian::Document& xdoc)
|
1708 |
{
|
1824 |
{
|
|
... |
|
... |
2062 |
// size from the data record, but this would be
|
2178 |
// size from the data record, but this would be
|
2063 |
// bad for performance.
|
2179 |
// bad for performance.
|
2064 |
Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
|
2180 |
Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
|
2065 |
maybeflush(trms * 5);
|
2181 |
maybeflush(trms * 5);
|
2066 |
}
|
2182 |
}
|
2067 |
m_ndb->xwdb.delete_document(docid);
|
2183 |
m_ndb->deleteDocument(docid);
|
2068 |
LOGDEB("Db::purge: deleted document #" << docid << "\n");
|
2184 |
LOGDEB("Db::purge: deleted document #" << docid << "\n");
|
2069 |
} catch (const Xapian::DocNotFoundError &) {
|
2185 |
} catch (const Xapian::DocNotFoundError &) {
|
2070 |
LOGDEB0("Db::purge: document #" << docid << " not found\n");
|
2186 |
LOGDEB0("Db::purge: document #" << docid << " not found\n");
|
2071 |
} catch (const Xapian::Error &e) {
|
2187 |
} catch (const Xapian::Error &e) {
|
2072 |
LOGERR("Db::purge: document #" << docid << ": " <<
|
2188 |
LOGERR("Db::purge: document #" << docid << ": " <<
|
|
... |
|
... |
2123 |
if (!exists)
|
2239 |
if (!exists)
|
2124 |
return true;
|
2240 |
return true;
|
2125 |
|
2241 |
|
2126 |
#ifdef IDX_THREADS
|
2242 |
#ifdef IDX_THREADS
|
2127 |
if (m_ndb->m_havewriteq) {
|
2243 |
if (m_ndb->m_havewriteq) {
|
|
|
2244 |
string rztxt;
|
2128 |
DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm,
|
2245 |
DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm,
|
2129 |
0, (size_t)-1);
|
2246 |
0, (size_t)-1, rztxt);
|
2130 |
if (!m_ndb->m_wqueue.put(tp)) {
|
2247 |
if (!m_ndb->m_wqueue.put(tp)) {
|
2131 |
LOGERR("Db::purgeFile:Cant queue task\n");
|
2248 |
LOGERR("Db::purgeFile:Cant queue task\n");
|
2132 |
return false;
|
2249 |
return false;
|
2133 |
} else {
|
2250 |
} else {
|
2134 |
return true;
|
2251 |
return true;
|
|
... |
|
... |
2150 |
|
2267 |
|
2151 |
string uniterm = make_uniterm(udi);
|
2268 |
string uniterm = make_uniterm(udi);
|
2152 |
|
2269 |
|
2153 |
#ifdef IDX_THREADS
|
2270 |
#ifdef IDX_THREADS
|
2154 |
if (m_ndb->m_havewriteq) {
|
2271 |
if (m_ndb->m_havewriteq) {
|
|
|
2272 |
string rztxt;
|
2155 |
DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm,
|
2273 |
DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm,
|
2156 |
0, (size_t)-1);
|
2274 |
0, (size_t)-1, rztxt);
|
2157 |
if (!m_ndb->m_wqueue.put(tp)) {
|
2275 |
if (!m_ndb->m_wqueue.put(tp)) {
|
2158 |
LOGERR("Db::purgeFile:Cant queue task\n");
|
2276 |
LOGERR("Db::purgeFile:Cant queue task\n");
|
2159 |
return false;
|
2277 |
return false;
|
2160 |
} else {
|
2278 |
} else {
|
2161 |
return true;
|
2279 |
return true;
|