Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
1
/* Copyright (C) 2004 J.F.Dockes
1
/* Copyright (C) 2004-2018 J.F.Dockes
2
 *   This program is free software; you can redistribute it and/or modify
2
 *   This program is free software; you can redistribute it and/or modify
3
 *   it under the terms of the GNU General Public License as published by
3
 *   it under the terms of the GNU General Public License as published by
4
 *   the Free Software Foundation; either version 2 of the License, or
4
 *   the Free Software Foundation; either version 2 of the License, or
5
 *   (at your option) any later version.
5
 *   (at your option) any later version.
6
 *
6
 *
...
...
48
#include "chrono.h"
48
#include "chrono.h"
49
#include "utf8iter.h"
49
#include "utf8iter.h"
50
#include "searchdata.h"
50
#include "searchdata.h"
51
#include "rclquery.h"
51
#include "rclquery.h"
52
#include "rclquery_p.h"
52
#include "rclquery_p.h"
53
#include "rclvalues.h"
53
#include "md5ut.h"
54
#include "md5ut.h"
54
#include "rclversion.h"
55
#include "rclversion.h"
55
#include "cancelcheck.h"
56
#include "cancelcheck.h"
56
#include "termproc.h"
57
#include "termproc.h"
57
#include "expansiondbs.h"
58
#include "expansiondbs.h"
...
...
60
#include "utf8fn.h"
61
#include "utf8fn.h"
61
#include "wipedir.h"
62
#include "wipedir.h"
62
#ifdef RCL_USE_ASPELL
63
#ifdef RCL_USE_ASPELL
63
#include "rclaspell.h"
64
#include "rclaspell.h"
64
#endif
65
#endif
66
#include "zlibut.h"
67
68
#ifndef XAPIAN_AT_LEAST
69
// Added in Xapian 1.4.2. Define it here for older versions
70
#define XAPIAN_AT_LEAST(A,B,C) \
71
 (XAPIAN_MAJOR_VERSION > (A) || \
72
 (XAPIAN_MAJOR_VERSION == (A) && \
73
 (XAPIAN_MINOR_VERSION > (B) || \
74
 (XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
75
#endif
76
65
77
66
// Recoll index format version is stored in user metadata. When this changes,
78
// Recoll index format version is stored in user metadata. When this changes,
67
// we can't open the db and will have to reindex.
79
// we can't open the db and will have to reindex.
68
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
80
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
69
static const string cstr_RCL_IDX_VERSION("1");
81
static const string cstr_RCL_IDX_VERSION("1");
82
static const string cstr_RCL_IDX_DESCRIPTOR_KEY("RCL_IDX_DESCRIPTOR_KEY");
70
83
71
static const string cstr_mbreaks("rclmbreaks");
84
static const string cstr_mbreaks("rclmbreaks");
72
85
73
namespace Rcl {
86
namespace Rcl {
74
87
...
...
188
    }
201
    }
189
    bool status = false;
202
    bool status = false;
190
    switch (tsk->op) {
203
    switch (tsk->op) {
191
    case DbUpdTask::AddOrUpdate:
204
    case DbUpdTask::AddOrUpdate:
192
        LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
205
        LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
193
        status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm, 
206
        status = ndbp->addOrUpdateWrite(
194
                      tsk->doc, tsk->txtlen);
207
                tsk->udi, tsk->uniterm, tsk->doc, tsk->txtlen, tsk->rawztext);
195
        break;
208
        break;
196
    case DbUpdTask::Delete:
209
    case DbUpdTask::Delete:
197
        LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
210
        LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
198
        status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm);
211
        status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm);
199
        break;
212
        break;
...
...
236
           writeqlen << " wqts " << writethreads << "\n");
249
           writeqlen << " wqts " << writethreads << "\n");
237
}
250
}
238
251
239
#endif // IDX_THREADS
252
#endif // IDX_THREADS
240
253
254
void Db::Native::openWrite(const string& dir, Db::OpenMode mode)
255
{
256
    int action = (mode == Db::DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
257
        Xapian::DB_CREATE_OR_OVERWRITE;
258
259
#ifdef _WIN32
260
    // Xapian is quite bad at erasing partial db which can
261
    // occur because of open file deletion errors on
262
    // Windows. 
263
    if (mode == DbTrunc) {
264
        if (path_exists(path_cat(dir, "iamchert"))) {
265
            wipedir(dir);
266
            unlink(dir.c_str());
267
        }
268
    }
269
#endif
270
    
271
    if (::access(dir.c_str(), 0) == 0) {
272
        // Existing index
273
        xwdb = Xapian::WritableDatabase(dir, action);
274
    } else {
275
        // New index. If possible, and depending on config, use a stub
276
        // to force using Chert. No sense in doing this if we are
277
        // storing the text anyway.
278
#if XAPIAN_AT_LEAST(1,3,0) && XAPIAN_HAS_CHERT_BACKEND
279
        // Xapian with Glass and Chert support. If storedoctext is
280
        // specified in the configuration, use the default backend
281
        // (Glass), else force Chert. There might be reasons why
282
        // someone would want to use Chert and store text anyway, but
283
        // it's an exotic case, and things are complicated enough
284
        // already.
285
        if (o_index_storedoctext) {
286
            xwdb = Xapian::WritableDatabase(dir, action);
287
            m_storetext = true;
288
        } else {
289
            // Force Chert format, don't store the text.
290
            string stub = path_cat(m_rcldb->m_config->getConfDir(),
291
                                   "xapian.stub");
292
            FILE *fp = fopen(stub.c_str(), "w");
293
            if (nullptr == fp) {
294
                throw(string("Can't create ") + stub);
295
            }
296
            fprintf(fp, "chert %s\n", dir.c_str());
297
            fclose(fp);
298
            xwdb = Xapian::WritableDatabase(stub, action);
299
            m_storetext = false;
300
        }
301
#elif (! XAPIAN_AT_LEAST(1,3,0)) || XAPIAN_AT_LEAST(1,5,0)
302
        // Old Xapian (chert only) or newer (no chert). Use the
303
        // default index backend and let the user decide of the
304
        // abstract generation method. The configured default is to
305
        // store the text.
306
        xwdb = Xapian::WritableDatabase(dir, action);
307
        m_storetext = o_index_storedoctext;
308
#endif
309
        // Set the storetext value inside the index descriptor (new
310
        // with recoll 1.24, maybe we'll have other stuff to store in
311
        // there in the future).
312
        string desc = string("storetext=") + (m_storetext ? "1" : "0") + "\n";
313
        xwdb.set_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY, desc);
314
    }
315
    
316
    // If the index is empty, write the data format version at once
317
    // to avoid stupid error messages:
318
    if (xwdb.get_doccount() == 0) {
319
        xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
320
    }
321
322
    m_iswritable = true;
323
324
#ifdef IDX_THREADS
325
    maybeStartThreads();
326
#endif
327
}
328
329
void Db::Native::openRead(const string& dir)
330
{
331
    m_iswritable = false;
332
    xrdb = Xapian::Database(dir);
333
    string desc = xrdb.get_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY);
334
    ConfSimple cf(desc, 1);
335
    string val;
336
    m_storetext = false;
337
    if (cf.get("storetext", val) && stringToBool(val)) {
338
        m_storetext = true;
339
    }
340
    LOGDEB("Db::openRead: index " << (m_storetext?"stores":"does not store") <<
341
           " document text\n");
342
}
343
241
/* See comment in class declaration: return all subdocuments of a
344
/* See comment in class declaration: return all subdocuments of a
242
 * document given by its unique id. 
345
 * document given by its unique id. */
243
*/
244
bool Db::Native::subDocs(const string &udi, int idxi, 
346
bool Db::Native::subDocs(const string &udi, int idxi, 
245
             vector<Xapian::docid>& docids) 
347
             vector<Xapian::docid>& docids) 
246
{
348
{
247
    LOGDEB2("subDocs: [" << uniterm << "]\n");
349
    LOGDEB2("subDocs: [" << uniterm << "]\n");
248
    string pterm = make_parentterm(udi);
350
    string pterm = make_parentterm(udi);
...
...
439
    return 0;
541
    return 0;
440
}
542
}
441
543
442
// Turn data record from db into document fields
544
// Turn data record from db into document fields
443
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, 
545
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, 
444
              Doc &doc)
546
              Doc &doc, bool fetchtext)
445
{
547
{
446
    LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n");
548
    LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n");
447
    ConfSimple parms(data);
549
    ConfSimple parms(data);
448
    if (!parms.ok())
550
    if (!parms.ok())
449
    return false;
551
    return false;
...
...
499
    if (doc.meta.find(*it) == doc.meta.end())
601
    if (doc.meta.find(*it) == doc.meta.end())
500
        parms.get(*it, doc.meta[*it]);
602
        parms.get(*it, doc.meta[*it]);
501
    }
603
    }
502
    doc.meta[Doc::keyurl] = doc.url;
604
    doc.meta[Doc::keyurl] = doc.url;
503
    doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
605
    doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
606
    if (fetchtext) {
607
        getRawText(docid, doc.text);
608
    }
504
    return true;
609
    return true;
505
}
610
}
506
611
507
bool Db::Native::hasPages(Xapian::docid docid)
612
bool Db::Native::hasPages(Xapian::docid docid)
508
{
613
{
...
...
578
    vector<int>::const_iterator it = 
683
    vector<int>::const_iterator it = 
579
    upper_bound(pbreaks.begin(), pbreaks.end(), pos);
684
    upper_bound(pbreaks.begin(), pbreaks.end(), pos);
580
    return int(it - pbreaks.begin() + 1);
685
    return int(it - pbreaks.begin() + 1);
581
}
686
}
582
687
688
// Retrieve the text stored in the index for document 'docid'.
//
// The value is read from the index metadata under rawtextMetaKey(docid)
// and passed through inflateToBuf() before being returned in 'rawtext'.
//
// Returns false if the index does not store document text, if the
// metadata cannot be read, or if decompression fails; 'rawtext' is
// cleared on decompression failure so that callers ignoring the status
// never see compressed bytes. An empty stored value is returned as an
// empty text with a true status.
bool Db::Native::getRawText(Xapian::docid docid, string& rawtext)
{
    if (!m_storetext) {
        LOGDEB("Db::Native::getRawText: document text not stored in index\n");
        return false;
    }
    string reason;
    XAPTRY(rawtext = xrdb.get_metadata(rawtextMetaKey(docid)), xrdb, reason);
    if (!reason.empty()) {
        LOGERR("Rcl::Db::getRawText: could not get value: " << reason << endl);
        return false;
    }
    if (rawtext.empty()) {
        // Nothing stored for this document: nothing to decompress.
        return true;
    }
    ZLibUtBuf cbuf;
    if (!inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf)) {
        LOGERR("Rcl::Db::getRawText: could not decompress stored text\n");
        rawtext.clear();
        return false;
    }
    rawtext.assign(cbuf.getBuf(), cbuf.getCnt());
    return true;
}
708
583
// Note: we're passed a Xapian::Document* because Xapian
709
// Note: we're passed a Xapian::Document* because Xapian
584
// reference-counting is not mt-safe. We take ownership and need
710
// reference-counting is not mt-safe. We take ownership and need
585
// to delete it before returning.
711
// to delete it before returning.
586
bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, 
712
bool Db::Native::addOrUpdateWrite(
587
                Xapian::Document *newdocument_ptr, 
713
    const string& udi, const string& uniterm, Xapian::Document *newdocument_ptr, 
588
                                  size_t textlen)
714
    size_t textlen, const string& rawztext)
589
{
715
{
590
#ifdef IDX_THREADS
716
#ifdef IDX_THREADS
591
    Chrono chron;
717
    Chrono chron;
592
    std::unique_lock<std::mutex> lock(m_mutex);
718
    std::unique_lock<std::mutex> lock(m_mutex);
593
#endif
719
#endif
594
    std::shared_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
720
    std::unique_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
595
721
596
    // Check file system full every mbyte of indexed text. It's a bit wasteful
722
    // Check file system full every mbyte of indexed text. It's a bit wasteful
597
    // to do this after having prepared the document, but it needs to be in
723
    // to do this after having prepared the document, but it needs to be in
598
    // the single-threaded section.
724
    // the single-threaded section.
599
    if (m_rcldb->m_maxFsOccupPc > 0 && 
725
    if (m_rcldb->m_maxFsOccupPc > 0 && 
...
...
612
738
613
    const char *fnc = udi.c_str();
739
    const char *fnc = udi.c_str();
614
    string ermsg;
740
    string ermsg;
615
741
616
    // Add db entry or update existing entry:
742
    // Add db entry or update existing entry:
743
    Xapian::docid did = 0;
617
    try {
744
    try {
618
  Xapian::docid did = 
619
        xwdb.replace_document(uniterm, *newdocument_ptr);
745
    did = xwdb.replace_document(uniterm, *newdocument_ptr);
620
    if (did < m_rcldb->updated.size()) {
746
    if (did < m_rcldb->updated.size()) {
621
            // This is necessary because only the file-level docs are tested
747
            // This is necessary because only the file-level docs are tested
622
            // by needUpdate(), so the subdocs existence flags are only set
748
            // by needUpdate(), so the subdocs existence flags are only set
623
            // here.
749
            // here.
624
        m_rcldb->updated[did] = true;
750
        m_rcldb->updated[did] = true;
625
        LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n");
751
        LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n");
626
    } else {
752
    } else {
627
        LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
753
        LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
628
    }
754
    }
629
    } XCATCHERROR(ermsg);
755
    } XCATCHERROR(ermsg);
630
631
    if (!ermsg.empty()) {
756
    if (!ermsg.empty()) {
632
    LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
757
    LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
633
    ermsg.erase();
758
    ermsg.erase();
634
    // FIXME: is this ever actually needed?
759
    // FIXME: is this ever actually needed?
635
    try {
760
    try {
...
...
641
        LOGERR("Db::add: add_document failed: " << ermsg << "\n");
766
        LOGERR("Db::add: add_document failed: " << ermsg << "\n");
642
        return false;
767
        return false;
643
    }
768
    }
644
    }
769
    }
645
770
771
    XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext),
772
           xwdb, m_rcldb->m_reason);
773
    if (!m_rcldb->m_reason.empty()) {
774
        LOGERR("Db::addOrUpdate: set_metadata error: " <<
775
               m_rcldb->m_reason << "\n");
776
        // This only affects snippets, so let's say not fatal
777
    }
778
    
646
    // Test if we're over the flush threshold (limit memory usage):
779
    // Test if we're over the flush threshold (limit memory usage):
647
    bool ret = m_rcldb->maybeflush(textlen);
780
    bool ret = m_rcldb->maybeflush(textlen);
648
#ifdef IDX_THREADS
781
#ifdef IDX_THREADS
649
    m_totalworkns += chron.nanos();
782
    m_totalworkns += chron.nanos();
650
#endif
783
#endif
...
...
680
        LOGINFO("purgeFileWrite: got empty sig\n");
813
        LOGINFO("purgeFileWrite: got empty sig\n");
681
        return false;
814
        return false;
682
        }
815
        }
683
    } else {
816
    } else {
684
        LOGDEB("purgeFile: delete docid " << *docid << "\n");
817
        LOGDEB("purgeFile: delete docid " << *docid << "\n");
685
      xwdb.delete_document(*docid);
818
            deleteDocument(*docid);
686
    }
819
    }
687
    vector<Xapian::docid> docids;
820
    vector<Xapian::docid> docids;
688
    subDocs(udi, 0, docids);
821
    subDocs(udi, 0, docids);
689
    LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n");
822
    LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n");
690
    for (vector<Xapian::docid>::iterator it = docids.begin();
823
    for (vector<Xapian::docid>::iterator it = docids.begin();
...
...
703
        }
836
        }
704
        }
837
        }
705
        
838
        
706
        if (!orphansOnly || sig != subdocsig) {
839
        if (!orphansOnly || sig != subdocsig) {
707
        LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
840
        LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
708
        xwdb.delete_document(*it);
841
        deleteDocument(*it);
709
        }
842
        }
710
    }
843
    }
711
    return true;
844
    return true;
712
    } XCATCHERROR(ermsg);
845
    } XCATCHERROR(ermsg);
713
    if (!ermsg.empty()) {
846
    if (!ermsg.empty()) {
...
...
765
    vector<string> res;
898
    vector<string> res;
766
    stringToStrings(Xapian::Stem::get_available_languages(), res);
899
    stringToStrings(Xapian::Stem::get_available_languages(), res);
767
    return res;
900
    return res;
768
}
901
}
769
902
903
770
bool Db::open(OpenMode mode, OpenError *error)
904
bool Db::open(OpenMode mode, OpenError *error)
771
{
905
{
772
    if (error)
906
    if (error)
773
    *error = DbOpenMainDb;
907
    *error = DbOpenMainDb;
774
908
...
...
791
    string ermsg;
925
    string ermsg;
792
    try {
926
    try {
793
    switch (mode) {
927
    switch (mode) {
794
    case DbUpd:
928
    case DbUpd:
795
    case DbTrunc: 
929
    case DbTrunc: 
796
      {
930
            m_ndb->openWrite(dir, mode);
797
                // Xapian is quite bad at erasing partial db which can
931
            updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
798
                // occur because of open file deletion errors on
799
                // Windows.
800
                if (mode == DbTrunc) {
801
                    if (path_exists(path_cat(dir, "iamchert"))) {
802
                        wipedir(dir);
803
                        unlink(dir.c_str());
804
                    }
805
                }
806
      int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
807
          Xapian::DB_CREATE_OR_OVERWRITE;
808
                if (::access(dir.c_str(), 0) != 0) {
809
                    // New index. use a stub to force using Chert
810
                    string stub = path_cat(m_config->getConfDir(),
811
                                           "xapian.stub");
812
                    FILE *fp = fopen(stub.c_str(), "w");
813
                    if (nullptr == fp) {
814
                        throw(string("Can't create ") + stub);
815
                    }
816
                    fprintf(fp, "chert %s\n", dir.c_str());
817
                    fclose(fp);
818
                    m_ndb->xwdb = Xapian::WritableDatabase(stub, action);
819
                } else {
820
                    m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
821
                }
822
                // If db is empty, write the data format version at once
823
                // to avoid stupid error messages:
824
                if (m_ndb->xwdb.get_doccount() == 0)
825
                    m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, 
826
                                             cstr_RCL_IDX_VERSION);
827
      m_ndb->m_iswritable = true;
828
#ifdef IDX_THREADS
829
      m_ndb->maybeStartThreads();
830
#endif
831
      // We used to open a readonly object in addition to
932
            // We used to open a readonly object in addition to the
832
      // the r/w one because some operations were faster
933
            // r/w one because some operations were faster when
833
      // when performed through a Database: no forced
934
            // performed through a Database: no forced flushes on
834
      // flushes on allterms_begin(), used in
935
            // allterms_begin(), used in subDocs(). This issue has
835
      // subDocs(). This issue has been gone for a long time
936
            // been gone for a long time (now: Xapian 1.2) and the
836
                // (now: Xapian 1.2) and the separate objects seem to
937
            // separate objects seem to trigger other Xapian issues,
837
                // trigger other Xapian issues, so the query db is now
838
                // a clone of the update one.
938
            // so the query db is now a clone of the update one.
839
      m_ndb->xrdb = m_ndb->xwdb;
939
            m_ndb->xrdb = m_ndb->xwdb;
840
      LOGDEB("Db::open: lastdocid: " << m_ndb->xwdb.get_lastdocid() <<
940
            LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
841
                       "\n");
842
                LOGDEB2("Db::open: resetting updated\n");
843
                updated.resize(m_ndb->xwdb.get_lastdocid() + 1);
844
                for (unsigned int i = 0; i < updated.size(); i++)
845
                    updated[i] = false;
846
      }
847
        break;
941
        break;
848
    case DbRO:
942
    case DbRO:
849
    default:
943
    default:
850
      m_ndb->m_iswritable = false;
944
            m_ndb->openRead(dir);
851
      m_ndb->xrdb = Xapian::Database(dir);
945
            for (auto& db : m_extraDbs) {
852
      for (vector<string>::iterator it = m_extraDbs.begin();
853
       it != m_extraDbs.end(); it++) {
854
        if (error)
946
        if (error)
855
            *error = DbOpenExtraDb;
947
            *error = DbOpenExtraDb;
856
        LOGDEB("Db::Open: adding query db [" << &(*it) << "]\n");
948
        LOGDEB("Db::Open: adding query db [" << &db << "]\n");
857
                // An error here used to be non-fatal (1.13 and older)
949
                // An error here used to be non-fatal (1.13 and older)
858
                // but I can't see why
950
                // but I can't see why
859
                m_ndb->xrdb.add_database(Xapian::Database(*it));
951
                m_ndb->xrdb.add_database(Xapian::Database(db));
860
        }
952
        }
861
        break;
953
        break;
862
    }
954
    }
863
    if (error)
955
    if (error)
864
        *error = DbOpenMainDb;
956
        *error = DbOpenMainDb;
...
...
1049
    string aerr;
1141
    string aerr;
1050
    bool mstripped = true;
1142
    bool mstripped = true;
1051
    LOGDEB("Db::testDbDir: [" << dir << "]\n");
1143
    LOGDEB("Db::testDbDir: [" << dir << "]\n");
1052
    try {
1144
    try {
1053
    Xapian::Database db(dir);
1145
    Xapian::Database db(dir);
1054
  // If we have terms with a leading ':' it's an
1146
  // If the prefix for mimetype is wrapped, it's an unstripped
1055
  // unstripped index
1147
  // index. T has been in use in recoll since the beginning and
1148
  // all documents have a T field (possibly empty).
1056
    Xapian::TermIterator term = db.allterms_begin(":");
1149
    Xapian::TermIterator term = db.allterms_begin(":T:");
1057
    if (term == db.allterms_end())
1150
    if (term == db.allterms_end()) {
1058
        mstripped = true;
1151
        mstripped = true;
1059
  else
1152
        } else {
1060
        mstripped = false;
1153
        mstripped = false;
1154
        }
1155
        LOGDEB("testDbDir: " << dir << " is a " <<
1156
               (mstripped ? "stripped" : "raw") << " index\n");
1061
    } XCATCHERROR(aerr);
1157
    } XCATCHERROR(aerr);
1062
    if (!aerr.empty()) {
1158
    if (!aerr.empty()) {
1063
    LOGERR("Db::Open: error while trying to open database from [" <<
1159
    LOGERR("Db::Open: error while trying to open database from [" <<
1064
               dir << "]: " << aerr << "\n");
1160
               dir << "]: " << aerr << "\n");
1065
    return false;
1161
    return false;
...
...
1368
    tpidx.setTSD(&splitter);
1464
    tpidx.setTSD(&splitter);
1369
1465
1370
    // Udi unique term: this is used for file existence/uptodate
1466
    // Udi unique term: this is used for file existence/uptodate
1371
    // checks, and unique id for the replace_document() call.
1467
    // checks, and unique id for the replace_document() call.
1372
    string uniterm = make_uniterm(udi);
1468
    string uniterm = make_uniterm(udi);
1469
    string rawztext; // Doc compressed text
1373
1470
1374
    if (doc.onlyxattr) {
1471
    if (doc.onlyxattr) {
1375
    // Only updating an existing doc with new extended attributes
1472
    // Only updating an existing doc with new extended attributes
1376
    // data.  Need to read the old doc and its data record
1473
    // data.  Need to read the old doc and its data record
1377
    // first. This is so different from the normal processing that
1474
    // first. This is so different from the normal processing that
...
...
1419
        newdocument.add_posting(wrap_prefix(pathelt_prefix),
1516
        newdocument.add_posting(wrap_prefix(pathelt_prefix),
1420
                    splitter.basepos + splitter.curpos++);
1517
                    splitter.basepos + splitter.curpos++);
1421
        for (vector<string>::iterator it = vpath.begin(); 
1518
        for (vector<string>::iterator it = vpath.begin(); 
1422
         it != vpath.end(); it++){
1519
         it != vpath.end(); it++){
1423
        if (it->length() > 230) {
1520
        if (it->length() > 230) {
1424
            // Just truncate it. May still be useful because of wildcards
1521
            // Just truncate it. May still be useful because
1522
          // of wildcards
1425
            *it = it->substr(0, 230);
1523
            *it = it->substr(0, 230);
1426
        }
1524
        }
1427
        newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
1525
        newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
1428
                    splitter.basepos + splitter.curpos++);
1526
                    splitter.basepos + splitter.curpos++);
1429
        }
1527
        }
...
...
1434
    // positions, as we may want to do phrase searches with them (this
1532
    // positions, as we may want to do phrase searches with them (this
1435
    // makes no sense for keywords by the way).
1533
    // makes no sense for keywords by the way).
1436
    //
1534
    //
1437
    // The order has no importance, and we set a position gap of 100
1535
    // The order has no importance, and we set a position gap of 100
1438
    // between fields to avoid false proximity matches.
1536
    // between fields to avoid false proximity matches.
1439
  map<string, string>::iterator meta_it;
1537
  for (const auto& entry: doc.meta) {
1440
  for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
1441
        if (!meta_it->second.empty()) {
1538
        if (entry.second.empty()) {
1442
      const FieldTraits *ftp;
1539
                continue;
1443
      // We don't test for an empty prefix here. Some fields are part
1540
            }
1444
      // of the internal conf with an empty prefix (ie: abstract).
1541
            const FieldTraits *ftp{nullptr};
1445
      if (!fieldToTraits(meta_it->first, &ftp)) {
1542
            fieldToTraits(entry.first, &ftp);
1446
          LOGDEB0("Db::add: no prefix for field [" <<
1543
            if (ftp && ftp->valueslot) {
1447
                            meta_it->first << "], no indexing\n");
1544
                LOGDEB("Adding value: for field " << entry.first << " slot "
1448
          continue;
1545
                       << ftp->valueslot << endl);
1449
      }
1546
                add_field_value(newdocument, *ftp, entry.second);
1547
            }
1548
1549
            // There was an old comment here about not testing for
1550
            // empty prefix, and we indeed did not test. I don't think
1551
            // that it makes sense any more (and was in disagreement
1552
            // with the LOG message. Really now: no prefix: no
1553
            // indexing.
1554
            if (ftp && !ftp->pfx.empty()) {
1450
      LOGDEB0("Db::add: field [" << meta_it->first << "] pfx [" <<
1555
                LOGDEB0("Db::add: field [" << entry.first << "] pfx [" <<
1451
                        ftp->pfx << "] inc " << ftp->wdfinc << ": [" <<
1556
                        ftp->pfx << "] inc " << ftp->wdfinc << ": [" <<
1452
                        meta_it->second << "]\n");
1557
                        entry.second << "]\n");
1453
                splitter.setTraits(*ftp);
1558
                splitter.setTraits(*ftp);
1454
      if (!splitter.text_to_words(meta_it->second)) {
1559
                if (!splitter.text_to_words(entry.second)) {
1455
          LOGDEB("Db::addOrUpdate: split failed for " <<
1560
                    LOGDEB("Db::addOrUpdate: split failed for " <<
1456
                           meta_it->first << "\n");
1561
                           entry.first << "\n");
1457
                }
1562
                }
1458
      }
1563
            } else {
1564
                LOGDEB0("Db::add: no prefix for field [" <<
1565
                        entry.first << "], no indexing\n");
1566
            }
1459
    }
1567
    }
1460
1568
1461
        // Reset to no prefix and default params
1569
        // Reset to no prefix and default params
1462
        splitter.setTraits(FieldTraits());
1570
        splitter.setTraits(FieldTraits());
1463
1571
...
...
1468
    LOGDEB2("Db::add: split body: [" << doc.text << "]\n");
1576
    LOGDEB2("Db::add: split body: [" << doc.text << "]\n");
1469
1577
1470
#ifdef TEXTSPLIT_STATS
1578
#ifdef TEXTSPLIT_STATS
1471
    splitter.resetStats();
1579
    splitter.resetStats();
1472
#endif
1580
#endif
1473
    if (!splitter.text_to_words(doc.text))
1581
    if (!splitter.text_to_words(doc.text)) {
1474
        LOGDEB("Db::addOrUpdate: split failed for main text\n");
1582
        LOGDEB("Db::addOrUpdate: split failed for main text\n");
1583
        } else {
1584
            if (m_ndb->m_storetext) {
1585
                ZLibUtBuf buf;
1586
                deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
1587
                rawztext.assign(buf.getBuf(), buf.getCnt());
1588
            }
1589
        }
1475
1590
1476
#ifdef TEXTSPLIT_STATS
1591
#ifdef TEXTSPLIT_STATS
1477
    // Reject bad data. unrecognized base64 text is characterized by
1592
    // Reject bad data. unrecognized base64 text is characterized by
1478
    // high avg word length and high variation (because there are
1593
    // high avg word length and high variation (because there are
1479
    // word-splitters like +/ inside the data).
1594
    // word-splitters like +/ inside the data).
...
...
1499
    // We also add a term for the filename extension if any.
1614
    // We also add a term for the filename extension if any.
1500
    string utf8fn;
1615
    string utf8fn;
1501
    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
1616
    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
1502
        string fn;
1617
        string fn;
1503
        if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
1618
        if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
1504
        // We should truncate after extracting the extension, but this is
1619
        // We should truncate after extracting the extension,
1505
        // a pathological case anyway
1620
        // but this is a pathological case anyway
1506
        if (fn.size() > 230)
1621
        if (fn.size() > 230)
1507
            utf8truncate(fn, 230);
1622
            utf8truncate(fn, 230);
1508
        string::size_type pos = fn.rfind('.');
1623
        string::size_type pos = fn.rfind('.');
1509
        if (pos != string::npos && pos != fn.length() - 1) {
1624
        if (pos != string::npos && pos != fn.length() - 1) {
1510
            newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
1625
            newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
1511
                         fn.substr(pos + 1));
1626
                         fn.substr(pos + 1));
1512
        }
1627
        }
1513
        newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
1628
        newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn,0);
1514
        }
1629
        }
1515
    }
1630
    }
1516
1631
1517
    newdocument.add_boolean_term(uniterm);
1632
    newdocument.add_boolean_term(uniterm);
1518
    // Parent term. This is used to find all descendents, mostly
1633
    // Parent term. This is used to find all descendents, mostly
...
...
1685
    LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
1800
    LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
1686
    newdocument.set_data(record);
1801
    newdocument.set_data(record);
1687
    }
1802
    }
1688
#ifdef IDX_THREADS
1803
#ifdef IDX_THREADS
1689
    if (m_ndb->m_havewriteq) {
1804
    if (m_ndb->m_havewriteq) {
1690
  DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, 
1805
  DbUpdTask *tp = new DbUpdTask(
1691
                    newdocument_ptr, doc.text.length());
1806
            DbUpdTask::AddOrUpdate, udi, uniterm, newdocument_ptr,
1807
            doc.text.length(), rawztext);
1692
    if (!m_ndb->m_wqueue.put(tp)) {
1808
    if (!m_ndb->m_wqueue.put(tp)) {
1693
        LOGERR("Db::addOrUpdate:Cant queue task\n");
1809
        LOGERR("Db::addOrUpdate:Cant queue task\n");
1694
            delete newdocument_ptr;
1810
            delete newdocument_ptr;
1695
        return false;
1811
        return false;
1696
    } else {
1812
    } else {
...
...
1698
    }
1814
    }
1699
    }
1815
    }
1700
#endif
1816
#endif
1701
1817
1702
    return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
1818
    return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
1703
                   doc.text.length());
1819
                   doc.text.length(), rawztext);
1704
}
1820
}
1705
1821
1706
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
1822
bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
1707
                    Doc &doc, Xapian::Document& xdoc)
1823
                    Doc &doc, Xapian::Document& xdoc)
1708
{
1824
{
...
...
2062
            // size from the data record, but this would be
2178
            // size from the data record, but this would be
2063
            // bad for performance.
2179
            // bad for performance.
2064
            Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
2180
            Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
2065
            maybeflush(trms * 5);
2181
            maybeflush(trms * 5);
2066
        }
2182
        }
2067
        m_ndb->xwdb.delete_document(docid);
2183
        m_ndb->deleteDocument(docid);
2068
        LOGDEB("Db::purge: deleted document #" << docid << "\n");
2184
        LOGDEB("Db::purge: deleted document #" << docid << "\n");
2069
        } catch (const Xapian::DocNotFoundError &) {
2185
        } catch (const Xapian::DocNotFoundError &) {
2070
        LOGDEB0("Db::purge: document #" << docid << " not found\n");
2186
        LOGDEB0("Db::purge: document #" << docid << " not found\n");
2071
        } catch (const Xapian::Error &e) {
2187
        } catch (const Xapian::Error &e) {
2072
        LOGERR("Db::purge: document #" << docid << ": " <<
2188
        LOGERR("Db::purge: document #" << docid << ": " <<
...
...
2123
    if (!exists)
2239
    if (!exists)
2124
    return true;
2240
    return true;
2125
2241
2126
#ifdef IDX_THREADS
2242
#ifdef IDX_THREADS
2127
    if (m_ndb->m_havewriteq) {
2243
    if (m_ndb->m_havewriteq) {
2244
        string rztxt;
2128
    DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm, 
2245
    DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm, 
2129
                      0, (size_t)-1);
2246
                      0, (size_t)-1, rztxt);
2130
    if (!m_ndb->m_wqueue.put(tp)) {
2247
    if (!m_ndb->m_wqueue.put(tp)) {
2131
        LOGERR("Db::purgeFile:Cant queue task\n");
2248
        LOGERR("Db::purgeFile:Cant queue task\n");
2132
        return false;
2249
        return false;
2133
    } else {
2250
    } else {
2134
        return true;
2251
        return true;
...
...
2150
2267
2151
    string uniterm = make_uniterm(udi);
2268
    string uniterm = make_uniterm(udi);
2152
2269
2153
#ifdef IDX_THREADS
2270
#ifdef IDX_THREADS
2154
    if (m_ndb->m_havewriteq) {
2271
    if (m_ndb->m_havewriteq) {
2272
        string rztxt;
2155
    DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm, 
2273
    DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm, 
2156
                      0, (size_t)-1);
2274
                      0, (size_t)-1, rztxt);
2157
    if (!m_ndb->m_wqueue.put(tp)) {
2275
    if (!m_ndb->m_wqueue.put(tp)) {
2158
        LOGERR("Db::purgeFile:Cant queue task\n");
2276
        LOGERR("Db::purgeFile:Cant queue task\n");
2159
        return false;
2277
        return false;
2160
    } else {
2278
    } else {
2161
        return true;
2279
        return true;