a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
...
...
49
#include "rclquery.h"
49
#include "rclquery.h"
50
#include "rclquery_p.h"
50
#include "rclquery_p.h"
51
#include "md5.h"
51
#include "md5.h"
52
#include "rclversion.h"
52
#include "rclversion.h"
53
#include "cancelcheck.h"
53
#include "cancelcheck.h"
54
#include "ptmutex.h"
54
55
55
#ifndef MAX
56
#ifndef MAX
56
// Larger of A and B. Each argument and the whole expansion are parenthesized
// so that operator expressions (e.g. MAX(a & b, c)) group as written. An
// argument may be evaluated twice: do not pass expressions with side effects.
#define MAX(A,B) ((A) > (B) ? (A) : (B))
57
// Larger of A and B. Each argument and the whole expansion are parenthesized
// so that operator expressions (e.g. MAX(a & b, c)) group as written. An
// argument may be evaluated twice: do not pass expressions with side effects.
#define MAX(A,B) ((A) > (B) ? (A) : (B))
57
#endif
58
#endif
58
#ifndef MIN
59
#ifndef MIN
59
// Smaller of A and B. Each argument and the whole expansion are parenthesized
// so that operator expressions (e.g. MIN(a | b, c)) group as written. An
// argument may be evaluated twice: do not pass expressions with side effects.
#define MIN(A,B) ((A) < (B) ? (A) : (B))
60
// Smaller of A and B. Each argument and the whole expansion are parenthesized
// so that operator expressions (e.g. MIN(a | b, c)) group as written. An
// argument may be evaluated twice: do not pass expressions with side effects.
#define MIN(A,B) ((A) < (B) ? (A) : (B))
60
#endif
61
#endif
61
62
62
// Recoll index format version is stored in user metadata. When this changes,
63
// Recoll index format version is stored in user metadata. When this changes,
63
// we can't open the db and will have to reindex.
64
// we can't open the db and will have to reindex.
64
static const string RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
65
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
65
static const string RCL_IDX_VERSION("1");
66
static const string cstr_RCL_IDX_VERSION("1");
66
67
67
// This is the word position offset at which we index the body text
68
// This is the word position offset at which we index the body text
68
// (abstract, keywords, etc.. are stored before this)
69
// (abstract, keywords, etc.. are stored before this)
69
static const unsigned int baseTextPosition = 100000;
70
static const unsigned int baseTextPosition = 100000;
70
71
...
...
77
const string end_of_field_term = "XXND";
78
const string end_of_field_term = "XXND";
78
79
79
// This is used as a marker inside the abstract frag lists, but
80
// This is used as a marker inside the abstract frag lists, but
80
// normally doesn't remain in final output (which is built with a
81
// normally doesn't remain in final output (which is built with a
81
// custom sep. by our caller).
82
// custom sep. by our caller).
82
static const string ellipsis("...");
83
static const string cstr_ellipsis("...");
83
84
84
string version_string(){
85
string version_string(){
85
    return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
86
    return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
86
        string(Xapian::version_string());
87
        string(Xapian::version_string());
87
}
88
}
88
89
89
// Synthetic abstract marker (to discriminate from abstract actually
90
// Synthetic abstract marker (to discriminate from abstract actually
90
// found in document)
91
// found in document)
91
static const string rclSyntAbs("?!#@");
92
static const string cstr_syntAbs("?!#@");
92
93
93
// Only ONE field name inside the index data record differs from the
94
// Only ONE field name inside the index data record differs from the
94
// Rcl::Doc ones: caption<->title, for a remnant of compatibility with
95
// Rcl::Doc ones: caption<->title, for a remnant of compatibility with
95
// omega
96
// omega
96
static const string keycap("caption");
97
static const string cstr_keycap("caption");
97
98
98
// Static/Default table for field->prefix/weight translation. 
99
// Static/Default table for field->prefix/weight translation. 
99
// This is logically const after initialization. Can't use a
100
// This is logically const after initialization. Can't use a
100
// static object to init this as the static std::string objects may
101
// static object to init this as the static std::string objects may
101
// not be ready.
102
// not be ready.
...
...
104
// "fields" configuration (cf: Db::fieldToTraits()), meaning that the
105
// "fields" configuration (cf: Db::fieldToTraits()), meaning that the
105
// entries can be overridden in the configuration, but not
106
// entries can be overridden in the configuration, but not
106
// suppressed. 
107
// suppressed. 
107
108
108
static map<string, FieldTraits> fldToTraits;
109
static map<string, FieldTraits> fldToTraits;
110
static PTMutexInit o_fldToTraits_mutex;
111
109
static void initFldToTraits() 
112
static void initFldToTraits() 
110
{
113
{
114
    PTMutexLocker locker(o_fldToTraits_mutex);
115
    // As we perform non-locked testing of initialization, check again with
116
    // the lock held
117
    if (fldToTraits.size())
118
  return;
119
111
    // Can't remember why "abstract" is indexed without a prefix
120
    // Can't remember why "abstract" is indexed without a prefix
112
    // (result: it's indexed twice actually). Maybe I'll dare change
121
    // (result: it's indexed twice actually). Maybe I'll dare change
113
    // this one day
122
    // this one day
114
    fldToTraits[Doc::keyabs] = FieldTraits();
123
    fldToTraits[Doc::keyabs] = FieldTraits();
115
124
116
    fldToTraits["ext"] = FieldTraits("XE");
125
    fldToTraits["ext"] = FieldTraits("XE");
117
    fldToTraits[Doc::keyfn] = FieldTraits("XSFN");
126
    fldToTraits[Doc::keyfn] = FieldTraits("XSFN");
118
127
119
    fldToTraits[keycap] = FieldTraits("S");
128
    fldToTraits[cstr_keycap] = FieldTraits("S");
120
    fldToTraits[Doc::keytt] = FieldTraits("S");
129
    fldToTraits[Doc::keytt] = FieldTraits("S");
121
    fldToTraits["subject"] = FieldTraits("S");
130
    fldToTraits["subject"] = FieldTraits("S");
122
131
123
    fldToTraits[Doc::keyau] = FieldTraits("A");
132
    fldToTraits[Doc::keyau] = FieldTraits("A");
124
    fldToTraits["creator"] = FieldTraits("A");
133
    fldToTraits["creator"] = FieldTraits("A");
...
...
187
    parms.get(Doc::keyurl, doc.url);
196
    parms.get(Doc::keyurl, doc.url);
188
    parms.get(Doc::keytp, doc.mimetype);
197
    parms.get(Doc::keytp, doc.mimetype);
189
    parms.get(Doc::keyfmt, doc.fmtime);
198
    parms.get(Doc::keyfmt, doc.fmtime);
190
    parms.get(Doc::keydmt, doc.dmtime);
199
    parms.get(Doc::keydmt, doc.dmtime);
191
    parms.get(Doc::keyoc, doc.origcharset);
200
    parms.get(Doc::keyoc, doc.origcharset);
192
    parms.get(keycap, doc.meta[Doc::keytt]);
201
    parms.get(cstr_keycap, doc.meta[Doc::keytt]);
193
    parms.get(Doc::keykw, doc.meta[Doc::keykw]);
202
    parms.get(Doc::keykw, doc.meta[Doc::keykw]);
194
    parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
203
    parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
195
    // Possibly remove synthetic abstract indicator (if it's there, we
204
    // Possibly remove synthetic abstract indicator (if it's there, we
196
    // used to index the beginning of the text as abstract).
205
    // used to index the beginning of the text as abstract).
197
    doc.syntabs = false;
206
    doc.syntabs = false;
198
    if (doc.meta[Doc::keyabs].find(rclSyntAbs) == 0) {
207
    if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
199
    doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(rclSyntAbs.length());
208
    doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
200
    doc.syntabs = true;
209
    doc.syntabs = true;
201
    }
210
    }
202
    parms.get(Doc::keyipt, doc.ipath);
211
    parms.get(Doc::keyipt, doc.ipath);
203
    parms.get(Doc::keyfs, doc.fbytes);
212
    parms.get(Doc::keyfs, doc.fbytes);
204
    parms.get(Doc::keyds, doc.dbytes);
213
    parms.get(Doc::keyds, doc.dbytes);
...
...
415
            if (ii == (unsigned int)ipos) {
424
            if (ii == (unsigned int)ipos) {
416
            sparseDoc[ii] = qterm;
425
            sparseDoc[ii] = qterm;
417
            } else if (ii > (unsigned int)ipos && 
426
            } else if (ii > (unsigned int)ipos && 
418
                   ii < (unsigned int)ipos + qtrmwrdcnt) {
427
                   ii < (unsigned int)ipos + qtrmwrdcnt) {
419
            sparseDoc[ii] = occupiedmarker;
428
            sparseDoc[ii] = occupiedmarker;
420
            } else if (!sparseDoc[ii].compare(ellipsis)) {
429
            } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
421
            // For an empty slot, the test has a side
430
            // For an empty slot, the test has a side
422
            // effect of inserting an empty string which
431
            // effect of inserting an empty string which
423
            // is what we want
432
            // is what we want
424
            sparseDoc[ii] = emptys;
433
            sparseDoc[ii] = emptys;
425
            }
434
            }
...
...
427
        // Add ellipsis at the end. This may be replaced later by
436
        // Add ellipsis at the end. This may be replaced later by
428
        // an overlapping extract. Take care not to replace an
437
        // an overlapping extract. Take care not to replace an
429
        // empty string here, we really want an empty slot,
438
        // empty string here, we really want an empty slot,
430
        // use find()
439
        // use find()
431
        if (sparseDoc.find(sto+1) == sparseDoc.end()) {
440
        if (sparseDoc.find(sto+1) == sparseDoc.end()) {
432
            sparseDoc[sto+1] = ellipsis;
441
            sparseDoc[sto+1] = cstr_ellipsis;
433
        }
442
        }
434
443
435
        // Limit to allocated occurrences and total size
444
        // Limit to allocated occurrences and total size
436
        if (++occurrences >= maxoccs || 
445
        if (++occurrences >= maxoccs || 
437
            qtermposs.size() >= maxtotaloccs)
446
            qtermposs.size() >= maxtotaloccs)
...
...
529
    if (TextSplit::isCJK(*uit))
538
    if (TextSplit::isCJK(*uit))
530
        newcjk = true;
539
        newcjk = true;
531
    if (!incjk || (incjk && !newcjk))
540
    if (!incjk || (incjk && !newcjk))
532
        chunk += " ";
541
        chunk += " ";
533
    incjk = newcjk;
542
    incjk = newcjk;
534
    if (it->second == ellipsis) {
543
    if (it->second == cstr_ellipsis) {
535
        vabs.push_back(chunk);
544
        vabs.push_back(chunk);
536
        chunk.clear();
545
        chunk.clear();
537
    } else {
546
    } else {
538
        chunk += it->second;
547
        chunk += it->second;
539
    }
548
    }
...
...
610
            Xapian::DB_CREATE_OR_OVERWRITE;
619
            Xapian::DB_CREATE_OR_OVERWRITE;
611
        m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
620
        m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
612
                // If db is empty, write the data format version at once
621
                // If db is empty, write the data format version at once
613
                // to avoid stupid error messages:
622
                // to avoid stupid error messages:
614
                if (m_ndb->xwdb.get_doccount() == 0)
623
                if (m_ndb->xwdb.get_doccount() == 0)
615
                    m_ndb->xwdb.set_metadata(RCL_IDX_VERSION_KEY, 
624
                    m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, 
616
                                             RCL_IDX_VERSION);
625
                                             cstr_RCL_IDX_VERSION);
617
        m_ndb->m_iswritable = true;
626
        m_ndb->m_iswritable = true;
618
        // We open a readonly object in all cases (possibly in
627
        // We open a readonly object in all cases (possibly in
619
        // addition to the r/w one) because some operations
628
        // addition to the r/w one) because some operations
620
        // are faster when performed through a Database: no
629
        // are faster when performed through a Database: no
621
        // forced flushes on allterms_begin(), ie, used in
630
        // forced flushes on allterms_begin(), ie, used in
...
...
648
        *error = DbOpenMainDb;
657
        *error = DbOpenMainDb;
649
658
650
    // Check index format version. Must not try to check a just created or
659
    // Check index format version. Must not try to check a just created or
651
    // truncated db
660
    // truncated db
652
    if (mode != DbTrunc && m_ndb->xdb().get_doccount() > 0) {
661
    if (mode != DbTrunc && m_ndb->xdb().get_doccount() > 0) {
653
        string version = m_ndb->xdb().get_metadata(RCL_IDX_VERSION_KEY);
662
        string version = m_ndb->xdb().get_metadata(cstr_RCL_IDX_VERSION_KEY);
654
        if (version.compare(RCL_IDX_VERSION)) {
663
        if (version.compare(cstr_RCL_IDX_VERSION)) {
655
        m_ndb->m_noversionwrite = true;
664
        m_ndb->m_noversionwrite = true;
656
        LOGERR(("Rcl::Db::open: file index [%s], software [%s]\n",
665
        LOGERR(("Rcl::Db::open: file index [%s], software [%s]\n",
657
            version.c_str(), RCL_IDX_VERSION.c_str()));
666
            version.c_str(), cstr_RCL_IDX_VERSION.c_str()));
658
        throw Xapian::DatabaseError("Recoll index version mismatch",
667
        throw Xapian::DatabaseError("Recoll index version mismatch",
659
                        "", "");
668
                        "", "");
660
        }
669
        }
661
    }
670
    }
662
    m_mode = mode;
671
    m_mode = mode;
...
...
691
    string ermsg;
700
    string ermsg;
692
    try {
701
    try {
693
    bool w = m_ndb->m_iswritable;
702
    bool w = m_ndb->m_iswritable;
694
    if (w) {
703
    if (w) {
695
        if (!m_ndb->m_noversionwrite)
704
        if (!m_ndb->m_noversionwrite)
696
        m_ndb->xwdb.set_metadata(RCL_IDX_VERSION_KEY, RCL_IDX_VERSION);
705
        m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
697
        LOGDEB(("Rcl::Db:close: xapian will close. May take some time\n"));
706
        LOGDEB(("Rcl::Db:close: xapian will close. May take some time\n"));
698
    }
707
    }
699
    // Used to do a flush here. Can't see why it should be necessary.
708
    // Used to do a flush here. Can't see why it should be necessary.
700
    deleteZ(m_ndb);
709
    deleteZ(m_ndb);
701
    if (w)
710
    if (w)
...
...
950
    if (syntctxlen > 0)
959
    if (syntctxlen > 0)
951
    m_synthAbsWordCtxLen = syntctxlen;
960
    m_synthAbsWordCtxLen = syntctxlen;
952
}
961
}
953
962
954
static const int MB = 1024 * 1024;
963
static const int MB = 1024 * 1024;
955
static const string nc("\n\r\x0c");
964
static const string cstr_nc("\n\r\x0c");
956
965
957
#define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";}
966
#define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";}
958
967
959
// Add document in internal form to the database: index the terms in
968
// Add document in internal form to the database: index the terms in
960
// the title abstract and body and add special terms for file name,
969
// the title abstract and body and add special terms for file name,
...
...
1166
    RECORD_APPEND(record, Doc::keyipt, doc.ipath);
1175
    RECORD_APPEND(record, Doc::keyipt, doc.ipath);
1167
1176
1168
    if (doc.meta[Doc::keytt].empty())
1177
    if (doc.meta[Doc::keytt].empty())
1169
    doc.meta[Doc::keytt] = doc.utf8fn;
1178
    doc.meta[Doc::keytt] = doc.utf8fn;
1170
    doc.meta[Doc::keytt] = 
1179
    doc.meta[Doc::keytt] = 
1171
    neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc);
1180
    neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
1172
    if (!doc.meta[Doc::keytt].empty())
1181
    if (!doc.meta[Doc::keytt].empty())
1173
    RECORD_APPEND(record, keycap, doc.meta[Doc::keytt]);
1182
    RECORD_APPEND(record, cstr_keycap, doc.meta[Doc::keytt]);
1174
1183
1175
    trimstring(doc.meta[Doc::keykw], " \t\r\n");
1184
    trimstring(doc.meta[Doc::keykw], " \t\r\n");
1176
    doc.meta[Doc::keykw] = 
1185
    doc.meta[Doc::keykw] = 
1177
    neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc);
1186
    neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
1178
    if (!doc.meta[Doc::keykw].empty())
1187
    if (!doc.meta[Doc::keykw].empty())
1179
    RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);
1188
    RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);
1180
1189
1181
    // If abstract is empty, we make up one with the beginning of the
1190
    // If abstract is empty, we make up one with the beginning of the
1182
    // document. This is then not indexed, but part of the doc data so
1191
    // document. This is then not indexed, but part of the doc data so
...
...
1187
    // don't exist yet.
1196
    // don't exist yet.
1188
    trimstring(doc.meta[Doc::keyabs], " \t\r\n");
1197
    trimstring(doc.meta[Doc::keyabs], " \t\r\n");
1189
    if (doc.meta[Doc::keyabs].empty()) {
1198
    if (doc.meta[Doc::keyabs].empty()) {
1190
    syntabs = true;
1199
    syntabs = true;
1191
    if (!doc.text.empty())
1200
    if (!doc.text.empty())
1192
        doc.meta[Doc::keyabs] = rclSyntAbs + 
1201
        doc.meta[Doc::keyabs] = cstr_syntAbs + 
1193
        neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), nc);
1202
        neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
1194
    } else {
1203
    } else {
1195
    doc.meta[Doc::keyabs] = 
1204
    doc.meta[Doc::keyabs] = 
1196
        neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
1205
        neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
1197
              nc);
1206
              cstr_nc);
1198
    }
1207
    }
1199
    if (!doc.meta[Doc::keyabs].empty())
1208
    if (!doc.meta[Doc::keyabs].empty())
1200
    RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
1209
    RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
1201
1210
1202
    const set<string>& stored = m_config->getStoredFields();
1211
    const set<string>& stored = m_config->getStoredFields();
1203
    for (set<string>::const_iterator it = stored.begin();
1212
    for (set<string>::const_iterator it = stored.begin();
1204
     it != stored.end(); it++) {
1213
     it != stored.end(); it++) {
1205
    string nm = m_config->fieldCanon(*it);
1214
    string nm = m_config->fieldCanon(*it);
1206
    if (!doc.meta[*it].empty()) {
1215
    if (!doc.meta[*it].empty()) {
1207
        string value = 
1216
        string value = 
1208
        neutchars(truncate_to_word(doc.meta[*it], 150), nc);
1217
        neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
1209
        RECORD_APPEND(record, nm, value);
1218
        RECORD_APPEND(record, nm, value);
1210
    }
1219
    }
1211
    }
1220
    }
1212
1221
1213
    // If the file's md5 was computed, add value. This is optionally
1222
    // If the file's md5 was computed, add value. This is optionally
...
...
1609
}
1618
}
1610
1619
1611
// Characters that can begin a wildcard or regexp expression. We use skipto
1620
// Characters that can begin a wildcard or regexp expression. We use skipto
1612
// to begin the allterms search with terms that begin with the portion of
1621
// to begin the allterms search with terms that begin with the portion of
1613
// the input string prior to these chars.
1622
// the input string prior to these chars.
1614
const string wildSpecChars = "*?[";
1623
const string cstr_wildSpecChars = "*?[";
1615
const string regSpecChars = "(.[{";
1624
const string cstr_regSpecChars = "(.[{";
1616
1625
1617
// Find all index terms that match a wildcard or regular expression
1626
// Find all index terms that match a wildcard or regular expression
1618
bool Db::termMatch(MatchType typ, const string &lang,
1627
bool Db::termMatch(MatchType typ, const string &lang,
1619
           const string &root, 
1628
           const string &root, 
1620
           TermMatchResult& res,
1629
           TermMatchResult& res,
...
...
1637
    string droot;
1646
    string droot;
1638
    if (!unacmaybefold(root, droot, "UTF-8", true)) {
1647
    if (!unacmaybefold(root, droot, "UTF-8", true)) {
1639
    LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
1648
    LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
1640
    return false;
1649
    return false;
1641
    }
1650
    }
1642
    string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;
1651
    string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
1643
1652
1644
    string prefix;
1653
    string prefix;
1645
    if (!field.empty()) {
1654
    if (!field.empty()) {
1646
    const FieldTraits *ftp = 0;
1655
    const FieldTraits *ftp = 0;
1647
    if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
1656
    if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
...
...
1850
    XAPTRY(vab = m_ndb->makeAbstract(doc.xdocid, query),
1859
    XAPTRY(vab = m_ndb->makeAbstract(doc.xdocid, query),
1851
           m_ndb->xrdb, m_reason);
1860
           m_ndb->xrdb, m_reason);
1852
    for (vector<string>::const_iterator it = vab.begin(); 
1861
    for (vector<string>::const_iterator it = vab.begin(); 
1853
     it != vab.end(); it++) {
1862
     it != vab.end(); it++) {
1854
    abstract.append(*it);
1863
    abstract.append(*it);
1855
    abstract.append(ellipsis);
1864
    abstract.append(cstr_ellipsis);
1856
    }
1865
    }
1857
    return m_reason.empty() ? true : false;
1866
    return m_reason.empty() ? true : false;
1858
}
1867
}
1859
1868
1860
// Retrieve document defined by Unique doc identifier. This is mainly used
1869
// Retrieve document defined by Unique doc identifier. This is mainly used