|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
|
... |
|
... |
50 |
#include "rclquery_p.h"
|
50 |
#include "rclquery_p.h"
|
51 |
#include "md5.h"
|
51 |
#include "md5.h"
|
52 |
#include "rclversion.h"
|
52 |
#include "rclversion.h"
|
53 |
#include "cancelcheck.h"
|
53 |
#include "cancelcheck.h"
|
54 |
#include "ptmutex.h"
|
54 |
#include "ptmutex.h"
|
|
|
55 |
#include "termproc.h"
|
55 |
|
56 |
|
56 |
#ifndef MAX
|
57 |
#ifndef MAX
|
57 |
#define MAX(A,B) (A>B?A:B)
|
58 |
#define MAX(A,B) (A>B?A:B)
|
58 |
#endif
|
59 |
#endif
|
59 |
#ifndef MIN
|
60 |
#ifndef MIN
|
|
... |
|
... |
856 |
|
857 |
|
857 |
|
858 |
|
858 |
// The splitter breaks text into words and adds postings to the Xapian
|
859 |
// The splitter breaks text into words and adds postings to the Xapian
|
859 |
// document. We use a single object to split all of the document
|
860 |
// document. We use a single object to split all of the document
|
860 |
// fields and position jumps to separate fields
|
861 |
// fields and position jumps to separate fields
|
861 |
class TextSplitDb : public TextSplit {
|
862 |
class TextSplitDb : public TextSplitP {
|
862 |
public:
|
863 |
public:
|
863 |
Xapian::WritableDatabase db;
|
864 |
Xapian::WritableDatabase db;
|
864 |
Xapian::Document &doc; // Xapian document
|
865 |
Xapian::Document &doc; // Xapian document
|
865 |
// Base for document section. Gets large increment when we change
|
866 |
// Base for document section. Gets large increment when we change
|
866 |
// sections, to avoid cross-section proximity matches.
|
867 |
// sections, to avoid cross-section proximity matches.
|
|
... |
|
... |
871 |
// section size (last relative term position), and this is what
|
872 |
// section size (last relative term position), and this is what
|
872 |
// gets added to basepos in addition to the inter-section increment
|
873 |
// gets added to basepos in addition to the inter-section increment
|
873 |
// to compute the first position of the next section.
|
874 |
// to compute the first position of the next section.
|
874 |
Xapian::termpos curpos;
|
875 |
Xapian::termpos curpos;
|
875 |
|
876 |
|
876 |
StopList &stops;
|
|
|
877 |
TextSplitDb(Xapian::WritableDatabase idb,
|
877 |
TextSplitDb(Xapian::WritableDatabase idb,
|
878 |
Xapian::Document &d, StopList &_stops)
|
878 |
Xapian::Document &d, TermProc *prc)
|
|
|
879 |
: TextSplitP(prc),
|
879 |
: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
|
880 |
db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
|
880 |
{}
|
881 |
{}
|
881 |
// Reimplement text_to_words to add start and end special terms
|
882 |
// Reimplement text_to_words to add start and end special terms
|
882 |
virtual bool text_to_words(const string &in);
|
883 |
virtual bool text_to_words(const string &in);
|
883 |
bool takeword(const std::string &term, int pos, int, int);
|
|
|
884 |
void setprefix(const string& pref) {prefix = pref;}
|
884 |
void setprefix(const string& pref) {prefix = pref;}
|
885 |
void setwdfinc(int i) {wdfinc = i;}
|
885 |
void setwdfinc(int i) {wdfinc = i;}
|
|
|
886 |
|
|
|
887 |
friend class TermProcIdx;
|
886 |
|
888 |
|
887 |
private:
|
889 |
private:
|
888 |
// If prefix is set, we also add a posting for the prefixed terms
|
890 |
// If prefix is set, we also add a posting for the prefixed terms
|
889 |
// (ie: for titles, add postings for both "term" and "Sterm")
|
891 |
// (ie: for titles, add postings for both "term" and "Sterm")
|
890 |
string prefix;
|
892 |
string prefix;
|
891 |
// Some fields have more weight
|
893 |
// Some fields have more weight
|
892 |
int wdfinc;
|
894 |
int wdfinc;
|
893 |
};
|
895 |
};
|
894 |
|
896 |
|
895 |
|
897 |
// Reimplement text_to_words to insert the begin and end anchor terms.
|
896 |
bool TextSplitDb::text_to_words(const string &in)
|
898 |
bool TextSplitDb::text_to_words(const string &in)
|
897 |
{
|
899 |
{
|
898 |
LOGDEB2(("TextSplitDb::text_to_words\n"));
|
900 |
LOGDEB2(("TextSplitDb::text_to_words\n"));
|
899 |
string ermsg;
|
901 |
string ermsg;
|
900 |
try {
|
902 |
try {
|
|
... |
|
... |
906 |
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
908 |
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
907 |
basepos += curpos + 100;
|
909 |
basepos += curpos + 100;
|
908 |
return false;
|
910 |
return false;
|
909 |
}
|
911 |
}
|
910 |
|
912 |
|
911 |
if (!TextSplit::text_to_words(in)) {
|
913 |
if (!TextSplitP::text_to_words(in)) {
|
912 |
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
|
914 |
LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
|
913 |
basepos += curpos + 100;
|
915 |
basepos += curpos + 100;
|
914 |
return false;
|
916 |
return false;
|
915 |
}
|
917 |
}
|
916 |
|
918 |
|
|
... |
|
... |
922 |
if (!ermsg.empty()) {
|
924 |
if (!ermsg.empty()) {
|
923 |
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
925 |
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
924 |
basepos += curpos + 100;
|
926 |
basepos += curpos + 100;
|
925 |
return false;
|
927 |
return false;
|
926 |
}
|
928 |
}
|
|
|
929 |
|
927 |
basepos += curpos + 100;
|
930 |
basepos += curpos + 100;
|
928 |
return true;
|
931 |
return true;
|
929 |
}
|
932 |
}
|
930 |
|
933 |
|
931 |
// Get one term from the doc, remove accents and lowercase, then add posting
|
934 |
class TermProcIdx : public TermProc {
|
|
|
935 |
public:
|
|
|
936 |
TermProcIdx() : TermProc(0), m_ts(0) {}
|
|
|
937 |
void setTSD(TextSplitDb *ts) {m_ts = ts;}
|
|
|
938 |
|
932 |
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
|
939 |
bool takeword(const std::string &term, int pos, int, int)
|
933 |
{
|
|
|
934 |
LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
|
|
|
935 |
|
|
|
936 |
string term;
|
|
|
937 |
if (!unacmaybefold(_term, term, "UTF-8", true)) {
|
|
|
938 |
LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
|
|
|
939 |
_term.c_str()));
|
|
|
940 |
term.clear();
|
|
|
941 |
// We don't generate a fatal error because of a bad term
|
|
|
942 |
return true;
|
|
|
943 |
}
|
940 |
{
|
944 |
|
|
|
945 |
if (stops.isStop(term)) {
|
|
|
946 |
LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
|
|
|
947 |
return true;
|
|
|
948 |
}
|
|
|
949 |
|
|
|
950 |
// Compute absolute position (pos is relative to current segment),
|
941 |
// Compute absolute position (pos is relative to current segment),
|
951 |
// and remember relative.
|
942 |
// and remember relative.
|
952 |
curpos = pos;
|
943 |
m_ts->curpos = pos;
|
953 |
pos += basepos;
|
944 |
pos += m_ts->basepos;
|
954 |
string ermsg;
|
945 |
string ermsg;
|
955 |
try {
|
946 |
try {
|
956 |
// Index without prefix, using the field-specific weighting
|
947 |
// Index without prefix, using the field-specific weighting
|
|
|
948 |
LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
|
957 |
doc.add_posting(term, pos, wdfinc);
|
949 |
m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
|
958 |
#ifdef TESTING_XAPIAN_SPELL
|
950 |
#ifdef TESTING_XAPIAN_SPELL
|
959 |
if (Db::isSpellingCandidate(term)) {
|
951 |
if (Db::isSpellingCandidate(term)) {
|
960 |
db.add_spelling(term);
|
952 |
m_ts->db.add_spelling(term);
|
961 |
}
|
953 |
}
|
962 |
#endif
|
954 |
#endif
|
963 |
// Index the prefixed term.
|
955 |
// Index the prefixed term.
|
964 |
if (!prefix.empty()) {
|
956 |
if (!m_ts->prefix.empty()) {
|
965 |
doc.add_posting(prefix + term, pos, wdfinc);
|
957 |
m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
|
966 |
}
|
958 |
}
|
967 |
return true;
|
959 |
return true;
|
968 |
} XCATCHERROR(ermsg);
|
960 |
} XCATCHERROR(ermsg);
|
969 |
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
961 |
LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
|
970 |
return false;
|
962 |
return false;
|
|
|
963 |
}
|
|
|
964 |
private:
|
|
|
965 |
TextSplitDb *m_ts;
|
971 |
}
|
966 |
};
|
|
|
967 |
|
972 |
|
968 |
|
973 |
#ifdef TESTING_XAPIAN_SPELL
|
969 |
#ifdef TESTING_XAPIAN_SPELL
|
974 |
string Db::getSpellingSuggestion(const string& word)
|
970 |
string Db::getSpellingSuggestion(const string& word)
|
975 |
{
|
971 |
{
|
976 |
if (m_ndb == 0)
|
972 |
if (m_ndb == 0)
|
|
... |
|
... |
1030 |
}
|
1026 |
}
|
1031 |
|
1027 |
|
1032 |
Doc doc = idoc;
|
1028 |
Doc doc = idoc;
|
1033 |
|
1029 |
|
1034 |
Xapian::Document newdocument;
|
1030 |
Xapian::Document newdocument;
|
|
|
1031 |
TermProcIdx tpidx;
|
|
|
1032 |
// TermProcStop tpstop(&tpidx, m_stops);
|
|
|
1033 |
TermProcCommongrams tpstop(&tpidx, m_stops);
|
|
|
1034 |
TermProcPrep tpprep(&tpstop);
|
1035 |
TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
|
1035 |
TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
|
1036 |
|
1036 |
tpidx.setTSD(&splitter);
|
1037 |
// Split and index file name as document term(s)
|
1037 |
// Split and index file name as document term(s)
|
1038 |
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
1038 |
LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
|
1039 |
if (!splitter.text_to_words(doc.utf8fn))
|
1039 |
if (!splitter.text_to_words(doc.utf8fn))
|
1040 |
LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
|
1040 |
LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
|
1041 |
|
1041 |
|