Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
...
...
50
#include "rclquery_p.h"
50
#include "rclquery_p.h"
51
#include "md5.h"
51
#include "md5.h"
52
#include "rclversion.h"
52
#include "rclversion.h"
53
#include "cancelcheck.h"
53
#include "cancelcheck.h"
54
#include "ptmutex.h"
54
#include "ptmutex.h"
55
#include "termproc.h"
55
56
56
// Fallback MAX, defined only when no MAX macro is already visible.
// NOTE(review): the arguments are expanded without parentheses and the larger
// one is evaluated twice, so only pass simple, side-effect-free expressions.
#ifndef MAX
57
#ifndef MAX
57
#define MAX(A,B) (A>B?A:B)
58
#define MAX(A,B) (A>B?A:B)
58
#endif
59
#endif
59
#ifndef MIN
60
#ifndef MIN
...
...
856
857
857
858
858
// The splitter breaks text into words and adds postings to the Xapian
859
// The splitter breaks text into words and adds postings to the Xapian
859
// document. We use a single object to split all of the document
860
// document. We use a single object to split all of the document
860
// fields and position jumps to separate fields
861
// fields and position jumps to separate fields
861
class TextSplitDb : public TextSplit {
862
class TextSplitDb : public TextSplitP {
862
 public:
863
 public:
863
    // Writable Xapian database handle, copied from the constructor argument.
    Xapian::WritableDatabase db;
864
    Xapian::WritableDatabase db;
864
    // Held by reference: the document must outlive this splitter.
    Xapian::Document &doc;   // Xapian document 
865
    Xapian::Document &doc;   // Xapian document 
865
    // Base for document section. Gets large increment when we change
866
    // Base for document section. Gets large increment when we change
866
    // sections, to avoid cross-section proximity matches.
867
    // sections, to avoid cross-section proximity matches.
...
...
871
    // section size (last relative term position), and this is what
872
    // section size (last relative term position), and this is what
872
    // gets added to basepos in addition to the inter-section increment
873
    // gets added to basepos in addition to the inter-section increment
873
    // to compute the first position of the next section.
874
    // to compute the first position of the next section.
874
    Xapian::termpos curpos;
875
    Xapian::termpos curpos;
875
876
876
    // Stop word list, checked with isStop() before a term is indexed
    // (only in the variant that keeps the list inside the splitter).
    StopList &stops;
877
    // Both constructor variants start with basepos 1, curpos 0 and wdfinc 1.
    // One keeps a reference to the stop list in stops; the other hands the
    // TermProc pointer prc to the TextSplitP base and stores no stop list.
    TextSplitDb(Xapian::WritableDatabase idb, 
877
    TextSplitDb(Xapian::WritableDatabase idb, 
878
      Xapian::Document &d, StopList &_stops) 
878
      Xapian::Document &d, TermProc *prc)
879
  : TextSplitP(prc), 
879
    : db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)
880
      db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
880
    {}
881
    {}
881
    // Reimplement text_to_words to add start and end special terms
882
    // Reimplement text_to_words to add start and end special terms
882
    virtual bool text_to_words(const string &in);
883
    virtual bool text_to_words(const string &in);
883
    // Per-term callback: adds the postings for one term at basepos + pos.
    // The two trailing int parameters are unnamed and unused.
    bool takeword(const std::string &term, int pos, int, int);
884
    // Set the prefix used for the additional prefixed postings.
    void setprefix(const string& pref) {prefix = pref;}
884
    void setprefix(const string& pref) {prefix = pref;}
885
    // Set the wdf increment passed to add_posting() for subsequent terms.
    void setwdfinc(int i) {wdfinc = i;}
885
    void setwdfinc(int i) {wdfinc = i;}
886
887
    // TermProcIdx reads the private prefix and wdfinc members directly.
    friend class TermProcIdx;
886
888
887
private:
889
private:
888
    // If prefix is set, we also add a posting for the prefixed terms
890
    // If prefix is set, we also add a posting for the prefixed terms
889
    // (e.g. for titles, add postings for both "term" and "Sterm")
891
    // (e.g. for titles, add postings for both "term" and "Sterm")
890
    string  prefix; 
892
    string  prefix; 
891
    // Some fields have more weight
893
    // Some fields have more weight
892
    int wdfinc;
894
    int wdfinc;
893
};
895
};
894
896
895
897
// Reimplement text_to_words to insert the begin and end anchor terms.
896
// Split one section of text through the base class splitter.
// Returns false when a Xapian error message was recorded in ermsg or when the
// base class text_to_words() fails, true otherwise. On every exit path shown
// here basepos is advanced by curpos + 100, so the next section starts after
// the last relative position of this one plus a gap of 100.
bool TextSplitDb::text_to_words(const string &in) 
898
bool TextSplitDb::text_to_words(const string &in) 
897
{
899
{
898
    LOGDEB2(("TextSplitDb::text_to_words\n"));
900
    LOGDEB2(("TextSplitDb::text_to_words\n"));
899
    string ermsg;
901
    string ermsg;
900
    try {
902
    try {
...
...
906
    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
908
    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
907
    // The base is advanced even though this section failed.
    basepos += curpos + 100;
909
    basepos += curpos + 100;
908
    return false;
910
    return false;
909
    }
911
    }
910
912
911
    // Run the base class splitter on the section text.
    if (!TextSplit::text_to_words(in)) {
913
    if (!TextSplitP::text_to_words(in)) {
912
    LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
914
    LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
913
    basepos += curpos + 100;
915
    basepos += curpos + 100;
914
    return false;
916
    return false;
915
    }
917
    }
916
918
...
...
922
    // A non-empty ermsg means the preceding try block caught an error.
    if (!ermsg.empty()) {
924
    if (!ermsg.empty()) {
923
    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
925
    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
924
    basepos += curpos + 100;
926
    basepos += curpos + 100;
925
    return false;
927
    return false;
926
    }
928
    }
929
927
    // Normal exit: same base advance as on the error paths.
    basepos += curpos + 100;
930
    basepos += curpos + 100;
928
    return true;
931
    return true;
929
}
932
}
930
933
931
// Get one term from the doc, remove accents and lowercase, then add posting
934
// Term processor that turns each term it receives into postings on the
// document of the TextSplitDb registered with setTSD().
class TermProcIdx : public TermProc {
935
public:
936
    // Passes 0 to the TermProc base (presumably "no next stage" - confirm)
    // and leaves m_ts null until setTSD() is called.
    TermProcIdx() : TermProc(0), m_ts(0) {}
937
    // Attach the splitter whose document, positions, prefix and wdfinc are used.
    void setTSD(TextSplitDb *ts) {m_ts = ts;}
938
932
bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)
939
    // Add the postings for one term. Returns true on success and false once
    // the error caught by XCATCHERROR has been logged. m_ts is dereferenced
    // without a null check, so setTSD() must have been called first.
    bool takeword(const std::string &term, int pos, int, int)
933
{
934
    LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
935
936
    string term;
937
    if (!unacmaybefold(_term, term, "UTF-8", true)) {
938
  LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n", 
939
                 _term.c_str()));
940
  term.clear();
941
  // We don't generate a fatal error because of a bad term
942
  return true;
943
    }
940
    {
944
945
    // A stop word is skipped and reported as success, not as an error.
    if (stops.isStop(term)) {
946
  LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
947
  return true;
948
    }
949
950
    // Compute absolute position (pos is relative to current segment),
941
  // Compute absolute position (pos is relative to current segment),
951
    // and remember relative.
942
  // and remember relative.
952
    curpos = pos;
943
  m_ts->curpos = pos;
953
    pos += basepos;
944
  pos += m_ts->basepos;
954
    string ermsg;
945
  string ermsg;
955
    try {
946
  try {
956
    // Index without prefix, using the field-specific weighting
947
        // Index without prefix, using the field-specific weighting
948
      LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
957
    doc.add_posting(term, pos, wdfinc);
949
        m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
958
#ifdef TESTING_XAPIAN_SPELL
950
#ifdef TESTING_XAPIAN_SPELL
959
    if (Db::isSpellingCandidate(term)) {
951
        if (Db::isSpellingCandidate(term)) {
960
        db.add_spelling(term);
952
      m_ts->db.add_spelling(term);
961
  }
953
      }
962
#endif
954
#endif
963
    // Index the prefixed term.
955
        // Index the prefixed term.
964
    if (!prefix.empty()) {
956
        if (!m_ts->prefix.empty()) {
965
        doc.add_posting(prefix + term, pos, wdfinc);
957
      m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
966
  }
958
      }
967
    return true;
959
        return true;
968
    } XCATCHERROR(ermsg);
960
  } XCATCHERROR(ermsg);
969
    // Only reached when the try block did not return, i.e. an error was caught.
    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
961
  LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
970
    return false;
962
  return false;
963
    }
964
private:
965
    // Splitter set through setTSD(); nothing in this class frees it.
    TextSplitDb *m_ts;
971
}
966
};
967
972
968
973
#ifdef TESTING_XAPIAN_SPELL
969
#ifdef TESTING_XAPIAN_SPELL
974
string Db::getSpellingSuggestion(const string& word)
970
string Db::getSpellingSuggestion(const string& word)
975
{
971
{
976
    if (m_ndb == 0)
972
    if (m_ndb == 0)
...
...
1030
    }
1026
    }
1031
1027
1032
    Doc doc = idoc;
1028
    Doc doc = idoc;
1033
1029
1034
    Xapian::Document newdocument;
1030
    Xapian::Document newdocument;
1031
    TermProcIdx tpidx;
1032
//    TermProcStop tpstop(&tpidx, m_stops);
1033
    TermProcCommongrams tpstop(&tpidx, m_stops);
1034
    TermProcPrep tpprep(&tpstop);
1035
    TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);
1035
    TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
1036
1036
    tpidx.setTSD(&splitter);
1037
    // Split and index file name as document term(s)
1037
    // Split and index file name as document term(s)
1038
    LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
1038
    LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
1039
    if (!splitter.text_to_words(doc.utf8fn))
1039
    if (!splitter.text_to_words(doc.utf8fn))
1040
        LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
1040
        LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
1041
1041