recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [61bf17] .. [5fd311]

Switch to unified view


...
#include "rclquery_p.h"
#include "md5.h"
#include "rclversion.h"
#include "cancelcheck.h"
#include "ptmutex.h"
#include "termproc.h"

#ifndef MAX
// Largest of A and B. Both arguments and the whole expansion are
// parenthesized so that operator precedence inside the caller's
// expressions (e.g. MAX(a & b, c) or MAX(a, c ? x : y)) cannot change
// the result. As with any function-like macro, an argument may be
// evaluated twice: do not pass expressions with side effects.
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#endif
#ifndef MIN
...


// The splitter breaks text into words and adds postings to the Xapian
// document. A single object is used to split all of the document's
// fields; position jumps keep the fields separate.
class TextSplitDb : public TextSplitP {
 public:
    // Database handle. In the code visible here it is only used by
    // TermProcIdx, for add_spelling() when TESTING_XAPIAN_SPELL is defined.
    Xapian::WritableDatabase db;
    // Held by reference: the document passed to the constructor must
    // outlive this object.
    Xapian::Document &doc;   // Xapian document 
    // Base for document section. Gets large increment when we change
    // sections, to avoid cross-section proximity matches.
...
    // section size (last relative term position), and this is what
    // gets added to basepos in addition to the inter-section increment
    // to compute the first position of the next section.
    Xapian::termpos curpos;


    // idb: database handle, copied into db.
    // d:   document receiving the postings, kept by reference.
    // prc: term processor handed to the TextSplitP base class.
    // Positions start at basepos 1 / curpos 0, and the default
    // within-document-frequency increment is 1.
    TextSplitDb(Xapian::WritableDatabase idb, 
      Xapian::Document &d, TermProc *prc)
  : TextSplitP(prc), 
      db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
    {}
    // Reimplement text_to_words to add start and end special terms
    virtual bool text_to_words(const string &in);

    // Set the prefix for the additional prefixed postings. An empty
    // prefix disables them.
    void setprefix(const string& pref) {prefix = pref;}
    // Set the wdf increment passed to add_posting() for subsequent terms.
    void setwdfinc(int i) {wdfinc = i;}

    // TermProcIdx performs the actual add_posting() calls: it reads the
    // private prefix and wdfinc members and updates curpos.
    friend class TermProcIdx;

private:
    // If prefix is set, we also add a posting for the prefixed terms
    // (i.e.: for titles, add postings for both "term" and "Sterm")
    string  prefix; 
    // Some fields have more weight
    int wdfinc;
};

// Reimplement text_to_words to insert the begin and end anchor terms.
// Splits 'in' through TextSplitP::text_to_words(). Returns false if a
// Xapian error was caught (ermsg set) or if the base splitter failed,
// true otherwise. On every exit path, success or failure, basepos is
// advanced by curpos + 100 so that the next section starts beyond the
// positions used by this one.
bool TextSplitDb::text_to_words(const string &in) 
{
    LOGDEB2(("TextSplitDb::text_to_words\n"));
    string ermsg;
    try {
...
    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
    basepos += curpos + 100;
    return false;
    }

    // Do the actual splitting: terms go through the TermProc chain
    // which was passed to the TextSplitP constructor.
    // NOTE(review): the message below still names TextSplit although the
    // call is to TextSplitP::text_to_words.
    if (!TextSplitP::text_to_words(in)) {
    LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
    basepos += curpos + 100;
    return false;
    }

...
    if (!ermsg.empty()) {
    LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
    basepos += curpos + 100;
    return false;
    }

    // Success: skip past this section plus the inter-section gap.
    basepos += curpos + 100;
    return true;
}

// Term processor which stores the terms it receives as postings in the
// Xapian document of the associated TextSplitDb. In the indexing code it
// is used as the last element of the TermProc chain fed by the splitter.
class TermProcIdx : public TermProc {
public:
    // Constructed with a 0 argument for the TermProc base (presumably "no
    // next processor" -- confirm in termproc.h) and with no splitter
    // attached: setTSD() must be called before terms are delivered.
    TermProcIdx() : TermProc(0), m_ts(0) {}

    // Attach the splitter whose document, positions, prefix and wdf
    // increment are used by takeword(). The pointer is not owned.
    void setTSD(TextSplitDb *ts) {m_ts = ts;}

    // Add postings for one term.
    // term: the term to index, stored as received.
    // pos:  position relative to the current section; it is recorded in
    //       the splitter's curpos and offset by its basepos to obtain
    //       the absolute position used for the posting.
    // The last two parameters are unused.
    // Returns true on success, false if no splitter is attached or if
    // Xapian reported an error.
    bool takeword(const std::string &term, int pos, int, int)
    {
        // Without an attached splitter there is no document to index
        // into: report the error instead of dereferencing a null pointer.
        if (m_ts == 0) {
            LOGERR(("TermProcIdx::takeword: no TextSplitDb set\n"));
            return false;
        }

        // Compute absolute position (pos is relative to current segment),
        // and remember relative.
        m_ts->curpos = pos;
        pos += m_ts->basepos;
        string ermsg;
        try {
            // Index without prefix, using the field-specific weighting
            LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
            m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
#ifdef TESTING_XAPIAN_SPELL
            if (Db::isSpellingCandidate(term)) {
                m_ts->db.add_spelling(term);
            }
#endif
            // Index the prefixed term.
            if (!m_ts->prefix.empty()) {
                m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
            }
            return true;
        } XCATCHERROR(ermsg);
        // Only reached when the try block above raised an exception.
        LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
        return false;
    }
private:
    // Splitter providing the document and position state; not owned.
    TextSplitDb *m_ts;
};


#ifdef TESTING_XAPIAN_SPELL
string Db::getSpellingSuggestion(const string& word)
{
    if (m_ndb == 0)
...
    }

    Doc doc = idoc;

    Xapian::Document newdocument;
    TermProcIdx tpidx;
//    TermProcStop tpstop(&tpidx, m_stops);
    TermProcCommongrams tpstop(&tpidx, m_stops);
    TermProcPrep tpprep(&tpstop);
    TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
    tpidx.setTSD(&splitter);
    // Split and index file name as document term(s)
    LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
    if (!splitter.text_to_words(doc.utf8fn))
        LOGDEB(("Db::addOrUpdate: split failed for file name\n"));


	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
	...		...
50	#include "rclquery_p.h"	50	#include "rclquery_p.h"
51	#include "md5.h"	51	#include "md5.h"
52	#include "rclversion.h"	52	#include "rclversion.h"
53	#include "cancelcheck.h"	53	#include "cancelcheck.h"
54	#include "ptmutex.h"	54	#include "ptmutex.h"
		55	#include "termproc.h"
55		56
56	#ifndef MAX	57	#ifndef MAX
57	#define MAX(A,B) (A>B?A:B)	58	#define MAX(A,B) (A>B?A:B)
58	#endif	59	#endif
59	#ifndef MIN	60	#ifndef MIN
	...		...
856		857
857		858
858	// The splitter breaks text into words and adds postings to the Xapian	859	// The splitter breaks text into words and adds postings to the Xapian
859	// document. We use a single object to split all of the document	860	// document. We use a single object to split all of the document
860	// fields and position jumps to separate fields	861	// fields and position jumps to separate fields
861	class TextSplitDb : public TextSplit {	862	class TextSplitDb : public TextSplitP {
862	public:	863	public:
863	Xapian::WritableDatabase db;	864	Xapian::WritableDatabase db;
864	Xapian::Document &doc; // Xapian document	865	Xapian::Document &doc; // Xapian document
865	// Base for document section. Gets large increment when we change	866	// Base for document section. Gets large increment when we change
866	// sections, to avoid cross-section proximity matches.	867	// sections, to avoid cross-section proximity matches.
	...		...
871	// section size (last relative term position), and this is what	872	// section size (last relative term position), and this is what
872	// gets added to basepos in addition to the inter-section increment	873	// gets added to basepos in addition to the inter-section increment
873	// to compute the first position of the next section.	874	// to compute the first position of the next section.
874	Xapian::termpos curpos;	875	Xapian::termpos curpos;
875		876
876	StopList &stops;
877	TextSplitDb(Xapian::WritableDatabase idb,	877	TextSplitDb(Xapian::WritableDatabase idb,
878	Xapian::Document &d, StopList &_stops)	878	Xapian::Document &d, TermProc *prc)
		879	: TextSplitP(prc),
879	: db(idb), doc(d), basepos(1), curpos(0), stops(_stops), wdfinc(1)	880	db(idb), doc(d), basepos(1), curpos(0), wdfinc(1)
880	{}	881	{}
881	// Reimplement text_to_words to add start and end special terms	882	// Reimplement text_to_words to add start and end special terms
882	virtual bool text_to_words(const string &in);	883	virtual bool text_to_words(const string &in);
883	bool takeword(const std::string &term, int pos, int, int);
884	void setprefix(const string& pref) {prefix = pref;}	884	void setprefix(const string& pref) {prefix = pref;}
885	void setwdfinc(int i) {wdfinc = i;}	885	void setwdfinc(int i) {wdfinc = i;}
		886
		887	friend class TermProcIdx;
886		888
887	private:	889	private:
888	// If prefix is set, we also add a posting for the prefixed terms	890	// If prefix is set, we also add a posting for the prefixed terms
889	// (ie: for titles, add postings for both "term" and "Sterm")	891	// (ie: for titles, add postings for both "term" and "Sterm")
890	string prefix;	892	string prefix;
891	// Some fields have more weight	893	// Some fields have more weight
892	int wdfinc;	894	int wdfinc;
893	};	895	};
894		896
895		897	// Reimplement text_to_words to insert the begin and end anchor terms.
896	bool TextSplitDb::text_to_words(const string &in)	898	bool TextSplitDb::text_to_words(const string &in)
897	{	899	{
898	LOGDEB2(("TextSplitDb::text_to_words\n"));	900	LOGDEB2(("TextSplitDb::text_to_words\n"));
899	string ermsg;	901	string ermsg;
900	try {	902	try {
	...		...
906	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));	908	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
907	basepos += curpos + 100;	909	basepos += curpos + 100;
908	return false;	910	return false;
909	}	911	}
910		912
911	if (!TextSplit::text_to_words(in)) {	913	if (!TextSplitP::text_to_words(in)) {
912	LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));	914	LOGDEB(("TextSplitDb: TextSplit::text_to_words failed\n"));
913	basepos += curpos + 100;	915	basepos += curpos + 100;
914	return false;	916	return false;
915	}	917	}
916		918
	...		...
922	if (!ermsg.empty()) {	924	if (!ermsg.empty()) {
923	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));	925	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
924	basepos += curpos + 100;	926	basepos += curpos + 100;
925	return false;	927	return false;
926	}	928	}
		929
927	basepos += curpos + 100;	930	basepos += curpos + 100;
928	return true;	931	return true;
929	}	932	}
930		933
931	// Get one term from the doc, remove accents and lowercase, then add posting	934	class TermProcIdx : public TermProc {
		935	public:
		936	TermProcIdx() : TermProc(0), m_ts(0) {}
		937	void setTSD(TextSplitDb *ts) {m_ts = ts;}
		938
932	bool TextSplitDb::takeword(const std::string &_term, int pos, int, int)	939	bool takeword(const std::string &term, int pos, int, int)
933	{
934	LOGDEB2(("TextSplitDb::takeword: [%s]\n", _term.c_str()));
935
936	string term;
937	if (!unacmaybefold(_term, term, "UTF-8", true)) {
938	LOGINFO(("Db::splitter::takeword: unac failed for [%s]\n",
939	_term.c_str()));
940	term.clear();
941	// We don't generate a fatal error because of a bad term
942	return true;
943	}	940	{
944
945	if (stops.isStop(term)) {
946	LOGDEB1(("Db: takeword [%s] in stop list\n", term.c_str()));
947	return true;
948	}
949
950	// Compute absolute position (pos is relative to current segment),	941	// Compute absolute position (pos is relative to current segment),
951	// and remember relative.	942	// and remember relative.
952	curpos = pos;	943	m_ts->curpos = pos;
953	pos += basepos;	944	pos += m_ts->basepos;
954	string ermsg;	945	string ermsg;
955	try {	946	try {
956	// Index without prefix, using the field-specific weighting	947	// Index without prefix, using the field-specific weighting
		948	LOGDEB1(("Emitting term at %d : [%s]\n", pos, term.c_str()));
957	doc.add_posting(term, pos, wdfinc);	949	m_ts->doc.add_posting(term, pos, m_ts->wdfinc);
958	#ifdef TESTING_XAPIAN_SPELL	950	#ifdef TESTING_XAPIAN_SPELL
959	if (Db::isSpellingCandidate(term)) {	951	if (Db::isSpellingCandidate(term)) {
960	db.add_spelling(term);	952	m_ts->db.add_spelling(term);
961	}	953	}
962	#endif	954	#endif
963	// Index the prefixed term.	955	// Index the prefixed term.
964	if (!prefix.empty()) {	956	if (!m_ts->prefix.empty()) {
965	doc.add_posting(prefix + term, pos, wdfinc);	957	m_ts->doc.add_posting(m_ts->prefix + term, pos, m_ts->wdfinc);
966	}	958	}
967	return true;	959	return true;
968	} XCATCHERROR(ermsg);	960	} XCATCHERROR(ermsg);
969	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));	961	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
970	return false;	962	return false;
		963	}
		964	private:
		965	TextSplitDb *m_ts;
971	}	966	};
		967
972		968
973	#ifdef TESTING_XAPIAN_SPELL	969	#ifdef TESTING_XAPIAN_SPELL
974	string Db::getSpellingSuggestion(const string& word)	970	string Db::getSpellingSuggestion(const string& word)
975	{	971	{
976	if (m_ndb == 0)	972	if (m_ndb == 0)
	...		...
1030	}	1026	}
1031		1027
1032	Doc doc = idoc;	1028	Doc doc = idoc;
1033		1029
1034	Xapian::Document newdocument;	1030	Xapian::Document newdocument;
		1031	TermProcIdx tpidx;
		1032	// TermProcStop tpstop(&tpidx, m_stops);
		1033	TermProcCommongrams tpstop(&tpidx, m_stops);
		1034	TermProcPrep tpprep(&tpstop);
1035	TextSplitDb splitter(m_ndb->xwdb, newdocument, m_stops);	1035	TextSplitDb splitter(m_ndb->xwdb, newdocument, &tpprep);
1036		1036	tpidx.setTSD(&splitter);
1037	// Split and index file name as document term(s)	1037	// Split and index file name as document term(s)
1038	LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));	1038	LOGDEB2(("Db::add: split file name [%s]\n", fn.c_str()));
1039	if (!splitter.text_to_words(doc.utf8fn))	1039	if (!splitter.text_to_words(doc.utf8fn))
1040	LOGDEB(("Db::addOrUpdate: split failed for file name\n"));	1040	LOGDEB(("Db::addOrUpdate: split failed for file name\n"));
1041		1041