recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [ab3206] .. [52bc9f]

Switch to unified view


...
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
#include "autoconfig.h"

#include <stdio.h>
#include <cstring>
#include <unistd.h>
#include <fnmatch.h>
#include <regex.h>
...
#include "md5.h"
#include "rclversion.h"
#include "cancelcheck.h"
#include "ptmutex.h"
#include "termproc.h"
#include "expansiondbs.h"

#ifndef MAX
// Larger of A and B. Each argument is parenthesized so that operators
// inside an argument (e.g. '&', '|', '?:') cannot regroup with the
// comparison or the conditional in the expansion.
// Note: an argument may be evaluated twice, so do not pass expressions
// with side effects.
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#endif
#ifndef MIN
...
static const string mimetype_prefix = "T";
static const string xapday_prefix = "D";
static const string xapmonth_prefix = "M";
static const string xapyear_prefix = "Y";
const string pathelt_prefix = "XP";
// Special marker terms. Their values depend on the build configuration.
#ifdef RCL_INDEX_STRIPCHARS
// RCL_INDEX_STRIPCHARS defined: all three terms are compile-time constants.
const string start_of_field_term = "XXST";
const string end_of_field_term = "XXND";
static const string page_break_term = "XXPG";
#else
// RCL_INDEX_STRIPCHARS not defined: the start/end of field terms are left
// empty here. A constructor further down in this file fills them in on
// first use, choosing "XXST"/"XXND" or "XXST/"/"XXND/" according to
// o_index_stripchars.
string start_of_field_term;
string end_of_field_term;
// The page break term is fixed, with a trailing '/', in this configuration.
const string page_break_term = "XXPG/";
#endif

// Field name for the unsplit file name. Has to exist in the field file
// because of its usage in termMatch().
static const string unsplitFilenameFieldName = "rclUnsplitFN";
static const string unsplitfilename_prefix = "XSFS";

...
// un-prefixed, so this is simpler and better.
static void noPrefixList(const vector<string>& in, vector<string>& out) 
{
    // Append to 'out', in their original order, the elements of 'in' for
    // which has_prefix() is false. 'out' is not cleared first.
    for (vector<string>::size_type idx = 0; idx < in.size(); ++idx) {
        const string& candidate = in[idx];
        if (has_prefix(candidate))
            continue;
        out.push_back(candidate);
    }
}

#undef DEBUGABSTRACT  
...
    int cutoff = 500 * 1000;

    for (term = xrdb.termlist_begin(docid);
         term != xrdb.termlist_end(docid); term++) {
        // Ignore prefixed terms
      if (has_prefix(*term))
        continue;
        if (cutoff-- < 0) {
        ret = ABSRES_TRUNC;
        LOGDEB0(("makeAbstract: max term count cutoff\n"));
        break;
...
    incjk = newcjk;
    if (it->second == cstr_ellipsis) {
        vabs.push_back(pair<int,string>(page, chunk));
        chunk.clear();
    } else {
      if (it->second.compare(end_of_field_term) && 
      it->second.compare(start_of_field_term))
      chunk += it->second;
    }
    }
    if (!chunk.empty())
    vabs.push_back(pair<int, string>(page, chunk));

...
    : m_ndb(0), m_config(cfp), m_idxAbsTruncLen(250), m_synthAbsLen(250),
      m_synthAbsWordCtxLen(4), m_flushMb(-1), 
      m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
      m_maxFsOccupPc(0), m_mode(Db::DbRO)
{
#ifndef RCL_INDEX_STRIPCHARS
    if (start_of_field_term.empty()) {
  if (o_index_stripchars) {
      start_of_field_term = "XXST";
      end_of_field_term = "XXND";
  } else {
      start_of_field_term = "XXST/";
      end_of_field_term = "XXND/";
  }
    }
#endif

    m_ndb = new Native(this);
    if (m_config) {
    m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
    m_config->getConfParam("idxflushmb", &m_flushMb);
    }
...
{
    int res = -1;
    if (!m_ndb || !m_ndb->m_isopen)
        return -1;

    string term = _term;
#ifndef RCL_INDEX_STRIPCHARS
    if (o_index_stripchars)
#endif
  if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
        LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
        return 0;
  }

    if (m_stops.isStop(term)) {
    LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
    return 0;
    }
...
    : TextSplitP(prc), 
      doc(d), basepos(1), curpos(0), wdfinc(1)
    {}
    // Reimplement text_to_words to add start and end special terms
    virtual bool text_to_words(const string &in);

    // Set the prefix member. An empty argument resets it to empty; any
    // other value is stored after being passed through wrap_prefix().
    void setprefix(const string& pref) 
    {
        if (!pref.empty()) {
            prefix = wrap_prefix(pref);
            return;
        }
        prefix.clear();
    }

    void setwdfinc(int i) 
    {
  wdfinc = i;
    }

    friend class TermProcIdx;

private:
    // If prefix is set, we also add a posting for the prefixed terms
...
#ifdef TESTING_XAPIAN_SPELL
// Return the spelling suggestion computed by the native database for
// 'word', or an empty string if there is no native database object, the
// unac/fold step fails, or the term is not a spelling candidate.
string Db::getSpellingSuggestion(const string& word)
{
    if (m_ndb == 0)
        return string();

    // The lookup term starts out as the input word. It is only passed
    // through unacmaybefold() when RCL_INDEX_STRIPCHARS is defined or,
    // failing that, when the o_index_stripchars flag is set.
#ifndef RCL_INDEX_STRIPCHARS
    const bool dofold = o_index_stripchars;
#else
    const bool dofold = true;
#endif

    string term = word;
    if (dofold && !unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
        LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
        return string();
    }

    if (isSpellingCandidate(term))
        return m_ndb->xrdb.get_spelling_suggestion(term);
    return string();
}
#endif
...

    // The term processing pipeline:
    TermProcIdx tpidx;
    TermProc *nxt = &tpidx;
    TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
    //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;

    TermProcPrep tpprep(nxt);
#ifndef RCL_INDEX_STRIPCHARS
    if (o_index_stripchars)
#endif
  nxt = &tpprep;

    TextSplitDb splitter(newdocument, nxt);
    tpidx.setTSD(&splitter);

    // If the ipath is like a path, index the last element. This is
...
    {
    string path = url_gpath(doc.url);
    vector<string> vpath;
    stringToTokens(path, vpath, "/");
    splitter.curpos = 0;
    newdocument.add_posting(wrap_prefix(pathelt_prefix),
                splitter.basepos + splitter.curpos++);
    for (vector<string>::iterator it = vpath.begin(); 
         it != vpath.end(); it++){
        if (it->length() > 230) {
        // Just truncate it. May still be useful because of wildcards
        *it = it->substr(0, 230);
        }
        newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
                    splitter.basepos + splitter.curpos++);
    }
    }

    // Index textual metadata.  These are all indexed as text with
...
    if (!splitter.text_to_words(doc.text))
        LOGDEB(("Db::addOrUpdate: split failed for main text\n"));

    ////// Special terms for other metadata. No positions for these.
    // Mime type
    newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);

    // Simple file name indexed unsplit for specific "file name"
    // searches. This is not the same as a filename: clause inside the
    // query language.
    // We also add a term for the filename extension if any.
...
        // a pathological case anyway
        if (fn.size() > 230)
        utf8truncate(fn, 230);
        string::size_type pos = fn.rfind('.');
        if (pos != string::npos && pos != fn.length() - 1) {
      newdocument.add_term(wrap_prefix(fileext_prefix) + 
                   fn.substr(pos + 1));
        }
        newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
    }
    }

    // Udi unique term: this is used for file existence/uptodate
    // checks, and unique id for the replace_document() call.
...
    time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : 
            doc.dmtime.c_str());
    struct tm *tm = localtime(&mtime);
    char buf[9];
    snprintf(buf, 9, "%04d%02d%02d",
        tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
    // Date (YYYYMMDD)
    newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf)); 
    // Month (YYYYMM)
    buf[6] = '\0';
    newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
    // Year (YYYY)
    buf[4] = '\0';
    newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf)); 


    //////////////////////////////////////////////////////////////////
    // Document data record. omindex has the following nl separated fields:
    // - url
...
bool Db::maxYearSpan(int *minyear, int *maxyear)
{
    *minyear = 1000000; 
    *maxyear = -1000000;
    TermMatchResult result;
    if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
    return false;
    for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
     it != result.entries.end(); it++) {
        if (!it->term.empty()) {
            int year = atoi(it->term.c_str()+1);
...
// the input string prior to these chars.
const string cstr_wildSpecChars = "*?[";
const string cstr_regSpecChars = "(.[{";

// Find all index terms that match a wildcard or regular expression.
// If field is set, we return a list of appropriately prefixed terms (which
// are going to be used to build a Xapian query).
bool Db::termMatch(MatchType typ, const string &lang,
           const string &root, 
           TermMatchResult& res,
           int max, 
           const string& field)


{
    if (!m_ndb || !m_ndb->m_isopen)
    return false;
    Xapian::Database xdb = m_ndb->xdb();


    XAPTRY(res.dbdoccount = xdb.get_doccount();
           res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
    if (!m_reason.empty())
        return false;

    // Get rid of capitals and accents

    string droot = root;

#ifndef RCL_INDEX_STRIPCHARS
    if (o_index_stripchars)
#endif
  if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
        LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
        return false;
  }

    string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;

    string prefix;
    if (!field.empty()) {
    const FieldTraits *ftp = 0;
    if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
            LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n", 
                    field.c_str()));
        } else {
        prefix = wrap_prefix(ftp->pfx);
    }
    }
    res.prefix = prefix;


    if (typ == ET_STEM) {
    if (!stemExpand(lang, root, res, max))
        return false;


    for (vector<TermMatchEntry>::iterator it = res.entries.begin(); 
         it != res.entries.end(); it++) {
        XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
                   it->docs = xdb.get_termfreq(it->term),
                   xdb, m_reason);
...
    }

    TermMatchCmpByTerm tcmp;
    sort(res.entries.begin(), res.entries.end(), tcmp);
    TermMatchTermEqual teq;
    vector<TermMatchEntry>::iterator uit = 
  unique(res.entries.begin(), res.entries.end(), teq);
    res.entries.resize(uit - res.entries.begin());
    TermMatchCmpByWcf wcmp;
    sort(res.entries.begin(), res.entries.end(), wcmp);
    if (max > 0) {
    res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
    }

	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
	...		...
12	* You should have received a copy of the GNU General Public License	12	* You should have received a copy of the GNU General Public License
13	* along with this program; if not, write to the	13	* along with this program; if not, write to the
14	* Free Software Foundation, Inc.,	14	* Free Software Foundation, Inc.,
15	* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.	15	* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16	*/	16	*/
		17	#include "autoconfig.h"
		18
17	#include <stdio.h>	19	#include <stdio.h>
18	#include <cstring>	20	#include <cstring>
19	#include <unistd.h>	21	#include <unistd.h>
20	#include <fnmatch.h>	22	#include <fnmatch.h>
21	#include <regex.h>	23	#include <regex.h>
	...		...
51	#include "md5.h"	53	#include "md5.h"
52	#include "rclversion.h"	54	#include "rclversion.h"
53	#include "cancelcheck.h"	55	#include "cancelcheck.h"
54	#include "ptmutex.h"	56	#include "ptmutex.h"
55	#include "termproc.h"	57	#include "termproc.h"
		58	#include "expansiondbs.h"
56		59
57	#ifndef MAX	60	#ifndef MAX
58	#define MAX(A,B) (A>B?A:B)	61	#define MAX(A,B) (A>B?A:B)
59	#endif	62	#endif
60	#ifndef MIN	63	#ifndef MIN
	...		...
82	static const string mimetype_prefix = "T";	85	static const string mimetype_prefix = "T";
83	static const string xapday_prefix = "D";	86	static const string xapday_prefix = "D";
84	static const string xapmonth_prefix = "M";	87	static const string xapmonth_prefix = "M";
85	static const string xapyear_prefix = "Y";	88	static const string xapyear_prefix = "Y";
86	const string pathelt_prefix = "XP";	89	const string pathelt_prefix = "XP";
		90	#ifdef RCL_INDEX_STRIPCHARS
87	const string start_of_field_term = "XXST";	91	const string start_of_field_term = "XXST";
88	const string end_of_field_term = "XXND";	92	const string end_of_field_term = "XXND";
89	static const string page_break_term = "XXPG";	93	static const string page_break_term = "XXPG";
		94	#else
		95	string start_of_field_term;
		96	string end_of_field_term;
		97	const string page_break_term = "XXPG/";
		98	#endif
		99
90	// Field name for the unsplit file name. Has to exist in the field file	100	// Field name for the unsplit file name. Has to exist in the field file
91	// because of usage in termmatch()	101	// because of usage in termmatch()
92	static const string unsplitFilenameFieldName = "rclUnsplitFN";	102	static const string unsplitFilenameFieldName = "rclUnsplitFN";
93	static const string unsplitfilename_prefix = "XSFS";	103	static const string unsplitfilename_prefix = "XSFS";
94		104
	...		...
195	// un-prefixed, so this is simpler and better.	205	// un-prefixed, so this is simpler and better.
196	static void noPrefixList(const vector<string>& in, vector<string>& out)	206	static void noPrefixList(const vector<string>& in, vector<string>& out)
197	{	207	{
198	for (vector<string>::const_iterator qit = in.begin();	208	for (vector<string>::const_iterator qit = in.begin();
199	qit != in.end(); qit++) {	209	qit != in.end(); qit++) {
200	if (qit->size() && !('A' <= (qit)[0] && (qit)[0] <= 'Z'))	210	if (!has_prefix(*qit))
201	out.push_back(*qit);	211	out.push_back(*qit);
202	}	212	}
203	}	213	}
204		214
205	#undef DEBUGABSTRACT	215	#undef DEBUGABSTRACT
	...		...
589	int cutoff = 500 * 1000;	599	int cutoff = 500 * 1000;
590		600
591	for (term = xrdb.termlist_begin(docid);	601	for (term = xrdb.termlist_begin(docid);
592	term != xrdb.termlist_end(docid); term++) {	602	term != xrdb.termlist_end(docid); term++) {
593	// Ignore prefixed terms	603	// Ignore prefixed terms
594	if ('A' <= (term).at(0) && (term).at(0) <= 'Z')	604	if (has_prefix(*term))
595	continue;	605	continue;
596	if (cutoff-- < 0) {	606	if (cutoff-- < 0) {
597	ret = ABSRES_TRUNC;	607	ret = ABSRES_TRUNC;
598	LOGDEB0(("makeAbstract: max term count cutoff\n"));	608	LOGDEB0(("makeAbstract: max term count cutoff\n"));
599	break;	609	break;
	...		...
670	incjk = newcjk;	680	incjk = newcjk;
671	if (it->second == cstr_ellipsis) {	681	if (it->second == cstr_ellipsis) {
672	vabs.push_back(pair<int,string>(page, chunk));	682	vabs.push_back(pair<int,string>(page, chunk));
673	chunk.clear();	683	chunk.clear();
674	} else {	684	} else {
		685	if (it->second.compare(end_of_field_term) &&
		686	it->second.compare(start_of_field_term))
675	chunk += it->second;	687	chunk += it->second;
676	}	688	}
677	}	689	}
678	if (!chunk.empty())	690	if (!chunk.empty())
679	vabs.push_back(pair<int, string>(page, chunk));	691	vabs.push_back(pair<int, string>(page, chunk));
680		692
	...		...
690	: m_ndb(0), m_config(cfp), m_idxAbsTruncLen(250), m_synthAbsLen(250),	702	: m_ndb(0), m_config(cfp), m_idxAbsTruncLen(250), m_synthAbsLen(250),
691	m_synthAbsWordCtxLen(4), m_flushMb(-1),	703	m_synthAbsWordCtxLen(4), m_flushMb(-1),
692	m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),	704	m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
693	m_maxFsOccupPc(0), m_mode(Db::DbRO)	705	m_maxFsOccupPc(0), m_mode(Db::DbRO)
694	{	706	{
		707	#ifndef RCL_INDEX_STRIPCHARS
		708	if (start_of_field_term.empty()) {
		709	if (o_index_stripchars) {
		710	start_of_field_term = "XXST";
		711	end_of_field_term = "XXND";
		712	} else {
		713	start_of_field_term = "XXST/";
		714	end_of_field_term = "XXND/";
		715	}
		716	}
		717	#endif
		718
695	m_ndb = new Native(this);	719	m_ndb = new Native(this);
696	if (m_config) {	720	if (m_config) {
697	m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);	721	m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
698	m_config->getConfParam("idxflushmb", &m_flushMb);	722	m_config->getConfParam("idxflushmb", &m_flushMb);
699	}	723	}
	...		...
892	{	916	{
893	int res = -1;	917	int res = -1;
894	if (!m_ndb \|\| !m_ndb->m_isopen)	918	if (!m_ndb \|\| !m_ndb->m_isopen)
895	return -1;	919	return -1;
896		920
897	string term;	921	string term = _term;
		922	#ifndef RCL_INDEX_STRIPCHARS
		923	if (o_index_stripchars)
		924	#endif
898	if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {	925	if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
899	LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));	926	LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
900	return 0;	927	return 0;
901	}	928	}
902		929
903	if (m_stops.isStop(term)) {	930	if (m_stops.isStop(term)) {
904	LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));	931	LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
905	return 0;	932	return 0;
906	}	933	}
	...		...
1012	: TextSplitP(prc),	1039	: TextSplitP(prc),
1013	doc(d), basepos(1), curpos(0), wdfinc(1)	1040	doc(d), basepos(1), curpos(0), wdfinc(1)
1014	{}	1041	{}
1015	// Reimplement text_to_words to add start and end special terms	1042	// Reimplement text_to_words to add start and end special terms
1016	virtual bool text_to_words(const string &in);	1043	virtual bool text_to_words(const string &in);
		1044
1017	void setprefix(const string& pref) {prefix = pref;}	1045	void setprefix(const string& pref)
		1046	{
		1047	if (pref.empty())
		1048	prefix.clear();
		1049	else
		1050	prefix = wrap_prefix(pref);
		1051	}
		1052
1018	void setwdfinc(int i) {wdfinc = i;}	1053	void setwdfinc(int i)
		1054	{
		1055	wdfinc = i;
		1056	}
1019		1057
1020	friend class TermProcIdx;	1058	friend class TermProcIdx;
1021		1059
1022	private:	1060	private:
1023	// If prefix is set, we also add a posting for the prefixed terms	1061	// If prefix is set, we also add a posting for the prefixed terms
	...		...
1145	#ifdef TESTING_XAPIAN_SPELL	1183	#ifdef TESTING_XAPIAN_SPELL
1146	string Db::getSpellingSuggestion(const string& word)	1184	string Db::getSpellingSuggestion(const string& word)
1147	{	1185	{
1148	if (m_ndb == 0)	1186	if (m_ndb == 0)
1149	return string();	1187	return string();
		1188
1150	string term;	1189	string term = word;
		1190
		1191	#ifndef RCL_INDEX_STRIPCHARS
		1192	if (o_index_stripchars)
		1193	#endif
1151	if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {	1194	if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
1152	LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));	1195	LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
1153	return string();	1196	return string();
1154	}	1197	}
		1198
1155	if (!isSpellingCandidate(term))	1199	if (!isSpellingCandidate(term))
1156	return string();	1200	return string();
1157	return m_ndb->xrdb.get_spelling_suggestion(term);	1201	return m_ndb->xrdb.get_spelling_suggestion(term);
1158	}	1202	}
1159	#endif	1203	#endif
	...		...
1257		1301
1258	// The term processing pipeline:	1302	// The term processing pipeline:
1259	TermProcIdx tpidx;	1303	TermProcIdx tpidx;
1260	TermProc *nxt = &tpidx;	1304	TermProc *nxt = &tpidx;
1261	TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;	1305	TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
1262	// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;	1306	//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
		1307
1263	TermProcPrep tpprep(nxt); nxt = &tpprep;	1308	TermProcPrep tpprep(nxt);
		1309	#ifndef RCL_INDEX_STRIPCHARS
		1310	if (o_index_stripchars)
		1311	#endif
		1312	nxt = &tpprep;
1264		1313
1265	TextSplitDb splitter(newdocument, nxt);	1314	TextSplitDb splitter(newdocument, nxt);
1266	tpidx.setTSD(&splitter);	1315	tpidx.setTSD(&splitter);
1267		1316
1268	// If the ipath is like a path, index the last element. This is	1317	// If the ipath is like a path, index the last element. This is
	...		...
1284	{	1333	{
1285	string path = url_gpath(doc.url);	1334	string path = url_gpath(doc.url);
1286	vector<string> vpath;	1335	vector<string> vpath;
1287	stringToTokens(path, vpath, "/");	1336	stringToTokens(path, vpath, "/");
1288	splitter.curpos = 0;	1337	splitter.curpos = 0;
1289	newdocument.add_posting(pathelt_prefix,	1338	newdocument.add_posting(wrap_prefix(pathelt_prefix),
1290	splitter.basepos + splitter.curpos++);	1339	splitter.basepos + splitter.curpos++);
1291	for (vector<string>::iterator it = vpath.begin();	1340	for (vector<string>::iterator it = vpath.begin();
1292	it != vpath.end(); it++){	1341	it != vpath.end(); it++){
1293	if (it->length() > 230) {	1342	if (it->length() > 230) {
1294	// Just truncate it. May still be useful because of wildcards	1343	// Just truncate it. May still be useful because of wildcards
1295	*it = it->substr(0, 230);	1344	*it = it->substr(0, 230);
1296	}	1345	}
1297	newdocument.add_posting(pathelt_prefix + *it,	1346	newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
1298	splitter.basepos + splitter.curpos++);	1347	splitter.basepos + splitter.curpos++);
1299	}	1348	}
1300	}	1349	}
1301		1350
1302	// Index textual metadata. These are all indexed as text with	1351	// Index textual metadata. These are all indexed as text with
	...		...
1337	if (!splitter.text_to_words(doc.text))	1386	if (!splitter.text_to_words(doc.text))
1338	LOGDEB(("Db::addOrUpdate: split failed for main text\n"));	1387	LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
1339		1388
1340	////// Special terms for other metadata. No positions for these.	1389	////// Special terms for other metadata. No positions for these.
1341	// Mime type	1390	// Mime type
1342	newdocument.add_term(mimetype_prefix + doc.mimetype);	1391	newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
1343		1392
1344	// Simple file name indexed unsplit for specific "file name"	1393	// Simple file name indexed unsplit for specific "file name"
1345	// searches. This is not the same as a filename: clause inside the	1394	// searches. This is not the same as a filename: clause inside the
1346	// query language.	1395	// query language.
1347	// We also add a term for the filename extension if any.	1396	// We also add a term for the filename extension if any.
	...		...
1353	// a pathological case anyway	1402	// a pathological case anyway
1354	if (fn.size() > 230)	1403	if (fn.size() > 230)
1355	utf8truncate(fn, 230);	1404	utf8truncate(fn, 230);
1356	string::size_type pos = fn.rfind('.');	1405	string::size_type pos = fn.rfind('.');
1357	if (pos != string::npos && pos != fn.length() - 1) {	1406	if (pos != string::npos && pos != fn.length() - 1) {
1358	newdocument.add_term(fileext_prefix + fn.substr(pos + 1));	1407	newdocument.add_term(wrap_prefix(fileext_prefix) +
		1408	fn.substr(pos + 1));
1359	}	1409	}
1360	newdocument.add_term(unsplitfilename_prefix + fn);	1410	newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
1361	}	1411	}
1362	}	1412	}
1363		1413
1364	// Udi unique term: this is used for file existence/uptodate	1414	// Udi unique term: this is used for file existence/uptodate
1365	// checks, and unique id for the replace_document() call.	1415	// checks, and unique id for the replace_document() call.
	...		...
1374	time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :	1424	time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
1375	doc.dmtime.c_str());	1425	doc.dmtime.c_str());
1376	struct tm *tm = localtime(&mtime);	1426	struct tm *tm = localtime(&mtime);
1377	char buf[9];	1427	char buf[9];
1378	snprintf(buf, 9, "%04d%02d%02d",	1428	snprintf(buf, 9, "%04d%02d%02d",
1379	tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);	1429	tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
		1430	// Date (YYYYMMDD)
1380	newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)	1431	newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf));
		1432	// Month (YYYYMM)
1381	buf[6] = '\0';	1433	buf[6] = '\0';
1382	newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)	1434	newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
		1435	// Year (YYYY)
1383	buf[4] = '\0';	1436	buf[4] = '\0';
1384	newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)	1437	newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf));
1385		1438
1386		1439
1387	//////////////////////////////////////////////////////////////////	1440	//////////////////////////////////////////////////////////////////
1388	// Document data record. omindex has the following nl separated fields:	1441	// Document data record. omindex has the following nl separated fields:
1389	// - url	1442	// - url
	...		...
1854	bool Db::maxYearSpan(int minyear, int maxyear)	1907	bool Db::maxYearSpan(int minyear, int maxyear)
1855	{	1908	{
1856	*minyear = 1000000;	1909	*minyear = 1000000;
1857	*maxyear = -1000000;	1910	*maxyear = -1000000;
1858	TermMatchResult result;	1911	TermMatchResult result;
1859	if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear"))	1912	if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
1860	return false;	1913	return false;
1861	for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();	1914	for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
1862	it != result.entries.end(); it++) {	1915	it != result.entries.end(); it++) {
1863	if (!it->term.empty()) {	1916	if (!it->term.empty()) {
1864	int year = atoi(it->term.c_str()+1);	1917	int year = atoi(it->term.c_str()+1);
	...		...
1919	// the input string prior to these chars.	1972	// the input string prior to these chars.
1920	const string cstr_wildSpecChars = "*?[";	1973	const string cstr_wildSpecChars = "*?[";
1921	const string cstr_regSpecChars = "(.[{";	1974	const string cstr_regSpecChars = "(.[{";
1922		1975
1923	// Find all index terms that match a wildcard or regular expression	1976	// Find all index terms that match a wildcard or regular expression
		1977	// If field is set, we return a list of appropriately prefixed terms (which
		1978	// are going to be used to build a Xapian query).
1924	bool Db::termMatch(MatchType typ, const string &lang,	1979	bool Db::termMatch(MatchType typ, const string &lang,
1925	const string &root,	1980	const string &root,
1926	TermMatchResult& res,	1981	TermMatchResult& res,
1927	int max,	1982	int max,
1928	const string& field,	1983	const string& field)
1929	string *prefixp
1930	)
1931	{	1984	{
1932	if (!m_ndb \|\| !m_ndb->m_isopen)	1985	if (!m_ndb \|\| !m_ndb->m_isopen)
1933	return false;	1986	return false;
1934	Xapian::Database xdb = m_ndb->xdb();	1987	Xapian::Database xdb = m_ndb->xdb();
1935		1988
1936	res.clear();
1937	XAPTRY(res.dbdoccount = xdb.get_doccount();	1989	XAPTRY(res.dbdoccount = xdb.get_doccount();
1938	res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);	1990	res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
1939	if (!m_reason.empty())	1991	if (!m_reason.empty())
1940	return false;	1992	return false;
1941		1993
1942	// Get rid of capitals and accents	1994	// Get rid of capitals and accents
		1995
1943	string droot;	1996	string droot = root;
		1997
		1998	#ifndef RCL_INDEX_STRIPCHARS
		1999	if (o_index_stripchars)
		2000	#endif
1944	if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {	2001	if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
1945	LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));	2002	LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
1946	return false;	2003	return false;
1947	}	2004	}
		2005
1948	string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;	2006	string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
1949		2007
1950	string prefix;	2008	string prefix;
1951	if (!field.empty()) {	2009	if (!field.empty()) {
1952	const FieldTraits *ftp = 0;	2010	const FieldTraits *ftp = 0;
1953	if (!fieldToTraits(field, &ftp) \|\| ftp->pfx.empty()) {	2011	if (!fieldToTraits(field, &ftp) \|\| ftp->pfx.empty()) {
1954	LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",	2012	LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
1955	field.c_str()));	2013	field.c_str()));
1956	} else {	2014	} else {
1957	prefix = ftp->pfx;	2015	prefix = wrap_prefix(ftp->pfx);
1958	}	2016	}
1959	if (prefixp)	2017	}
1960	*prefixp = prefix;	2018	res.prefix = prefix;
1961	}
1962		2019
1963	if (typ == ET_STEM) {	2020	if (typ == ET_STEM) {
1964	if (!stemExpand(lang, root, res, max))	2021	if (!stemExpand(lang, root, res, max))
1965	return false;	2022	return false;
1966	sort(res.entries.begin(), res.entries.end());
1967	unique(res.entries.begin(), res.entries.end());
1968	for (vector<TermMatchEntry>::iterator it = res.entries.begin();	2023	for (vector<TermMatchEntry>::iterator it = res.entries.begin();
1969	it != res.entries.end(); it++) {	2024	it != res.entries.end(); it++) {
1970	XAPTRY(it->wcf = xdb.get_collection_freq(it->term);	2025	XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
1971	it->docs = xdb.get_termfreq(it->term),	2026	it->docs = xdb.get_termfreq(it->term),
1972	xdb, m_reason);	2027	xdb, m_reason);
	...		...
2052	}	2107	}
2053		2108
2054	TermMatchCmpByTerm tcmp;	2109	TermMatchCmpByTerm tcmp;
2055	sort(res.entries.begin(), res.entries.end(), tcmp);	2110	sort(res.entries.begin(), res.entries.end(), tcmp);
2056	TermMatchTermEqual teq;	2111	TermMatchTermEqual teq;
		2112	vector<TermMatchEntry>::iterator uit =
2057	unique(res.entries.begin(), res.entries.end(), teq);	2113	unique(res.entries.begin(), res.entries.end(), teq);
		2114	res.entries.resize(uit - res.entries.begin());
2058	TermMatchCmpByWcf wcmp;	2115	TermMatchCmpByWcf wcmp;
2059	sort(res.entries.begin(), res.entries.end(), wcmp);	2116	sort(res.entries.begin(), res.entries.end(), wcmp);
2060	if (max > 0) {	2117	if (max > 0) {
2061	res.entries.resize(MIN(res.entries.size(), (unsigned int)max));	2118	res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
2062	}	2119	}