recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [d9e603] .. [e5543b]

Switch to unified view


...
#include "autoconfig.h"

#include <stdio.h>
#include <cstring>
#include <unistd.h>


#include <math.h>
#include <time.h>

#include <string>
#include <vector>
#include <algorithm>
#include <sstream>


using namespace std;


#include "xapian.h"

#include "rclconfig.h"
#include "debuglog.h"
...
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
static const string cstr_RCL_IDX_VERSION("1");

static const string cstr_mbreaks("rclmbreaks");


namespace Rcl {


// Some prefixes that we could get from the fields file, but are not going
// to ever change.
static const string fileext_prefix = "XE";
const string mimetype_prefix = "T";
...
const string page_break_term = "XXPG/";
#endif

// Field name for the unsplit file name. Has to exist in the field file 
// because of usage in termmatch()
const string unsplitFilenameFieldName = "rclUnsplitFN";
static const string unsplitfilename_prefix = "XSFS";

string version_string(){
    return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
        string(Xapian::version_string());
...
    LOGERR(("Db::needUpdate: error while checking existence: %s\n", 
        m_reason.c_str()));
    return true;
}


// Return existing stem db languages
vector<string> Db::getStemLangs()
{
    LOGDEB(("Db::getStemLang\n"));
    vector<string> langs;
...
    LOGERR(("Db::purgeFile: %s\n", ermsg.c_str()));
    }
    return false;
}



















































































































bool Db::dbStats(DbStats& res)
{
    if (!m_ndb || !m_ndb->m_isopen)
    return false;
    Xapian::Database xdb = m_ndb->xrdb;
...
       res.mindoclen = xdb.get_doclength_lower_bound();
       res.maxdoclen = xdb.get_doclength_upper_bound();
       , xdb, m_reason);
    if (!m_reason.empty())
        return false;











































































































































































































































































































































































    return true;
}

// Retrieve document defined by Unique doc identifier. This is used
// by the GUI history feature and by open parent/getenclosing
...

    LOGERR(("Db::getDoc: %s\n", m_reason.c_str()));
    return false;
}

} // End namespace Rcl



	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
	...		...
17	#include "autoconfig.h"	17	#include "autoconfig.h"
18		18
19	#include <stdio.h>	19	#include <stdio.h>
20	#include <cstring>	20	#include <cstring>
21	#include <unistd.h>	21	#include <unistd.h>
22	#include <fnmatch.h>
23	#include <regex.h>
24	#include <math.h>	22	#include <math.h>
25	#include <time.h>	23	#include <time.h>
26		24
27	#include <string>	25	#include <string>
28	#include <vector>	26	#include <vector>
29	#include <algorithm>	27	#include <algorithm>
30	#include <sstream>	28	#include <sstream>
31		29
32	#ifndef NO_NAMESPACES
33	using namespace std;	30	using namespace std;
34	#endif /* NO_NAMESPACES */
35		31
36	#include "xapian.h"	32	#include "xapian.h"
37		33
38	#include "rclconfig.h"	34	#include "rclconfig.h"
39	#include "debuglog.h"	35	#include "debuglog.h"
	...		...
63	static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");	59	static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
64	static const string cstr_RCL_IDX_VERSION("1");	60	static const string cstr_RCL_IDX_VERSION("1");
65		61
66	static const string cstr_mbreaks("rclmbreaks");	62	static const string cstr_mbreaks("rclmbreaks");
67		63
68	#ifndef NO_NAMESPACES
69	namespace Rcl {	64	namespace Rcl {
70	#endif
71		65
72	// Some prefixes that we could get from the fields file, but are not going	66	// Some prefixes that we could get from the fields file, but are not going
73	// to ever change.	67	// to ever change.
74	static const string fileext_prefix = "XE";	68	static const string fileext_prefix = "XE";
75	const string mimetype_prefix = "T";	69	const string mimetype_prefix = "T";
	...		...
92	const string page_break_term = "XXPG/";	86	const string page_break_term = "XXPG/";
93	#endif	87	#endif
94		88
95	// Field name for the unsplit file name. Has to exist in the field file	89	// Field name for the unsplit file name. Has to exist in the field file
96	// because of usage in termmatch()	90	// because of usage in termmatch()
97	static const string unsplitFilenameFieldName = "rclUnsplitFN";	91	const string unsplitFilenameFieldName = "rclUnsplitFN";
98	static const string unsplitfilename_prefix = "XSFS";	92	static const string unsplitfilename_prefix = "XSFS";
99		93
100	string version_string(){	94	string version_string(){
101	return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +	95	return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
102	string(Xapian::version_string());	96	string(Xapian::version_string());
	...		...
1356	LOGERR(("Db::needUpdate: error while checking existence: %s\n",	1350	LOGERR(("Db::needUpdate: error while checking existence: %s\n",
1357	m_reason.c_str()));	1351	m_reason.c_str()));
1358	return true;	1352	return true;
1359	}	1353	}
1360		1354
1361
1362	// Return existing stem db languages	1355	// Return existing stem db languages
1363	vector<string> Db::getStemLangs()	1356	vector<string> Db::getStemLangs()
1364	{	1357	{
1365	LOGDEB(("Db::getStemLang\n"));	1358	LOGDEB(("Db::getStemLang\n"));
1366	vector<string> langs;	1359	vector<string> langs;
	...		...
1579	LOGERR(("Db::purgeFile: %s\n", ermsg.c_str()));	1572	LOGERR(("Db::purgeFile: %s\n", ermsg.c_str()));
1580	}	1573	}
1581	return false;	1574	return false;
1582	}	1575	}
1583		1576
1584	// File name wild card expansion. This is a specialisation ot termMatch
1585	bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
1586	{
1587	string pattern = fnexp;
1588	names.clear();
1589
1590	// If pattern is not capitalized, not quoted (quoted pattern can't
1591	// get here currently anyway), and has no wildcards, we add * at
1592	// each end: match any substring
1593	if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
1594	pattern = pattern.substr(1, pattern.size() -2);
1595	} else if (pattern.find_first_of(cstr_minwilds) == string::npos &&
1596	!unaciscapital(pattern)) {
1597	pattern = "" + pattern + "";
1598	} // else let it be
1599
1600	LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
1601
1602	// We inconditionnally lowercase and strip the pattern, as is done
1603	// during indexing. This seems to be the only sane possible
1604	// approach with file names and wild cards. termMatch does
1605	// stripping conditionally on indexstripchars.
1606	string pat1;
1607	if (unacmaybefold(pattern, pat1, "UTF-8", UNACOP_UNACFOLD)) {
1608	pattern.swap(pat1);
1609	}
1610
1611	TermMatchResult result;
1612	if (!termMatch(ET_WILD, string(), pattern, result, max,
1613	unsplitFilenameFieldName))
1614	return false;
1615	for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
1616	it != result.entries.end(); it++)
1617	names.push_back(it->term);
1618
1619	if (names.empty()) {
1620	// Build an impossible query: we know its impossible because we
1621	// control the prefixes!
1622	names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
1623	}
1624	return true;
1625	}
1626
1627	// Walk the Y terms and return min/max
1628	bool Db::maxYearSpan(int minyear, int maxyear)
1629	{
1630	LOGDEB(("Rcl::Db:maxYearSpan\n"));
1631	*minyear = 1000000;
1632	*maxyear = -1000000;
1633	TermMatchResult result;
1634	if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear")) {
1635	LOGINFO(("Rcl::Db:maxYearSpan: termMatch failed\n"));
1636	return false;
1637	}
1638	for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
1639	it != result.entries.end(); it++) {
1640	if (!it->term.empty()) {
1641	int year = atoi(strip_prefix(it->term).c_str());
1642	if (year < *minyear)
1643	*minyear = year;
1644	if (year > *maxyear)
1645	*maxyear = year;
1646	}
1647	}
1648	return true;
1649	}
1650
1651
1652	class TermMatchCmpByWcf {
1653	public:
1654	int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
1655	return r.wcf - l.wcf < 0;
1656	}
1657	};
1658	class TermMatchCmpByTerm {
1659	public:
1660	int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
1661	return l.term.compare(r.term) > 0;
1662	}
1663	};
1664	class TermMatchTermEqual {
1665	public:
1666	int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
1667	return !l.term.compare(r.term);
1668	}
1669	};
1670
#ifdef RCL_INDEX_STRIPCHARS
// Stem-expand a term through the stem database and append the
// expansions to the result entries.
//
// @param langs passed to StemDb::stemExpand().
// @param term the term to expand.
// @param result output: expansions are appended to result.entries.
// @return false if the index is not open or the stem expansion failed.
bool Db::stemExpand(const string &langs, const string &term,
                    TermMatchResult& result)
{
    if (m_ndb == 0 || m_ndb->m_isopen == false)
        return false;
    vector<string> exp;
    StemDb db(m_ndb->xrdb);
    if (!db.stemExpand(langs, term, exp))
        return false;
    // Each string is converted to a TermMatchEntry on insertion.
    result.entries.insert(result.entries.end(), exp.begin(), exp.end());
    return true;
}
#endif
1685
1686	/** Add prefix to all strings in list.
1687	* @param prefix already wrapped prefix
1688	*/
1689	static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
1690	{
1691	if (prefix.empty())
1692	return;
1693	for (vector<TermMatchEntry>::iterator it = terms.begin();
1694	it != terms.end(); it++)
1695	it->term.insert(0, prefix);
1696	}
1697
1698	bool Db::dbStats(DbStats& res)	1577	bool Db::dbStats(DbStats& res)
1699	{	1578	{
1700	if (!m_ndb \|\| !m_ndb->m_isopen)	1579	if (!m_ndb \|\| !m_ndb->m_isopen)
1701	return false;	1580	return false;
1702	Xapian::Database xdb = m_ndb->xrdb;	1581	Xapian::Database xdb = m_ndb->xrdb;
	...		...
1706	res.mindoclen = xdb.get_doclength_lower_bound();	1585	res.mindoclen = xdb.get_doclength_lower_bound();
1707	res.maxdoclen = xdb.get_doclength_upper_bound();	1586	res.maxdoclen = xdb.get_doclength_upper_bound();
1708	, xdb, m_reason);	1587	, xdb, m_reason);
1709	if (!m_reason.empty())	1588	if (!m_reason.empty())
1710	return false;	1589	return false;
1711	return true;
1712	}
1713
1714	// Find all index terms that match a wildcard or regular expression If
1715	// field is set, we return a list of appropriately prefixed terms
1716	// (which are going to be used to build a Xapian query). This routine
1717	// performs case/diacritics/stemming expansion and possibly calls
1718	// idxTermMatch for wildcard/regexp expansion and filtering against
1719	// the main index terms.
1720	bool Db::termMatch(int typ_sens, const string &lang,
1721	const string &_term,
1722	TermMatchResult& res,
1723	int max,
1724	const string& field)
1725	{
1726	int matchtyp = matchTypeTp(typ_sens);
1727	if (!m_ndb \|\| !m_ndb->m_isopen)
1728	return false;
1729	Xapian::Database xrdb = m_ndb->xrdb;
1730
1731	bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
1732	bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
1733
1734	bool stripped = false;
1735	#ifdef RCL_INDEX_STRIPCHARS
1736	stripped = true;
1737	#else
1738	stripped = o_index_stripchars;
1739	#endif
1740
1741	LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
1742	"max %d field [%s] stripped %d\n",
1743	matchtyp, diac_sensitive, case_sensitive, lang.c_str(),
1744	_term.c_str(), max, field.c_str(), stripped));
1745
1746	// If index is stripped, no case or diac expansion can be needed:
1747	// for the processing inside this routine, everything looks like
1748	// we're all-sensitive: no use of expansion db.
1749	// Also, convert input to lowercase and strip its accents.
1750	string term = _term;
1751	if (stripped) {
1752	diac_sensitive = case_sensitive = true;
1753	if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
1754	LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
1755	return false;
1756	}
1757	}
1758
1759	#ifndef RCL_INDEX_STRIPCHARS
1760	// The case/diac expansion db
1761	SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
1762	XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
1763	#endif // RCL_INDEX_STRIPCHARS
1764
1765
1766	if (matchtyp == ET_WILD \|\| matchtyp == ET_REGEXP) {
1767	#ifdef RCL_INDEX_STRIPCHARS
1768	idxTermMatch(typ_sens, lang, term, res, max, field);
1769	#else
1770	RefCntr<StrMatcher> matcher;
1771	if (matchtyp == ET_WILD) {
1772	matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
1773	} else {
1774	matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
1775	}
1776	if (!diac_sensitive \|\| !case_sensitive) {
1777	// Perform case/diac expansion on the exp as appropriate and
1778	// expand the result.
1779	vector<string> exp;
1780	if (diac_sensitive) {
1781	// Expand for diacritics and case, filtering for same diacritics
1782	SynTermTransUnac foldtrans(UNACOP_FOLD);
1783	synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
1784	} else if (case_sensitive) {
1785	// Expand for diacritics and case, filtering for same case
1786	SynTermTransUnac unactrans(UNACOP_UNAC);
1787	synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
1788	} else {
1789	// Expand for diacritics and case, no filtering
1790	synac.synKeyExpand(matcher.getptr(), exp);
1791	}
1792	// Retrieve additional info and filter against the index itself
1793	for (vector<string>::const_iterator it = exp.begin();
1794	it != exp.end(); it++) {
1795	idxTermMatch(ET_NONE, "", *it, res, max, field);
1796	}
1797	} else {
1798	idxTermMatch(typ_sens, lang, term, res, max, field);
1799	}
1800
1801	#endif // RCL_INDEX_STRIPCHARS
1802
1803	} else {
1804	// Expansion is STEM or NONE (which may still need case/diac exp)
1805
1806	#ifdef RCL_INDEX_STRIPCHARS
1807
1808	idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
1809
1810	#else
1811	vector<string> lexp;
1812	if (diac_sensitive && case_sensitive) {
1813	// No case/diac expansion
1814	lexp.push_back(term);
1815	} else if (diac_sensitive) {
1816	// Expand for accents and case, filtering for same accents,
1817	SynTermTransUnac foldtrans(UNACOP_FOLD);
1818	synac.synExpand(term, lexp, &foldtrans);
1819	} else if (case_sensitive) {
1820	// Expand for accents and case, filtering for same case
1821	SynTermTransUnac unactrans(UNACOP_UNAC);
1822	synac.synExpand(term, lexp, &unactrans);
1823	} else {
1824	// We are neither accent- nor case- sensitive and may need stem
1825	// expansion or not. Expand for accents and case
1826	synac.synExpand(term, lexp);
1827	}
1828
1829	if (matchTypeTp(typ_sens) == ET_STEM) {
1830	// Need stem expansion. Lowercase the result of accent and case
1831	// expansion for input to stemdb.
1832	for (unsigned int i = 0; i < lexp.size(); i++) {
1833	string lower;
1834	unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
1835	lexp[i] = lower;
1836	}
1837	sort(lexp.begin(), lexp.end());
1838	lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
1839	StemDb sdb(xrdb);
1840	vector<string> exp1;
1841	for (vector<string>::const_iterator it = lexp.begin();
1842	it != lexp.end(); it++) {
1843	sdb.stemExpand(lang, *it, exp1);
1844	}
1845	LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
1846
1847	// Expand the resulting list for case (all stemdb content
1848	// is lowercase)
1849	lexp.clear();
1850	for (vector<string>::const_iterator it = exp1.begin();
1851	it != exp1.end(); it++) {
1852	synac.synExpand(*it, lexp);
1853	}
1854	sort(lexp.begin(), lexp.end());
1855	lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
1856	}
1857
1858	// Filter the result and get the stats, possibly add prefixes.
1859	LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
1860	for (vector<string>::const_iterator it = lexp.begin();
1861	it != lexp.end(); it++) {
1862	idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
1863	}
1864	}
1865	#endif
1866
1867	TermMatchCmpByTerm tcmp;
1868	sort(res.entries.begin(), res.entries.end(), tcmp);
1869	TermMatchTermEqual teq;
1870	vector<TermMatchEntry>::iterator uit =
1871	unique(res.entries.begin(), res.entries.end(), teq);
1872	res.entries.resize(uit - res.entries.begin());
1873	TermMatchCmpByWcf wcmp;
1874	sort(res.entries.begin(), res.entries.end(), wcmp);
1875	if (max > 0) {
1876	// Would need a small max and big stem expansion...
1877	res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
1878	}
1879	return true;
1880	}
1881
1882	// Second phase of wildcard/regexp term expansion after case/diac
1883	// expansion: expand against main index terms
1884	bool Db::idxTermMatch(int typ_sens, const string &lang,
1885	const string &root,
1886	TermMatchResult& res,
1887	int max,
1888	const string& field)
1889	{
1890	int typ = matchTypeTp(typ_sens);
1891
1892	#ifndef RCL_INDEX_STRIPCHARS
1893	if (typ == ET_STEM) {
1894	LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
1895	abort();
1896	}
1897	#endif
1898
1899	if (!m_ndb \|\| !m_ndb->m_isopen)
1900	return false;
1901	Xapian::Database xdb = m_ndb->xrdb;
1902
1903	string prefix;
1904	if (!field.empty()) {
1905	const FieldTraits *ftp = 0;
1906	if (!fieldToTraits(field, &ftp) \|\| ftp->pfx.empty()) {
1907	LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
1908	field.c_str()));
1909	} else {
1910	prefix = wrap_prefix(ftp->pfx);
1911	}
1912	}
1913	res.prefix = prefix;
1914
1915	#ifdef RCL_INDEX_STRIPCHARS
1916	if (typ == ET_STEM) {
1917	if (!stemExpand(lang, root, res))
1918	return false;
1919	for (vector<TermMatchEntry>::iterator it = res.entries.begin();
1920	it != res.entries.end(); it++) {
1921	XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
1922	it->docs = xdb.get_termfreq(it->term),
1923	xdb, m_reason);
1924	if (!m_reason.empty())
1925	return false;
1926	LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
1927	}
1928	if (!prefix.empty())
1929	addPrefix(res.entries, prefix);
1930	} else
1931	#endif
1932	{
1933	RefCntr<StrMatcher> matcher;
1934	if (typ == ET_REGEXP) {
1935	matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
1936	if (!matcher->ok()) {
1937	LOGERR(("termMatch: regcomp failed: %s\n",
1938	matcher->getreason().c_str()))
1939	return false;
1940	}
1941	} else if (typ == ET_WILD) {
1942	matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
1943	}
1944
1945	// Find the initial section before any special char
1946	string::size_type es = string::npos;
1947	if (matcher.isNotNull()) {
1948	es = matcher->baseprefixlen();
1949	}
1950	string is;
1951	switch (es) {
1952	case string::npos: is = prefix + root; break;
1953	case 0: is = prefix; break;
1954	default: is = prefix + root.substr(0, es); break;
1955	}
1956	LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
1957
1958	for (int tries = 0; tries < 2; tries++) {
1959	try {
1960	Xapian::TermIterator it = xdb.allterms_begin();
1961	if (!is.empty())
1962	it.skip_to(is.c_str());
1963	for (int rcnt = 0; it != xdb.allterms_end(); it++) {
1964	// If we're beyond the terms matching the initial
1965	// string, end
1966	if (!is.empty() && (*it).find(is) != 0)
1967	break;
1968	string term;
1969	if (!prefix.empty())
1970	term = (*it).substr(prefix.length());
1971	else
1972	term = *it;
1973
1974	if (matcher.isNotNull() && !matcher->match(term))
1975	continue;
1976
1977	res.entries.push_back(
1978	TermMatchEntry(it, xdb.get_collection_freq(it),
1979	it.get_termfreq()));
1980
1981	// The problem with truncating here is that this is done
1982	// alphabetically and we may not keep the most frequent
1983	// terms. OTOH, not doing it may stall the program if
1984	// we are walking the whole term list. We compromise
1985	// by cutting at 2*max
1986	if (max > 0 && ++rcnt >= 2*max)
1987	break;
1988	}
1989	m_reason.erase();
1990	break;
1991	} catch (const Xapian::DatabaseModifiedError &e) {
1992	m_reason = e.get_msg();
1993	xdb.reopen();
1994	continue;
1995	} XCATCHERROR(m_reason);
1996	break;
1997	}
1998	if (!m_reason.empty()) {
1999	LOGERR(("termMatch: %s\n", m_reason.c_str()));
2000	return false;
2001	}
2002	}
2003
2004	return true;
2005	}
2006
2007	/** Term list walking. */
2008	class TermIter {
2009	public:
2010	Xapian::TermIterator it;
2011	Xapian::Database db;
2012	};
2013	TermIter *Db::termWalkOpen()
2014	{
2015	if (!m_ndb \|\| !m_ndb->m_isopen)
2016	return 0;
2017	TermIter *tit = new TermIter;