|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
|
... |
|
... |
17 |
#include "autoconfig.h"
|
17 |
#include "autoconfig.h"
|
18 |
|
18 |
|
19 |
#include <stdio.h>
|
19 |
#include <stdio.h>
|
20 |
#include <cstring>
|
20 |
#include <cstring>
|
21 |
#include <unistd.h>
|
21 |
#include <unistd.h>
|
22 |
#include <fnmatch.h>
|
|
|
23 |
#include <regex.h>
|
|
|
24 |
#include <math.h>
|
22 |
#include <math.h>
|
25 |
#include <time.h>
|
23 |
#include <time.h>
|
26 |
|
24 |
|
27 |
#include <string>
|
25 |
#include <string>
|
28 |
#include <vector>
|
26 |
#include <vector>
|
29 |
#include <algorithm>
|
27 |
#include <algorithm>
|
30 |
#include <sstream>
|
28 |
#include <sstream>
|
31 |
|
29 |
|
32 |
#ifndef NO_NAMESPACES
|
|
|
33 |
using namespace std;
|
30 |
using namespace std;
|
34 |
#endif /* NO_NAMESPACES */
|
|
|
35 |
|
31 |
|
36 |
#include "xapian.h"
|
32 |
#include "xapian.h"
|
37 |
|
33 |
|
38 |
#include "rclconfig.h"
|
34 |
#include "rclconfig.h"
|
39 |
#include "debuglog.h"
|
35 |
#include "debuglog.h"
|
|
... |
|
... |
63 |
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
59 |
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
64 |
static const string cstr_RCL_IDX_VERSION("1");
|
60 |
static const string cstr_RCL_IDX_VERSION("1");
|
65 |
|
61 |
|
66 |
static const string cstr_mbreaks("rclmbreaks");
|
62 |
static const string cstr_mbreaks("rclmbreaks");
|
67 |
|
63 |
|
68 |
#ifndef NO_NAMESPACES
|
|
|
69 |
namespace Rcl {
|
64 |
namespace Rcl {
|
70 |
#endif
|
|
|
71 |
|
65 |
|
72 |
// Some prefixes that we could get from the fields file, but are not going
|
66 |
// Some prefixes that we could get from the fields file, but are not going
|
73 |
// to ever change.
|
67 |
// to ever change.
|
74 |
static const string fileext_prefix = "XE";
|
68 |
static const string fileext_prefix = "XE";
|
75 |
const string mimetype_prefix = "T";
|
69 |
const string mimetype_prefix = "T";
|
|
... |
|
... |
92 |
const string page_break_term = "XXPG/";
|
86 |
const string page_break_term = "XXPG/";
|
93 |
#endif
|
87 |
#endif
|
94 |
|
88 |
|
95 |
// Field name for the unsplit file name. Has to exist in the field file
|
89 |
// Field name for the unsplit file name. Has to exist in the field file
|
96 |
// because of usage in termmatch()
|
90 |
// because of usage in termmatch()
|
97 |
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
91 |
const string unsplitFilenameFieldName = "rclUnsplitFN";
|
98 |
static const string unsplitfilename_prefix = "XSFS";
|
92 |
static const string unsplitfilename_prefix = "XSFS";
|
99 |
|
93 |
|
100 |
string version_string(){
|
94 |
string version_string(){
|
101 |
return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
|
95 |
return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
|
102 |
string(Xapian::version_string());
|
96 |
string(Xapian::version_string());
|
|
... |
|
... |
1356 |
LOGERR(("Db::needUpdate: error while checking existence: %s\n",
|
1350 |
LOGERR(("Db::needUpdate: error while checking existence: %s\n",
|
1357 |
m_reason.c_str()));
|
1351 |
m_reason.c_str()));
|
1358 |
return true;
|
1352 |
return true;
|
1359 |
}
|
1353 |
}
|
1360 |
|
1354 |
|
1361 |
|
|
|
1362 |
// Return existing stem db languages
|
1355 |
// Return existing stem db languages
|
1363 |
vector<string> Db::getStemLangs()
|
1356 |
vector<string> Db::getStemLangs()
|
1364 |
{
|
1357 |
{
|
1365 |
LOGDEB(("Db::getStemLang\n"));
|
1358 |
LOGDEB(("Db::getStemLang\n"));
|
1366 |
vector<string> langs;
|
1359 |
vector<string> langs;
|
|
... |
|
... |
1579 |
LOGERR(("Db::purgeFile: %s\n", ermsg.c_str()));
|
1572 |
LOGERR(("Db::purgeFile: %s\n", ermsg.c_str()));
|
1580 |
}
|
1573 |
}
|
1581 |
return false;
|
1574 |
return false;
|
1582 |
}
|
1575 |
}
|
1583 |
|
1576 |
|
1584 |
// File name wild card expansion. This is a specialisation ot termMatch
|
|
|
1585 |
bool Db::filenameWildExp(const string& fnexp, vector<string>& names, int max)
|
|
|
1586 |
{
|
|
|
1587 |
string pattern = fnexp;
|
|
|
1588 |
names.clear();
|
|
|
1589 |
|
|
|
1590 |
// If pattern is not capitalized, not quoted (quoted pattern can't
|
|
|
1591 |
// get here currently anyway), and has no wildcards, we add * at
|
|
|
1592 |
// each end: match any substring
|
|
|
1593 |
if (pattern[0] == '"' && pattern[pattern.size()-1] == '"') {
|
|
|
1594 |
pattern = pattern.substr(1, pattern.size() -2);
|
|
|
1595 |
} else if (pattern.find_first_of(cstr_minwilds) == string::npos &&
|
|
|
1596 |
!unaciscapital(pattern)) {
|
|
|
1597 |
pattern = "*" + pattern + "*";
|
|
|
1598 |
} // else let it be
|
|
|
1599 |
|
|
|
1600 |
LOGDEB(("Rcl::Db::filenameWildExp: pattern: [%s]\n", pattern.c_str()));
|
|
|
1601 |
|
|
|
1602 |
// We inconditionnally lowercase and strip the pattern, as is done
|
|
|
1603 |
// during indexing. This seems to be the only sane possible
|
|
|
1604 |
// approach with file names and wild cards. termMatch does
|
|
|
1605 |
// stripping conditionally on indexstripchars.
|
|
|
1606 |
string pat1;
|
|
|
1607 |
if (unacmaybefold(pattern, pat1, "UTF-8", UNACOP_UNACFOLD)) {
|
|
|
1608 |
pattern.swap(pat1);
|
|
|
1609 |
}
|
|
|
1610 |
|
|
|
1611 |
TermMatchResult result;
|
|
|
1612 |
if (!termMatch(ET_WILD, string(), pattern, result, max,
|
|
|
1613 |
unsplitFilenameFieldName))
|
|
|
1614 |
return false;
|
|
|
1615 |
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
|
|
1616 |
it != result.entries.end(); it++)
|
|
|
1617 |
names.push_back(it->term);
|
|
|
1618 |
|
|
|
1619 |
if (names.empty()) {
|
|
|
1620 |
// Build an impossible query: we know its impossible because we
|
|
|
1621 |
// control the prefixes!
|
|
|
1622 |
names.push_back(wrap_prefix("XNONE") + "NoMatchingTerms");
|
|
|
1623 |
}
|
|
|
1624 |
return true;
|
|
|
1625 |
}
|
|
|
1626 |
|
|
|
1627 |
// Walk the Y terms and return min/max
|
|
|
1628 |
bool Db::maxYearSpan(int *minyear, int *maxyear)
|
|
|
1629 |
{
|
|
|
1630 |
LOGDEB(("Rcl::Db:maxYearSpan\n"));
|
|
|
1631 |
*minyear = 1000000;
|
|
|
1632 |
*maxyear = -1000000;
|
|
|
1633 |
TermMatchResult result;
|
|
|
1634 |
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear")) {
|
|
|
1635 |
LOGINFO(("Rcl::Db:maxYearSpan: termMatch failed\n"));
|
|
|
1636 |
return false;
|
|
|
1637 |
}
|
|
|
1638 |
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
|
|
1639 |
it != result.entries.end(); it++) {
|
|
|
1640 |
if (!it->term.empty()) {
|
|
|
1641 |
int year = atoi(strip_prefix(it->term).c_str());
|
|
|
1642 |
if (year < *minyear)
|
|
|
1643 |
*minyear = year;
|
|
|
1644 |
if (year > *maxyear)
|
|
|
1645 |
*maxyear = year;
|
|
|
1646 |
}
|
|
|
1647 |
}
|
|
|
1648 |
return true;
|
|
|
1649 |
}
|
|
|
1650 |
|
|
|
1651 |
|
|
|
1652 |
class TermMatchCmpByWcf {
|
|
|
1653 |
public:
|
|
|
1654 |
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
|
1655 |
return r.wcf - l.wcf < 0;
|
|
|
1656 |
}
|
|
|
1657 |
};
|
|
|
1658 |
class TermMatchCmpByTerm {
|
|
|
1659 |
public:
|
|
|
1660 |
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
|
1661 |
return l.term.compare(r.term) > 0;
|
|
|
1662 |
}
|
|
|
1663 |
};
|
|
|
1664 |
class TermMatchTermEqual {
|
|
|
1665 |
public:
|
|
|
1666 |
int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
|
|
|
1667 |
return !l.term.compare(r.term);
|
|
|
1668 |
}
|
|
|
1669 |
};
|
|
|
1670 |
|
|
|
1671 |
#ifdef RCL_INDEX_STRIPCHARS
// Stem-expand a term through the stem database and append the resulting
// terms to result.entries.
//
// @param langs stemming language(s), passed through to StemDb
// @param term the term to expand
// @param result expansions are appended to result.entries
// @return false if the index is not open or the expansion failed
bool Db::stemExpand(const string &langs, const string &term,
                    TermMatchResult& result)
{
    if (!m_ndb || !m_ndb->m_isopen) {
        return false;
    }

    vector<string> expansions;
    StemDb stemdb(m_ndb->xrdb);
    if (stemdb.stemExpand(langs, term, expansions)) {
        // Entries are built from the plain term strings
        result.entries.insert(result.entries.end(),
                              expansions.begin(), expansions.end());
        return true;
    }
    return false;
}
#endif
|
|
|
1685 |
|
|
|
1686 |
/** Add prefix to all strings in list.
|
|
|
1687 |
* @param prefix already wrapped prefix
|
|
|
1688 |
*/
|
|
|
1689 |
static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
|
|
|
1690 |
{
|
|
|
1691 |
if (prefix.empty())
|
|
|
1692 |
return;
|
|
|
1693 |
for (vector<TermMatchEntry>::iterator it = terms.begin();
|
|
|
1694 |
it != terms.end(); it++)
|
|
|
1695 |
it->term.insert(0, prefix);
|
|
|
1696 |
}
|
|
|
1697 |
|
|
|
1698 |
bool Db::dbStats(DbStats& res)
|
1577 |
bool Db::dbStats(DbStats& res)
|
1699 |
{
|
1578 |
{
|
1700 |
if (!m_ndb || !m_ndb->m_isopen)
|
1579 |
if (!m_ndb || !m_ndb->m_isopen)
|
1701 |
return false;
|
1580 |
return false;
|
1702 |
Xapian::Database xdb = m_ndb->xrdb;
|
1581 |
Xapian::Database xdb = m_ndb->xrdb;
|
|
... |
|
... |
1706 |
res.mindoclen = xdb.get_doclength_lower_bound();
|
1585 |
res.mindoclen = xdb.get_doclength_lower_bound();
|
1707 |
res.maxdoclen = xdb.get_doclength_upper_bound();
|
1586 |
res.maxdoclen = xdb.get_doclength_upper_bound();
|
1708 |
, xdb, m_reason);
|
1587 |
, xdb, m_reason);
|
1709 |
if (!m_reason.empty())
|
1588 |
if (!m_reason.empty())
|
1710 |
return false;
|
1589 |
return false;
|
1711 |
return true;
|
|
|
1712 |
}
|
|
|
1713 |
|
|
|
1714 |
// Find all index terms that match a wildcard or regular expression If
|
|
|
1715 |
// field is set, we return a list of appropriately prefixed terms
|
|
|
1716 |
// (which are going to be used to build a Xapian query). This routine
|
|
|
1717 |
// performs case/diacritics/stemming expansion and possibly calls
|
|
|
1718 |
// idxTermMatch for wildcard/regexp expansion and filtering against
|
|
|
1719 |
// the main index terms.
|
|
|
1720 |
bool Db::termMatch(int typ_sens, const string &lang,
|
|
|
1721 |
const string &_term,
|
|
|
1722 |
TermMatchResult& res,
|
|
|
1723 |
int max,
|
|
|
1724 |
const string& field)
|
|
|
1725 |
{
|
|
|
1726 |
int matchtyp = matchTypeTp(typ_sens);
|
|
|
1727 |
if (!m_ndb || !m_ndb->m_isopen)
|
|
|
1728 |
return false;
|
|
|
1729 |
Xapian::Database xrdb = m_ndb->xrdb;
|
|
|
1730 |
|
|
|
1731 |
bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
|
|
|
1732 |
bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
|
|
|
1733 |
|
|
|
1734 |
bool stripped = false;
|
|
|
1735 |
#ifdef RCL_INDEX_STRIPCHARS
|
|
|
1736 |
stripped = true;
|
|
|
1737 |
#else
|
|
|
1738 |
stripped = o_index_stripchars;
|
|
|
1739 |
#endif
|
|
|
1740 |
|
|
|
1741 |
LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
|
|
|
1742 |
"max %d field [%s] stripped %d\n",
|
|
|
1743 |
matchtyp, diac_sensitive, case_sensitive, lang.c_str(),
|
|
|
1744 |
_term.c_str(), max, field.c_str(), stripped));
|
|
|
1745 |
|
|
|
1746 |
// If index is stripped, no case or diac expansion can be needed:
|
|
|
1747 |
// for the processing inside this routine, everything looks like
|
|
|
1748 |
// we're all-sensitive: no use of expansion db.
|
|
|
1749 |
// Also, convert input to lowercase and strip its accents.
|
|
|
1750 |
string term = _term;
|
|
|
1751 |
if (stripped) {
|
|
|
1752 |
diac_sensitive = case_sensitive = true;
|
|
|
1753 |
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
|
|
1754 |
LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
|
|
|
1755 |
return false;
|
|
|
1756 |
}
|
|
|
1757 |
}
|
|
|
1758 |
|
|
|
1759 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
1760 |
// The case/diac expansion db
|
|
|
1761 |
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
|
|
1762 |
XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
|
|
|
1763 |
#endif // RCL_INDEX_STRIPCHARS
|
|
|
1764 |
|
|
|
1765 |
|
|
|
1766 |
if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
|
|
|
1767 |
#ifdef RCL_INDEX_STRIPCHARS
|
|
|
1768 |
idxTermMatch(typ_sens, lang, term, res, max, field);
|
|
|
1769 |
#else
|
|
|
1770 |
RefCntr<StrMatcher> matcher;
|
|
|
1771 |
if (matchtyp == ET_WILD) {
|
|
|
1772 |
matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
|
|
|
1773 |
} else {
|
|
|
1774 |
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
|
|
|
1775 |
}
|
|
|
1776 |
if (!diac_sensitive || !case_sensitive) {
|
|
|
1777 |
// Perform case/diac expansion on the exp as appropriate and
|
|
|
1778 |
// expand the result.
|
|
|
1779 |
vector<string> exp;
|
|
|
1780 |
if (diac_sensitive) {
|
|
|
1781 |
// Expand for diacritics and case, filtering for same diacritics
|
|
|
1782 |
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
|
1783 |
synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
|
|
|
1784 |
} else if (case_sensitive) {
|
|
|
1785 |
// Expand for diacritics and case, filtering for same case
|
|
|
1786 |
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
|
1787 |
synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
|
|
|
1788 |
} else {
|
|
|
1789 |
// Expand for diacritics and case, no filtering
|
|
|
1790 |
synac.synKeyExpand(matcher.getptr(), exp);
|
|
|
1791 |
}
|
|
|
1792 |
// Retrieve additional info and filter against the index itself
|
|
|
1793 |
for (vector<string>::const_iterator it = exp.begin();
|
|
|
1794 |
it != exp.end(); it++) {
|
|
|
1795 |
idxTermMatch(ET_NONE, "", *it, res, max, field);
|
|
|
1796 |
}
|
|
|
1797 |
} else {
|
|
|
1798 |
idxTermMatch(typ_sens, lang, term, res, max, field);
|
|
|
1799 |
}
|
|
|
1800 |
|
|
|
1801 |
#endif // RCL_INDEX_STRIPCHARS
|
|
|
1802 |
|
|
|
1803 |
} else {
|
|
|
1804 |
// Expansion is STEM or NONE (which may still need case/diac exp)
|
|
|
1805 |
|
|
|
1806 |
#ifdef RCL_INDEX_STRIPCHARS
|
|
|
1807 |
|
|
|
1808 |
idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
|
|
|
1809 |
|
|
|
1810 |
#else
|
|
|
1811 |
vector<string> lexp;
|
|
|
1812 |
if (diac_sensitive && case_sensitive) {
|
|
|
1813 |
// No case/diac expansion
|
|
|
1814 |
lexp.push_back(term);
|
|
|
1815 |
} else if (diac_sensitive) {
|
|
|
1816 |
// Expand for accents and case, filtering for same accents,
|
|
|
1817 |
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
|
1818 |
synac.synExpand(term, lexp, &foldtrans);
|
|
|
1819 |
} else if (case_sensitive) {
|
|
|
1820 |
// Expand for accents and case, filtering for same case
|
|
|
1821 |
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
|
1822 |
synac.synExpand(term, lexp, &unactrans);
|
|
|
1823 |
} else {
|
|
|
1824 |
// We are neither accent- nor case- sensitive and may need stem
|
|
|
1825 |
// expansion or not. Expand for accents and case
|
|
|
1826 |
synac.synExpand(term, lexp);
|
|
|
1827 |
}
|
|
|
1828 |
|
|
|
1829 |
if (matchTypeTp(typ_sens) == ET_STEM) {
|
|
|
1830 |
// Need stem expansion. Lowercase the result of accent and case
|
|
|
1831 |
// expansion for input to stemdb.
|
|
|
1832 |
for (unsigned int i = 0; i < lexp.size(); i++) {
|
|
|
1833 |
string lower;
|
|
|
1834 |
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
|
|
1835 |
lexp[i] = lower;
|
|
|
1836 |
}
|
|
|
1837 |
sort(lexp.begin(), lexp.end());
|
|
|
1838 |
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
|
1839 |
StemDb sdb(xrdb);
|
|
|
1840 |
vector<string> exp1;
|
|
|
1841 |
for (vector<string>::const_iterator it = lexp.begin();
|
|
|
1842 |
it != lexp.end(); it++) {
|
|
|
1843 |
sdb.stemExpand(lang, *it, exp1);
|
|
|
1844 |
}
|
|
|
1845 |
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
|
|
1846 |
|
|
|
1847 |
// Expand the resulting list for case (all stemdb content
|
|
|
1848 |
// is lowercase)
|
|
|
1849 |
lexp.clear();
|
|
|
1850 |
for (vector<string>::const_iterator it = exp1.begin();
|
|
|
1851 |
it != exp1.end(); it++) {
|
|
|
1852 |
synac.synExpand(*it, lexp);
|
|
|
1853 |
}
|
|
|
1854 |
sort(lexp.begin(), lexp.end());
|
|
|
1855 |
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
|
1856 |
}
|
|
|
1857 |
|
|
|
1858 |
// Filter the result and get the stats, possibly add prefixes.
|
|
|
1859 |
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
|
|
1860 |
for (vector<string>::const_iterator it = lexp.begin();
|
|
|
1861 |
it != lexp.end(); it++) {
|
|
|
1862 |
idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
|
|
|
1863 |
}
|
|
|
1864 |
}
|
|
|
1865 |
#endif
|
|
|
1866 |
|
|
|
1867 |
TermMatchCmpByTerm tcmp;
|
|
|
1868 |
sort(res.entries.begin(), res.entries.end(), tcmp);
|
|
|
1869 |
TermMatchTermEqual teq;
|
|
|
1870 |
vector<TermMatchEntry>::iterator uit =
|
|
|
1871 |
unique(res.entries.begin(), res.entries.end(), teq);
|
|
|
1872 |
res.entries.resize(uit - res.entries.begin());
|
|
|
1873 |
TermMatchCmpByWcf wcmp;
|
|
|
1874 |
sort(res.entries.begin(), res.entries.end(), wcmp);
|
|
|
1875 |
if (max > 0) {
|
|
|
1876 |
// Would need a small max and big stem expansion...
|
|
|
1877 |
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
|
|
1878 |
}
|
|
|
1879 |
return true;
|
|
|
1880 |
}
|
|
|
1881 |
|
|
|
1882 |
// Second phase of wildcard/regexp term expansion after case/diac
|
|
|
1883 |
// expansion: expand against main index terms
|
|
|
1884 |
bool Db::idxTermMatch(int typ_sens, const string &lang,
|
|
|
1885 |
const string &root,
|
|
|
1886 |
TermMatchResult& res,
|
|
|
1887 |
int max,
|
|
|
1888 |
const string& field)
|
|
|
1889 |
{
|
|
|
1890 |
int typ = matchTypeTp(typ_sens);
|
|
|
1891 |
|
|
|
1892 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
1893 |
if (typ == ET_STEM) {
|
|
|
1894 |
LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
|
|
|
1895 |
abort();
|
|
|
1896 |
}
|
|
|
1897 |
#endif
|
|
|
1898 |
|
|
|
1899 |
if (!m_ndb || !m_ndb->m_isopen)
|
|
|
1900 |
return false;
|
|
|
1901 |
Xapian::Database xdb = m_ndb->xrdb;
|
|
|
1902 |
|
|
|
1903 |
string prefix;
|
|
|
1904 |
if (!field.empty()) {
|
|
|
1905 |
const FieldTraits *ftp = 0;
|
|
|
1906 |
if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
|
|
|
1907 |
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
|
|
1908 |
field.c_str()));
|
|
|
1909 |
} else {
|
|
|
1910 |
prefix = wrap_prefix(ftp->pfx);
|
|
|
1911 |
}
|
|
|
1912 |
}
|
|
|
1913 |
res.prefix = prefix;
|
|
|
1914 |
|
|
|
1915 |
#ifdef RCL_INDEX_STRIPCHARS
|
|
|
1916 |
if (typ == ET_STEM) {
|
|
|
1917 |
if (!stemExpand(lang, root, res))
|
|
|
1918 |
return false;
|
|
|
1919 |
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
|
|
1920 |
it != res.entries.end(); it++) {
|
|
|
1921 |
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
|
|
1922 |
it->docs = xdb.get_termfreq(it->term),
|
|
|
1923 |
xdb, m_reason);
|
|
|
1924 |
if (!m_reason.empty())
|
|
|
1925 |
return false;
|
|
|
1926 |
LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
|
|
|
1927 |
}
|
|
|
1928 |
if (!prefix.empty())
|
|
|
1929 |
addPrefix(res.entries, prefix);
|
|
|
1930 |
} else
|
|
|
1931 |
#endif
|
|
|
1932 |
{
|
|
|
1933 |
RefCntr<StrMatcher> matcher;
|
|
|
1934 |
if (typ == ET_REGEXP) {
|
|
|
1935 |
matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
|
|
|
1936 |
if (!matcher->ok()) {
|
|
|
1937 |
LOGERR(("termMatch: regcomp failed: %s\n",
|
|
|
1938 |
matcher->getreason().c_str()))
|
|
|
1939 |
return false;
|
|
|
1940 |
}
|
|
|
1941 |
} else if (typ == ET_WILD) {
|
|
|
1942 |
matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
|
|
|
1943 |
}
|
|
|
1944 |
|
|
|
1945 |
// Find the initial section before any special char
|
|
|
1946 |
string::size_type es = string::npos;
|
|
|
1947 |
if (matcher.isNotNull()) {
|
|
|
1948 |
es = matcher->baseprefixlen();
|
|
|
1949 |
}
|
|
|
1950 |
string is;
|
|
|
1951 |
switch (es) {
|
|
|
1952 |
case string::npos: is = prefix + root; break;
|
|
|
1953 |
case 0: is = prefix; break;
|
|
|
1954 |
default: is = prefix + root.substr(0, es); break;
|
|
|
1955 |
}
|
|
|
1956 |
LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
|
|
|
1957 |
|
|
|
1958 |
for (int tries = 0; tries < 2; tries++) {
|
|
|
1959 |
try {
|
|
|
1960 |
Xapian::TermIterator it = xdb.allterms_begin();
|
|
|
1961 |
if (!is.empty())
|
|
|
1962 |
it.skip_to(is.c_str());
|
|
|
1963 |
for (int rcnt = 0; it != xdb.allterms_end(); it++) {
|
|
|
1964 |
// If we're beyond the terms matching the initial
|
|
|
1965 |
// string, end
|
|
|
1966 |
if (!is.empty() && (*it).find(is) != 0)
|
|
|
1967 |
break;
|
|
|
1968 |
string term;
|
|
|
1969 |
if (!prefix.empty())
|
|
|
1970 |
term = (*it).substr(prefix.length());
|
|
|
1971 |
else
|
|
|
1972 |
term = *it;
|
|
|
1973 |
|
|
|
1974 |
if (matcher.isNotNull() && !matcher->match(term))
|
|
|
1975 |
continue;
|
|
|
1976 |
|
|
|
1977 |
res.entries.push_back(
|
|
|
1978 |
TermMatchEntry(*it, xdb.get_collection_freq(*it),
|
|
|
1979 |
it.get_termfreq()));
|
|
|
1980 |
|
|
|
1981 |
// The problem with truncating here is that this is done
|
|
|
1982 |
// alphabetically and we may not keep the most frequent
|
|
|
1983 |
// terms. OTOH, not doing it may stall the program if
|
|
|
1984 |
// we are walking the whole term list. We compromise
|
|
|
1985 |
// by cutting at 2*max
|
|
|
1986 |
if (max > 0 && ++rcnt >= 2*max)
|
|
|
1987 |
break;
|
|
|
1988 |
}
|
|
|
1989 |
m_reason.erase();
|
|
|
1990 |
break;
|
|
|
1991 |
} catch (const Xapian::DatabaseModifiedError &e) {
|
|
|
1992 |
m_reason = e.get_msg();
|
|
|
1993 |
xdb.reopen();
|
|
|
1994 |
continue;
|
|
|
1995 |
} XCATCHERROR(m_reason);
|
|
|
1996 |
break;
|
|
|
1997 |
}
|
|
|
1998 |
if (!m_reason.empty()) {
|
|
|
1999 |
LOGERR(("termMatch: %s\n", m_reason.c_str()));
|
|
|
2000 |
return false;
|
|
|
2001 |
}
|
|
|
2002 |
}
|
|
|
2003 |
|
|
|
2004 |
return true;
|
|
|
2005 |
}
|
|
|
2006 |
|
|
|
2007 |
/** Term list walking. */
|
|
|
2008 |
class TermIter {
|
|
|
2009 |
public:
|
|
|
2010 |
Xapian::TermIterator it;
|
|
|
2011 |
Xapian::Database db;
|
|
|
2012 |
};
|
|
|
2013 |
// Begin walking the index term list.
//
// @return a new iterator object positioned on the first term, to be
//   released with termWalkClose(), or 0 if the index is not open or a
//   Xapian error occurred (the message is then stored in m_reason).
TermIter *Db::termWalkOpen()
{
    if (!m_ndb || !m_ndb->m_isopen)
        return 0;
    TermIter *tit = new TermIter;
    if (tit) {
        tit->db = m_ndb->xrdb;
        XAPTRY(tit->it = tit->db.allterms_begin(), tit->db, m_reason);
        if (!m_reason.empty()) {
            LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
            // Don't leak the iterator object when failing
            termWalkClose(tit);
            return 0;
        }
    }
    return tit;
}
|
|
|
2028 |
// Fetch the next term from a term list walk.
//
// @param tit iterator object returned by termWalkOpen()
// @param term output: the current term, if true is returned
// @return true if a term was stored and the iterator advanced, false at
//   the end of the list, for a null iterator, or on error
bool Db::termWalkNext(TermIter *tit, string &term)
{
    XAPTRY(
        if (tit && tit->it != tit->db.allterms_end()) {
            term = *(tit->it)++;
            return true;
        }
        , tit->db, m_reason);

    if (!m_reason.empty()) {
        // Report under our own name (the message used to name termWalkOpen)
        LOGERR(("Db::termWalkNext: xapian error: %s\n", m_reason.c_str()));
    }
    return false;
}
|
|
|
2042 |
// End a term list walk: release the iterator object returned by
// termWalkOpen(). A null pointer is accepted.
void Db::termWalkClose(TermIter *tit)
{
    // Any exception thrown while destroying the iterator or database
    // objects is deliberately discarded.
    try {
        delete tit;
    } catch (...) {
    }
}
|
|
|
2048 |
|
|
|
2049 |
bool Db::termExists(const string& word)
|
|
|
2050 |
{
|
|
|
2051 |
if (!m_ndb || !m_ndb->m_isopen)
|
|
|
2052 |
return 0;
|
|
|
2053 |
|
|
|
2054 |
XAPTRY(if (!m_ndb->xrdb.term_exists(word)) return false,
|
|
|
2055 |
m_ndb->xrdb, m_reason);
|
|
|
2056 |
|
|
|
2057 |
if (!m_reason.empty()) {
|
|
|
2058 |
LOGERR(("Db::termWalkOpen: xapian error: %s\n", m_reason.c_str()));
|
|
|
2059 |
return false;
|
|
|
2060 |
}
|
|
|
2061 |
return true;
|
|
|
2062 |
}
|
|
|
2063 |
|
|
|
2064 |
|
|
|
2065 |
bool Db::stemDiffers(const string& lang, const string& word,
|
|
|
2066 |
const string& base)
|
|
|
2067 |
{
|
|
|
2068 |
Xapian::Stem stemmer(lang);
|
|
|
2069 |
if (!stemmer(word).compare(stemmer(base))) {
|
|
|
2070 |
LOGDEB2(("Rcl::Db::stemDiffers: same for %s and %s\n",
|
|
|
2071 |
word.c_str(), base.c_str()));
|
|
|
2072 |
return false;
|
|
|
2073 |
}
|
|
|
2074 |
return true;
|
1590 |
return true;
|
2075 |
}
|
1591 |
}
|
2076 |
|
1592 |
|
2077 |
// Retrieve document defined by Unique doc identifier. This is used
|
1593 |
// Retrieve document defined by Unique doc identifier. This is used
|
2078 |
// by the GUI history feature and by open parent/getenclosing
|
1594 |
// by the GUI history feature and by open parent/getenclosing
|
|
... |
|
... |
2118 |
|
1634 |
|
2119 |
LOGERR(("Db::getDoc: %s\n", m_reason.c_str()));
|
1635 |
LOGERR(("Db::getDoc: %s\n", m_reason.c_str()));
|
2120 |
return false;
|
1636 |
return false;
|
2121 |
}
|
1637 |
}
|
2122 |
|
1638 |
|
2123 |
#ifndef NO_NAMESPACES
|
1639 |
} // End namespace Rcl
|
2124 |
}
|
|
|
2125 |
#endif
|
|
|