Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
...
...
1666
    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
1666
    int operator()(const TermMatchEntry& l, const TermMatchEntry& r) {
1667
    return !l.term.compare(r.term);
1667
    return !l.term.compare(r.term);
1668
    }
1668
    }
1669
};
1669
};
1670
1670
1671
#ifdef RCL_INDEX_STRIPCHARS
1671
bool Db::stemExpand(const string &langs, const string &term, 
1672
bool Db::stemExpand(const string &langs, const string &term, 
1672
            TermMatchResult& result)
1673
            TermMatchResult& result)
1673
{
1674
{
1674
    if (m_ndb == 0 || m_ndb->m_isopen == false)
1675
    if (m_ndb == 0 || m_ndb->m_isopen == false)
1675
    return false;
1676
    return false;
...
...
1678
    if (!db.stemExpand(langs, term, exp))
1679
    if (!db.stemExpand(langs, term, exp))
1679
    return false;
1680
    return false;
1680
    result.entries.insert(result.entries.end(), exp.begin(), exp.end());
1681
    result.entries.insert(result.entries.end(), exp.begin(), exp.end());
1681
    return true;
1682
    return true;
1682
}
1683
}
1684
#endif
1683
1685
1684
/** Add prefix to all strings in list. 
1686
/** Add prefix to all strings in list. 
1685
 * @param prefix already wrapped prefix
1687
 * @param prefix already wrapped prefix
1686
 */
1688
 */
1687
static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
1689
static void addPrefix(vector<TermMatchEntry>& terms, const string& prefix)
...
...
1691
    for (vector<TermMatchEntry>::iterator it = terms.begin(); 
1693
    for (vector<TermMatchEntry>::iterator it = terms.begin(); 
1692
         it != terms.end(); it++)
1694
         it != terms.end(); it++)
1693
    it->term.insert(0, prefix);
1695
    it->term.insert(0, prefix);
1694
}
1696
}
1695
1697
1696
// Find all index terms that match a wildcard or regular expression
1698
bool Db::dbStats(DbStats& res)
1697
// If field is set, we return a list of appropriately prefixed terms (which 
1698
// are going to be used to build a Xapian query).
1699
bool Db::termMatch(MatchType typ, const string &lang,
1700
         const string &_root,
1701
         TermMatchResult& res,
1702
         int max, 
1703
         const string& field)
1704
{
1699
{
1705
    if (!m_ndb || !m_ndb->m_isopen)
1700
    if (!m_ndb || !m_ndb->m_isopen)
1706
    return false;
1701
    return false;
1707
    Xapian::Database xdb = m_ndb->xrdb;
1702
    Xapian::Database xdb = m_ndb->xrdb;
1708
1703
...
...
1711
       res.mindoclen = xdb.get_doclength_lower_bound();
1706
       res.mindoclen = xdb.get_doclength_lower_bound();
1712
       res.maxdoclen = xdb.get_doclength_upper_bound();
1707
       res.maxdoclen = xdb.get_doclength_upper_bound();
1713
       , xdb, m_reason);
1708
       , xdb, m_reason);
1714
    if (!m_reason.empty())
1709
    if (!m_reason.empty())
1715
        return false;
1710
        return false;
1711
    return true;
1712
}
1716
1713
1717
    string droot = _root;
1714
// Find all index terms that match a wildcard or regular expression If
1715
// field is set, we return a list of appropriately prefixed terms
1716
// (which are going to be used to build a Xapian query).  This routine
1717
// performs case/diacritics/stemming expansion and possibly calls
1718
// idxTermMatch for wildcard/regexp expansion and filtering against
1719
// the main index terms.
1720
bool Db::termMatch(int typ_sens, const string &lang,
1721
         const string &_term,
1722
         TermMatchResult& res,
1723
         int max, 
1724
         const string& field)
1725
{
1726
    int matchtyp = matchTypeTp(typ_sens);
1727
    if (!m_ndb || !m_ndb->m_isopen)
1728
  return false;
1729
    Xapian::Database xrdb = m_ndb->xrdb;
1718
1730
1719
    // If index is stripped, get rid of capitals and accents
1731
    bool diac_sensitive = (typ_sens & ET_DIACSENS) != 0;
1732
    bool case_sensitive = (typ_sens & ET_CASESENS) != 0;
1733
1734
    bool stripped = false;
1735
#ifdef RCL_INDEX_STRIPCHARS
1736
    stripped = true;
1737
#else
1738
    stripped = o_index_stripchars;
1739
#endif
1740
1741
    LOGDEB(("Db::TermMatch: typ %d diacsens %d casesens %d lang [%s] term [%s] "
1742
      "max %d field [%s] stripped %d\n",
1743
      matchtyp, diac_sensitive, case_sensitive, lang.c_str(), 
1744
      _term.c_str(), max, field.c_str(), stripped));
1745
1746
    // If index is stripped, no case or diac expansion can be needed:
1747
    // for the processing inside this routine, everything looks like
1748
    // we're all-sensitive: no use of expansion db.
1749
    // Also, convert input to lowercase and strip its accents.
1750
    string term = _term;
1751
    if (stripped) {
1752
  diac_sensitive = case_sensitive = true;
1753
  if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
1754
      LOGERR(("Db::termMatch: unac failed for [%s]\n", _term.c_str()));
1755
      return false;
1756
  }
1757
    }
1758
1720
#ifndef RCL_INDEX_STRIPCHARS
1759
#ifndef RCL_INDEX_STRIPCHARS
1721
    if (o_index_stripchars)
1760
    // The case/diac expansion db
1761
    SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
1762
    XapComputableSynFamMember synac(xrdb, synFamDiCa, "all", &unacfoldtrans);
1763
#endif // RCL_INDEX_STRIPCHARS
1764
1765
1766
    if (matchtyp == ET_WILD || matchtyp == ET_REGEXP) {
1767
#ifdef RCL_INDEX_STRIPCHARS
1768
  idxTermMatch(typ_sens, lang, term, res, max, field);
1769
#else
1770
  RefCntr<StrMatcher> matcher;
1771
  if (matchtyp == ET_WILD) {
1772
      matcher = RefCntr<StrMatcher>(new StrWildMatcher(term));
1773
  } else {
1774
      matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(term));
1775
  }
1776
  if (!diac_sensitive || !case_sensitive) {
1777
      // Perform case/diac expansion on the exp as appropriate and
1778
      // expand the result.
1779
      vector<string> exp;
1780
      if (diac_sensitive) {
1781
      // Expand for diacritics and case, filtering for same diacritics
1782
      SynTermTransUnac foldtrans(UNACOP_FOLD);
1783
      synac.synKeyExpand(matcher.getptr(), exp, &foldtrans);
1784
      } else if (case_sensitive) {
1785
      // Expand for diacritics and case, filtering for same case
1786
      SynTermTransUnac unactrans(UNACOP_UNAC);
1787
      synac.synKeyExpand(matcher.getptr(), exp, &unactrans);
1788
      } else {
1789
      // Expand for diacritics and case, no filtering
1790
      synac.synKeyExpand(matcher.getptr(), exp);
1791
      }
1792
      // Retrieve additional info and filter against the index itself
1793
      for (vector<string>::const_iterator it = exp.begin(); 
1794
       it != exp.end(); it++) {
1795
      idxTermMatch(ET_NONE, "", *it, res, max, field);
1796
      }
1797
  } else {
1798
      idxTermMatch(typ_sens, lang, term, res, max, field);
1799
  }
1800
1801
#endif // RCL_INDEX_STRIPCHARS
1802
1803
    } else {
1804
  // Expansion is STEM or NONE (which may still need case/diac exp)
1805
1806
#ifdef RCL_INDEX_STRIPCHARS
1807
1808
  idxTermMatch(Rcl::Db::ET_STEM, lang, term, res, max, field);
1809
1810
#else
1811
  vector<string> lexp;
1812
  if (diac_sensitive && case_sensitive) {
1813
      // No case/diac expansion
1814
      lexp.push_back(term);
1815
  } else if (diac_sensitive) {
1816
      // Expand for accents and case, filtering for same accents,
1817
      SynTermTransUnac foldtrans(UNACOP_FOLD);
1818
      synac.synExpand(term, lexp, &foldtrans);
1819
  } else if (case_sensitive) {
1820
      // Expand for accents and case, filtering for same case
1821
      SynTermTransUnac unactrans(UNACOP_UNAC);
1822
      synac.synExpand(term, lexp, &unactrans);
1823
  } else {
1824
      // We are neither accent- nor case- sensitive and may need stem
1825
      // expansion or not. Expand for accents and case
1826
      synac.synExpand(term, lexp);
1827
  }
1828
1829
  if (matchTypeTp(typ_sens) == ET_STEM) {
1830
      // Need stem expansion. Lowercase the result of accent and case
1831
      // expansion for input to stemdb.
1832
      for (unsigned int i = 0; i < lexp.size(); i++) {
1833
      string lower;
1834
      unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
1835
      lexp[i] = lower;
1836
      }
1837
      sort(lexp.begin(), lexp.end());
1838
      lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
1839
      StemDb sdb(xrdb);
1840
      vector<string> exp1;
1841
      for (vector<string>::const_iterator it = lexp.begin(); 
1842
       it != lexp.end(); it++) {
1843
      sdb.stemExpand(lang, *it, exp1);
1844
      }
1845
      LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
1846
1847
      // Expand the resulting list for case (all stemdb content
1848
      // is lowercase)
1849
      lexp.clear();
1850
      for (vector<string>::const_iterator it = exp1.begin(); 
1851
       it != exp1.end(); it++) {
1852
      synac.synExpand(*it, lexp);
1853
      }
1854
      sort(lexp.begin(), lexp.end());
1855
      lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
1856
  }
1857
1858
  // Filter the result and get the stats, possibly add prefixes.
1859
  LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
1860
  for (vector<string>::const_iterator it = lexp.begin();
1861
       it != lexp.end(); it++) {
1862
      idxTermMatch(Rcl::Db::ET_WILD, "", *it, res, max, field);
1863
  }
1864
    }
1722
#endif
1865
#endif
1723
  if (!unacmaybefold(_root, droot, "UTF-8", UNACOP_UNACFOLD)) {
1866
1724
      LOGERR(("Db::termMatch: unac failed for [%s]\n", _root.c_str()));
1867
    TermMatchCmpByTerm tcmp;
1868
    sort(res.entries.begin(), res.entries.end(), tcmp);
1869
    TermMatchTermEqual teq;
1870
    vector<TermMatchEntry>::iterator uit = 
1871
  unique(res.entries.begin(), res.entries.end(), teq);
1872
    res.entries.resize(uit - res.entries.begin());
1873
    TermMatchCmpByWcf wcmp;
1874
    sort(res.entries.begin(), res.entries.end(), wcmp);
1875
    if (max > 0) {
1876
  // Would need a small max and big stem expansion...
1877
  res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
1878
    }
1879
    return true;
1880
}
1881
1882
// Second phase of wildcard/regexp term expansion after case/diac
1883
// expansion: expand against main index terms
1884
bool Db::idxTermMatch(int typ_sens, const string &lang,
1885
            const string &root,
1886
            TermMatchResult& res,
1887
            int max, 
1888
            const string& field)
1889
{
1890
    int typ = matchTypeTp(typ_sens);
1891
1892
#ifndef RCL_INDEX_STRIPCHARS
1893
    if (typ == ET_STEM) {
1894
  LOGFATAL(("RCLDB: internal error: idxTermMatch called with ET_STEM\n"));
1895
  abort();
1896
    }
1897
#endif
1898
1899
    if (!m_ndb || !m_ndb->m_isopen)
1725
        return false;
1900
    return false;
1726
  }
1901
    Xapian::Database xdb = m_ndb->xrdb;
1727
1728
    string nochars = typ == ET_WILD ? cstr_wildSpecStChars : 
1729
  cstr_regSpecStChars;
1730
1902
1731
    string prefix;
1903
    string prefix;
1732
    if (!field.empty()) {
1904
    if (!field.empty()) {
1733
    const FieldTraits *ftp = 0;
1905
    const FieldTraits *ftp = 0;
1734
    if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
1906
    if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
...
...
1738
        prefix = wrap_prefix(ftp->pfx);
1910
        prefix = wrap_prefix(ftp->pfx);
1739
    }
1911
    }
1740
    }
1912
    }
1741
    res.prefix = prefix;
1913
    res.prefix = prefix;
1742
1914
1915
#ifdef RCL_INDEX_STRIPCHARS
1743
    if (typ == ET_STEM) {
1916
    if (typ == ET_STEM) {
1744
    if (!stemExpand(lang, droot, res))
1917
    if (!stemExpand(lang, root, res))
1745
        return false;
1918
        return false;
1746
    for (vector<TermMatchEntry>::iterator it = res.entries.begin(); 
1919
    for (vector<TermMatchEntry>::iterator it = res.entries.begin(); 
1747
         it != res.entries.end(); it++) {
1920
         it != res.entries.end(); it++) {
1748
        XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
1921
        XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
1749
                   it->docs = xdb.get_termfreq(it->term),
1922
                   it->docs = xdb.get_termfreq(it->term),
...
...
1752
                return false;
1925
                return false;
1753
        LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
1926
        LOGDEB1(("termMatch: %d [%s]\n", it->wcf, it->term.c_str()));
1754
    }
1927
    }
1755
        if (!prefix.empty())
1928
        if (!prefix.empty())
1756
            addPrefix(res.entries, prefix);
1929
            addPrefix(res.entries, prefix);
1757
    } else {
1930
    } else 
1758
  regex_t reg;
1931
#endif
1759
  int errcode;
1932
    {
1933
  RefCntr<StrMatcher> matcher;
1760
    if (typ == ET_REGEXP) {
1934
    if (typ == ET_REGEXP) {
1761
      if ((errcode = regcomp(&reg, droot.c_str(), 
1935
      matcher = RefCntr<StrMatcher>(new StrRegexpMatcher(root));
1762
                 REG_EXTENDED|REG_NOSUB))) {
1936
      if (!matcher->ok()) {
1763
      char errbuf[200];
1764
      regerror(errcode, &reg, errbuf, 199);
1765
        LOGERR(("termMatch: regcomp failed: %s\n", errbuf));
1937
        LOGERR(("termMatch: regcomp failed: %s\n", 
1766
      res.entries.push_back(string(errbuf));
1938
          matcher->getreason().c_str()))
1767
      regfree(&reg);
1768
        return false;
1939
            return false;
1769
        }
1940
        }
1941
  } else if (typ == ET_WILD) {
1942
      matcher = RefCntr<StrMatcher>(new StrWildMatcher(root));
1770
    }
1943
    }
1771
1944
1772
    // Find the initial section before any special char
1945
    // Find the initial section before any special char
1773
  string::size_type es = droot.find_first_of(nochars);
1946
  string::size_type es = string::npos;
1947
  if (matcher.isNotNull()) {
1948
      es = matcher->baseprefixlen();
1949
  }
1774
    string is;
1950
    string is;
1775
    switch (es) {
1951
    switch (es) {
1776
    case string::npos: is = prefix + droot; break;
1952
    case string::npos: is = prefix + root; break;
1777
    case 0: is = prefix; break;
1953
    case 0: is = prefix; break;
1778
    default: is = prefix + droot.substr(0, es); break;
1954
    default: is = prefix + root.substr(0, es); break;
1779
    }
1955
    }
1780
    LOGDEB1(("termMatch: initsec: [%s]\n", is.c_str()));
1956
    LOGDEB2(("termMatch: initsec: [%s]\n", is.c_str()));
1781
1957
1782
        for (int tries = 0; tries < 2; tries++) { 
1958
        for (int tries = 0; tries < 2; tries++) { 
1783
            try {
1959
            try {
1784
                Xapian::TermIterator it = xdb.allterms_begin(); 
1960
                Xapian::TermIterator it = xdb.allterms_begin(); 
1785
                if (!is.empty())
1961
                if (!is.empty())
...
...
1792
                    string term;
1968
                    string term;
1793
                    if (!prefix.empty())
1969
                    if (!prefix.empty())
1794
                        term = (*it).substr(prefix.length());
1970
                        term = (*it).substr(prefix.length());
1795
                    else
1971
                    else
1796
                        term = *it;
1972
                        term = *it;
1797
                    if (typ == ET_WILD) {
1973
1798
                        if (fnmatch(droot.c_str(), term.c_str(), 0) == 
1974
          if (matcher.isNotNull() && !matcher->match(term))
1799
                            FNM_NOMATCH)
1975
          continue;
1800
                            continue;
1976
1801
                    } else {
1802
                        if (regexec(&reg, term.c_str(), 0, 0, 0))
1803
                            continue;
1804
                    }
1805
                    // Do we want stem expansion here? We don't do it for now
1806
                    res.entries.push_back(TermMatchEntry(*it, 
1977
                    res.entries.push_back(
1807
                                                   xdb.get_collection_freq(*it),
1978
          TermMatchEntry(*it, xdb.get_collection_freq(*it),
1808
                                                   it.get_termfreq()));
1979
                     it.get_termfreq()));
1809
1980
1810
            // The problem with truncating here is that this is done
1981
            // The problem with truncating here is that this is done
1811
            // alphabetically and we may not keep the most frequent 
1982
            // alphabetically and we may not keep the most frequent 
1812
            // terms. OTOH, not doing it may stall the program if
1983
            // terms. OTOH, not doing it may stall the program if
1813
            // we are walking the whole term list. We compromise
1984
            // we are walking the whole term list. We compromise
...
...
1826
        }
1997
        }
1827
    if (!m_reason.empty()) {
1998
    if (!m_reason.empty()) {
1828
        LOGERR(("termMatch: %s\n", m_reason.c_str()));
1999
        LOGERR(("termMatch: %s\n", m_reason.c_str()));
1829
        return false;
2000
        return false;
1830
    }
2001
    }
1831
1832
  if (typ == ET_REGEXP) {
1833
      regfree(&reg);
1834
  }
1835
1836
    }
2002
    }
1837
2003
1838
    TermMatchCmpByTerm tcmp;
1839
    sort(res.entries.begin(), res.entries.end(), tcmp);
1840
    TermMatchTermEqual teq;
1841
    vector<TermMatchEntry>::iterator uit = 
1842
  unique(res.entries.begin(), res.entries.end(), teq);
1843
    res.entries.resize(uit - res.entries.begin());
1844
    TermMatchCmpByWcf wcmp;
1845
    sort(res.entries.begin(), res.entries.end(), wcmp);
1846
    if (max > 0) {
1847
  // Would need a small max and big stem expansion...
1848
  res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
1849
    }
1850
    return true;
2004
    return true;
1851
}
2005
}
1852
2006
1853
/** Term list walking. */
2007
/** Term list walking. */
1854
class TermIter {
2008
class TermIter {