recoll / Code / Diff of /src/rcldb/searchdata.cpp

Diff of /src/rcldb/searchdata.cpp [d35c69] .. [881794]

Switch to unified view


...
#include "termproc.h"
#include "synfamily.h"
#include "stemdb.h"
#include "expansiondbs.h"
#include "base64.h"
#include "daterange.h"

namespace Rcl {

typedef  vector<SearchDataClause *>::iterator qlist_it_t;
typedef  vector<SearchDataClause *>::const_iterator qlist_cit_t;

static const int original_term_wqf_booster = 10;

// Reset the search-wide settings to their built-in defaults.
void SearchData::commoninit()
{
    // Sentinel stored in both size bounds.
    // NOTE(review): presumably means "no size restriction" -- confirm
    // against the code that reads m_minSize / m_maxSize.
    const size_t unsetSize = static_cast<size_t>(-1);
    m_minSize = unsetSize;
    m_maxSize = unsetSize;

    // No date filter and no wildcards seen yet.
    m_haveDates = false;
    m_haveWildCards = false;

    // Automatic sensitivity switches: off for diacritics, on for case.
    m_autodiacsens = false;
    m_autocasesens = true;

    // Expansion / clause count limits.
    // NOTE(review): presumably the defaults for the maxTermExpand and
    // maxXapianClauses settings named in this file's error messages --
    // confirm against the accessors.
    m_softmaxexpand = -1;
    m_maxexp = 10000;
    m_maxcl = 100000;
}

// Expand categories and mime type wild card exps
// Actually, using getAllMimeTypes() here is a bit problematic because
// there may be other types in the index, not indexed by content, but
...
    if (cfg->isMimeCategory(*it)) {
        vector<string>tps;
        cfg->getMimeCatTypes(*it, tps);
        exptps.insert(exptps.end(), tps.begin(), tps.end());
    } else {
      bool matched = false;
        for (vector<string>::const_iterator ait = alltypes.begin();
         ait != alltypes.end(); ait++) {
        if (fnmatch(it->c_str(), ait->c_str(), FNM_CASEFOLD) 
            != FNM_NOMATCH) {
            exptps.push_back(*ait);
          matched = true;
        }
        }
      if (!matched)
      exptps.push_back(it->c_str());
    }
    }
    tps = exptps;
    return true;
}

bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp, 
                vector<SearchDataClause*>& query, 
                string& reason, void *d)

{
    Xapian::Query xq;
    for (qlist_it_t it = query.begin(); it != query.end(); it++) {
    Xapian::Query nq;
    if (!(*it)->toNativeQuery(db, &nq)) {
        LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
            (*it)->getReason().c_str()));
        reason += (*it)->getReason() + " ";
        return false;
    }       
...
            else 
                xq = nq;
        } else {
            xq = Xapian::Query(op, xq, nq);
        }
    if (int(xq.get_length()) >= getMaxCl()) {
        LOGERR(("Maximum Xapian query size exceeded."
            " Maybe increase maxXapianClauses."));
        m_reason += "Maximum Xapian query size exceeded."
        " Maybe increase maxXapianClauses.";
        return false;
...

   *((Xapian::Query *)d) = xq;
    return true;
}











































































































bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
{
    LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
    m_reason.erase();

    // Walk the clause list translating each in turn and building the 
    // Xapian query tree
    Xapian::Query xq;
    if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
    LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n", 
        m_reason.c_str()));
    return false;
    }

...
    m_query.push_back(cl);
    return true;
}

// Make me all new
void SearchData::erase() 
{
    LOGDEB0(("SearchData::erase\n"));
    m_tp = SCLT_AND;
    for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
    delete *it;
    m_query.clear();
...
    TextSplitQ *m_ts;
    map<int, string> m_terms;
    map<int, bool> m_nste;
};


















































#if 1
static void listVector(const string& what, const vector<string>&l)
{
    string a;
...
 * @param sterm output original input term if there were no wildcards
 * @param prefix field prefix in index. We could recompute it, but the caller
 *  has it already. Used in the simple case where there is nothing to expand, 
 *  and we just return the prefixed term (else Db::termMatch deals with it).
 */
bool SearchDataClauseSimple::expandTerm(Rcl::Db &db, 
                  string& ermsg, int mods, 
                  const string& term, 
                  vector<string>& oexp, string &sterm,
                  const string& prefix)
{
    LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
         mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
    sterm.clear();
    oexp.clear();
    if (term.empty())
    return true;

    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;

    // If there are no wildcards, add term to the list of user-entered terms
    if (!haswild)
    m_hldata.uterms.insert(term);

    bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;

    // No stem expansion if there are wildcards or if prevented by caller
    if (haswild || getStemLang().empty()) {
    LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
    nostemexp = true;
    }

    bool noexpansion = nostemexp && !haswild;
...

    // If any character has a diacritic, we become
    // diacritic-sensitive. Note that the way that the test is
    // performed (conversion+comparison) will automatically ignore
    // accented characters which are actually a separate letter
    if (getAutoDiac() && unachasaccents(term)) {
        LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
        diac_sensitive = true;
    }

    // If any character apart the first is uppercase, we become
    // case-sensitive.  The first character is reserved for
    // turning off stemming. You need to use a query language
    // modifier to search for Floor in a case-sensitive way.
    Utf8Iter it(term);
    it++;
    if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
        LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
        case_sensitive = true;
    }

    // If we are sensitive to case or diacritics turn stemming off
...
#endif

    if (noexpansion) {
    sterm = term;
    oexp.push_back(prefix + term);
    m_hldata.terms[term] = m_hldata.uterms.size() - 1;
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return true;
    } 

    // Make objects before the goto jungle to avoid compiler complaints
    SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
    XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all", 
                    &unacfoldtrans);
    // This will hold the result of case and diacritics expansion as input
    // to stem expansion.
    vector<string> lexp;
    
...
    if (haswild) {
    // Note that if there are wildcards, we do a direct from-index
    // expansion, which means that we are casediac-sensitive. There
    // would be nothing to prevent us to expand from the casediac
    // synonyms first. To be done later
  db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res, 
           getMaxExp(), m_field);
    goto termmatchtoresult;
    }

    sterm = term;

#ifdef RCL_INDEX_STRIPCHARS

    db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res, 
       getMaxExp(), m_field);

#else

    if (o_index_stripchars) {
    // If the index is raw, we can only come here if nostemexp is unset
    // and we just need stem expansion.
  db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res, 
           getMaxExp(), m_field);
    goto termmatchtoresult;
    } 

    // No stem expansion when diacritic or case sensitivity is set, it
    // makes no sense (it would mess with the diacritics anyway if
...
    }
    sort(lexp.begin(), lexp.end());
    {
    vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
    lexp.resize(uit - lexp.begin());
    StemDb sdb(db.m_ndb->xrdb);
    vector<string> exp1;
    for (vector<string>::const_iterator it = lexp.begin(); 
         it != lexp.end(); it++) {
        sdb.stemExpand(getStemLang(), *it, exp1);
    }
    LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));

    // Expand the resulting list for case (all stemdb content
    // is lowercase)
...
    // Bogus wildcard expand to generate the result (possibly add prefixes)
exptotermatch:
    LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
    for (vector<string>::const_iterator it = lexp.begin();
     it != lexp.end(); it++) {
    db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
           getMaxExp(), m_field);
    }
#endif

    // Term match entries to vector of terms
termmatchtoresult:
    if (int(res.entries.size()) >= getMaxExp()) {
    ermsg = "Maximum term expansion size exceeded."
        " Maybe increase maxTermExpand.";
    return false;
    }
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); 
...
    oexp.push_back(prefix + term);

    // Remember the uterm-to-expansion links
    for (vector<string>::const_iterator it = oexp.begin(); 
     it != oexp.end(); it++) {
    m_hldata.terms[strip_prefix(*it)] = term;
    }
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return true;
}

...
    // vector)
    comb.pop_back();
    }
}

void SearchDataClauseSimple::processSimpleSpan(Rcl::Db &db, string& ermsg,
                         const string& span, 
                         int mods, void * pq)
{
    vector<Xapian::Query>& pqueries(*(vector<Xapian::Query>*)pq);
    LOGDEB0(("StringToXapianQ::processSimpleSpan: [%s] mods 0x%x\n",
        span.c_str(), (unsigned int)mods));
    vector<string> exp;  
    string sterm; // dumb version of user term

    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp)) {
    prefix = wrap_prefix(ftp->pfx);
    }

    if (!expandTerm(db, ermsg, mods, span, exp, sterm, prefix))
    return;
    
    // Set up the highlight data. No prefix should go in there
    for (vector<string>::const_iterator it = exp.begin(); 
     it != exp.end(); it++) {
    m_hldata.groups.push_back(vector<string>(1, it->substr(prefix.size())));
    m_hldata.slacks.push_back(0);
    m_hldata.grpsugidx.push_back(m_hldata.ugroups.size() - 1);
    }

    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
    m_curcl += exp.size();
...
    // If sterm (simplified original user term) is not null, give it a
    // relevance boost. We do this even if no expansion occurred (else
    // the non-expanded terms in a term list would end-up with even
    // less wqf). This does not happen if there are wildcards anywhere
    // in the search.
    // We normally boost the original term in the stem expansion list. Don't
    // do it if there are wildcards anywhere, this would skew the results.
    bool doBoostUserTerm = 
  (m_parentSearch && !m_parentSearch->haveWildCards()) || 
  (m_parentSearch == 0 && !m_haveWildCards);
    if (doBoostUserTerm && !sterm.empty()) {
        xq = Xapian::Query(Xapian::Query::OP_OR, xq, 
               Xapian::Query(prefix+sterm, 
                     original_term_wqf_booster));
    }
    pqueries.push_back(xq);
...

// User entry element had several terms: transform into a PHRASE or
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
                       TextSplitQ *splitData, 
                       int mods, void *pq,
                       bool useNear, int slack)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
    Xapian::Query::OP_PHRASE;
    vector<Xapian::Query> orqueries;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
    bool hadmultiple = false;
#endif
    vector<vector<string> >groups;

    string prefix;
    const FieldTraits *ftp;
    if (!m_field.empty() && db.fieldToTraits(m_field, &ftp)) {
    prefix = wrap_prefix(ftp->pfx);
    }

    if (mods & Rcl::SearchDataClause::SDCM_ANCHORSTART) {
    orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
...
    int lmods = mods;
    if (nostemexp)
        lmods |= SearchDataClause::SDCM_NOSTEMMING;
    string sterm;
    vector<string> exp;
    if (!expandTerm(db, ermsg, lmods, *it, exp, sterm, prefix))
        return;
    LOGDEB0(("ProcessPhraseOrNear: exp size %d\n", exp.size()));
    listVector("", exp);
    // groups is used for highlighting, we don't want prefixes in there.
    vector<string> noprefs;
...
    }
    groups.push_back(noprefs);
    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
                      exp.begin(), exp.end()));
    m_curcl += exp.size();
    if (m_curcl >= getMaxCl())
        return;
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
    if (exp.size() > 1) 
        hadmultiple = true;
#endif
...
    vector<string> comb;
    multiply_groups(groups.begin(), groups.end(), comb, allcombs);
    
    // Insert the search groups and slacks in the highlight data, with
    // a reference to the user entry that generated them:
    m_hldata.groups.insert(m_hldata.groups.end(), 
             allcombs.begin(), allcombs.end());
    m_hldata.slacks.insert(m_hldata.slacks.end(), allcombs.size(), slack);
    m_hldata.grpsugidx.insert(m_hldata.grpsugidx.end(), allcombs.size(), 
                  m_hldata.ugroups.size() - 1);
}

// Trim string beginning with ^ or ending with $ and convert to flags
static int stringToMods(string& s)
{
...
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
 *     composition of the phrase terms (no stem expansion in this case)
 * @return the subquery count (either or'd stem-expanded terms or phrase word
 *   count)
 */
bool SearchDataClauseSimple::processUserString(Rcl::Db &db, const string &iq,

                    int mods, string &ermsg,
                  void *pq, int slack, bool useNear)



{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);

    LOGDEB(("StringToXapianQ:pUS:: qstr [%s] fld [%s] mods 0x%x "
        "slack %d near %d\n", 
        iq.c_str(), m_field.c_str(), mods, slack, useNear));
    ermsg.erase();
    m_curcl = 0;
    const StopList stops = db.getStopList();

    // Simple whitespace-split input into user-level words and
    // double-quoted phrases: word1 word2 "this is a phrase". 
    //
    // The text splitter may further still decide that the resulting
...
        continue;// ??
        case 1: {
        int lmods = mods;
        if (splitter.nostemexps.front())
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
        m_hldata.ugroups.push_back(vector<string>(1, *it));
        processSimpleSpan(db, ermsg, splitter.terms.front(),
                lmods, &pqueries);
        }
        break;
        default:
        m_hldata.ugroups.push_back(vector<string>(1, *it));
        processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
                    useNear, slack);
        }
        if (m_curcl >= getMaxCl()) {
        ermsg = "Maximum Xapian query size exceeded."
            " Maybe increase maxXapianClauses.";
        break;
        }
    }
...
    }
    return true;
}

// Translate a simple OR, AND, or EXCL search clause. 
bool SearchDataClauseSimple::toNativeQuery(Rcl::Db &db, void *p)

{
    LOGDEB2(("SearchDataClauseSimple::toNativeQuery: stemlang [%s]\n",
         getStemLang().c_str()));

    Xapian::Query *qp = (Xapian::Query *)p;
...
    case SCLT_EXCL: op = Xapian::Query::OP_OR; break;
    default:
    LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
    return false;
    }

    vector<Xapian::Query> pqueries;









    if (!processUserString(db, m_text, getModifiers(), m_reason, &pqueries))
    return false;
    if (pqueries.empty()) {
    LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
    return true;
    }
...
//
// We do not split the entry any more (used to do some crazy thing
// about expanding multiple fragments in the past). We just take the
// value, blanks and all, and expand it against the indexed unsplit
// file names
// Build the native query for a file name clause.
// @param db index to expand the file name expression against
// @param p  untyped pointer to the Xapian::Query receiving the result
// @return always true
bool SearchDataClauseFilename::toNativeQuery(Rcl::Db &db, void *p)
{
    // The caller passes the output query as a void pointer; start
    // from an empty query.
    Xapian::Query& result = *static_cast<Xapian::Query *>(p);
    result = Xapian::Query();

    // Expansion limit: built-in default, overridden by the
    // "maxTermExpand" configuration parameter when it is set.
    int expansionLimit = 10000;
    db.getConf()->getConfParam("maxTermExpand", &expansionLimit);

    // Expand the whole clause text (not split into words) and OR
    // together everything that came back.
    vector<string> expanded;
    db.filenameWildExp(m_text, expanded, expansionLimit);
    result = Xapian::Query(Xapian::Query::OP_OR,
                           expanded.begin(), expanded.end());

    // A weight of exactly 1.0 needs no scaling.
    if (m_weight == 1.0)
        return true;

    result = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, result, m_weight);
    return true;
}

// Translate NEAR or PHRASE clause. 
bool SearchDataClauseDist::toNativeQuery(Rcl::Db &db, void *p)

{
    LOGDEB(("SearchDataClauseDist::toNativeQuery\n"));

    Xapian::Query *qp = (Xapian::Query *)p;
    *qp = Xapian::Query();

    vector<Xapian::Query> pqueries;
    Xapian::Query nq;







    // We produce a single phrase out of the user entry then use
    // stringToXapianQueries() to lowercase and simplify the phrase
    // terms etc. This will result into a single (complex)
    // Xapian::Query.
    if (m_text.find('\"') != string::npos) {
    m_text = neutchars(m_text, "\"");
    }
    string s = cstr_dquote + m_text + cstr_dquote;
    bool useNear = (m_tp == SCLT_NEAR);


    if (!processUserString(db, s, getModifiers(), m_reason, &pqueries, 
               m_slack, useNear))
    return false;
    if (pqueries.empty()) {
    LOGERR(("SearchDataClauseDist: resolved to null query\n"));
    return true;
    }

	a/src/rcldb/searchdata.cpp		b/src/rcldb/searchdata.cpp
	...		...
44	#include "termproc.h"	44	#include "termproc.h"
45	#include "synfamily.h"	45	#include "synfamily.h"
46	#include "stemdb.h"	46	#include "stemdb.h"
47	#include "expansiondbs.h"	47	#include "expansiondbs.h"
48	#include "base64.h"	48	#include "base64.h"
		49	#include "daterange.h"
49		50
50	namespace Rcl {	51	namespace Rcl {
51		52
52	typedef vector<SearchDataClause *>::iterator qlist_it_t;	53	typedef vector<SearchDataClause *>::iterator qlist_it_t;
53	typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;	54	typedef vector<SearchDataClause *>::const_iterator qlist_cit_t;
54		55
55	static const int original_term_wqf_booster = 10;	56	static const int original_term_wqf_booster = 10;
56		57
57	/* The dates-to-query routine is is lifted quasi-verbatim but	58	void SearchData::commoninit()
58	* modified from xapian-omega:date.cc. Copyright info:
59	*
60	* Copyright 1999,2000,2001 BrightStation PLC
61	* Copyright 2001 James Aylett
62	* Copyright 2001,2002 Ananova Ltd
63	* Copyright 2002 Intercede 1749 Ltd
64	* Copyright 2002,2003,2006 Olly Betts
65	*
66	* This program is free software; you can redistribute it and/or
67	* modify it under the terms of the GNU General Public License as
68	* published by the Free Software Foundation; either version 2 of the
69	* License, or (at your option) any later version.
70	*
71	* This program is distributed in the hope that it will be useful,
72	* but WITHOUT ANY WARRANTY; without even the implied warranty of
73	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
74	* GNU General Public License for more details.
75	*
76	* You should have received a copy of the GNU General Public License
77	* along with this program; if not, write to the Free Software
78	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
79	* USA
80	*/
81
82	#ifdef RCL_INDEX_STRIPCHARS
83	#define bufprefix(BUF, L) {(BUF)[0] = L;}
84	#define bpoffs() 1
85	#else
86	static inline void bufprefix(char *buf, char c)
87	{	59	{
88	if (o_index_stripchars) {	60	m_haveDates = false;
89	buf[0] = c;	61	m_maxSize = size_t(-1);
90	} else {	62	m_minSize = size_t(-1);
91	buf[0] = ':';	63	m_haveWildCards = false;
92	buf[1] = c;	64	m_softmaxexpand = -1;
93	buf[2] = ':';	65	m_autodiacsens = false;
94	}	66	m_autocasesens = true;
95	}	67	m_maxexp = 10000;
96	static inline int bpoffs()	68	m_maxcl = 100000;
97	{
98	return o_index_stripchars ? 1 : 3;
99	}
100	#endif
101
102	static Xapian::Query
103	date_range_filter(int y1, int m1, int d1, int y2, int m2, int d2)
104	{
105	// Xapian uses a smallbuf and snprintf. Can't be bothered, we're
106	// only doing %d's !
107	char buf[200];
108	bufprefix(buf, 'D');
109	sprintf(buf+bpoffs(), "%04d%02d", y1, m1);
110	vector<Xapian::Query> v;
111
112	int d_last = monthdays(m1, y1);
113	int d_end = d_last;
114	if (y1 == y2 && m1 == m2 && d2 < d_last) {
115	d_end = d2;
116	}
117	// Deal with any initial partial month
118	if (d1 > 1 \|\| d_end < d_last) {
119	for ( ; d1 <= d_end ; d1++) {
120	sprintf(buf + 6 + bpoffs(), "%02d", d1);
121	v.push_back(Xapian::Query(buf));
122	}
123	} else {
124	bufprefix(buf, 'M');
125	v.push_back(Xapian::Query(buf));
126	}
127
128	if (y1 == y2 && m1 == m2) {
129	return Xapian::Query(Xapian::Query::OP_OR, v.begin(), v.end());
130	}
131
132	int m_last = (y1 < y2) ? 12 : m2 - 1;
133	while (++m1 <= m_last) {
134	sprintf(buf + 4 + bpoffs(), "%02d", m1);
135	bufprefix(buf, 'M');
136	v.push_back(Xapian::Query(buf));
137	}
138
139	if (y1 < y2) {
140	while (++y1 < y2) {
141	sprintf(buf + bpoffs(), "%04d", y1);
142	bufprefix(buf, 'Y');
143	v.push_back(Xapian::Query(buf));
144	}
145	sprintf(buf + bpoffs(), "%04d", y2);
146	bufprefix(buf, 'M');
147	for (m1 = 1; m1 < m2; m1++) {
148	sprintf(buf + 4 + bpoffs(), "%02d", m1);
149	v.push_back(Xapian::Query(buf));
150	}
151	}
152
153	sprintf(buf + 2 + bpoffs(), "%02d", m2);
154
155	// Deal with any final partial month
156	if (d2 < monthdays(m2, y2)) {
157	bufprefix(buf, 'D');
158	for (d1 = 1 ; d1 <= d2; d1++) {
159	sprintf(buf + 6 + bpoffs(), "%02d", d1);
160	v.push_back(Xapian::Query(buf));
161	}
162	} else {
163	bufprefix(buf, 'M');
164	v.push_back(Xapian::Query(buf));
165	}
166
167	return Xapian::Query(Xapian::Query::OP_OR, v.begin(), v.end());
168	}	69	}
169		70
170	// Expand categories and mime type wild card exps	71	// Expand categories and mime type wild card exps
171	// Actually, using getAllMimeTypes() here is a bit problematic because	72	// Actually, using getAllMimeTypes() here is a bit problematic because
172	// there maybe other types in the index, not indexed by content, but	73	// there maybe other types in the index, not indexed by content, but
	...		...
186	if (cfg->isMimeCategory(*it)) {	87	if (cfg->isMimeCategory(*it)) {
187	vector<string>tps;	88	vector<string>tps;
188	cfg->getMimeCatTypes(*it, tps);	89	cfg->getMimeCatTypes(*it, tps);
189	exptps.insert(exptps.end(), tps.begin(), tps.end());	90	exptps.insert(exptps.end(), tps.begin(), tps.end());
190	} else {	91	} else {
		92	bool matched = false;
191	for (vector<string>::const_iterator ait = alltypes.begin();	93	for (vector<string>::const_iterator ait = alltypes.begin();
192	ait != alltypes.end(); ait++) {	94	ait != alltypes.end(); ait++) {
193	if (fnmatch(it->c_str(), ait->c_str(), FNM_CASEFOLD)	95	if (fnmatch(it->c_str(), ait->c_str(), FNM_CASEFOLD)
194	!= FNM_NOMATCH) {	96	!= FNM_NOMATCH) {
195	exptps.push_back(*ait);	97	exptps.push_back(*ait);
		98	matched = true;
196	}	99	}
197	}	100	}
		101	if (!matched)
		102	exptps.push_back(it->c_str());
198	}	103	}
199	}	104	}
200	tps = exptps;	105	tps = exptps;
201	return true;	106	return true;
202	}	107	}
203		108
204	bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,	109	bool SearchData::clausesToQuery(Rcl::Db &db, SClType tp,
205	vector<SearchDataClause*>& query,	110	vector<SearchDataClause*>& query,
206	string& reason, void *d,	111	string& reason, void *d)
207	int maxexp, int maxcl)
208	{	112	{
209	Xapian::Query xq;	113	Xapian::Query xq;
210	for (qlist_it_t it = query.begin(); it != query.end(); it++) {	114	for (qlist_it_t it = query.begin(); it != query.end(); it++) {
211	Xapian::Query nq;	115	Xapian::Query nq;
212	if (!(*it)->toNativeQuery(db, &nq, maxexp, maxcl)) {	116	if (!(*it)->toNativeQuery(db, &nq)) {
213	LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",	117	LOGERR(("SearchData::clausesToQuery: toNativeQuery failed: %s\n",
214	(*it)->getReason().c_str()));	118	(*it)->getReason().c_str()));
215	reason += (*it)->getReason() + " ";	119	reason += (*it)->getReason() + " ";
216	return false;	120	return false;
217	}	121	}
	...		...
238	else	142	else
239	xq = nq;	143	xq = nq;
240	} else {	144	} else {
241	xq = Xapian::Query(op, xq, nq);	145	xq = Xapian::Query(op, xq, nq);
242	}	146	}
243	if (int(xq.get_length()) >= maxcl) {	147	if (int(xq.get_length()) >= getMaxCl()) {
244	LOGERR(("Maximum Xapian query size exceeded."	148	LOGERR(("Maximum Xapian query size exceeded."
245	" Maybe increase maxXapianClauses."));	149	" Maybe increase maxXapianClauses."));
246	m_reason += "Maximum Xapian query size exceeded."	150	m_reason += "Maximum Xapian query size exceeded."
247	" Maybe increase maxXapianClauses.";	151	" Maybe increase maxXapianClauses.";
248	return false;	152	return false;
	...		...
253		157
254	((Xapian::Query )d) = xq;	158	((Xapian::Query )d) = xq;
255	return true;	159	return true;
256	}	160	}
257		161
258	static string tpToString(SClType tp)
259	{
260	switch (tp) {
261	case SCLT_AND: return "AND";
262	case SCLT_OR: return "OR";
263	case SCLT_EXCL: return "EX";
264	case SCLT_FILENAME: return "FN";
265	case SCLT_PHRASE: return "PH";
266	case SCLT_NEAR: return "NE";
267	case SCLT_SUB: return "SU"; // Unsupported actually
268	default: return "UN";
269	}
270	}
271
272	string SearchData::asXML()
273	{
274	LOGDEB(("SearchData::asXML\n"));
275	ostringstream os;
276
277	// Searchdata
278	os << "<SD>" << endl;
279
280	// Clause list
281	os << "<CL>" << endl;
282	if (m_tp != SCLT_AND)
283	os << "<CLT>" << tpToString(m_tp) << "</CLT>" << endl;
284	for (unsigned int i = 0; i < m_query.size(); i++) {
285	SearchDataClause *c = m_query[i];
286	if (c->getTp() == SCLT_SUB) {
287	LOGERR(("SearchData::asXML: can't do subclauses !\n"));
288	continue;
289	}
290	SearchDataClauseSimple *cl =
291	dynamic_cast<SearchDataClauseSimple*>(c);
292	os << "<C>" << endl;
293	if (cl->getTp() != SCLT_AND) {
294	os << "<CT>" << tpToString(cl->getTp()) << "</CT>" << endl;
295	}
296	if (cl->getTp() != SCLT_FILENAME && !cl->getfield().empty()) {
297	os << "<F>" << base64_encode(cl->getfield()) << "</F>" << endl;
298	}
299	os << "<T>" << base64_encode(cl->gettext()) << "</T>" << endl;
300	if (cl->getTp() == SCLT_NEAR \|\| cl->getTp() == SCLT_PHRASE) {
301	SearchDataClauseDist *cld =
302	dynamic_cast<SearchDataClauseDist*>(cl);
303	os << "<S>" << cld->getslack() << "</S>" << endl;
304	}
305	os << "</C>" << endl;
306	}
307	os << "</CL>" << endl;
308
309	if (m_haveDates) {
310	if (m_dates.y1 > 0) {
311	os << "<DMI>" <<
312	"<D>" << m_dates.d1 << "</D>" <<
313	"<M>" << m_dates.m1 << "</M>" <<
314	"<Y>" << m_dates.y1 << "</Y>"
315	<< "</DMI>" << endl;
316	}
317	if (m_dates.y2 > 0) {
318	os << "<DMA>" <<
319	"<D>" << m_dates.d2 << "</D>" <<
320	"<M>" << m_dates.m2 << "</M>" <<
321	"<Y>" << m_dates.y2 << "</Y>"
322	<< "</DMA>" << endl;
323	}
324	}
325
326
327	if (m_minSize != size_t(-1)) {
328	os << "<MIS>" << m_minSize << "</MIS>" << endl;
329	}
330	if (m_maxSize != size_t(-1)) {
331	os << "<MAS>" << m_maxSize << "</MAS>" << endl;
332	}
333
334	if (!m_filetypes.empty()) {
335	os << "<ST>";
336	for (vector<string>::iterator it = m_filetypes.begin();
337	it != m_filetypes.end(); it++) {
338	os << *it << " ";
339	}
340	os << "</ST>" << endl;
341	}
342
343	if (!m_nfiletypes.empty()) {
344	os << "<IT>";
345	for (vector<string>::iterator it = m_nfiletypes.begin();
346	it != m_nfiletypes.end(); it++) {
347	os << *it << " ";
348	}
349	os << "</IT>" << endl;
350	}
351
352	for (vector<DirSpec>::const_iterator dit = m_dirspecs.begin();
353	dit != m_dirspecs.end(); dit++) {
354	if (dit->exclude) {
355	os << "<ND>" << base64_encode(dit->dir) << "</ND>" << endl;
356	} else {
357	os << "<YD>" << base64_encode(dit->dir) << "</YD>" << endl;
358	}
359	}
360	os << "</SD>";
361	return os.str();
362	}
363
364	bool SearchData::toNativeQuery(Rcl::Db &db, void *d, int maxexp, int maxcl)	162	bool SearchData::toNativeQuery(Rcl::Db &db, void *d)
365	{	163	{
366	LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));	164	LOGDEB(("SearchData::toNativeQuery: stemlang [%s]\n", m_stemlang.c_str()));
367	m_reason.erase();	165	m_reason.erase();
368		166
369	// Walk the clause list translating each in turn and building the	167	// Walk the clause list translating each in turn and building the
370	// Xapian query tree	168	// Xapian query tree
371	Xapian::Query xq;	169	Xapian::Query xq;
372	if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq, maxexp, maxcl)) {	170	if (!clausesToQuery(db, m_tp, m_query, m_reason, &xq)) {
373	LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",	171	LOGERR(("SearchData::toNativeQuery: clausesToQuery failed. reason: %s\n",
374	m_reason.c_str()));	172	m_reason.c_str()));
375	return false;	173	return false;
376	}	174	}
377		175
	...		...
630	m_query.push_back(cl);	428	m_query.push_back(cl);
631	return true;	429	return true;
632	}	430	}
633		431
634	// Make me all new	432	// Make me all new
635	void SearchData::erase() {	433	void SearchData::erase()
		434	{
636	LOGDEB0(("SearchData::erase\n"));	435	LOGDEB0(("SearchData::erase\n"));
637	m_tp = SCLT_AND;	436	m_tp = SCLT_AND;
638	for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)	437	for (qlist_it_t it = m_query.begin(); it != m_query.end(); it++)
639	delete *it;	438	delete *it;
640	m_query.clear();	439	m_query.clear();
	...		...
727	TextSplitQ *m_ts;	526	TextSplitQ *m_ts;
728	map<int, string> m_terms;	527	map<int, string> m_terms;
729	map<int, bool> m_nste;	528	map<int, bool> m_nste;
730	};	529	};
731		530
732	// A class used to translate a user compound string (not a query
733	// language string) as may be entered in any_terms/all_terms search
734	// entry fields, ex: [term1 "a phrase" term3] into a xapian query
735	// tree.
736	// The object keeps track of the query terms and term groups while
737	// translating.
738	class StringToXapianQ {
739	public:
740	StringToXapianQ(Db& db, HighlightData& hld, const string& field,
741	const string &stmlng, bool boostUser, int maxexp, int maxcl)
742	: m_db(db), m_field(field), m_stemlang(stmlng),
743	m_doBoostUserTerms(boostUser), m_hld(hld), m_autodiacsens(false),
744	m_autocasesens(true), m_maxexp(maxexp), m_maxcl(maxcl), m_curcl(0)
745	{
746	m_db.getConf()->getConfParam("autodiacsens", &m_autodiacsens);
747	m_db.getConf()->getConfParam("autocasesens", &m_autocasesens);
748	}
749
750	bool processUserString(const string &iq,
751	int mods,
752	string &ermsg,
753	vector<Xapian::Query> &pqueries,
754	int slack = 0, bool useNear = false);
755	private:
756	bool expandTerm(string& ermsg, int mods,
757	const string& term, vector<string>& exp,
758	string& sterm, const string& prefix);
759	// After splitting entry on whitespace: process non-phrase element
760	void processSimpleSpan(string& ermsg, const string& span,
761	int mods,
762	vector<Xapian::Query> &pqueries);
763	// Process phrase/near element
764	void processPhraseOrNear(string& ermsg, TextSplitQ *splitData,
765	int mods,
766	vector<Xapian::Query> &pqueries,
767	bool useNear, int slack);
768
769	Db& m_db;
770	const string& m_field;
771	const string& m_stemlang;
772	const bool m_doBoostUserTerms;
773	HighlightData& m_hld;
774	bool m_autodiacsens;
775	bool m_autocasesens;
776	int m_maxexp;
777	int m_maxcl;
778	int m_curcl;
779	};
780		531
781	#if 1	532	#if 1
782	static void listVector(const string& what, const vector<string>&l)	533	static void listVector(const string& what, const vector<string>&l)
783	{	534	{
784	string a;	535	string a;
	...		...
798	* @param sterm output original input term if there were no wildcards	549	* @param sterm output original input term if there were no wildcards
799	* @param prefix field prefix in index. We could recompute it, but the caller	550	* @param prefix field prefix in index. We could recompute it, but the caller
800	* has it already. Used in the simple case where there is nothing to expand,	551	* has it already. Used in the simple case where there is nothing to expand,
801	* and we just return the prefixed term (else Db::termMatch deals with it).	552	* and we just return the prefixed term (else Db::termMatch deals with it).
802	*/	553	*/
803	bool StringToXapianQ::expandTerm(string& ermsg, int mods,	554	bool SearchDataClauseSimple::expandTerm(Rcl::Db &db,
		555	string& ermsg, int mods,
804	const string& term,	556	const string& term,
805	vector<string>& oexp, string &sterm,	557	vector<string>& oexp, string &sterm,
806	const string& prefix)	558	const string& prefix)
807	{	559	{
808	LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",	560	LOGDEB0(("expandTerm: mods 0x%x fld [%s] trm [%s] lang [%s]\n",
809	mods, m_field.c_str(), term.c_str(), m_stemlang.c_str()));	561	mods, m_field.c_str(), term.c_str(), getStemLang().c_str()));
810	sterm.clear();	562	sterm.clear();
811	oexp.clear();	563	oexp.clear();
812	if (term.empty())	564	if (term.empty())
813	return true;	565	return true;
814		566
815	bool haswild = term.find_first_of(cstr_minwilds) != string::npos;	567	bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
816		568
817	// If there are no wildcards, add term to the list of user-entered terms	569	// If there are no wildcards, add term to the list of user-entered terms
818	if (!haswild)	570	if (!haswild)
819	m_hld.uterms.insert(term);	571	m_hldata.uterms.insert(term);
820		572
821	bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;	573	bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
822		574
823	// No stem expansion if there are wildcards or if prevented by caller	575	// No stem expansion if there are wildcards or if prevented by caller
824	if (haswild \|\| m_stemlang.empty()) {	576	if (haswild \|\| getStemLang().empty()) {
825	LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));	577	LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
826	nostemexp = true;	578	nostemexp = true;
827	}	579	}
828		580
829	bool noexpansion = nostemexp && !haswild;	581	bool noexpansion = nostemexp && !haswild;
	...		...
840		592
841	// If any character has a diacritic, we become	593	// If any character has a diacritic, we become
842	// diacritic-sensitive. Note that the way that the test is	594	// diacritic-sensitive. Note that the way that the test is
843	// performed (conversion+comparison) will automatically ignore	595	// performed (conversion+comparison) will automatically ignore
844	// accented characters which are actually a separate letter	596	// accented characters which are actually a separate letter
845	if (m_autodiacsens && unachasaccents(term)) {	597	if (getAutoDiac() && unachasaccents(term)) {
846	LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));	598	LOGDEB0(("expandTerm: term has accents -> diac-sensitive\n"));
847	diac_sensitive = true;	599	diac_sensitive = true;
848	}	600	}
849		601
850	// If any character apart from the first is uppercase, we become	602	// If any character apart from the first is uppercase, we become
851	// case-sensitive. The first character is reserved for	603	// case-sensitive. The first character is reserved for
852	// turning off stemming. You need to use a query language	604	// turning off stemming. You need to use a query language
853	// modifier to search for Floor in a case-sensitive way.	605	// modifier to search for Floor in a case-sensitive way.
854	Utf8Iter it(term);	606	Utf8Iter it(term);
855	it++;	607	it++;
856	if (m_autocasesens && unachasuppercase(term.substr(it.getBpos()))) {	608	if (getAutoCase() && unachasuppercase(term.substr(it.getBpos()))) {
857	LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));	609	LOGDEB0(("expandTerm: term has uppercase -> case-sensitive\n"));
858	case_sensitive = true;	610	case_sensitive = true;
859	}	611	}
860		612
861	// If we are sensitive to case or diacritics turn stemming off	613	// If we are sensitive to case or diacritics turn stemming off
	...		...
870	#endif	622	#endif
871		623
872	if (noexpansion) {	624	if (noexpansion) {
873	sterm = term;	625	sterm = term;
874	oexp.push_back(prefix + term);	626	oexp.push_back(prefix + term);
875	m_hld.terms[term] = m_hld.uterms.size() - 1;	627	m_hldata.terms[term] = m_hldata.uterms.size() - 1;
876	LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));	628	LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
877	return true;	629	return true;
878	}	630	}
879		631
880	// Make objects before the goto jungle to avoid compiler complaints	632	// Make objects before the goto jungle to avoid compiler complaints
881	SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);	633	SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
882	XapComputableSynFamMember synac(m_db.m_ndb->xrdb, synFamDiCa, "all",	634	XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
883	&unacfoldtrans);	635	&unacfoldtrans);
884	// This will hold the result of case and diacritics expansion as input	636	// This will hold the result of case and diacritics expansion as input
885	// to stem expansion.	637	// to stem expansion.
886	vector<string> lexp;	638	vector<string> lexp;
887		639
	...		...
889	if (haswild) {	641	if (haswild) {
890	// Note that if there are wildcards, we do a direct from-index	642	// Note that if there are wildcards, we do a direct from-index
891	// expansion, which means that we are casediac-sensitive. There	643	// expansion, which means that we are casediac-sensitive. There
892	// would be nothing to prevent us from expanding from the casediac	644	// would be nothing to prevent us from expanding from the casediac
893	// synonyms first. To be done later	645	// synonyms first. To be done later
894	m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang,term,res,m_maxexp,m_field);	646	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
		647	getMaxExp(), m_field);
895	goto termmatchtoresult;	648	goto termmatchtoresult;
896	}	649	}
897		650
898	sterm = term;	651	sterm = term;
899		652
900	#ifdef RCL_INDEX_STRIPCHARS	653	#ifdef RCL_INDEX_STRIPCHARS
901		654
902	m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, res, m_maxexp, m_field);	655	db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
		656	getMaxExp(), m_field);
903		657
904	#else	658	#else
905		659
906	if (o_index_stripchars) {	660	if (o_index_stripchars) {
907	// If the index is raw, we can only come here if nostemexp is unset	661	// If the index is raw, we can only come here if nostemexp is unset
908	// and we just need stem expansion.	662	// and we just need stem expansion.
909	m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang,term,res,m_maxexp,m_field);	663	db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
		664	getMaxExp(), m_field);
910	goto termmatchtoresult;	665	goto termmatchtoresult;
911	}	666	}
912		667
913	// No stem expansion when diacritic or case sensitivity is set, it	668	// No stem expansion when diacritic or case sensitivity is set, it
914	// makes no sense (it would mess with the diacritics anyway if	669	// makes no sense (it would mess with the diacritics anyway if
	...		...
948	}	703	}
949	sort(lexp.begin(), lexp.end());	704	sort(lexp.begin(), lexp.end());
950	{	705	{
951	vector<string>::iterator uit = unique(lexp.begin(), lexp.end());	706	vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
952	lexp.resize(uit - lexp.begin());	707	lexp.resize(uit - lexp.begin());
953	StemDb db(m_db.m_ndb->xrdb);	708	StemDb sdb(db.m_ndb->xrdb);
954	vector<string> exp1;	709	vector<string> exp1;
955	for (vector<string>::const_iterator it = lexp.begin();	710	for (vector<string>::const_iterator it = lexp.begin();
956	it != lexp.end(); it++) {	711	it != lexp.end(); it++) {
957	db.stemExpand(m_stemlang, *it, exp1);	712	sdb.stemExpand(getStemLang(), *it, exp1);
958	}	713	}
959	LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));	714	LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
960		715
961	// Expand the resulting list for case (all stemdb content	716	// Expand the resulting list for case (all stemdb content
962	// is lowercase)	717	// is lowercase)