recoll / Code / Diff of /src/rcldb/searchdata.cpp

Diff of /src/rcldb/searchdata.cpp [5463ea] .. [0821f0]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
    groups.insert(groups.end(), m_groups.begin(), m_groups.end());
    return true;
    }

private:
    // Expand a single term into the list of terms to search for (stem
    // and/or wildcard expansion). `dont` disables stem expansion only, not
    // wildcard expansion; it is the parameter called nostemexp in the
    // definition. `exp` receives the expansion list and `sterm` the original
    // term when it holds no wildcards.
    void expandTerm(bool dont, const string& term, list<string>& exp, 
              string& sterm);
    // After splitting entry on whitespace: process non-phrase element
    void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
    // Process phrase/near element
    void processPhraseOrNear(wsQData *splitData, 
...
    // Single terms and phrases resulting from breaking up text.
    // Both containers are cleared at the start of processUserString().
    // m_terms: receives the expanded terms of each simple span
    // (see processSimpleSpan()).
    vector<string>          m_terms;
    // m_groups: term groups; appended to a caller-supplied `groups`
    // container by an accessor of this class.
    vector<vector<string> > m_groups; 
};

/** Expand stem and wildcards
 *
 * @param nostemexp don't perform stem expansion. This is mainly used to
 *   prevent stem expansion inside phrases (because the user probably
 *   does not expect it). This does NOT prevent wild card expansion.
 *   Other factors than nostemexp can prevent stem expansion: 
 *   a null stemlang, resulting from a global user preference, a
 *   capitalized term, or wildcard(s)
 * @param term input single word
 * @param exp output expansion list
 * @param sterm output: the original input term, set only when the term
 *   contains no wildcards
 */
void StringToXapianQ::expandTerm(bool nostemexp, 
                      const string& term, 
                      list<string>& exp,
                      string &sterm)
{
    LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n", 
         term.c_str(), m_stemlang.c_str(), nostemexp));
    sterm.erase();
    exp.clear();
    if (term.empty()) {
    return;
    }




    bool haswild = term.find_first_of("*?[") != string::npos;

    // No stemming if there are wildcards or prevented globally.
    if (haswild || m_stemlang.empty())
...
    }
    }

    if (nostemexp && !haswild) {
    // Neither stemming nor wildcard expansion: just the word
    sterm = term;
    exp.push_front(term);
    exp.resize(1);
    } else {
    list<TermMatchEntry> l;
    if (haswild) {
        m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
    } else {
        sterm = term;
        m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
    }
    for (list<TermMatchEntry>::const_iterator it = l.begin(); 
         it != l.end(); it++) {
        exp.push_back(it->term);
    }
...
void StringToXapianQ::processSimpleSpan(const string& span, 
                    list<Xapian::Query> &pqueries)
{
    list<string> exp;  
    string sterm; // dumb version of user term
    expandTerm(false, span, exp, sterm);
    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
    addPrefix(exp, m_prefix);
    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());

...
    // inside NEAR, all others must be leafs.
    bool nostemexp = (op == Xapian::Query::OP_PHRASE) || hadmultiple;

    string sterm;
    list<string>exp;
    expandTerm(nostemexp, *it, exp, sterm);
    groups.push_back(vector<string>(exp.begin(), exp.end()));
    addPrefix(exp, m_prefix);
    orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR, 
                      exp.begin(), exp.end()));
#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
...
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
 *     composition of the phrase terms (no stem expansion in this case)
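 * @return boolean status. NOTE(review): the signature returns bool, not a
 *   count; the subqueries (either or'd stem-expanded terms or phrase word
 *   compositions) are presumably delivered through pqueries -- confirm
 *   the exact true/false semantics against the function body.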
 */
bool StringToXapianQ::processUserString(const string &_iq,
                    string &ermsg,
                    list<Xapian::Query> &pqueries,
                    const StopList& stops,
                    int slack, 
                    bool useNear
                    )
{
    LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
    ermsg.erase();
    m_terms.clear();
    m_groups.clear();

    // First unaccent/normalize the input. Doing this before splitting
    // keeps the same order as at indexing time: unac, then split. As
    // the character count can change during normalization, this is
    // especially important for CJK because the artificial CJK split is
    // based on character counts.
    string iq;
    dumb_string(_iq, iq);

    // Simple whitespace-split of the input into user-level words and
    // double-quoted phrases: word1 word2 "this is a phrase". The text
    // splitter may still decide later that the resulting "words"
    // are really phrases; this depends on separators, e.g. [paul@dom.net]

	a/src/rcldb/searchdata.cpp		b/src/rcldb/searchdata.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.30 2008-12-17 14:26:09 dockes Exp $ (C) 2006 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.31 2008-12-19 09:44:39 dockes Exp $ (C) 2006 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
227	groups.insert(groups.end(), m_groups.begin(), m_groups.end());	227	groups.insert(groups.end(), m_groups.begin(), m_groups.end());
228	return true;	228	return true;
229	}	229	}
230		230
231	private:	231	private:
232	void stripExpandTerm(bool dont, const string& term, list<string>& exp,	232	void expandTerm(bool dont, const string& term, list<string>& exp,
233	string& sterm);	233	string& sterm);
234	// After splitting entry on whitespace: process non-phrase element	234	// After splitting entry on whitespace: process non-phrase element
235	void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);	235	void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);
236	// Process phrase/near element	236	// Process phrase/near element
237	void processPhraseOrNear(wsQData *splitData,	237	void processPhraseOrNear(wsQData *splitData,
	...		...
245	// Single terms and phrases resulting from breaking up text;	245	// Single terms and phrases resulting from breaking up text;
246	vector<string> m_terms;	246	vector<string> m_terms;
247	vector<vector<string> > m_groups;	247	vector<vector<string> > m_groups;
248	};	248	};
249		249
250	/** Unaccent and lowercase term, possibly expand stem and wildcards	250	/** Expand stem and wildcards
251	*	251	*
252	* @param nostemexp don't perform stem expansion. This is mainly used to	252	* @param nostemexp don't perform stem expansion. This is mainly used to
253	* prevent stem expansion inside phrases (because the user probably	253	* prevent stem expansion inside phrases (because the user probably
254	* does not expect it). This does NOT prevent wild card expansion.	254	* does not expect it). This does NOT prevent wild card expansion.
255	* Other factors than nostemexp can prevent stem expansion:	255	* Other factors than nostemexp can prevent stem expansion:
256	* a null stemlang, resulting from a global user preference, a	256	* a null stemlang, resulting from a global user preference, a
257	* capitalized term, or wildcard(s)	257	* capitalized term, or wildcard(s)
258	* @param term input single word	258	* @param term input single word
259	* @param exp output expansion list	259	* @param exp output expansion list
260	* @param sterm output lower-cased+unaccented version of the input term	260	* @param sterm output original input term if there were no wildcards
261	* (only for stem expansion, not wildcards)
262	*/	261	*/
263	void StringToXapianQ::stripExpandTerm(bool nostemexp,	262	void StringToXapianQ::expandTerm(bool nostemexp,
264	const string& term,	263	const string& term,
265	list<string>& exp,	264	list<string>& exp,
266	string &sterm)	265	string &sterm)
267	{	266	{
268	LOGDEB2(("stripExpandTerm: term [%s] stemlang [%s] nostemexp %d\n",	267	LOGDEB2(("expandTerm: term [%s] stemlang [%s] nostemexp %d\n",
269	term.c_str(), m_stemlang.c_str(), nostemexp));	268	term.c_str(), m_stemlang.c_str(), nostemexp));
270	sterm.erase();	269	sterm.erase();
271	exp.clear();	270	exp.clear();
272	if (term.empty()) {	271	if (term.empty()) {
273	return;	272	return;
274	}	273	}
275	// term1 is lowercase and without diacritics
276	string term1;
277	dumb_string(term, term1);
278		274
279	bool haswild = term.find_first_of("*?[") != string::npos;	275	bool haswild = term.find_first_of("*?[") != string::npos;
280		276
281	// No stemming if there are wildcards or prevented globally.	277	// No stemming if there are wildcards or prevented globally.
282	if (haswild \|\| m_stemlang.empty())	278	if (haswild \|\| m_stemlang.empty())
	...		...
297	}	293	}
298	}	294	}
299		295
300	if (nostemexp && !haswild) {	296	if (nostemexp && !haswild) {
301	// Neither stemming nor wildcard expansion: just the word	297	// Neither stemming nor wildcard expansion: just the word
302	sterm = term1;	298	sterm = term;
303	exp.push_front(term1);	299	exp.push_front(term);
304	exp.resize(1);	300	exp.resize(1);
305	} else {	301	} else {
306	list<TermMatchEntry> l;	302	list<TermMatchEntry> l;
307	if (haswild) {	303	if (haswild) {
308	m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term1, l);	304	m_db.termMatch(Rcl::Db::ET_WILD, m_stemlang, term, l);
309	} else {	305	} else {
310	sterm = term1;	306	sterm = term;
311	m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term1, l);	307	m_db.termMatch(Rcl::Db::ET_STEM, m_stemlang, term, l);
312	}	308	}
313	for (list<TermMatchEntry>::const_iterator it = l.begin();	309	for (list<TermMatchEntry>::const_iterator it = l.begin();
314	it != l.end(); it++) {	310	it != l.end(); it++) {
315	exp.push_back(it->term);	311	exp.push_back(it->term);
316	}	312	}
	...		...
363	void StringToXapianQ::processSimpleSpan(const string& span,	359	void StringToXapianQ::processSimpleSpan(const string& span,
364	list<Xapian::Query> &pqueries)	360	list<Xapian::Query> &pqueries)
365	{	361	{
366	list<string> exp;	362	list<string> exp;
367	string sterm; // dumb version of user term	363	string sterm; // dumb version of user term
368	stripExpandTerm(false, span, exp, sterm);	364	expandTerm(false, span, exp, sterm);
369	m_terms.insert(m_terms.end(), exp.begin(), exp.end());	365	m_terms.insert(m_terms.end(), exp.begin(), exp.end());
370	addPrefix(exp, m_prefix);	366	addPrefix(exp, m_prefix);
371	// Push either term or OR of stem-expanded set	367	// Push either term or OR of stem-expanded set
372	Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());	368	Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
373		369
	...		...
407	// inside NEAR, all others must be leafs.	403	// inside NEAR, all others must be leafs.
408	bool nostemexp = (op == Xapian::Query::OP_PHRASE) \|\| hadmultiple;	404	bool nostemexp = (op == Xapian::Query::OP_PHRASE) \|\| hadmultiple;
409		405
410	string sterm;	406	string sterm;
411	list<string>exp;	407	list<string>exp;
412	stripExpandTerm(nostemexp, *it, exp, sterm);	408	expandTerm(nostemexp, *it, exp, sterm);
413	groups.push_back(vector<string>(exp.begin(), exp.end()));	409	groups.push_back(vector<string>(exp.begin(), exp.end()));
414	addPrefix(exp, m_prefix);	410	addPrefix(exp, m_prefix);
415	orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,	411	orqueries.push_back(Xapian::Query(Xapian::Query::OP_OR,
416	exp.begin(), exp.end()));	412	exp.begin(), exp.end()));
417	#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF	413	#ifdef XAPIAN_NEAR_EXPAND_SINGLE_BUF
	...		...
446	* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR	442	* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
447	* composition of the phrase terms (no stem expansion in this case)	443	* composition of the phrase terms (no stem expansion in this case)
448	* @return the subquery count (either or'd stem-expanded terms or phrase word	444	* @return the subquery count (either or'd stem-expanded terms or phrase word
449	* count)	445	* count)
450	*/	446	*/
451	bool StringToXapianQ::processUserString(const string &iq,	447	bool StringToXapianQ::processUserString(const string &_iq,
452	string &ermsg,	448	string &ermsg,
453	list<Xapian::Query> &pqueries,	449	list<Xapian::Query> &pqueries,
454	const StopList& stops,	450	const StopList& stops,
455	int slack,	451	int slack,
456	bool useNear	452	bool useNear
457	)	453	)
458	{	454	{
459	LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));	455	LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));
460	ermsg.erase();	456	ermsg.erase();
461	m_terms.clear();	457	m_terms.clear();
462	m_groups.clear();	458	m_groups.clear();
		459
		460	// First unaccent/normalize the input: do it first so that it
		461	// happens in the same order as when indexing: unac then split. As
		462	// the character count can change during normalisation, this is
		463	// specially important for cjk because the artificial cjk split is
		464	// based on character counts
		465	string iq;
		466	dumb_string(_iq, iq);
463		467
464	// Simple whitespace-split input into user-level words and	468	// Simple whitespace-split input into user-level words and
465	// double-quoted phrases: word1 word2 "this is a phrase". The text	469	// double-quoted phrases: word1 word2 "this is a phrase". The text
466	// splitter may further still decide that the resulting "words"	470	// splitter may further still decide that the resulting "words"
467	// are really phrases, this depends on separators: [paul@dom.net]	471	// are really phrases, this depends on separators: [paul@dom.net]