recoll / Code / Diff of /src/rcldb/searchdata.cpp

Diff of /src/rcldb/searchdata.cpp [765b6c] .. [7dcc7c]

Switch to unified view


...
class wsQData : public TextSplitCB {
 public:
    wsQData(const StopList &_stops) 
    : stops(_stops), alltermcount(0)
    {}
    bool takeword(const std::string &interm, int , int, int) {
  alltermcount++;
  LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));

  // Check if the first letter is a majuscule in which
  // case we do not want to do stem expansion. Note that
  // the test is convoluted and possibly problematic
  string noacterm, noaclowterm;
  if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
      LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
      return true;
  } 
  if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
      LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
      return true;
  }
  bool nostemexp = false;
  Utf8Iter it1(noacterm);
  Utf8Iter it2(noaclowterm);
  if (*it1 != *it2)
      nostemexp = true;

  if (stops.hasStops() && stops.isStop(noaclowterm)) {
      LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
      return true;
  }
  terms.push_back(noaclowterm);
  nostemexps.push_back(nostemexp);
  return true;
    }

    vector<string> terms;
    vector<bool>   nostemexps;









    const StopList &stops;
    // Count of terms including stopwords: this is for adjusting
    // phrase/near slack
    int alltermcount; 
};
...

private:
    // Expand one term into the exp list. The first argument receives the
    // caller's nostemexp flag (see processSimpleSpan() and
    // processPhraseOrNear()). sterm is an output which callers describe
    // as the "dumb version of user term".
    void expandTerm(bool dont, const string& term, list<string>& exp, 
              string& sterm);
    // After splitting entry on whitespace: process non-phrase element.
    // nostemexp is passed unchanged to expandTerm().
    void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
    // Process phrase/near element. splitData supplies the terms and the
    // per-term nostemexps flags, which are walked in parallel.
    void processPhraseOrNear(wsQData *splitData, 
                 list<Xapian::Query> &pqueries,
                 bool useNear, int slack);

...
    // No stemming if there are wildcards or prevented globally.
    if (haswild || m_stemlang.empty())
    nostemexp = true;

    if (!nostemexp) {












    }

    if (nostemexp && !haswild) {
    // Neither stemming nor wildcard expansion: just the word
    sterm = term;
...
    return;
    for (list<string>::iterator it = terms.begin(); it != terms.end(); it++)
    it->insert(0, prefix);
}

void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
                    list<Xapian::Query> &pqueries)
{
    list<string> exp;  
    string sterm; // dumb version of user term
    expandTerm(nostemexp, span, exp, sterm);
    m_terms.insert(m_terms.end(), exp.begin(), exp.end());
    addPrefix(exp, m_prefix);
    // Push either term or OR of stem-expanded set
    Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());

...
    list<Xapian::Query> orqueries;
    bool hadmultiple = false;
    vector<vector<string> >groups;

    // Go through the list and perform stem/wildcard expansion for each element
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
    for (vector<string>::iterator it = splitData->terms.begin();
     it != splitData->terms.end(); it++, nxit++) {
    // Adjust when we do stem expansion. Not inside phrases, and
    // some versions of xapian will accept only one OR clause
    // inside NEAR, all others must be leafs.
    bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) || hadmultiple;

    string sterm;
    list<string>exp;
    expandTerm(nostemexp, *it, exp, sterm);
    groups.push_back(vector<string>(exp.begin(), exp.end()));
...
    m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
}

/** 
 * Turn user entry string (NOT query language) into a list of xapian queries.
 * We just separate words and phrases, and do wildcard and stem expansion.
 *
 * This is used to process data entered into an OR/AND/NEAR/PHRASE field of
 * the GUI.
 *
 * The final list contains one query for each term or phrase
 *   - Elements corresponding to a stem-expanded part are an OP_OR
 *     composition of the stem-expanded terms (or a single term query).
 *   - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
 *     composition of the phrase terms (no stem expansion in this case)
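 * @return a bool. NOTE(review): this text previously promised the subquery
 *   count (either or'd stem-expanded terms or phrase word count), which
 *   does not match the bool return type; presumably a success flag, with
 *   ermsg describing failures -- confirm against the function body.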
 */
bool StringToXapianQ::processUserString(const string &iq,
                    string &ermsg,
                    list<Xapian::Query> &pqueries,
                    const StopList& stops,
                    int slack, 
                    bool useNear
                    )
{
    LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
    ermsg.erase();
    m_terms.clear();
    m_groups.clear();









    // Simple whitespace-split input into user-level words and
    // double-quoted phrases: word1 word2 "this is a phrase". 
    //
    // The text splitter may further still decide that the resulting
    // "words" are really phrases, this depends on separators:
    // [paul@dom.net] would still be a word (span), but [about:me]
    // will probably be handled as a phrase.
    list<string> phrases;
    TextSplit::stringToStrings(iq, phrases);

    // Process each element: textsplit into terms, handle stem/wildcard 
    // expansion and transform into an appropriate Xapian::Query
...
        LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
        switch (splitData->terms.size()) {
        case 0: 
        continue;// ??
        case 1: 
        processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
        break;
        default:
        processPhraseOrNear(splitData, pqueries, useNear, slack);
        }
    }

	a/src/rcldb/searchdata.cpp		b/src/rcldb/searchdata.cpp
	...		...
181	class wsQData : public TextSplitCB {	181	class wsQData : public TextSplitCB {
182	public:	182	public:
183	wsQData(const StopList &_stops)	183	wsQData(const StopList &_stops)
184	: stops(_stops), alltermcount(0)	184	: stops(_stops), alltermcount(0)
185	{}	185	{}
		186	bool takeword(const std::string &interm, int , int, int) {
		187	alltermcount++;
		188	LOGDEB1(("wsQData::takeword: %s\n", interm.c_str()));
		189
		190	// Check if the first letter is a majuscule in which
		191	// case we do not want to do stem expansion. Note that
		192	// the test is convoluted and possibly problematic
		193	string noacterm, noaclowterm;
		194	if (!unacmaybefold(interm, noacterm, "UTF-8", false)) {
		195	LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", interm.c_str()));
		196	return true;
		197	}
		198	if (!unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
		199	LOGINFO(("SearchData::splitter::takeword: unac failed for [%s]\n", noacterm.c_str()));
		200	return true;
		201	}
		202	bool nostemexp = false;
		203	Utf8Iter it1(noacterm);
		204	Utf8Iter it2(noaclowterm);
		205	if (it1 != it2)
		206	nostemexp = true;
		207
		208	if (stops.hasStops() && stops.isStop(noaclowterm)) {
		209	LOGDEB1(("wsQData::takeword [%s] in stop list\n", noaclowterm.c_str()));
		210	return true;
		211	}
		212	terms.push_back(noaclowterm);
		213	nostemexps.push_back(nostemexp);
		214	return true;
		215	}
		216
186	vector<string> terms;	217	vector<string> terms;
187	bool takeword(const std::string &term, int , int, int) {	218	vector<bool> nostemexps;
188	alltermcount++;
189	LOGDEB1(("wsQData::takeword: %s\n", term.c_str()));
190	if (stops.hasStops() && stops.isStop(term)) {
191	LOGDEB1(("wsQData::takeword [%s] in stop list\n", term.c_str()));
192	return true;
193	}
194	terms.push_back(term);
195	return true;
196	}
197	const StopList &stops;	219	const StopList &stops;
198	// Count of terms including stopwords: this is for adjusting	220	// Count of terms including stopwords: this is for adjusting
199	// phrase/near slack	221	// phrase/near slack
200	int alltermcount;	222	int alltermcount;
201	};	223	};
	...		...
230		252
231	private:	253	private:
232	void expandTerm(bool dont, const string& term, list<string>& exp,	254	void expandTerm(bool dont, const string& term, list<string>& exp,
233	string& sterm);	255	string& sterm);
234	// After splitting entry on whitespace: process non-phrase element	256	// After splitting entry on whitespace: process non-phrase element
235	void processSimpleSpan(const string& span, list<Xapian::Query> &pqueries);	257	void processSimpleSpan(const string& span, bool nostemexp, list<Xapian::Query> &pqueries);
236	// Process phrase/near element	258	// Process phrase/near element
237	void processPhraseOrNear(wsQData *splitData,	259	void processPhraseOrNear(wsQData *splitData,
238	list<Xapian::Query> &pqueries,	260	list<Xapian::Query> &pqueries,
239	bool useNear, int slack);	261	bool useNear, int slack);
240		262
	...		...
277	// No stemming if there are wildcards or prevented globally.	299	// No stemming if there are wildcards or prevented globally.
278	if (haswild \|\| m_stemlang.empty())	300	if (haswild \|\| m_stemlang.empty())
279	nostemexp = true;	301	nostemexp = true;
280		302
281	if (!nostemexp) {	303	if (!nostemexp) {
282	// Check if the first letter is a majuscule in which
283	// case we do not want to do stem expansion. Note that
284	// the test is convoluted and possibly problematic
285
286	string noacterm, noaclowterm;
287	if (unacmaybefold(term, noacterm, "UTF-8", false) &&
288	unacmaybefold(noacterm, noaclowterm, "UTF-8", true)) {
289	Utf8Iter it1(noacterm);
290	Utf8Iter it2(noaclowterm);
291	if (it1 != it2)
292	nostemexp = true;
293	}
294	}	304	}
295		305
296	if (nostemexp && !haswild) {	306	if (nostemexp && !haswild) {
297	// Neither stemming nor wildcard expansion: just the word	307	// Neither stemming nor wildcard expansion: just the word
298	sterm = term;	308	sterm = term;
	...		...
354	return;	364	return;
355	for (list<string>::iterator it = terms.begin(); it != terms.end(); it++)	365	for (list<string>::iterator it = terms.begin(); it != terms.end(); it++)
356	it->insert(0, prefix);	366	it->insert(0, prefix);
357	}	367	}
358		368
359	void StringToXapianQ::processSimpleSpan(const string& span,	369	void StringToXapianQ::processSimpleSpan(const string& span, bool nostemexp,
360	list<Xapian::Query> &pqueries)	370	list<Xapian::Query> &pqueries)
361	{	371	{
362	list<string> exp;	372	list<string> exp;
363	string sterm; // dumb version of user term	373	string sterm; // dumb version of user term
364	expandTerm(false, span, exp, sterm);	374	expandTerm(nostemexp, span, exp, sterm);
365	m_terms.insert(m_terms.end(), exp.begin(), exp.end());	375	m_terms.insert(m_terms.end(), exp.begin(), exp.end());
366	addPrefix(exp, m_prefix);	376	addPrefix(exp, m_prefix);
367	// Push either term or OR of stem-expanded set	377	// Push either term or OR of stem-expanded set
368	Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());	378	Xapian::Query xq(Xapian::Query::OP_OR, exp.begin(), exp.end());
369		379
	...		...
394	list<Xapian::Query> orqueries;	404	list<Xapian::Query> orqueries;
395	bool hadmultiple = false;	405	bool hadmultiple = false;
396	vector<vector<string> >groups;	406	vector<vector<string> >groups;
397		407
398	// Go through the list and perform stem/wildcard expansion for each element	408	// Go through the list and perform stem/wildcard expansion for each element
		409	vector<bool>::iterator nxit = splitData->nostemexps.begin();
399	for (vector<string>::iterator it = splitData->terms.begin();	410	for (vector<string>::iterator it = splitData->terms.begin();
400	it != splitData->terms.end(); it++) {	411	it != splitData->terms.end(); it++, nxit++) {
401	// Adjust when we do stem expansion. Not inside phrases, and	412	// Adjust when we do stem expansion. Not inside phrases, and
402	// some versions of xapian will accept only one OR clause	413	// some versions of xapian will accept only one OR clause
403	// inside NEAR, all others must be leafs.	414	// inside NEAR, all others must be leafs.
404	bool nostemexp = (op == Xapian::Query::OP_PHRASE) \|\| hadmultiple;	415	bool nostemexp = *nxit \|\| (op == Xapian::Query::OP_PHRASE) \|\| hadmultiple;
405		416
406	string sterm;	417	string sterm;
407	list<string>exp;	418	list<string>exp;
408	expandTerm(nostemexp, *it, exp, sterm);	419	expandTerm(nostemexp, *it, exp, sterm);
409	groups.push_back(vector<string>(exp.begin(), exp.end()));	420	groups.push_back(vector<string>(exp.begin(), exp.end()));
	...		...
432	m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());	443	m_groups.insert(m_groups.end(), allcombs.begin(), allcombs.end());
433	}	444	}
434		445
435	/**	446	/**
436	* Turn user entry string (NOT query language) into a list of xapian queries.	447	* Turn user entry string (NOT query language) into a list of xapian queries.
437	* We just separate words and phrases, and do wildcard and stemp expansion,	448	* We just separate words and phrases, and do wildcard and stem expansion,
		449	*
		450	* This is used to process data entered into an OR/AND/NEAR/PHRASE field of
		451	* the GUI.
438	*	452	*
439	* The final list contains one query for each term or phrase	453	* The final list contains one query for each term or phrase
440	* - Elements corresponding to a stem-expanded part are an OP_OR	454	* - Elements corresponding to a stem-expanded part are an OP_OR
441	* composition of the stem-expanded terms (or a single term query).	455	* composition of the stem-expanded terms (or a single term query).
442	* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR	456	* - Elements corresponding to phrase/near are an OP_PHRASE/NEAR
443	* composition of the phrase terms (no stem expansion in this case)	457	* composition of the phrase terms (no stem expansion in this case)
444	* @return the subquery count (either or'd stem-expanded terms or phrase word	458	* @return the subquery count (either or'd stem-expanded terms or phrase word
445	* count)	459	* count)
446	*/	460	*/
447	bool StringToXapianQ::processUserString(const string &_iq,	461	bool StringToXapianQ::processUserString(const string &iq,
448	string &ermsg,	462	string &ermsg,
449	list<Xapian::Query> &pqueries,	463	list<Xapian::Query> &pqueries,
450	const StopList& stops,	464	const StopList& stops,
451	int slack,	465	int slack,
452	bool useNear	466	bool useNear
453	)	467	)
454	{	468	{
455	LOGDEB(("StringToXapianQ:: query string: [%s]\n", _iq.c_str()));	469	LOGDEB(("StringToXapianQ:: query string: [%s]\n", iq.c_str()));
456	ermsg.erase();	470	ermsg.erase();
457	m_terms.clear();	471	m_terms.clear();
458	m_groups.clear();	472	m_groups.clear();
459		473
460	// First unaccent/normalize the input: do it first so that it
461	// happens in the same order as when indexing: unac then split. As
462	// the character count can change during normalisation, this is
463	// specially important for cjk because the artificial cjk split is
464	// based on character counts
465	string iq;
466	dumb_string(_iq, iq);
467
468	// Simple whitespace-split input into user-level words and	474	// Simple whitespace-split input into user-level words and
469	// double-quoted phrases: word1 word2 "this is a phrase". The text	475	// double-quoted phrases: word1 word2 "this is a phrase".
		476	//
470	// splitter may further still decide that the resulting "words"	477	// The text splitter may further still decide that the resulting
471	// are really phrases, this depends on separators: [paul@dom.net]	478	// "words" are really phrases, this depends on separators:
472	// would still be a word (span), but [about:me] will probably be	479	// [paul@dom.net] would still be a word (span), but [about:me]
473	// handled as a phrase.	480	// will probably be handled as a phrase.
474	list<string> phrases;	481	list<string> phrases;
475	TextSplit::stringToStrings(iq, phrases);	482	TextSplit::stringToStrings(iq, phrases);
476		483
477	// Process each element: textsplit into terms, handle stem/wildcard	484	// Process each element: textsplit into terms, handle stem/wildcard
478	// expansion and transform into an appropriate Xapian::Query	485	// expansion and transform into an appropriate Xapian::Query
	...		...
514	LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));	521	LOGDEB0(("strToXapianQ: termcount: %d\n", splitData->terms.size()));
515	switch (splitData->terms.size()) {	522	switch (splitData->terms.size()) {
516	case 0:	523	case 0:
517	continue;// ??	524	continue;// ??
518	case 1:	525	case 1:
519	processSimpleSpan(splitData->terms.front(), pqueries);	526	processSimpleSpan(splitData->terms.front(), splitData->nostemexps.front(), pqueries);
520	break;	527	break;
521	default:	528	default:
522	processPhraseOrNear(splitData, pqueries, useNear, slack);	529	processPhraseOrNear(splitData, pqueries, useNear, slack);
523	}	530	}
524	}	531	}