recoll / Code / Diff of /src/rcldb/searchdata.cpp

Diff of /src/rcldb/searchdata.cpp [f56c94] .. [844f4f]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.18 2007-09-20 08:43:12 dockes Exp $ (C) 2006 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
    try {
    for (list<string>::iterator it = phrases.begin(); 
         it != phrases.end(); it++) {
        LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));

      // If there are multiple spans in this element, including
      // at least one composite, we need to use a word split,
      // else a phrase query including a span would fail. 
      // (other possible solution: adjust slack to account for the
      //  additional position increase?)
      // Ex: "term0@term01 term1" is onlyspans-split as:
      //   0 term0@term01            0   12
      //   2 term1                  13   18
      // The position of term1 is 2, not 1, so the phrase search would
      // fail. We search for "term0 term01 term1" instead, which may 
      // have worse performance, but will succeed.
        wsQData splitDataS(stops), splitDataW(stops);
        TextSplit splitterS(&splitDataS, 
                TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
                      TextSplit::TXTS_KEEPWILD));
        splitterS.text_to_words(*it);
        TextSplit splitterW(&splitDataW, 
                TextSplit::Flags(TextSplit::TXTS_NOSPANS | 
                      TextSplit::TXTS_KEEPWILD));
        splitterW.text_to_words(*it);
        wsQData *splitData = &splitDataS;
        if (splitDataS.terms.size() > 1 && 
        splitDataS.terms.size() != splitDataW.terms.size())
        splitData = &splitDataW;
...
        LOGDEB(("strToXapianQ: splitter term count: %d\n", 
             splitData->terms.size()));
        switch (splitData->terms.size()) {
        case 0: continue;// ??
        case 1: 
      // Just a term. Still may be expanded (by stem or
      // wildcard) to an OR list.
        {
            string term = splitData->terms.front();
            list<string> exp;  
          string sterm; // dumb version of user term
            stripExpandTerm(false, term, exp, sterm);
            m_terms.insert(m_terms.end(), exp.begin(), exp.end());
            // Push either term or OR of stem-expanded set
            addPrefix(exp, prefix);
            Xapian::Query xq(Xapian::Query::OP_OR, 
...
            pqueries.push_back(xq);
        }
        break;

        default:
      // Element had several terms: transform into a PHRASE
      // or NEAR xapian query, the elements of which can
      // themselves be OR queries if the terms get expanded
      // by stemming or wildcards (we don't do stemming for
      // PHRASE though)
        Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
        Xapian::Query::OP_PHRASE;
        list<Xapian::Query> orqueries;
        bool hadmultiple = false;
        vector<vector<string> >groups;

	a/src/rcldb/searchdata.cpp		b/src/rcldb/searchdata.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.17 2007-06-22 06:14:04 dockes Exp $ (C) 2006 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: searchdata.cpp,v 1.18 2007-09-20 08:43:12 dockes Exp $ (C) 2006 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
364	try {	364	try {
365	for (list<string>::iterator it = phrases.begin();	365	for (list<string>::iterator it = phrases.begin();
366	it != phrases.end(); it++) {	366	it != phrases.end(); it++) {
367	LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));	367	LOGDEB(("strToXapianQ: phrase or word: [%s]\n", it->c_str()));
368		368
369	// If there are both spans and single words in this element,	369	// If there are multiple spans in this element, including
370	// we need to use a word split, else a phrase query including	370	// at least one composite, we need to use a word split,
371	// a span would fail if we didn't adjust the proximity to	371	// else a phrase query including a span would fail.
372	// account for the additional span term which is complicated.	372	// (other possible solution: adjust slack to account for the
		373	// additional position increase?)
		374	// Ex: "term0@term01 term1" is onlyspans-split as:
		375	// 0 term0@term01 0 12
		376	// 2 term1 13 18
		377	// The position of term1 is 2, not 1, so the phrase search would
		378	// fail. We search for "term0 term01 term1" instead, which may
		379	// have worse performance, but will succeed.
373	wsQData splitDataS(stops), splitDataW(stops);	380	wsQData splitDataS(stops), splitDataW(stops);
374	TextSplit splitterS(&splitDataS, (TextSplit::Flags)	381	TextSplit splitterS(&splitDataS,
375	(TextSplit::TXTS_ONLYSPANS \|	382	TextSplit::Flags(TextSplit::TXTS_ONLYSPANS \|
376	TextSplit::TXTS_KEEPWILD));	383	TextSplit::TXTS_KEEPWILD));
377	splitterS.text_to_words(*it);	384	splitterS.text_to_words(*it);
378	TextSplit splitterW(&splitDataW, (TextSplit::Flags)	385	TextSplit splitterW(&splitDataW,
379	(TextSplit::TXTS_NOSPANS \|	386	TextSplit::Flags(TextSplit::TXTS_NOSPANS \|
380	TextSplit::TXTS_KEEPWILD));	387	TextSplit::TXTS_KEEPWILD));
381	splitterW.text_to_words(*it);	388	splitterW.text_to_words(*it);
382	wsQData *splitData = &splitDataS;	389	wsQData *splitData = &splitDataS;
383	if (splitDataS.terms.size() > 1 &&	390	if (splitDataS.terms.size() > 1 &&
384	splitDataS.terms.size() != splitDataW.terms.size())	391	splitDataS.terms.size() != splitDataW.terms.size())
385	splitData = &splitDataW;	392	splitData = &splitDataW;
	...		...
387	LOGDEB(("strToXapianQ: splitter term count: %d\n",	394	LOGDEB(("strToXapianQ: splitter term count: %d\n",
388	splitData->terms.size()));	395	splitData->terms.size()));
389	switch (splitData->terms.size()) {	396	switch (splitData->terms.size()) {
390	case 0: continue;// ??	397	case 0: continue;// ??
391	case 1:	398	case 1:
392	// Not a real phrase: one term. Still may be expanded	399	// Just a term. Still may be expanded (by stem or
393	// (stem or wildcard)	400	// wildcard) to an OR list.
394	{	401	{
395	string term = splitData->terms.front();	402	string term = splitData->terms.front();
396	list<string> exp;	403	list<string> exp;
397	string sterm;	404	string sterm; // dumb version of user term
398	stripExpandTerm(false, term, exp, sterm);	405	stripExpandTerm(false, term, exp, sterm);
399	m_terms.insert(m_terms.end(), exp.begin(), exp.end());	406	m_terms.insert(m_terms.end(), exp.begin(), exp.end());
400	// Push either term or OR of stem-expanded set	407	// Push either term or OR of stem-expanded set
401	addPrefix(exp, prefix);	408	addPrefix(exp, prefix);
402	Xapian::Query xq(Xapian::Query::OP_OR,	409	Xapian::Query xq(Xapian::Query::OP_OR,
	...		...
415	pqueries.push_back(xq);	422	pqueries.push_back(xq);
416	}	423	}
417	break;	424	break;
418		425
419	default:	426	default:
420	// Phrase/near: transform into a PHRASE or NEAR xapian	427	// Element had several terms: transform into a PHRASE
421	// query, the element of which can themselves be OR	428	// or NEAR xapian query, the elements of which can
422	// queries if the terms get expanded by stemming or	429	// themselves be OR queries if the terms get expanded
423	// wildcards (we don't do stemming for PHRASE though)	430	// by stemming or wildcards (we don't do stemming for
		431	// PHRASE though)
424	Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :	432	Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
425	Xapian::Query::OP_PHRASE;	433	Xapian::Query::OP_PHRASE;
426	list<Xapian::Query> orqueries;	434	list<Xapian::Query> orqueries;
427	bool hadmultiple = false;	435	bool hadmultiple = false;
428	vector<vector<string> >groups;	436	vector<vector<string> >groups;