recoll / Code / Diff of /src/rcldb/searchdatatox.cpp

Diff of /src/rcldb/searchdatatox.cpp [7876fb] .. [b53686]

Switch to unified view


...

    *((Xapian::Query *)d) = xq;
    return true;
}

// Query-side text splitter, used to cut a user string into simple
// terms and phrases. It handles user entry elements which look like a
// single word because they contain no white space, but which rcldb
// actually sees as several terms (e.g. term1,term2). In most cases
// the split still yields a single term.
class TextSplitQ : public TextSplitP {
public:
    TextSplitQ(Flags flags, TermProc *prc)
        : TextSplitP(prc, flags), m_nostemexp(false)
    {
    }

    // Note the capitalization of the incoming word, then let the base
    // class process it.
    bool takeword(const std::string &term, int pos, int bs, int be)
    {
        // A term starting with a capital letter must not be stem
        // expanded. The test has to run on the original term, before
        // unac processing.
        m_nostemexp = unaciscapital(term);
        return TextSplitP::takeword(term, pos, bs, be);
    }

    // Capitalization state recorded for the last word seen by takeword()
    bool nostemexp() const { return m_nostemexp; }

private:
    bool m_nostemexp;
};

class TermProcQ : public TermProc {
public:
    TermProcQ() : TermProc(0), m_alltermcount(0), m_lastpos(0), m_ts(0) {}

    // We need a ref to the splitter (only it knows about orig term
    // capitalization for controlling stemming. The ref can't be set
    // in the constructor because the splitter is not built yet when
    // we are born (chicken and egg).
    void setTSQ(const TextSplitQ *ts) {
        m_ts = ts;
    }
    
    bool takeword(const std::string &term, int pos, int bs, int be) {

    m_alltermcount++;
    if (m_lastpos < pos)
        m_lastpos = pos;
    bool noexpand = be ? m_ts->nostemexp() : true;
    LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", 
         term.c_str(), pos, noexpand));
    if (m_terms[pos].size() < term.size()) {
        m_terms[pos] = term;
        m_nste[pos] = noexpand;
    }
    return true;
    }

    bool flush() {

    for (map<int, string>::const_iterator it = m_terms.begin();
         it != m_terms.end(); it++) {
        m_vterms.push_back(it->second);
        m_vnostemexps.push_back(m_nste[it->first]);
    }
    return true;
    }

    int alltermcount() const {
        return m_alltermcount;
    }
    int lastpos() const {
        return m_lastpos;
    }
    const vector<string>& terms() {
        return m_vterms;
    }
    const vector<bool>& nostemexps() {
        return m_vnostemexps;
    }
private:
    // Count of terms including stopwords: this is for adjusting
    // phrase/near slack
    int m_alltermcount; 
    int m_lastpos;
    const TextSplitQ *m_ts;
    vector<string> m_vterms;
    vector<bool>   m_vnostemexps;
    map<int, string> m_terms;
    map<int, bool> m_nste;
};


...
// User entry element had several terms: transform into a PHRASE or
// NEAR xapian query, the elements of which can themselves be OR
// queries if the terms get expanded by stemming or wildcards (we
// don't do stemming for PHRASE though)
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
                         TermProcQ *splitData, 
                         int mods, void *pq,
                         bool useNear, int slack)
{
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
...
    orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
    slack++;
    }

    // Go through the list and perform stem/wildcard expansion for each element
    vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
    for (vector<string>::const_iterator it = splitData->terms().begin();
     it != splitData->terms().end(); it++, nxit++) {
    LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
    // Adjust when we do stem expansion. Not if disabled by
    // caller, not inside phrases, and some versions of xapian
    // will accept only one OR clause inside NEAR.
    bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
...
    }

    // Generate an appropriate PHRASE/NEAR query with adjusted slack
    // For phrases, give a relevance boost like we do for original terms
    LOGDEB2(("PHRASE/NEAR:  alltermcount %d lastpos %d\n", 
             splitData->alltermcount(), splitData->lastpos()));
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
             splitData->lastpos() + 1 + slack);
    if (op == Xapian::Query::OP_PHRASE)
    xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, 
               original_term_wqf_booster);
    pqueries.push_back(xq);

...
        // performance, but will succeed.
        // We now adjust the phrase/near slack by comparing the term count
        // and the last position

        // The term processing pipeline:
            // split -> [unac/case ->] stops -> store terms
        TermProcQ tpq;
        TermProc *nxt = &tpq;
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
            //tpcommon.onlygrams(true);
...
        if (o_index_stripchars)
        nxt = &tpprep;

        TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
                         TextSplit::TXTS_KEEPWILD), 
              nxt);
        tpq.setTSQ(&splitter);
        splitter.text_to_words(*it);

        slack += tpq.lastpos() - tpq.terms().size() + 1;

        LOGDEB0(("strToXapianQ: termcount: %d\n", tpq.terms().size()));
        switch (tpq.terms().size() + terminc) {
        case 0: 
        continue;// ??
        case 1: {
        int lmods = mods;
        if (tpq.nostemexps().front())
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
        m_hldata.ugroups.push_back(tpq.terms());
        processSimpleSpan(db, ermsg, tpq.terms().front(),
                  lmods, &pqueries);
        }
        break;
        default:
        m_hldata.ugroups.push_back(tpq.terms());
        processPhraseOrNear(db, ermsg, &tpq, mods, &pqueries,
                    useNear, slack);
        }
        if (m_curcl >= getMaxCl()) {
        ermsg = maxXapClauseMsg;
        if (!o_index_stripchars)
...
    switch (m_tp) {
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
    case SCLT_OR: op = Xapian::Query::OP_OR; break;
    default:
    LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
        m_reason = "Internal error";
    return false;
    }

    vector<Xapian::Query> pqueries;
    if (!processUserString(db, m_text, m_reason, &pqueries))
    return false;
    if (pqueries.empty()) {
    LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
        m_reason = string("Resolved to null query. Term too long ? : [" + 
                          m_text + string("]"));
    return false;
    }

    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
    if  (m_weight != 1.0) {
    *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
...
    bool useNear = (m_tp == SCLT_NEAR);
    if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
    return false;
    if (pqueries.empty()) {
    LOGERR(("SearchDataClauseDist: resolved to null query\n"));
        m_reason = string("Resolved to null query. Term too long ? : [" + 
                          m_text + string("]"));
    return false;
    }

    *qp = *pqueries.begin();
    if (m_weight != 1.0) {
    *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);

	a/src/rcldb/searchdatatox.cpp		b/src/rcldb/searchdatatox.cpp
	...		...
286		286
287	((Xapian::Query )d) = xq;	287	((Xapian::Query )d) = xq;
288	return true;	288	return true;
289	}	289	}
290		290
291	// Splitter callback for breaking a user string into simple terms and	291	// Splitter for breaking a user string into simple terms and
292	// phrases. This is for parts of the user entry which would appear as	292	// phrases. This is for parts of the user entry which would appear as
293	// a single word because there is no white space inside, but are	293	// a single word because there is no white space inside, but are
294	// actually multiple terms to rcldb (ie term1,term2)	294	// actually multiple terms to rcldb (ie term1,term2). Still, most of
		295	// the time, the result of our splitting will be a single term.
295	class TextSplitQ : public TextSplitP {	296	class TextSplitQ : public TextSplitP {
296	public:	297	public:
297	TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)	298	TextSplitQ(Flags flags, TermProc *prc)
298	: TextSplitP(prc, flags),	299	: TextSplitP(prc, flags), m_nostemexp(false) {
299	curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
300	{}	300	}
301		301
302	bool takeword(const std::string &term, int pos, int bs, int be)	302	bool takeword(const std::string &term, int pos, int bs, int be) {
303	{
304	// Check if the first letter is a majuscule in which	303	// Check if the first letter is a majuscule in which
305	// case we do not want to do stem expansion. Need to do this	304	// case we do not want to do stem expansion. Need to do this
306	// before unac of course...	305	// before unac of course...
307	curnostemexp = unaciscapital(term);	306	m_nostemexp = unaciscapital(term);
308		307
309	return TextSplitP::takeword(term, pos, bs, be);	308	return TextSplitP::takeword(term, pos, bs, be);
310	}	309	}
311		310
312	bool curnostemexp;	311	bool nostemexp() const {
313	vector<string> terms;	312	return m_nostemexp;
314	vector<bool> nostemexps;	313	}
315	const StopList &stops;	314	private:
316	// Count of terms including stopwords: this is for adjusting	315	bool m_nostemexp;
317	// phrase/near slack
318	int alltermcount;
319	int lastpos;
320	};	316	};
321		317
322	class TermProcQ : public TermProc {	318	class TermProcQ : public TermProc {
323	public:	319	public:
324	TermProcQ() : TermProc(0), m_ts(0) {}	320	TermProcQ() : TermProc(0), m_alltermcount(0), m_lastpos(0), m_ts(0) {}
		321
		322	// We need a ref to the splitter (only it knows about orig term
		323	// capitalization for controlling stemming. The ref can't be set
		324	// in the constructor because the splitter is not built yet when
		325	// we are born (chicken and egg).
325	void setTSQ(TextSplitQ *ts) {m_ts = ts;}	326	void setTSQ(const TextSplitQ *ts) {
		327	m_ts = ts;
		328	}
326		329
327	bool takeword(const std::string &term, int pos, int bs, int be)	330	bool takeword(const std::string &term, int pos, int bs, int be) {
328	{
329	m_ts->alltermcount++;	331	m_alltermcount++;
330	if (m_ts->lastpos < pos)	332	if (m_lastpos < pos)
331	m_ts->lastpos = pos;	333	m_lastpos = pos;
332	bool noexpand = be ? m_ts->curnostemexp : true;	334	bool noexpand = be ? m_ts->nostemexp() : true;
333	LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",	335	LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
334	term.c_str(), pos, noexpand));	336	term.c_str(), pos, noexpand));
335	if (m_terms[pos].size() < term.size()) {	337	if (m_terms[pos].size() < term.size()) {
336	m_terms[pos] = term;	338	m_terms[pos] = term;
337	m_nste[pos] = noexpand;	339	m_nste[pos] = noexpand;
338	}	340	}
339	return true;	341	return true;
340	}	342	}
		343
341	bool flush()	344	bool flush() {
342	{
343	for (map<int, string>::const_iterator it = m_terms.begin();	345	for (map<int, string>::const_iterator it = m_terms.begin();
344	it != m_terms.end(); it++) {	346	it != m_terms.end(); it++) {
345	m_ts->terms.push_back(it->second);	347	m_vterms.push_back(it->second);
346	m_ts->nostemexps.push_back(m_nste[it->first]);	348	m_vnostemexps.push_back(m_nste[it->first]);
347	}	349	}
348	return true;	350	return true;
349	}	351	}
		352
		353	int alltermcount() const {
		354	return m_alltermcount;
		355	}
		356	int lastpos() const {
		357	return m_lastpos;
		358	}
		359	const vector<string>& terms() {
		360	return m_vterms;
		361	}
		362	const vector<bool>& nostemexps() {
		363	return m_vnostemexps;
		364	}
350	private:	365	private:
		366	// Count of terms including stopwords: this is for adjusting
		367	// phrase/near slack
		368	int m_alltermcount;
		369	int m_lastpos;
351	TextSplitQ *m_ts;	370	const TextSplitQ *m_ts;
		371	vector<string> m_vterms;
		372	vector<bool> m_vnostemexps;
352	map<int, string> m_terms;	373	map<int, string> m_terms;
353	map<int, bool> m_nste;	374	map<int, bool> m_nste;
354	};	375	};
355		376
356		377
	...		...
586	// User entry element had several terms: transform into a PHRASE or	607	// User entry element had several terms: transform into a PHRASE or
587	// NEAR xapian query, the elements of which can themselves be OR	608	// NEAR xapian query, the elements of which can themselves be OR
588	// queries if the terms get expanded by stemming or wildcards (we	609	// queries if the terms get expanded by stemming or wildcards (we
589	// don't do stemming for PHRASE though)	610	// don't do stemming for PHRASE though)
590	void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,	611	void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
591	TextSplitQ *splitData,	612	TermProcQ *splitData,
592	int mods, void *pq,	613	int mods, void *pq,
593	bool useNear, int slack)	614	bool useNear, int slack)
594	{	615	{
595	vector<Xapian::Query> &pqueries((vector<Xapian::Query>)pq);	616	vector<Xapian::Query> &pqueries((vector<Xapian::Query>)pq);
596	Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :	617	Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
	...		...
611	orqueries.push_back(Xapian::Query(prefix + start_of_field_term));	632	orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
612	slack++;	633	slack++;
613	}	634	}
614		635
615	// Go through the list and perform stem/wildcard expansion for each element	636	// Go through the list and perform stem/wildcard expansion for each element
616	vector<bool>::iterator nxit = splitData->nostemexps.begin();	637	vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
617	for (vector<string>::iterator it = splitData->terms.begin();	638	for (vector<string>::const_iterator it = splitData->terms().begin();
618	it != splitData->terms.end(); it++, nxit++) {	639	it != splitData->terms().end(); it++, nxit++) {
619	LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));	640	LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
620	// Adjust when we do stem expansion. Not if disabled by	641	// Adjust when we do stem expansion. Not if disabled by
621	// caller, not inside phrases, and some versions of xapian	642	// caller, not inside phrases, and some versions of xapian
622	// will accept only one OR clause inside NEAR.	643	// will accept only one OR clause inside NEAR.
623	bool nostemexp = *nxit \|\| (op == Xapian::Query::OP_PHRASE)	644	bool nostemexp = *nxit \|\| (op == Xapian::Query::OP_PHRASE)
	...		...
658	}	679	}
659		680
660	// Generate an appropriate PHRASE/NEAR query with adjusted slack	681	// Generate an appropriate PHRASE/NEAR query with adjusted slack
661	// For phrases, give a relevance boost like we do for original terms	682	// For phrases, give a relevance boost like we do for original terms
662	LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",	683	LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
663	splitData->alltermcount, splitData->lastpos));	684	splitData->alltermcount(), splitData->lastpos()));
664	Xapian::Query xq(op, orqueries.begin(), orqueries.end(),	685	Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
665	splitData->lastpos + 1 + slack);	686	splitData->lastpos() + 1 + slack);
666	if (op == Xapian::Query::OP_PHRASE)	687	if (op == Xapian::Query::OP_PHRASE)
667	xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,	688	xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
668	original_term_wqf_booster);	689	original_term_wqf_booster);
669	pqueries.push_back(xq);	690	pqueries.push_back(xq);
670		691
	...		...
770	// performance, but will succeed.	791	// performance, but will succeed.
771	// We now adjust the phrase/near slack by comparing the term count	792	// We now adjust the phrase/near slack by comparing the term count
772	// and the last position	793	// and the last position
773		794
774	// The term processing pipeline:	795	// The term processing pipeline:
		796	// split -> [unac/case ->] stops -> store terms
775	TermProcQ tpq;	797	TermProcQ tpq;
776	TermProc *nxt = &tpq;	798	TermProc *nxt = &tpq;
777	TermProcStop tpstop(nxt, stops); nxt = &tpstop;	799	TermProcStop tpstop(nxt, stops); nxt = &tpstop;
778	//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;	800	//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
779	//tpcommon.onlygrams(true);	801	//tpcommon.onlygrams(true);
	...		...
781	if (o_index_stripchars)	803	if (o_index_stripchars)
782	nxt = &tpprep;	804	nxt = &tpprep;
783		805
784	TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS \|	806	TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS \|
785	TextSplit::TXTS_KEEPWILD),	807	TextSplit::TXTS_KEEPWILD),
786	stops, nxt);	808	nxt);
787	tpq.setTSQ(&splitter);	809	tpq.setTSQ(&splitter);
788	splitter.text_to_words(*it);	810	splitter.text_to_words(*it);
789		811
790	slack += splitter.lastpos - splitter.terms.size() + 1;	812	slack += tpq.lastpos() - tpq.terms().size() + 1;
791		813
792	LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));	814	LOGDEB0(("strToXapianQ: termcount: %d\n", tpq.terms().size()));
793	switch (splitter.terms.size() + terminc) {	815	switch (tpq.terms().size() + terminc) {
794	case 0:	816	case 0:
795	continue;// ??	817	continue;// ??
796	case 1: {	818	case 1: {
797	int lmods = mods;	819	int lmods = mods;
798	if (splitter.nostemexps.front())	820	if (tpq.nostemexps().front())
799	lmods \|= SearchDataClause::SDCM_NOSTEMMING;	821	lmods \|= SearchDataClause::SDCM_NOSTEMMING;
800	m_hldata.ugroups.push_back(splitter.terms);	822	m_hldata.ugroups.push_back(tpq.terms());
801	processSimpleSpan(db, ermsg, splitter.terms.front(),	823	processSimpleSpan(db, ermsg, tpq.terms().front(),
802	lmods, &pqueries);	824	lmods, &pqueries);
803	}	825	}
804	break;	826	break;
805	default:	827	default:
806	m_hldata.ugroups.push_back(splitter.terms);	828	m_hldata.ugroups.push_back(tpq.terms());
807	processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,	829	processPhraseOrNear(db, ermsg, &tpq, mods, &pqueries,
808	useNear, slack);	830	useNear, slack);
809	}	831	}
810	if (m_curcl >= getMaxCl()) {	832	if (m_curcl >= getMaxCl()) {
811	ermsg = maxXapClauseMsg;	833	ermsg = maxXapClauseMsg;
812	if (!o_index_stripchars)	834	if (!o_index_stripchars)
	...		...
844	switch (m_tp) {	866	switch (m_tp) {
845	case SCLT_AND: op = Xapian::Query::OP_AND; break;	867	case SCLT_AND: op = Xapian::Query::OP_AND; break;
846	case SCLT_OR: op = Xapian::Query::OP_OR; break;	868	case SCLT_OR: op = Xapian::Query::OP_OR; break;
847	default:	869	default:
848	LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));	870	LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
		871	m_reason = "Internal error";
849	return false;	872	return false;
850	}	873	}
851		874
852	vector<Xapian::Query> pqueries;	875	vector<Xapian::Query> pqueries;
853	if (!processUserString(db, m_text, m_reason, &pqueries))	876	if (!processUserString(db, m_text, m_reason, &pqueries))
854	return false;	877	return false;
855	if (pqueries.empty()) {	878	if (pqueries.empty()) {
856	LOGERR(("SearchDataClauseSimple: resolved to null query\n"));	879	LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
		880	m_reason = string("Resolved to null query. Term too long ? : [" +
		881	m_text + string("]"));
857	return true;	882	return false;
858	}	883	}
859		884
860	*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());	885	*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
861	if (m_weight != 1.0) {	886	if (m_weight != 1.0) {
862	qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, qp, m_weight);	887	qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, qp, m_weight);
	...		...
968	bool useNear = (m_tp == SCLT_NEAR);	993	bool useNear = (m_tp == SCLT_NEAR);
969	if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))	994	if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
970	return false;	995	return false;
971	if (pqueries.empty()) {	996	if (pqueries.empty()) {
972	LOGERR(("SearchDataClauseDist: resolved to null query\n"));	997	LOGERR(("SearchDataClauseDist: resolved to null query\n"));
		998	m_reason = string("Resolved to null query. Term too long ? : [" +
		999	m_text + string("]"));
973	return true;	1000	return false;
974	}	1001	}
975		1002
976	qp = pqueries.begin();	1003	qp = pqueries.begin();
977	if (m_weight != 1.0) {	1004	if (m_weight != 1.0) {
978	qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, qp, m_weight);	1005	qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, qp, m_weight);