recoll / Code / Diff of /src/rcldb/searchdata.cpp

Diff of /src/rcldb/searchdata.cpp [f8280c] .. [427293]

Switch to unified view


...
    }

    bool haswild = term.find_first_of(cstr_minwilds) != string::npos;

    // If there are no wildcards, add term to the list of user-entered terms
    if (!haswild) {
    m_hldata.uterms.insert(term);
        sterm = term;
    }
    // No stem expansion if there are wildcards or if prevented by caller
    bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
    if (haswild || getStemLang().empty()) {
    LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
    nostemexp = true;
    }

    // noexpansion can be modified further down by possible case/diac expansion
    bool noexpansion = nostemexp && !haswild; 

    int termmatchsens = 0;

#ifndef RCL_INDEX_STRIPCHARS
    bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
    bool case_sensitive = (mods & SDCM_CASESENS) != 0;

    if (o_index_stripchars) {
    diac_sensitive = case_sensitive = false;
    } else {
    // If we are working with a raw index, apply the rules for case and 
...
    }

    if (!case_sensitive || !diac_sensitive)
        noexpansion = false;
    }

    if (case_sensitive)
  termmatchsens |= Db::ET_CASESENS;
    if (diac_sensitive)
  termmatchsens |= Db::ET_DIACSENS;
#endif

    if (noexpansion) {

    oexp.push_back(prefix + term);
    m_hldata.terms[term] = m_hldata.uterms.size() - 1;
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return true;
    } 

    Db::MatchType mtyp = haswild ? Db::ET_WILD : 
  nostemexp ? Db::ET_NONE : Db::ET_STEM;





    TermMatchResult res;
    if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
            m_field)) {
  // Let it go through















    }

























































































    // Term match entries to vector of terms

    if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
    ermsg = "Maximum term expansion size exceeded."
        " Maybe increase maxTermExpand.";
    return false;
    }
    for (vector<TermMatchEntry>::const_iterator it = res.entries.begin(); 
     it != res.entries.end(); it++) {
    oexp.push_back(it->term);
    }
    // If the term does not exist at all in the db, the return from
    // termMatch() is going to be empty, which is not what we want (we
    // would then compute an empty Xapian query)
    if (oexp.empty())
    oexp.push_back(prefix + term);

    // Remember the uterm-to-expansion links

	a/src/rcldb/searchdata.cpp		b/src/rcldb/searchdata.cpp
	...		...
542	}	542	}
543		543
544	bool haswild = term.find_first_of(cstr_minwilds) != string::npos;	544	bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
545		545
546	// If there are no wildcards, add term to the list of user-entered terms	546	// If there are no wildcards, add term to the list of user-entered terms
547	if (!haswild)	547	if (!haswild) {
548	m_hldata.uterms.insert(term);	548	m_hldata.uterms.insert(term);
549		549	sterm = term;
		550	}
550	// No stem expansion if there are wildcards or if prevented by caller	551	// No stem expansion if there are wildcards or if prevented by caller
551	bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;	552	bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
552	if (haswild || getStemLang().empty()) {	553	if (haswild || getStemLang().empty()) {
553	LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));	554	LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
554	nostemexp = true;	555	nostemexp = true;
555	}	556	}
556		557
557	// noexpansion can be modified further down by possible case/diac expansion	558	// noexpansion can be modified further down by possible case/diac expansion
558	bool noexpansion = nostemexp && !haswild;	559	bool noexpansion = nostemexp && !haswild;
559		560
		561	int termmatchsens = 0;
		562
560	#ifndef RCL_INDEX_STRIPCHARS	563	#ifndef RCL_INDEX_STRIPCHARS
561	bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;	564	bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
562	bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;	565	bool case_sensitive = (mods & SDCM_CASESENS) != 0;
563		566
564	if (o_index_stripchars) {	567	if (o_index_stripchars) {
565	diac_sensitive = case_sensitive = false;	568	diac_sensitive = case_sensitive = false;
566	} else {	569	} else {
567	// If we are working with a raw index, apply the rules for case and	570	// If we are working with a raw index, apply the rules for case and
	...		...
594	}	597	}
595		598
596	if (!case_sensitive || !diac_sensitive)	599	if (!case_sensitive || !diac_sensitive)
597	noexpansion = false;	600	noexpansion = false;
598	}	601	}
		602
		603	if (case_sensitive)
		604	termmatchsens |= Db::ET_CASESENS;
		605	if (diac_sensitive)
		606	termmatchsens |= Db::ET_DIACSENS;
599	#endif	607	#endif
600		608
601	if (noexpansion) {	609	if (noexpansion) {
602	sterm = term;
603	oexp.push_back(prefix + term);	610	oexp.push_back(prefix + term);
604	m_hldata.terms[term] = m_hldata.uterms.size() - 1;	611	m_hldata.terms[term] = m_hldata.uterms.size() - 1;
605	LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));	612	LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
606	return true;	613	return true;
607	}	614	}
608		615
609	#ifndef RCL_INDEX_STRIPCHARS	616	Db::MatchType mtyp = haswild ? Db::ET_WILD :
610	// The case/diac expansion db	617	nostemexp ? Db::ET_NONE : Db::ET_STEM;
611	SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
612	XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
613	&unacfoldtrans);
614	#endif // RCL_INDEX_STRIPCHARS
615
616	TermMatchResult res;	618	TermMatchResult res;
617		619	if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
618	if (haswild) {	620	m_field)) {
619	#ifndef RCL_INDEX_STRIPCHARS	621	// Let it go through
620	if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) {
621	// Perform case/diac expansion on the exp as appropriate and
622	// expand the result.
623	vector<string> exp;
624	if (diac_sensitive) {
625	// Expand for diacritics and case, filtering for same diacritics
626	SynTermTransUnac foldtrans(UNACOP_FOLD);
627	synac.keyWildExpand(term, exp, &foldtrans);
628	} else if (case_sensitive) {
629	// Expand for diacritics and case, filtering for same case
630	SynTermTransUnac unactrans(UNACOP_UNAC);
631	synac.keyWildExpand(term, exp, &unactrans);
632	} else {
633	// Expand for diacritics and case, no filtering
634	synac.keyWildExpand(term, exp);
635	}	622	}
636	// There are no wildcards in the result from above but
637	// calling termMatch gets the result into the right form
638	for (vector<string>::const_iterator it = exp.begin();
639	it != exp.end(); it++) {
640	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
641	maxexpand, m_field);
642	}
643	}
644	#endif // RCL_INDEX_STRIPCHARS
645
646	// Expand the original wildcard expression even if we did the
647	// case/diac dance above,
648	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
649	maxexpand, m_field);
650	goto termmatchtoresult;
651	}
652
653	sterm = term;
654
655	#ifdef RCL_INDEX_STRIPCHARS
656
657	db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
658	maxexpand, m_field);
659
660	#else
661
662	if (o_index_stripchars) {
663	// If the index is stripped, we can only come here if
664	// nostemexp is unset and we just need stem expansion.
665	db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
666	maxexpand, m_field);
667	} else {
668	vector<string> lexp;
669	if (diac_sensitive && case_sensitive) {
670	// No expansion whatsoever.
671	lexp.push_back(term);
672	} else if (diac_sensitive) {
673	// Expand for accents and case, filtering for same accents,
674	SynTermTransUnac foldtrans(UNACOP_FOLD);
675	synac.synExpand(term, lexp, &foldtrans);
676	} else if (case_sensitive) {
677	// Expand for accents and case, filtering for same case
678	SynTermTransUnac unactrans(UNACOP_UNAC);
679	synac.synExpand(term, lexp, &unactrans);
680	} else {
681	// We are neither accent- nor case- sensitive and may need stem
682	// expansion or not. Expand for accents and case
683	synac.synExpand(term, lexp);
684	}
685
686	if (!nostemexp) {
687	// Need stem expansion. Lowercase the result of accent and case
688	// expansion for input to stemdb.
689	for (unsigned int i = 0; i < lexp.size(); i++) {
690	string lower;
691	unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
692	lexp[i] = lower;
693	}
694	sort(lexp.begin(), lexp.end());
695	lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
696	StemDb sdb(db.m_ndb->xrdb);
697	vector<string> exp1;
698	for (vector<string>::const_iterator it = lexp.begin();
699	it != lexp.end(); it++) {
700	sdb.stemExpand(getStemLang(), *it, exp1);
701	}
702	LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
703
704	// Expand the resulting list for case (all stemdb content
705	// is lowercase)
706	lexp.clear();
707	for (vector<string>::const_iterator it = exp1.begin();
708	it != exp1.end(); it++) {
709	synac.synExpand(*it, lexp);
710	}
711	sort(lexp.begin(), lexp.end());
712	lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
713	}
714
715	// Bogus wildcard expand to generate the result (possibly add prefixes)
716	LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
717	for (vector<string>::const_iterator it = lexp.begin();
718	it != lexp.end(); it++) {
719	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
720	maxexpand, m_field);
721	}
722	}
723	#endif
724		623
725	// Term match entries to vector of terms	624	// Term match entries to vector of terms
726	termmatchtoresult:
727	if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {	625	if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
728	ermsg = "Maximum term expansion size exceeded."	626	ermsg = "Maximum term expansion size exceeded."
729	" Maybe increase maxTermExpand.";	627	" Maybe increase maxTermExpand.";
730	return false;	628	return false;
731	}	629	}
732	for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();	630	for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
733	it != res.entries.end(); it++) {	631	it != res.entries.end(); it++) {
734	oexp.push_back(it->term);	632	oexp.push_back(it->term);
735	}	633	}
736	// If the term does not exist at all in the db, the return from	634	// If the term does not exist at all in the db, the return from
737	// term match is going to be empty, which is not what we want (we	635	// termMatch() is going to be empty, which is not what we want (we
738	// would then compute an empty Xapian query)	636	// would then compute an empty Xapian query)
739	if (oexp.empty())	637	if (oexp.empty())
740	oexp.push_back(prefix + term);	638	oexp.push_back(prefix + term);
741		639
742	// Remember the uterm-to-expansion links	640	// Remember the uterm-to-expansion links