recoll / Code / Diff of /src/rcldb/searchdata.cpp

Diff of /src/rcldb/searchdata.cpp [50af4f] .. [9b55eb]

Switch to unified view


...
/** Expand term into term list, using appropriate mode: stem, wildcards, 
 *  diacritics... 
 *
 * @param mods stem expansion, case and diacritics sensitivity control.
 * @param term input single word
 * @param oexp output expansion list
 * @param sterm output original input term if there were no wildcards
 * @param prefix field prefix in index. We could recompute it, but the caller
 *  has it already. Used in the simple case where there is nothing to expand, 
 *  and we just return the prefixed term (else Db::termMatch deals with it).
 */
...

    // If there are no wildcards, add term to the list of user-entered terms
    if (!haswild)
    m_hldata.uterms.insert(term);

    // No stem expansion if there are wildcards or if prevented by caller
    bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;


    if (haswild || getStemLang().empty()) {
    LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
    nostemexp = true;
    }

    // noexpansion can be modified further down by possible case/diac expansion
    bool noexpansion = nostemexp && !haswild; 

#ifndef RCL_INDEX_STRIPCHARS
    bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
    bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;

...
    m_hldata.terms[term] = m_hldata.uterms.size() - 1;
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
    return true;
    } 

    // The case/diac expansion db
    SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
    XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all", 
                    &unacfoldtrans);




    TermMatchResult res;

    if (haswild) {
#ifndef RCL_INDEX_STRIPCHARS
  if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) {
      // Perform case/diac expansion on the wildcard expression as
      // appropriate and expand the result.
      vector<string> exp;
      if (diac_sensitive) {
      // Expand for diacritics and case, filtering for same diacritics
      SynTermTransUnac foldtrans(UNACOP_FOLD);
      synac.keyWildExpand(term, exp, &foldtrans);
      } else if (case_sensitive) {
      // Expand for diacritics and case, filtering for same case
      SynTermTransUnac unactrans(UNACOP_UNAC);
      synac.keyWildExpand(term, exp, &unactrans);
      } else {
      // Expand for diacritics and case, no filtering
      synac.keyWildExpand(term, exp);
      }
      // There are no wildcards in the result from above but
      // calling termMatch gets the result into the right form
      for (vector<string>::const_iterator it = exp.begin(); 
       it != exp.end(); it++) {
      db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res, 
               maxexpand, m_field);
      }
  }
#endif // RCL_INDEX_STRIPCHARS

  // Expand the original wildcard expression even if we did the
  // case/diac dance above.
    db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res, 
             maxexpand, m_field);
    goto termmatchtoresult;
    }

...
    if (o_index_stripchars) {
    // If the index is stripped, we can only come here if
    // nostemexp is unset and we just need stem expansion.
    db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res, 
             maxexpand, m_field);
























    } else {
  vector<string> lexp;
  if (diac_sensitive && case_sensitive) {
      // No expansion whatsoever. 
      lexp.push_back(term);
  } else if (diac_sensitive) {
      // Expand for accents and case, filtering for same accents.
      SynTermTransUnac foldtrans(UNACOP_FOLD);
      synac.synExpand(term, lexp, &foldtrans);
  } else if (case_sensitive) {
      // Expand for accents and case, filtering for same case
      SynTermTransUnac unactrans(UNACOP_UNAC);
      synac.synExpand(term, lexp, &unactrans);
  } else {
        // We are neither accent- nor case-sensitive, and may or may not
        // need stem expansion. Expand for accents and case.
        synac.synExpand(term, lexp);
  }

    if (!nostemexp) {



      // Need stem expansion. Lowercase the result of accent and case
      // expansion for input to stemdb.
      for (unsigned int i = 0; i < lexp.size(); i++) {
     string lower;
     unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
     lexp[i] = lower;
      }
      sort(lexp.begin(), lexp.end());
      lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());


        StemDb sdb(db.m_ndb->xrdb);
        vector<string> exp1;
      for (vector<string>::const_iterator it = lexp.begin(); 
       it != lexp.end(); it++) {
      sdb.stemExpand(getStemLang(), *it, exp1);
      }
      LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));

      // Expand the resulting list for case (all stemdb content
      // is lowercase)
      lexp.clear();
      for (vector<string>::const_iterator it = exp1.begin(); 
       it != exp1.end(); it++) {
      synac.synExpand(*it, lexp);
      }
      sort(lexp.begin(), lexp.end());
      lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
  }

  // Bogus wildcard expand to generate the result (possibly add prefixes)
  LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
    for (vector<string>::const_iterator it = lexp.begin();
         it != lexp.end(); it++) {





















        db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
           maxexpand, m_field);
  }
    }
#endif

    // Term match entries to vector of terms
termmatchtoresult:

	a/src/rcldb/searchdata.cpp		b/src/rcldb/searchdata.cpp
	...		...
543	/** Expand term into term list, using appropriate mode: stem, wildcards,	543	/** Expand term into term list, using appropriate mode: stem, wildcards,
544	* diacritics...	544	* diacritics...
545	*	545	*
546	* @param mods stem expansion, case and diacritics sensitivity control.	546	* @param mods stem expansion, case and diacritics sensitivity control.
547	* @param term input single word	547	* @param term input single word
548	* @param exp output expansion list	548	* @param oexp output expansion list
549	* @param sterm output original input term if there were no wildcards	549	* @param sterm output original input term if there were no wildcards
550	* @param prefix field prefix in index. We could recompute it, but the caller	550	* @param prefix field prefix in index. We could recompute it, but the caller
551	* has it already. Used in the simple case where there is nothing to expand,	551	* has it already. Used in the simple case where there is nothing to expand,
552	* and we just return the prefixed term (else Db::termMatch deals with it).	552	* and we just return the prefixed term (else Db::termMatch deals with it).
553	*/	553	*/
	...		...
576		576
577	// If there are no wildcards, add term to the list of user-entered terms	577	// If there are no wildcards, add term to the list of user-entered terms
578	if (!haswild)	578	if (!haswild)
579	m_hldata.uterms.insert(term);	579	m_hldata.uterms.insert(term);
580		580
		581	// No stem expansion if there are wildcards or if prevented by caller
581	bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;	582	bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
582
583	// No stem expansion if there are wildcards or if prevented by caller
584	if (haswild \|\| getStemLang().empty()) {	583	if (haswild \|\| getStemLang().empty()) {
585	LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));	584	LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
586	nostemexp = true;	585	nostemexp = true;
587	}	586	}
588		587
		588	// noexpansion can be modified further down by possible case/diac expansion
589	bool noexpansion = nostemexp && !haswild;	589	bool noexpansion = nostemexp && !haswild;
590		590
591	#ifndef RCL_INDEX_STRIPCHARS	591	#ifndef RCL_INDEX_STRIPCHARS
592	bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;	592	bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
593	bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;	593	bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
594		594
	...		...
635	m_hldata.terms[term] = m_hldata.uterms.size() - 1;	635	m_hldata.terms[term] = m_hldata.uterms.size() - 1;
636	LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));	636	LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
637	return true;	637	return true;
638	}	638	}
639		639
640	// Make objects before the goto jungle to avoid compiler complaints	640	// The case/diac expansion db
641	SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);	641	SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
642	XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",	642	XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
643	&unacfoldtrans);	643	&unacfoldtrans);
644	// This will hold the result of case and diacritics expansion as input
645	// to stem expansion.
646	vector<string> lexp;
647
648	TermMatchResult res;	644	TermMatchResult res;
		645
649	if (haswild) {	646	if (haswild) {
650	// Note that if there are wildcards, we do a direct from-index	647	#ifndef RCL_INDEX_STRIPCHARS
651	// expansion, which means that we are casediac-sensitive. There	648	if (!o_index_stripchars && (!diac_sensitive \|\| !case_sensitive)) {
652	// would be nothing to prevent us to expand from the casediac	649	// Perform case/diac expansion on the exp as appropriate and
653	// synonyms first. To be done later	650	// expand the result.
		651	vector<string> exp;
		652	if (diac_sensitive) {
		653	// Expand for diacritics and case, filtering for same diacritics
		654	SynTermTransUnac foldtrans(UNACOP_FOLD);
		655	synac.keyWildExpand(term, exp, &foldtrans);
		656	} else if (case_sensitive) {
		657	// Expand for diacritics and case, filtering for same case
		658	SynTermTransUnac unactrans(UNACOP_UNAC);
		659	synac.keyWildExpand(term, exp, &unactrans);
		660	} else {
		661	// Expand for diacritics and case, no filtering
		662	synac.keyWildExpand(term, exp);
		663	}
		664	// There are no wildcards in the result from above but
		665	// calling termMatch gets the result into the right form
		666	for (vector<string>::const_iterator it = exp.begin();
		667	it != exp.end(); it++) {
		668	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
		669	maxexpand, m_field);
		670	}
		671	}
		672	#endif // RCL_INDEX_STRIPCHARS
		673
		674	// Expand the original wildcard expression even if we did the
		675	// case/diac dance above,
654	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,	676	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
655	maxexpand, m_field);	677	maxexpand, m_field);
656	goto termmatchtoresult;	678	goto termmatchtoresult;
657	}	679	}
658		680
	...		...
668	if (o_index_stripchars) {	690	if (o_index_stripchars) {
669	// If the index is stripped, we can only come here if	691	// If the index is stripped, we can only come here if
670	// nostemexp is unset and we just need stem expansion.	692	// nostemexp is unset and we just need stem expansion.
671	db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,	693	db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
672	maxexpand, m_field);	694	maxexpand, m_field);
673	goto termmatchtoresult;
674	}
675
676	// No stem expansion when diacritic or case sensitivity is set, it
677	// makes no sense (it would mess with the diacritics anyway if
678	// they are not in the stem part). In these 3 cases, perform
679	// appropriate expansion from the charstripping db, and do a bogus
680	// wildcard expansion (there is no wild card) to generate the
681	// result:
682
683	if (diac_sensitive && case_sensitive) {
684	// No expansion whatsoever.
685	lexp.push_back(term);
686	goto exptotermatch;
687	} else if (diac_sensitive) {
688	// Expand for accents and case, filtering for same accents,
689	SynTermTransUnac foldtrans(UNACOP_FOLD);
690	synac.synExpand(term, lexp, &foldtrans);
691	goto exptotermatch;
692	} else if (case_sensitive) {
693	// Expand for accents and case, filtering for same case
694	SynTermTransUnac unactrans(UNACOP_UNAC);
695	synac.synExpand(term, lexp, &unactrans);
696	goto exptotermatch;
697	} else {	695	} else {
		696	vector<string> lexp;
		697	if (diac_sensitive && case_sensitive) {
		698	// No expansion whatsoever.
		699	lexp.push_back(term);
		700	} else if (diac_sensitive) {
		701	// Expand for accents and case, filtering for same accents,
		702	SynTermTransUnac foldtrans(UNACOP_FOLD);
		703	synac.synExpand(term, lexp, &foldtrans);
		704	} else if (case_sensitive) {
		705	// Expand for accents and case, filtering for same case
		706	SynTermTransUnac unactrans(UNACOP_UNAC);
		707	synac.synExpand(term, lexp, &unactrans);
		708	} else {
698	// We are neither accent- nor case- sensitive and may need stem	709	// We are neither accent- nor case- sensitive and may need stem
699	// expansion or not. Expand for accents and case	710	// expansion or not. Expand for accents and case
700	synac.synExpand(term, lexp);	711	synac.synExpand(term, lexp);
		712	}
		713
701	if (nostemexp)	714	if (!nostemexp) {
702	goto exptotermatch;
703	}
704
705	// Need stem expansion. Lowercase the result of accent and case	715	// Need stem expansion. Lowercase the result of accent and case
706	// expansion for input to stemdb.	716	// expansion for input to stemdb.
707	for (unsigned int i = 0; i < lexp.size(); i++) {	717	for (unsigned int i = 0; i < lexp.size(); i++) {
708	string lower;	718	string lower;
709	unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);	719	unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
710	lexp[i] = lower;	720	lexp[i] = lower;
711	}	721	}
712	sort(lexp.begin(), lexp.end());	722	sort(lexp.begin(), lexp.end());
713	{	723	lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
714	vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
715	lexp.resize(uit - lexp.begin());
716	StemDb sdb(db.m_ndb->xrdb);	724	StemDb sdb(db.m_ndb->xrdb);
717	vector<string> exp1;	725	vector<string> exp1;
		726	for (vector<string>::const_iterator it = lexp.begin();
		727	it != lexp.end(); it++) {
		728	sdb.stemExpand(getStemLang(), *it, exp1);
		729	}
		730	LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
		731
		732	// Expand the resulting list for case (all stemdb content
		733	// is lowercase)
		734	lexp.clear();
		735	for (vector<string>::const_iterator it = exp1.begin();
		736	it != exp1.end(); it++) {
		737	synac.synExpand(*it, lexp);
		738	}
		739	sort(lexp.begin(), lexp.end());
		740	lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
		741	}
		742
		743	// Bogus wildcard expand to generate the result (possibly add prefixes)
		744	LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
718	for (vector<string>::const_iterator it = lexp.begin();	745	for (vector<string>::const_iterator it = lexp.begin();
719	it != lexp.end(); it++) {	746	it != lexp.end(); it++) {
720	sdb.stemExpand(getStemLang(), *it, exp1);
721	}
722	LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
723
724	// Expand the resulting list for case (all stemdb content
725	// is lowercase)
726	lexp.clear();
727	for (vector<string>::const_iterator it = exp1.begin();
728	it != exp1.end(); it++) {
729	synac.synExpand(*it, lexp);
730	}
731	sort(lexp.begin(), lexp.end());
732	uit = unique(lexp.begin(), lexp.end());
733	lexp.resize(uit - lexp.begin());
734	}
735
736	// Bogus wildcard expand to generate the result (possibly add prefixes)
737	exptotermatch:
738	LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
739	for (vector<string>::const_iterator it = lexp.begin();
740	it != lexp.end(); it++) {
741	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,	747	db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
742	maxexpand, m_field);	748	maxexpand, m_field);
		749	}
743	}	750	}
744	#endif	751	#endif
745		752
746	// Term match entries to vector of terms	753	// Term match entries to vector of terms
747	termmatchtoresult:	754	termmatchtoresult: