|
a/src/rcldb/searchdata.cpp |
|
b/src/rcldb/searchdata.cpp |
|
... |
|
... |
543 |
/** Expand term into term list, using appropriate mode: stem, wildcards,
|
543 |
/** Expand term into term list, using appropriate mode: stem, wildcards,
|
544 |
* diacritics...
|
544 |
* diacritics...
|
545 |
*
|
545 |
*
|
546 |
* @param mods stem expansion, case and diacritics sensitivity control.
|
546 |
* @param mods stem expansion, case and diacritics sensitivity control.
|
547 |
* @param term input single word
|
547 |
* @param term input single word
|
548 |
* @param exp output expansion list
|
548 |
* @param oexp output expansion list
|
549 |
* @param sterm output original input term if there were no wildcards
|
549 |
* @param sterm output original input term if there were no wildcards
|
550 |
* @param prefix field prefix in index. We could recompute it, but the caller
|
550 |
* @param prefix field prefix in index. We could recompute it, but the caller
|
551 |
* has it already. Used in the simple case where there is nothing to expand,
|
551 |
* has it already. Used in the simple case where there is nothing to expand,
|
552 |
* and we just return the prefixed term (else Db::termMatch deals with it).
|
552 |
* and we just return the prefixed term (else Db::termMatch deals with it).
|
553 |
*/
|
553 |
*/
|
|
... |
|
... |
576 |
|
576 |
|
577 |
// If there are no wildcards, add term to the list of user-entered terms
|
577 |
// If there are no wildcards, add term to the list of user-entered terms
|
578 |
if (!haswild)
|
578 |
if (!haswild)
|
579 |
m_hldata.uterms.insert(term);
|
579 |
m_hldata.uterms.insert(term);
|
580 |
|
580 |
|
|
|
581 |
// No stem expansion if there are wildcards or if prevented by caller
|
581 |
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
582 |
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
582 |
|
|
|
583 |
// No stem expansion if there are wildcards or if prevented by caller
|
|
|
584 |
if (haswild || getStemLang().empty()) {
|
583 |
if (haswild || getStemLang().empty()) {
|
585 |
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
584 |
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
586 |
nostemexp = true;
|
585 |
nostemexp = true;
|
587 |
}
|
586 |
}
|
588 |
|
587 |
|
|
|
588 |
// noexpansion can be modified further down by possible case/diac expansion
|
589 |
bool noexpansion = nostemexp && !haswild;
|
589 |
bool noexpansion = nostemexp && !haswild;
|
590 |
|
590 |
|
591 |
#ifndef RCL_INDEX_STRIPCHARS
|
591 |
#ifndef RCL_INDEX_STRIPCHARS
|
592 |
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
592 |
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
593 |
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
593 |
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
594 |
|
594 |
|
|
... |
|
... |
635 |
m_hldata.terms[term] = m_hldata.uterms.size() - 1;
|
635 |
m_hldata.terms[term] = m_hldata.uterms.size() - 1;
|
636 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
636 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
637 |
return true;
|
637 |
return true;
|
638 |
}
|
638 |
}
|
639 |
|
639 |
|
640 |
// Make objects before the goto jungle to avoid compiler complaints
|
640 |
// The case/diac expansion db
|
641 |
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
641 |
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
642 |
XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
|
642 |
XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
|
643 |
&unacfoldtrans);
|
643 |
&unacfoldtrans);
|
644 |
// This will hold the result of case and diacritics expansion as input
|
|
|
645 |
// to stem expansion.
|
|
|
646 |
vector<string> lexp;
|
|
|
647 |
|
|
|
648 |
TermMatchResult res;
|
644 |
TermMatchResult res;
|
|
|
645 |
|
649 |
if (haswild) {
|
646 |
if (haswild) {
|
650 |
// Note that if there are wildcards, we do a direct from-index
|
647 |
#ifndef RCL_INDEX_STRIPCHARS
|
651 |
// expansion, which means that we are casediac-sensitive. There
|
648 |
if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) {
|
652 |
// would be nothing to prevent us to expand from the casediac
|
649 |
// Perform case/diac expansion on the exp as appropriate and
|
653 |
// synonyms first. To be done later
|
650 |
// expand the result.
|
|
|
651 |
vector<string> exp;
|
|
|
652 |
if (diac_sensitive) {
|
|
|
653 |
// Expand for diacritics and case, filtering for same diacritics
|
|
|
654 |
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
|
655 |
synac.keyWildExpand(term, exp, &foldtrans);
|
|
|
656 |
} else if (case_sensitive) {
|
|
|
657 |
// Expand for diacritics and case, filtering for same case
|
|
|
658 |
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
|
659 |
synac.keyWildExpand(term, exp, &unactrans);
|
|
|
660 |
} else {
|
|
|
661 |
// Expand for diacritics and case, no filtering
|
|
|
662 |
synac.keyWildExpand(term, exp);
|
|
|
663 |
}
|
|
|
664 |
// There are no wildcards in the result from above but
|
|
|
665 |
// calling termMatch gets the result into the right form
|
|
|
666 |
for (vector<string>::const_iterator it = exp.begin();
|
|
|
667 |
it != exp.end(); it++) {
|
|
|
668 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
|
|
669 |
maxexpand, m_field);
|
|
|
670 |
}
|
|
|
671 |
}
|
|
|
672 |
#endif // RCL_INDEX_STRIPCHARS
|
|
|
673 |
|
|
|
674 |
// Expand the original wildcard expression even if we did the
|
|
|
675 |
// case/diac dance above,
|
654 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
|
676 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
|
655 |
maxexpand, m_field);
|
677 |
maxexpand, m_field);
|
656 |
goto termmatchtoresult;
|
678 |
goto termmatchtoresult;
|
657 |
}
|
679 |
}
|
658 |
|
680 |
|
|
... |
|
... |
668 |
if (o_index_stripchars) {
|
690 |
if (o_index_stripchars) {
|
669 |
// If the index is stripped, we can only come here if
|
691 |
// If the index is stripped, we can only come here if
|
670 |
// nostemexp is unset and we just need stem expansion.
|
692 |
// nostemexp is unset and we just need stem expansion.
|
671 |
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
693 |
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
672 |
maxexpand, m_field);
|
694 |
maxexpand, m_field);
|
673 |
goto termmatchtoresult;
|
|
|
674 |
}
|
|
|
675 |
|
|
|
676 |
// No stem expansion when diacritic or case sensitivity is set, it
|
|
|
677 |
// makes no sense (it would mess with the diacritics anyway if
|
|
|
678 |
// they are not in the stem part). In these 3 cases, perform
|
|
|
679 |
// appropriate expansion from the charstripping db, and do a bogus
|
|
|
680 |
// wildcard expansion (there is no wild card) to generate the
|
|
|
681 |
// result:
|
|
|
682 |
|
|
|
683 |
if (diac_sensitive && case_sensitive) {
|
|
|
684 |
// No expansion whatsoever.
|
|
|
685 |
lexp.push_back(term);
|
|
|
686 |
goto exptotermatch;
|
|
|
687 |
} else if (diac_sensitive) {
|
|
|
688 |
// Expand for accents and case, filtering for same accents,
|
|
|
689 |
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
|
690 |
synac.synExpand(term, lexp, &foldtrans);
|
|
|
691 |
goto exptotermatch;
|
|
|
692 |
} else if (case_sensitive) {
|
|
|
693 |
// Expand for accents and case, filtering for same case
|
|
|
694 |
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
|
695 |
synac.synExpand(term, lexp, &unactrans);
|
|
|
696 |
goto exptotermatch;
|
|
|
697 |
} else {
|
695 |
} else {
|
|
|
696 |
vector<string> lexp;
|
|
|
697 |
if (diac_sensitive && case_sensitive) {
|
|
|
698 |
// No expansion whatsoever.
|
|
|
699 |
lexp.push_back(term);
|
|
|
700 |
} else if (diac_sensitive) {
|
|
|
701 |
// Expand for accents and case, filtering for same accents,
|
|
|
702 |
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
|
703 |
synac.synExpand(term, lexp, &foldtrans);
|
|
|
704 |
} else if (case_sensitive) {
|
|
|
705 |
// Expand for accents and case, filtering for same case
|
|
|
706 |
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
|
707 |
synac.synExpand(term, lexp, &unactrans);
|
|
|
708 |
} else {
|
698 |
// We are neither accent- nor case- sensitive and may need stem
|
709 |
// We are neither accent- nor case- sensitive and may need stem
|
699 |
// expansion or not. Expand for accents and case
|
710 |
// expansion or not. Expand for accents and case
|
700 |
synac.synExpand(term, lexp);
|
711 |
synac.synExpand(term, lexp);
|
|
|
712 |
}
|
|
|
713 |
|
701 |
if (nostemexp)
|
714 |
if (!nostemexp) {
|
702 |
goto exptotermatch;
|
|
|
703 |
}
|
|
|
704 |
|
|
|
705 |
// Need stem expansion. Lowercase the result of accent and case
|
715 |
// Need stem expansion. Lowercase the result of accent and case
|
706 |
// expansion for input to stemdb.
|
716 |
// expansion for input to stemdb.
|
707 |
for (unsigned int i = 0; i < lexp.size(); i++) {
|
717 |
for (unsigned int i = 0; i < lexp.size(); i++) {
|
708 |
string lower;
|
718 |
string lower;
|
709 |
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
719 |
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
710 |
lexp[i] = lower;
|
720 |
lexp[i] = lower;
|
711 |
}
|
721 |
}
|
712 |
sort(lexp.begin(), lexp.end());
|
722 |
sort(lexp.begin(), lexp.end());
|
713 |
{
|
723 |
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
714 |
vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
|
|
|
715 |
lexp.resize(uit - lexp.begin());
|
|
|
716 |
StemDb sdb(db.m_ndb->xrdb);
|
724 |
StemDb sdb(db.m_ndb->xrdb);
|
717 |
vector<string> exp1;
|
725 |
vector<string> exp1;
|
|
|
726 |
for (vector<string>::const_iterator it = lexp.begin();
|
|
|
727 |
it != lexp.end(); it++) {
|
|
|
728 |
sdb.stemExpand(getStemLang(), *it, exp1);
|
|
|
729 |
}
|
|
|
730 |
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
|
|
731 |
|
|
|
732 |
// Expand the resulting list for case (all stemdb content
|
|
|
733 |
// is lowercase)
|
|
|
734 |
lexp.clear();
|
|
|
735 |
for (vector<string>::const_iterator it = exp1.begin();
|
|
|
736 |
it != exp1.end(); it++) {
|
|
|
737 |
synac.synExpand(*it, lexp);
|
|
|
738 |
}
|
|
|
739 |
sort(lexp.begin(), lexp.end());
|
|
|
740 |
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
|
741 |
}
|
|
|
742 |
|
|
|
743 |
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
|
|
744 |
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
718 |
for (vector<string>::const_iterator it = lexp.begin();
|
745 |
for (vector<string>::const_iterator it = lexp.begin();
|
719 |
it != lexp.end(); it++) {
|
746 |
it != lexp.end(); it++) {
|
720 |
sdb.stemExpand(getStemLang(), *it, exp1);
|
|
|
721 |
}
|
|
|
722 |
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
|
|
723 |
|
|
|
724 |
// Expand the resulting list for case (all stemdb content
|
|
|
725 |
// is lowercase)
|
|
|
726 |
lexp.clear();
|
|
|
727 |
for (vector<string>::const_iterator it = exp1.begin();
|
|
|
728 |
it != exp1.end(); it++) {
|
|
|
729 |
synac.synExpand(*it, lexp);
|
|
|
730 |
}
|
|
|
731 |
sort(lexp.begin(), lexp.end());
|
|
|
732 |
uit = unique(lexp.begin(), lexp.end());
|
|
|
733 |
lexp.resize(uit - lexp.begin());
|
|
|
734 |
}
|
|
|
735 |
|
|
|
736 |
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
|
|
737 |
exptotermatch:
|
|
|
738 |
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
|
|
739 |
for (vector<string>::const_iterator it = lexp.begin();
|
|
|
740 |
it != lexp.end(); it++) {
|
|
|
741 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
747 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
742 |
maxexpand, m_field);
|
748 |
maxexpand, m_field);
|
|
|
749 |
}
|
743 |
}
|
750 |
}
|
744 |
#endif
|
751 |
#endif
|
745 |
|
752 |
|
746 |
// Term match entries to vector of terms
|
753 |
// Term match entries to vector of terms
|
747 |
termmatchtoresult:
|
754 |
termmatchtoresult:
|