|
a/src/rcldb/searchdata.cpp |
|
b/src/rcldb/searchdata.cpp |
|
... |
|
... |
542 |
}
|
542 |
}
|
543 |
|
543 |
|
544 |
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
544 |
bool haswild = term.find_first_of(cstr_minwilds) != string::npos;
|
545 |
|
545 |
|
546 |
// If there are no wildcards, add term to the list of user-entered terms
|
546 |
// If there are no wildcards, add term to the list of user-entered terms
|
547 |
if (!haswild)
|
547 |
if (!haswild) {
|
548 |
m_hldata.uterms.insert(term);
|
548 |
m_hldata.uterms.insert(term);
|
549 |
|
549 |
sterm = term;
|
|
|
550 |
}
|
550 |
// No stem expansion if there are wildcards or if prevented by caller
|
551 |
// No stem expansion if there are wildcards or if prevented by caller
|
551 |
bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
|
552 |
bool nostemexp = (mods & SDCM_NOSTEMMING) != 0;
|
552 |
if (haswild || getStemLang().empty()) {
|
553 |
if (haswild || getStemLang().empty()) {
|
553 |
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
554 |
LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
|
554 |
nostemexp = true;
|
555 |
nostemexp = true;
|
555 |
}
|
556 |
}
|
556 |
|
557 |
|
557 |
// noexpansion can be modified further down by possible case/diac expansion
|
558 |
// noexpansion can be modified further down by possible case/diac expansion
|
558 |
bool noexpansion = nostemexp && !haswild;
|
559 |
bool noexpansion = nostemexp && !haswild;
|
559 |
|
560 |
|
|
|
561 |
int termmatchsens = 0;
|
|
|
562 |
|
560 |
#ifndef RCL_INDEX_STRIPCHARS
|
563 |
#ifndef RCL_INDEX_STRIPCHARS
|
561 |
bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
|
564 |
bool diac_sensitive = (mods & SDCM_DIACSENS) != 0;
|
562 |
bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
|
565 |
bool case_sensitive = (mods & SDCM_CASESENS) != 0;
|
563 |
|
566 |
|
564 |
if (o_index_stripchars) {
|
567 |
if (o_index_stripchars) {
|
565 |
diac_sensitive = case_sensitive = false;
|
568 |
diac_sensitive = case_sensitive = false;
|
566 |
} else {
|
569 |
} else {
|
567 |
// If we are working with a raw index, apply the rules for case and
|
570 |
// If we are working with a raw index, apply the rules for case and
|
|
... |
|
... |
594 |
}
|
597 |
}
|
595 |
|
598 |
|
596 |
if (!case_sensitive || !diac_sensitive)
|
599 |
if (!case_sensitive || !diac_sensitive)
|
597 |
noexpansion = false;
|
600 |
noexpansion = false;
|
598 |
}
|
601 |
}
|
|
|
602 |
|
|
|
603 |
if (case_sensitive)
|
|
|
604 |
termmatchsens |= Db::ET_CASESENS;
|
|
|
605 |
if (diac_sensitive)
|
|
|
606 |
termmatchsens |= Db::ET_DIACSENS;
|
599 |
#endif
|
607 |
#endif
|
600 |
|
608 |
|
601 |
if (noexpansion) {
|
609 |
if (noexpansion) {
|
602 |
sterm = term;
|
|
|
603 |
oexp.push_back(prefix + term);
|
610 |
oexp.push_back(prefix + term);
|
604 |
m_hldata.terms[term] = m_hldata.uterms.size() - 1;
|
611 |
m_hldata.terms[term] = m_hldata.uterms.size() - 1;
|
605 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
612 |
LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
|
606 |
return true;
|
613 |
return true;
|
607 |
}
|
614 |
}
|
608 |
|
615 |
|
609 |
#ifndef RCL_INDEX_STRIPCHARS
|
616 |
Db::MatchType mtyp = haswild ? Db::ET_WILD :
|
610 |
// The case/diac expansion db
|
617 |
nostemexp ? Db::ET_NONE : Db::ET_STEM;
|
611 |
SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
|
|
|
612 |
XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all",
|
|
|
613 |
&unacfoldtrans);
|
|
|
614 |
#endif // RCL_INDEX_STRIPCHARS
|
|
|
615 |
|
|
|
616 |
TermMatchResult res;
|
618 |
TermMatchResult res;
|
617 |
|
619 |
if (!db.termMatch(mtyp | termmatchsens, getStemLang(), term, res, maxexpand,
|
618 |
if (haswild) {
|
620 |
m_field)) {
|
619 |
#ifndef RCL_INDEX_STRIPCHARS
|
621 |
// Let it go through
|
620 |
if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) {
|
|
|
621 |
// Perform case/diac expansion on the exp as appropriate and
|
|
|
622 |
// expand the result.
|
|
|
623 |
vector<string> exp;
|
|
|
624 |
if (diac_sensitive) {
|
|
|
625 |
// Expand for diacritics and case, filtering for same diacritics
|
|
|
626 |
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
|
627 |
synac.keyWildExpand(term, exp, &foldtrans);
|
|
|
628 |
} else if (case_sensitive) {
|
|
|
629 |
// Expand for diacritics and case, filtering for same case
|
|
|
630 |
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
|
631 |
synac.keyWildExpand(term, exp, &unactrans);
|
|
|
632 |
} else {
|
|
|
633 |
// Expand for diacritics and case, no filtering
|
|
|
634 |
synac.keyWildExpand(term, exp);
|
|
|
635 |
}
|
622 |
}
|
636 |
// There are no wildcards in the result from above but
|
|
|
637 |
// calling termMatch gets the result into the right form
|
|
|
638 |
for (vector<string>::const_iterator it = exp.begin();
|
|
|
639 |
it != exp.end(); it++) {
|
|
|
640 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
|
|
641 |
maxexpand, m_field);
|
|
|
642 |
}
|
|
|
643 |
}
|
|
|
644 |
#endif // RCL_INDEX_STRIPCHARS
|
|
|
645 |
|
|
|
646 |
// Expand the original wildcard expression even if we did the
|
|
|
647 |
// case/diac dance above,
|
|
|
648 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res,
|
|
|
649 |
maxexpand, m_field);
|
|
|
650 |
goto termmatchtoresult;
|
|
|
651 |
}
|
|
|
652 |
|
|
|
653 |
sterm = term;
|
|
|
654 |
|
|
|
655 |
#ifdef RCL_INDEX_STRIPCHARS
|
|
|
656 |
|
|
|
657 |
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
|
|
658 |
maxexpand, m_field);
|
|
|
659 |
|
|
|
660 |
#else
|
|
|
661 |
|
|
|
662 |
if (o_index_stripchars) {
|
|
|
663 |
// If the index is stripped, we can only come here if
|
|
|
664 |
// nostemexp is unset and we just need stem expansion.
|
|
|
665 |
db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res,
|
|
|
666 |
maxexpand, m_field);
|
|
|
667 |
} else {
|
|
|
668 |
vector<string> lexp;
|
|
|
669 |
if (diac_sensitive && case_sensitive) {
|
|
|
670 |
// No expansion whatsoever.
|
|
|
671 |
lexp.push_back(term);
|
|
|
672 |
} else if (diac_sensitive) {
|
|
|
673 |
// Expand for accents and case, filtering for same accents,
|
|
|
674 |
SynTermTransUnac foldtrans(UNACOP_FOLD);
|
|
|
675 |
synac.synExpand(term, lexp, &foldtrans);
|
|
|
676 |
} else if (case_sensitive) {
|
|
|
677 |
// Expand for accents and case, filtering for same case
|
|
|
678 |
SynTermTransUnac unactrans(UNACOP_UNAC);
|
|
|
679 |
synac.synExpand(term, lexp, &unactrans);
|
|
|
680 |
} else {
|
|
|
681 |
// We are neither accent- nor case- sensitive and may need stem
|
|
|
682 |
// expansion or not. Expand for accents and case
|
|
|
683 |
synac.synExpand(term, lexp);
|
|
|
684 |
}
|
|
|
685 |
|
|
|
686 |
if (!nostemexp) {
|
|
|
687 |
// Need stem expansion. Lowercase the result of accent and case
|
|
|
688 |
// expansion for input to stemdb.
|
|
|
689 |
for (unsigned int i = 0; i < lexp.size(); i++) {
|
|
|
690 |
string lower;
|
|
|
691 |
unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
|
|
|
692 |
lexp[i] = lower;
|
|
|
693 |
}
|
|
|
694 |
sort(lexp.begin(), lexp.end());
|
|
|
695 |
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
|
696 |
StemDb sdb(db.m_ndb->xrdb);
|
|
|
697 |
vector<string> exp1;
|
|
|
698 |
for (vector<string>::const_iterator it = lexp.begin();
|
|
|
699 |
it != lexp.end(); it++) {
|
|
|
700 |
sdb.stemExpand(getStemLang(), *it, exp1);
|
|
|
701 |
}
|
|
|
702 |
LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
|
|
|
703 |
|
|
|
704 |
// Expand the resulting list for case (all stemdb content
|
|
|
705 |
// is lowercase)
|
|
|
706 |
lexp.clear();
|
|
|
707 |
for (vector<string>::const_iterator it = exp1.begin();
|
|
|
708 |
it != exp1.end(); it++) {
|
|
|
709 |
synac.synExpand(*it, lexp);
|
|
|
710 |
}
|
|
|
711 |
sort(lexp.begin(), lexp.end());
|
|
|
712 |
lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
|
|
|
713 |
}
|
|
|
714 |
|
|
|
715 |
// Bogus wildcard expand to generate the result (possibly add prefixes)
|
|
|
716 |
LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
|
|
|
717 |
for (vector<string>::const_iterator it = lexp.begin();
|
|
|
718 |
it != lexp.end(); it++) {
|
|
|
719 |
db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
|
|
|
720 |
maxexpand, m_field);
|
|
|
721 |
}
|
|
|
722 |
}
|
|
|
723 |
#endif
|
|
|
724 |
|
623 |
|
725 |
// Term match entries to vector of terms
|
624 |
// Term match entries to vector of terms
|
726 |
termmatchtoresult:
|
|
|
727 |
if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
|
625 |
if (int(res.entries.size()) >= maxexpand && !maxexpissoft) {
|
728 |
ermsg = "Maximum term expansion size exceeded."
|
626 |
ermsg = "Maximum term expansion size exceeded."
|
729 |
" Maybe increase maxTermExpand.";
|
627 |
" Maybe increase maxTermExpand.";
|
730 |
return false;
|
628 |
return false;
|
731 |
}
|
629 |
}
|
732 |
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
630 |
for (vector<TermMatchEntry>::const_iterator it = res.entries.begin();
|
733 |
it != res.entries.end(); it++) {
|
631 |
it != res.entries.end(); it++) {
|
734 |
oexp.push_back(it->term);
|
632 |
oexp.push_back(it->term);
|
735 |
}
|
633 |
}
|
736 |
// If the term does not exist at all in the db, the return from
|
634 |
// If the term does not exist at all in the db, the return from
|
737 |
// term match is going to be empty, which is not what we want (we
|
635 |
// termMatch() is going to be empty, which is not what we want (we
|
738 |
// would then compute an empty Xapian query)
|
636 |
// would then compute an empty Xapian query)
|
739 |
if (oexp.empty())
|
637 |
if (oexp.empty())
|
740 |
oexp.push_back(prefix + term);
|
638 |
oexp.push_back(prefix + term);
|
741 |
|
639 |
|
742 |
// Remember the uterm-to-expansion links
|
640 |
// Remember the uterm-to-expansion links
|