Switch to unified view

a/src/rcldb/searchdata.cpp b/src/rcldb/searchdata.cpp
...
...
543
/** Expand term into term list, using appropriate mode: stem, wildcards, 
543
/** Expand term into term list, using appropriate mode: stem, wildcards, 
544
 *  diacritics... 
544
 *  diacritics... 
545
 *
545
 *
546
 * @param mods stem expansion, case and diacritics sensitivity control.
546
 * @param mods stem expansion, case and diacritics sensitivity control.
547
 * @param term input single word
547
 * @param term input single word
548
 * @param oexp output expansion list
548
 * @param oexp output expansion list
549
 * @param sterm output original input term if there were no wildcards
549
 * @param sterm output original input term if there were no wildcards
550
 * @param prefix field prefix in index. We could recompute it, but the caller
550
 * @param prefix field prefix in index. We could recompute it, but the caller
551
 *  has it already. Used in the simple case where there is nothing to expand, 
551
 *  has it already. Used in the simple case where there is nothing to expand, 
552
 *  and we just return the prefixed term (else Db::termMatch deals with it).
552
 *  and we just return the prefixed term (else Db::termMatch deals with it).
553
 */
553
 */
...
...
576
576
577
    // If there are no wildcards, add term to the list of user-entered terms
577
    // If there are no wildcards, add term to the list of user-entered terms
578
    if (!haswild)
578
    if (!haswild)
579
    m_hldata.uterms.insert(term);
579
    m_hldata.uterms.insert(term);
580
580
581
    // No stem expansion if there are wildcards or if prevented by caller
581
    bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
582
    bool nostemexp = (mods & SearchDataClause::SDCM_NOSTEMMING) != 0;
582
583
    // No stem expansion if there are wildcards or if prevented by caller
584
    if (haswild || getStemLang().empty()) {
583
    if (haswild || getStemLang().empty()) {
585
    LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
584
    LOGDEB2(("expandTerm: found wildcards or stemlang empty: no exp\n"));
586
    nostemexp = true;
585
    nostemexp = true;
587
    }
586
    }
588
587
588
    // noexpansion can be modified further down by possible case/diac expansion
589
    bool noexpansion = nostemexp && !haswild;
589
    bool noexpansion = nostemexp && !haswild; 
590
590
591
#ifndef RCL_INDEX_STRIPCHARS
591
#ifndef RCL_INDEX_STRIPCHARS
592
    bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
592
    bool diac_sensitive = (mods & SearchDataClause::SDCM_DIACSENS) != 0;
593
    bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
593
    bool case_sensitive = (mods & SearchDataClause::SDCM_CASESENS) != 0;
594
594
...
...
635
    m_hldata.terms[term] = m_hldata.uterms.size() - 1;
635
    m_hldata.terms[term] = m_hldata.uterms.size() - 1;
636
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
636
    LOGDEB(("ExpandTerm: final: %s\n", stringsToString(oexp).c_str()));
637
    return true;
637
    return true;
638
    } 
638
    } 
639
639
640
    // Make objects before the goto jungle to avoid compiler complaints
640
    // The case/diac expansion db
641
    SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
641
    SynTermTransUnac unacfoldtrans(UNACOP_UNACFOLD);
642
    XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all", 
642
    XapComputableSynFamMember synac(db.m_ndb->xrdb, synFamDiCa, "all", 
643
                    &unacfoldtrans);
643
                    &unacfoldtrans);
644
    // This will hold the result of case and diacritics expansion as input
645
    // to stem expansion.
646
    vector<string> lexp;
647
    
648
    TermMatchResult res;
644
    TermMatchResult res;
645
649
    if (haswild) {
646
    if (haswild) {
650
  // Note that if there are wildcards, we do a direct from-index
647
#ifndef RCL_INDEX_STRIPCHARS
651
  // expansion, which means that we are casediac-sensitive. There
648
  if (!o_index_stripchars && (!diac_sensitive || !case_sensitive)) {
652
  // would be nothing to prevent us from expanding from the casediac
649
      // Perform case/diac expansion on the exp as appropriate and
653
  // synonyms first. To be done later
650
      // expand the result.
651
      vector<string> exp;
652
      if (diac_sensitive) {
653
      // Expand for diacritics and case, filtering for same diacritics
654
      SynTermTransUnac foldtrans(UNACOP_FOLD);
655
      synac.keyWildExpand(term, exp, &foldtrans);
656
      } else if (case_sensitive) {
657
      // Expand for diacritics and case, filtering for same case
658
      SynTermTransUnac unactrans(UNACOP_UNAC);
659
      synac.keyWildExpand(term, exp, &unactrans);
660
      } else {
661
      // Expand for diacritics and case, no filtering
662
      synac.keyWildExpand(term, exp);
663
      }
664
      // There are no wildcards in the result from above but
665
      // calling termMatch gets the result into the right form
666
      for (vector<string>::const_iterator it = exp.begin(); 
667
       it != exp.end(); it++) {
668
      db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res, 
669
               maxexpand, m_field);
670
      }
671
  }
672
#endif // RCL_INDEX_STRIPCHARS
673
674
  // Expand the original wildcard expression even if we did the
675
  // case/diac dance above.
654
    db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res, 
676
    db.termMatch(Rcl::Db::ET_WILD, getStemLang(), term, res, 
655
             maxexpand, m_field);
677
             maxexpand, m_field);
656
    goto termmatchtoresult;
678
    goto termmatchtoresult;
657
    }
679
    }
658
680
...
...
668
    if (o_index_stripchars) {
690
    if (o_index_stripchars) {
669
    // If the index is stripped, we can only come here if
691
    // If the index is stripped, we can only come here if
670
    // nostemexp is unset and we just need stem expansion.
692
    // nostemexp is unset and we just need stem expansion.
671
    db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res, 
693
    db.termMatch(Rcl::Db::ET_STEM, getStemLang(), term, res, 
672
             maxexpand, m_field);
694
             maxexpand, m_field);
673
  goto termmatchtoresult;
674
    } 
675
676
    // No stem expansion when diacritic or case sensitivity is set, it
677
    // makes no sense (it would mess with the diacritics anyway if
678
    // they are not in the stem part).  In these 3 cases, perform
679
    // appropriate expansion from the charstripping db, and do a bogus
680
    // wildcard expansion (there is no wild card) to generate the
681
    // result:
682
683
    if (diac_sensitive && case_sensitive) {
684
  // No expansion whatsoever. 
685
  lexp.push_back(term);
686
  goto exptotermatch;
687
    } else if (diac_sensitive) {
688
  // Expand for accents and case, filtering for same accents.
689
  SynTermTransUnac foldtrans(UNACOP_FOLD);
690
  synac.synExpand(term, lexp, &foldtrans);
691
  goto exptotermatch;
692
    } else if (case_sensitive) {
693
  // Expand for accents and case, filtering for same case
694
  SynTermTransUnac unactrans(UNACOP_UNAC);
695
  synac.synExpand(term, lexp, &unactrans);
696
  goto exptotermatch;
697
    } else {
695
    } else {
696
  vector<string> lexp;
697
  if (diac_sensitive && case_sensitive) {
698
      // No expansion whatsoever. 
699
      lexp.push_back(term);
700
  } else if (diac_sensitive) {
701
      // Expand for accents and case, filtering for same accents.
702
      SynTermTransUnac foldtrans(UNACOP_FOLD);
703
      synac.synExpand(term, lexp, &foldtrans);
704
  } else if (case_sensitive) {
705
      // Expand for accents and case, filtering for same case
706
      SynTermTransUnac unactrans(UNACOP_UNAC);
707
      synac.synExpand(term, lexp, &unactrans);
708
  } else {
698
    // We are neither accent- nor case- sensitive and may need stem
709
        // We are neither accent- nor case- sensitive and may need stem
699
    // expansion or not. Expand for accents and case
710
        // expansion or not. Expand for accents and case
700
    synac.synExpand(term, lexp);
711
        synac.synExpand(term, lexp);
712
  }
713
701
    if (nostemexp)
714
    if (!nostemexp) {
702
      goto exptotermatch;
703
    }
704
705
    // Need stem expansion. Lowercase the result of accent and case
715
      // Need stem expansion. Lowercase the result of accent and case
706
    // expansion for input to stemdb.
716
      // expansion for input to stemdb.
707
    for (unsigned int i = 0; i < lexp.size(); i++) {
717
      for (unsigned int i = 0; i < lexp.size(); i++) {
708
    string lower;
718
     string lower;
709
    unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
719
     unacmaybefold(lexp[i], lower, "UTF-8", UNACOP_FOLD);
710
    lexp[i] = lower;
720
     lexp[i] = lower;
711
    }
721
      }
712
    sort(lexp.begin(), lexp.end());
722
      sort(lexp.begin(), lexp.end());
713
    {
723
      lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
714
  vector<string>::iterator uit = unique(lexp.begin(), lexp.end());
715
  lexp.resize(uit - lexp.begin());
716
    StemDb sdb(db.m_ndb->xrdb);
724
        StemDb sdb(db.m_ndb->xrdb);
717
    vector<string> exp1;
725
        vector<string> exp1;
726
      for (vector<string>::const_iterator it = lexp.begin(); 
727
       it != lexp.end(); it++) {
728
      sdb.stemExpand(getStemLang(), *it, exp1);
729
      }
730
      LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
731
732
      // Expand the resulting list for case (all stemdb content
733
      // is lowercase)
734
      lexp.clear();
735
      for (vector<string>::const_iterator it = exp1.begin(); 
736
       it != exp1.end(); it++) {
737
      synac.synExpand(*it, lexp);
738
      }
739
      sort(lexp.begin(), lexp.end());
740
      lexp.erase(unique(lexp.begin(), lexp.end()), lexp.end());
741
  }
742
743
  // Bogus wildcard expand to generate the result (possibly add prefixes)
744
  LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
718
    for (vector<string>::const_iterator it = lexp.begin(); 
745
    for (vector<string>::const_iterator it = lexp.begin();
719
         it != lexp.end(); it++) {
746
         it != lexp.end(); it++) {
720
      sdb.stemExpand(getStemLang(), *it, exp1);
721
  }
722
  LOGDEB(("ExpTerm: stem exp-> %s\n", stringsToString(exp1).c_str()));
723
724
  // Expand the resulting list for case (all stemdb content
725
  // is lowercase)
726
  lexp.clear();
727
  for (vector<string>::const_iterator it = exp1.begin(); 
728
       it != exp1.end(); it++) {
729
      synac.synExpand(*it, lexp);
730
  }
731
  sort(lexp.begin(), lexp.end());
732
  uit = unique(lexp.begin(), lexp.end());
733
  lexp.resize(uit - lexp.begin());
734
    }
735
736
    // Bogus wildcard expand to generate the result (possibly add prefixes)
737
exptotermatch:
738
    LOGDEB(("ExpandTerm:TM: lexp: %s\n", stringsToString(lexp).c_str()));
739
    for (vector<string>::const_iterator it = lexp.begin();
740
   it != lexp.end(); it++) {
741
    db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
747
        db.termMatch(Rcl::Db::ET_WILD, getStemLang(), *it, res,
742
             maxexpand, m_field);
748
           maxexpand, m_field);
749
  }
743
    }
750
    }
744
#endif
751
#endif
745
752
746
    // Term match entries to vector of terms
753
    // Term match entries to vector of terms
747
termmatchtoresult:
754
termmatchtoresult: