Switch to unified view

a/src/rcldb/searchdatatox.cpp b/src/rcldb/searchdatatox.cpp
...
...
286
286
287
    *((Xapian::Query *)d) = xq;
287
    *((Xapian::Query *)d) = xq;
288
    return true;
288
    return true;
289
}
289
}
290
290
291
// Splitter callback for breaking a user string into simple terms and
291
// Splitter for breaking a user string into simple terms and
292
// phrases. This is for parts of the user entry which would appear as
292
// phrases. This is for parts of the user entry which would appear as
293
// a single word because there is no white space inside, but are
293
// a single word because there is no white space inside, but are
294
// actually multiple terms to rcldb (ie term1,term2)
294
// actually multiple terms to rcldb (ie term1,term2). Still, most of
295
// the time, the result of our splitting will be a single term.
295
class TextSplitQ : public TextSplitP {
296
class TextSplitQ : public TextSplitP {
296
 public:
297
 public:
297
    TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
298
    TextSplitQ(Flags flags, TermProc *prc)
298
  : TextSplitP(prc, flags), 
299
  : TextSplitP(prc, flags), m_nostemexp(false) {
299
    curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
300
    {}
300
    }
301
301
302
    bool takeword(const std::string &term, int pos, int bs, int be) 
302
    bool takeword(const std::string &term, int pos, int bs, int be) {
303
    {
304
    // Check if the first letter is a majuscule in which
303
    // Check if the first letter is a majuscule in which
305
    // case we do not want to do stem expansion. Need to do this
304
    // case we do not want to do stem expansion. Need to do this
306
    // before unac of course...
305
    // before unac of course...
307
    curnostemexp = unaciscapital(term);
306
    m_nostemexp = unaciscapital(term);
308
307
309
    return TextSplitP::takeword(term, pos, bs, be);
308
    return TextSplitP::takeword(term, pos, bs, be);
310
    }
309
    }
311
310
312
    bool           curnostemexp;
311
    bool nostemexp() const {
313
    vector<string> terms;
312
        return m_nostemexp;
314
    vector<bool>   nostemexps;
313
    }
315
    const StopList &stops;
314
private:
316
    // Count of terms including stopwords: this is for adjusting
315
    bool m_nostemexp;
317
    // phrase/near slack
318
    int alltermcount; 
319
    int lastpos;
320
};
316
};
321
317
322
class TermProcQ : public TermProc {
318
class TermProcQ : public TermProc {
323
public:
319
public:
324
    TermProcQ() : TermProc(0), m_ts(0) {}
320
    TermProcQ() : TermProc(0), m_alltermcount(0), m_lastpos(0), m_ts(0) {}
321
322
    // We need a ref to the splitter (only it knows about orig term
323
    // capitalization for controlling stemming. The ref can't be set
324
    // in the constructor because the splitter is not built yet when
325
    // we are born (chicken and egg).
325
    void setTSQ(TextSplitQ *ts) {m_ts = ts;}
326
    void setTSQ(const TextSplitQ *ts) {
327
        m_ts = ts;
328
    }
326
    
329
    
327
    bool takeword(const std::string &term, int pos, int bs, int be) 
330
    bool takeword(const std::string &term, int pos, int bs, int be) {
328
    {
329
    m_ts->alltermcount++;
331
    m_alltermcount++;
330
    if (m_ts->lastpos < pos)
332
    if (m_lastpos < pos)
331
        m_ts->lastpos = pos;
333
        m_lastpos = pos;
332
    bool noexpand = be ? m_ts->curnostemexp : true;
334
    bool noexpand = be ? m_ts->nostemexp() : true;
333
    LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", 
335
    LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n", 
334
         term.c_str(), pos, noexpand));
336
         term.c_str(), pos, noexpand));
335
    if (m_terms[pos].size() < term.size()) {
337
    if (m_terms[pos].size() < term.size()) {
336
        m_terms[pos] = term;
338
        m_terms[pos] = term;
337
        m_nste[pos] = noexpand;
339
        m_nste[pos] = noexpand;
338
    }
340
    }
339
    return true;
341
    return true;
340
    }
342
    }
343
341
    bool flush()
344
    bool flush() {
342
    {
343
    for (map<int, string>::const_iterator it = m_terms.begin();
345
    for (map<int, string>::const_iterator it = m_terms.begin();
344
         it != m_terms.end(); it++) {
346
         it != m_terms.end(); it++) {
345
        m_ts->terms.push_back(it->second);
347
        m_vterms.push_back(it->second);
346
        m_ts->nostemexps.push_back(m_nste[it->first]);
348
        m_vnostemexps.push_back(m_nste[it->first]);
347
    }
349
    }
348
    return true;
350
    return true;
349
    }
351
    }
352
353
    int alltermcount() const {
354
        return m_alltermcount;
355
    }
356
    int lastpos() const {
357
        return m_lastpos;
358
    }
359
    const vector<string>& terms() {
360
        return m_vterms;
361
    }
362
    const vector<bool>& nostemexps() {
363
        return m_vnostemexps;
364
    }
350
private:
365
private:
366
    // Count of terms including stopwords: this is for adjusting
367
    // phrase/near slack
368
    int m_alltermcount; 
369
    int m_lastpos;
351
    TextSplitQ *m_ts;
370
    const TextSplitQ *m_ts;
371
    vector<string> m_vterms;
372
    vector<bool>   m_vnostemexps;
352
    map<int, string> m_terms;
373
    map<int, string> m_terms;
353
    map<int, bool> m_nste;
374
    map<int, bool> m_nste;
354
};
375
};
355
376
356
377
...
...
586
// User entry element had several terms: transform into a PHRASE or
607
// User entry element had several terms: transform into a PHRASE or
587
// NEAR xapian query, the elements of which can themselves be OR
608
// NEAR xapian query, the elements of which can themselves be OR
588
// queries if the terms get expanded by stemming or wildcards (we
609
// queries if the terms get expanded by stemming or wildcards (we
589
// don't do stemming for PHRASE though)
610
// don't do stemming for PHRASE though)
590
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
611
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg, 
591
                         TextSplitQ *splitData, 
612
                         TermProcQ *splitData, 
592
                         int mods, void *pq,
613
                         int mods, void *pq,
593
                         bool useNear, int slack)
614
                         bool useNear, int slack)
594
{
615
{
595
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
616
    vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
596
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
617
    Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR : 
...
...
611
    orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
632
    orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
612
    slack++;
633
    slack++;
613
    }
634
    }
614
635
615
    // Go through the list and perform stem/wildcard expansion for each element
636
    // Go through the list and perform stem/wildcard expansion for each element
616
    vector<bool>::iterator nxit = splitData->nostemexps.begin();
637
    vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
617
    for (vector<string>::iterator it = splitData->terms.begin();
638
    for (vector<string>::const_iterator it = splitData->terms().begin();
618
     it != splitData->terms.end(); it++, nxit++) {
639
     it != splitData->terms().end(); it++, nxit++) {
619
    LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
640
    LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
620
    // Adjust when we do stem expansion. Not if disabled by
641
    // Adjust when we do stem expansion. Not if disabled by
621
    // caller, not inside phrases, and some versions of xapian
642
    // caller, not inside phrases, and some versions of xapian
622
    // will accept only one OR clause inside NEAR.
643
    // will accept only one OR clause inside NEAR.
623
    bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
644
    bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE) 
...
...
658
    }
679
    }
659
680
660
    // Generate an appropriate PHRASE/NEAR query with adjusted slack
681
    // Generate an appropriate PHRASE/NEAR query with adjusted slack
661
    // For phrases, give a relevance boost like we do for original terms
682
    // For phrases, give a relevance boost like we do for original terms
662
    LOGDEB2(("PHRASE/NEAR:  alltermcount %d lastpos %d\n", 
683
    LOGDEB2(("PHRASE/NEAR:  alltermcount %d lastpos %d\n", 
663
             splitData->alltermcount, splitData->lastpos));
684
             splitData->alltermcount(), splitData->lastpos()));
664
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
685
    Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
665
             splitData->lastpos + 1 + slack);
686
             splitData->lastpos() + 1 + slack);
666
    if (op == Xapian::Query::OP_PHRASE)
687
    if (op == Xapian::Query::OP_PHRASE)
667
    xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, 
688
    xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq, 
668
               original_term_wqf_booster);
689
               original_term_wqf_booster);
669
    pqueries.push_back(xq);
690
    pqueries.push_back(xq);
670
691
...
...
770
        // performance, but will succeed.
791
        // performance, but will succeed.
771
        // We now adjust the phrase/near slack by comparing the term count
792
        // We now adjust the phrase/near slack by comparing the term count
772
        // and the last position
793
        // and the last position
773
794
774
        // The term processing pipeline:
795
        // The term processing pipeline:
796
            // split -> [unac/case ->] stops -> store terms
775
        TermProcQ tpq;
797
        TermProcQ tpq;
776
        TermProc *nxt = &tpq;
798
        TermProc *nxt = &tpq;
777
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
799
            TermProcStop tpstop(nxt, stops); nxt = &tpstop;
778
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
800
            //TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
779
            //tpcommon.onlygrams(true);
801
            //tpcommon.onlygrams(true);
...
...
781
        if (o_index_stripchars)
803
        if (o_index_stripchars)
782
        nxt = &tpprep;
804
        nxt = &tpprep;
783
805
784
        TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
806
        TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS | 
785
                         TextSplit::TXTS_KEEPWILD), 
807
                         TextSplit::TXTS_KEEPWILD), 
786
              stops, nxt);
808
              nxt);
787
        tpq.setTSQ(&splitter);
809
        tpq.setTSQ(&splitter);
788
        splitter.text_to_words(*it);
810
        splitter.text_to_words(*it);
789
811
790
        slack += splitter.lastpos - splitter.terms.size() + 1;
812
        slack += tpq.lastpos() - tpq.terms().size() + 1;
791
813
792
        LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
814
        LOGDEB0(("strToXapianQ: termcount: %d\n", tpq.terms().size()));
793
        switch (splitter.terms.size() + terminc) {
815
        switch (tpq.terms().size() + terminc) {
794
        case 0: 
816
        case 0: 
795
        continue;// ??
817
        continue;// ??
796
        case 1: {
818
        case 1: {
797
        int lmods = mods;
819
        int lmods = mods;
798
        if (splitter.nostemexps.front())
820
        if (tpq.nostemexps().front())
799
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
821
            lmods |= SearchDataClause::SDCM_NOSTEMMING;
800
        m_hldata.ugroups.push_back(splitter.terms);
822
        m_hldata.ugroups.push_back(tpq.terms());
801
        processSimpleSpan(db, ermsg, splitter.terms.front(),
823
        processSimpleSpan(db, ermsg, tpq.terms().front(),
802
                  lmods, &pqueries);
824
                  lmods, &pqueries);
803
        }
825
        }
804
        break;
826
        break;
805
        default:
827
        default:
806
        m_hldata.ugroups.push_back(splitter.terms);
828
        m_hldata.ugroups.push_back(tpq.terms());
807
        processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
829
        processPhraseOrNear(db, ermsg, &tpq, mods, &pqueries,
808
                    useNear, slack);
830
                    useNear, slack);
809
        }
831
        }
810
        if (m_curcl >= getMaxCl()) {
832
        if (m_curcl >= getMaxCl()) {
811
        ermsg = maxXapClauseMsg;
833
        ermsg = maxXapClauseMsg;
812
        if (!o_index_stripchars)
834
        if (!o_index_stripchars)
...
...
844
    switch (m_tp) {
866
    switch (m_tp) {
845
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
867
    case SCLT_AND: op = Xapian::Query::OP_AND; break;
846
    case SCLT_OR: op = Xapian::Query::OP_OR; break;
868
    case SCLT_OR: op = Xapian::Query::OP_OR; break;
847
    default:
869
    default:
848
    LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
870
    LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
871
        m_reason = "Internal error";
849
    return false;
872
    return false;
850
    }
873
    }
851
874
852
    vector<Xapian::Query> pqueries;
875
    vector<Xapian::Query> pqueries;
853
    if (!processUserString(db, m_text, m_reason, &pqueries))
876
    if (!processUserString(db, m_text, m_reason, &pqueries))
854
    return false;
877
    return false;
855
    if (pqueries.empty()) {
878
    if (pqueries.empty()) {
856
    LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
879
    LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
880
        m_reason = string("Resolved to null query. Term too long ? : [" + 
881
                          m_text + string("]"));
857
    return true;
882
    return false;
858
    }
883
    }
859
884
860
    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
885
    *qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
861
    if  (m_weight != 1.0) {
886
    if  (m_weight != 1.0) {
862
    *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
887
    *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
...
...
968
    bool useNear = (m_tp == SCLT_NEAR);
993
    bool useNear = (m_tp == SCLT_NEAR);
969
    if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
994
    if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
970
    return false;
995
    return false;
971
    if (pqueries.empty()) {
996
    if (pqueries.empty()) {
972
    LOGERR(("SearchDataClauseDist: resolved to null query\n"));
997
    LOGERR(("SearchDataClauseDist: resolved to null query\n"));
998
        m_reason = string("Resolved to null query. Term too long ? : [" + 
999
                          m_text + string("]"));
973
    return true;
1000
    return false;
974
    }
1001
    }
975
1002
976
    *qp = *pqueries.begin();
1003
    *qp = *pqueries.begin();
977
    if (m_weight != 1.0) {
1004
    if (m_weight != 1.0) {
978
    *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
1005
    *qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);