|
a/src/rcldb/searchdatatox.cpp |
|
b/src/rcldb/searchdatatox.cpp |
|
... |
|
... |
286 |
|
286 |
|
287 |
*((Xapian::Query *)d) = xq;
|
287 |
*((Xapian::Query *)d) = xq;
|
288 |
return true;
|
288 |
return true;
|
289 |
}
|
289 |
}
|
290 |
|
290 |
|
291 |
// Splitter callback for breaking a user string into simple terms and
|
291 |
// Splitter for breaking a user string into simple terms and
|
292 |
// phrases. This is for parts of the user entry which would appear as
|
292 |
// phrases. This is for parts of the user entry which would appear as
|
293 |
// a single word because there is no white space inside, but are
|
293 |
// a single word because there is no white space inside, but are
|
294 |
// actually multiple terms to rcldb (i.e. term1,term2)
|
294 |
// actually multiple terms to rcldb (i.e. term1,term2). Still, most of
|
|
|
295 |
// the time, the result of our splitting will be a single term.
|
295 |
class TextSplitQ : public TextSplitP {
|
296 |
class TextSplitQ : public TextSplitP {
|
296 |
public:
|
297 |
public:
|
297 |
TextSplitQ(Flags flags, const StopList &_stops, TermProc *prc)
|
298 |
TextSplitQ(Flags flags, TermProc *prc)
|
298 |
: TextSplitP(prc, flags),
|
299 |
: TextSplitP(prc, flags), m_nostemexp(false) {
|
299 |
curnostemexp(false), stops(_stops), alltermcount(0), lastpos(0)
|
|
|
300 |
{}
|
300 |
}
|
301 |
|
301 |
|
302 |
bool takeword(const std::string &term, int pos, int bs, int be)
|
302 |
bool takeword(const std::string &term, int pos, int bs, int be) {
|
303 |
{
|
|
|
304 |
// Check if the first letter is a majuscule in which
|
303 |
// Check if the first letter is a majuscule in which
|
305 |
// case we do not want to do stem expansion. Need to do this
|
304 |
// case we do not want to do stem expansion. Need to do this
|
306 |
// before unac of course...
|
305 |
// before unac of course...
|
307 |
curnostemexp = unaciscapital(term);
|
306 |
m_nostemexp = unaciscapital(term);
|
308 |
|
307 |
|
309 |
return TextSplitP::takeword(term, pos, bs, be);
|
308 |
return TextSplitP::takeword(term, pos, bs, be);
|
310 |
}
|
309 |
}
|
311 |
|
310 |
|
312 |
bool curnostemexp;
|
311 |
bool nostemexp() const {
|
313 |
vector<string> terms;
|
312 |
return m_nostemexp;
|
314 |
vector<bool> nostemexps;
|
313 |
}
|
315 |
const StopList &stops;
|
314 |
private:
|
316 |
// Count of terms including stopwords: this is for adjusting
|
315 |
bool m_nostemexp;
|
317 |
// phrase/near slack
|
|
|
318 |
int alltermcount;
|
|
|
319 |
int lastpos;
|
|
|
320 |
};
|
316 |
};
|
321 |
|
317 |
|
322 |
class TermProcQ : public TermProc {
|
318 |
class TermProcQ : public TermProc {
|
323 |
public:
|
319 |
public:
|
324 |
TermProcQ() : TermProc(0), m_ts(0) {}
|
320 |
TermProcQ() : TermProc(0), m_alltermcount(0), m_lastpos(0), m_ts(0) {}
|
|
|
321 |
|
|
|
322 |
// We need a ref to the splitter (only it knows about orig term
|
|
|
323 |
// capitalization for controlling stemming). The ref can't be set
|
|
|
324 |
// in the constructor because the splitter is not built yet when
|
|
|
325 |
// we are born (chicken and egg).
|
325 |
void setTSQ(TextSplitQ *ts) {m_ts = ts;}
|
326 |
void setTSQ(const TextSplitQ *ts) {
|
|
|
327 |
m_ts = ts;
|
|
|
328 |
}
|
326 |
|
329 |
|
327 |
bool takeword(const std::string &term, int pos, int bs, int be)
|
330 |
bool takeword(const std::string &term, int pos, int bs, int be) {
|
328 |
{
|
|
|
329 |
m_ts->alltermcount++;
|
331 |
m_alltermcount++;
|
330 |
if (m_ts->lastpos < pos)
|
332 |
if (m_lastpos < pos)
|
331 |
m_ts->lastpos = pos;
|
333 |
m_lastpos = pos;
|
332 |
bool noexpand = be ? m_ts->curnostemexp : true;
|
334 |
bool noexpand = be ? m_ts->nostemexp() : true;
|
333 |
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
|
335 |
LOGDEB1(("TermProcQ::takeword: pushing [%s] pos %d noexp %d\n",
|
334 |
term.c_str(), pos, noexpand));
|
336 |
term.c_str(), pos, noexpand));
|
335 |
if (m_terms[pos].size() < term.size()) {
|
337 |
if (m_terms[pos].size() < term.size()) {
|
336 |
m_terms[pos] = term;
|
338 |
m_terms[pos] = term;
|
337 |
m_nste[pos] = noexpand;
|
339 |
m_nste[pos] = noexpand;
|
338 |
}
|
340 |
}
|
339 |
return true;
|
341 |
return true;
|
340 |
}
|
342 |
}
|
|
|
343 |
|
341 |
bool flush()
|
344 |
bool flush() {
|
342 |
{
|
|
|
343 |
for (map<int, string>::const_iterator it = m_terms.begin();
|
345 |
for (map<int, string>::const_iterator it = m_terms.begin();
|
344 |
it != m_terms.end(); it++) {
|
346 |
it != m_terms.end(); it++) {
|
345 |
m_ts->terms.push_back(it->second);
|
347 |
m_vterms.push_back(it->second);
|
346 |
m_ts->nostemexps.push_back(m_nste[it->first]);
|
348 |
m_vnostemexps.push_back(m_nste[it->first]);
|
347 |
}
|
349 |
}
|
348 |
return true;
|
350 |
return true;
|
349 |
}
|
351 |
}
|
|
|
352 |
|
|
|
353 |
int alltermcount() const {
|
|
|
354 |
return m_alltermcount;
|
|
|
355 |
}
|
|
|
356 |
int lastpos() const {
|
|
|
357 |
return m_lastpos;
|
|
|
358 |
}
|
|
|
359 |
const vector<string>& terms() {
|
|
|
360 |
return m_vterms;
|
|
|
361 |
}
|
|
|
362 |
const vector<bool>& nostemexps() {
|
|
|
363 |
return m_vnostemexps;
|
|
|
364 |
}
|
350 |
private:
|
365 |
private:
|
|
|
366 |
// Count of terms including stopwords: this is for adjusting
|
|
|
367 |
// phrase/near slack
|
|
|
368 |
int m_alltermcount;
|
|
|
369 |
int m_lastpos;
|
351 |
TextSplitQ *m_ts;
|
370 |
const TextSplitQ *m_ts;
|
|
|
371 |
vector<string> m_vterms;
|
|
|
372 |
vector<bool> m_vnostemexps;
|
352 |
map<int, string> m_terms;
|
373 |
map<int, string> m_terms;
|
353 |
map<int, bool> m_nste;
|
374 |
map<int, bool> m_nste;
|
354 |
};
|
375 |
};
|
355 |
|
376 |
|
356 |
|
377 |
|
|
... |
|
... |
586 |
// User entry element had several terms: transform into a PHRASE or
|
607 |
// User entry element had several terms: transform into a PHRASE or
|
587 |
// NEAR xapian query, the elements of which can themselves be OR
|
608 |
// NEAR xapian query, the elements of which can themselves be OR
|
588 |
// queries if the terms get expanded by stemming or wildcards (we
|
609 |
// queries if the terms get expanded by stemming or wildcards (we
|
589 |
// don't do stemming for PHRASE though)
|
610 |
// don't do stemming for PHRASE though)
|
590 |
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
611 |
void SearchDataClauseSimple::processPhraseOrNear(Rcl::Db &db, string& ermsg,
|
591 |
TextSplitQ *splitData,
|
612 |
TermProcQ *splitData,
|
592 |
int mods, void *pq,
|
613 |
int mods, void *pq,
|
593 |
bool useNear, int slack)
|
614 |
bool useNear, int slack)
|
594 |
{
|
615 |
{
|
595 |
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
|
616 |
vector<Xapian::Query> &pqueries(*(vector<Xapian::Query>*)pq);
|
596 |
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
617 |
Xapian::Query::op op = useNear ? Xapian::Query::OP_NEAR :
|
|
... |
|
... |
611 |
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
|
632 |
orqueries.push_back(Xapian::Query(prefix + start_of_field_term));
|
612 |
slack++;
|
633 |
slack++;
|
613 |
}
|
634 |
}
|
614 |
|
635 |
|
615 |
// Go through the list and perform stem/wildcard expansion for each element
|
636 |
// Go through the list and perform stem/wildcard expansion for each element
|
616 |
vector<bool>::iterator nxit = splitData->nostemexps.begin();
|
637 |
vector<bool>::const_iterator nxit = splitData->nostemexps().begin();
|
617 |
for (vector<string>::iterator it = splitData->terms.begin();
|
638 |
for (vector<string>::const_iterator it = splitData->terms().begin();
|
618 |
it != splitData->terms.end(); it++, nxit++) {
|
639 |
it != splitData->terms().end(); it++, nxit++) {
|
619 |
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
|
640 |
LOGDEB0(("ProcessPhrase: processing [%s]\n", it->c_str()));
|
620 |
// Adjust when we do stem expansion. Not if disabled by
|
641 |
// Adjust when we do stem expansion. Not if disabled by
|
621 |
// caller, not inside phrases, and some versions of xapian
|
642 |
// caller, not inside phrases, and some versions of xapian
|
622 |
// will accept only one OR clause inside NEAR.
|
643 |
// will accept only one OR clause inside NEAR.
|
623 |
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
|
644 |
bool nostemexp = *nxit || (op == Xapian::Query::OP_PHRASE)
|
|
... |
|
... |
658 |
}
|
679 |
}
|
659 |
|
680 |
|
660 |
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
681 |
// Generate an appropriate PHRASE/NEAR query with adjusted slack
|
661 |
// For phrases, give a relevance boost like we do for original terms
|
682 |
// For phrases, give a relevance boost like we do for original terms
|
662 |
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
683 |
LOGDEB2(("PHRASE/NEAR: alltermcount %d lastpos %d\n",
|
663 |
splitData->alltermcount, splitData->lastpos));
|
684 |
splitData->alltermcount(), splitData->lastpos()));
|
664 |
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
685 |
Xapian::Query xq(op, orqueries.begin(), orqueries.end(),
|
665 |
splitData->lastpos + 1 + slack);
|
686 |
splitData->lastpos() + 1 + slack);
|
666 |
if (op == Xapian::Query::OP_PHRASE)
|
687 |
if (op == Xapian::Query::OP_PHRASE)
|
667 |
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
|
688 |
xq = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, xq,
|
668 |
original_term_wqf_booster);
|
689 |
original_term_wqf_booster);
|
669 |
pqueries.push_back(xq);
|
690 |
pqueries.push_back(xq);
|
670 |
|
691 |
|
|
... |
|
... |
770 |
// performance, but will succeed.
|
791 |
// performance, but will succeed.
|
771 |
// We now adjust the phrase/near slack by comparing the term count
|
792 |
// We now adjust the phrase/near slack by comparing the term count
|
772 |
// and the last position
|
793 |
// and the last position
|
773 |
|
794 |
|
774 |
// The term processing pipeline:
|
795 |
// The term processing pipeline:
|
|
|
796 |
// split -> [unac/case ->] stops -> store terms
|
775 |
TermProcQ tpq;
|
797 |
TermProcQ tpq;
|
776 |
TermProc *nxt = &tpq;
|
798 |
TermProc *nxt = &tpq;
|
777 |
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
799 |
TermProcStop tpstop(nxt, stops); nxt = &tpstop;
|
778 |
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
800 |
//TermProcCommongrams tpcommon(nxt, stops); nxt = &tpcommon;
|
779 |
//tpcommon.onlygrams(true);
|
801 |
//tpcommon.onlygrams(true);
|
|
... |
|
... |
781 |
if (o_index_stripchars)
|
803 |
if (o_index_stripchars)
|
782 |
nxt = &tpprep;
|
804 |
nxt = &tpprep;
|
783 |
|
805 |
|
784 |
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
806 |
TextSplitQ splitter(TextSplit::Flags(TextSplit::TXTS_ONLYSPANS |
|
785 |
TextSplit::TXTS_KEEPWILD),
|
807 |
TextSplit::TXTS_KEEPWILD),
|
786 |
stops, nxt);
|
808 |
nxt);
|
787 |
tpq.setTSQ(&splitter);
|
809 |
tpq.setTSQ(&splitter);
|
788 |
splitter.text_to_words(*it);
|
810 |
splitter.text_to_words(*it);
|
789 |
|
811 |
|
790 |
slack += splitter.lastpos - splitter.terms.size() + 1;
|
812 |
slack += tpq.lastpos() - tpq.terms().size() + 1;
|
791 |
|
813 |
|
792 |
LOGDEB0(("strToXapianQ: termcount: %d\n", splitter.terms.size()));
|
814 |
LOGDEB0(("strToXapianQ: termcount: %d\n", tpq.terms().size()));
|
793 |
switch (splitter.terms.size() + terminc) {
|
815 |
switch (tpq.terms().size() + terminc) {
|
794 |
case 0:
|
816 |
case 0:
|
795 |
continue;// ??
|
817 |
continue;// ??
|
796 |
case 1: {
|
818 |
case 1: {
|
797 |
int lmods = mods;
|
819 |
int lmods = mods;
|
798 |
if (splitter.nostemexps.front())
|
820 |
if (tpq.nostemexps().front())
|
799 |
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
821 |
lmods |= SearchDataClause::SDCM_NOSTEMMING;
|
800 |
m_hldata.ugroups.push_back(splitter.terms);
|
822 |
m_hldata.ugroups.push_back(tpq.terms());
|
801 |
processSimpleSpan(db, ermsg, splitter.terms.front(),
|
823 |
processSimpleSpan(db, ermsg, tpq.terms().front(),
|
802 |
lmods, &pqueries);
|
824 |
lmods, &pqueries);
|
803 |
}
|
825 |
}
|
804 |
break;
|
826 |
break;
|
805 |
default:
|
827 |
default:
|
806 |
m_hldata.ugroups.push_back(splitter.terms);
|
828 |
m_hldata.ugroups.push_back(tpq.terms());
|
807 |
processPhraseOrNear(db, ermsg, &splitter, mods, &pqueries,
|
829 |
processPhraseOrNear(db, ermsg, &tpq, mods, &pqueries,
|
808 |
useNear, slack);
|
830 |
useNear, slack);
|
809 |
}
|
831 |
}
|
810 |
if (m_curcl >= getMaxCl()) {
|
832 |
if (m_curcl >= getMaxCl()) {
|
811 |
ermsg = maxXapClauseMsg;
|
833 |
ermsg = maxXapClauseMsg;
|
812 |
if (!o_index_stripchars)
|
834 |
if (!o_index_stripchars)
|
|
... |
|
... |
844 |
switch (m_tp) {
|
866 |
switch (m_tp) {
|
845 |
case SCLT_AND: op = Xapian::Query::OP_AND; break;
|
867 |
case SCLT_AND: op = Xapian::Query::OP_AND; break;
|
846 |
case SCLT_OR: op = Xapian::Query::OP_OR; break;
|
868 |
case SCLT_OR: op = Xapian::Query::OP_OR; break;
|
847 |
default:
|
869 |
default:
|
848 |
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
870 |
LOGERR(("SearchDataClauseSimple: bad m_tp %d\n", m_tp));
|
|
|
871 |
m_reason = "Internal error";
|
849 |
return false;
|
872 |
return false;
|
850 |
}
|
873 |
}
|
851 |
|
874 |
|
852 |
vector<Xapian::Query> pqueries;
|
875 |
vector<Xapian::Query> pqueries;
|
853 |
if (!processUserString(db, m_text, m_reason, &pqueries))
|
876 |
if (!processUserString(db, m_text, m_reason, &pqueries))
|
854 |
return false;
|
877 |
return false;
|
855 |
if (pqueries.empty()) {
|
878 |
if (pqueries.empty()) {
|
856 |
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
879 |
LOGERR(("SearchDataClauseSimple: resolved to null query\n"));
|
|
|
880 |
m_reason = string("Resolved to null query. Term too long ? : [" +
|
|
|
881 |
m_text + string("]"));
|
857 |
return true;
|
882 |
return false;
|
858 |
}
|
883 |
}
|
859 |
|
884 |
|
860 |
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
|
885 |
*qp = Xapian::Query(op, pqueries.begin(), pqueries.end());
|
861 |
if (m_weight != 1.0) {
|
886 |
if (m_weight != 1.0) {
|
862 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
887 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
|
... |
|
... |
968 |
bool useNear = (m_tp == SCLT_NEAR);
|
993 |
bool useNear = (m_tp == SCLT_NEAR);
|
969 |
if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
|
994 |
if (!processUserString(db, s, m_reason, &pqueries, m_slack, useNear))
|
970 |
return false;
|
995 |
return false;
|
971 |
if (pqueries.empty()) {
|
996 |
if (pqueries.empty()) {
|
972 |
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
997 |
LOGERR(("SearchDataClauseDist: resolved to null query\n"));
|
|
|
998 |
m_reason = string("Resolved to null query. Term too long ? : [" +
|
|
|
999 |
m_text + string("]"));
|
973 |
return true;
|
1000 |
return false;
|
974 |
}
|
1001 |
}
|
975 |
|
1002 |
|
976 |
*qp = *pqueries.begin();
|
1003 |
*qp = *pqueries.begin();
|
977 |
if (m_weight != 1.0) {
|
1004 |
if (m_weight != 1.0) {
|
978 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|
1005 |
*qp = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, *qp, m_weight);
|