|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
|
... |
|
... |
242 |
doccnt = 1;
|
242 |
doccnt = 1;
|
243 |
|
243 |
|
244 |
for (vector<string>::const_iterator qit = qterms.begin();
|
244 |
for (vector<string>::const_iterator qit = qterms.begin();
|
245 |
qit != qterms.end(); qit++) {
|
245 |
qit != qterms.end(); qit++) {
|
246 |
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
246 |
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
247 |
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
|
247 |
LOGABS(("set..QTermFreqs: [%s] db freq %.1e\n", qit->c_str(),
|
248 |
query->m_nq->termfreqs[*qit]));
|
248 |
query->m_nq->termfreqs[*qit]));
|
249 |
}
|
249 |
}
|
250 |
}
|
250 |
}
|
251 |
|
251 |
|
252 |
// Compute query terms quality coefficients for a matched document by
|
252 |
// Compute query terms quality coefficients for a matched document by
|
|
... |
|
... |
296 |
if (termQcoefs.find(*qit) != termQcoefs.end())
|
296 |
if (termQcoefs.find(*qit) != termQcoefs.end())
|
297 |
byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
|
297 |
byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
|
298 |
}
|
298 |
}
|
299 |
|
299 |
|
300 |
#ifdef DEBUGABSTRACT
|
300 |
#ifdef DEBUGABSTRACT
|
|
|
301 |
LOGDEB(("Db::qualityTerms:\n"));
|
301 |
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
302 |
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
302 |
qit != byQ.rend(); qit++) {
|
303 |
qit != byQ.rend(); qit++) {
|
303 |
LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
|
304 |
LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
|
304 |
}
|
305 |
}
|
305 |
#endif
|
306 |
#endif
|
|
... |
|
... |
413 |
// Build a document abstract by extracting text chunks around the query terms
|
414 |
// Build a document abstract by extracting text chunks around the query terms
|
414 |
// This uses the db termlists, not the original document.
|
415 |
// This uses the db termlists, not the original document.
|
415 |
//
|
416 |
//
|
416 |
// DatabaseModified and other general exceptions are caught and
|
417 |
// DatabaseModified and other general exceptions are caught and
|
417 |
// possibly retried by our caller
|
418 |
// possibly retried by our caller
|
418 |
bool Db::Native::makeAbstract(Xapian::docid docid, Query *query,
|
419 |
abstract_result Db::Native::makeAbstract(Xapian::docid docid, Query *query,
|
419 |
vector<pair<int, string> >& vabs)
|
420 |
vector<pair<int, string> >& vabs,
|
|
|
421 |
int imaxoccs, int ictxwords)
|
420 |
{
|
422 |
{
|
421 |
Chrono chron;
|
423 |
Chrono chron;
|
422 |
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
424 |
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d imaxoccs %d\n", chron.ms(),
|
423 |
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
|
425 |
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen, imaxoccs));
|
424 |
|
426 |
|
425 |
// The (unprefixed) terms matched by this document
|
427 |
// The (unprefixed) terms matched by this document
|
426 |
vector<string> matchedTerms;
|
428 |
vector<string> matchedTerms;
|
427 |
{
|
429 |
{
|
428 |
vector<string> iterms;
|
430 |
vector<string> iterms;
|
429 |
query->getMatchTerms(docid, iterms);
|
431 |
query->getMatchTerms(docid, iterms);
|
430 |
noPrefixList(iterms, matchedTerms);
|
432 |
noPrefixList(iterms, matchedTerms);
|
431 |
if (matchedTerms.empty()) {
|
433 |
if (matchedTerms.empty()) {
|
432 |
LOGDEB(("makeAbstract::Empty term list\n"));
|
434 |
LOGDEB(("makeAbstract::Empty term list\n"));
|
433 |
return false;
|
435 |
return ABSRES_ERROR;
|
434 |
}
|
436 |
}
|
435 |
}
|
437 |
}
|
436 |
listList("Match terms: ", matchedTerms);
|
438 |
listList("Match terms: ", matchedTerms);
|
437 |
|
439 |
|
438 |
// Retrieve the term frequencies for the query terms. This is
|
440 |
// Retrieve the term frequencies for the query terms. This is
|
|
... |
|
... |
451 |
double totalweight = qualityTerms(docid, query, matchedTerms, byQ);
|
453 |
double totalweight = qualityTerms(docid, query, matchedTerms, byQ);
|
452 |
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
|
454 |
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
|
453 |
// This can't happen, but would crash us
|
455 |
// This can't happen, but would crash us
|
454 |
if (totalweight == 0.0) {
|
456 |
if (totalweight == 0.0) {
|
455 |
LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
|
457 |
LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
|
456 |
return false;
|
458 |
return ABSRES_ERROR;
|
457 |
}
|
459 |
}
|
458 |
|
460 |
|
459 |
///////////////////
|
461 |
///////////////////
|
460 |
// For each of the query terms, ask xapian for its positions list
|
462 |
// For each of the query terms, ask xapian for its positions list
|
461 |
// in the document. For each position entry, remember it in
|
463 |
// in the document. For each position entry, remember it in
|
|
... |
|
... |
472 |
// Limit the total number of slots we populate. The 7 is taken as
|
474 |
// Limit the total number of slots we populate. The 7 is taken as
|
473 |
// average word size. It was a mistake to have the user max
|
475 |
// average word size. It was a mistake to have the user max
|
474 |
// abstract size parameter in characters, we basically only deal
|
476 |
// abstract size parameter in characters, we basically only deal
|
475 |
// with words. We used to limit the character size at the end, but
|
477 |
// with words. We used to limit the character size at the end, but
|
476 |
// this damaged our careful selection of terms
|
478 |
// this damaged our careful selection of terms
|
477 |
const unsigned int maxtotaloccs =
|
479 |
const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
|
478 |
m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
|
480 |
m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
|
479 |
LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
|
481 |
int ctxwords = ictxwords == -1 ? m_rcldb->m_synthAbsWordCtxLen : ictxwords;
|
|
|
482 |
LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n",
|
|
|
483 |
chron.ms(), maxtotaloccs, ctxwords));
|
480 |
|
484 |
|
481 |
// This is used to mark positions overlapped by a multi-word match term
|
485 |
// This is used to mark positions overlapped by a multi-word match term
|
482 |
const string occupiedmarker("?");
|
486 |
const string occupiedmarker("?");
|
|
|
487 |
|
|
|
488 |
abstract_result ret = ABSRES_OK;
|
483 |
|
489 |
|
484 |
// Let's go populate
|
490 |
// Let's go populate
|
485 |
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
491 |
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
486 |
qit != byQ.rend(); qit++) {
|
492 |
qit != byQ.rend(); qit++) {
|
487 |
string qterm = qit->second;
|
493 |
string qterm = qit->second;
|
|
... |
|
... |
520 |
|
526 |
|
521 |
// Add adjacent slots to the set to populate at next
|
527 |
// Add adjacent slots to the set to populate at next
|
522 |
// step by inserting empty strings. Special provisions
|
528 |
// step by inserting empty strings. Special provisions
|
523 |
// for adding ellipsis and for positions overlapped by
|
529 |
// for adding ellipsis and for positions overlapped by
|
524 |
// the match term.
|
530 |
// the match term.
|
525 |
unsigned int sta = MAX(0, ipos-m_rcldb->m_synthAbsWordCtxLen);
|
531 |
unsigned int sta = MAX(0, ipos - ctxwords);
|
526 |
unsigned int sto = ipos + qtrmwrdcnt-1 +
|
532 |
unsigned int sto = ipos + qtrmwrdcnt-1 +
|
527 |
m_rcldb->m_synthAbsWordCtxLen;
|
533 |
m_rcldb->m_synthAbsWordCtxLen;
|
528 |
for (unsigned int ii = sta; ii <= sto; ii++) {
|
534 |
for (unsigned int ii = sta; ii <= sto; ii++) {
|
529 |
if (ii == (unsigned int)ipos) {
|
535 |
if (ii == (unsigned int)ipos) {
|
530 |
sparseDoc[ii] = qterm;
|
536 |
sparseDoc[ii] = qterm;
|
|
... |
|
... |
546 |
sparseDoc[sto+1] = cstr_ellipsis;
|
552 |
sparseDoc[sto+1] = cstr_ellipsis;
|
547 |
}
|
553 |
}
|
548 |
|
554 |
|
549 |
// Limit to allocated occurrences and total size
|
555 |
// Limit to allocated occurrences and total size
|
550 |
if (++occurrences >= maxoccs ||
|
556 |
if (++occurrences >= maxoccs ||
|
551 |
totaloccs >= maxtotaloccs)
|
557 |
totaloccs >= maxtotaloccs) {
|
|
|
558 |
ret = ABSRES_TRUNC;
|
|
|
559 |
LOGDEB(("Db::makeAbstract: max occurrences cutoff\n"));
|
552 |
break;
|
560 |
break;
|
|
|
561 |
}
|
553 |
}
|
562 |
}
|
554 |
} catch (...) {
|
563 |
} catch (...) {
|
555 |
// Term does not occur. No problem.
|
564 |
// Term does not occur. No problem.
|
556 |
}
|
565 |
}
|
557 |
if (totaloccs >= maxtotaloccs)
|
566 |
if (totaloccs >= maxtotaloccs) {
|
|
|
567 |
ret = ABSRES_TRUNC;
|
|
|
568 |
LOGDEB(("Db::makeAbstract: max1 occurrences cutoff\n"));
|
558 |
break;
|
569 |
break;
|
|
|
570 |
}
|
559 |
}
|
571 |
}
|
560 |
LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
|
572 |
LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
|
561 |
chron.millis(), totaloccs));
|
573 |
chron.millis(), totaloccs));
|
562 |
|
574 |
|
563 |
// This can happen if there are term occurrences in the keywords
|
575 |
// This can happen if there are term occurrences in the keywords
|
564 |
// etc. but not elsewhere ?
|
576 |
// etc. but not elsewhere ?
|
565 |
if (totaloccs == 0) {
|
577 |
if (totaloccs == 0) {
|
566 |
LOGDEB1(("makeAbstract: no occurrences\n"));
|
578 |
LOGDEB1(("makeAbstract: no occurrences\n"));
|
567 |
return false;
|
579 |
return ABSRES_ERROR;
|
568 |
}
|
580 |
}
|
569 |
|
581 |
|
570 |
// Walk all document's terms position lists and populate slots
|
582 |
// Walk all document's terms position lists and populate slots
|
571 |
// around the query terms. We arbitrarily truncate the list to
|
583 |
// around the query terms. We arbitrarily truncate the list to
|
572 |
// avoid taking forever. If we do cutoff, the abstract may be
|
584 |
// avoid taking forever. If we do cutoff, the abstract may be
|
|
... |
|
... |
580 |
term != xrdb.termlist_end(docid); term++) {
|
592 |
term != xrdb.termlist_end(docid); term++) {
|
581 |
// Ignore prefixed terms
|
593 |
// Ignore prefixed terms
|
582 |
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
|
594 |
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
|
583 |
continue;
|
595 |
continue;
|
584 |
if (cutoff-- < 0) {
|
596 |
if (cutoff-- < 0) {
|
|
|
597 |
ret = ABSRES_TRUNC;
|
585 |
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
598 |
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
586 |
break;
|
599 |
break;
|
587 |
}
|
600 |
}
|
588 |
|
601 |
|
589 |
Xapian::PositionIterator pos;
|
602 |
Xapian::PositionIterator pos;
|
590 |
for (pos = xrdb.positionlist_begin(docid, *term);
|
603 |
for (pos = xrdb.positionlist_begin(docid, *term);
|
591 |
pos != xrdb.positionlist_end(docid, *term); pos++) {
|
604 |
pos != xrdb.positionlist_end(docid, *term); pos++) {
|
592 |
if (cutoff-- < 0) {
|
605 |
if (cutoff-- < 0) {
|
|
|
606 |
ret = ABSRES_TRUNC;
|
593 |
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
607 |
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
594 |
break;
|
608 |
break;
|
595 |
}
|
609 |
}
|
596 |
map<unsigned int, string>::iterator vit;
|
610 |
map<unsigned int, string>::iterator vit;
|
597 |
if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
|
611 |
if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
|
598 |
// Don't replace a term: the terms list is in
|
612 |
// Don't replace a term: the terms list is in
|
599 |
// alphabetic order, and we may have several terms
|
613 |
// alphabetic order, and we may have several terms
|
600 |
// at the same position, we want to keep only the
|
614 |
// at the same position, we want to keep only the
|
601 |
// first one (ie: dockes and dockes@wanadoo.fr)
|
615 |
// first one (ie: dockes and dockes@wanadoo.fr)
|
602 |
if (vit->second.empty()) {
|
616 |
if (vit->second.empty()) {
|
603 |
LOGABS(("makeAbstract: populating: [%s] at %d\n",
|
617 |
LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
|
604 |
(*term).c_str(), *pos));
|
618 |
(*term).c_str(), *pos));
|
605 |
sparseDoc[*pos] = *term;
|
619 |
sparseDoc[*pos] = *term;
|
606 |
}
|
620 |
}
|
607 |
}
|
621 |
}
|
608 |
}
|
622 |
}
|
609 |
}
|
623 |
}
|
|
... |
|
... |
663 |
}
|
677 |
}
|
664 |
if (!chunk.empty())
|
678 |
if (!chunk.empty())
|
665 |
vabs.push_back(pair<int, string>(page, chunk));
|
679 |
vabs.push_back(pair<int, string>(page, chunk));
|
666 |
|
680 |
|
667 |
LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
|
681 |
LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
|
668 |
return true;
|
682 |
return ret;
|
669 |
}
|
683 |
}
|
670 |
|
684 |
|
671 |
/* Rcl::Db methods ///////////////////////////////// */
|
685 |
/* Rcl::Db methods ///////////////////////////////// */
|
672 |
|
686 |
|
673 |
bool Db::o_inPlaceReset;
|
687 |
bool Db::o_inPlaceReset;
|
|
... |
|
... |
2117 |
return false;
|
2131 |
return false;
|
2118 |
}
|
2132 |
}
|
2119 |
return true;
|
2133 |
return true;
|
2120 |
}
|
2134 |
}
|
2121 |
|
2135 |
|
2122 |
bool Db::makeDocAbstract(Doc &doc, Query *query,
|
2136 |
abstract_result Db::makeDocAbstract(Doc &doc, Query *query,
|
2123 |
vector<pair<int, string> >& abstract)
|
2137 |
vector<pair<int, string> >& abstract,
|
|
|
2138 |
int maxoccs, int ctxwords)
|
2124 |
{
|
2139 |
{
|
|
|
2140 |
LOGDEB(("makeDocAbstract: maxoccs %d ctxwords %d\n", maxoccs, ctxwords));
|
2125 |
if (!m_ndb || !m_ndb->m_isopen) {
|
2141 |
if (!m_ndb || !m_ndb->m_isopen) {
|
2126 |
LOGERR(("Db::makeDocAbstract: no db\n"));
|
2142 |
LOGERR(("Db::makeDocAbstract: no db\n"));
|
2127 |
return false;
|
2143 |
return ABSRES_ERROR;
|
2128 |
}
|
2144 |
}
|
2129 |
bool ret = false;
|
2145 |
abstract_result ret = ABSRES_ERROR;
|
2130 |
XAPTRY(ret = m_ndb->makeAbstract(doc.xdocid, query, abstract),
|
2146 |
XAPTRY(ret = m_ndb->makeAbstract(doc.xdocid, query, abstract,
|
|
|
2147 |
maxoccs, ctxwords),
|
2131 |
m_ndb->xrdb, m_reason);
|
2148 |
m_ndb->xrdb, m_reason);
|
2132 |
return (ret && m_reason.empty()) ? true : false;
|
2149 |
if (!m_reason.empty())
|
|
|
2150 |
return ABSRES_ERROR;
|
|
|
2151 |
return ret;
|
2133 |
}
|
2152 |
}
|
2134 |
|
2153 |
|
2135 |
bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract)
|
2154 |
bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract)
|
2136 |
{
|
2155 |
{
|
2137 |
if (!m_ndb || !m_ndb->m_isopen) {
|
2156 |
if (!m_ndb || !m_ndb->m_isopen) {
|