|
a/src/rcldb/rclabstract.cpp |
|
b/src/rcldb/rclabstract.cpp |
|
... |
|
... |
125 |
doccnt = 1;
|
125 |
doccnt = 1;
|
126 |
|
126 |
|
127 |
for (vector<string>::const_iterator qit = qterms.begin();
|
127 |
for (vector<string>::const_iterator qit = qterms.begin();
|
128 |
qit != qterms.end(); qit++) {
|
128 |
qit != qterms.end(); qit++) {
|
129 |
termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
129 |
termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
130 |
LOGABS(("setDbWideQTermFreqs: [%s] db freq %.1e\n", qit->c_str(),
|
130 |
LOGABS("setDbWideQTermFreqs: [" << (*qit) << "] db freq " << (termfreqs[*qit]) << "\n" );
|
131 |
termfreqs[*qit]));
|
|
|
132 |
}
|
131 |
}
|
133 |
}
|
132 |
}
|
134 |
|
133 |
|
135 |
// Compute matched terms quality coefficients for a matched document by
|
134 |
// Compute matched terms quality coefficients for a matched document by
|
136 |
// retrieving the Within Document Frequencies and multiplying by
|
135 |
// retrieving the Within Document Frequencies and multiplying by
|
|
... |
|
... |
145 |
// aggregated frequency.
|
144 |
// aggregated frequency.
|
146 |
double Query::Native::qualityTerms(Xapian::docid docid,
|
145 |
double Query::Native::qualityTerms(Xapian::docid docid,
|
147 |
const vector<string>& terms,
|
146 |
const vector<string>& terms,
|
148 |
multimap<double, vector<string> >& byQ)
|
147 |
multimap<double, vector<string> >& byQ)
|
149 |
{
|
148 |
{
|
150 |
LOGABS(("qualityTerms\n"));
|
149 |
LOGABS("qualityTerms\n" );
|
151 |
setDbWideQTermsFreqs();
|
150 |
setDbWideQTermsFreqs();
|
152 |
|
151 |
|
153 |
map<string, double> termQcoefs;
|
152 |
map<string, double> termQcoefs;
|
154 |
double totalweight = 0;
|
153 |
double totalweight = 0;
|
155 |
|
154 |
|
|
... |
|
... |
164 |
|
163 |
|
165 |
#ifdef DEBUGABSTRACT
|
164 |
#ifdef DEBUGABSTRACT
|
166 |
{
|
165 |
{
|
167 |
string deb;
|
166 |
string deb;
|
168 |
hld.toString(deb);
|
167 |
hld.toString(deb);
|
169 |
LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));
|
168 |
LOGABS("qualityTerms: hld: " << (deb) << "\n" );
|
170 |
}
|
169 |
}
|
171 |
#endif
|
170 |
#endif
|
172 |
|
171 |
|
173 |
// Group the input terms by the user term they were possibly expanded from
|
172 |
// Group the input terms by the user term they were possibly expanded from
|
174 |
map<string, vector<string> > byRoot;
|
173 |
map<string, vector<string> > byRoot;
|
|
... |
|
... |
193 |
it != debit->second.end(); it++) {
|
192 |
it != debit->second.end(); it++) {
|
194 |
byRootstr.append("[").append(*it).append("] ");
|
193 |
byRootstr.append("[").append(*it).append("] ");
|
195 |
}
|
194 |
}
|
196 |
byRootstr.append("\n");
|
195 |
byRootstr.append("\n");
|
197 |
}
|
196 |
}
|
198 |
LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));
|
197 |
LOGABS("\nqualityTerms: uterms to terms: " << (byRootstr) << "\n" );
|
199 |
}
|
198 |
}
|
200 |
#endif
|
199 |
#endif
|
201 |
|
200 |
|
202 |
// Compute in-document and global frequencies for the groups.
|
201 |
// Compute in-document and global frequencies for the groups.
|
203 |
map<string, double> grpwdfs;
|
202 |
map<string, double> grpwdfs;
|
|
... |
|
... |
241 |
}
|
240 |
}
|
242 |
|
241 |
|
243 |
#ifdef DEBUGABSTRACT
|
242 |
#ifdef DEBUGABSTRACT
|
244 |
for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
|
243 |
for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
|
245 |
mit != byQ.rend(); mit++) {
|
244 |
mit != byQ.rend(); mit++) {
|
246 |
LOGABS(("qualityTerms: group\n"));
|
245 |
LOGABS("qualityTerms: group\n" );
|
247 |
for (vector<string>::const_iterator qit = mit->second.begin();
|
246 |
for (vector<string>::const_iterator qit = mit->second.begin();
|
248 |
qit != mit->second.end(); qit++) {
|
247 |
qit != mit->second.end(); qit++) {
|
249 |
LOGABS(("%.1e->[%s]\n", mit->first, qit->c_str()));
|
248 |
LOGABS("" << (mit->first) << "->[" << (*qit) << "]\n" );
|
250 |
}
|
249 |
}
|
251 |
}
|
250 |
}
|
252 |
#endif
|
251 |
#endif
|
253 |
return totalweight;
|
252 |
return totalweight;
|
254 |
}
|
253 |
}
|
|
... |
|
... |
313 |
int Query::Native::makeAbstract(Xapian::docid docid,
|
312 |
int Query::Native::makeAbstract(Xapian::docid docid,
|
314 |
vector<Snippet>& vabs,
|
313 |
vector<Snippet>& vabs,
|
315 |
int imaxoccs, int ictxwords)
|
314 |
int imaxoccs, int ictxwords)
|
316 |
{
|
315 |
{
|
317 |
Chrono chron;
|
316 |
Chrono chron;
|
318 |
LOGABS(("makeAbstract: docid %ld imaxoccs %d ictxwords %d\n",
|
317 |
LOGABS("makeAbstract: docid " << (long(docid)) << " imaxoccs " << (imaxoccs) << " ictxwords " << (ictxwords) << "\n" );
|
319 |
long(docid), imaxoccs, ictxwords));
|
|
|
320 |
|
318 |
|
321 |
// The (unprefixed) terms matched by this document
|
319 |
// The (unprefixed) terms matched by this document
|
322 |
vector<string> matchedTerms;
|
320 |
vector<string> matchedTerms;
|
323 |
getMatchTerms(docid, matchedTerms);
|
321 |
getMatchTerms(docid, matchedTerms);
|
324 |
if (matchedTerms.empty()) {
|
322 |
if (matchedTerms.empty()) {
|
|
... |
|
... |
337 |
// going to try and show text around the less common search terms.
|
335 |
// going to try and show text around the less common search terms.
|
338 |
// Terms issued from an original one by stem expansion are
|
336 |
// Terms issued from an original one by stem expansion are
|
339 |
// aggregated by the qualityTerms() routine.
|
337 |
// aggregated by the qualityTerms() routine.
|
340 |
multimap<double, vector<string> > byQ;
|
338 |
multimap<double, vector<string> > byQ;
|
341 |
double totalweight = qualityTerms(docid, matchedTerms, byQ);
|
339 |
double totalweight = qualityTerms(docid, matchedTerms, byQ);
|
342 |
LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
|
340 |
LOGABS("makeAbstract:" << (chron.ms()) << ": computed Qcoefs.\n" );
|
343 |
// This can't happen, but would crash us
|
341 |
// This can't happen, but would crash us
|
344 |
if (totalweight == 0.0) {
|
342 |
if (totalweight == 0.0) {
|
345 |
LOGERR("makeAbstract: totalweight == 0.0 !\n" );
|
343 |
LOGERR("makeAbstract: totalweight == 0.0 !\n" );
|
346 |
return ABSRES_ERROR;
|
344 |
return ABSRES_ERROR;
|
347 |
}
|
345 |
}
|
|
... |
|
... |
374 |
// with words. We used to limit the character size at the end, but
|
372 |
// with words. We used to limit the character size at the end, but
|
375 |
// this damaged our careful selection of terms
|
373 |
// this damaged our careful selection of terms
|
376 |
const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
|
374 |
const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
|
377 |
m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
|
375 |
m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
|
378 |
int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
|
376 |
int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
|
379 |
LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n",
|
377 |
LOGABS("makeAbstract:" << (chron.ms()) << ": mxttloccs " << (maxtotaloccs) << " ctxwords " << (ctxwords) << "\n" );
|
380 |
chron.ms(), maxtotaloccs, ctxwords));
|
|
|
381 |
|
378 |
|
382 |
int ret = ABSRES_OK;
|
379 |
int ret = ABSRES_OK;
|
383 |
|
380 |
|
384 |
// Let's go populate
|
381 |
// Let's go populate
|
385 |
for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
|
382 |
for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
|
|
... |
|
... |
403 |
if (grpoccs >= maxgrpoccs)
|
400 |
if (grpoccs >= maxgrpoccs)
|
404 |
break;
|
401 |
break;
|
405 |
|
402 |
|
406 |
string qterm = *qit;
|
403 |
string qterm = *qit;
|
407 |
|
404 |
|
408 |
LOGABS(("makeAbstract: [%s] %d max grp occs (coef %.2f)\n",
|
405 |
LOGABS("makeAbstract: [" << (qterm) << "] " << (maxgrpoccs) << " max grp occs (coef " << (q) << ")\n" );
|
409 |
qterm.c_str(), maxgrpoccs, q));
|
|
|
410 |
|
406 |
|
411 |
// The match term may span several words
|
407 |
// The match term may span several words
|
412 |
int qtrmwrdcnt =
|
408 |
int qtrmwrdcnt =
|
413 |
TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
|
409 |
TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
|
414 |
|
410 |
|
|
... |
|
... |
423 |
for (pos = xrdb.positionlist_begin(docid, qterm);
|
419 |
for (pos = xrdb.positionlist_begin(docid, qterm);
|
424 |
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
420 |
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
425 |
int ipos = *pos;
|
421 |
int ipos = *pos;
|
426 |
if (ipos < int(baseTextPosition)) // Not in text body
|
422 |
if (ipos < int(baseTextPosition)) // Not in text body
|
427 |
continue;
|
423 |
continue;
|
428 |
LOGABS(("makeAbstract: [%s] at pos %d grpoccs %d maxgrpoccs"
|
424 |
LOGABS("makeAbstract: [" << (qterm) << "] at pos " << (ipos) << " grpoccs " << (grpoccs) << " maxgrpoccs " << (maxgrpoccs) << "\n" );
|
429 |
" %d\n", qterm.c_str(), ipos, grpoccs, maxgrpoccs));
|
|
|
430 |
|
425 |
|
431 |
totaloccs++;
|
426 |
totaloccs++;
|
432 |
grpoccs++;
|
427 |
grpoccs++;
|
433 |
|
428 |
|
434 |
// Add adjacent slots to the set to populate at next
|
429 |
// Add adjacent slots to the set to populate at next
|
|
... |
|
... |
464 |
}
|
459 |
}
|
465 |
|
460 |
|
466 |
// Group done ?
|
461 |
// Group done ?
|
467 |
if (grpoccs >= maxgrpoccs) {
|
462 |
if (grpoccs >= maxgrpoccs) {
|
468 |
ret |= ABSRES_TRUNC;
|
463 |
ret |= ABSRES_TRUNC;
|
469 |
LOGABS(("Db::makeAbstract: max group occs cutoff\n"));
|
464 |
LOGABS("Db::makeAbstract: max group occs cutoff\n" );
|
470 |
break;
|
465 |
break;
|
471 |
}
|
466 |
}
|
472 |
// Global done ?
|
467 |
// Global done ?
|
473 |
if (totaloccs >= maxtotaloccs) {
|
468 |
if (totaloccs >= maxtotaloccs) {
|
474 |
ret |= ABSRES_TRUNC;
|
469 |
ret |= ABSRES_TRUNC;
|
475 |
LOGABS(("Db::makeAbstract: max occurrences cutoff\n"));
|
470 |
LOGABS("Db::makeAbstract: max occurrences cutoff\n" );
|
476 |
break;
|
471 |
break;
|
477 |
}
|
472 |
}
|
478 |
}
|
473 |
}
|
479 |
} catch (...) {
|
474 |
} catch (...) {
|
480 |
// Term does not occur. No problem.
|
475 |
// Term does not occur. No problem.
|
481 |
}
|
476 |
}
|
482 |
|
477 |
|
483 |
if (totaloccs >= maxtotaloccs) {
|
478 |
if (totaloccs >= maxtotaloccs) {
|
484 |
ret |= ABSRES_TRUNC;
|
479 |
ret |= ABSRES_TRUNC;
|
485 |
LOGABS(("Db::makeAbstract: max1 occurrences cutoff\n"));
|
480 |
LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );
|
486 |
break;
|
481 |
break;
|
487 |
}
|
482 |
}
|
488 |
}
|
483 |
}
|
489 |
}
|
484 |
}
|
490 |
maxpos += ctxwords + 1;
|
485 |
maxpos += ctxwords + 1;
|
491 |
|
486 |
|
492 |
LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
|
487 |
LOGABS("makeAbstract:" << (chron.millis()) << ":chosen number of positions " << (totaloccs) << "\n" );
|
493 |
chron.millis(), totaloccs));
|
|
|
494 |
// This can happen if there are term occurrences in the keywords
|
488 |
// This can happen if there are term occurrences in the keywords
|
495 |
// etc. but not elsewhere ?
|
489 |
// etc. but not elsewhere ?
|
496 |
if (totaloccs == 0) {
|
490 |
if (totaloccs == 0) {
|
497 |
LOGDEB("makeAbstract: no occurrences\n" );
|
491 |
LOGDEB("makeAbstract: no occurrences\n" );
|
498 |
return ABSRES_OK;
|
492 |
return ABSRES_OK;
|
|
... |
|
... |
564 |
#endif
|
558 |
#endif
|
565 |
|
559 |
|
566 |
vector<int> vpbreaks;
|
560 |
vector<int> vpbreaks;
|
567 |
ndb->getPagePositions(docid, vpbreaks);
|
561 |
ndb->getPagePositions(docid, vpbreaks);
|
568 |
|
562 |
|
569 |
LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(),
|
563 |
LOGABS("makeAbstract:" << (chron.millis()) << ": extracting. Got " << (vpbreaks.size()) << " pages\n" );
|
570 |
vpbreaks.size()));
|
|
|
571 |
// Finally build the abstract by walking the map (in order of position)
|
564 |
// Finally build the abstract by walking the map (in order of position)
|
572 |
vabs.clear();
|
565 |
vabs.clear();
|
573 |
string chunk;
|
566 |
string chunk;
|
574 |
bool incjk = false;
|
567 |
bool incjk = false;
|
575 |
int page = 0;
|
568 |
int page = 0;
|
|
... |
|
... |
613 |
}
|
606 |
}
|
614 |
|
607 |
|
615 |
|
608 |
|
616 |
}
|
609 |
}
|
617 |
|
610 |
|
|
|
611 |
|
|
|
612 |
|