|
a/src/rcldb/rclabstract.cpp |
|
b/src/rcldb/rclabstract.cpp |
|
... |
|
... |
340 |
map<unsigned int, string> sparseDoc;
|
340 |
map<unsigned int, string> sparseDoc;
|
341 |
// Also remember apart the search term positions so that we can list
|
341 |
// Also remember apart the search term positions so that we can list
|
342 |
// them with their snippets.
|
342 |
// them with their snippets.
|
343 |
unordered_set<unsigned int> searchTermPositions;
|
343 |
unordered_set<unsigned int> searchTermPositions;
|
344 |
|
344 |
|
|
|
345 |
// Remember max position. Used to stop walking positions lists while
|
|
|
346 |
// populating the adjacent slots.
|
|
|
347 |
unsigned int maxpos = 0;
|
|
|
348 |
|
345 |
// Total number of occurrences for all terms. We stop when we have too many
|
349 |
// Total number of occurrences for all terms. We stop when we have too many
|
346 |
unsigned int totaloccs = 0;
|
350 |
unsigned int totaloccs = 0;
|
347 |
|
351 |
|
348 |
// Total number of slots we populate. The 7 is taken as
|
352 |
// Total number of slots we populate. The 7 is taken as
|
349 |
// average word size. It was a mistake to have the user max
|
353 |
// average word size. It was a mistake to have the user max
|
|
... |
|
... |
417 |
m_q->m_db->getAbsCtxLen();
|
421 |
m_q->m_db->getAbsCtxLen();
|
418 |
for (unsigned int ii = sta; ii <= sto; ii++) {
|
422 |
for (unsigned int ii = sta; ii <= sto; ii++) {
|
419 |
if (ii == (unsigned int)ipos) {
|
423 |
if (ii == (unsigned int)ipos) {
|
420 |
sparseDoc[ii] = qterm;
|
424 |
sparseDoc[ii] = qterm;
|
421 |
searchTermPositions.insert(ii);
|
425 |
searchTermPositions.insert(ii);
|
|
|
426 |
if (ii > maxpos)
|
|
|
427 |
maxpos = ii;
|
422 |
} else if (ii > (unsigned int)ipos &&
|
428 |
} else if (ii > (unsigned int)ipos &&
|
423 |
ii < (unsigned int)ipos + qtrmwrdcnt) {
|
429 |
ii < (unsigned int)ipos + qtrmwrdcnt) {
|
424 |
sparseDoc[ii] = occupiedmarker;
|
430 |
sparseDoc[ii] = occupiedmarker;
|
425 |
} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
|
431 |
} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
|
426 |
// For an empty slot, the test has a side
|
432 |
// For an empty slot, the test has a side
|
|
... |
|
... |
458 |
}
|
464 |
}
|
459 |
}
|
465 |
}
|
460 |
}
|
466 |
}
|
461 |
LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
|
467 |
LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
|
462 |
chron.millis(), totaloccs));
|
468 |
chron.millis(), totaloccs));
|
|
|
469 |
maxpos += ctxwords + 1;
|
463 |
|
470 |
|
464 |
// This can happen if there are term occurrences in the keywords
|
471 |
// This can happen if there are term occurrences in the keywords
|
465 |
// etc. but not elsewhere ?
|
472 |
// etc. but not elsewhere ?
|
466 |
if (totaloccs == 0) {
|
473 |
if (totaloccs == 0) {
|
467 |
LOGDEB1(("makeAbstract: no occurrences\n"));
|
474 |
LOGDEB1(("makeAbstract: no occurrences\n"));
|
|
... |
|
... |
470 |
|
477 |
|
471 |
// Walk all document's terms position lists and populate slots
|
478 |
// Walk all document's terms position lists and populate slots
|
472 |
// around the query terms. We arbitrarily truncate the list to
|
479 |
// around the query terms. We arbitrarily truncate the list to
|
473 |
// avoid taking forever. If we do cutoff, the abstract may be
|
480 |
// avoid taking forever. If we do cutoff, the abstract may be
|
474 |
// inconsistent (missing words, potentially altering meaning),
|
481 |
// inconsistent (missing words, potentially altering meaning),
|
475 |
// which is bad.
|
482 |
// which is bad.
|
476 |
{
|
483 |
{
|
477 |
Xapian::TermIterator term;
|
484 |
Xapian::TermIterator term;
|
478 |
int cutoff = 500 * 1000;
|
485 |
int cutoff = m_q->m_snipMaxPosWalk;
|
479 |
|
|
|
480 |
for (term = xrdb.termlist_begin(docid);
|
486 |
for (term = xrdb.termlist_begin(docid);
|
481 |
term != xrdb.termlist_end(docid); term++) {
|
487 |
term != xrdb.termlist_end(docid); term++) {
|
482 |
// Ignore prefixed terms
|
488 |
// Ignore prefixed terms
|
483 |
if (has_prefix(*term))
|
489 |
if (has_prefix(*term))
|
484 |
continue;
|
490 |
continue;
|
485 |
if (cutoff-- < 0) {
|
491 |
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
|
486 |
ret = ABSRES_TRUNC;
|
492 |
ret = ABSRES_TERMMISS;
|
487 |
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
493 |
LOGDEB0(("makeAbstract: max term count cutoff %d\n",
|
|
|
494 |
m_q->m_snipMaxPosWalk));
|
488 |
break;
|
495 |
break;
|
489 |
}
|
496 |
}
|
490 |
|
497 |
|
|
|
498 |
map<unsigned int, string>::iterator vit;
|
491 |
Xapian::PositionIterator pos;
|
499 |
Xapian::PositionIterator pos;
|
492 |
for (pos = xrdb.positionlist_begin(docid, *term);
|
500 |
for (pos = xrdb.positionlist_begin(docid, *term);
|
493 |
pos != xrdb.positionlist_end(docid, *term); pos++) {
|
501 |
pos != xrdb.positionlist_end(docid, *term); pos++) {
|
494 |
if (cutoff-- < 0) {
|
502 |
if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
|
495 |
ret = ABSRES_TRUNC;
|
503 |
ret = ABSRES_TERMMISS;
|
496 |
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
504 |
LOGDEB0(("makeAbstract: max term count cutoff %d\n",
|
|
|
505 |
m_q->m_snipMaxPosWalk));
|
497 |
break;
|
506 |
break;
|
498 |
}
|
507 |
}
|
499 |
map<unsigned int, string>::iterator vit;
|
508 |
// If we are beyond the max possible position, stop
|
|
|
509 |
// for this term
|
|
|
510 |
if (*pos > maxpos) {
|
|
|
511 |
break;
|
|
|
512 |
}
|
500 |
if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
|
513 |
if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
|
501 |
// Don't replace a term: the terms list is in
|
514 |
// Don't replace a term: the terms list is in
|
502 |
// alphabetic order, and we may have several terms
|
515 |
// alphabetic order, and we may have several terms
|
503 |
// at the same position, we want to keep only the
|
516 |
// at the same position, we want to keep only the
|
504 |
// first one (ie: dockes and dockes@wanadoo.fr)
|
517 |
// first one (ie: dockes and dockes@wanadoo.fr)
|