recoll / Code / Diff of /src/rcldb/rclabstract.cpp

Diff of /src/rcldb/rclabstract.cpp [f624d3] .. [a16d04]

Switch to unified view


...
    map<unsigned int, string> sparseDoc;
    // Also remember apart the search term positions so that we can list
    // them with their snippets.
    unordered_set<unsigned int> searchTermPositions;

    // Remember max position. Used to stop walking positions lists while 
    // populating the adjacent slots.
    unsigned int maxpos = 0;

    // Total number of occurrences for all terms. We stop when we have too many
    unsigned int totaloccs = 0;

    // Total number of slots we populate. The 7 is taken as
    // average word size. It was a mistake to have the user max
...
            m_q->m_db->getAbsCtxLen();
            for (unsigned int ii = sta; ii <= sto;  ii++) {
            if (ii == (unsigned int)ipos) {
                sparseDoc[ii] = qterm;
                searchTermPositions.insert(ii);
              if (ii > maxpos)
              maxpos = ii;
            } else if (ii > (unsigned int)ipos && 
                   ii < (unsigned int)ipos + qtrmwrdcnt) {
                sparseDoc[ii] = occupiedmarker;
            } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
                // For an empty slot, the test has a side
...
        }
    }
    }
    LOGABS(("makeAbstract:%d:chosen number of positions %d\n", 
        chron.millis(), totaloccs));
    maxpos += ctxwords + 1;

    // This can happen if there are term occurrences in the keywords
    // etc. but not elsewhere?
    if (totaloccs == 0) {
    LOGDEB1(("makeAbstract: no occurrences\n"));
...

    // Walk all document's terms position lists and populate slots
    // around the query terms. We arbitrarily truncate the list to
    // avoid taking forever. If we do cut off, the abstract may be
    // inconsistent (missing words, potentially altering meaning),
    // which is bad. 
    { 
    Xapian::TermIterator term;
  int cutoff = m_q->m_snipMaxPosWalk;

    for (term = xrdb.termlist_begin(docid);
         term != xrdb.termlist_end(docid); term++) {
        // Ignore prefixed terms
        if (has_prefix(*term))
        continue;
      if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
        ret = ABSRES_TERMMISS;
        LOGDEB0(("makeAbstract: max term count cutoff %d\n", 
           m_q->m_snipMaxPosWalk));
        break;
        }

      map<unsigned int, string>::iterator vit;
        Xapian::PositionIterator pos;
        for (pos = xrdb.positionlist_begin(docid, *term); 
         pos != xrdb.positionlist_end(docid, *term); pos++) {
      if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
            ret = ABSRES_TERMMISS;
            LOGDEB0(("makeAbstract: max term count cutoff %d\n", 
               m_q->m_snipMaxPosWalk));
            break;
        }
      // If we are beyond the max possible position, stop
      // for this term
      if (*pos > maxpos) {
          break;
      }
        if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
            // Don't replace a term: the terms list is in
            // alphabetic order, and we may have several terms
            // at the same position, we want to keep only the
            // first one (ie: dockes and dockes@wanadoo.fr)

	a/src/rcldb/rclabstract.cpp		b/src/rcldb/rclabstract.cpp
	...		...
340	map<unsigned int, string> sparseDoc;	340	map<unsigned int, string> sparseDoc;
341	// Also remember apart the search term positions so that we can list	341	// Also remember apart the search term positions so that we can list
342	// them with their snippets.	342	// them with their snippets.
343	unordered_set<unsigned int> searchTermPositions;	343	unordered_set<unsigned int> searchTermPositions;
344		344
		345	// Remember max position. Used to stop walking positions lists while
		346	// populating the adjacent slots.
		347	unsigned int maxpos = 0;
		348
345	// Total number of occurences for all terms. We stop when we have too much	349	// Total number of occurences for all terms. We stop when we have too much
346	unsigned int totaloccs = 0;	350	unsigned int totaloccs = 0;
347		351
348	// Total number of slots we populate. The 7 is taken as	352	// Total number of slots we populate. The 7 is taken as
349	// average word size. It was a mistake to have the user max	353	// average word size. It was a mistake to have the user max
	...		...
417	m_q->m_db->getAbsCtxLen();	421	m_q->m_db->getAbsCtxLen();
418	for (unsigned int ii = sta; ii <= sto; ii++) {	422	for (unsigned int ii = sta; ii <= sto; ii++) {
419	if (ii == (unsigned int)ipos) {	423	if (ii == (unsigned int)ipos) {
420	sparseDoc[ii] = qterm;	424	sparseDoc[ii] = qterm;
421	searchTermPositions.insert(ii);	425	searchTermPositions.insert(ii);
		426	if (ii > maxpos)
		427	maxpos = ii;
422	} else if (ii > (unsigned int)ipos &&	428	} else if (ii > (unsigned int)ipos &&
423	ii < (unsigned int)ipos + qtrmwrdcnt) {	429	ii < (unsigned int)ipos + qtrmwrdcnt) {
424	sparseDoc[ii] = occupiedmarker;	430	sparseDoc[ii] = occupiedmarker;
425	} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {	431	} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
426	// For an empty slot, the test has a side	432	// For an empty slot, the test has a side
	...		...
458	}	464	}
459	}	465	}
460	}	466	}
461	LOGABS(("makeAbstract:%d:chosen number of positions %d\n",	467	LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
462	chron.millis(), totaloccs));	468	chron.millis(), totaloccs));
		469	maxpos += ctxwords + 1;
463		470
464	// This can happen if there are term occurences in the keywords	471	// This can happen if there are term occurences in the keywords
465	// etc. but not elsewhere ?	472	// etc. but not elsewhere ?
466	if (totaloccs == 0) {	473	if (totaloccs == 0) {
467	LOGDEB1(("makeAbstract: no occurrences\n"));	474	LOGDEB1(("makeAbstract: no occurrences\n"));
	...		...
470		477
471	// Walk all document's terms position lists and populate slots	478	// Walk all document's terms position lists and populate slots
472	// around the query terms. We arbitrarily truncate the list to	479	// around the query terms. We arbitrarily truncate the list to
473	// avoid taking forever. If we do cutoff, the abstract may be	480	// avoid taking forever. If we do cutoff, the abstract may be
474	// inconsistant (missing words, potentially altering meaning),	481	// inconsistant (missing words, potentially altering meaning),
475	// which is bad.	482	// which is bad.
476	{	483	{
477	Xapian::TermIterator term;	484	Xapian::TermIterator term;
478	int cutoff = 500 * 1000;	485	int cutoff = m_q->m_snipMaxPosWalk;
479
480	for (term = xrdb.termlist_begin(docid);	486	for (term = xrdb.termlist_begin(docid);
481	term != xrdb.termlist_end(docid); term++) {	487	term != xrdb.termlist_end(docid); term++) {
482	// Ignore prefixed terms	488	// Ignore prefixed terms
483	if (has_prefix(*term))	489	if (has_prefix(*term))
484	continue;	490	continue;
485	if (cutoff-- < 0) {	491	if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
486	ret = ABSRES_TRUNC;	492	ret = ABSRES_TERMMISS;
487	LOGDEB0(("makeAbstract: max term count cutoff\n"));	493	LOGDEB0(("makeAbstract: max term count cutoff %d\n",
		494	m_q->m_snipMaxPosWalk));
488	break;	495	break;
489	}	496	}
490		497
		498	map<unsigned int, string>::iterator vit;
491	Xapian::PositionIterator pos;	499	Xapian::PositionIterator pos;
492	for (pos = xrdb.positionlist_begin(docid, *term);	500	for (pos = xrdb.positionlist_begin(docid, *term);
493	pos != xrdb.positionlist_end(docid, *term); pos++) {	501	pos != xrdb.positionlist_end(docid, *term); pos++) {
494	if (cutoff-- < 0) {	502	if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
495	ret = ABSRES_TRUNC;	503	ret = ABSRES_TERMMISS;
496	LOGDEB0(("makeAbstract: max term count cutoff\n"));	504	LOGDEB0(("makeAbstract: max term count cutoff %d\n",
		505	m_q->m_snipMaxPosWalk));
497	break;	506	break;
498	}	507	}
499	map<unsigned int, string>::iterator vit;	508	// If we are beyond the max possible position, stop
		509	// for this term
		510	if (*pos > maxpos) {
		511	break;
		512	}
500	if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {	513	if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
501	// Don't replace a term: the terms list is in	514	// Don't replace a term: the terms list is in
502	// alphabetic order, and we may have several terms	515	// alphabetic order, and we may have several terms
503	// at the same position, we want to keep only the	516	// at the same position, we want to keep only the
504	// first one (ie: dockes and dockes@wanadoo.fr)	517	// first one (ie: dockes and dockes@wanadoo.fr)