recoll / Code / Diff of /src/rcldb/rclabstract.cpp

Diff of /src/rcldb/rclabstract.cpp [f51a4f] .. [81c171]

Switch to unified view


...
{
    string a;
    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
        a = a + *it + " ";
    }
    LOGDEB("" << what << ": " << a << "\n");
}
#else
#define LOGABS LOGDEB2
static void listList(const string&, const vector<string>&)
{
...
// result in general.
static const bool prune_prefixed_terms = true; 
static void noPrefixList(const vector<string>& in, vector<string>& out) 
{
    for (vector<string>::const_iterator qit = in.begin(); 
         qit != in.end(); qit++) {
        if (prune_prefixed_terms) {
            if (has_prefix(*qit))
                continue;
        }
        out.push_back(strip_prefix(*qit));
    }
    sort(out.begin(), out.end());
    vector<string>::iterator it = unique(out.begin(), out.end());
    out.resize(it - out.begin());
}

bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
{
    if (!xenquire) {
        LOGERR("Query::getMatchTerms: no query opened\n");
        return false;
    }

    terms.clear();
    Xapian::TermIterator it;
    Xapian::docid id = Xapian::docid(xdocid);
...
    XAPTRY(iterms.insert(iterms.begin(),
                        xenquire->get_matching_terms_begin(id),
                        xenquire->get_matching_terms_end(id)),
           m_q->m_db->m_ndb->xrdb, m_q->m_reason);
    if (!m_q->m_reason.empty()) {
        LOGERR("getMatchTerms: xapian error: " << m_q->m_reason << "\n");
        return false;
    }
    noPrefixList(iterms, terms);
    return true;
}

...
// while computing abstracts for the different result documents.
void Query::Native::setDbWideQTermsFreqs()
{
    // Do it once only for a given query.
    if (!termfreqs.empty())
        return;

    vector<string> qterms;
    {
        vector<string> iqterms;
        m_q->getQueryTerms(iqterms);
        noPrefixList(iqterms, qterms);
    }
    // listList("Query terms: ", qterms);
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;

    double doccnt = xrdb.get_doccount();
    if (doccnt == 0) 
        doccnt = 1;

    for (vector<string>::const_iterator qit = qterms.begin(); 
         qit != qterms.end(); qit++) {
        termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
        LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " <<
               termfreqs[*qit] << "\n");
    }
}

// Compute matched terms quality coefficients for a matched document by
// retrieving the Within Document Frequencies and multiplying by
...
// common stem, which seems wrong, we group the terms by
// root, compute a frequency for the group from the sum of member
// occurrences, and let the frequency for each group member be the
// aggregated frequency.
//
// Parameters and result:
//  - docid: Xapian id of the document for which the terms are graded.
//  - terms: the terms to grade (the callers in this file pass the list
//    obtained from getMatchTerms()).
//  - byQ: output. One (coefficient, group member terms) entry is
//    inserted for each term group. The container is not cleared here.
//  - Returns the sum of the coefficients of all the groups.
double Query::Native::qualityTerms(Xapian::docid docid, 
                                   const vector<string>& terms,
                                   multimap<double, vector<string> >& byQ)
{
    LOGABS("qualityTerms\n");
    // The index-wide frequencies in termfreqs are used below.
    setDbWideQTermsFreqs();

    double totalweight = 0;

    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
    // The document length is used as a divisor: never let it be zero.
    double doclen = xrdb.get_doclength(docid);
    if (doclen == 0) 
        doclen = 1;
    HighlightData hld;
    if (m_q->m_sd) {
        m_q->m_sd->getTerms(hld);
    }

#ifdef DEBUGABSTRACT
    {
        string deb;
        hld.toString(deb);
        LOGABS("qualityTerms: hld: " << deb << "\n");
    }
#endif

    // Group the input terms by the user term they were possibly expanded from
    // A term which has no entry in hld.terms makes up its own group.
    map<string, vector<string> > byRoot;
    for (vector<string>::const_iterator qit = terms.begin(); 
         qit != terms.end(); qit++) {
        map<string, string>::const_iterator eit = hld.terms.find(*qit);
        if (eit != hld.terms.end()) {
            byRoot[eit->second].push_back(*qit);
        } else {
            LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n");
            byRoot[*qit].push_back(*qit);
        }
    }

#ifdef DEBUGABSTRACT
    {
        string byRootstr;
        for (map<string, vector<string> >::const_iterator debit = 
                 byRoot.begin();  debit != byRoot.end(); debit++) {
            byRootstr.append("[").append(debit->first).append("]->");
            for (vector<string>::const_iterator it = debit->second.begin();
                 it != debit->second.end(); it++) {
                byRootstr.append("[").append(*it).append("] ");
            }
            byRootstr.append("\n");
        }
        LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n");
    }
#endif

    // Compute in-document and global frequencies for the groups.
    map<string, double> grpwdfs;
    map<string, double> grptfreqs;
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
         git != byRoot.end(); git++) {
        for (vector<string>::const_iterator qit = git->second.begin(); 
             qit != git->second.end(); qit++) {
            // Look the term up in the document term list. 
            Xapian::TermIterator term = xrdb.termlist_begin(docid);
            term.skip_to(*qit);
            if (term != xrdb.termlist_end(docid) && *term == *qit) {
                // Sum the values over the group members. The previous
                // code tested for an existing entry the wrong way
                // around, and overwrote the sum with the value for
                // the current member. No special case is needed for
                // the first member: map::operator[] creates missing
                // entries with a 0 value.
                grpwdfs[git->first] += term.get_wdf() / doclen;
                grptfreqs[git->first] += termfreqs[*qit];
            }
        }    
    }

    // Build a sorted by quality container for the groups
    // -log10() of the frequencies product grows as the group gets
    // rarer, and the coefficient steps up with it.
    // NOTE(review): a group with no occurrence in the document has a 0
    // product, so that -log10() should be +infinity and the group gets
    // the top coefficient. Check that this is intended.
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
         git != byRoot.end(); git++) {
        double q = (grpwdfs[git->first]) * grptfreqs[git->first];
        q = -log10(q);
        if (q < 3) {
            q = 0.05;
        } else if (q < 4) {
            q = 0.3;
        } else if (q < 5) {
            q = 0.7;
        } else if (q < 6) {
            q = 0.8;
        } else {
            q = 1;
        }
        totalweight += q;
        byQ.insert(pair<double, vector<string> >(q, git->second));
    }

#ifdef DEBUGABSTRACT
    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); 
         mit != byQ.rend(); mit++) {
        LOGABS("qualityTerms: group\n");
        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {
            LOGABS("" << mit->first << "->[" << *qit << "]\n");
        }
    }
#endif
    return totalweight;
}

// Compute the page number for the first occurrence of a "significant"
// match term in the document.
// The term groups built by qualityTerms() are tried from the highest
// coefficient down. The first term position for which
// getPageNumberForPosition() yields a value above 0 ends the search:
// 'term' is then set to the term, and the page number is returned.
// The return value is -1 when the db is not usable, when there are no
// match terms or no page positions, or when no position yields a page.
int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
{
    LOGDEB("Query::Native::getFirstMatchPage\n");
    const bool dbok = m_q && m_q->m_db && m_q->m_db->m_ndb && 
        m_q->m_db->m_ndb->m_isopen;
    if (!dbok) {
        LOGERR("Query::getFirstMatchPage: no db\n");
        return -1;
    }
    Rcl::Db::Native *ndb = m_q->m_db->m_ndb;
    Xapian::Database& xrdb = ndb->xrdb;

    // No match terms: nothing to look for.
    vector<string> matchterms;
    getMatchTerms(docid, matchterms);
    if (matchterms.empty()) {
        LOGDEB("getFirstMatchPage: empty match term list (field match?)\n");
        return -1;
    }

    // No page positions: no page numbers to compute.
    vector<int> pagebreaks;
    ndb->getPagePositions(docid, pagebreaks);
    if (pagebreaks.empty()) {
        return -1;
    }

    setDbWideQTermsFreqs();

    // Get the term groups, sorted by quality coefficient. 
    typedef multimap<double, vector<string> > GroupsByQ;
    GroupsByQ groups;
    qualityTerms(docid, matchterms, groups);

    // The multimap is sorted in ascending order: walk it in reverse to
    // begin with the highest coefficient.
    for (GroupsByQ::reverse_iterator grp = groups.rbegin(); 
         grp != groups.rend(); grp++) {
        const vector<string>& members = grp->second;
        for (vector<string>::size_type i = 0; i < members.size(); i++) {
            const string& candidate = members[i];
            try {
                Xapian::PositionIterator pos = 
                    xrdb.positionlist_begin(docid, candidate);
                while (pos != xrdb.positionlist_end(docid, candidate)) {
                    int pagenum = ndb->getPageNumberForPosition(pagebreaks, 
                                                                *pos);
                    if (pagenum > 0) {
                        term = candidate;
                        return pagenum;
                    }
                    pos++;
                }
            } catch (...) {
                // Term does not occur. No problem.
            }
        }
    }
    return -1;
}

// Build a document abstract by extracting text chunks around the query terms
// This uses the db termlists, not the original document.
//
// DatabaseModified and other general exceptions are caught and
// possibly retried by our caller
int Query::Native::makeAbstract(Xapian::docid docid,
                                vector<Snippet>& vabs, 
                                int imaxoccs, int ictxwords)
{
    Chrono chron;
    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
           imaxoccs << " ictxwords " << ictxwords << "\n");

    // The (unprefixed) terms matched by this document
    vector<string> matchedTerms;
    getMatchTerms(docid, matchedTerms);
    if (matchedTerms.empty()) {
        LOGDEB("makeAbstract:"<<chron.millis()<<"mS:Empty term list\n");
        return ABSRES_ERROR;
    }

    listList("Match terms: ", matchedTerms);

    // Retrieve the term frequencies for the query terms. This is
...
    // going to try and show text around the less common search terms.
    // Terms issued from an original one by stem expansion are
    // aggregated by the qualityTerms() routine.
    multimap<double, vector<string> > byQ;
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
    // This can't happen, but would crash us
    if (totalweight == 0.0) {
        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
        return ABSRES_ERROR;
    }

    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
    Xapian::Database& xrdb(ndb->xrdb);

...
    // average word size. It was a mistake to have the user max
    // abstract size parameter in characters, we basically only deal
    // with words. We used to limit the character size at the end, but
    // this damaged our careful selection of terms
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
        m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
           maxtotaloccs << " ctxwords " << ctxwords << "\n");

    int ret = ABSRES_OK;

    // Let's go populate
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
         mit != byQ.rend(); mit++) {
        unsigned int maxgrpoccs;
        double q;
        if (byQ.size() == 1) {
            maxgrpoccs = maxtotaloccs;
            q = 1.0;
        } else {
            // We give more slots to the better term groups
            q = mit->first / totalweight;
            maxgrpoccs = int(ceil(maxtotaloccs * q));
        }
        unsigned int grpoccs = 0;

        for (vector<string>::const_iterator qit = mit->second.begin();
             qit != mit->second.end(); qit++) {

            // Group done ?
            if (grpoccs >= maxgrpoccs) 
                break;

            string qterm = *qit;

            LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
                   " max grp occs (coef " << q << ")\n");

            // The match term may span several words
            int qtrmwrdcnt = 
                TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);

            Xapian::PositionIterator pos;
            // There may be query terms not in this doc. This raises an
            // exception when requesting the position list, we catch it ??
            // Not clear how this can happen because we are walking the
            // match list returned by Xapian. Maybe something with the
            // fields?
            string emptys;
            try {
                for (pos = xrdb.positionlist_begin(docid, qterm);
                     pos != xrdb.positionlist_end(docid, qterm); pos++) {
                    int ipos = *pos;
                    if (ipos < int(baseTextPosition)) // Not in text body
                        continue;
                    LOGABS("makeAbstract: [" << qterm << "] at pos " <<
                           ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
                           maxgrpoccs << "\n");

                    totaloccs++;
                    grpoccs++;

                    // Add adjacent slots to the set to populate at next
                    // step by inserting empty strings. Special provisions
                    // for adding ellipsis and for positions overlapped by
                    // the match term.
                    unsigned int sta = MAX(int(baseTextPosition), 
                                           ipos - ctxwords);
                    unsigned int sto = ipos + qtrmwrdcnt-1 + 
                        m_q->m_db->getAbsCtxLen();
                    for (unsigned int ii = sta; ii <= sto;  ii++) {
                        if (ii == (unsigned int)ipos) {
                            sparseDoc[ii] = qterm;
                            searchTermPositions.insert(ii);
                            if (ii > maxpos)
                                maxpos = ii;
                        } else if (ii > (unsigned int)ipos && 
                                   ii < (unsigned int)ipos + qtrmwrdcnt) {
                            sparseDoc[ii] = occupiedmarker;
                        } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
                            // For an empty slot, the test has a side
                            // effect of inserting an empty string which
                            // is what we want.
                            sparseDoc[ii] = emptys;
                        }
                    }
                    // Add ellipsis at the end. This may be replaced later by
                    // an overlapping extract. Take care not to replace an
                    // empty string here, we really want an empty slot,
                    // use find()
                    if (sparseDoc.find(sto+1) == sparseDoc.end()) {
                        sparseDoc[sto+1] = cstr_ellipsis;
                    }

                    // Group done ?
                    if (grpoccs >= maxgrpoccs) {
                        ret |= ABSRES_TRUNC;
                        LOGABS("Db::makeAbstract: max group occs cutoff\n");
                        break;
                    }
                    // Global done ?
                    if (totaloccs >= maxtotaloccs) {
                        ret |= ABSRES_TRUNC;
                        LOGABS("Db::makeAbstract: max occurrences cutoff\n");
                        break;
                    }
                }
            } catch (...) {
                // Term does not occur. No problem.
            }

            if (totaloccs >= maxtotaloccs) {
                ret |= ABSRES_TRUNC;
                LOGABS("Db::makeAbstract: max1 occurrences cutoff\n");
                break;
            }
        }
    }
    maxpos += ctxwords + 1;

    LOGABS("makeAbstract:" << chron.millis() <<
           "mS:chosen number of positions " << totaloccs << "\n");
    // This can happen if there are term occurrences in the keywords
    // etc. but not elsewhere ?
    if (totaloccs == 0) {
        LOGDEB("makeAbstract: no occurrences\n");
        return ABSRES_OK;
    }

    // Walk all document's terms position lists and populate slots
    // around the query terms. We arbitrarily truncate the list to
    // avoid taking forever. If we do cutoff, the abstract may be
    // inconsistent (missing words, potentially altering meaning),
    // which is bad. 
    { 
        Xapian::TermIterator term;
        int cutoff = m_q->m_snipMaxPosWalk;
        for (term = xrdb.termlist_begin(docid);
             term != xrdb.termlist_end(docid); term++) {
            // Ignore prefixed terms
            if (has_prefix(*term))
                continue;
            if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                ret |= ABSRES_TERMMISS;
                LOGDEB0("makeAbstract: max term count cutoff " <<
                        m_q->m_snipMaxPosWalk << "\n");
                break;
            }

            map<unsigned int, string>::iterator vit;
            Xapian::PositionIterator pos;
            for (pos = xrdb.positionlist_begin(docid, *term);
                 pos != xrdb.positionlist_end(docid, *term); pos++) {
                if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                    ret |= ABSRES_TERMMISS;
                    LOGDEB0("makeAbstract: max term count cutoff " <<
                            m_q->m_snipMaxPosWalk << "\n");
                    break;
                }
                // If we are beyond the max possible position, stop
                // for this term
                if (*pos > maxpos) {
                    break;
                }
                if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
                    // Don't replace a term: the terms list is in
                    // alphabetic order, and we may have several terms
                    // at the same position, we want to keep only the
                    // first one (ie: dockes and dockes@wanadoo.fr)
                    if (vit->second.empty()) {
                        LOGDEB2("makeAbstract: populating: [" << *term <<
                                "] at " << *pos << "\n");
                        sparseDoc[*pos] = *term;
                    }
                }
            }
        }
    }
    LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");




























#if 0
    // Debug only: output the full term[position] vector
    bool epty = false;
    int ipos = 0;
    for (map<unsigned int, string>::iterator it = sparseDoc.begin(); 
         it != sparseDoc.end();
         it++, ipos++) {
        if (it->empty()) {
            if (!epty)
                LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
            epty=true;
        } else {
            epty = false;
            LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
        }
    }
#endif

    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);

    LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
           vpbreaks.size() << " pages\n");
    // Finally build the abstract by walking the map (in order of position)
    vabs.clear();
    string chunk;
    bool incjk = false;
    int page = 0;
    string term;
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
         it != sparseDoc.end(); it++) {
        LOGDEB2("Abtract:output " << it->first << " -> [" << it->second <<
                "]\n");
        if (!occupiedmarker.compare(it->second)) {
            LOGDEB("Abstract: qtrm position not filled ??\n");
            continue;
        }
        if (chunk.empty() && !vpbreaks.empty()) {
            page =  ndb->getPageNumberForPosition(vpbreaks, it->first);
            if (page < 0) 
                page = 0;
            term.clear();
        }
        Utf8Iter uit(it->second);
        bool newcjk = false;
        if (TextSplit::isCJK(*uit))
            newcjk = true;
        if (!incjk || (incjk && !newcjk))
            chunk += " ";
        incjk = newcjk;
        if (searchTermPositions.find(it->first) != searchTermPositions.end())
            term = it->second;
        if (it->second == cstr_ellipsis) {
            vabs.push_back(Snippet(page, chunk).setTerm(term));
            chunk.clear();
        } else {
            if (it->second.compare(end_of_field_term) && 
                it->second.compare(start_of_field_term))
                chunk += it->second;
        }
    }
    if (!chunk.empty())
        vabs.push_back(Snippet(page, chunk).setTerm(term));

    LOGABS("makeAbtract: done in " << chron.millis() << " mS\n");
    return ret;
}


}




	a/src/rcldb/rclabstract.cpp		b/src/rcldb/rclabstract.cpp
	...		...
48	{	48	{
49	string a;	49	string a;
50	for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {	50	for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
51	a = a + *it + " ";	51	a = a + *it + " ";
52	}	52	}
53	LOGDEB("" << (what) << ": " << (a) << "\n" );	53	LOGDEB("" << what << ": " << a << "\n");
54	}	54	}
55	#else	55	#else
56	#define LOGABS LOGDEB2	56	#define LOGABS LOGDEB2
57	static void listList(const string&, const vector<string>&)	57	static void listList(const string&, const vector<string>&)
58	{	58	{
	...		...
65	// result in general.	65	// result in general.
66	static const bool prune_prefixed_terms = true;	66	static const bool prune_prefixed_terms = true;
67	static void noPrefixList(const vector<string>& in, vector<string>& out)	67	static void noPrefixList(const vector<string>& in, vector<string>& out)
68	{	68	{
69	for (vector<string>::const_iterator qit = in.begin();	69	for (vector<string>::const_iterator qit = in.begin();
70	qit != in.end(); qit++) {	70	qit != in.end(); qit++) {
71	if (prune_prefixed_terms) {	71	if (prune_prefixed_terms) {
72	if (has_prefix(*qit))	72	if (has_prefix(*qit))
73	continue;	73	continue;
74	}	74	}
75	out.push_back(strip_prefix(*qit));	75	out.push_back(strip_prefix(*qit));
76	}	76	}
77	sort(out.begin(), out.end());	77	sort(out.begin(), out.end());
78	vector<string>::iterator it = unique(out.begin(), out.end());	78	vector<string>::iterator it = unique(out.begin(), out.end());
79	out.resize(it - out.begin());	79	out.resize(it - out.begin());
80	}	80	}
81		81
82	bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)	82	bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
83	{	83	{
84	if (!xenquire) {	84	if (!xenquire) {
85	LOGERR("Query::getMatchTerms: no query opened\n" );	85	LOGERR("Query::getMatchTerms: no query opened\n");
86	return false;	86	return false;
87	}	87	}
88		88
89	terms.clear();	89	terms.clear();
90	Xapian::TermIterator it;	90	Xapian::TermIterator it;
91	Xapian::docid id = Xapian::docid(xdocid);	91	Xapian::docid id = Xapian::docid(xdocid);
	...		...
93	XAPTRY(iterms.insert(iterms.begin(),	93	XAPTRY(iterms.insert(iterms.begin(),
94	xenquire->get_matching_terms_begin(id),	94	xenquire->get_matching_terms_begin(id),
95	xenquire->get_matching_terms_end(id)),	95	xenquire->get_matching_terms_end(id)),
96	m_q->m_db->m_ndb->xrdb, m_q->m_reason);	96	m_q->m_db->m_ndb->xrdb, m_q->m_reason);
97	if (!m_q->m_reason.empty()) {	97	if (!m_q->m_reason.empty()) {
98	LOGERR("getMatchTerms: xapian error: " << (m_q->m_reason) << "\n" );	98	LOGERR("getMatchTerms: xapian error: " << m_q->m_reason << "\n");
99	return false;	99	return false;
100	}	100	}
101	noPrefixList(iterms, terms);	101	noPrefixList(iterms, terms);
102	return true;	102	return true;
103	}	103	}
104		104
	...		...
107	// while computing abstracts for the different result documents.	107	// while computing abstracts for the different result documents.
108	void Query::Native::setDbWideQTermsFreqs()	108	void Query::Native::setDbWideQTermsFreqs()
109	{	109	{
110	// Do it once only for a given query.	110	// Do it once only for a given query.
111	if (!termfreqs.empty())	111	if (!termfreqs.empty())
112	return;	112	return;
113		113
114	vector<string> qterms;	114	vector<string> qterms;
115	{	115	{
116	vector<string> iqterms;	116	vector<string> iqterms;
117	m_q->getQueryTerms(iqterms);	117	m_q->getQueryTerms(iqterms);
118	noPrefixList(iqterms, qterms);	118	noPrefixList(iqterms, qterms);
119	}	119	}
120	// listList("Query terms: ", qterms);	120	// listList("Query terms: ", qterms);
121	Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;	121	Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
122		122
123	double doccnt = xrdb.get_doccount();	123	double doccnt = xrdb.get_doccount();
124	if (doccnt == 0)	124	if (doccnt == 0)
125	doccnt = 1;	125	doccnt = 1;
126		126
127	for (vector<string>::const_iterator qit = qterms.begin();	127	for (vector<string>::const_iterator qit = qterms.begin();
128	qit != qterms.end(); qit++) {	128	qit != qterms.end(); qit++) {
129	termfreqs[qit] = xrdb.get_termfreq(qit) / doccnt;	129	termfreqs[qit] = xrdb.get_termfreq(qit) / doccnt;
130	LOGABS("setDbWideQTermFreqs: [" << (qit) << "] db freq " << (termfreqs[*qit]) << "\n" );	130	LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " <<
		131	termfreqs[*qit] << "\n");
131	}	132	}
132	}	133	}
133		134
134	// Compute matched terms quality coefficients for a matched document by	135	// Compute matched terms quality coefficients for a matched document by
135	// retrieving the Within Document Frequencies and multiplying by	136	// retrieving the Within Document Frequencies and multiplying by
	...		...
141	// common stem, which seems wrong, we group the terms by	142	// common stem, which seems wrong, we group the terms by
142	// root, compute a frequency for the group from the sum of member	143	// root, compute a frequency for the group from the sum of member
143	// occurrences, and let the frequency for each group member be the	144	// occurrences, and let the frequency for each group member be the
144	// aggregated frequency.	145	// aggregated frequency.
145	double Query::Native::qualityTerms(Xapian::docid docid,	146	double Query::Native::qualityTerms(Xapian::docid docid,
146	const vector<string>& terms,	147	const vector<string>& terms,
147	multimap<double, vector<string> >& byQ)	148	multimap<double, vector<string> >& byQ)
148	{	149	{
149	LOGABS("qualityTerms\n" );	150	LOGABS("qualityTerms\n");
150	setDbWideQTermsFreqs();	151	setDbWideQTermsFreqs();
151		152
152	map<string, double> termQcoefs;	153	map<string, double> termQcoefs;
153	double totalweight = 0;	154	double totalweight = 0;
154		155
155	Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;	156	Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
156	double doclen = xrdb.get_doclength(docid);	157	double doclen = xrdb.get_doclength(docid);
157	if (doclen == 0)	158	if (doclen == 0)
158	doclen = 1;	159	doclen = 1;
159	HighlightData hld;	160	HighlightData hld;
160	if (m_q->m_sd) {	161	if (m_q->m_sd) {
161	m_q->m_sd->getTerms(hld);	162	m_q->m_sd->getTerms(hld);
162	}	163	}
163		164
164	#ifdef DEBUGABSTRACT	165	#ifdef DEBUGABSTRACT
165	{	166	{
166	string deb;	167	string deb;
167	hld.toString(deb);	168	hld.toString(deb);
168	LOGABS("qualityTerms: hld: " << (deb) << "\n" );	169	LOGABS("qualityTerms: hld: " << deb << "\n");
169	}	170	}
170	#endif	171	#endif
171		172
172	// Group the input terms by the user term they were possibly expanded from	173	// Group the input terms by the user term they were possibly expanded from
173	map<string, vector<string> > byRoot;	174	map<string, vector<string> > byRoot;
174	for (vector<string>::const_iterator qit = terms.begin();	175	for (vector<string>::const_iterator qit = terms.begin();
175	qit != terms.end(); qit++) {	176	qit != terms.end(); qit++) {
176	map<string, string>::const_iterator eit = hld.terms.find(*qit);	177	map<string, string>::const_iterator eit = hld.terms.find(*qit);
177	if (eit != hld.terms.end()) {	178	if (eit != hld.terms.end()) {
178	byRoot[eit->second].push_back(*qit);	179	byRoot[eit->second].push_back(*qit);
179	} else {	180	} else {
180	LOGDEB0("qualityTerms: [" << ((*qit)) << "] not found in hld\n" );	181	LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n");
181	byRoot[qit].push_back(qit);	182	byRoot[qit].push_back(qit);
182	}	183	}
183	}	184	}
184		185
185	#ifdef DEBUGABSTRACT	186	#ifdef DEBUGABSTRACT
186	{	187	{
187	string byRootstr;	188	string byRootstr;
188	for (map<string, vector<string> >::const_iterator debit =	189	for (map<string, vector<string> >::const_iterator debit =
189	byRoot.begin(); debit != byRoot.end(); debit++) {	190	byRoot.begin(); debit != byRoot.end(); debit++) {
190	byRootstr.append("[").append(debit->first).append("]->");	191	byRootstr.append("[").append(debit->first).append("]->");
191	for (vector<string>::const_iterator it = debit->second.begin();	192	for (vector<string>::const_iterator it = debit->second.begin();
192	it != debit->second.end(); it++) {	193	it != debit->second.end(); it++) {
193	byRootstr.append("[").append(*it).append("] ");	194	byRootstr.append("[").append(*it).append("] ");
194	}	195	}
195	byRootstr.append("\n");	196	byRootstr.append("\n");
196	}	197	}
197	LOGABS("\nqualityTerms: uterms to terms: " << (byRootstr) << "\n" );	198	LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n");
198	}	199	}
199	#endif	200	#endif
200		201
201	// Compute in-document and global frequencies for the groups.	202	// Compute in-document and global frequencies for the groups.
202	map<string, double> grpwdfs;	203	map<string, double> grpwdfs;
203	map<string, double> grptfreqs;	204	map<string, double> grptfreqs;
204	for (map<string, vector<string> >::const_iterator git = byRoot.begin();	205	for (map<string, vector<string> >::const_iterator git = byRoot.begin();
205	git != byRoot.end(); git++) {	206	git != byRoot.end(); git++) {
206	for (vector<string>::const_iterator qit = git->second.begin();	207	for (vector<string>::const_iterator qit = git->second.begin();
207	qit != git->second.end(); qit++) {	208	qit != git->second.end(); qit++) {
208	Xapian::TermIterator term = xrdb.termlist_begin(docid);	209	Xapian::TermIterator term = xrdb.termlist_begin(docid);
209	term.skip_to(*qit);	210	term.skip_to(*qit);
210	if (term != xrdb.termlist_end(docid) && term == qit) {	211	if (term != xrdb.termlist_end(docid) && term == qit) {
211	if (grpwdfs.find(git->first) != grpwdfs.end()) {	212	if (grpwdfs.find(git->first) != grpwdfs.end()) {
212	grpwdfs[git->first] = term.get_wdf() / doclen;	213	grpwdfs[git->first] = term.get_wdf() / doclen;
213	grptfreqs[git->first] = termfreqs[*qit];	214	grptfreqs[git->first] = termfreqs[*qit];
214	} else {	215	} else {
215	grpwdfs[git->first] += term.get_wdf() / doclen;	216	grpwdfs[git->first] += term.get_wdf() / doclen;
216	grptfreqs[git->first] += termfreqs[*qit];	217	grptfreqs[git->first] += termfreqs[*qit];
217	}	218	}
218	}	219	}
219	}	220	}
220	}	221	}
221		222
222	// Build a sorted by quality container for the groups	223	// Build a sorted by quality container for the groups
223	for (map<string, vector<string> >::const_iterator git = byRoot.begin();	224	for (map<string, vector<string> >::const_iterator git = byRoot.begin();
224	git != byRoot.end(); git++) {	225	git != byRoot.end(); git++) {
225	double q = (grpwdfs[git->first]) * grptfreqs[git->first];	226	double q = (grpwdfs[git->first]) * grptfreqs[git->first];
226	q = -log10(q);	227	q = -log10(q);
227	if (q < 3) {	228	if (q < 3) {
228	q = 0.05;	229	q = 0.05;
229	} else if (q < 4) {	230	} else if (q < 4) {
230	q = 0.3;	231	q = 0.3;
231	} else if (q < 5) {	232	} else if (q < 5) {
232	q = 0.7;	233	q = 0.7;
233	} else if (q < 6) {	234	} else if (q < 6) {
234	q = 0.8;	235	q = 0.8;
235	} else {	236	} else {
236	q = 1;	237	q = 1;
237	}	238	}
238	totalweight += q;	239	totalweight += q;
239	byQ.insert(pair<double, vector<string> >(q, git->second));	240	byQ.insert(pair<double, vector<string> >(q, git->second));
240	}	241	}
241		242
242	#ifdef DEBUGABSTRACT	243	#ifdef DEBUGABSTRACT
243	for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();	244	for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
244	mit != byQ.rend(); mit++) {	245	mit != byQ.rend(); mit++) {
245	LOGABS("qualityTerms: group\n" );	246	LOGABS("qualityTerms: group\n");
246	for (vector<string>::const_iterator qit = mit->second.begin();	247	for (vector<string>::const_iterator qit = mit->second.begin();
247	qit != mit->second.end(); qit++) {	248	qit != mit->second.end(); qit++) {
248	LOGABS("" << (mit->first) << "->[" << (qit) << "]\n" );	249	LOGABS("" << mit->first << "->[" << *qit << "]\n");
249	}	250	}
250	}	251	}
251	#endif	252	#endif
252	return totalweight;	253	return totalweight;
253	}	254	}
254		255
255	// Return page number for first match of "significant" term.	256	// Return page number for first match of "significant" term.
256	int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)	257	int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
257	{	258	{
258	LOGDEB("Query::Native::getFirstMatchPage\n");	259	LOGDEB("Query::Native::getFirstMatchPage\n");
259	if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {	260	if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
260	LOGERR("Query::getFirstMatchPage: no db\n" );	261	LOGERR("Query::getFirstMatchPage: no db\n");
261	return -1;	262	return -1;
262	}	263	}
263	Rcl::Db::Native *ndb(m_q->m_db->m_ndb);	264	Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
264	Xapian::Database& xrdb(ndb->xrdb);	265	Xapian::Database& xrdb(ndb->xrdb);
265		266
266	vector<string> terms;	267	vector<string> terms;
267	getMatchTerms(docid, terms);	268	getMatchTerms(docid, terms);
268		269
269	if (terms.empty()) {	270	if (terms.empty()) {
270	LOGDEB("getFirstMatchPage: empty match term list (field match?)\n" );	271	LOGDEB("getFirstMatchPage: empty match term list (field match?)\n");
271	return -1;	272	return -1;
272	}	273	}
273		274
274	vector<int> pagepos;	275	vector<int> pagepos;
275	ndb->getPagePositions(docid, pagepos);	276	ndb->getPagePositions(docid, pagepos);
276	if (pagepos.empty())	277	if (pagepos.empty())
277	return -1;	278	return -1;
278		279
279	setDbWideQTermsFreqs();	280	setDbWideQTermsFreqs();
280		281
281	// We try to use a page which matches the "best" term. Get a sorted list	282	// We try to use a page which matches the "best" term. Get a sorted list
282	multimap<double, vector<string> > byQ;	283	multimap<double, vector<string> > byQ;
283	qualityTerms(docid, terms, byQ);	284	qualityTerms(docid, terms, byQ);
284		285
285	for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();	286	for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
286	mit != byQ.rend(); mit++) {	287	mit != byQ.rend(); mit++) {
287	for (vector<string>::const_iterator qit = mit->second.begin();	288	for (vector<string>::const_iterator qit = mit->second.begin();
288	qit != mit->second.end(); qit++) {	289	qit != mit->second.end(); qit++) {
289	string qterm = *qit;	290	string qterm = *qit;
290	Xapian::PositionIterator pos;	291	Xapian::PositionIterator pos;
291	string emptys;	292	string emptys;
292	try {	293	try {
293	for (pos = xrdb.positionlist_begin(docid, qterm);	294	for (pos = xrdb.positionlist_begin(docid, qterm);
294	pos != xrdb.positionlist_end(docid, qterm); pos++) {	295	pos != xrdb.positionlist_end(docid, qterm); pos++) {
295	int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);	296	int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
296	if (pagenum > 0) {	297	if (pagenum > 0) {
297	term = qterm;	298	term = qterm;
298	return pagenum;	299	return pagenum;
299	}	300	}
300	}	301	}
301	} catch (...) {	302	} catch (...) {
302	// Term does not occur. No problem.	303	// Term does not occur. No problem.
303	}	304	}
304	}	305	}
305	}	306	}
306	return -1;	307	return -1;
307	}	308	}
308		309
309	// Build a document abstract by extracting text chunks around the query terms	310	// Build a document abstract by extracting text chunks around the query terms
310	// This uses the db termlists, not the original document.	311	// This uses the db termlists, not the original document.
311	//	312	//
312	// DatabaseModified and other general exceptions are caught and	313	// DatabaseModified and other general exceptions are caught and
313	// possibly retried by our caller	314	// possibly retried by our caller
314	int Query::Native::makeAbstract(Xapian::docid docid,	315	int Query::Native::makeAbstract(Xapian::docid docid,
315	vector<Snippet>& vabs,	316	vector<Snippet>& vabs,
316	int imaxoccs, int ictxwords)	317	int imaxoccs, int ictxwords)
317	{	318	{
318	Chrono chron;	319	Chrono chron;
319	LOGABS("makeAbstract: docid " << (long(docid)) << " imaxoccs " << (imaxoccs) << " ictxwords " << (ictxwords) << "\n" );	320	LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
		321	imaxoccs << " ictxwords " << ictxwords << "\n");
320		322
321	// The (unprefixed) terms matched by this document	323	// The (unprefixed) terms matched by this document
322	vector<string> matchedTerms;	324	vector<string> matchedTerms;
323	getMatchTerms(docid, matchedTerms);	325	getMatchTerms(docid, matchedTerms);
324	if (matchedTerms.empty()) {	326	if (matchedTerms.empty()) {
325	LOGDEB("makeAbstract::Empty term list\n" );	327	LOGDEB("makeAbstract:"<<chron.millis()<<"mS:Empty term list\n");
326	return ABSRES_ERROR;	328	return ABSRES_ERROR;
327	}	329	}
328		330
329	listList("Match terms: ", matchedTerms);	331	listList("Match terms: ", matchedTerms);
330		332
331	// Retrieve the term frequencies for the query terms. This is	333	// Retrieve the term frequencies for the query terms. This is
	...		...
337	// going to try and show text around the less common search terms.	339	// going to try and show text around the less common search terms.
338	// Terms issued from an original one by stem expansion are	340	// Terms issued from an original one by stem expansion are
339	// aggregated by the qualityTerms() routine.	341	// aggregated by the qualityTerms() routine.
340	multimap<double, vector<string> > byQ;	342	multimap<double, vector<string> > byQ;
341	double totalweight = qualityTerms(docid, matchedTerms, byQ);	343	double totalweight = qualityTerms(docid, matchedTerms, byQ);
342	LOGABS("makeAbstract:" << (chron.ms()) << ": computed Qcoefs.\n" );	344	LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
343	// This can't happen, but would crash us	345	// This can't happen, but would crash us
344	if (totalweight == 0.0) {	346	if (totalweight == 0.0) {
345	LOGERR("makeAbstract: totalweight == 0.0 !\n" );	347	LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
346	return ABSRES_ERROR;	348	return ABSRES_ERROR;
347	}	349	}
348		350
349	Rcl::Db::Native *ndb(m_q->m_db->m_ndb);	351	Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
350	Xapian::Database& xrdb(ndb->xrdb);	352	Xapian::Database& xrdb(ndb->xrdb);
351		353
	...		...
372	// average word size. It was a mistake to have the user max	374	// average word size. It was a mistake to have the user max
373	// abstract size parameter in characters, we basically only deal	375	// abstract size parameter in characters, we basically only deal
374	// with words. We used to limit the character size at the end, but	376	// with words. We used to limit the character size at the end, but
375	// this damaged our careful selection of terms	377	// this damaged our careful selection of terms
376	const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :	378	const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
377	m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));	379	m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
378	int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;	380	int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
379	LOGABS("makeAbstract:" << (chron.ms()) << ": mxttloccs " << (maxtotaloccs) << " ctxwords " << (ctxwords) << "\n" );	381	LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
		382	maxtotaloccs << " ctxwords " << ctxwords << "\n");
380		383
381	int ret = ABSRES_OK;	384	int ret = ABSRES_OK;
382		385
383	// Let's go populate	386	// Let's go populate
384	for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();	387	for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
385	mit != byQ.rend(); mit++) {	388	mit != byQ.rend(); mit++) {
386	unsigned int maxgrpoccs;	389	unsigned int maxgrpoccs;
387	double q;	390	double q;
388	if (byQ.size() == 1) {	391	if (byQ.size() == 1) {
389	maxgrpoccs = maxtotaloccs;	392	maxgrpoccs = maxtotaloccs;
390	q = 1.0;	393	q = 1.0;
391	} else {	394	} else {
392	// We give more slots to the better term groups	395	// We give more slots to the better term groups
393	q = mit->first / totalweight;	396	q = mit->first / totalweight;
394	maxgrpoccs = int(ceil(maxtotaloccs * q));	397	maxgrpoccs = int(ceil(maxtotaloccs * q));
395	}	398	}
396	unsigned int grpoccs = 0;	399	unsigned int grpoccs = 0;
397		400
398	for (vector<string>::const_iterator qit = mit->second.begin();	401	for (vector<string>::const_iterator qit = mit->second.begin();
399	qit != mit->second.end(); qit++) {	402	qit != mit->second.end(); qit++) {
400		403
401	// Group done ?	404	// Group done ?
402	if (grpoccs >= maxgrpoccs)	405	if (grpoccs >= maxgrpoccs)
403	break;	406	break;
404		407
405	string qterm = *qit;	408	string qterm = *qit;
406		409
407	LOGABS("makeAbstract: [" << (qterm) << "] " << (maxgrpoccs) << " max grp occs (coef " << (q) << ")\n" );	410	LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
		411	" max grp occs (coef " << q << ")\n");
408		412
409	// The match term may span several words	413	// The match term may span several words
410	int qtrmwrdcnt =	414	int qtrmwrdcnt =
411	TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);	415	TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
412		416
413	Xapian::PositionIterator pos;	417	Xapian::PositionIterator pos;
414	// There may be query terms not in this doc. This raises an	418	// There may be query terms not in this doc. This raises an
415	// exception when requesting the position list, we catch it ??	419	// exception when requesting the position list, we catch it ??
416	// Not clear how this can happen because we are walking the	420	// Not clear how this can happen because we are walking the
417	// match list returned by Xapian. Maybe something with the	421	// match list returned by Xapian. Maybe something with the
418	// fields?	422	// fields?
419	string emptys;	423	string emptys;
420	try {	424	try {
421	for (pos = xrdb.positionlist_begin(docid, qterm);	425	for (pos = xrdb.positionlist_begin(docid, qterm);
422	pos != xrdb.positionlist_end(docid, qterm); pos++) {	426	pos != xrdb.positionlist_end(docid, qterm); pos++) {
423	int ipos = *pos;	427	int ipos = *pos;
424	if (ipos < int(baseTextPosition)) // Not in text body	428	if (ipos < int(baseTextPosition)) // Not in text body
425	continue;	429	continue;
426	LOGABS("makeAbstract: [" << (qterm) << "] at pos " << (ipos) << " grpoccs " << (grpoccs) << " maxgrpoccs " << (maxgrpoccs) << "\n" );	430	LOGABS("makeAbstract: [" << qterm << "] at pos " <<
		431	ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
		432	maxgrpoccs << "\n");
427		433
428	totaloccs++;	434	totaloccs++;
429	grpoccs++;	435	grpoccs++;
430		436
431	// Add adjacent slots to the set to populate at next	437	// Add adjacent slots to the set to populate at next
432	// step by inserting empty strings. Special provisions	438	// step by inserting empty strings. Special provisions
433	// for adding ellipsis and for positions overlapped by	439	// for adding ellipsis and for positions overlapped by
434	// the match term.	440	// the match term.
435	unsigned int sta = MAX(int(baseTextPosition),	441	unsigned int sta = MAX(int(baseTextPosition),
436	ipos - ctxwords);	442	ipos - ctxwords);
437	unsigned int sto = ipos + qtrmwrdcnt-1 +	443	unsigned int sto = ipos + qtrmwrdcnt-1 +
438	m_q->m_db->getAbsCtxLen();	444	m_q->m_db->getAbsCtxLen();
439	for (unsigned int ii = sta; ii <= sto; ii++) {	445	for (unsigned int ii = sta; ii <= sto; ii++) {
440	if (ii == (unsigned int)ipos) {	446	if (ii == (unsigned int)ipos) {
441	sparseDoc[ii] = qterm;	447	sparseDoc[ii] = qterm;
442	searchTermPositions.insert(ii);	448	searchTermPositions.insert(ii);
443	if (ii > maxpos)	449	if (ii > maxpos)
444	maxpos = ii;	450	maxpos = ii;
445	} else if (ii > (unsigned int)ipos &&	451	} else if (ii > (unsigned int)ipos &&
446	ii < (unsigned int)ipos + qtrmwrdcnt) {	452	ii < (unsigned int)ipos + qtrmwrdcnt) {
447	sparseDoc[ii] = occupiedmarker;	453	sparseDoc[ii] = occupiedmarker;
448	} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {	454	} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
449	// For an empty slot, the test has a side	455	// For an empty slot, the test has a side
450	// effect of inserting an empty string which	456	// effect of inserting an empty string which
451	// is what we want.	457	// is what we want.
452	sparseDoc[ii] = emptys;	458	sparseDoc[ii] = emptys;
453	}	459	}
454	}	460	}
455	// Add ellipsis at the end. This may be replaced later by	461	// Add ellipsis at the end. This may be replaced later by
456	// an overlapping extract. Take care not to replace an	462	// an overlapping extract. Take care not to replace an
457	// empty string here, we really want an empty slot,	463	// empty string here, we really want an empty slot,
458	// use find()	464	// use find()
459	if (sparseDoc.find(sto+1) == sparseDoc.end()) {	465	if (sparseDoc.find(sto+1) == sparseDoc.end()) {
460	sparseDoc[sto+1] = cstr_ellipsis;	466	sparseDoc[sto+1] = cstr_ellipsis;
461	}	467	}
462		468
463	// Group done ?	469	// Group done ?
464	if (grpoccs >= maxgrpoccs) {	470	if (grpoccs >= maxgrpoccs) {
465	ret |= ABSRES_TRUNC;	471	ret |= ABSRES_TRUNC;
466	LOGABS("Db::makeAbstract: max group occs cutoff\n" );	472	LOGABS("Db::makeAbstract: max group occs cutoff\n");
467	break;	473	break;
468	}	474	}
469	// Global done ?	475	// Global done ?
470	if (totaloccs >= maxtotaloccs) {	476	if (totaloccs >= maxtotaloccs) {
471	ret |= ABSRES_TRUNC;	477	ret |= ABSRES_TRUNC;
472	LOGABS("Db::makeAbstract: max occurrences cutoff\n" );	478	LOGABS("Db::makeAbstract: max occurrences cutoff\n");
473	break;	479	break;
474	}	480	}
475	}	481	}
476	} catch (...) {	482	} catch (...) {
477	// Term does not occur. No problem.	483	// Term does not occur. No problem.
478	}	484	}
479		485
480	if (totaloccs >= maxtotaloccs) {	486	if (totaloccs >= maxtotaloccs) {
481	ret |= ABSRES_TRUNC;	487	ret |= ABSRES_TRUNC;
482	LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );	488	LOGABS("Db::makeAbstract: max1 occurrences cutoff\n");
483	break;	489	break;
484	}	490	}
485	}	491	}
486	}	492	}
487	maxpos += ctxwords + 1;	493	maxpos += ctxwords + 1;
488		494
489	LOGABS("makeAbstract:" << (chron.millis()) << ":chosen number of positions " << (totaloccs) << "\n" );	495	LOGABS("makeAbstract:" << chron.millis() <<
		496	"mS:chosen number of positions " << totaloccs << "\n");
490	// This can happen if there are term occurrences in the keywords	497	// This can happen if there are term occurrences in the keywords
491	// etc. but not elsewhere ?	498	// etc. but not elsewhere ?
492	if (totaloccs == 0) {	499	if (totaloccs == 0) {
493	LOGDEB("makeAbstract: no occurrences\n" );	500	LOGDEB("makeAbstract: no occurrences\n");
494	return ABSRES_OK;	501	return ABSRES_OK;
495	}	502	}
496		503
497	// Walk all document's terms position lists and populate slots	504	// Walk all document's terms position lists and populate slots
498	// around the query terms. We arbitrarily truncate the list to	505	// around the query terms. We arbitrarily truncate the list to
499	// avoid taking forever. If we do cutoff, the abstract may be	506	// avoid taking forever. If we do cutoff, the abstract may be
500	// inconsistent (missing words, potentially altering meaning),	507	// inconsistent (missing words, potentially altering meaning),
501	// which is bad.	508	// which is bad.
502	{	509	{
503	Xapian::TermIterator term;	510	Xapian::TermIterator term;
504	int cutoff = m_q->m_snipMaxPosWalk;	511	int cutoff = m_q->m_snipMaxPosWalk;
505	for (term = xrdb.termlist_begin(docid);	512	for (term = xrdb.termlist_begin(docid);
506	term != xrdb.termlist_end(docid); term++) {	513	term != xrdb.termlist_end(docid); term++) {
507	// Ignore prefixed terms	514	// Ignore prefixed terms
508	if (has_prefix(*term))	515	if (has_prefix(*term))
509	continue;	516	continue;
510	if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {	517	if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
511	ret |= ABSRES_TERMMISS;	518	ret |= ABSRES_TERMMISS;
512	LOGDEB0("makeAbstract: max term count cutoff " << (m_q->m_snipMaxPosWalk) << "\n" );	519	LOGDEB0("makeAbstract: max term count cutoff " <<
513	break;	520	m_q->m_snipMaxPosWalk << "\n");
		521	break;
		522	}
		523
		524	map<unsigned int, string>::iterator vit;
		525	Xapian::PositionIterator pos;
		526	for (pos = xrdb.positionlist_begin(docid, *term);
		527	pos != xrdb.positionlist_end(docid, *term); pos++) {
		528	if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
		529	ret |= ABSRES_TERMMISS;
		530	LOGDEB0("makeAbstract: max term count cutoff " <<
		531	m_q->m_snipMaxPosWalk << "\n");
		532	break;
		533	}
		534	// If we are beyond the max possible position, stop
		535	// for this term
		536	if (*pos > maxpos) {
		537	break;
		538	}
		539	if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
		540	// Don't replace a term: the terms list is in
		541	// alphabetic order, and we may have several terms
		542	// at the same position, we want to keep only the
		543	// first one (ie: dockes and dockes@wanadoo.fr)
		544	if (vit->second.empty()) {
		545	LOGDEB2("makeAbstract: populating: [" << *term <<
		546	"] at " << *pos << "\n");
		547	sparseDoc[*pos] = *term;
		548	}
		549	}
		550	}
		551	}
514	}	552	}
515		553	LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");
516	map<unsigned int, string>::iterator vit;
517	Xapian::PositionIterator pos;
518	for (pos = xrdb.positionlist_begin(docid, *term);
519	pos != xrdb.positionlist_end(docid, *term); pos++) {
520	if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
521	ret |= ABSRES_TERMMISS;
522	LOGDEB0("makeAbstract: max term count cutoff " << (m_q->m_snipMaxPosWalk) << "\n" );
523	break;
524	}
525	// If we are beyond the max possible position, stop
526	// for this term
527	if (*pos > maxpos) {
528	break;
529	}
530	if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
531	// Don't replace a term: the terms list is in
532	// alphabetic order, and we may have several terms
533	// at the same position, we want to keep only the
534	// first one (ie: dockes and dockes@wanadoo.fr)
535	if (vit->second.empty()) {
536	LOGDEB2("makeAbstract: populating: [" << ((*term)) << "] at " << (*pos) << "\n" );
537	sparseDoc[*pos] = *term;
538	}
539	}
540	}
541	}