recoll / Code / Diff of /src/rcldb/rclabstract.cpp

Diff of /src/rcldb/rclabstract.cpp [5c46db] .. [b99372]

Switch to side-by-side view

--- a/src/rcldb/rclabstract.cpp
+++ b/src/rcldb/rclabstract.cpp
@@ -1,4 +1,4 @@
-/* Copyright (C) 2004 J.F.Dockes
+/* Copyright (C) 2004-2017 J.F.Dockes
  *   This program is free software; you can redistribute it and/or modify
  *   it under the terms of the GNU General Public License as published by
  *   the Free Software Foundation; either version 2 of the License, or
@@ -19,6 +19,9 @@
 #include <math.h>
 
 #include <map>
+#include <unordered_map>
+#include <deque>
+#include <algorithm>
 
 #include "log.h"
 #include "rcldb.h"
@@ -33,30 +36,22 @@
 
 using namespace std;
 
+
 namespace Rcl {
+
 // This is used as a marker inside the abstract frag lists, but
 // normally doesn't remain in final output (which is built with a
 // custom sep. by our caller).
 static const string cstr_ellipsis("...");
+static const string emptys;
 // This is used to mark positions overlapped by a multi-word match term
 static const string occupiedmarker("?");
 
-#undef DEBUGABSTRACT  
+#define DEBUGABSTRACT  
 #ifdef DEBUGABSTRACT
 #define LOGABS LOGDEB
-static void listList(const string& what, const vector<string>&l)
-{
-    string a;
-    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
-        a = a + *it + " ";
-    }
-    LOGDEB(""  << (what) << ": "  << (a) << "\n" );
-}
 #else
 #define LOGABS LOGDEB2
-static void listList(const string&, const vector<string>&)
-{
-}
 #endif
 
 // Unprefix terms. Actually it's not completely clear if we should
@@ -66,13 +61,12 @@
 static const bool prune_prefixed_terms = true; 
 static void noPrefixList(const vector<string>& in, vector<string>& out) 
 {
-    for (vector<string>::const_iterator qit = in.begin(); 
-	 qit != in.end(); qit++) {
-	if (prune_prefixed_terms) {
-	    if (has_prefix(*qit))
-		continue;
-	}
-	out.push_back(strip_prefix(*qit));
+    for (const auto& term : in) {
+        if (prune_prefixed_terms) {
+            if (has_prefix(term))
+                continue;
+        }
+        out.push_back(strip_prefix(term));
     }
     sort(out.begin(), out.end());
     vector<string>::iterator it = unique(out.begin(), out.end());
@@ -82,8 +76,8 @@
 bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
 {
     if (!xenquire) {
-	LOGERR("Query::getMatchTerms: no query opened\n" );
-	return false;
+        LOGERR("Query::getMatchTerms: no query opened\n");
+        return false;
     }
 
     terms.clear();
@@ -95,8 +89,8 @@
                         xenquire->get_matching_terms_end(id)),
            m_q->m_db->m_ndb->xrdb, m_q->m_reason);
     if (!m_q->m_reason.empty()) {
-	LOGERR("getMatchTerms: xapian error: "  << (m_q->m_reason) << "\n" );
-	return false;
+        LOGERR("getMatchTerms: xapian error: " << m_q->m_reason << "\n");
+        return false;
     }
     noPrefixList(iterms, terms);
     return true;
@@ -109,25 +103,25 @@
 {
     // Do it once only for a given query.
     if (!termfreqs.empty())
-	return;
+        return;
 
     vector<string> qterms;
     {
-	vector<string> iqterms;
-	m_q->getQueryTerms(iqterms);
-	noPrefixList(iqterms, qterms);
-    }
-    // listList("Query terms: ", qterms);
+        vector<string> iqterms;
+        m_q->getQueryTerms(iqterms);
+        noPrefixList(iqterms, qterms);
+    }
+    LOGDEB("Query terms: " << stringsToString(qterms) << endl);
     Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
 
     double doccnt = xrdb.get_doccount();
     if (doccnt == 0) 
-	doccnt = 1;
-
-    for (vector<string>::const_iterator qit = qterms.begin(); 
-	 qit != qterms.end(); qit++) {
-	termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
-	LOGABS("setDbWideQTermFreqs: ["  << (qit) << "] db freq "  << (termfreqs[*qit]) << "\n" );
+        doccnt = 1;
+
+    for (const auto& term : qterms) {
+        termfreqs[term] = xrdb.get_termfreq(term) / doccnt;
+        LOGABS("setDbWideQTermFreqs: [" << term << "] db freq " <<
+               termfreqs[term] << "\n");
     }
 }
 
@@ -143,10 +137,10 @@
 // occurrences, and let the frequency for each group member be the
 // aggregated frequency.
 double Query::Native::qualityTerms(Xapian::docid docid, 
-				   const vector<string>& terms,
-				   multimap<double, vector<string> >& byQ)
-{
-    LOGABS("qualityTerms\n" );
+                                   const vector<string>& terms,
+                                   multimap<double, vector<string> >& byQ)
+{
+    LOGABS("qualityTerms\n");
     setDbWideQTermsFreqs();
 
     map<string, double> termQcoefs;
@@ -155,110 +149,97 @@
     Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
     double doclen = xrdb.get_doclength(docid);
     if (doclen == 0) 
-	doclen = 1;
+        doclen = 1;
     HighlightData hld;
     if (m_q->m_sd) {
-	m_q->m_sd->getTerms(hld);
+        m_q->m_sd->getTerms(hld);
+    }
+
+    // Group the input terms by the user term they were possibly
+    // expanded from (by stemming)
+    map<string, vector<string> > byRoot;
+    for (const auto& term: terms) {
+        map<string, string>::const_iterator eit = hld.terms.find(term);
+        if (eit != hld.terms.end()) {
+            byRoot[eit->second].push_back(term);
+        } else {
+            LOGDEB0("qualityTerms: [" << term << "] not found in hld\n");
+            byRoot[term].push_back(term);
+        }
     }
 
 #ifdef DEBUGABSTRACT
     {
-	string deb;
-	hld.toString(deb);
-	LOGABS("qualityTerms: hld: "  << (deb) << "\n" );
-    }
-#endif
-
-    // Group the input terms by the user term they were possibly expanded from
-    map<string, vector<string> > byRoot;
-    for (vector<string>::const_iterator qit = terms.begin(); 
-	 qit != terms.end(); qit++) {
-	map<string, string>::const_iterator eit = hld.terms.find(*qit);
-	if (eit != hld.terms.end()) {
-	    byRoot[eit->second].push_back(*qit);
-	} else {
-	    LOGDEB0("qualityTerms: ["  << ((*qit)) << "] not found in hld\n" );
-	    byRoot[*qit].push_back(*qit);
-	}
-    }
-
-#ifdef DEBUGABSTRACT
-    {
-	string byRootstr;
-	for (map<string, vector<string> >::const_iterator debit = 
-		 byRoot.begin();  debit != byRoot.end(); debit++) {
-	    byRootstr.append("[").append(debit->first).append("]->");
-	    for (vector<string>::const_iterator it = debit->second.begin();
-		 it != debit->second.end(); it++) {
-		byRootstr.append("[").append(*it).append("] ");
-	    }
-	    byRootstr.append("\n");
-	}
-	LOGABS("\nqualityTerms: uterms to terms: "  << (byRootstr) << "\n" );
+        string deb;
+        hld.toString(deb);
+        LOGABS("qualityTerms: hld: " << deb << "\n");
+        string byRootstr;
+        for (const auto& entry : byRoot) {
+            byRootstr.append("[").append(entry.first).append("]->");
+            for (const auto& term : entry.second) {
+                byRootstr.append("[").append(term).append("] ");
+            }
+            byRootstr.append("\n");
+        }
+        LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n");
     }
 #endif
 
     // Compute in-document and global frequencies for the groups.
     map<string, double> grpwdfs;
     map<string, double> grptfreqs;
-    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
-	 git != byRoot.end(); git++) {
-	for (vector<string>::const_iterator qit = git->second.begin(); 
-	     qit != git->second.end(); qit++) {
-	    Xapian::TermIterator term = xrdb.termlist_begin(docid);
-	    term.skip_to(*qit);
-	    if (term != xrdb.termlist_end(docid) && *term == *qit) {
-		if (grpwdfs.find(git->first) != grpwdfs.end()) {
-		    grpwdfs[git->first] = term.get_wdf() / doclen;
-		    grptfreqs[git->first] = termfreqs[*qit];
-		} else {
-		    grpwdfs[git->first] += term.get_wdf() / doclen;
-		    grptfreqs[git->first] += termfreqs[*qit];
-		}
-	    }
-	}    
+    for (const auto& group : byRoot) {
+        for (const auto& term : group.second) {
+            Xapian::TermIterator xtermit = xrdb.termlist_begin(docid);
+            xtermit.skip_to(term);
+            if (xtermit != xrdb.termlist_end(docid) && *xtermit == term) {
+                if (grpwdfs.find(group.first) != grpwdfs.end()) {
+                    grpwdfs[group.first] = xtermit.get_wdf() / doclen;
+                    grptfreqs[group.first] = termfreqs[term];
+                } else {
+                    grpwdfs[group.first] += xtermit.get_wdf() / doclen;
+                    grptfreqs[group.first] += termfreqs[term];
+                }
+            }
+        }    
     }
 
     // Build a sorted by quality container for the groups
-    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
-	 git != byRoot.end(); git++) {
-	double q = (grpwdfs[git->first]) * grptfreqs[git->first];
-	q = -log10(q);
-	if (q < 3) {
-	    q = 0.05;
-	} else if (q < 4) {
-	    q = 0.3;
-	} else if (q < 5) {
-	    q = 0.7;
-	} else if (q < 6) {
-	    q = 0.8;
-	} else {
-	    q = 1;
-	}
-	totalweight += q;
-	byQ.insert(pair<double, vector<string> >(q, git->second));
+    for (const auto& group : byRoot) {
+        double q = (grpwdfs[group.first]) * grptfreqs[group.first];
+        q = -log10(q);
+        if (q < 3) {
+            q = 0.05;
+        } else if (q < 4) {
+            q = 0.3;
+        } else if (q < 5) {
+            q = 0.7;
+        } else if (q < 6) {
+            q = 0.8;
+        } else {
+            q = 1;
+        }
+        totalweight += q;
+        byQ.insert(pair<double, vector<string> >(q, group.second));
     }
 
 #ifdef DEBUGABSTRACT
-    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); 
-	 mit != byQ.rend(); mit++) {
-	LOGABS("qualityTerms: group\n" );
-	for (vector<string>::const_iterator qit = mit->second.begin();
-	     qit != mit->second.end(); qit++) {
-	    LOGABS(""  << (mit->first) << "->["  << (qit) << "]\n" );
-	}
+    for (auto mit= byQ.rbegin(); mit != byQ.rend(); mit++) {
+        LOGABS("qualityTerms: coef: " << mit->first << " group: " <<
+               stringsToString(mit->second) << endl);
     }
 #endif
     return totalweight;
 }
 
+
 // Return page number for first match of "significant" term.
 int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
 {
     LOGDEB("Query::Native::getFirstMatchPage\n");
     if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
-	LOGERR("Query::getFirstMatchPage: no db\n" );
-	return -1;
+        LOGERR("Query::getFirstMatchPage: no db\n");
+        return -1;
     }
     Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
     Xapian::Database& xrdb(ndb->xrdb);
@@ -267,93 +248,249 @@
     getMatchTerms(docid, terms);
 
     if (terms.empty()) {
-	LOGDEB("getFirstMatchPage: empty match term list (field match?)\n" );
-	return -1;
+        LOGDEB("getFirstMatchPage: empty match term list (field match?)\n");
+        return -1;
     }
 
     vector<int> pagepos;
     ndb->getPagePositions(docid, pagepos);
     if (pagepos.empty())
-	return -1;
-	
+        return -1;
+        
     setDbWideQTermsFreqs();
 
     // We try to use a page which matches the "best" term. Get a sorted list
     multimap<double, vector<string> > byQ;
     qualityTerms(docid, terms, byQ);
 
-    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
-	 mit != byQ.rend(); mit++) {
-	for (vector<string>::const_iterator qit = mit->second.begin();
-	     qit != mit->second.end(); qit++) {
-	    string qterm = *qit;
-	    Xapian::PositionIterator pos;
-	    string emptys;
-	    try {
-		for (pos = xrdb.positionlist_begin(docid, qterm); 
-		     pos != xrdb.positionlist_end(docid, qterm); pos++) {
-		    int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
-		    if (pagenum > 0) {
-			term = qterm;
-			return pagenum;
-		    }
-		}
-	    } catch (...) {
-		// Term does not occur. No problem.
-	    }
-	}
+    for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) {
+        for (vector<string>::const_iterator qit = mit->second.begin();
+             qit != mit->second.end(); qit++) {
+            string qterm = *qit;
+            Xapian::PositionIterator pos;
+            string emptys;
+            try {
+                for (pos = xrdb.positionlist_begin(docid, qterm);
+                     pos != xrdb.positionlist_end(docid, qterm); pos++) {
+                    int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
+                    if (pagenum > 0) {
+                        term = qterm;
+                        return pagenum;
+                    }
+                }
+            } catch (...) {
+                // Term does not occur. No problem.
+            }
+        }
     }
     return -1;
 }
 
-// Build a document abstract by extracting text chunks around the query terms
-// This uses the db termlists, not the original document.
-//
-// DatabaseModified and other general exceptions are catched and
-// possibly retried by our caller
-int Query::Native::makeAbstract(Xapian::docid docid,
-				vector<Snippet>& vabs, 
-				int imaxoccs, int ictxwords)
-{
-    Chrono chron;
-    LOGABS("makeAbstract: docid "  << (long(docid)) << " imaxoccs "  << (imaxoccs) << " ictxwords "  << (ictxwords) << "\n" );
-
-    // The (unprefixed) terms matched by this document
-    vector<string> matchedTerms;
-    getMatchTerms(docid, matchedTerms);
-    if (matchedTerms.empty()) {
-	LOGDEB("makeAbstract::Empty term list\n" );
-	return ABSRES_ERROR;
-    }
-
-    listList("Match terms: ", matchedTerms);
-
-    // Retrieve the term frequencies for the query terms. This is
-    // actually computed only once for a query, and for all terms in
-    // the query (not only the matches for this doc)
-    setDbWideQTermsFreqs();
-
-    // Build a sorted by quality container for the match terms We are
-    // going to try and show text around the less common search terms.
-    // Terms issued from an original one by stem expansion are
-    // aggregated by the qualityTerms() routine.
-    multimap<double, vector<string> > byQ;
-    double totalweight = qualityTerms(docid, matchedTerms, byQ);
-    LOGABS("makeAbstract:"  << (chron.ms()) << ": computed Qcoefs.\n" );
-    // This can't happen, but would crash us
-    if (totalweight == 0.0) {
-	LOGERR("makeAbstract: totalweight == 0.0 !\n" );
-	return ABSRES_ERROR;
-    }
-
-    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
+// Creating the abstract from index position data: populate the sparse
+// array with the positions for a given query term, and mark the
+// neighboring positions.
+void Query::Native::abstractPopulateQTerm(
+    Xapian::Database& xrdb,
+    Xapian::docid docid,
+    const string& qterm,
+    int qtrmwrdcnt,
+    int ctxwords,
+    unsigned int maxgrpoccs,
+    unsigned int maxtotaloccs,
+    map<unsigned int, string>& sparseDoc,
+    unordered_set<unsigned int>& searchTermPositions,
+    unsigned int& maxpos,
+    unsigned int& totaloccs,
+    unsigned int& grpoccs,
+    int& ret
+    )
+{
+    Xapian::PositionIterator pos;
+
+    // Walk the position list for this term.
+    for (pos = xrdb.positionlist_begin(docid, qterm);
+         pos != xrdb.positionlist_end(docid, qterm); pos++) {
+        int ipos = *pos;
+        if (ipos < int(baseTextPosition)) // Not in text body
+            continue;
+        LOGABS("makeAbstract: [" << qterm << "] at pos " <<
+               ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
+               maxgrpoccs << "\n");
+
+        totaloccs++;
+        grpoccs++;
+
+        // Add adjacent slots to the set to populate at next
+        // step by inserting empty strings. Special provisions
+        // for adding ellipsis and for positions overlapped by
+        // the match term.
+        unsigned int sta = MAX(int(baseTextPosition), 
+                               ipos - ctxwords);
+        unsigned int sto = ipos + qtrmwrdcnt-1 + 
+            m_q->m_db->getAbsCtxLen();
+        for (unsigned int ii = sta; ii <= sto;  ii++) {
+            if (ii == (unsigned int)ipos) {
+                sparseDoc[ii] = qterm;
+                searchTermPositions.insert(ii);
+                if (ii > maxpos)
+                    maxpos = ii;
+            } else if (ii > (unsigned int)ipos && 
+                       ii < (unsigned int)ipos + qtrmwrdcnt) {
+                // Position for another word of the multi-word term
+                sparseDoc[ii] = occupiedmarker;
+            } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
+                // For an empty slot, the test above has a side
+                // effect of inserting an empty string which
+                // is what we want. Do it also if it was an ellipsis
+                sparseDoc[ii] = emptys;
+            }
+        }
+        // Add ellipsis at the end. This may be replaced later by
+        // an overlapping extract. Take care not to replace an
+        // empty string here, we really want an empty slot,
+        // use find()
+        if (sparseDoc.find(sto+1) == sparseDoc.end()) {
+            sparseDoc[sto+1] = cstr_ellipsis;
+        }
+
+        // Group done ?
+        if (grpoccs >= maxgrpoccs) {
+            ret |= ABSRES_TRUNC;
+            LOGABS("Db::makeAbstract: max group occs cutoff\n");
+            break;
+        }
+        // Global done ?
+        if (totaloccs >= maxtotaloccs) {
+            ret |= ABSRES_TRUNC;
+            LOGABS("Db::makeAbstract: max occurrences cutoff\n");
+            break;
+        }
+    }
+}
+
+// Creating the abstract from index position data: after the query
+// terms have been inserted at their place in the sparse array, and
+// the neighboring positions marked, populate the neighbours: for each
+// term in the document, walk its position list and populate slots
+// around the query terms. We arbitrarily truncate the list to avoid
+// taking forever. If we do cutoff, the abstract may be inconsistant
+// (missing words, potentially altering meaning), which is bad.
+void Query::Native::abstractPopulateContextTerms(
+    Xapian::Database& xrdb,
+    Xapian::docid docid,
+    unsigned int maxpos,
+    map<unsigned int, string>& sparseDoc,
+    int& ret
+    )
+{
+    Xapian::TermIterator term;
+    int cutoff = m_q->m_snipMaxPosWalk;
+    for (term = xrdb.termlist_begin(docid);
+         term != xrdb.termlist_end(docid); term++) {
+        // Ignore prefixed terms
+        if (has_prefix(*term))
+            continue;
+        if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
+            ret |= ABSRES_TERMMISS;
+            LOGDEB0("makeAbstract: max term count cutoff " <<
+                    m_q->m_snipMaxPosWalk << "\n");
+            break;
+        }
+
+        map<unsigned int, string>::iterator vit;
+        Xapian::PositionIterator pos;
+        for (pos = xrdb.positionlist_begin(docid, *term);
+             pos != xrdb.positionlist_end(docid, *term); pos++) {
+            if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
+                ret |= ABSRES_TERMMISS;
+                LOGDEB0("makeAbstract: max term count cutoff " <<
+                        m_q->m_snipMaxPosWalk << "\n");
+                break;
+            }
+            // If we are beyond the max possible position, stop
+            // for this term
+            if (*pos > maxpos) {
+                break;
+            }
+            if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
+                // Don't replace a term: the terms list is in
+                // alphabetic order, and we may have several terms
+                // at the same position, we want to keep only the
+                // first one (ie: dockes and dockes@wanadoo.fr)
+                if (vit->second.empty()) {
+                    LOGDEB2("makeAbstract: populating: [" << *term <<
+                            "] at " << *pos << "\n");
+                    sparseDoc[*pos] = *term;
+                }
+            }
+        }
+    }
+}
+
+// Creating the abstract from position data: final phase: extract the
+// snippets from the sparse array.
+void Query::Native::abstractCreateSnippetsVector(
+    Rcl::Db::Native *ndb,
+    map<unsigned int, string>& sparseDoc,
+    unordered_set<unsigned int>& searchTermPositions,
+    vector<int>& vpbreaks,
+    vector<Snippet>& vabs)
+{
+    vabs.clear();
+    string chunk;
+    bool incjk = false;
+    int page = 0;
+    string term;
+
+    for (const auto& ent : sparseDoc) {
+        LOGDEB2("Abtract:output "<< ent.first <<" -> [" <<ent.second <<"]\n");
+        if (!occupiedmarker.compare(ent.second)) {
+            LOGDEB("Abstract: qtrm position not filled ??\n");
+            continue;
+        }
+        if (chunk.empty() && !vpbreaks.empty()) {
+            page =  ndb->getPageNumberForPosition(vpbreaks, ent.first);
+            if (page < 0) 
+                page = 0;
+            term.clear();
+        }
+        Utf8Iter uit(ent.second);
+        bool newcjk = false;
+        if (TextSplit::isCJK(*uit))
+            newcjk = true;
+        if (!incjk || (incjk && !newcjk))
+            chunk += " ";
+        incjk = newcjk;
+        if (searchTermPositions.find(ent.first) != searchTermPositions.end())
+            term = ent.second;
+        if (ent.second == cstr_ellipsis) {
+            vabs.push_back(Snippet(page, chunk).setTerm(term));
+            chunk.clear();
+        } else {
+            if (ent.second.compare(end_of_field_term) && 
+                ent.second.compare(start_of_field_term))
+                chunk += ent.second;
+        }
+    }
+    if (!chunk.empty())
+        vabs.push_back(Snippet(page, chunk).setTerm(term));
+}
+
+// Creating the abstract from index position data: top level routine
+int Query::Native::abstractFromIndex(
+    Rcl::Db::Native *ndb,
+    Xapian::docid docid,
+    const vector<string>& matchTerms,
+    const multimap<double, vector<string>> byQ,
+    double totalweight,
+    int ctxwords,
+    unsigned int maxtotaloccs,
+    vector<Snippet>& vabs,
+    Chrono& chron
+    )
+{
     Xapian::Database& xrdb(ndb->xrdb);
-
-    ///////////////////
-    // For each of the query terms, ask xapian for its positions list
-    // in the document. For each position entry, insert it and its
-    // neighbours in the set of 'interesting' positions
-
+    int ret = ABSRES_OK;
     // The terms 'array' that we partially populate with the document
     // terms, at their positions around the search terms positions:
     map<unsigned int, string> sparseDoc;
@@ -368,247 +505,160 @@
     // Total number of occurences for all terms. We stop when we have too much
     unsigned int totaloccs = 0;
 
+    // First pass to populate the sparse document: we walk the term
+    // groups, beginning with the better ones, and insert each term at
+    // its position. We also insert empty strings at the surrounding
+    // positions. These are markers showing where we should insert
+    // data during the next pass.
+    for (auto mit = byQ.rbegin(); mit != byQ.rend(); mit++) {
+        unsigned int maxgrpoccs;
+        double q;
+        if (byQ.size() == 1) {
+            maxgrpoccs = maxtotaloccs;
+            q = 1.0;
+        } else {
+            // We give more slots to the better term groups
+            q = mit->first / totalweight;
+            maxgrpoccs = int(ceil(maxtotaloccs * q));
+        }
+        unsigned int grpoccs = 0;
+
+        // For each term in user term expansion group
+        for (const auto& qterm : mit->second) {
+            // Enough for this group ?
+            if (grpoccs >= maxgrpoccs) 
+                break;
+
+            LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
+                   " max grp occs (coef " << q << ")\n");
+
+            // The match term may span several words (more than one position)
+            int qtrmwrdcnt = 
+                TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
+
+            // Populate positions for this query term.
+            // There may be query terms not in this doc. This raises an
+            // exception when requesting the position list, we catch it ??
+            // Not clear how this can happen because we are walking the
+            // match list returned by Xapian. Maybe something with the
+            // fields?
+            try {
+                abstractPopulateQTerm(xrdb, docid, qterm, qtrmwrdcnt, ctxwords,
+                                      maxgrpoccs,maxtotaloccs, sparseDoc,
+                                      searchTermPositions, maxpos, totaloccs,
+                                      grpoccs, ret);
+            } catch (...) {
+                // Term does not occur. No problem.
+            }
+
+            if (totaloccs >= maxtotaloccs) {
+                ret |= ABSRES_TRUNC;
+                LOGABS("Db::makeAbstract: max1 occurrences cutoff\n");
+                break;
+            }
+        }
+    }
+    maxpos += ctxwords + 1;
+
+    LOGABS("makeAbstract:" << chron.millis() <<
+           "mS:chosen number of positions " << totaloccs << "\n");
+
+    // This can happen if there are term occurences in the keywords
+    // etc. but not elsewhere ?
+    if (totaloccs == 0) {
+        LOGDEB("makeAbstract: no occurrences\n");
+        return ABSRES_OK;
+    }
+
+    abstractPopulateContextTerms(xrdb, docid, maxpos, sparseDoc, ret);
+    
+    LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");
+
+    vector<int> vpbreaks;
+    ndb->getPagePositions(docid, vpbreaks);
+
+    LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
+           vpbreaks.size() << " pages\n");
+
+    // Finally build the abstract by walking the map (in order of position)
+    abstractCreateSnippetsVector(ndb, sparseDoc, searchTermPositions,
+                                 vpbreaks, vabs);
+    
+    LOGABS("makeAbtract: done in " << chron.millis() << " mS\n");
+    return ret;
+}
+
+
+// Build a document abstract by extracting text chunks around the
+// query terms.  This can either uses the index position lists, or the
+// stored document text, with very different implementations.
+//
+// DatabaseModified and other general exceptions are catched and
+// possibly retried by our caller.
+//
+// @param[out] vabs the abstract is returned as a vector of snippets.
+int Query::Native::makeAbstract(Xapian::docid docid,
+                                vector<Snippet>& vabs, 
+                                int imaxoccs, int ictxwords)
+{
+    Chrono chron;
+    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
+           imaxoccs << " ictxwords " << ictxwords << "\n");
+
+    // The (unprefixed) terms matched by this document
+    vector<string> matchedTerms;
+    getMatchTerms(docid, matchedTerms);
+    if (matchedTerms.empty()) {
+        LOGDEB("makeAbstract:" << chron.millis() << "mS:Empty term list\n");
+        return ABSRES_ERROR;
+    }
+
+    LOGDEB("Match terms: " << stringsToString(matchedTerms) << endl);
+
+    // Retrieve the term frequencies for the query terms. This is
+    // actually computed only once for a query, and for all terms in
+    // the query (not only the matches for this doc)
+    setDbWideQTermsFreqs();
+
+    // Build a sorted by quality container for the match terms We are
+    // going to try and show text around the less common search terms.
+    // Terms issued from an original one by stem expansion are
+    // aggregated by the qualityTerms() routine (this is what we call
+    // 'term groups' in the following: index terms expanded from the
+    // same user term).
+    multimap<double, vector<string>> byQ;
+    double totalweight = qualityTerms(docid, matchedTerms, byQ);
+    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
+    // This can't happen, but would crash us
+    if (totalweight == 0.0) {
+        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
+        return ABSRES_ERROR;
+    }
+
+    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
+    Xapian::Database& xrdb(ndb->xrdb);
+
     // Total number of slots we populate. The 7 is taken as
     // average word size. It was a mistake to have the user max
     // abstract size parameter in characters, we basically only deal
     // with words. We used to limit the character size at the end, but
     // this damaged our careful selection of terms
     const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
-	m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
+        m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
     int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
-    LOGABS("makeAbstract:"  << (chron.ms()) << ": mxttloccs "  << (maxtotaloccs) << " ctxwords "  << (ctxwords) << "\n" );
-
-    int ret = ABSRES_OK;
-
-    // Let's go populate
-    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
-	 mit != byQ.rend(); mit++) {
-	unsigned int maxgrpoccs;
-	double q;
-	if (byQ.size() == 1) {
-	    maxgrpoccs = maxtotaloccs;
-	    q = 1.0;
-	} else {
-	    // We give more slots to the better term groups
-	    q = mit->first / totalweight;
-	    maxgrpoccs = int(ceil(maxtotaloccs * q));
-	}
-	unsigned int grpoccs = 0;
-
-	for (vector<string>::const_iterator qit = mit->second.begin();
-	     qit != mit->second.end(); qit++) {
-
-	    // Group done ?
-	    if (grpoccs >= maxgrpoccs) 
-		break;
-
-	    string qterm = *qit;
-
-	    LOGABS("makeAbstract: ["  << (qterm) << "] "  << (maxgrpoccs) << " max grp occs (coef "  << (q) << ")\n" );
-
-	    // The match term may span several words
-	    int qtrmwrdcnt = 
-		TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
-
-	    Xapian::PositionIterator pos;
-	    // There may be query terms not in this doc. This raises an
-	    // exception when requesting the position list, we catch it ??
-	    // Not clear how this can happen because we are walking the
-	    // match list returned by Xapian. Maybe something with the
-	    // fields?
-	    string emptys;
-	    try {
-		for (pos = xrdb.positionlist_begin(docid, qterm); 
-		     pos != xrdb.positionlist_end(docid, qterm); pos++) {
-		    int ipos = *pos;
-		    if (ipos < int(baseTextPosition)) // Not in text body
-			continue;
-		    LOGABS("makeAbstract: ["  << (qterm) << "] at pos "  << (ipos) << " grpoccs "  << (grpoccs) << " maxgrpoccs "  << (maxgrpoccs) << "\n" );
-
-		    totaloccs++;
-		    grpoccs++;
-
-		    // Add adjacent slots to the set to populate at next
-		    // step by inserting empty strings. Special provisions
-		    // for adding ellipsis and for positions overlapped by
-		    // the match term.
-		    unsigned int sta = MAX(int(baseTextPosition), 
-					   ipos - ctxwords);
-		    unsigned int sto = ipos + qtrmwrdcnt-1 + 
-			m_q->m_db->getAbsCtxLen();
-		    for (unsigned int ii = sta; ii <= sto;  ii++) {
-			if (ii == (unsigned int)ipos) {
-			    sparseDoc[ii] = qterm;
-			    searchTermPositions.insert(ii);
-			    if (ii > maxpos)
-				maxpos = ii;
-			} else if (ii > (unsigned int)ipos && 
-				   ii < (unsigned int)ipos + qtrmwrdcnt) {
-			    sparseDoc[ii] = occupiedmarker;
-			} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
-			    // For an empty slot, the test has a side
-			    // effect of inserting an empty string which
-			    // is what we want.
-			    sparseDoc[ii] = emptys;
-			}
-		    }
-		    // Add ellipsis at the end. This may be replaced later by
-		    // an overlapping extract. Take care not to replace an
-		    // empty string here, we really want an empty slot,
-		    // use find()
-		    if (sparseDoc.find(sto+1) == sparseDoc.end()) {
-			sparseDoc[sto+1] = cstr_ellipsis;
-		    }
-
-		    // Group done ?
-		    if (grpoccs >= maxgrpoccs) {
-			ret |= ABSRES_TRUNC;
-			LOGABS("Db::makeAbstract: max group occs cutoff\n" );
-			break;
-		    }
-		    // Global done ?
-		    if (totaloccs >= maxtotaloccs) {
-			ret |= ABSRES_TRUNC;
-			LOGABS("Db::makeAbstract: max occurrences cutoff\n" );
-			break;
-		    }
-		}
-	    } catch (...) {
-		// Term does not occur. No problem.
-	    }
-
-	    if (totaloccs >= maxtotaloccs) {
-		ret |= ABSRES_TRUNC;
-		LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );
-		break;
-	    }
-	}
-    }
-    maxpos += ctxwords + 1;
-
-    LOGABS("makeAbstract:"  << (chron.millis()) << ":chosen number of positions "  << (totaloccs) << "\n" );
-    // This can happen if there are term occurences in the keywords
-    // etc. but not elsewhere ?
-    if (totaloccs == 0) {
-	LOGDEB("makeAbstract: no occurrences\n" );
-	return ABSRES_OK;
-    }
-
-    // Walk all document's terms position lists and populate slots
-    // around the query terms. We arbitrarily truncate the list to
-    // avoid taking forever. If we do cutoff, the abstract may be
-    // inconsistant (missing words, potentially altering meaning),
-    // which is bad. 
-    { 
-	Xapian::TermIterator term;
-	int cutoff = m_q->m_snipMaxPosWalk;
-	for (term = xrdb.termlist_begin(docid);
-	     term != xrdb.termlist_end(docid); term++) {
-	    // Ignore prefixed terms
-	    if (has_prefix(*term))
-		continue;
-	    if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
-		ret |= ABSRES_TERMMISS;
-		LOGDEB0("makeAbstract: max term count cutoff "  << (m_q->m_snipMaxPosWalk) << "\n" );
-		break;
-	    }
-
-	    map<unsigned int, string>::iterator vit;
-	    Xapian::PositionIterator pos;
-	    for (pos = xrdb.positionlist_begin(docid, *term); 
-		 pos != xrdb.positionlist_end(docid, *term); pos++) {
-		if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
-		    ret |= ABSRES_TERMMISS;
-		    LOGDEB0("makeAbstract: max term count cutoff "  << (m_q->m_snipMaxPosWalk) << "\n" );
-		    break;
-		}
-		// If we are beyond the max possible position, stop
-		// for this term
-		if (*pos > maxpos) {
-		    break;
-		}
-		if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
-		    // Don't replace a term: the terms list is in
-		    // alphabetic order, and we may have several terms
-		    // at the same position, we want to keep only the
-		    // first one (ie: dockes and dockes@wanadoo.fr)
-		    if (vit->second.empty()) {
-			LOGDEB2("makeAbstract: populating: ["  << ((*term)) << "] at "  << (*pos) << "\n" );
-			sparseDoc[*pos] = *term;
-		    }
-		}
-	    }
-	}
-    }
-
-#if 0
-    // Debug only: output the full term[position] vector
-    bool epty = false;
-    int ipos = 0;
-    for (map<unsigned int, string>::iterator it = sparseDoc.begin(); 
-	 it != sparseDoc.end();
-	 it++, ipos++) {
-	if (it->empty()) {
-	    if (!epty)
-		LOGDEB("makeAbstract:vec["  << (ipos) << "]: ["  << (it) << "]\n" );
-	    epty=true;
-	} else {
-	    epty = false;
-	    LOGDEB("makeAbstract:vec["  << (ipos) << "]: ["  << (it) << "]\n" );
-	}
-    }
-#endif
-
-    vector<int> vpbreaks;
-    ndb->getPagePositions(docid, vpbreaks);
-
-    LOGABS("makeAbstract:"  << (chron.millis()) << ": extracting. Got "  << (vpbreaks.size()) << " pages\n" );
-    // Finally build the abstract by walking the map (in order of position)
-    vabs.clear();
-    string chunk;
-    bool incjk = false;
-    int page = 0;
-    string term;
-    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
-	 it != sparseDoc.end(); it++) {
-	LOGDEB2("Abtract:output "  << (it->first) << " -> ["  << (it->second) << "]\n" );
-	if (!occupiedmarker.compare(it->second)) {
-	    LOGDEB("Abstract: qtrm position not filled ??\n" );
-	    continue;
-	}
-	if (chunk.empty() && !vpbreaks.empty()) {
-	    page =  ndb->getPageNumberForPosition(vpbreaks, it->first);
-	    if (page < 0) 
-		page = 0;
-	    term.clear();
-	}
-	Utf8Iter uit(it->second);
-	bool newcjk = false;
-	if (TextSplit::isCJK(*uit))
-	    newcjk = true;
-	if (!incjk || (incjk && !newcjk))
-	    chunk += " ";
-	incjk = newcjk;
-	if (searchTermPositions.find(it->first) != searchTermPositions.end())
-	    term = it->second;
-	if (it->second == cstr_ellipsis) {
-	    vabs.push_back(Snippet(page, chunk).setTerm(term));
-	    chunk.clear();
-	} else {
-	    if (it->second.compare(end_of_field_term) && 
-		it->second.compare(start_of_field_term))
-		chunk += it->second;
-	}
-    }
-    if (!chunk.empty())
-	vabs.push_back(Snippet(page, chunk).setTerm(term));
-
-    LOGDEB2("makeAbtract: done in "  << (chron.millis()) << " mS\n" );
-    return ret;
-}
-
-
-}
-
-
-
+    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
+           maxtotaloccs << " ctxwords " << ctxwords << "\n");
+
+    if (o_index_storedoctext) {
+        return abstractFromText(ndb, docid, matchedTerms, byQ,
+                                totalweight, ctxwords, maxtotaloccs, vabs,
+                                chron);
+    } else {
+        return abstractFromIndex(ndb, docid, matchedTerms, byQ,
+                                 totalweight, ctxwords, maxtotaloccs, vabs,
+                                 chron);
+    }
+}
+
+
+}