Switch to side-by-side view

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -77,6 +77,7 @@
 const string pathelt_prefix = "XP";
 const string start_of_field_term = "XXST";
 const string end_of_field_term = "XXND";
+const string page_break_term = "XXPG";
 
 // This is used as a marker inside the abstract frag lists, but
 // normally doesn't remain in final output (which is built with a
@@ -245,31 +246,21 @@
     return true;
 }
 
-// Remove prefixes (caps) from terms.
+// Keep only non-prefixed terms. We use to remove prefixes and keep
+// the terms instead, but field terms are normally also indexed
+// un-prefixed, so this is simpler and better.
 static void noPrefixList(const vector<string>& in, vector<string>& out) 
 {
     for (vector<string>::const_iterator qit = in.begin(); 
 	 qit != in.end(); qit++) {
-	if ('A' <= qit->at(0) && qit->at(0) <= 'Z') {
-	    string term = *qit;
-	    while (term.length() && 'A' <= term.at(0) && term.at(0) <= 'Z')
-		term.erase(0, 1);
-	    if (term.length())
-		out.push_back(term);
-	    continue;
-	} else {
+	if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
 	    out.push_back(*qit);
-	}
-    }
-}
-
-//#define DEBUGABSTRACT  1
+    }
+}
+
+#undef DEBUGABSTRACT  
 #ifdef DEBUGABSTRACT
 #define LOGABS LOGDEB
-#else
-#define LOGABS LOGDEB2
-#endif
-#if 0
 static void listList(const string& what, const vector<string>&l)
 {
     string a;
@@ -278,58 +269,55 @@
     }
     LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
 }
+#else
+#define LOGABS LOGDEB2
+static void listList(const string&, const vector<string>&)
+{
+}
 #endif
 
-// Build a document abstract by extracting text chunks around the query terms
-// This uses the db termlists, not the original document.
-//
-// DatabaseModified and other general exceptions are catched and
-// possibly retried by our caller
-vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
-{
-    Chrono chron;
-    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
-	     m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
-
-    vector<string> terms;
-
+// Retrieve and store db-wide frequencies for the query terms.
+void Db::Native::setDbWideQTermsFreqs(Query *query)
+{
+    // Do it once only for a given query.
+    if (!query->m_nq->termfreqs.empty())
+	return;
+
+    vector<string> qterms;
     {
-        vector<string> iterms;
-        query->getMatchTerms(docid, iterms);
-        noPrefixList(iterms, terms);
-        if (terms.empty()) {
-            LOGDEB(("makeAbstract::Empty term list\n"));
-            return vector<string>();
-        }
-    }
-//    listList("Match terms: ", terms);
-
-    // Retrieve db-wide frequencies for the query terms (we do this once per
-    // query, using all the query terms, not only the document match terms)
-    if (query->m_nq->termfreqs.empty()) {
-        vector<string> iqterms, qterms;
-        query->getQueryTerms(iqterms);
-        noPrefixList(iqterms, qterms);
-//        listList("Query terms: ", qterms);
-	double doccnt = xrdb.get_doccount();
-	if (doccnt == 0) doccnt = 1;
-	for (vector<string>::const_iterator qit = qterms.begin(); 
-	     qit != qterms.end(); qit++) {
-	    query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
-	    LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), 
-		    query->m_nq->termfreqs[*qit]));
-	}
-	LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
-    }
-
-    // Compute a term quality coefficient by retrieving the term
-    // Within Document Frequencies and multiplying by overal term
-    // frequency, then using log-based thresholds. We are going to try
-    // and show text around the less common search terms.
+	vector<string> iqterms;
+	query->getQueryTerms(iqterms);
+	noPrefixList(iqterms, qterms);
+    }
+    // listList("Query terms: ", qterms);
+
+    double doccnt = xrdb.get_doccount();
+    if (doccnt == 0) 
+	doccnt = 1;
+
+    for (vector<string>::const_iterator qit = qterms.begin(); 
+	 qit != qterms.end(); qit++) {
+	query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
+	LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), 
+		query->m_nq->termfreqs[*qit]));
+    }
+}
+
+// Compute query terms quality coefficients for a matched document by
+// retrieving the Within Document Frequencies and multiplying by
+// overal term frequency, then using log-based thresholds.
+double Db::Native::qualityTerms(Xapian::docid docid, 
+				Query *query,
+				const vector<string>& terms,
+				multimap<double, string>& byQ)
+{
     map<string, double> termQcoefs;
     double totalweight = 0;
+
     double doclen = xrdb.get_doclength(docid);
-    if (doclen == 0) doclen = 1;
+    if (doclen == 0) 
+	doclen = 1;
+
     for (vector<string>::const_iterator qit = terms.begin(); 
 	 qit != terms.end(); qit++) {
 	Xapian::TermIterator term = xrdb.termlist_begin(docid);
@@ -352,10 +340,8 @@
 	    totalweight += q;
 	}
     }    
-    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
 
     // Build a sorted by quality term list.
-    multimap<double, string> byQ;
     for (vector<string>::const_iterator qit = terms.begin(); 
 	 qit != terms.end(); qit++) {
 	if (termQcoefs.find(*qit) != termQcoefs.end())
@@ -368,8 +354,128 @@
 	LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
     }
 #endif
-
-
+    return totalweight;
+}
+
+// Return the positions list for the page break term
+bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
+{
+    string qterm = page_break_term;
+    Xapian::PositionIterator pos;
+    try {
+	for (pos = xrdb.positionlist_begin(docid, qterm); 
+	     pos != xrdb.positionlist_end(docid, qterm); pos++) {
+	    int ipos = *pos;
+	    if (ipos < int(baseTextPosition)) {
+		// Not in text body. Strange...
+		continue;
+	    }
+	    vpos.push_back(ipos);
+	} 
+    } catch (...) {
+	// Term does not occur. No problem.
+    }
+    return true;
+}
+
+// Return page number for first match of "significant" term.
+int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
+{
+    vector<string> terms;
+    {
+	vector<string> iterms;
+        query->getMatchTerms(docid, iterms);
+	noPrefixList(iterms, terms);
+    }
+    if (terms.empty()) {
+	LOGDEB(("getFirstMatchPage: empty match term list (field match?)\n"));
+	return -1;
+    }
+
+    vector<int> pagepos;
+    getPagePositions(docid, pagepos);
+    if (pagepos.empty())
+	return -1;
+	
+    setDbWideQTermsFreqs(query);
+
+    // We try to use a page which matches the "best" term. Get a sorted list
+    multimap<double, string> byQ;
+    double totalweight = qualityTerms(docid, query, terms, byQ);
+
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
+	 qit != byQ.rend(); qit++) {
+	string qterm = qit->second;
+	Xapian::PositionIterator pos;
+	string emptys;
+	try {
+	    for (pos = xrdb.positionlist_begin(docid, qterm); 
+		 pos != xrdb.positionlist_end(docid, qterm); pos++) {
+		int ipos = *pos;
+		if (ipos < int(baseTextPosition)) // Not in text body
+		    continue;
+		// What page ?
+		LOGABS(("getFirstPageMatch: looking for match for [%s]\n", 
+			qterm.c_str()));
+		vector<int>::const_iterator it = 
+		    lower_bound(pagepos.begin(), pagepos.end(), ipos);
+		if (it != pagepos.end())
+		    return it - pagepos.begin() + 1;
+	    }
+	} catch (...) {
+	    // Term does not occur. No problem.
+	}
+    }
+    return -1;
+}
+
+// Build a document abstract by extracting text chunks around the query terms
+// This uses the db termlists, not the original document.
+//
+// DatabaseModified and other general exceptions are catched and
+// possibly retried by our caller
+vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
+{
+    Chrono chron;
+    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
+	     m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
+
+    // The (unprefixed) terms matched by this document
+    vector<string> terms;
+
+    {
+        vector<string> iterms;
+        query->getMatchTerms(docid, iterms);
+        noPrefixList(iterms, terms);
+        if (terms.empty()) {
+            LOGDEB(("makeAbstract::Empty term list\n"));
+            return vector<string>();
+        }
+    }
+    listList("Match terms: ", terms);
+
+    // Retrieve the term freqencies for the query terms. This is
+    // actually computed only once for a query, and for all terms in
+    // the query (not only the matches for this doc)
+    setDbWideQTermsFreqs(query);
+
+    // Build a sorted by quality container for the match terms We are
+    // going to try and show text around the less common search terms.
+    // TOBEDONE: terms issued from an original one by stem expansion
+    // should be somehow aggregated here, else, it may happen that
+    // such a group prevents displaying matches for other terms (by
+    // remaining its meaning to the maximum occurrences per term test
+    // using while walking the list below)
+    multimap<double, string> byQ;
+    double totalweight = qualityTerms(docid, query, terms, byQ);
+    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
+    // This can't happen, but would crash us
+    if (totalweight == 0.0) {
+	LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
+	return vector<string>();
+    }
+
+    ///////////////////
     // For each of the query terms, ask xapian for its positions list
     // in the document. For each position entry, remember it in
     // qtermposs and insert it and its neighbours in the set of
@@ -390,11 +496,6 @@
     const unsigned int maxtotaloccs = 
 	m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
     LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
-    // This can't happen, but would crash us
-    if (totalweight == 0.0) {
-	LOGERR(("makeAbstract: 0 totalweight!\n"));
-	return vector<string>();
-    }
 
     // This is used to mark positions overlapped by a multi-word match term
     const string occupiedmarker("?");
@@ -1000,7 +1101,11 @@
 	LOGERR(("Db: xapian add_posting error %s\n", ermsg.c_str()));
 	return false;
     }
-
+    void newpage(int pos)
+    {
+	pos += m_ts->basepos;
+	m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
+    }
 private:
     TextSplitDb *m_ts;
 };
@@ -2014,6 +2119,19 @@
     return m_reason.empty() ? true : false;
 }
 
+int Db::getFirstMatchPage(Doc &doc, Query *query)
+{
+    LOGDEB1(("Db::getFirstMatchPages\n"));;
+    if (!m_ndb || !m_ndb->m_isopen) {
+	LOGERR(("Db::getFirstMatchPage: no db\n"));
+	return false;
+    }
+    int pagenum = -1;
+    XAPTRY(pagenum = m_ndb->getFirstMatchPage(Xapian::docid(doc.xdocid), query),
+	   m_ndb->xrdb, m_reason);
+    return m_reason.empty() ? pagenum : -1;
+}
+
 // Retrieve document defined by Unique doc identifier. This is mainly used
 // by the GUI history feature
 bool Db::getDoc(const string &udi, Doc &doc)