Switch to side-by-side view

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -250,6 +250,9 @@
 // Compute query terms quality coefficients for a matched document by
 // retrieving the Within Document Frequencies and multiplying by
 // overal term frequency, then using log-based thresholds.
+// 2012: it's not too clear to me why exactly we do the log thresholds thing.
+//  Preferring terms which are rare in either or both of the db and the
+//  document seems reasonable though.
 double Db::Native::qualityTerms(Xapian::docid docid, 
 				Query *query,
 				const vector<string>& terms,
@@ -350,6 +353,16 @@
     return true;
 }
 
+int Db::Native::getPageNumberForPosition(const vector<int>& pbreaks, 
+					 unsigned int pos)
+{ // Map a text position to a 1-based page number via the page-break list
+    if (pos < baseTextPosition) // Not in text body
+	return -1;
+    vector<int>::const_iterator it = // NOTE(review): assumes pbreaks sorted ascending, as upper_bound requires
+	upper_bound(pbreaks.begin(), pbreaks.end(), pos);
+    return it - pbreaks.begin() + 1; // count of breaks at or before pos, pages numbered from 1
+}
+
 // Return page number for first match of "significant" term.
 int Db::Native::getFirstMatchPage(Xapian::docid docid, Query *query)
 {
@@ -383,15 +396,9 @@
 	try {
 	    for (pos = xrdb.positionlist_begin(docid, qterm); 
 		 pos != xrdb.positionlist_end(docid, qterm); pos++) {
-		int ipos = *pos;
-		if (ipos < int(baseTextPosition)) // Not in text body
-		    continue;
-		// What page ?
-		LOGABS(("getFirstPageMatch: search match for [%s] pos %d\n", 
-			qterm.c_str(), ipos));
-		vector<int>::const_iterator it = 
-		    upper_bound(pagepos.begin(), pagepos.end(), ipos);
-		return it - pagepos.begin() + 1;
+		int pagenum = getPageNumberForPosition(pagepos, *pos);
+		if (pagenum > 0)
+		    return pagenum;
 	    }
 	} catch (...) {
 	    // Term does not occur. No problem.
@@ -435,8 +442,8 @@
     // TOBEDONE: terms issued from an original one by stem expansion
     // should be somehow aggregated here, else, it may happen that
     // such a group prevents displaying matches for other terms (by
-    // remaining its meaning to the maximum occurrences per term test
-    // using while walking the list below)
+    // removing its meaning from the maximum occurrences per term test
+    // used while walking the list below)
     multimap<double, string> byQ;
     double totalweight = qualityTerms(docid, query, terms, byQ);
     LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
@@ -614,8 +621,11 @@
     }
 #endif
 
-    LOGABS(("makeAbstract:%d: extracting\n", chron.millis()));
-
+    vector<int> vpbreaks;
+    getPagePositions(docid, vpbreaks);
+
+    LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(),
+	    vpbreaks.size()));
     // Finally build the abstract by walking the map (in order of position)
     vector<string> vabs;
     string chunk;
@@ -625,6 +635,12 @@
 	LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
 	if (!occupiedmarker.compare(it->second))
 	    continue;
+	if (chunk.empty() && !vpbreaks.empty()) {
+	    int pnum =  getPageNumberForPosition(vpbreaks, it->first);
+	    ostringstream ss;
+	    ss << pnum;
+	    chunk += string(" [p ") + ss.str() + "] ";
+	}
 	Utf8Iter uit(it->second);
 	bool newcjk = false;
 	if (TextSplit::isCJK(*uit))