recoll / Code / Diff of /src/query/plaintorich.cpp

Diff of /src/query/plaintorich.cpp [dc7b34] .. [20c049]

Switch to side-by-side view

--- a/src/query/plaintorich.cpp
+++ b/src/query/plaintorich.cpp
@@ -49,13 +49,22 @@
     return sterms;
 }
 
+struct MatchEntry { // One region to highlight: an offset span plus the index of the group that produced it
+    pair<int, int> offs; // (start, stop) offsets of the region; callers in this diff pass byte offsets
+    unsigned int grpidx; // Index of the originating group; callers in this diff pass an index into HighlightData::groups
+    MatchEntry(int sta, int sto, unsigned int idx) // sta/sto initialize offs.first/offs.second, idx initializes grpidx
+	: offs(sta, sto), grpidx(idx)
+    {
+    }
+};
+
 // Text splitter used to take note of the position of query terms
 // inside the result text. This is then used to insert highlight tags.
 class TextSplitPTR : public TextSplit {
  public:
 
     // Out: begin and end byte positions of query terms/groups in text
-    vector<pair<int, int> > tboffs;  
+    vector<MatchEntry> tboffs;  
 
     TextSplitPTR(const HighlightData& hdata)
     :  m_wcount(0), m_hdata(hdata)
@@ -67,7 +76,7 @@
 	for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
 	     vit != hdata.groups.end(); vit++) {
 	    if (vit->size() == 1) {
-		m_terms.insert(vit->front());
+		m_terms[vit->front()] = vit - hdata.groups.begin();
 	    } else if (vit->size() > 1) {
 		for (vector<string>::const_iterator it = vit->begin(); 
 		     it != vit->end(); it++) {
@@ -91,8 +100,9 @@
 	// pos, bts, bte));
 
 	// If this word is a search term, remember its byte-offset span. 
-	if (m_terms.find(dumb) != m_terms.end()) {
-	    tboffs.push_back(pair<int, int>(bts, bte));
+	map<string, unsigned int>::const_iterator it = m_terms.find(dumb);
+	if (it != m_terms.end()) {
+	    tboffs.push_back(MatchEntry(bts, bte, (*it).second));
 	}
 	
 	// If word is part of a search group, update its positions list
@@ -114,13 +124,13 @@
     virtual bool matchGroups();
 
 private:
-    virtual bool matchGroup(const vector<string>& terms, int dist);
+    virtual bool matchGroup(unsigned int idx);
 
     // Word count. Used to call checkCancel from time to time.
     int m_wcount;
 
     // In: user query terms
-    set<string>    m_terms; 
+    map<string, unsigned int>    m_terms; 
 
     // m_gterms holds all the terms in m_groups, as a set for quick lookup
     set<string>    m_gterms;
@@ -191,9 +201,12 @@
     return false;
 }
 
-// Find NEAR matches for the input group of terms, update highlight map
-bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
+// Find NEAR matches for one group of terms, update highlight map
+bool TextSplitPTR::matchGroup(unsigned int grpidx)
 {
+    const vector<string>& terms = m_hdata.groups[grpidx];
+    int window = m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx];
+
     LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
 	    vecStringToString(terms).c_str()));
 
@@ -203,26 +216,23 @@
     // A revert plist->term map. This is so that we can find who is who after
     // sorting the plists by length.
     map<vector<int>*, string> plistToTerm;
-    // For traces
-    vector<string> realgroup;
-
-    // Find the position list for each term in the group. Not all
-    // necessarily exist (esp for NEAR where terms have been
-    // stem-expanded: we don't know which matched)
+
+    // Find the position list for each term in the group. It is
+    // possible that this particular group was not actually matched by
+    // the search, so that some terms are not found.
     for (vector<string>::const_iterator it = terms.begin(); 
 	 it != terms.end(); it++) {
 	map<string, vector<int> >::iterator pl = m_plists.find(*it);
 	if (pl == m_plists.end()) {
 	    LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
 		    (*it).c_str()));
-	    continue;
+	    return false;
 	}
 	plists.push_back(&(pl->second));
 	plistToTerm[&(pl->second)] = *it;
-	realgroup.push_back(*it);
-    }
-    LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n", 
-	     window, vecStringToString(realgroup).c_str()));
+    }
+    // I think this can't actually happen, was useful when we used to
+    // prune the groups, but doesn't hurt.
     if (plists.size() < 2) {
 	LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
 	return false;
@@ -261,15 +271,13 @@
 	    SETMINMAX(pos, sta, sto);
 	    minpos = sto+1;
 	    // Translate the position window into a byte offset window
-	    int bs = 0;
 	    map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
 	    map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
 	    if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
 		LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
 			i1->second.first, i2->second.second));
-		tboffs.push_back(pair<int, int>(i1->second.first, 
-						i2->second.second));
-		bs = i1->second.first;
+		tboffs.push_back(MatchEntry(i1->second.first, 
+					    i2->second.second, grpidx));
 	    } else {
 		LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
 	    }
@@ -284,22 +292,23 @@
 /** Sort integer pairs by increasing first value and decreasing width */
 class PairIntCmpFirst {
 public:
-    bool operator()(pair<int,int> a, pair<int, int>b) {
-	if (a.first != b.first)
-	    return a.first < b.first;
-	return a.second > b.second;
+    bool operator()(const MatchEntry& a, const MatchEntry& b) { // Orders MatchEntry values by their offs pair only; grpidx is not compared
+	if (a.offs.first != b.offs.first) // Different starts: the entry with the smaller start sorts first
+	    return a.offs.first < b.offs.first;
+	return a.offs.second > b.offs.second; // Same start: the entry with the larger end (wider span) sorts first
     }
 };
 
-// Look for matches to PHRASE and NEAR term groups. Actually, we
-// handle all groups as NEAR (ignore order).
+// Look for matches to PHRASE and NEAR term groups and finalize the
+// matched regions list (sort it by increasing start then decreasing
+// length)
+// Actually, we handle all groups as NEAR (ignore order).
 bool TextSplitPTR::matchGroups()
 {
     for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
 	if (m_hdata.groups[i].size() <= 1)
 	    continue;
-	matchGroup(m_hdata.groups[i], 
-		   m_hdata.groups[i].size() + m_hdata.slacks[i]);
+	matchGroup(i);
     }
 
     // Sort regions by increasing start and decreasing width.  
@@ -324,6 +333,7 @@
 {
     Chrono chron;
 
+    m_hdata = &hdata;
     // Compute the positions for the query terms.  We use the text
     // splitter to break the text into words, and compare the words to
     // the search terms,
@@ -346,8 +356,8 @@
     // Iterator for the list of input term positions. We use it to
     // output highlight tags and to compute term positions in the
     // output text
-    vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
-    vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
+    vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
+    vector<MatchEntry>::iterator tPosEnd = splitter.tboffs.end();
 
 #if 0
     for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
@@ -365,8 +375,6 @@
     int hadcr = 0;
     int inindent = 1;
 
-    // Value for numbered anchors at each term match
-    int anchoridx = 1;
     // HTML state
     bool intag = false, inparamvalue = false;
     // My tag state
@@ -391,22 +399,20 @@
 	// we are at or after a term match, mark.
 	if (tPosIt != tPosEnd) {
 	    int ibyteidx = chariter.getBpos();
-	    if (ibyteidx == tPosIt->first) {
+	    if (ibyteidx == tPosIt->offs.first) {
 		if (!intag && ibyteidx >= (int)headend) {
-		    *olit += startAnchor(anchoridx);
-		    *olit += startMatch();
+		    *olit += startMatch(tPosIt->grpidx);
 		}
-		anchoridx++;
                 inrcltag = 1;
-	    } else if (ibyteidx == tPosIt->second) {
+	    } else if (ibyteidx == tPosIt->offs.second) {
 		// Output end of match region tags
 		if (!intag && ibyteidx > (int)headend) {
 		    *olit += endMatch();
-		    *olit += endAnchor();
 		}
 		// Skip all highlight areas that would overlap this one
-		int crend = tPosIt->second;
-		while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
+		int crend = tPosIt->offs.second;
+		while (tPosIt != splitter.tboffs.end() && 
+		       tPosIt->offs.first < crend)
 		    tPosIt++;
                 inrcltag = 0;
 	    }