recoll / Code / Diff of /src/qtgui/plaintorich.cpp

Diff of /src/qtgui/plaintorich.cpp [c27cf9] .. [5856df]

Switch to side-by-side view

--- a/src/qtgui/plaintorich.cpp
+++ b/src/qtgui/plaintorich.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.30 2007-11-15 18:05:32 dockes Exp $ (C) 2005 J.F.Dockes";
+static char rcsid[] = "@(#$Id: plaintorich.cpp,v 1.31 2008-07-01 08:27:58 dockes Exp $ (C) 2005 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -43,6 +43,8 @@
 #include "plaintorich.h"
 #include "cancelcheck.h"
 
+const string PlainToRich::snull = "";
+
 // For debug printing
 static string vecStringToString(const vector<string>& t)
 {
@@ -58,19 +60,13 @@
 class myTextSplitCB : public TextSplitCB {
  public:
 
-    // Out: first query term found in text
-    string firstTerm;
-    int    firstTermOcc;
-    int m_firstTermPos;
-    int m_firstTermBPos;
-
     // Out: begin and end byte positions of query terms/groups in text
     vector<pair<int, int> > tboffs;  
 
     myTextSplitCB(const vector<string>& its, 
 		  const vector<vector<string> >&groups, 
 		  const vector<int>& slacks) 
-	:  firstTermOcc(1), m_wcount(0), m_groups(groups), m_slacks(slacks)
+	:  m_wcount(0), m_groups(groups), m_slacks(slacks)
     {
 	for (vector<string>::const_iterator it = its.begin(); 
 	     it != its.end(); it++) {
@@ -95,11 +91,6 @@
 	// If this word is a search term, remember its byte-offset span. 
 	if (m_terms.find(dumb) != m_terms.end()) {
 	    tboffs.push_back(pair<int, int>(bts, bte));
-	    if (firstTerm.empty()) {
-		firstTerm = term;
-		m_firstTermPos = pos;
-		m_firstTermBPos = bts;
-	    }
 	}
 	
 	if (m_gterms.find(dumb) != m_gterms.end()) {
@@ -148,10 +139,12 @@
 #define SETMINMAX(POS, STA, STO)  {if ((POS) < (STA)) (STA) = (POS); \
 	if ((POS) > (STO)) (STO) = (POS);}
 
-// Recursively check that each term is inside the window (which is readjusted
-// as the successive terms are found)
+// Recursively check that each term is inside the window (which is
+// readjusted as the successive terms are found). i is the index for
+// the next position list to use (initially 1)
 static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
-		    unsigned int i, int min, int max, int *sp, int *ep)
+			      unsigned int i, int min, int max, 
+			      int *sp, int *ep)
 {
     int tmp = max + 1;
     // take care to avoid underflow
@@ -210,7 +203,7 @@
 	 it != terms.end(); it++) {
 	map<string, vector<int> >::iterator pl = m_plists.find(*it);
 	if (pl == m_plists.end()) {
-	    LOGDEB1(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
+	    LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
 		    (*it).c_str()));
 	    continue;
 	}
@@ -218,58 +211,53 @@
 	plistToTerm[&(pl->second)] = *it;
 	realgroup.push_back(*it);
     }
-    LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group %s\n", window,
-	     vecStringToString(realgroup).c_str()));
-    if (plists.size() < 2)
+    LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n", 
+	     window, vecStringToString(realgroup).c_str()));
+    if (plists.size() < 2) {
+	LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
 	return false;
+    }
     // Sort the positions lists so that the shorter is first
     std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
 
+    { // Debug
+	map<vector<int>*, string>::iterator it;
+	it =  plistToTerm.find(plists[0]);
+	if (it == plistToTerm.end()) {
+	    // SuperWeird
+	    LOGERR(("matchGroup: term for first list not found !?!\n"));
+	    return false;
+	}
+	LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
+		it->second.c_str(), plists[0]->size()));
+    }
+
     // Walk the shortest plist and look for matches
-    int sta = int(10E9), sto = 0;
-    int pos;
-    // Occurrences are from 1->N
-    firstTermOcc = 0;
-    vector<int>::iterator it = plists[0]->begin();
-    do {
-	if (it == plists[0]->end())
-	    return false;
-	pos = *it++;
-	firstTermOcc++;
-    } while (!do_proximity_test(window, plists, 1, pos, pos, &sta, &sto));
-    SETMINMAX(pos, sta, sto);
-
-    LOGDEB0(("myTextSplitCB::matchGroup: MATCH [%d,%d]\n", sta, sto)); 
-
-    // Translate the position window into a byte offset window
-    int bs = 0;
-    map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
-    map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
-    if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
-	LOGDEB1(("myTextSplitCB::matchGroup: pushing %d %d\n",
-		 i1->second.first, i2->second.second));
-	tboffs.push_back(pair<int, int>(i1->second.first, i2->second.second));
-	bs = i1->second.first;
-    } else {
-	LOGDEB(("myTextSplitCB::matchGroup: no bpos found for %d or %d\n", 
-		sta, sto));
-    }
-
-    if (firstTerm.empty() || m_firstTermPos > sta) {
-	// firsTerm is used to try an position the preview window over
-	// the match. As it's difficult to divine byte/word positions
-	// in qtextedit, we use a string search. Use the
-	// shortest plist for this, which hopefully gives a better
-	// chance for the group to be found (it's hopeless to try and
-	// match the whole group)
-	map<vector<int>*, string>::iterator it = 
-	    plistToTerm.find(plists.front());
-	if (it != plistToTerm.end())
-	    firstTerm = it->second;
-	LOGDEB0(("myTextSplitCB:: best group term %s, firstTermOcc %d\n",
-		 firstTerm.c_str(), firstTermOcc));
-	m_firstTermPos = sta;
-	m_firstTermBPos = bs;
+    for (vector<int>::iterator it = plists[0]->begin(); 
+	 it != plists[0]->end(); it++) {
+	int pos = *it;
+	int sta = int(10E9), sto = 0;
+	LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
+	if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
+	    LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n", 
+		     sta, sto)); 
+	    // Extend the window to include the first term's position, which
+	    // do_proximity_test() does not do itself.
+	    SETMINMAX(pos, sta, sto);
+	    // Translate the position window into a byte offset window
+	    int bs = 0;
+	    map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
+	    map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
+	    if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
+		LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
+			i1->second.first, i2->second.second));
+		tboffs.push_back(pair<int, int>(i1->second.first, 
+						i2->second.second));
+		bs = i1->second.first;
+	    } else {
+		LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
+	    }
+	}
     }
 
     return true;
@@ -298,20 +286,6 @@
     // will be handled during output.
     std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
     return true;
-}
-
-// Setting searchable beacons in the text to walk the term list.
-static const char *termAnchorNameBase = "TRM";
-string termAnchorName(int i)
-{
-    char acname[sizeof(termAnchorNameBase) + 20];
-    sprintf(acname, "%s%d", termAnchorNameBase, i);
-    return string(acname);
-}
-
-static string termBeacon(int i)
-{
-    return string("<a name=\"") + termAnchorName(i) + "\">";
 }
 
 
@@ -325,9 +299,9 @@
 // Instead, we mark the search term positions either with html anchor
 // (qt currently has problems with them), or a special string, and the
 // caller will use the editor's find() function to position on it
-bool plaintorich(const string& in, list<string>& out, 
-		 const HiliteData& hdata,
-		 bool noHeader, int *lastAnchor, int chunksize)
+bool PlainToRich::plaintorich(const string& in, list<string>& out, 
+			      const HiliteData& hdata,
+			      int chunksize)
 {
     Chrono chron;
     const vector<string>& terms(hdata.terms);
@@ -342,6 +316,7 @@
 	LOGDEB0(("plaintorich: groups: \n"));
 	for (vector<vector<string> >::const_iterator vit = groups.begin(); 
 	     vit != groups.end(); vit++) {
+	    sterms += "GROUP: ";
 	    sterms += vecStringToString(*vit);
 	    sterms += "\n";
 	}
@@ -362,13 +337,10 @@
 
     out.clear();
     out.push_back("");
-    list<string>::iterator sit = out.begin();
+    list<string>::iterator olit = out.begin();
 
     // Rich text output
-    if (noHeader)
-	*sit = "";
-    else 
-	*sit = "<qt><head><title></title></head><body><p>";
+    *olit = header();
 
     // Iterator for the list of input term positions. We use it to
     // output highlight tags and to compute term positions in the
@@ -388,10 +360,11 @@
     // State variable used to limit the number of consecutive empty lines
     int ateol = 0;
 
-    // Stuff for numbered anchors at each term match
+    // Value for numbered anchors at each term match
     int anchoridx = 1;
 
     for (string::size_type pos = 0; pos != string::npos; pos = chariter++) {
+	// Check from time to time if we need to stop
 	if ((pos & 0xfff) == 0) {
 	    CancelCheck::instance().checkCancel();
 	}
@@ -401,20 +374,20 @@
 	if (tPosIt != tboffsend) {
 	    int ibyteidx = chariter.getBpos();
 	    if (ibyteidx == tPosIt->first) {
-		if (lastAnchor)
-		    *sit += termBeacon(anchoridx++);
-		*sit += "<termtag>";
+		*olit += startAnchor(anchoridx++);
+		*olit += startMatch();
 	    } else if (ibyteidx == tPosIt->second) {
 		// Output end tag, then skip all highlight areas that
 		// would overlap this one
-		*sit += "</termtag>";
+		*olit += endMatch();
+		*olit += endAnchor();
 		int crend = tPosIt->second;
 		while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
 		    tPosIt++;
-		// Maybe end chunk
-		if (sit->size() > (unsigned int)chunksize) {
+		// Maybe end this chunk, begin next
+		if (olit->size() > (unsigned int)chunksize) {
 		    out.push_back("");
-		    sit++;
+		    olit++;
 		}
 	    }
 	}
@@ -422,33 +395,29 @@
 	switch(*chariter) {
 	case '\n':
 	    if (ateol < 2) {
-		*sit += "<br>\n";
+		*olit += "<br>\n";
 		ateol++;
 	    }
 	    break;
 	case '\r': 
 	    break;
-	case '\007': // used as anchor char, strip other instances
-	    break;
 	case '<':
 	    ateol = 0;
-	    *sit += "&lt;";
+	    *olit += "&lt;";
 	    break;
 	case '&':
 	    ateol = 0;
-	    *sit += "&amp;";
+	    *olit += "&amp;";
 	    break;
 	default:
 	    // We don't change the eol status for whitespace, want a real line
 	    if (!(*chariter == ' ' || *chariter == '\t')) {
 		ateol = 0;
 	    }
-	    chariter.appendchartostring(*sit);
-	}
-    }
-    if (lastAnchor)
-	*lastAnchor = anchoridx - 1;
-#if 0
+	    chariter.appendchartostring(*olit);
+	}
+    }
+#if 1
     {
 	FILE *fp = fopen("/tmp/debugplaintorich", "a");
 	fprintf(fp, "BEGINOFPLAINTORICHOUTPUT\n");