recoll / Code / Diff of /src/query/plaintorich.cpp

Diff of /src/query/plaintorich.cpp [40a6bb] .. [ebdd6f]

Switch to unified view


...
        m_gterms.insert(*it);
        }
    }
    }

    // Accept a word and its position. If the word is a search term, add
    // a highlight zone definition. If the word is part of a search group
    // (phrase or near), update its positions list.
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
    string dumb;
    if (!unacmaybefold(term, dumb, "UTF-8", true)) {
        LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
                     term.c_str()));
...
    // If this word is a search term, remember its byte-offset span. 
    if (m_terms.find(dumb) != m_terms.end()) {
        tboffs.push_back(pair<int, int>(bts, bte));
    }
    
  // If word is part of a search group, update its positions list
    if (m_gterms.find(dumb) != m_gterms.end()) {
        // Term group (phrase/near) handling
        m_plists[dumb].push_back(pos);
        m_gpostobytes[pos] = pair<int,int>(bts, bte);
        //LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
    }

  // Check for cancellation request
    if ((m_wcount++ & 0xfff) == 0)
        CancelCheck::instance().checkCancel();

    return true;
    }

    // Must be called after the split to find the phrase/near match positions
    virtual bool matchGroups();
...

// Widen the interval [STA, STO] in place so that it includes POS:
// STA is lowered to POS if POS is smaller, STO is raised to POS if POS
// is bigger. STA and STO must be assignable expressions. Every argument
// may be evaluated more than once, so do not pass expressions with side
// effects.
// The body is wrapped in do { } while (0) so that "SETMINMAX(...);" is
// exactly one statement and stays valid inside an unbraced if/else.
#define SETMINMAX(POS, STA, STO)  do {if ((POS) < (STA)) (STA) = (POS); \
    if ((POS) > (STO)) (STO) = (POS);} while (0)

// Recursively check that each term is inside the window (which is
// readjusted as the successive terms are found).
// @param window the search window width
// @param plists the position list vector
// @param i the position list to process (we then recurse with the next list)
// @param min the current minimum pos for a found term
// @param max the current maximum pos for a found term
// @param sp, ep output: the found area
// @param minpos bottom of search: this is the highest point of
//    any previous match. We don't look below this as overlapping matches 
//    make no sense for highlighting.
static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
                  unsigned int i, int min, int max, 
                  int *sp, int *ep, int minpos)
{
    LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n", 
       window, i, min, max, minpos));
    int tmp = max + 1 - window;
    if (tmp < minpos)

    tmp = minpos;

    // Find the 1st position at or after the window start
    vector<int>::iterator it = plists[i]->begin();


    while (it != plists[i]->end() && *it < tmp)
    it++;

    // Try each position inside window in turn for match with other lists
    while (it != plists[i]->end()) {
...
        return false;
    if (i + 1 == plists.size()) {
        SETMINMAX(pos, *sp, *ep);
        return true;
    }
  SETMINMAX(pos, min, max);




    if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
        SETMINMAX(pos, *sp, *ep);
        return true;
    }
    it++;
    }
    return false;
}

// Find NEAR matches for the input group of terms, update highlight map
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
{
    LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
        vecStringToString(terms).c_str()));

...
    }
    LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
        it->second.c_str(), plists[0]->size()));
    }

    // Minpos is the highest end of a found match. While looking for
    // further matches, we don't want the search to extend before
    // this, because it does not make sense for highlight regions to
    // overlap
    int minpos = 0;
    // Walk the shortest plist and look for matches
    for (vector<int>::iterator it = plists[0]->begin(); 
     it != plists[0]->end(); it++) {
    int pos = *it;
    int sta = int(10E9), sto = 0;
    LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
    if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
        LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
             sta, sto)); 
        // Maybe extend the window by 1st term position, this was not
        // done by do_prox..
        SETMINMAX(pos, sta, sto);
      minpos = sto+1;
        // Translate the position window into a byte offset window
        int bs = 0;
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
...
                        i2->second.second));
        bs = i1->second.first;
        } else {
        LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
        }
  } else {
      LOGDEB0(("matchGroup: no group match found at this position\n"));
    }
    }

    return true;
}
...
        return a.first < b.first;
    return a.second > b.second;
    }
};

// Look for matches to PHRASE and NEAR term groups. Actually, we
// handle all groups as NEAR (ignore order).
bool TextSplitPTR::matchGroups()
{
    vector<vector<string> >::const_iterator vit = m_groups.begin();
    vector<int>::const_iterator sit = m_slacks.begin();
    for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
    matchGroup(*vit, *sit + (*vit).size());
    }

    // Sort regions by increasing start and decreasing width.  
    // The output process will skip overlapping entries.
    std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
    return true;
}


// Fix result text for display inside the gui text window.
//
// We call overridden functions to output header data, beginnings and ends of
// matches etc.




//
// If the input is text, we output the result in chunks, arranging not
// to cut in the middle of a tag, which would confuse qtextedit. If
// the input is html, the body is always a single output chunk.
bool PlainToRich::plaintorich(const string& in, 
                  list<string>& out, // Output chunk list
                  const HiliteData& hdata,
                  int chunksize)
{
...
    const vector<string>& terms(hdata.terms);
    const vector<vector<string> >& groups(hdata.groups);
    const vector<int>& slacks(hdata.gslks);

    if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {

    string sterms = vecStringToString(terms);
    LOGDEB0(("plaintorich: terms: %s\n", sterms.c_str()));
  sterms.clear();

    for (vector<vector<string> >::const_iterator vit = groups.begin(); 
         vit != groups.end(); vit++) {
        sterms += "GROUP: ";
        sterms += vecStringToString(*vit);
        sterms += "\n";
    }
    LOGDEB0(("plaintorich: groups:\n %s", sterms.c_str()));
        LOGDEB2(("  TEXT:[%s]\n", in.c_str()));
    }

    // Compute the positions for the query terms.  We use the text
    // splitter to break the text into words, and compare the words to
...
    // If we still have terms positions, check (byte) position. If
    // we are at or after a term match, mark.
    if (tPosIt != tPosEnd) {
        int ibyteidx = chariter.getBpos();
        if (ibyteidx == tPosIt->first) {
        if (!intag && ibyteidx >= (int)headend) {
            *olit += startAnchor(anchoridx);
            *olit += startMatch();
        }
        anchoridx++;
                inrcltag = 1;

	a/src/query/plaintorich.cpp		b/src/query/plaintorich.cpp
	...		...
75	m_gterms.insert(*it);	75	m_gterms.insert(*it);
76	}	76	}
77	}	77	}
78	}	78	}
79		79
80	// Callback called by the text-to-words breaker for each word	80	// Accept word and its position. If word is search term, add
		81	// highlight zone definition. If word is part of search group
		82	// (phrase or near), update positions list.
81	virtual bool takeword(const std::string& term, int pos, int bts, int bte) {	83	virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
82	string dumb;	84	string dumb;
83	if (!unacmaybefold(term, dumb, "UTF-8", true)) {	85	if (!unacmaybefold(term, dumb, "UTF-8", true)) {
84	LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",	86	LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
85	term.c_str()));	87	term.c_str()));
	...		...
91	// If this word is a search term, remember its byte-offset span.	93	// If this word is a search term, remember its byte-offset span.
92	if (m_terms.find(dumb) != m_terms.end()) {	94	if (m_terms.find(dumb) != m_terms.end()) {
93	tboffs.push_back(pair<int, int>(bts, bte));	95	tboffs.push_back(pair<int, int>(bts, bte));
94	}	96	}
95		97
		98	// If word is part of a search group, update its positions list
96	if (m_gterms.find(dumb) != m_gterms.end()) {	99	if (m_gterms.find(dumb) != m_gterms.end()) {
97	// Term group (phrase/near) handling	100	// Term group (phrase/near) handling
98	m_plists[dumb].push_back(pos);	101	m_plists[dumb].push_back(pos);
99	m_gpostobytes[pos] = pair<int,int>(bts, bte);	102	m_gpostobytes[pos] = pair<int,int>(bts, bte);
100	//LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));	103	//LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
101	}	104	}
		105
		106	// Check for cancellation request
102	if ((m_wcount++ & 0xfff) == 0)	107	if ((m_wcount++ & 0xfff) == 0)
103	CancelCheck::instance().checkCancel();	108	CancelCheck::instance().checkCancel();
		109
104	return true;	110	return true;
105	}	111	}
106		112
107	// Must be called after the split to find the phrase/near match positions	113	// Must be called after the split to find the phrase/near match positions
108	virtual bool matchGroups();	114	virtual bool matchGroups();
	...		...
138		144
139	#define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \	145	#define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \
140	if ((POS) > (STO)) (STO) = (POS);}	146	if ((POS) > (STO)) (STO) = (POS);}
141		147
142	// Recursively check that each term is inside the window (which is	148	// Recursively check that each term is inside the window (which is
143	// readjusted as the successive terms are found). i is the index for	149	// readjusted as the successive terms are found).
144	// the next position list to use (initially 1)	150	// @param window the search window width
		151	// @param plists the position list vector
		152	// @param i the position list to process (we then recurse with the next list)
		153	// @param min the current minimum pos for a found term
		154	// @param max the current maximum pos for a found term
		155	// @param sp, ep output: the found area
		156	// @param minpos bottom of search: this is the highest point of
		157	// any previous match. We don't look below this as overlapping matches
		158	// make no sense for highlighting.
145	static bool do_proximity_test(int window, vector<vector<int>* >& plists,	159	static bool do_proximity_test(int window, vector<vector<int>* >& plists,
146	unsigned int i, int min, int max,	160	unsigned int i, int min, int max,
147	int sp, int ep)	161	int sp, int ep, int minpos)
148	{	162	{
		163	LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n",
		164	window, i, min, max, minpos));
149	int tmp = max + 1;	165	int tmp = max + 1 - window;
150	// take care to avoid underflow	166	if (tmp < minpos)
151	if (window <= tmp)
152	tmp -= window;	167	tmp = minpos;
153	else	168
154	tmp = 0;	169	// Find 1st position bigger than window start
155	vector<int>::iterator it = plists[i]->begin();	170	vector<int>::iterator it = plists[i]->begin();
156
157	// Find 1st position bigger than window start
158	while (it != plists[i]->end() && *it < tmp)	171	while (it != plists[i]->end() && *it < tmp)
159	it++;	172	it++;
160		173
161	// Try each position inside window in turn for match with other lists	174	// Try each position inside window in turn for match with other lists
162	while (it != plists[i]->end()) {	175	while (it != plists[i]->end()) {
	...		...
165	return false;	178	return false;
166	if (i + 1 == plists.size()) {	179	if (i + 1 == plists.size()) {
167	SETMINMAX(pos, sp, ep);	180	SETMINMAX(pos, sp, ep);
168	return true;	181	return true;
169	}	182	}
170	if (pos < min) {	183	SETMINMAX(pos, min, max);
171	min = pos;
172	} else if (pos > max) {
173	max = pos;
174	}
175	if (do_proximity_test(window, plists, i + 1, min, max, sp, ep)) {	184	if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
176	SETMINMAX(pos, sp, ep);	185	SETMINMAX(pos, sp, ep);
177	return true;	186	return true;
178	}	187	}
179	it++;	188	it++;
180	}	189	}
181	return false;	190	return false;
182	}	191	}
183		192
184	// Check if there is a NEAR match for the group of terms	193	// Find NEAR matches for the input group of terms, update highlight map
185	bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)	194	bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
186	{	195	{
187	LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,	196	LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
188	vecStringToString(terms).c_str()));	197	vecStringToString(terms).c_str()));
189		198
	...		...
230	}	239	}
231	LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",	240	LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
232	it->second.c_str(), plists[0]->size()));	241	it->second.c_str(), plists[0]->size()));
233	}	242	}
234		243
		244	// Minpos is the highest end of a found match. While looking for
		245	// further matches, we don't want the search to extend before
		246	// this, because it does not make sense for highlight regions to
		247	// overlap
		248	int minpos = 0;
235	// Walk the shortest plist and look for matches	249	// Walk the shortest plist and look for matches
236	for (vector<int>::iterator it = plists[0]->begin();	250	for (vector<int>::iterator it = plists[0]->begin();
237	it != plists[0]->end(); it++) {	251	it != plists[0]->end(); it++) {
238	int pos = *it;	252	int pos = *it;
239	int sta = int(10E9), sto = 0;	253	int sta = int(10E9), sto = 0;
240	LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));	254	LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
241	if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {	255	if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
242	LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",	256	LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
243	sta, sto));	257	sta, sto));
244	// Maybe extend the window by 1st term position, this was not	258	// Maybe extend the window by 1st term position, this was not
245	// done by do_prox..	259	// done by do_prox..
246	SETMINMAX(pos, sta, sto);	260	SETMINMAX(pos, sta, sto);
		261	minpos = sto+1;
247	// Translate the position window into a byte offset window	262	// Translate the position window into a byte offset window
248	int bs = 0;	263	int bs = 0;
249	map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);	264	map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
250	map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);	265	map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
251	if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {	266	if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
	...		...
255	i2->second.second));	270	i2->second.second));
256	bs = i1->second.first;	271	bs = i1->second.first;
257	} else {	272	} else {
258	LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));	273	LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
259	}	274	}
		275	} else {
		276	LOGDEB0(("matchGroup: no group match found at this position\n"));
260	}	277	}
261	}	278	}
262		279
263	return true;	280	return true;
264	}	281	}
	...		...
271	return a.first < b.first;	288	return a.first < b.first;
272	return a.second > b.second;	289	return a.second > b.second;
273	}	290	}
274	};	291	};
275		292
276	// Do the phrase match thing, then merge the highlight lists	293	// Look for matches to PHRASE and NEAR term groups. Actually, we
		294	// handle all groups as NEAR (ignore order).
277	bool TextSplitPTR::matchGroups()	295	bool TextSplitPTR::matchGroups()
278	{	296	{
279	vector<vector<string> >::const_iterator vit = m_groups.begin();	297	vector<vector<string> >::const_iterator vit = m_groups.begin();
280	vector<int>::const_iterator sit = m_slacks.begin();	298	vector<int>::const_iterator sit = m_slacks.begin();
281	for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {	299	for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
282	matchGroup(vit, sit + (*vit).size());	300	matchGroup(vit, sit + (*vit).size());
283	}	301	}
284		302
285	// Sort by start and end offsets. The merging of overlapping entries	303	// Sort regions by increasing start and decreasing width.
286	// will be handled during output.	304	// The output process will skip overlapping entries.
287	std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());	305	std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
288	return true;	306	return true;
289	}	307	}
290		308
291		309
292	// Fix result text for display inside the gui text window.	310	// Fix result text for display inside the gui text window.
293	//	311	//
294	// To compute the term character positions in the output text, we used	312	// We call overridden functions to output header data, beginnings and ends of
295	// to emulate how qt's textedit counts chars (ignoring tags and	313	// matches etc.
296	// duplicate whitespace etc...). This was tricky business, dependant
297	// on qtextedit internals, and we don't do it any more, so we finally
298	// don't know the term par/car positions in the editor text.
299	// Instead, we now mark the search term positions with html anchors
300	//	314	//
301	// We output the result in chunks, arranging not to cut in the middle of	315	// If the input is text, we output the result in chunks, arranging not
302	// a tag, which would confuse qtextedit.	316	// to cut in the middle of a tag, which would confuse qtextedit. If
		317	// the input is html, the body is always a single output chunk.
303	bool PlainToRich::plaintorich(const string& in,	318	bool PlainToRich::plaintorich(const string& in,
304	list<string>& out, // Output chunk list	319	list<string>& out, // Output chunk list
305	const HiliteData& hdata,	320	const HiliteData& hdata,
306	int chunksize)	321	int chunksize)
307	{	322	{
	...		...
309	const vector<string>& terms(hdata.terms);	324	const vector<string>& terms(hdata.terms);
310	const vector<vector<string> >& groups(hdata.groups);	325	const vector<vector<string> >& groups(hdata.groups);
311	const vector<int>& slacks(hdata.gslks);	326	const vector<int>& slacks(hdata.gslks);
312		327
313	if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {	328	if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
314	LOGDEB0(("plaintorich: terms: \n"));
315	string sterms = vecStringToString(terms);	329	string sterms = vecStringToString(terms);
316	LOGDEB0((" %s\n", sterms.c_str()));	330	LOGDEB0(("plaintorich: terms: %s\n", sterms.c_str()));
317	sterms = "\n";	331	sterms.clear();
318	LOGDEB0(("plaintorich: groups: \n"));
319	for (vector<vector<string> >::const_iterator vit = groups.begin();	332	for (vector<vector<string> >::const_iterator vit = groups.begin();
320	vit != groups.end(); vit++) {	333	vit != groups.end(); vit++) {
321	sterms += "GROUP: ";	334	sterms += "GROUP: ";
322	sterms += vecStringToString(*vit);	335	sterms += vecStringToString(*vit);
323	sterms += "\n";	336	sterms += "\n";
324	}	337	}
325	LOGDEB0((" %s", sterms.c_str()));	338	LOGDEB0(("plaintorich: groups:\n %s", sterms.c_str()));
326	LOGDEB2((" TEXT:[%s]\n", in.c_str()));	339	LOGDEB2((" TEXT:[%s]\n", in.c_str()));
327	}	340	}
328		341
329	// Compute the positions for the query terms. We use the text	342	// Compute the positions for the query terms. We use the text
330	// splitter to break the text into words, and compare the words to	343	// splitter to break the text into words, and compare the words to
	...		...
392	// If we still have terms positions, check (byte) position. If	405	// If we still have terms positions, check (byte) position. If
393	// we are at or after a term match, mark.	406	// we are at or after a term match, mark.
394	if (tPosIt != tPosEnd) {	407	if (tPosIt != tPosEnd) {
395	int ibyteidx = chariter.getBpos();	408	int ibyteidx = chariter.getBpos();
396	if (ibyteidx == tPosIt->first) {	409	if (ibyteidx == tPosIt->first) {
397	if (!intag && ibyteidx > (int)headend) {	410	if (!intag && ibyteidx >= (int)headend) {
398	*olit += startAnchor(anchoridx);	411	*olit += startAnchor(anchoridx);
399	*olit += startMatch();	412	*olit += startMatch();
400	}	413	}
401	anchoridx++;	414	anchoridx++;
402	inrcltag = 1;	415	inrcltag = 1;