recoll / Code / Diff of /src/query/plaintorich.cpp

Diff of /src/query/plaintorich.cpp [bf3ac8] .. [e1b699]

Switch to unified view


...
    return sterms;
}

// Text splitter callback used to take note of the position of query terms 
// inside the result text. This is then used to insert highlight tags. 
class TextSplitPTR : public TextSplit {
 public:

    // Out: begin and end byte positions of query terms/groups in text
    vector<pair<int, int> > tboffs;  

    TextSplitPTR(const vector<string>& its, 
                 const vector<vector<string> >&groups, 
                 const vector<int>& slacks) 
    :  m_wcount(0), m_groups(groups), m_slacks(slacks)
    {
    for (vector<string>::const_iterator it = its.begin(); 
         it != its.end(); it++) {
        m_terms.insert(*it);
...

    // Callback called by the text-to-words breaker for each word
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
    string dumb;
    if (!unacmaybefold(term, dumb, "UTF-8", true)) {
        LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
                     term.c_str()));
        return true;
    }
    //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
    // pos, bts, bte));

...
    }
    return false;
}

// Check if there is a NEAR match for the group of terms
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
{
    LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
        vecStringToString(terms).c_str()));

    // The position lists we are going to work with. We extract them from the 
    // (string->plist) map
    vector<vector<int>* > plists;
...
    // stem-expanded: we don't know which matched)
    for (vector<string>::const_iterator it = terms.begin(); 
     it != terms.end(); it++) {
    map<string, vector<int> >::iterator pl = m_plists.find(*it);
    if (pl == m_plists.end()) {
        LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
            (*it).c_str()));
        continue;
    }
    plists.push_back(&(pl->second));
    plistToTerm[&(pl->second)] = *it;
    realgroup.push_back(*it);
    }
    LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n", 
         window, vecStringToString(realgroup).c_str()));
    if (plists.size() < 2) {
    LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
    return false;
    }
    // Sort the positions lists so that the shorter is first
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());

...
     it != plists[0]->end(); it++) {
    int pos = *it;
    int sta = int(10E9), sto = 0;
    LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
    if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
        LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
             sta, sto)); 
        // Maybe extend the window by 1st term position, this was not
        // done by do_prox..
        SETMINMAX(pos, sta, sto);
        // Translate the position window into a byte offset window
        int bs = 0;
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
        LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
            i1->second.first, i2->second.second));
        tboffs.push_back(pair<int, int>(i1->second.first, 
                        i2->second.second));
        bs = i1->second.first;
        } else {
...
    return a.second > b.second;
    }
};

// Do the phrase match thing, then merge the highlight lists
bool TextSplitPTR::matchGroups()
{
    vector<vector<string> >::const_iterator vit = m_groups.begin();
    vector<int>::const_iterator sit = m_slacks.begin();
    for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
    matchGroup(*vit, *sit + (*vit).size());
...
    }

    // Compute the positions for the query terms.  We use the text
    // splitter to break the text into words, and compare the words to
    // the search terms,
    TextSplitPTR splitter(terms, groups, slacks);

    // Note: the splitter returns the term locations in byte, not
    // character, offsets.
    splitter.text_to_words(in);
    LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));

    // Compute the positions for NEAR and PHRASE groups.
    splitter.matchGroups();

    out.clear();
    out.push_back("");
    list<string>::iterator olit = out.begin();

...
    *olit = header();

    // Iterator for the list of input term positions. We use it to
    // output highlight tags and to compute term positions in the
    // output text
    vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
    vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();

#if 0
    for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
     it != splitter.tboffs.end(); it++) {
    LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
    }
#endif

    // Input character iterator
...
            *olit += endMatch();
            *olit += endAnchor();
        }
        // Skip all highlight areas that would overlap this one
        int crend = tPosIt->second;
        while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
            tPosIt++;
                inrcltag = 0;
        }
    }
        

	a/src/query/plaintorich.cpp		b/src/query/plaintorich.cpp
	...		...
56	return sterms;	56	return sterms;
57	}	57	}
58		58
59	// Text splitter callback used to take note of the position of query terms	59	// Text splitter callback used to take note of the position of query terms
60	// inside the result text. This is then used to insert highlight tags.	60	// inside the result text. This is then used to insert highlight tags.
61	class myTextSplitCB : public TextSplitCB {	61	class TextSplitPTR : public TextSplit {
62	public:	62	public:
63		63
64	// Out: begin and end byte positions of query terms/groups in text	64	// Out: begin and end byte positions of query terms/groups in text
65	vector<pair<int, int> > tboffs;	65	vector<pair<int, int> > tboffs;
66		66
67	myTextSplitCB(const vector<string>& its,	67	TextSplitPTR(const vector<string>& its,
68	const vector<vector<string> >&groups,	68	const vector<vector<string> >&groups,
69	const vector<int>& slacks)	69	const vector<int>& slacks)
70	: m_wcount(0), m_groups(groups), m_slacks(slacks)	70	: m_wcount(0), m_groups(groups), m_slacks(slacks)
71	{	71	{
72	for (vector<string>::const_iterator it = its.begin();	72	for (vector<string>::const_iterator it = its.begin();
73	it != its.end(); it++) {	73	it != its.end(); it++) {
74	m_terms.insert(*it);	74	m_terms.insert(*it);
	...		...
84		84
85	// Callback called by the text-to-words breaker for each word	85	// Callback called by the text-to-words breaker for each word
86	virtual bool takeword(const std::string& term, int pos, int bts, int bte) {	86	virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
87	string dumb;	87	string dumb;
88	if (!unacmaybefold(term, dumb, "UTF-8", true)) {	88	if (!unacmaybefold(term, dumb, "UTF-8", true)) {
89	LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));	89	LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
		90	term.c_str()));
90	return true;	91	return true;
91	}	92	}
92	//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),	93	//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
93	// pos, bts, bte));	94	// pos, bts, bte));
94		95
	...		...
184	}	185	}
185	return false;	186	return false;
186	}	187	}
187		188
188	// Check if there is a NEAR match for the group of terms	189	// Check if there is a NEAR match for the group of terms
189	bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)	190	bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
190	{	191	{
191	LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,	192	LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
192	vecStringToString(terms).c_str()));	193	vecStringToString(terms).c_str()));
193		194
194	// The position lists we are going to work with. We extract them from the	195	// The position lists we are going to work with. We extract them from the
195	// (string->plist) map	196	// (string->plist) map
196	vector<vector<int>* > plists;	197	vector<vector<int>* > plists;
	...		...
205	// stem-expanded: we don't know which matched)	206	// stem-expanded: we don't know which matched)
206	for (vector<string>::const_iterator it = terms.begin();	207	for (vector<string>::const_iterator it = terms.begin();
207	it != terms.end(); it++) {	208	it != terms.end(); it++) {
208	map<string, vector<int> >::iterator pl = m_plists.find(*it);	209	map<string, vector<int> >::iterator pl = m_plists.find(*it);
209	if (pl == m_plists.end()) {	210	if (pl == m_plists.end()) {
210	LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",	211	LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
211	(*it).c_str()));	212	(*it).c_str()));
212	continue;	213	continue;
213	}	214	}
214	plists.push_back(&(pl->second));	215	plists.push_back(&(pl->second));
215	plistToTerm[&(pl->second)] = *it;	216	plistToTerm[&(pl->second)] = *it;
216	realgroup.push_back(*it);	217	realgroup.push_back(*it);
217	}	218	}
218	LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n",	219	LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n",
219	window, vecStringToString(realgroup).c_str()));	220	window, vecStringToString(realgroup).c_str()));
220	if (plists.size() < 2) {	221	if (plists.size() < 2) {
221	LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));	222	LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
222	return false;	223	return false;
223	}	224	}
224	// Sort the positions lists so that the shorter is first	225	// Sort the positions lists so that the shorter is first
225	std::sort(plists.begin(), plists.end(), VecIntCmpShorter());	226	std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
226		227
	...		...
241	it != plists[0]->end(); it++) {	242	it != plists[0]->end(); it++) {
242	int pos = *it;	243	int pos = *it;
243	int sta = int(10E9), sto = 0;	244	int sta = int(10E9), sto = 0;
244	LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));	245	LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
245	if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {	246	if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
246	LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n",	247	LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
247	sta, sto));	248	sta, sto));
248	// Maybe extend the window by 1st term position, this was not	249	// Maybe extend the window by 1st term position, this was not
249	// done by do_prox..	250	// done by do_prox..
250	SETMINMAX(pos, sta, sto);	251	SETMINMAX(pos, sta, sto);
251	// Translate the position window into a byte offset window	252	// Translate the position window into a byte offset window
252	int bs = 0;	253	int bs = 0;
253	map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);	254	map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
254	map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);	255	map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
255	if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {	256	if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
256	LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",	257	LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
257	i1->second.first, i2->second.second));	258	i1->second.first, i2->second.second));
258	tboffs.push_back(pair<int, int>(i1->second.first,	259	tboffs.push_back(pair<int, int>(i1->second.first,
259	i2->second.second));	260	i2->second.second));
260	bs = i1->second.first;	261	bs = i1->second.first;
261	} else {	262	} else {
	...		...
276	return a.second > b.second;	277	return a.second > b.second;
277	}	278	}
278	};	279	};
279		280
280	// Do the phrase match thing, then merge the highlight lists	281	// Do the phrase match thing, then merge the highlight lists
281	bool myTextSplitCB::matchGroups()	282	bool TextSplitPTR::matchGroups()
282	{	283	{
283	vector<vector<string> >::const_iterator vit = m_groups.begin();	284	vector<vector<string> >::const_iterator vit = m_groups.begin();
284	vector<int>::const_iterator sit = m_slacks.begin();	285	vector<int>::const_iterator sit = m_slacks.begin();
285	for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {	286	for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
286	matchGroup(*vit, *sit + (*vit).size());	287	matchGroup(*vit, *sit + (*vit).size());
	...		...
331	}	332	}
332		333
333	// Compute the positions for the query terms. We use the text	334	// Compute the positions for the query terms. We use the text
334	// splitter to break the text into words, and compare the words to	335	// splitter to break the text into words, and compare the words to
335	// the search terms,	336	// the search terms,
336	myTextSplitCB cb(terms, groups, slacks);	337	TextSplitPTR splitter(terms, groups, slacks);
337	TextSplit splitter(&cb);
338	// Note: the splitter returns the term locations in byte, not	338	// Note: the splitter returns the term locations in byte, not
339	// character, offsets.	339	// character, offsets.
340	splitter.text_to_words(in);	340	splitter.text_to_words(in);
341	LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));	341	LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
342		342
343	// Compute the positions for NEAR and PHRASE groups.	343	// Compute the positions for NEAR and PHRASE groups.
344	cb.matchGroups();	344	splitter.matchGroups();
345		345
346	out.clear();	346	out.clear();
347	out.push_back("");	347	out.push_back("");
348	list<string>::iterator olit = out.begin();	348	list<string>::iterator olit = out.begin();
349		349
	...		...
351	*olit = header();	351	*olit = header();
352		352
353	// Iterator for the list of input term positions. We use it to	353	// Iterator for the list of input term positions. We use it to
354	// output highlight tags and to compute term positions in the	354	// output highlight tags and to compute term positions in the
355	// output text	355	// output text
356	vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();	356	vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
357	vector<pair<int, int> >::iterator tPosEnd = cb.tboffs.end();	357	vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
358		358
359	#if 0	359	#if 0
360	for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();	360	for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
361	it != cb.tboffs.end(); it++) {	361	it != splitter.tboffs.end(); it++) {
362	LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));	362	LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
363	}	363	}
364	#endif	364	#endif
365		365
366	// Input character iterator	366	// Input character iterator
	...		...
410	*olit += endMatch();	410	*olit += endMatch();
411	*olit += endAnchor();	411	*olit += endAnchor();
412	}	412	}
413	// Skip all highlight areas that would overlap this one	413	// Skip all highlight areas that would overlap this one
414	int crend = tPosIt->second;	414	int crend = tPosIt->second;
415	while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)	415	while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
416	tPosIt++;	416	tPosIt++;
417	inrcltag = 0;	417	inrcltag = 0;
418	}	418	}
419	}	419	}
420		420