recoll / Code / Diff of /src/query/plaintorich.cpp

Diff of /src/query/plaintorich.cpp [52bc9f] .. [3736c0]

Switch to unified view


...
    :  m_wcount(0), m_hdata(hdata)
    {
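    // Separate the single-term entries from the multi-term groups.
    // Single terms are recorded in m_terms with their group index.
    // The terms of multi-term groups are collected in m_gterms, so
    // that their position lists can be computed before we look for
    // group matches.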

    for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
         vit != hdata.groups.end(); vit++) {
        if (vit->size() == 1) {



        m_terms[vit->front()] = vit - hdata.groups.begin();







        } else if (vit->size() > 1) {
        for (vector<string>::const_iterator it = vit->begin(); 
             it != vit->end(); it++) {



            m_gterms.insert(*it);







        }
        }
    }
    }

    // Accept a word and its position. If the word is a search term,
    // add a highlight zone definition. If the word is part of a search
    // group (phrase or near), update its positions list.
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
    string dumb = term;
#ifndef RCL_INDEX_STRIPCHARS
  if (o_index_stripchars) {
#endif
        if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
      LOGINFO(("PlainToRich::takeword: unac failed for [%s]\n",
           term.c_str()));
      return true;
      }
#ifndef RCL_INDEX_STRIPCHARS
    }
#endif

    //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
    // pos, bts, bte));

    // If this word is a search term, remember its byte-offset span. 
    map<string, unsigned int>::const_iterator it = m_terms.find(dumb);
...
//    make no sense for highlighting.
static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
                  unsigned int i, int min, int max, 
                  int *sp, int *ep, int minpos)
{
    LOGDEB1(("do_prox_test: win %d i %d min %d max %d minpos %d\n", 
         window, i, min, max, minpos));
    int tmp = max + 1 - window;
    if (tmp < minpos)
    tmp = minpos;

...
bool TextSplitPTR::matchGroup(unsigned int grpidx)
{
    const vector<string>& terms = m_hdata.groups[grpidx];
    int window = m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx];

    LOGDEB1(("TextSplitPTR::matchGroup:d %d: %s\n", window,
        vecStringToString(terms).c_str()));

    // The position lists we are going to work with. We extract them from the 
    // (string->plist) map
    vector<vector<int>* > plists;
...
    // the search, so that some terms are not found.
    for (vector<string>::const_iterator it = terms.begin(); 
     it != terms.end(); it++) {
    map<string, vector<int> >::iterator pl = m_plists.find(*it);
    if (pl == m_plists.end()) {
        LOGDEB1(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
            (*it).c_str()));
        return false;
    }
    plists.push_back(&(pl->second));
    plistToTerm[&(pl->second)] = *it;
    }
    // This probably cannot happen any more: the check was useful when
    // we used to prune the groups, but keeping it does no harm.
    if (plists.size() < 2) {
    LOGDEB1(("TextSplitPTR::matchGroup: no actual groups found\n"));
    return false;
    }
    // Sort the position lists so that the shortest one comes first
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());

...
    if (it == plistToTerm.end()) {
        // SuperWeird
        LOGERR(("matchGroup: term for first list not found !?!\n"));
        return false;
    }
    LOGDEB1(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
        it->second.c_str(), plists[0]->size()));
    }

    // Minpos is the highest end of a found match. While looking for
    // further matches, we don't want the search to extend before
...
    // Walk the shortest plist and look for matches
    for (vector<int>::iterator it = plists[0]->begin(); 
     it != plists[0]->end(); it++) {
    int pos = *it;
    int sta = int(10E9), sto = 0;
    LOGDEB2(("MatchGroup: Testing at pos %d\n", pos));
    if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
        LOGDEB1(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
             sta, sto)); 
        // Extend the window to include the first term's position if
        // needed: this was not done by do_proximity_test().
        SETMINMAX(pos, sta, sto);
        minpos = sto+1;
        // Translate the position window into a byte offset window
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
        LOGDEB2(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
            i1->second.first, i2->second.second));
        tboffs.push_back(MatchEntry(i1->second.first, 
                        i2->second.second, grpidx));
        } else {
        LOGDEB0(("matchGroup: no bpos found for %d or %d\n", sta, sto));
        }
    } else {
        LOGDEB1(("matchGroup: no group match found at this position\n"));
    }
    }
...
                  list<string>& out, // Output chunk list
                  const HighlightData& hdata,
                  int chunksize)
{
    Chrono chron;
    bool ret = true;
    LOGDEB1(("plaintorichich: in: [%s]\n", in.c_str()));

    m_hdata = &hdata;
    // Compute the positions for the query terms.  We use the text
    // splitter to break the text into words, and compare the words to
    // the search terms,
...
    out.push_back("");
    list<string>::iterator olit = out.begin();

    // Rich text output
    *olit = header();

    // No term matches. Happens, for example on a snippet selected for
    // a term match when we are actually looking for a group match
    // (the snippet generator does this...).
    if (splitter.tboffs.empty()) {
  LOGDEB1(("plaintorich: no term matches\n"));
  ret = false;
    }

    // Iterator for the list of input term positions. We use it to
    // output highlight tags and to compute term positions in the
    // output text
    vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
...
    fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
    fclose(fp);
    }
#endif
    LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));
    return ret;
}

	a/src/query/plaintorich.cpp		b/src/query/plaintorich.cpp
	...		...
73	: m_wcount(0), m_hdata(hdata)	73	: m_wcount(0), m_hdata(hdata)
74	{	74	{
75	// We separate single terms and groups and extract the group	75	// We separate single terms and groups and extract the group
76	// terms for computing positions list before looking for group	76	// terms for computing positions list before looking for group
77	// matches	77	// matches
78
79	for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();	78	for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
80	vit != hdata.groups.end(); vit++) {	79	vit != hdata.groups.end(); vit++) {
81	if (vit->size() == 1) {	80	if (vit->size() == 1) {
82	#ifndef RCL_INDEX_STRIPCHARS
83	if (o_index_stripchars) {
84	#endif
85	m_terms[vit->front()] = vit - hdata.groups.begin();	81	m_terms[vit->front()] = vit - hdata.groups.begin();
86	#ifndef RCL_INDEX_STRIPCHARS
87	} else {
88	string dumb = vit->front();
89	unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
90	m_terms[dumb] = vit - hdata.groups.begin();
91	}
92	#endif
93	} else if (vit->size() > 1) {	82	} else if (vit->size() > 1) {
94	for (vector<string>::const_iterator it = vit->begin();	83	for (vector<string>::const_iterator it = vit->begin();
95	it != vit->end(); it++) {	84	it != vit->end(); it++) {
96	#ifndef RCL_INDEX_STRIPCHARS
97	if (o_index_stripchars) {
98	#endif
99	m_gterms.insert(*it);	85	m_gterms.insert(*it);
100	#ifndef RCL_INDEX_STRIPCHARS
101	} else {
102	string dumb = *it;
103	unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
104	m_gterms.insert(dumb);
105	}
106	#endif
107	}	86	}
108	}	87	}
109	}	88	}
110	}	89	}
111		90
112	// Accept word and its position. If word is search term, add	91	// Accept word and its position. If word is search term, add
113	// highlight zone definition. If word is part of search group	92	// highlight zone definition. If word is part of search group
114	// (phrase or near), update positions list.	93	// (phrase or near), update positions list.
115	virtual bool takeword(const std::string& term, int pos, int bts, int bte) {	94	virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
116	string dumb;	95	string dumb = term;
		96	#ifndef RCL_INDEX_STRIPCHARS
		97	if (o_index_stripchars) {
		98	#endif
117	if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {	99	if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
118	LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",	100	LOGINFO(("PlainToRich::takeword: unac failed for [%s]\n",
119	term.c_str()));	101	term.c_str()));
120	return true;	102	return true;
		103	}
		104	#ifndef RCL_INDEX_STRIPCHARS
121	}	105	}
		106	#endif
		107
122	//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),	108	//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
123	// pos, bts, bte));	109	// pos, bts, bte));
124		110
125	// If this word is a search term, remember its byte-offset span.	111	// If this word is a search term, remember its byte-offset span.
126	map<string, unsigned int>::const_iterator it = m_terms.find(dumb);	112	map<string, unsigned int>::const_iterator it = m_terms.find(dumb);
	...		...
194	// make no sense for highlighting.	180	// make no sense for highlighting.
195	static bool do_proximity_test(int window, vector<vector<int>* >& plists,	181	static bool do_proximity_test(int window, vector<vector<int>* >& plists,
196	unsigned int i, int min, int max,	182	unsigned int i, int min, int max,
197	int sp, int ep, int minpos)	183	int sp, int ep, int minpos)
198	{	184	{
199	LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n",	185	LOGDEB1(("do_prox_test: win %d i %d min %d max %d minpos %d\n",
200	window, i, min, max, minpos));	186	window, i, min, max, minpos));
201	int tmp = max + 1 - window;	187	int tmp = max + 1 - window;
202	if (tmp < minpos)	188	if (tmp < minpos)
203	tmp = minpos;	189	tmp = minpos;
204		190
	...		...
232	bool TextSplitPTR::matchGroup(unsigned int grpidx)	218	bool TextSplitPTR::matchGroup(unsigned int grpidx)
233	{	219	{
234	const vector<string>& terms = m_hdata.groups[grpidx];	220	const vector<string>& terms = m_hdata.groups[grpidx];
235	int window = m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx];	221	int window = m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx];
236		222
237	LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,	223	LOGDEB1(("TextSplitPTR::matchGroup:d %d: %s\n", window,
238	vecStringToString(terms).c_str()));	224	vecStringToString(terms).c_str()));
239		225
240	// The position lists we are going to work with. We extract them from the	226	// The position lists we are going to work with. We extract them from the
241	// (string->plist) map	227	// (string->plist) map
242	vector<vector<int>* > plists;	228	vector<vector<int>* > plists;
	...		...
249	// the search, so that some terms are not found.	235	// the search, so that some terms are not found.
250	for (vector<string>::const_iterator it = terms.begin();	236	for (vector<string>::const_iterator it = terms.begin();
251	it != terms.end(); it++) {	237	it != terms.end(); it++) {
252	map<string, vector<int> >::iterator pl = m_plists.find(*it);	238	map<string, vector<int> >::iterator pl = m_plists.find(*it);
253	if (pl == m_plists.end()) {	239	if (pl == m_plists.end()) {
254	LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",	240	LOGDEB1(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
255	(*it).c_str()));	241	(*it).c_str()));
256	return false;	242	return false;
257	}	243	}
258	plists.push_back(&(pl->second));	244	plists.push_back(&(pl->second));
259	plistToTerm[&(pl->second)] = *it;	245	plistToTerm[&(pl->second)] = *it;
260	}	246	}
261	// I think this can't actually happen, was useful when we used to	247	// I think this can't actually happen, was useful when we used to
262	// prune the groups, but doesn't hurt.	248	// prune the groups, but doesn't hurt.
263	if (plists.size() < 2) {	249	if (plists.size() < 2) {
264	LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));	250	LOGDEB1(("TextSplitPTR::matchGroup: no actual groups found\n"));
265	return false;	251	return false;
266	}	252	}
267	// Sort the positions lists so that the shorter is first	253	// Sort the positions lists so that the shorter is first
268	std::sort(plists.begin(), plists.end(), VecIntCmpShorter());	254	std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
269		255
	...		...
273	if (it == plistToTerm.end()) {	259	if (it == plistToTerm.end()) {
274	// SuperWeird	260	// SuperWeird
275	LOGERR(("matchGroup: term for first list not found !?!\n"));	261	LOGERR(("matchGroup: term for first list not found !?!\n"));
276	return false;	262	return false;
277	}	263	}
278	LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",	264	LOGDEB1(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
279	it->second.c_str(), plists[0]->size()));	265	it->second.c_str(), plists[0]->size()));
280	}	266	}
281		267
282	// Minpos is the highest end of a found match. While looking for	268	// Minpos is the highest end of a found match. While looking for
283	// further matches, we don't want the search to extend before	269	// further matches, we don't want the search to extend before
	...		...
287	// Walk the shortest plist and look for matches	273	// Walk the shortest plist and look for matches
288	for (vector<int>::iterator it = plists[0]->begin();	274	for (vector<int>::iterator it = plists[0]->begin();
289	it != plists[0]->end(); it++) {	275	it != plists[0]->end(); it++) {
290	int pos = *it;	276	int pos = *it;
291	int sta = int(10E9), sto = 0;	277	int sta = int(10E9), sto = 0;
292	LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));	278	LOGDEB2(("MatchGroup: Testing at pos %d\n", pos));
293	if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {	279	if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
294	LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",	280	LOGDEB1(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
295	sta, sto));	281	sta, sto));
296	// Maybe extend the window by 1st term position, this was not	282	// Maybe extend the window by 1st term position, this was not
297	// done by do_prox..	283	// done by do_prox..
298	SETMINMAX(pos, sta, sto);	284	SETMINMAX(pos, sta, sto);
299	minpos = sto+1;	285	minpos = sto+1;
300	// Translate the position window into a byte offset window	286	// Translate the position window into a byte offset window
301	map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);	287	map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
302	map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);	288	map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
303	if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {	289	if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
304	LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",	290	LOGDEB2(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
305	i1->second.first, i2->second.second));	291	i1->second.first, i2->second.second));
306	tboffs.push_back(MatchEntry(i1->second.first,	292	tboffs.push_back(MatchEntry(i1->second.first,
307	i2->second.second, grpidx));	293	i2->second.second, grpidx));
308	} else {	294	} else {
309	LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));	295	LOGDEB0(("matchGroup: no bpos found for %d or %d\n", sta, sto));
310	}	296	}
311	} else {	297	} else {
312	LOGDEB1(("matchGroup: no group match found at this position\n"));	298	LOGDEB1(("matchGroup: no group match found at this position\n"));
313	}	299	}
314	}	300	}
	...		...
357	list<string>& out, // Output chunk list	343	list<string>& out, // Output chunk list
358	const HighlightData& hdata,	344	const HighlightData& hdata,
359	int chunksize)	345	int chunksize)
360	{	346	{
361	Chrono chron;	347	Chrono chron;
		348	bool ret = true;
		349	LOGDEB1(("plaintorichich: in: [%s]\n", in.c_str()));
362		350
363	m_hdata = &hdata;	351	m_hdata = &hdata;
364	// Compute the positions for the query terms. We use the text	352	// Compute the positions for the query terms. We use the text
365	// splitter to break the text into words, and compare the words to	353	// splitter to break the text into words, and compare the words to
366	// the search terms,	354	// the search terms,
	...		...
377	out.push_back("");	365	out.push_back("");
378	list<string>::iterator olit = out.begin();	366	list<string>::iterator olit = out.begin();
379		367
380	// Rich text output	368	// Rich text output
381	*olit = header();	369	*olit = header();
		370
		371	// No term matches. Happens, for example on a snippet selected for
		372	// a term match when we are actually looking for a group match
		373	// (the snippet generator does this...).
		374	if (splitter.tboffs.empty()) {
		375	LOGDEB1(("plaintorich: no term matches\n"));
		376	ret = false;
		377	}
382		378
383	// Iterator for the list of input term positions. We use it to	379	// Iterator for the list of input term positions. We use it to
384	// output highlight tags and to compute term positions in the	380	// output highlight tags and to compute term positions in the
385	// output text	381	// output text
386	vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();	382	vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
	...		...
548	fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");	544	fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
549	fclose(fp);	545	fclose(fp);
550	}	546	}
551	#endif	547	#endif
552	LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));	548	LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));
553	return true;	549	return ret;
554	}	550	}