|
a/src/query/plaintorich.cpp |
|
b/src/query/plaintorich.cpp |
|
... |
|
... |
73 |
: m_wcount(0), m_hdata(hdata)
|
73 |
: m_wcount(0), m_hdata(hdata)
|
74 |
{
|
74 |
{
|
75 |
// We separate single terms and groups and extract the group
|
75 |
// We separate single terms and groups and extract the group
|
76 |
// terms for computing positions list before looking for group
|
76 |
// terms for computing positions list before looking for group
|
77 |
// matches
|
77 |
// matches
|
78 |
|
|
|
79 |
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
78 |
for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
|
80 |
vit != hdata.groups.end(); vit++) {
|
79 |
vit != hdata.groups.end(); vit++) {
|
81 |
if (vit->size() == 1) {
|
80 |
if (vit->size() == 1) {
|
82 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
83 |
if (o_index_stripchars) {
|
|
|
84 |
#endif
|
|
|
85 |
m_terms[vit->front()] = vit - hdata.groups.begin();
|
81 |
m_terms[vit->front()] = vit - hdata.groups.begin();
|
86 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
87 |
} else {
|
|
|
88 |
string dumb = vit->front();
|
|
|
89 |
unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
|
|
|
90 |
m_terms[dumb] = vit - hdata.groups.begin();
|
|
|
91 |
}
|
|
|
92 |
#endif
|
|
|
93 |
} else if (vit->size() > 1) {
|
82 |
} else if (vit->size() > 1) {
|
94 |
for (vector<string>::const_iterator it = vit->begin();
|
83 |
for (vector<string>::const_iterator it = vit->begin();
|
95 |
it != vit->end(); it++) {
|
84 |
it != vit->end(); it++) {
|
96 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
97 |
if (o_index_stripchars) {
|
|
|
98 |
#endif
|
|
|
99 |
m_gterms.insert(*it);
|
85 |
m_gterms.insert(*it);
|
100 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
101 |
} else {
|
|
|
102 |
string dumb = *it;
|
|
|
103 |
unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
|
|
|
104 |
m_gterms.insert(dumb);
|
|
|
105 |
}
|
|
|
106 |
#endif
|
|
|
107 |
}
|
86 |
}
|
108 |
}
|
87 |
}
|
109 |
}
|
88 |
}
|
110 |
}
|
89 |
}
|
111 |
|
90 |
|
112 |
// Accept word and its position. If word is search term, add
|
91 |
// Accept word and its position. If word is search term, add
|
113 |
// highlight zone definition. If word is part of search group
|
92 |
// highlight zone definition. If word is part of search group
|
114 |
// (phrase or near), update positions list.
|
93 |
// (phrase or near), update positions list.
|
115 |
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
94 |
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
116 |
string dumb;
|
95 |
string dumb = term;
|
|
|
96 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
97 |
if (o_index_stripchars) {
|
|
|
98 |
#endif
|
117 |
if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
|
99 |
if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
|
118 |
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
|
100 |
LOGINFO(("PlainToRich::takeword: unac failed for [%s]\n",
|
119 |
term.c_str()));
|
101 |
term.c_str()));
|
120 |
return true;
|
102 |
return true;
|
|
|
103 |
}
|
|
|
104 |
#ifndef RCL_INDEX_STRIPCHARS
|
121 |
}
|
105 |
}
|
|
|
106 |
#endif
|
|
|
107 |
|
122 |
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
108 |
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
123 |
// pos, bts, bte));
|
109 |
// pos, bts, bte));
|
124 |
|
110 |
|
125 |
// If this word is a search term, remember its byte-offset span.
|
111 |
// If this word is a search term, remember its byte-offset span.
|
126 |
map<string, unsigned int>::const_iterator it = m_terms.find(dumb);
|
112 |
map<string, unsigned int>::const_iterator it = m_terms.find(dumb);
|
|
... |
|
... |
194 |
// make no sense for highlighting.
|
180 |
// make no sense for highlighting.
|
195 |
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
|
181 |
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
|
196 |
unsigned int i, int min, int max,
|
182 |
unsigned int i, int min, int max,
|
197 |
int *sp, int *ep, int minpos)
|
183 |
int *sp, int *ep, int minpos)
|
198 |
{
|
184 |
{
|
199 |
LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n",
|
185 |
LOGDEB1(("do_prox_test: win %d i %d min %d max %d minpos %d\n",
|
200 |
window, i, min, max, minpos));
|
186 |
window, i, min, max, minpos));
|
201 |
int tmp = max + 1 - window;
|
187 |
int tmp = max + 1 - window;
|
202 |
if (tmp < minpos)
|
188 |
if (tmp < minpos)
|
203 |
tmp = minpos;
|
189 |
tmp = minpos;
|
204 |
|
190 |
|
|
... |
|
... |
232 |
bool TextSplitPTR::matchGroup(unsigned int grpidx)
|
218 |
bool TextSplitPTR::matchGroup(unsigned int grpidx)
|
233 |
{
|
219 |
{
|
234 |
const vector<string>& terms = m_hdata.groups[grpidx];
|
220 |
const vector<string>& terms = m_hdata.groups[grpidx];
|
235 |
int window = m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx];
|
221 |
int window = m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx];
|
236 |
|
222 |
|
237 |
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
|
223 |
LOGDEB1(("TextSplitPTR::matchGroup:d %d: %s\n", window,
|
238 |
vecStringToString(terms).c_str()));
|
224 |
vecStringToString(terms).c_str()));
|
239 |
|
225 |
|
240 |
// The position lists we are going to work with. We extract them from the
|
226 |
// The position lists we are going to work with. We extract them from the
|
241 |
// (string->plist) map
|
227 |
// (string->plist) map
|
242 |
vector<vector<int>* > plists;
|
228 |
vector<vector<int>* > plists;
|
|
... |
|
... |
249 |
// the search, so that some terms are not found.
|
235 |
// the search, so that some terms are not found.
|
250 |
for (vector<string>::const_iterator it = terms.begin();
|
236 |
for (vector<string>::const_iterator it = terms.begin();
|
251 |
it != terms.end(); it++) {
|
237 |
it != terms.end(); it++) {
|
252 |
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
238 |
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
253 |
if (pl == m_plists.end()) {
|
239 |
if (pl == m_plists.end()) {
|
254 |
LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
|
240 |
LOGDEB1(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
|
255 |
(*it).c_str()));
|
241 |
(*it).c_str()));
|
256 |
return false;
|
242 |
return false;
|
257 |
}
|
243 |
}
|
258 |
plists.push_back(&(pl->second));
|
244 |
plists.push_back(&(pl->second));
|
259 |
plistToTerm[&(pl->second)] = *it;
|
245 |
plistToTerm[&(pl->second)] = *it;
|
260 |
}
|
246 |
}
|
261 |
// I think this can't actually happen, was useful when we used to
|
247 |
// I think this can't actually happen, was useful when we used to
|
262 |
// prune the groups, but doesn't hurt.
|
248 |
// prune the groups, but doesn't hurt.
|
263 |
if (plists.size() < 2) {
|
249 |
if (plists.size() < 2) {
|
264 |
LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
|
250 |
LOGDEB1(("TextSplitPTR::matchGroup: no actual groups found\n"));
|
265 |
return false;
|
251 |
return false;
|
266 |
}
|
252 |
}
|
267 |
// Sort the positions lists so that the shorter is first
|
253 |
// Sort the positions lists so that the shorter is first
|
268 |
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
254 |
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
269 |
|
255 |
|
|
... |
|
... |
273 |
if (it == plistToTerm.end()) {
|
259 |
if (it == plistToTerm.end()) {
|
274 |
// SuperWeird
|
260 |
// SuperWeird
|
275 |
LOGERR(("matchGroup: term for first list not found !?!\n"));
|
261 |
LOGERR(("matchGroup: term for first list not found !?!\n"));
|
276 |
return false;
|
262 |
return false;
|
277 |
}
|
263 |
}
|
278 |
LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
|
264 |
LOGDEB1(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
|
279 |
it->second.c_str(), plists[0]->size()));
|
265 |
it->second.c_str(), plists[0]->size()));
|
280 |
}
|
266 |
}
|
281 |
|
267 |
|
282 |
// Minpos is the highest end of a found match. While looking for
|
268 |
// Minpos is the highest end of a found match. While looking for
|
283 |
// further matches, we don't want the search to extend before
|
269 |
// further matches, we don't want the search to extend before
|
|
... |
|
... |
287 |
// Walk the shortest plist and look for matches
|
273 |
// Walk the shortest plist and look for matches
|
288 |
for (vector<int>::iterator it = plists[0]->begin();
|
274 |
for (vector<int>::iterator it = plists[0]->begin();
|
289 |
it != plists[0]->end(); it++) {
|
275 |
it != plists[0]->end(); it++) {
|
290 |
int pos = *it;
|
276 |
int pos = *it;
|
291 |
int sta = int(10E9), sto = 0;
|
277 |
int sta = int(10E9), sto = 0;
|
292 |
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
278 |
LOGDEB2(("MatchGroup: Testing at pos %d\n", pos));
|
293 |
if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
|
279 |
if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
|
294 |
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
|
280 |
LOGDEB1(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
|
295 |
sta, sto));
|
281 |
sta, sto));
|
296 |
// Maybe extend the window by 1st term position, this was not
|
282 |
// Maybe extend the window by 1st term position, this was not
|
297 |
// done by do_prox..
|
283 |
// done by do_prox..
|
298 |
SETMINMAX(pos, sta, sto);
|
284 |
SETMINMAX(pos, sta, sto);
|
299 |
minpos = sto+1;
|
285 |
minpos = sto+1;
|
300 |
// Translate the position window into a byte offset window
|
286 |
// Translate the position window into a byte offset window
|
301 |
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
287 |
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
302 |
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
288 |
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
303 |
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
289 |
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
304 |
LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
|
290 |
LOGDEB2(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
|
305 |
i1->second.first, i2->second.second));
|
291 |
i1->second.first, i2->second.second));
|
306 |
tboffs.push_back(MatchEntry(i1->second.first,
|
292 |
tboffs.push_back(MatchEntry(i1->second.first,
|
307 |
i2->second.second, grpidx));
|
293 |
i2->second.second, grpidx));
|
308 |
} else {
|
294 |
} else {
|
309 |
LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
|
295 |
LOGDEB0(("matchGroup: no bpos found for %d or %d\n", sta, sto));
|
310 |
}
|
296 |
}
|
311 |
} else {
|
297 |
} else {
|
312 |
LOGDEB1(("matchGroup: no group match found at this position\n"));
|
298 |
LOGDEB1(("matchGroup: no group match found at this position\n"));
|
313 |
}
|
299 |
}
|
314 |
}
|
300 |
}
|
|
... |
|
... |
357 |
list<string>& out, // Output chunk list
|
343 |
list<string>& out, // Output chunk list
|
358 |
const HighlightData& hdata,
|
344 |
const HighlightData& hdata,
|
359 |
int chunksize)
|
345 |
int chunksize)
|
360 |
{
|
346 |
{
|
361 |
Chrono chron;
|
347 |
Chrono chron;
|
|
|
348 |
bool ret = true;
|
|
|
349 |
LOGDEB1(("plaintorichich: in: [%s]\n", in.c_str()));
|
362 |
|
350 |
|
363 |
m_hdata = &hdata;
|
351 |
m_hdata = &hdata;
|
364 |
// Compute the positions for the query terms. We use the text
|
352 |
// Compute the positions for the query terms. We use the text
|
365 |
// splitter to break the text into words, and compare the words to
|
353 |
// splitter to break the text into words, and compare the words to
|
366 |
// the search terms,
|
354 |
// the search terms,
|
|
... |
|
... |
377 |
out.push_back("");
|
365 |
out.push_back("");
|
378 |
list<string>::iterator olit = out.begin();
|
366 |
list<string>::iterator olit = out.begin();
|
379 |
|
367 |
|
380 |
// Rich text output
|
368 |
// Rich text output
|
381 |
*olit = header();
|
369 |
*olit = header();
|
|
|
370 |
|
|
|
371 |
// No term matches. Happens, for example on a snippet selected for
|
|
|
372 |
// a term match when we are actually looking for a group match
|
|
|
373 |
// (the snippet generator does this...).
|
|
|
374 |
if (splitter.tboffs.empty()) {
|
|
|
375 |
LOGDEB1(("plaintorich: no term matches\n"));
|
|
|
376 |
ret = false;
|
|
|
377 |
}
|
382 |
|
378 |
|
383 |
// Iterator for the list of input term positions. We use it to
|
379 |
// Iterator for the list of input term positions. We use it to
|
384 |
// output highlight tags and to compute term positions in the
|
380 |
// output highlight tags and to compute term positions in the
|
385 |
// output text
|
381 |
// output text
|
386 |
vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
|
382 |
vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
|
|
... |
|
... |
548 |
fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
|
544 |
fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
|
549 |
fclose(fp);
|
545 |
fclose(fp);
|
550 |
}
|
546 |
}
|
551 |
#endif
|
547 |
#endif
|
552 |
LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));
|
548 |
LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));
|
553 |
return true;
|
549 |
return ret;
|
554 |
}
|
550 |
}
|