|
a/src/query/plaintorich.cpp |
|
b/src/query/plaintorich.cpp |
|
... |
|
... |
56 |
return sterms;
|
56 |
return sterms;
|
57 |
}
|
57 |
}
|
58 |
|
58 |
|
59 |
// Text splitter callback used to take note of the position of query terms
|
59 |
// Text splitter callback used to take note of the position of query terms
|
60 |
// inside the result text. This is then used to insert highlight tags.
|
60 |
// inside the result text. This is then used to insert highlight tags.
|
61 |
class myTextSplitCB : public TextSplitCB {
|
61 |
class TextSplitPTR : public TextSplit {
|
62 |
public:
|
62 |
public:
|
63 |
|
63 |
|
64 |
// Out: begin and end byte positions of query terms/groups in text
|
64 |
// Out: begin and end byte positions of query terms/groups in text
|
65 |
vector<pair<int, int> > tboffs;
|
65 |
vector<pair<int, int> > tboffs;
|
66 |
|
66 |
|
67 |
myTextSplitCB(const vector<string>& its,
|
67 |
TextSplitPTR(const vector<string>& its,
|
68 |
const vector<vector<string> >&groups,
|
68 |
const vector<vector<string> >&groups,
|
69 |
const vector<int>& slacks)
|
69 |
const vector<int>& slacks)
|
70 |
: m_wcount(0), m_groups(groups), m_slacks(slacks)
|
70 |
: m_wcount(0), m_groups(groups), m_slacks(slacks)
|
71 |
{
|
71 |
{
|
72 |
for (vector<string>::const_iterator it = its.begin();
|
72 |
for (vector<string>::const_iterator it = its.begin();
|
73 |
it != its.end(); it++) {
|
73 |
it != its.end(); it++) {
|
74 |
m_terms.insert(*it);
|
74 |
m_terms.insert(*it);
|
|
... |
|
... |
84 |
|
84 |
|
85 |
// Callback called by the text-to-words breaker for each word
|
85 |
// Callback called by the text-to-words breaker for each word
|
86 |
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
86 |
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
87 |
string dumb;
|
87 |
string dumb;
|
88 |
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
|
88 |
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
|
89 |
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
|
89 |
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
|
|
|
90 |
term.c_str()));
|
90 |
return true;
|
91 |
return true;
|
91 |
}
|
92 |
}
|
92 |
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
93 |
//LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(),
|
93 |
// pos, bts, bte));
|
94 |
// pos, bts, bte));
|
94 |
|
95 |
|
|
... |
|
... |
184 |
}
|
185 |
}
|
185 |
return false;
|
186 |
return false;
|
186 |
}
|
187 |
}
|
187 |
|
188 |
|
188 |
// Check if there is a NEAR match for the group of terms
|
189 |
// Check if there is a NEAR match for the group of terms
|
189 |
bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
|
190 |
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
|
190 |
{
|
191 |
{
|
191 |
LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
|
192 |
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
|
192 |
vecStringToString(terms).c_str()));
|
193 |
vecStringToString(terms).c_str()));
|
193 |
|
194 |
|
194 |
// The position lists we are going to work with. We extract them from the
|
195 |
// The position lists we are going to work with. We extract them from the
|
195 |
// (string->plist) map
|
196 |
// (string->plist) map
|
196 |
vector<vector<int>* > plists;
|
197 |
vector<vector<int>* > plists;
|
|
... |
|
... |
205 |
// stem-expanded: we don't know which matched)
|
206 |
// stem-expanded: we don't know which matched)
|
206 |
for (vector<string>::const_iterator it = terms.begin();
|
207 |
for (vector<string>::const_iterator it = terms.begin();
|
207 |
it != terms.end(); it++) {
|
208 |
it != terms.end(); it++) {
|
208 |
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
209 |
map<string, vector<int> >::iterator pl = m_plists.find(*it);
|
209 |
if (pl == m_plists.end()) {
|
210 |
if (pl == m_plists.end()) {
|
210 |
LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
|
211 |
LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
|
211 |
(*it).c_str()));
|
212 |
(*it).c_str()));
|
212 |
continue;
|
213 |
continue;
|
213 |
}
|
214 |
}
|
214 |
plists.push_back(&(pl->second));
|
215 |
plists.push_back(&(pl->second));
|
215 |
plistToTerm[&(pl->second)] = *it;
|
216 |
plistToTerm[&(pl->second)] = *it;
|
216 |
realgroup.push_back(*it);
|
217 |
realgroup.push_back(*it);
|
217 |
}
|
218 |
}
|
218 |
LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n",
|
219 |
LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n",
|
219 |
window, vecStringToString(realgroup).c_str()));
|
220 |
window, vecStringToString(realgroup).c_str()));
|
220 |
if (plists.size() < 2) {
|
221 |
if (plists.size() < 2) {
|
221 |
LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
|
222 |
LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
|
222 |
return false;
|
223 |
return false;
|
223 |
}
|
224 |
}
|
224 |
// Sort the positions lists so that the shorter is first
|
225 |
// Sort the positions lists so that the shorter is first
|
225 |
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
226 |
std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
|
226 |
|
227 |
|
|
... |
|
... |
241 |
it != plists[0]->end(); it++) {
|
242 |
it != plists[0]->end(); it++) {
|
242 |
int pos = *it;
|
243 |
int pos = *it;
|
243 |
int sta = int(10E9), sto = 0;
|
244 |
int sta = int(10E9), sto = 0;
|
244 |
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
245 |
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
245 |
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
|
246 |
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
|
246 |
LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n",
|
247 |
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
|
247 |
sta, sto));
|
248 |
sta, sto));
|
248 |
// Maybe extend the window by 1st term position, this was not
|
249 |
// Maybe extend the window by 1st term position, this was not
|
249 |
// done by do_prox..
|
250 |
// done by do_prox..
|
250 |
SETMINMAX(pos, sta, sto);
|
251 |
SETMINMAX(pos, sta, sto);
|
251 |
// Translate the position window into a byte offset window
|
252 |
// Translate the position window into a byte offset window
|
252 |
int bs = 0;
|
253 |
int bs = 0;
|
253 |
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
254 |
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
254 |
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
255 |
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
255 |
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
256 |
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
256 |
LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
|
257 |
LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
|
257 |
i1->second.first, i2->second.second));
|
258 |
i1->second.first, i2->second.second));
|
258 |
tboffs.push_back(pair<int, int>(i1->second.first,
|
259 |
tboffs.push_back(pair<int, int>(i1->second.first,
|
259 |
i2->second.second));
|
260 |
i2->second.second));
|
260 |
bs = i1->second.first;
|
261 |
bs = i1->second.first;
|
261 |
} else {
|
262 |
} else {
|
|
... |
|
... |
276 |
return a.second > b.second;
|
277 |
return a.second > b.second;
|
277 |
}
|
278 |
}
|
278 |
};
|
279 |
};
|
279 |
|
280 |
|
280 |
// Do the phrase match thing, then merge the highlight lists
|
281 |
// Do the phrase match thing, then merge the highlight lists
|
281 |
bool myTextSplitCB::matchGroups()
|
282 |
bool TextSplitPTR::matchGroups()
|
282 |
{
|
283 |
{
|
283 |
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
284 |
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
284 |
vector<int>::const_iterator sit = m_slacks.begin();
|
285 |
vector<int>::const_iterator sit = m_slacks.begin();
|
285 |
for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
|
286 |
for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
|
286 |
matchGroup(*vit, *sit + (*vit).size());
|
287 |
matchGroup(*vit, *sit + (*vit).size());
|
|
... |
|
... |
331 |
}
|
332 |
}
|
332 |
|
333 |
|
333 |
// Compute the positions for the query terms. We use the text
|
334 |
// Compute the positions for the query terms. We use the text
|
334 |
// splitter to break the text into words, and compare the words to
|
335 |
// splitter to break the text into words, and compare the words to
|
335 |
// the search terms,
|
336 |
// the search terms,
|
336 |
myTextSplitCB cb(terms, groups, slacks);
|
337 |
TextSplitPTR splitter(terms, groups, slacks);
|
337 |
TextSplit splitter(&cb);
|
|
|
338 |
// Note: the splitter returns the term locations in byte, not
|
338 |
// Note: the splitter returns the term locations in byte, not
|
339 |
// character, offsets.
|
339 |
// character, offsets.
|
340 |
splitter.text_to_words(in);
|
340 |
splitter.text_to_words(in);
|
341 |
LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
|
341 |
LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
|
342 |
|
342 |
|
343 |
// Compute the positions for NEAR and PHRASE groups.
|
343 |
// Compute the positions for NEAR and PHRASE groups.
|
344 |
cb.matchGroups();
|
344 |
splitter.matchGroups();
|
345 |
|
345 |
|
346 |
out.clear();
|
346 |
out.clear();
|
347 |
out.push_back("");
|
347 |
out.push_back("");
|
348 |
list<string>::iterator olit = out.begin();
|
348 |
list<string>::iterator olit = out.begin();
|
349 |
|
349 |
|
|
... |
|
... |
351 |
*olit = header();
|
351 |
*olit = header();
|
352 |
|
352 |
|
353 |
// Iterator for the list of input term positions. We use it to
|
353 |
// Iterator for the list of input term positions. We use it to
|
354 |
// output highlight tags and to compute term positions in the
|
354 |
// output highlight tags and to compute term positions in the
|
355 |
// output text
|
355 |
// output text
|
356 |
vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
|
356 |
vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
|
357 |
vector<pair<int, int> >::iterator tPosEnd = cb.tboffs.end();
|
357 |
vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
|
358 |
|
358 |
|
359 |
#if 0
|
359 |
#if 0
|
360 |
for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
|
360 |
for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
|
361 |
it != cb.tboffs.end(); it++) {
|
361 |
it != splitter.tboffs.end(); it++) {
|
362 |
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
|
362 |
LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
|
363 |
}
|
363 |
}
|
364 |
#endif
|
364 |
#endif
|
365 |
|
365 |
|
366 |
// Input character iterator
|
366 |
// Input character iterator
|
|
... |
|
... |
410 |
*olit += endMatch();
|
410 |
*olit += endMatch();
|
411 |
*olit += endAnchor();
|
411 |
*olit += endAnchor();
|
412 |
}
|
412 |
}
|
413 |
// Skip all highlight areas that would overlap this one
|
413 |
// Skip all highlight areas that would overlap this one
|
414 |
int crend = tPosIt->second;
|
414 |
int crend = tPosIt->second;
|
415 |
while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
|
415 |
while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
|
416 |
tPosIt++;
|
416 |
tPosIt++;
|
417 |
inrcltag = 0;
|
417 |
inrcltag = 0;
|
418 |
}
|
418 |
}
|
419 |
}
|
419 |
}
|
420 |
|
420 |
|