|
a/src/query/plaintorich.cpp |
|
b/src/query/plaintorich.cpp |
|
... |
|
... |
75 |
m_gterms.insert(*it);
|
75 |
m_gterms.insert(*it);
|
76 |
}
|
76 |
}
|
77 |
}
|
77 |
}
|
78 |
}
|
78 |
}
|
79 |
|
79 |
|
80 |
// Callback called by the text-to-words breaker for each word
|
80 |
// Accept word and its position. If word is search term, add
|
|
|
81 |
// highlight zone definition. If word is part of search group
|
|
|
82 |
// (phrase or near), update positions list.
|
81 |
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
83 |
virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
|
82 |
string dumb;
|
84 |
string dumb;
|
83 |
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
|
85 |
if (!unacmaybefold(term, dumb, "UTF-8", true)) {
|
84 |
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
|
86 |
LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
|
85 |
term.c_str()));
|
87 |
term.c_str()));
|
|
... |
|
... |
91 |
// If this word is a search term, remember its byte-offset span.
|
93 |
// If this word is a search term, remember its byte-offset span.
|
92 |
if (m_terms.find(dumb) != m_terms.end()) {
|
94 |
if (m_terms.find(dumb) != m_terms.end()) {
|
93 |
tboffs.push_back(pair<int, int>(bts, bte));
|
95 |
tboffs.push_back(pair<int, int>(bts, bte));
|
94 |
}
|
96 |
}
|
95 |
|
97 |
|
|
|
98 |
// If word is part of a search group, update its positions list
|
96 |
if (m_gterms.find(dumb) != m_gterms.end()) {
|
99 |
if (m_gterms.find(dumb) != m_gterms.end()) {
|
97 |
// Term group (phrase/near) handling
|
100 |
// Term group (phrase/near) handling
|
98 |
m_plists[dumb].push_back(pos);
|
101 |
m_plists[dumb].push_back(pos);
|
99 |
m_gpostobytes[pos] = pair<int,int>(bts, bte);
|
102 |
m_gpostobytes[pos] = pair<int,int>(bts, bte);
|
100 |
//LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
|
103 |
//LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
|
101 |
}
|
104 |
}
|
|
|
105 |
|
|
|
106 |
// Check for cancellation request
|
102 |
if ((m_wcount++ & 0xfff) == 0)
|
107 |
if ((m_wcount++ & 0xfff) == 0)
|
103 |
CancelCheck::instance().checkCancel();
|
108 |
CancelCheck::instance().checkCancel();
|
|
|
109 |
|
104 |
return true;
|
110 |
return true;
|
105 |
}
|
111 |
}
|
106 |
|
112 |
|
107 |
// Must be called after the split to find the phrase/near match positions
|
113 |
// Must be called after the split to find the phrase/near match positions
|
108 |
virtual bool matchGroups();
|
114 |
virtual bool matchGroups();
|
|
... |
|
... |
138 |
|
144 |
|
139 |
#define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \
|
145 |
#define SETMINMAX(POS, STA, STO) {if ((POS) < (STA)) (STA) = (POS); \
|
140 |
if ((POS) > (STO)) (STO) = (POS);}
|
146 |
if ((POS) > (STO)) (STO) = (POS);}
|
141 |
|
147 |
|
142 |
// Recursively check that each term is inside the window (which is
|
148 |
// Recursively check that each term is inside the window (which is
|
143 |
// readjusted as the successive terms are found). i is the index for
|
149 |
// readjusted as the successive terms are found).
|
144 |
// the next position list to use (initially 1)
|
150 |
// @param window the search window width
|
|
|
151 |
// @param plists the position list vector
|
|
|
152 |
// @param i the position list to process (we then recurse with the next list)
|
|
|
153 |
// @param min the current minimum pos for a found term
|
|
|
154 |
// @param max the current maximum pos for a found term
|
|
|
155 |
// @param sp, ep output: the found area
|
|
|
156 |
// @param minpos bottom of search: this is the highest point of
|
|
|
157 |
// any previous match. We don't look below this as overlapping matches
|
|
|
158 |
// make no sense for highlighting.
|
145 |
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
|
159 |
static bool do_proximity_test(int window, vector<vector<int>* >& plists,
|
146 |
unsigned int i, int min, int max,
|
160 |
unsigned int i, int min, int max,
|
147 |
int *sp, int *ep)
|
161 |
int *sp, int *ep, int minpos)
|
148 |
{
|
162 |
{
|
|
|
163 |
LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n",
|
|
|
164 |
window, i, min, max, minpos));
|
149 |
int tmp = max + 1;
|
165 |
int tmp = max + 1 - window;
|
150 |
// take care to avoid underflow
|
166 |
if (tmp < minpos)
|
151 |
if (window <= tmp)
|
|
|
152 |
tmp -= window;
|
167 |
tmp = minpos;
|
153 |
else
|
168 |
|
154 |
tmp = 0;
|
169 |
// Find 1st position bigger than window start
|
155 |
vector<int>::iterator it = plists[i]->begin();
|
170 |
vector<int>::iterator it = plists[i]->begin();
|
156 |
|
|
|
157 |
// Find 1st position bigger than window start
|
|
|
158 |
while (it != plists[i]->end() && *it < tmp)
|
171 |
while (it != plists[i]->end() && *it < tmp)
|
159 |
it++;
|
172 |
it++;
|
160 |
|
173 |
|
161 |
// Try each position inside window in turn for match with other lists
|
174 |
// Try each position inside window in turn for match with other lists
|
162 |
while (it != plists[i]->end()) {
|
175 |
while (it != plists[i]->end()) {
|
|
... |
|
... |
165 |
return false;
|
178 |
return false;
|
166 |
if (i + 1 == plists.size()) {
|
179 |
if (i + 1 == plists.size()) {
|
167 |
SETMINMAX(pos, *sp, *ep);
|
180 |
SETMINMAX(pos, *sp, *ep);
|
168 |
return true;
|
181 |
return true;
|
169 |
}
|
182 |
}
|
170 |
if (pos < min) {
|
183 |
SETMINMAX(pos, min, max);
|
171 |
min = pos;
|
|
|
172 |
} else if (pos > max) {
|
|
|
173 |
max = pos;
|
|
|
174 |
}
|
|
|
175 |
if (do_proximity_test(window, plists, i + 1, min, max, sp, ep)) {
|
184 |
if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
|
176 |
SETMINMAX(pos, *sp, *ep);
|
185 |
SETMINMAX(pos, *sp, *ep);
|
177 |
return true;
|
186 |
return true;
|
178 |
}
|
187 |
}
|
179 |
it++;
|
188 |
it++;
|
180 |
}
|
189 |
}
|
181 |
return false;
|
190 |
return false;
|
182 |
}
|
191 |
}
|
183 |
|
192 |
|
184 |
// Check if there is a NEAR match for the group of terms
|
193 |
// Find NEAR matches for the input group of terms, update highlight map
|
185 |
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
|
194 |
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
|
186 |
{
|
195 |
{
|
187 |
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
|
196 |
LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
|
188 |
vecStringToString(terms).c_str()));
|
197 |
vecStringToString(terms).c_str()));
|
189 |
|
198 |
|
|
... |
|
... |
230 |
}
|
239 |
}
|
231 |
LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
|
240 |
LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
|
232 |
it->second.c_str(), plists[0]->size()));
|
241 |
it->second.c_str(), plists[0]->size()));
|
233 |
}
|
242 |
}
|
234 |
|
243 |
|
|
|
244 |
// Minpos is the highest end of a found match. While looking for
|
|
|
245 |
// further matches, we don't want the search to extend before
|
|
|
246 |
// this, because it does not make sense for highlight regions to
|
|
|
247 |
// overlap
|
|
|
248 |
int minpos = 0;
|
235 |
// Walk the shortest plist and look for matches
|
249 |
// Walk the shortest plist and look for matches
|
236 |
for (vector<int>::iterator it = plists[0]->begin();
|
250 |
for (vector<int>::iterator it = plists[0]->begin();
|
237 |
it != plists[0]->end(); it++) {
|
251 |
it != plists[0]->end(); it++) {
|
238 |
int pos = *it;
|
252 |
int pos = *it;
|
239 |
int sta = int(10E9), sto = 0;
|
253 |
int sta = int(10E9), sto = 0;
|
240 |
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
254 |
LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
|
241 |
if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
|
255 |
if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
|
242 |
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
|
256 |
LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n",
|
243 |
sta, sto));
|
257 |
sta, sto));
|
244 |
// Maybe extend the window by 1st term position, this was not
|
258 |
// Maybe extend the window by 1st term position, this was not
|
245 |
// done by do_prox..
|
259 |
// done by do_prox..
|
246 |
SETMINMAX(pos, sta, sto);
|
260 |
SETMINMAX(pos, sta, sto);
|
|
|
261 |
minpos = sto+1;
|
247 |
// Translate the position window into a byte offset window
|
262 |
// Translate the position window into a byte offset window
|
248 |
int bs = 0;
|
263 |
int bs = 0;
|
249 |
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
264 |
map<int, pair<int, int> >::iterator i1 = m_gpostobytes.find(sta);
|
250 |
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
265 |
map<int, pair<int, int> >::iterator i2 = m_gpostobytes.find(sto);
|
251 |
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
266 |
if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
|
|
... |
|
... |
255 |
i2->second.second));
|
270 |
i2->second.second));
|
256 |
bs = i1->second.first;
|
271 |
bs = i1->second.first;
|
257 |
} else {
|
272 |
} else {
|
258 |
LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
|
273 |
LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
|
259 |
}
|
274 |
}
|
|
|
275 |
} else {
|
|
|
276 |
LOGDEB0(("matchGroup: no group match found at this position\n"));
|
260 |
}
|
277 |
}
|
261 |
}
|
278 |
}
|
262 |
|
279 |
|
263 |
return true;
|
280 |
return true;
|
264 |
}
|
281 |
}
|
|
... |
|
... |
271 |
return a.first < b.first;
|
288 |
return a.first < b.first;
|
272 |
return a.second > b.second;
|
289 |
return a.second > b.second;
|
273 |
}
|
290 |
}
|
274 |
};
|
291 |
};
|
275 |
|
292 |
|
276 |
// Do the phrase match thing, then merge the highlight lists
|
293 |
// Look for matches to PHRASE and NEAR term groups. Actually, we
|
|
|
294 |
// handle all groups as NEAR (ignore order).
|
277 |
bool TextSplitPTR::matchGroups()
|
295 |
bool TextSplitPTR::matchGroups()
|
278 |
{
|
296 |
{
|
279 |
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
297 |
vector<vector<string> >::const_iterator vit = m_groups.begin();
|
280 |
vector<int>::const_iterator sit = m_slacks.begin();
|
298 |
vector<int>::const_iterator sit = m_slacks.begin();
|
281 |
for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
|
299 |
for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
|
282 |
matchGroup(*vit, *sit + (*vit).size());
|
300 |
matchGroup(*vit, *sit + (*vit).size());
|
283 |
}
|
301 |
}
|
284 |
|
302 |
|
285 |
// Sort by start and end offsets. The merging of overlapping entries
|
303 |
// Sort regions by increasing start and decreasing width.
|
286 |
// will be handled during output.
|
304 |
// The output process will skip overlapping entries.
|
287 |
std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
|
305 |
std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
|
288 |
return true;
|
306 |
return true;
|
289 |
}
|
307 |
}
|
290 |
|
308 |
|
291 |
|
309 |
|
292 |
// Fix result text for display inside the gui text window.
|
310 |
// Fix result text for display inside the gui text window.
|
293 |
//
|
311 |
//
|
294 |
// To compute the term character positions in the output text, we used
|
312 |
// We call overridden functions to output header data, beginnings and ends of
|
295 |
// to emulate how qt's textedit counts chars (ignoring tags and
|
313 |
// matches etc.
|
296 |
// duplicate whitespace etc...). This was tricky business, dependant
|
|
|
297 |
// on qtextedit internals, and we don't do it any more, so we finally
|
|
|
298 |
// don't know the term par/car positions in the editor text.
|
|
|
299 |
// Instead, we now mark the search term positions with html anchors
|
|
|
300 |
//
|
314 |
//
|
301 |
// We output the result in chunks, arranging not to cut in the middle of
|
315 |
// If the input is text, we output the result in chunks, arranging not
|
302 |
// a tag, which would confuse qtextedit.
|
316 |
// to cut in the middle of a tag, which would confuse qtextedit. If
|
|
|
317 |
// the input is html, the body is always a single output chunk.
|
303 |
bool PlainToRich::plaintorich(const string& in,
|
318 |
bool PlainToRich::plaintorich(const string& in,
|
304 |
list<string>& out, // Output chunk list
|
319 |
list<string>& out, // Output chunk list
|
305 |
const HiliteData& hdata,
|
320 |
const HiliteData& hdata,
|
306 |
int chunksize)
|
321 |
int chunksize)
|
307 |
{
|
322 |
{
|
|
... |
|
... |
309 |
const vector<string>& terms(hdata.terms);
|
324 |
const vector<string>& terms(hdata.terms);
|
310 |
const vector<vector<string> >& groups(hdata.groups);
|
325 |
const vector<vector<string> >& groups(hdata.groups);
|
311 |
const vector<int>& slacks(hdata.gslks);
|
326 |
const vector<int>& slacks(hdata.gslks);
|
312 |
|
327 |
|
313 |
if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
|
328 |
if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
|
314 |
LOGDEB0(("plaintorich: terms: \n"));
|
|
|
315 |
string sterms = vecStringToString(terms);
|
329 |
string sterms = vecStringToString(terms);
|
316 |
LOGDEB0((" %s\n", sterms.c_str()));
|
330 |
LOGDEB0(("plaintorich: terms: %s\n", sterms.c_str()));
|
317 |
sterms = "\n";
|
331 |
sterms.clear();
|
318 |
LOGDEB0(("plaintorich: groups: \n"));
|
|
|
319 |
for (vector<vector<string> >::const_iterator vit = groups.begin();
|
332 |
for (vector<vector<string> >::const_iterator vit = groups.begin();
|
320 |
vit != groups.end(); vit++) {
|
333 |
vit != groups.end(); vit++) {
|
321 |
sterms += "GROUP: ";
|
334 |
sterms += "GROUP: ";
|
322 |
sterms += vecStringToString(*vit);
|
335 |
sterms += vecStringToString(*vit);
|
323 |
sterms += "\n";
|
336 |
sterms += "\n";
|
324 |
}
|
337 |
}
|
325 |
LOGDEB0((" %s", sterms.c_str()));
|
338 |
LOGDEB0(("plaintorich: groups:\n %s", sterms.c_str()));
|
326 |
LOGDEB2((" TEXT:[%s]\n", in.c_str()));
|
339 |
LOGDEB2((" TEXT:[%s]\n", in.c_str()));
|
327 |
}
|
340 |
}
|
328 |
|
341 |
|
329 |
// Compute the positions for the query terms. We use the text
|
342 |
// Compute the positions for the query terms. We use the text
|
330 |
// splitter to break the text into words, and compare the words to
|
343 |
// splitter to break the text into words, and compare the words to
|
|
... |
|
... |
392 |
// If we still have terms positions, check (byte) position. If
|
405 |
// If we still have terms positions, check (byte) position. If
|
393 |
// we are at or after a term match, mark.
|
406 |
// we are at or after a term match, mark.
|
394 |
if (tPosIt != tPosEnd) {
|
407 |
if (tPosIt != tPosEnd) {
|
395 |
int ibyteidx = chariter.getBpos();
|
408 |
int ibyteidx = chariter.getBpos();
|
396 |
if (ibyteidx == tPosIt->first) {
|
409 |
if (ibyteidx == tPosIt->first) {
|
397 |
if (!intag && ibyteidx > (int)headend) {
|
410 |
if (!intag && ibyteidx >= (int)headend) {
|
398 |
*olit += startAnchor(anchoridx);
|
411 |
*olit += startAnchor(anchoridx);
|
399 |
*olit += startMatch();
|
412 |
*olit += startMatch();
|
400 |
}
|
413 |
}
|
401 |
anchoridx++;
|
414 |
anchoridx++;
|
402 |
inrcltag = 1;
|
415 |
inrcltag = 1;
|