Switch to unified view

a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp
...
...
75
        m_gterms.insert(*it);
75
        m_gterms.insert(*it);
76
        }
76
        }
77
    }
77
    }
78
    }
78
    }
79
79
80
    // Callback called by the text-to-words breaker for each word
80
    // Accept word and its position. If word is search term, add
81
    // highlight zone definition. If word is part of search group
82
    // (phrase or near), update positions list.
81
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
83
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
82
    string dumb;
84
    string dumb;
83
    if (!unacmaybefold(term, dumb, "UTF-8", true)) {
85
    if (!unacmaybefold(term, dumb, "UTF-8", true)) {
84
        LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
86
        LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
85
                     term.c_str()));
87
                     term.c_str()));
...
...
91
    // If this word is a search term, remember its byte-offset span. 
93
    // If this word is a search term, remember its byte-offset span. 
92
    if (m_terms.find(dumb) != m_terms.end()) {
94
    if (m_terms.find(dumb) != m_terms.end()) {
93
        tboffs.push_back(pair<int, int>(bts, bte));
95
        tboffs.push_back(pair<int, int>(bts, bte));
94
    }
96
    }
95
    
97
    
98
  // If word is part of a search group, update its positions list
96
    if (m_gterms.find(dumb) != m_gterms.end()) {
99
    if (m_gterms.find(dumb) != m_gterms.end()) {
97
        // Term group (phrase/near) handling
100
        // Term group (phrase/near) handling
98
        m_plists[dumb].push_back(pos);
101
        m_plists[dumb].push_back(pos);
99
        m_gpostobytes[pos] = pair<int,int>(bts, bte);
102
        m_gpostobytes[pos] = pair<int,int>(bts, bte);
100
        //LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
103
        //LOGDEB2(("Recorded bpos for %d: %d %d\n", pos, bts, bte));
101
    }
104
    }
105
106
  // Check for cancellation request
102
    if ((m_wcount++ & 0xfff) == 0)
107
    if ((m_wcount++ & 0xfff) == 0)
103
        CancelCheck::instance().checkCancel();
108
        CancelCheck::instance().checkCancel();
109
104
    return true;
110
    return true;
105
    }
111
    }
106
112
107
    // Must be called after the split to find the phrase/near match positions
113
    // Must be called after the split to find the phrase/near match positions
108
    virtual bool matchGroups();
114
    virtual bool matchGroups();
...
...
138
144
139
#define SETMINMAX(POS, STA, STO)  {if ((POS) < (STA)) (STA) = (POS); \
145
#define SETMINMAX(POS, STA, STO)  {if ((POS) < (STA)) (STA) = (POS); \
140
    if ((POS) > (STO)) (STO) = (POS);}
146
    if ((POS) > (STO)) (STO) = (POS);}
141
147
142
// Recursively check that each term is inside the window (which is
148
// Recursively check that each term is inside the window (which is
143
// readjusted as the successive terms are found). i is the index for
149
// readjusted as the successive terms are found).
144
// the next position list to use (initially 1)
150
// @param window the search window width
151
// @param plists the position list vector
152
// @param i the position list to process (we then recurse with the next list)
153
// @param min the current minimum pos for a found term
154
// @param max the current maximum pos for a found term
155
// @param sp, ep output: the found area
156
// @param minpos bottom of search: this is the highest point of
157
//    any previous match. We don't look below this as overlapping matches 
158
//    make no sense for highlighting.
145
static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
159
static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
146
                  unsigned int i, int min, int max, 
160
                  unsigned int i, int min, int max, 
147
                  int *sp, int *ep)
161
                  int *sp, int *ep, int minpos)
148
{
162
{
163
    LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n", 
164
       window, i, min, max, minpos));
149
    int tmp = max + 1;
165
    int tmp = max + 1 - window;
150
    // take care to avoid underflow
166
    if (tmp < minpos)
151
    if (window <= tmp) 
152
    tmp -= window; 
167
    tmp = minpos;
153
    else 
168
154
  tmp = 0;
169
    // Find 1st position bigger than window start
155
    vector<int>::iterator it = plists[i]->begin();
170
    vector<int>::iterator it = plists[i]->begin();
156
157
    // Find 1st position bigger than window start
158
    while (it != plists[i]->end() && *it < tmp)
171
    while (it != plists[i]->end() && *it < tmp)
159
    it++;
172
    it++;
160
173
161
    // Try each position inside window in turn for match with other lists
174
    // Try each position inside window in turn for match with other lists
162
    while (it != plists[i]->end()) {
175
    while (it != plists[i]->end()) {
...
...
165
        return false;
178
        return false;
166
    if (i + 1 == plists.size()) {
179
    if (i + 1 == plists.size()) {
167
        SETMINMAX(pos, *sp, *ep);
180
        SETMINMAX(pos, *sp, *ep);
168
        return true;
181
        return true;
169
    }
182
    }
170
  if (pos < min) {
183
  SETMINMAX(pos, min, max);
171
      min = pos;
172
  } else if (pos > max) {
173
      max = pos;
174
  }
175
    if (do_proximity_test(window, plists, i + 1, min, max, sp, ep)) {
184
    if (do_proximity_test(window,plists, i + 1, min, max, sp, ep, minpos)) {
176
        SETMINMAX(pos, *sp, *ep);
185
        SETMINMAX(pos, *sp, *ep);
177
        return true;
186
        return true;
178
    }
187
    }
179
    it++;
188
    it++;
180
    }
189
    }
181
    return false;
190
    return false;
182
}
191
}
183
192
184
// Check if there is a NEAR match for the group of terms
193
// Find NEAR matches for the input group of terms, update highlight map
185
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
194
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
186
{
195
{
187
    LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
196
    LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
188
        vecStringToString(terms).c_str()));
197
        vecStringToString(terms).c_str()));
189
198
...
...
230
    }
239
    }
231
    LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
240
    LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
232
        it->second.c_str(), plists[0]->size()));
241
        it->second.c_str(), plists[0]->size()));
233
    }
242
    }
234
243
244
    // Minpos is the position just past the end of the last found match. While looking for
245
    // further matches, we don't want the search to extend before
246
    // this, because it does not make sense for highlight regions to
247
    // overlap
248
    int minpos = 0;
235
    // Walk the shortest plist and look for matches
249
    // Walk the shortest plist and look for matches
236
    for (vector<int>::iterator it = plists[0]->begin(); 
250
    for (vector<int>::iterator it = plists[0]->begin(); 
237
     it != plists[0]->end(); it++) {
251
     it != plists[0]->end(); it++) {
238
    int pos = *it;
252
    int pos = *it;
239
    int sta = int(10E9), sto = 0;
253
    int sta = int(10E9), sto = 0;
240
    LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
254
    LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
241
    if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
255
    if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
242
        LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
256
        LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
243
             sta, sto)); 
257
             sta, sto)); 
244
        // Maybe extend the window by 1st term position, this was not
258
        // Maybe extend the window by 1st term position, this was not
245
        // done by do_prox..
259
        // done by do_prox..
246
        SETMINMAX(pos, sta, sto);
260
        SETMINMAX(pos, sta, sto);
261
      minpos = sto+1;
247
        // Translate the position window into a byte offset window
262
        // Translate the position window into a byte offset window
248
        int bs = 0;
263
        int bs = 0;
249
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
264
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
250
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
265
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
251
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
266
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
...
...
255
                        i2->second.second));
270
                        i2->second.second));
256
        bs = i1->second.first;
271
        bs = i1->second.first;
257
        } else {
272
        } else {
258
        LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
273
        LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
259
        }
274
        }
275
  } else {
276
      LOGDEB0(("matchGroup: no group match found at this position\n"));
260
    }
277
    }
261
    }
278
    }
262
279
263
    return true;
280
    return true;
264
}
281
}
...
...
271
        return a.first < b.first;
288
        return a.first < b.first;
272
    return a.second > b.second;
289
    return a.second > b.second;
273
    }
290
    }
274
};
291
};
275
292
276
// Do the phrase match thing, then merge the highlight lists
293
// Look for matches to PHRASE and NEAR term groups. Actually, we
294
// handle all groups as NEAR (ignore order).
277
bool TextSplitPTR::matchGroups()
295
bool TextSplitPTR::matchGroups()
278
{
296
{
279
    vector<vector<string> >::const_iterator vit = m_groups.begin();
297
    vector<vector<string> >::const_iterator vit = m_groups.begin();
280
    vector<int>::const_iterator sit = m_slacks.begin();
298
    vector<int>::const_iterator sit = m_slacks.begin();
281
    for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
299
    for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
282
    matchGroup(*vit, *sit + (*vit).size());
300
    matchGroup(*vit, *sit + (*vit).size());
283
    }
301
    }
284
302
285
    // Sort by start and end offsets. The merging of overlapping entries
303
    // Sort regions by increasing start and decreasing width.  
286
    // will be handled during output.
304
    // The output process will skip overlapping entries.
287
    std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
305
    std::sort(tboffs.begin(), tboffs.end(), PairIntCmpFirst());
288
    return true;
306
    return true;
289
}
307
}
290
308
291
309
292
// Fix result text for display inside the gui text window.
310
// Fix result text for display inside the gui text window.
293
//
311
//
294
// To compute the term character positions in the output text, we used
312
// We call overridden functions to output header data, beginnings and ends of
295
// to emulate how qt's textedit counts chars (ignoring tags and
313
// matches etc.
296
// duplicate whitespace etc...). This was tricky business, dependent
297
// on qtextedit internals, and we don't do it any more, so we finally
298
// don't know the term paragraph/character positions in the editor text.
299
// Instead, we now mark the search term positions with html anchors
300
//
314
//
301
// We output the result in chunks, arranging not to cut in the middle of
315
// If the input is text, we output the result in chunks, arranging not
302
// a tag, which would confuse qtextedit.
316
// to cut in the middle of a tag, which would confuse qtextedit. If
317
// the input is html, the body is always a single output chunk.
303
bool PlainToRich::plaintorich(const string& in, 
318
bool PlainToRich::plaintorich(const string& in, 
304
                  list<string>& out, // Output chunk list
319
                  list<string>& out, // Output chunk list
305
                  const HiliteData& hdata,
320
                  const HiliteData& hdata,
306
                  int chunksize)
321
                  int chunksize)
307
{
322
{
...
...
309
    const vector<string>& terms(hdata.terms);
324
    const vector<string>& terms(hdata.terms);
310
    const vector<vector<string> >& groups(hdata.groups);
325
    const vector<vector<string> >& groups(hdata.groups);
311
    const vector<int>& slacks(hdata.gslks);
326
    const vector<int>& slacks(hdata.gslks);
312
327
313
    if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
328
    if (0 && DebugLog::getdbl()->getlevel() >= DEBDEB0) {
314
  LOGDEB0(("plaintorich: terms: \n"));
315
    string sterms = vecStringToString(terms);
329
    string sterms = vecStringToString(terms);
316
    LOGDEB0(("  %s\n", sterms.c_str()));
330
    LOGDEB0(("plaintorich: terms: %s\n", sterms.c_str()));
317
  sterms = "\n";
331
  sterms.clear();
318
  LOGDEB0(("plaintorich: groups: \n"));
319
    for (vector<vector<string> >::const_iterator vit = groups.begin(); 
332
    for (vector<vector<string> >::const_iterator vit = groups.begin(); 
320
         vit != groups.end(); vit++) {
333
         vit != groups.end(); vit++) {
321
        sterms += "GROUP: ";
334
        sterms += "GROUP: ";
322
        sterms += vecStringToString(*vit);
335
        sterms += vecStringToString(*vit);
323
        sterms += "\n";
336
        sterms += "\n";
324
    }
337
    }
325
    LOGDEB0(("  %s", sterms.c_str()));
338
    LOGDEB0(("plaintorich: groups:\n %s", sterms.c_str()));
326
        LOGDEB2(("  TEXT:[%s]\n", in.c_str()));
339
        LOGDEB2(("  TEXT:[%s]\n", in.c_str()));
327
    }
340
    }
328
341
329
    // Compute the positions for the query terms.  We use the text
342
    // Compute the positions for the query terms.  We use the text
330
    // splitter to break the text into words, and compare the words to
343
    // splitter to break the text into words, and compare the words to
...
...
392
    // If we still have terms positions, check (byte) position. If
405
    // If we still have terms positions, check (byte) position. If
393
    // we are at or after a term match, mark.
406
    // we are at or after a term match, mark.
394
    if (tPosIt != tPosEnd) {
407
    if (tPosIt != tPosEnd) {
395
        int ibyteidx = chariter.getBpos();
408
        int ibyteidx = chariter.getBpos();
396
        if (ibyteidx == tPosIt->first) {
409
        if (ibyteidx == tPosIt->first) {
397
        if (!intag && ibyteidx > (int)headend) {
410
        if (!intag && ibyteidx >= (int)headend) {
398
            *olit += startAnchor(anchoridx);
411
            *olit += startAnchor(anchoridx);
399
            *olit += startMatch();
412
            *olit += startMatch();
400
        }
413
        }
401
        anchoridx++;
414
        anchoridx++;
402
                inrcltag = 1;
415
                inrcltag = 1;