Switch to unified view

a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp
...
...
73
    :  m_wcount(0), m_hdata(hdata)
73
    :  m_wcount(0), m_hdata(hdata)
74
    {
74
    {
75
    // We separate single terms and groups and extract the group
75
    // We separate single terms and groups and extract the group
76
    // terms for computing positions list before looking for group
76
    // terms for computing positions list before looking for group
77
    // matches
77
    // matches
78
79
    for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
78
    for (vector<vector<string> >::const_iterator vit = hdata.groups.begin();
80
         vit != hdata.groups.end(); vit++) {
79
         vit != hdata.groups.end(); vit++) {
81
        if (vit->size() == 1) {
80
        if (vit->size() == 1) {
82
#ifndef RCL_INDEX_STRIPCHARS
83
      if (o_index_stripchars) {
84
#endif
85
            m_terms[vit->front()] = vit - hdata.groups.begin();
81
        m_terms[vit->front()] = vit - hdata.groups.begin();
86
#ifndef RCL_INDEX_STRIPCHARS
87
      } else {
88
          string dumb = vit->front();
89
          unacmaybefold(vit->front(), dumb, "UTF-8", UNACOP_UNACFOLD);
90
          m_terms[dumb] = vit - hdata.groups.begin();
91
      }
92
#endif
93
        } else if (vit->size() > 1) {
82
        } else if (vit->size() > 1) {
94
        for (vector<string>::const_iterator it = vit->begin(); 
83
        for (vector<string>::const_iterator it = vit->begin(); 
95
             it != vit->end(); it++) {
84
             it != vit->end(); it++) {
96
#ifndef RCL_INDEX_STRIPCHARS
97
      if (o_index_stripchars) {
98
#endif
99
            m_gterms.insert(*it);
85
            m_gterms.insert(*it);
100
#ifndef RCL_INDEX_STRIPCHARS
101
      } else {
102
          string dumb = *it;
103
          unacmaybefold(*it, dumb, "UTF-8", UNACOP_UNACFOLD);
104
          m_gterms.insert(dumb);
105
      }
106
#endif
107
        }
86
        }
108
        }
87
        }
109
    }
88
    }
110
    }
89
    }
111
90
112
    // Accept word and its position. If word is search term, add
91
    // Accept word and its position. If word is search term, add
113
    // highlight zone definition. If word is part of search group
92
    // highlight zone definition. If word is part of search group
114
    // (phrase or near), update positions list.
93
    // (phrase or near), update positions list.
115
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
94
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
116
    string dumb;
95
    string dumb = term;
96
#ifndef RCL_INDEX_STRIPCHARS
97
  if (o_index_stripchars) {
98
#endif
117
    if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
99
        if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
118
        LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
100
      LOGINFO(("PlainToRich::takeword: unac failed for [%s]\n",
119
                     term.c_str()));
101
           term.c_str()));
120
        return true;
102
      return true;
103
      }
104
#ifndef RCL_INDEX_STRIPCHARS
121
    }
105
    }
106
#endif
107
122
    //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
108
    //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
123
    // pos, bts, bte));
109
    // pos, bts, bte));
124
110
125
    // If this word is a search term, remember its byte-offset span. 
111
    // If this word is a search term, remember its byte-offset span. 
126
    map<string, unsigned int>::const_iterator it = m_terms.find(dumb);
112
    map<string, unsigned int>::const_iterator it = m_terms.find(dumb);
...
...
194
//    make no sense for highlighting.
180
//    make no sense for highlighting.
195
static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
181
static bool do_proximity_test(int window, vector<vector<int>* >& plists, 
196
                  unsigned int i, int min, int max, 
182
                  unsigned int i, int min, int max, 
197
                  int *sp, int *ep, int minpos)
183
                  int *sp, int *ep, int minpos)
198
{
184
{
199
    LOGDEB0(("do_prox_test: win %d i %d min %d max %d minpos %d\n", 
185
    LOGDEB1(("do_prox_test: win %d i %d min %d max %d minpos %d\n", 
200
         window, i, min, max, minpos));
186
         window, i, min, max, minpos));
201
    int tmp = max + 1 - window;
187
    int tmp = max + 1 - window;
202
    if (tmp < minpos)
188
    if (tmp < minpos)
203
    tmp = minpos;
189
    tmp = minpos;
204
190
...
...
232
bool TextSplitPTR::matchGroup(unsigned int grpidx)
218
bool TextSplitPTR::matchGroup(unsigned int grpidx)
233
{
219
{
234
    const vector<string>& terms = m_hdata.groups[grpidx];
220
    const vector<string>& terms = m_hdata.groups[grpidx];
235
    int window = m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx];
221
    int window = m_hdata.groups[grpidx].size() + m_hdata.slacks[grpidx];
236
222
237
    LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
223
    LOGDEB1(("TextSplitPTR::matchGroup:d %d: %s\n", window,
238
        vecStringToString(terms).c_str()));
224
        vecStringToString(terms).c_str()));
239
225
240
    // The position lists we are going to work with. We extract them from the 
226
    // The position lists we are going to work with. We extract them from the 
241
    // (string->plist) map
227
    // (string->plist) map
242
    vector<vector<int>* > plists;
228
    vector<vector<int>* > plists;
...
...
249
    // the search, so that some terms are not found.
235
    // the search, so that some terms are not found.
250
    for (vector<string>::const_iterator it = terms.begin(); 
236
    for (vector<string>::const_iterator it = terms.begin(); 
251
     it != terms.end(); it++) {
237
     it != terms.end(); it++) {
252
    map<string, vector<int> >::iterator pl = m_plists.find(*it);
238
    map<string, vector<int> >::iterator pl = m_plists.find(*it);
253
    if (pl == m_plists.end()) {
239
    if (pl == m_plists.end()) {
254
        LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
240
        LOGDEB1(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
255
            (*it).c_str()));
241
            (*it).c_str()));
256
        return false;
242
        return false;
257
    }
243
    }
258
    plists.push_back(&(pl->second));
244
    plists.push_back(&(pl->second));
259
    plistToTerm[&(pl->second)] = *it;
245
    plistToTerm[&(pl->second)] = *it;
260
    }
246
    }
261
    // I think this can't actually happen, was useful when we used to
247
    // I think this can't actually happen, was useful when we used to
262
    // prune the groups, but doesn't hurt.
248
    // prune the groups, but doesn't hurt.
263
    if (plists.size() < 2) {
249
    if (plists.size() < 2) {
264
    LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
250
    LOGDEB1(("TextSplitPTR::matchGroup: no actual groups found\n"));
265
    return false;
251
    return false;
266
    }
252
    }
267
    // Sort the positions lists so that the shorter is first
253
    // Sort the positions lists so that the shorter is first
268
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
254
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
269
255
...
...
273
    if (it == plistToTerm.end()) {
259
    if (it == plistToTerm.end()) {
274
        // SuperWeird
260
        // SuperWeird
275
        LOGERR(("matchGroup: term for first list not found !?!\n"));
261
        LOGERR(("matchGroup: term for first list not found !?!\n"));
276
        return false;
262
        return false;
277
    }
263
    }
278
    LOGDEB0(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
264
    LOGDEB1(("matchGroup: walking the shortest plist. Term [%s], len %d\n",
279
        it->second.c_str(), plists[0]->size()));
265
        it->second.c_str(), plists[0]->size()));
280
    }
266
    }
281
267
282
    // Minpos is the highest end of a found match. While looking for
268
    // Minpos is the highest end of a found match. While looking for
283
    // further matches, we don't want the search to extend before
269
    // further matches, we don't want the search to extend before
...
...
287
    // Walk the shortest plist and look for matches
273
    // Walk the shortest plist and look for matches
288
    for (vector<int>::iterator it = plists[0]->begin(); 
274
    for (vector<int>::iterator it = plists[0]->begin(); 
289
     it != plists[0]->end(); it++) {
275
     it != plists[0]->end(); it++) {
290
    int pos = *it;
276
    int pos = *it;
291
    int sta = int(10E9), sto = 0;
277
    int sta = int(10E9), sto = 0;
292
    LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
278
    LOGDEB2(("MatchGroup: Testing at pos %d\n", pos));
293
    if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
279
    if (do_proximity_test(window,plists, 1, pos, pos, &sta, &sto, minpos)) {
294
        LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
280
        LOGDEB1(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
295
             sta, sto)); 
281
             sta, sto)); 
296
        // Maybe extend the window by 1st term position, this was not
282
        // Maybe extend the window by 1st term position, this was not
297
        // done by do_prox..
283
        // done by do_prox..
298
        SETMINMAX(pos, sta, sto);
284
        SETMINMAX(pos, sta, sto);
299
        minpos = sto+1;
285
        minpos = sto+1;
300
        // Translate the position window into a byte offset window
286
        // Translate the position window into a byte offset window
301
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
287
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
302
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
288
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
303
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
289
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
304
        LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
290
        LOGDEB2(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
305
            i1->second.first, i2->second.second));
291
            i1->second.first, i2->second.second));
306
        tboffs.push_back(MatchEntry(i1->second.first, 
292
        tboffs.push_back(MatchEntry(i1->second.first, 
307
                        i2->second.second, grpidx));
293
                        i2->second.second, grpidx));
308
        } else {
294
        } else {
309
        LOGDEB(("matchGroup: no bpos found for %d or %d\n", sta, sto));
295
        LOGDEB0(("matchGroup: no bpos found for %d or %d\n", sta, sto));
310
        }
296
        }
311
    } else {
297
    } else {
312
        LOGDEB1(("matchGroup: no group match found at this position\n"));
298
        LOGDEB1(("matchGroup: no group match found at this position\n"));
313
    }
299
    }
314
    }
300
    }
...
...
357
                  list<string>& out, // Output chunk list
343
                  list<string>& out, // Output chunk list
358
                  const HighlightData& hdata,
344
                  const HighlightData& hdata,
359
                  int chunksize)
345
                  int chunksize)
360
{
346
{
361
    Chrono chron;
347
    Chrono chron;
348
    bool ret = true;
349
    LOGDEB1(("plaintorichich: in: [%s]\n", in.c_str()));
362
350
363
    m_hdata = &hdata;
351
    m_hdata = &hdata;
364
    // Compute the positions for the query terms.  We use the text
352
    // Compute the positions for the query terms.  We use the text
365
    // splitter to break the text into words, and compare the words to
353
    // splitter to break the text into words, and compare the words to
366
    // the search terms,
354
    // the search terms,
...
...
377
    out.push_back("");
365
    out.push_back("");
378
    list<string>::iterator olit = out.begin();
366
    list<string>::iterator olit = out.begin();
379
367
380
    // Rich text output
368
    // Rich text output
381
    *olit = header();
369
    *olit = header();
370
371
    // No term matches. Happens, for example on a snippet selected for
372
    // a term match when we are actually looking for a group match
373
    // (the snippet generator does this...).
374
    if (splitter.tboffs.empty()) {
375
  LOGDEB1(("plaintorich: no term matches\n"));
376
  ret = false;
377
    }
382
378
383
    // Iterator for the list of input term positions. We use it to
379
    // Iterator for the list of input term positions. We use it to
384
    // output highlight tags and to compute term positions in the
380
    // output highlight tags and to compute term positions in the
385
    // output text
381
    // output text
386
    vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
382
    vector<MatchEntry>::iterator tPosIt = splitter.tboffs.begin();
...
...
548
    fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
544
    fprintf(fp, "ENDOFPLAINTORICHOUTPUT\n");
549
    fclose(fp);
545
    fclose(fp);
550
    }
546
    }
551
#endif
547
#endif
552
    LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));
548
    LOGDEB2(("plaintorich: done %d mS\n", chron.millis()));
553
    return true;
549
    return ret;
554
}
550
}