Switch to unified view

a/src/query/plaintorich.cpp b/src/query/plaintorich.cpp
...
...
56
    return sterms;
56
    return sterms;
57
}
57
}
58
58
59
// Text splitter callback used to take note of the position of query terms 
59
// Text splitter callback used to take note of the position of query terms 
60
// inside the result text. This is then used to insert highlight tags. 
60
// inside the result text. This is then used to insert highlight tags. 
61
class myTextSplitCB : public TextSplitCB {
61
class TextSplitPTR : public TextSplit {
62
 public:
62
 public:
63
63
64
    // Out: begin and end byte positions of query terms/groups in text
64
    // Out: begin and end byte positions of query terms/groups in text
65
    vector<pair<int, int> > tboffs;  
65
    vector<pair<int, int> > tboffs;  
66
66
67
    myTextSplitCB(const vector<string>& its, 
67
    TextSplitPTR(const vector<string>& its, 
68
        const vector<vector<string> >&groups, 
68
                 const vector<vector<string> >&groups, 
69
        const vector<int>& slacks) 
69
                 const vector<int>& slacks) 
70
    :  m_wcount(0), m_groups(groups), m_slacks(slacks)
70
    :  m_wcount(0), m_groups(groups), m_slacks(slacks)
71
    {
71
    {
72
    for (vector<string>::const_iterator it = its.begin(); 
72
    for (vector<string>::const_iterator it = its.begin(); 
73
         it != its.end(); it++) {
73
         it != its.end(); it++) {
74
        m_terms.insert(*it);
74
        m_terms.insert(*it);
...
...
84
84
85
    // Callback called by the text-to-words breaker for each word
85
    // Callback called by the text-to-words breaker for each word
86
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
86
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
87
    string dumb;
87
    string dumb;
88
    if (!unacmaybefold(term, dumb, "UTF-8", true)) {
88
    if (!unacmaybefold(term, dumb, "UTF-8", true)) {
89
        LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n", term.c_str()));
89
        LOGINFO(("PlainToRich::splitter::takeword: unac failed for [%s]\n",
90
                     term.c_str()));
90
        return true;
91
        return true;
91
    }
92
    }
92
    //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
93
    //LOGDEB2(("Input dumbbed term: '%s' %d %d %d\n", dumb.c_str(), 
93
    // pos, bts, bte));
94
    // pos, bts, bte));
94
95
...
...
184
    }
185
    }
185
    return false;
186
    return false;
186
}
187
}
187
188
188
// Check if there is a NEAR match for the group of terms
189
// Check if there is a NEAR match for the group of terms
189
bool myTextSplitCB::matchGroup(const vector<string>& terms, int window)
190
bool TextSplitPTR::matchGroup(const vector<string>& terms, int window)
190
{
191
{
191
    LOGDEB0(("myTextSplitCB::matchGroup:d %d: %s\n", window,
192
    LOGDEB0(("TextSplitPTR::matchGroup:d %d: %s\n", window,
192
        vecStringToString(terms).c_str()));
193
        vecStringToString(terms).c_str()));
193
194
194
    // The position lists we are going to work with. We extract them from the 
195
    // The position lists we are going to work with. We extract them from the 
195
    // (string->plist) map
196
    // (string->plist) map
196
    vector<vector<int>* > plists;
197
    vector<vector<int>* > plists;
...
...
205
    // stem-expanded: we don't know which matched)
206
    // stem-expanded: we don't know which matched)
206
    for (vector<string>::const_iterator it = terms.begin(); 
207
    for (vector<string>::const_iterator it = terms.begin(); 
207
     it != terms.end(); it++) {
208
     it != terms.end(); it++) {
208
    map<string, vector<int> >::iterator pl = m_plists.find(*it);
209
    map<string, vector<int> >::iterator pl = m_plists.find(*it);
209
    if (pl == m_plists.end()) {
210
    if (pl == m_plists.end()) {
210
        LOGDEB0(("myTextSplitCB::matchGroup: [%s] not found in m_plists\n",
211
        LOGDEB0(("TextSplitPTR::matchGroup: [%s] not found in m_plists\n",
211
            (*it).c_str()));
212
            (*it).c_str()));
212
        continue;
213
        continue;
213
    }
214
    }
214
    plists.push_back(&(pl->second));
215
    plists.push_back(&(pl->second));
215
    plistToTerm[&(pl->second)] = *it;
216
    plistToTerm[&(pl->second)] = *it;
216
    realgroup.push_back(*it);
217
    realgroup.push_back(*it);
217
    }
218
    }
218
    LOGDEB0(("myTextSplitCB::matchGroup:d %d:real group after expansion %s\n", 
219
    LOGDEB0(("TextSplitPTR::matchGroup:d %d:real group after expansion %s\n", 
219
         window, vecStringToString(realgroup).c_str()));
220
         window, vecStringToString(realgroup).c_str()));
220
    if (plists.size() < 2) {
221
    if (plists.size() < 2) {
221
    LOGDEB0(("myTextSplitCB::matchGroup: no actual groups found\n"));
222
    LOGDEB0(("TextSplitPTR::matchGroup: no actual groups found\n"));
222
    return false;
223
    return false;
223
    }
224
    }
224
    // Sort the positions lists so that the shorter is first
225
    // Sort the positions lists so that the shorter is first
225
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
226
    std::sort(plists.begin(), plists.end(), VecIntCmpShorter());
226
227
...
...
241
     it != plists[0]->end(); it++) {
242
     it != plists[0]->end(); it++) {
242
    int pos = *it;
243
    int pos = *it;
243
    int sta = int(10E9), sto = 0;
244
    int sta = int(10E9), sto = 0;
244
    LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
245
    LOGDEB0(("MatchGroup: Testing at pos %d\n", pos));
245
    if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
246
    if (do_proximity_test(window, plists, 1, pos, pos, &sta, &sto)) {
246
        LOGDEB0(("myTextSplitCB::matchGroup: MATCH termpos [%d,%d]\n", 
247
        LOGDEB0(("TextSplitPTR::matchGroup: MATCH termpos [%d,%d]\n", 
247
             sta, sto)); 
248
             sta, sto)); 
248
        // Maybe extend the window by 1st term position, this was not
249
        // Maybe extend the window by 1st term position, this was not
249
        // done by do_prox..
250
        // done by do_prox..
250
        SETMINMAX(pos, sta, sto);
251
        SETMINMAX(pos, sta, sto);
251
        // Translate the position window into a byte offset window
252
        // Translate the position window into a byte offset window
252
        int bs = 0;
253
        int bs = 0;
253
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
254
        map<int, pair<int, int> >::iterator i1 =  m_gpostobytes.find(sta);
254
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
255
        map<int, pair<int, int> >::iterator i2 =  m_gpostobytes.find(sto);
255
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
256
        if (i1 != m_gpostobytes.end() && i2 != m_gpostobytes.end()) {
256
        LOGDEB0(("myTextSplitCB::matchGroup: pushing bpos %d %d\n",
257
        LOGDEB0(("TextSplitPTR::matchGroup: pushing bpos %d %d\n",
257
            i1->second.first, i2->second.second));
258
            i1->second.first, i2->second.second));
258
        tboffs.push_back(pair<int, int>(i1->second.first, 
259
        tboffs.push_back(pair<int, int>(i1->second.first, 
259
                        i2->second.second));
260
                        i2->second.second));
260
        bs = i1->second.first;
261
        bs = i1->second.first;
261
        } else {
262
        } else {
...
...
276
    return a.second > b.second;
277
    return a.second > b.second;
277
    }
278
    }
278
};
279
};
279
280
280
// Do the phrase match thing, then merge the highlight lists
281
// Do the phrase match thing, then merge the highlight lists
281
bool myTextSplitCB::matchGroups()
282
bool TextSplitPTR::matchGroups()
282
{
283
{
283
    vector<vector<string> >::const_iterator vit = m_groups.begin();
284
    vector<vector<string> >::const_iterator vit = m_groups.begin();
284
    vector<int>::const_iterator sit = m_slacks.begin();
285
    vector<int>::const_iterator sit = m_slacks.begin();
285
    for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
286
    for (; vit != m_groups.end() && sit != m_slacks.end(); vit++, sit++) {
286
    matchGroup(*vit, *sit + (*vit).size());
287
    matchGroup(*vit, *sit + (*vit).size());
...
...
331
    }
332
    }
332
333
333
    // Compute the positions for the query terms.  We use the text
334
    // Compute the positions for the query terms.  We use the text
334
    // splitter to break the text into words, and compare the words to
335
    // splitter to break the text into words, and compare the words to
335
    // the search terms,
336
    // the search terms,
336
    myTextSplitCB cb(terms, groups, slacks);
337
    TextSplitPTR splitter(terms, groups, slacks);
337
    TextSplit splitter(&cb);
338
    // Note: the splitter returns the term locations in byte, not
338
    // Note: the splitter returns the term locations in byte, not
339
    // character, offsets.
339
    // character, offsets.
340
    splitter.text_to_words(in);
340
    splitter.text_to_words(in);
341
    LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
341
    LOGDEB0(("plaintorich: split done %d mS\n", chron.millis()));
342
342
343
    // Compute the positions for NEAR and PHRASE groups.
343
    // Compute the positions for NEAR and PHRASE groups.
344
    cb.matchGroups();
344
    splitter.matchGroups();
345
345
346
    out.clear();
346
    out.clear();
347
    out.push_back("");
347
    out.push_back("");
348
    list<string>::iterator olit = out.begin();
348
    list<string>::iterator olit = out.begin();
349
349
...
...
351
    *olit = header();
351
    *olit = header();
352
352
353
    // Iterator for the list of input term positions. We use it to
353
    // Iterator for the list of input term positions. We use it to
354
    // output highlight tags and to compute term positions in the
354
    // output highlight tags and to compute term positions in the
355
    // output text
355
    // output text
356
    vector<pair<int, int> >::iterator tPosIt = cb.tboffs.begin();
356
    vector<pair<int, int> >::iterator tPosIt = splitter.tboffs.begin();
357
    vector<pair<int, int> >::iterator tPosEnd = cb.tboffs.end();
357
    vector<pair<int, int> >::iterator tPosEnd = splitter.tboffs.end();
358
358
359
#if 0
359
#if 0
360
    for (vector<pair<int, int> >::const_iterator it = cb.tboffs.begin();
360
    for (vector<pair<int, int> >::const_iterator it = splitter.tboffs.begin();
361
     it != cb.tboffs.end(); it++) {
361
     it != splitter.tboffs.end(); it++) {
362
    LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
362
    LOGDEB2(("plaintorich: region: %d %d\n", it->first, it->second));
363
    }
363
    }
364
#endif
364
#endif
365
365
366
    // Input character iterator
366
    // Input character iterator
...
...
410
            *olit += endMatch();
410
            *olit += endMatch();
411
            *olit += endAnchor();
411
            *olit += endAnchor();
412
        }
412
        }
413
        // Skip all highlight areas that would overlap this one
413
        // Skip all highlight areas that would overlap this one
414
        int crend = tPosIt->second;
414
        int crend = tPosIt->second;
415
        while (tPosIt != cb.tboffs.end() && tPosIt->first < crend)
415
        while (tPosIt != splitter.tboffs.end() && tPosIt->first < crend)
416
            tPosIt++;
416
            tPosIt++;
417
                inrcltag = 0;
417
                inrcltag = 0;
418
        }
418
        }
419
    }
419
    }
420
        
420