// src/rcldb/rclabsfromtext.cpp
/* Copyright (C) 2004-2017 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
#include "autoconfig.h"

#include <math.h>

#include <map>
#include <unordered_map>
#include <unordered_set>
#include <deque>
#include <vector>
#include <string>
#include <utility>
#include <algorithm>

#include "log.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "textsplit.h"
#include "hldata.h"
#include "chrono.h"
#include "unacpp.h"
#include "zlibut.h"
using namespace std;

// We now let plaintorich do the highlight tag insertions, which is a
// bit wasteful because we already have most of the information here
// (but the performance hit is small because it is only called on the
// output fragments, not on the whole text). The highlight zone
// computation code has been left around in case I change my mind.
#undef COMPUTE_HLZONES

namespace Rcl {

// Chars we turn to spaces in the snippets
static const string cstr_nc("\n\r\x0c\\");

// Fragment descriptor. A fragment is a text area with one or several
// matched terms and some context. It is ranked according to the
// matched term weights, and near/phrase matches get a boost.
struct MatchFragment {
    // Start/end byte offsets of the fragment in the document text
    int start;
    int stop;
    // Weight for this fragment (bigger is better)
    double coef;
#ifdef COMPUTE_HLZONES
    // Highlight areas (each is one or several contiguous match
    // terms). Because a fragment extends around a match, there
    // can be several contiguous or separate matches in a given
    // fragment.
    vector<pair<int,int>> hlzones;
#endif
    // Position of the first matched term (for page number computations)
    unsigned int hitpos;
    // "Best" term for this match (e.g. for use as an external app search term)
    string term;

    MatchFragment(int sta, int sto, double c,
#ifdef COMPUTE_HLZONES
                  vector<pair<int,int>>& hl,
#endif
                  unsigned int pos, string& trm)
        : start(sta), stop(sto), coef(c), hitpos(pos) {
#ifdef COMPUTE_HLZONES
        hlzones.swap(hl);
#endif
        term.swap(trm);
    }
};
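
// Overall approach: a single pass over the document text builds a list
// of MatchFragment entries (TextSplitABS::takeword() below), then
// updgroups() boosts the fragments containing phrase/near group
// matches, and abstractFromText() sorts the fragments by weight and
// turns the best ones into output snippets.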

// Text splitter for finding the match areas in the document text.
class TextSplitABS : public TextSplit {
public:

    TextSplitABS(const vector<string>& matchTerms,
                 const HighlightData& hdata,
                 unordered_map<string, double>& wordcoefs,
                 unsigned int ctxwords,
                 Flags flags = TXTS_NONE)
        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
          m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {

        // Take note of the group (phrase/near) terms because we need
        // to compute the position lists for them.
        for (const auto& group : hdata.groups) {
            if (group.size() > 1) {
                for (const auto& term : group) {
                    m_gterms.insert(term);
                }
            }
        }
    }
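
    // How fragments are built: m_prevterms keeps a sliding window of
    // the m_ctxwords previous word boundaries, so that a fragment can
    // start a little before its first matched term. When a term
    // matches, a fragment is opened (or the current one is extended)
    // and m_remainingWords is reset, keeping the fragment open for up
    // to m_ctxwords more words after the last hit.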

    // Accept a word and its position. If the word is a matched term,
    // add/update fragment definition.
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
        LOGDEB2("takeword: " << term << endl);

        // Remember recent past
        m_prevterms.push_back(pair<int,int>(bts, bte));
        if (m_prevterms.size() > m_ctxwords + 1) {
            m_prevterms.pop_front();
        }

        string dumb;
        if (o_index_stripchars) {
            if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
                LOGINFO("abstract: unac failed for [" << term << "]\n");
                return true;
            }
        } else {
            dumb = term;
        }

        if (m_terms.find(dumb) != m_terms.end()) {
            // This word is a search term. Extend or create a fragment
            LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first <<
                    ", " << m_curfrag.second << " remain " <<
                    m_remainingWords << endl);
            double coef = m_wordcoefs[dumb];
            if (!m_remainingWords) {
                // No current fragment. Start one
                m_curhitpos = baseTextPosition + pos;
                m_curfrag.first = m_prevterms.front().first;
                m_curfrag.second = m_prevterms.back().second;
#ifdef COMPUTE_HLZONES
                m_curhlzones.push_back(pair<int,int>(bts, bte));
#endif
                m_curterm = term;
                m_curtermcoef = coef;
            } else {
                LOGDEB2("Extending current fragment: " << m_remainingWords <<
                        " -> " << m_ctxwords << endl);
                m_extcount++;
#ifdef COMPUTE_HLZONES
                if (m_prevwordhit) {
                    m_curhlzones.back().second = bte;
                } else {
                    m_curhlzones.push_back(pair<int,int>(bts, bte));
                }
#endif
                if (coef > m_curtermcoef) {
                    m_curterm = term;
                    m_curtermcoef = coef;
                }
            }

#ifdef COMPUTE_HLZONES
            m_prevwordhit = true;
#endif
            m_curfragcoef += coef;
            m_remainingWords = m_ctxwords + 1;
            if (m_extcount > 3) {
                // Limit the expansion of contiguous fragments: this
                // avoids common search terms producing long,
                // heavyweight, meaningless fragments, and also limits
                // the fragment length.
                m_remainingWords = 1;
                m_extcount = 0;
            }

            // If the term is part of a near/phrase group, update its
            // positions list
            if (m_gterms.find(dumb) != m_gterms.end()) {
                // Term group (phrase/near) handling
                m_plists[dumb].push_back(pos);
                m_gpostobytes[pos] = pair<int,int>(bts, bte);
                LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
                        bte << "\n");
            }
        }
#ifdef COMPUTE_HLZONES
        else {
            // Not a matched term
            m_prevwordhit = false;
        }
#endif

        if (m_remainingWords) {
            // A fragment is currently open. Time to close it?
            m_remainingWords--;
            m_curfrag.second = bte;
            if (m_remainingWords == 0) {
                if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) {
                    // Don't push bad fragments if we already have a lot
                    m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                        m_curfrag.second,
                                                        m_curfragcoef,
#ifdef COMPUTE_HLZONES
                                                        m_curhlzones,
#endif
                                                        m_curhitpos,
                                                        m_curterm
                                              ));
                }
                m_totalcoef += m_curfragcoef;
                m_curfragcoef = 0.0;
                m_curtermcoef = 0.0;
            }
        }
        return true;
    }

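    // Return the fragments built during the split pass, in text order.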
    const vector<MatchFragment>& getFragments() {
        return m_fragments;
    }

    // After the text is split: use the group term position lists to
    // find the group matches. We process everything as NEAR (no
    // PHRASE-specific processing).
    void updgroups() {
        vector<GroupMatchEntry> tboffs;

        // Look for matches to the PHRASE and NEAR term groups and
        // finalize the matched regions list (sort it by increasing
        // start, then decreasing length). We process all groups as
        // NEAR (ignoring term order).
        for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
            if (m_hdata.groups[i].size() > 1) {
                matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
            }
        }

        // Sort the fragments by increasing start and decreasing width
        std::sort(m_fragments.begin(), m_fragments.end(),
                  [](const MatchFragment& a, const MatchFragment& b) -> bool {
                      if (a.start != b.start)
                          return a.start < b.start;
                      return a.stop - a.start > b.stop - b.start;
                  }
            );

        // Sort the group regions by increasing start and decreasing width.
        std::sort(tboffs.begin(), tboffs.end(),
                  [](const GroupMatchEntry& a, const GroupMatchEntry& b)
                  -> bool {
                      if (a.offs.first != b.offs.first)
                          return a.offs.first < b.offs.first;
                      return a.offs.second > b.offs.second;
                  }
            );

        // Give a boost to fragments which contain a group match
        // (phrase/near): they are dear to the user's heart. Both
        // lists are sorted, so we never go back in the fragment list
        // (we can always restart the search where we previously
        // stopped).
        if (m_fragments.empty()) {
            return;
        }
        auto fragit = m_fragments.begin();
        for (const auto& grpmatch : tboffs) {
            LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
                    "-" << grpmatch.offs.second << " curfrag " <<
                    fragit->start << "-" << fragit->stop << endl);
            while (fragit->stop < grpmatch.offs.first) {
                fragit++;
                if (fragit == m_fragments.end()) {
                    return;
                }
            }
            if (fragit->start <= grpmatch.offs.first &&
                fragit->stop >= grpmatch.offs.second) {
                // Group inside fragment: boost it
                fragit->coef += 10.0;
            }
        }

        return;
    }

private:
    // Past terms: needed to provide context from before a hit
    deque<pair<int,int>>  m_prevterms;
    // Data about the fragment we are building
    pair<int,int> m_curfrag{0,0};
    double m_curfragcoef{0.0};
    unsigned int m_remainingWords{0};
    unsigned int m_extcount{0};
#ifdef COMPUTE_HLZONES
    vector<pair<int,int>> m_curhlzones;
    bool m_prevwordhit{false};
#endif
    // Current sum of fragment weights
    double m_totalcoef{0.0};
    // Position of the first term match (for page number computations)
    unsigned int m_curhitpos{0};
    // "Best" term
    string m_curterm;
    double m_curtermcoef{0.0};

    // Group terms, extracted from m_hdata
    unordered_set<string> m_gterms;
    // Word positions for the group/near terms
    map<string, vector<int> > m_plists;
    map<int, pair<int, int> > m_gpostobytes;

    // Input
    unordered_set<string> m_terms;
    const HighlightData& m_hdata;
    unordered_map<string, double>& m_wordcoefs;
    unsigned int m_ctxwords;

    // Result: begin and end byte positions of query terms/groups in the text
    vector<MatchFragment> m_fragments;
};
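
#if 0
// Minimal usage sketch for TextSplitABS, mirroring what
// abstractFromText() does below. Kept out of the build; the values
// used here (uniform term weights, 4 context words) are illustrative
// assumptions only.
static void textSplitABSExample(const string& rawtext,
                                const vector<string>& matchTerms,
                                const HighlightData& hld)
{
    unordered_map<string, double> wordcoefs;
    for (const auto& term : matchTerms) {
        wordcoefs[term] = 1.0;
    }
    TextSplitABS splitter(matchTerms, hld, wordcoefs, 4,
                          TextSplit::TXTS_ONLYSPANS);
    splitter.text_to_words(rawtext);
    splitter.updgroups();
    for (const auto& frag : splitter.getFragments()) {
        LOGDEB("Fragment " << frag.start << "-" << frag.stop <<
               " coef " << frag.coef << "\n");
    }
}
#endif

// Build the snippet list for one result document: fetch the raw text,
// locate and weigh the match fragments, boost those containing group
// (phrase/near) matches, then map hit positions to page numbers.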
int Query::Native::abstractFromText(
    Rcl::Db::Native *ndb,
    Xapian::docid docid,
    const vector<string>& matchTerms,
    const multimap<double, vector<string>> byQ,
    double totalweight,
    int ctxwords,
    unsigned int maxtotaloccs,
    vector<Snippet>& vabs,
    Chrono&
    )
{
    Xapian::Database& xrdb(ndb->xrdb);

    string rawtext;
    if (!ndb->getRawText(docid, rawtext)) {
        LOGDEB0("abstractFromText: can't fetch text\n");
        return ABSRES_ERROR;
    }

#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2)  && \
    (defined(RAWTEXT_IN_DATA))
    // Try out the Xapian internal snippet method.
    string snippet = xmset.snippet(rawtext);
    LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
#endif

    // We need the query coefficients for the individual terms
    unordered_map<string, double> wordcoefs;
    for (const auto& mment : byQ) {
        for (const auto& word : mment.second) {
            wordcoefs[word] = mment.first;
        }
    }
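    // byQ is sorted by increasing weight, so a term listed under
    // several weights ends up mapped to the highest one.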

    // Note: getTerms() was already called by qualityTerms, so this is
    // a bit wasteful. The performance impact is probably negligible,
    // though (to be checked?). We need the highlight data for the
    // phrase/near groups.
    HighlightData hld;
    if (m_q->m_sd) {
        m_q->m_sd->getTerms(hld);
    }

    TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
                          TextSplit::TXTS_ONLYSPANS);
    splitter.text_to_words(rawtext);
    splitter.updgroups();

    // Sort the fragments by decreasing weight. getFragments() returns
    // a const reference, so work on a copy.
    const vector<MatchFragment>& res1 = splitter.getFragments();
    vector<MatchFragment> result(res1.begin(), res1.end());
    std::sort(result.begin(), result.end(),
              [](const MatchFragment& a,
                 const MatchFragment& b) -> bool {
                  return a.coef > b.coef;
              }
        );

    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);

    // Build the output snippets array by merging the fragments, their
    // main term and the page positions.
    unsigned int count = 0;
    for (const auto& entry : result) {
        string frag = neutchars(
            rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc);

#ifdef COMPUTE_HLZONES
        // This would need to be modified to take tag parameters
        // instead of the const strings
        static const string starthit("<span style='color: blue;'>");
        static const string endhit("</span>");
        size_t inslen = 0;
        for (const auto& hlzone: entry.hlzones) {
            frag.replace(hlzone.first - entry.start + inslen, 0, starthit);
            inslen += starthit.size();
            frag.replace(hlzone.second - entry.start + inslen, 0, endhit);
            inslen += endhit.size();
        }
#endif
        LOGDEB("=== FRAGMENT: Coef: " << entry.coef << ": " << frag << endl);
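        // If we have page break positions (e.g. for paginated formats
        // like PDF), convert the first hit position into a page number.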
        int page = 0;
        if (vpbreaks.size() > 1) {
            page = ndb->getPageNumberForPosition(vpbreaks, entry.hitpos);
            if (page < 0)
                page = 0;
        }
        vabs.push_back(Snippet(page, frag).setTerm(entry.term));
        if (count++ >= maxtotaloccs)
            break;
    }
    return ABSRES_OK;
}

}