/* Copyright (C) 2004-2017 J.F.Dockes
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the
 * Free Software Foundation, Inc.,
 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#include "autoconfig.h"

#include <math.h>

#include <map>
#include <unordered_map>
#include <unordered_set>
#include <deque>
#include <algorithm>

#include "log.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "textsplit.h"
#include "hldata.h"
#include "chrono.h"
#include "unacpp.h"
#include "zlibut.h"

using namespace std;

// We now let plaintorich do the highlight tags insertions, which is
// wasteful because we have most of the information (but the perf hit
// is small because it's only called on the output fragments, not on
// the whole text). The highlight zone computation code has been left
// around just in case I change my mind.
#undef COMPUTE_HLZONES

namespace Rcl {

// Chars we turn to spaces in the Snippets
static const string cstr_nc("\n\r\x0c\\");

// Fragment descriptor. A fragment is a text area with one or several
// matched terms and some context. It is ranked according to the
// matched term weights, and the near/phrase matches get a boost.
struct MatchFragment {
    // Start/End byte offsets of the fragment in the document text
    int start;
    int stop;
    // Weight for this fragment (bigger is better)
    double coef;
#ifdef COMPUTE_HLZONES
    // Highlight areas (each is one or several contiguous match
    // terms). Because a fragment extends around a match, there
    // can be several contiguous or separate matches in a given
    // fragment.
    vector<pair<int,int>> hlzones;
#endif
    // Position of the first matched term (for page number computations)
    unsigned int hitpos;
    // "Best" term for this match (e.g. for use as an external app search term)
    string term;

    MatchFragment(int sta, int sto, double c,
#ifdef COMPUTE_HLZONES
                  vector<pair<int,int>>& hl,
#endif
                  unsigned int pos, string& trm)
        : start(sta), stop(sto), coef(c), hitpos(pos) {
#ifdef COMPUTE_HLZONES
        hlzones.swap(hl);
#endif
        term.swap(trm);
    }
};
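
// Scoring sketch (hypothetical coefficients): with per-term weights
// {"open": 1.0, "source": 0.5}, a fragment holding one hit on each
// term accumulates coef = 1.5, and updgroups() below adds 10.0 more
// if the terms also match as a phrase/near group.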


// Text splitter for finding the match areas in the document text.
class TextSplitABS : public TextSplit {
public:

    TextSplitABS(const vector<string>& matchTerms,
                 const HighlightData& hdata,
                 unordered_map<string, double>& wordcoefs,
                 unsigned int ctxwords,
                 Flags flags = TXTS_NONE)
        : TextSplit(flags), m_terms(matchTerms.begin(), matchTerms.end()),
          m_hdata(hdata), m_wordcoefs(wordcoefs), m_ctxwords(ctxwords) {

        // Take note of the group (phrase/near) terms because we need
        // to compute the position lists for them.
        for (const auto& group : hdata.groups) {
            if (group.size() > 1) {
                for (const auto& term : group) {
                    m_gterms.insert(term);
                }
            }
        }
    }

    // Accept a word and its position. If the word is a matched term,
    // add to or update the fragment definition.
    virtual bool takeword(const std::string& term, int pos, int bts, int bte) {
        LOGDEB2("takeword: " << term << endl);

        // Remember the recent past: we need it as context before a hit.
        m_prevterms.push_back(pair<int,int>(bts, bte));
        if (m_prevterms.size() > m_ctxwords + 1) {
            m_prevterms.pop_front();
        }
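
        // Normalize the word the same way the index terms were
        // produced: for a stripped index, remove diacritics and fold
        // case; otherwise use the raw term.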
        string dumb;
        if (o_index_stripchars) {
            if (!unacmaybefold(term, dumb, "UTF-8", UNACOP_UNACFOLD)) {
                LOGINFO("abstract: unac failed for [" << term << "]\n");
                return true;
            }
        } else {
            dumb = term;
        }

        if (m_terms.find(dumb) != m_terms.end()) {
            // This word is a search term. Extend or create a fragment
            LOGDEB2("match: [" << dumb << "] current: " << m_curfrag.first <<
                    ", " << m_curfrag.second << " remain " <<
                    m_remainingWords << endl);
            double coef = m_wordcoefs[dumb];
            if (!m_remainingWords) {
                // No current fragment. Start one
                m_curhitpos = baseTextPosition + pos;
                m_curfrag.first = m_prevterms.front().first;
                m_curfrag.second = m_prevterms.back().second;
#ifdef COMPUTE_HLZONES
                m_curhlzones.push_back(pair<int,int>(bts, bte));
#endif
                m_curterm = term;
                m_curtermcoef = coef;
            } else {
                LOGDEB2("Extending current fragment: " << m_remainingWords <<
                        " -> " << m_ctxwords << endl);
                m_extcount++;
#ifdef COMPUTE_HLZONES
                if (m_prevwordhit) {
                    m_curhlzones.back().second = bte;
                } else {
                    m_curhlzones.push_back(pair<int,int>(bts, bte));
                }
#endif
                if (coef > m_curtermcoef) {
                    m_curterm = term;
                    m_curtermcoef = coef;
                }
            }

#ifdef COMPUTE_HLZONES
            m_prevwordhit = true;
#endif
            m_curfragcoef += coef;
            m_remainingWords = m_ctxwords + 1;
            if (m_extcount > 3) {
                // Limit the expansion of contiguous fragments: this
                // avoids common search terms creating long, heavyweight,
                // meaningless fragments, and it also bounds the length.
                m_remainingWords = 1;
                m_extcount = 0;
            }

            // If the term is part of a near/phrase group, update its
            // positions list
            if (m_gterms.find(dumb) != m_gterms.end()) {
                // Term group (phrase/near) handling
                m_plists[dumb].push_back(pos);
                m_gpostobytes[pos] = pair<int,int>(bts, bte);
                LOGDEB2("Recorded bpos for " << pos << ": " << bts << " " <<
                        bte << "\n");
            }
        }
#ifdef COMPUTE_HLZONES
        else {
            // Not a matched term
            m_prevwordhit = false;
        }
#endif

        if (m_remainingWords) {
            // A fragment is currently open. Time to close it?
            m_remainingWords--;
            m_curfrag.second = bte;
            if (m_remainingWords == 0) {
                if (m_totalcoef < 5.0 || m_curfragcoef >= 1.0) {
                    // Don't push bad fragments if we have a lot already
                    m_fragments.push_back(MatchFragment(m_curfrag.first,
                                                        m_curfrag.second,
                                                        m_curfragcoef,
#ifdef COMPUTE_HLZONES
                                                        m_curhlzones,
#endif
                                                        m_curhitpos,
                                                        m_curterm
                                              ));
                }
                m_totalcoef += m_curfragcoef;
                m_curfragcoef = 0.0;
                m_curtermcoef = 0.0;
            }
        }
        return true;
    }
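
    // Walkthrough (hypothetical input): with m_ctxwords == 2, a single
    // hit on "term" in "aa bb term cc dd ee" opens a fragment at "aa"
    // (the left context kept in m_prevterms) and closes it after "dd",
    // as m_remainingWords counts down from m_ctxwords + 1, yielding
    // the fragment "aa bb term cc dd".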

    const vector<MatchFragment>& getFragments() {
        return m_fragments;
    }


    // After the text is split: use the group terms' position lists to
    // find the group matches. We process everything as NEAR (no
    // PHRASE-specific processing).
    void updgroups() {
        vector<GroupMatchEntry> tboffs;
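
        // (Each GroupMatchEntry appended to tboffs by matchGroup()
        // records in offs the start/end byte offsets of one group
        // match in the text.)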

        // Look for matches to PHRASE and NEAR term groups and finalize
        // the matched regions list (sort it by increasing start, then
        // decreasing length). We process all groups as NEAR (ignore order).
        for (unsigned int i = 0; i < m_hdata.groups.size(); i++) {
            if (m_hdata.groups[i].size() > 1) {
                matchGroup(m_hdata, i, m_plists, m_gpostobytes, tboffs);
            }
        }

        // Sort the fragments by increasing start and decreasing width
        std::sort(m_fragments.begin(), m_fragments.end(),
                  [](const MatchFragment& a, const MatchFragment& b) -> bool {
                      if (a.start != b.start)
                          return a.start < b.start;
                      return a.stop - a.start > b.stop - b.start;
                  }
            );

        // Sort the group regions by increasing start and decreasing width.
        std::sort(tboffs.begin(), tboffs.end(),
                  [](const GroupMatchEntry& a, const GroupMatchEntry& b)
                  -> bool {
                      if (a.offs.first != b.offs.first)
                          return a.offs.first < b.offs.first;
                      return a.offs.second > b.offs.second;
                  }
            );

        // Give a boost to fragments which contain a group match
        // (phrase/near): they are dear to the user's heart. The lists
        // are sorted, so we never go back in the fragment list (we can
        // always start the search where we previously stopped).
        auto fragit = m_fragments.begin();
        if (fragit == m_fragments.end()) {
            // No fragments: nothing to boost (also guards the
            // dereferences below).
            return;
        }
        for (const auto& grpmatch : tboffs) {
            LOGDEB2("LOOKING FOR FRAGMENT: group: " << grpmatch.offs.first <<
                    "-" << grpmatch.offs.second << " curfrag " <<
                    fragit->start << "-" << fragit->stop << endl);
            while (fragit->stop < grpmatch.offs.first) {
                fragit++;
                if (fragit == m_fragments.end()) {
                    return;
                }
            }
            if (fragit->start <= grpmatch.offs.first &&
                fragit->stop >= grpmatch.offs.second) {
                // The group match is inside the fragment: boost it.
                fragit->coef += 10.0;
            }
        }

        return;
    }

private:
    // Past terms because we need to go back for context before a hit
    deque<pair<int,int>> m_prevterms;
    // Data about the fragment we are building
    pair<int,int> m_curfrag{0,0};
    double m_curfragcoef{0.0};
    unsigned int m_remainingWords{0};
    unsigned int m_extcount{0};
#ifdef COMPUTE_HLZONES
    vector<pair<int,int>> m_curhlzones;
    bool m_prevwordhit{false};
#endif
    // Current sum of fragment weights
    double m_totalcoef{0.0};
    // Position of 1st term match (for page number computations)
    unsigned int m_curhitpos{0};
    // "best" term
    string m_curterm;
    double m_curtermcoef{0.0};

    // Group terms, extracted from m_hdata
    unordered_set<string> m_gterms;
    // group/near terms word positions.
    map<string, vector<int> > m_plists;
    map<int, pair<int, int> > m_gpostobytes;

    // Input
    unordered_set<string> m_terms;
    const HighlightData& m_hdata;
    unordered_map<string, double>& m_wordcoefs;
    unsigned int m_ctxwords;

    // Result: begin and end byte positions of query terms/groups in text
    vector<MatchFragment> m_fragments;
};
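
// Minimal usage sketch for TextSplitABS (hypothetical inputs; this
// mirrors what abstractFromText() does just below):
//
//     unordered_map<string, double> coefs{{"open", 1.0}, {"source", 0.5}};
//     TextSplitABS spl(vector<string>{"open", "source"}, hld, coefs,
//                      6, TextSplit::TXTS_ONLYSPANS);
//     spl.text_to_words(rawtext);
//     spl.updgroups();
//     for (const auto& frag : spl.getFragments()) { /* rank/display */ }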

int Query::Native::abstractFromText(
    Rcl::Db::Native *ndb,
    Xapian::docid docid,
    const vector<string>& matchTerms,
    const multimap<double, vector<string>> byQ,
    double totalweight,
    int ctxwords,
    unsigned int maxtotaloccs,
    vector<Snippet>& vabs,
    Chrono&
    )
{
    Xapian::Database& xrdb(ndb->xrdb);

    string rawtext;
    if (!ndb->getRawText(docid, rawtext)) {
        LOGDEB0("abstractFromText: can't fetch text\n");
        return ABSRES_ERROR;
    }

#if 0 && ! (XAPIAN_MAJOR_VERSION <= 1 && XAPIAN_MINOR_VERSION <= 2) && \
    (defined(RAWTEXT_IN_DATA))
    // Try out the Xapian internal method.
    string snippet = xmset.snippet(rawtext);
    LOGDEB("SNIPPET: [" << snippet << "] END SNIPPET\n");
#endif

    // We need the query coefficients for the individual terms
    unordered_map<string, double> wordcoefs;
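    // (byQ maps a quality coefficient to the terms which share it;
    // flatten it into a per-term map, e.g. a hypothetical
    // {0.5 -> {"source"}, 1.0 -> {"open"}} becomes
    // {"source" -> 0.5, "open" -> 1.0}.)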
    for (const auto& mment : byQ) {
        for (const auto& word : mment.second) {
            wordcoefs[word] = mment.first;
        }
    }

    // Note: getTerms() was already called by qualityTerms, so this is
    // a bit wasteful. I guess that the performance impact is
    // negligible, though (to be checked?). We need the highlightdata
    // for the phrase/near groups.
    HighlightData hld;
    if (m_q->m_sd) {
        m_q->m_sd->getTerms(hld);
    }

    TextSplitABS splitter(matchTerms, hld, wordcoefs, ctxwords,
                          TextSplit::TXTS_ONLYSPANS);
    splitter.text_to_words(rawtext);
    splitter.updgroups();

    // Sort the fragments by decreasing weight
    const vector<MatchFragment>& res1 = splitter.getFragments();
    vector<MatchFragment> result(res1.begin(), res1.end());
    std::sort(result.begin(), result.end(),
              [](const MatchFragment& a,
                 const MatchFragment& b) -> bool {
                  return a.coef > b.coef;
              }
        );


    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);
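    // (vpbreaks presumably holds the term position of each page
    // break, recorded at index time; getPageNumberForPosition() below
    // maps a hit position to its page.)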

    // Build the output snippets array by merging the fragments, their
    // main term and the page positions.
    unsigned int count = 0;
    for (const auto& entry : result) {
        string frag = neutchars(
            rawtext.substr(entry.start, entry.stop - entry.start), cstr_nc);

#ifdef COMPUTE_HLZONES
        // This would need to be modified to take tag parameters
        // instead of the const strings
        static const string starthit("<span style='color: blue;'>");
        static const string endhit("</span>");
        size_t inslen = 0;
        for (const auto& hlzone : entry.hlzones) {
            frag.replace(hlzone.first - entry.start + inslen, 0, starthit);
            inslen += starthit.size();
            frag.replace(hlzone.second - entry.start + inslen, 0, endhit);
            inslen += endhit.size();
        }
#endif
        LOGDEB("=== FRAGMENT: Coef: " << entry.coef << ": " << frag << endl);
        int page = 0;
        if (vpbreaks.size() > 1) {
            page = ndb->getPageNumberForPosition(vpbreaks, entry.hitpos);
            if (page < 0)
                page = 0;
        }
        vabs.push_back(Snippet(page, frag).setTerm(entry.term));
        if (count++ >= maxtotaloccs)
            break;
    }
    return ABSRES_OK;
}

}