--- a/src/rcldb/rclabstract.cpp
+++ b/src/rcldb/rclabstract.cpp
@@ -48,11 +48,11 @@
 {
     string a;
     for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
         a = a + *it + " ";
     }
-    LOGDEB("" << (what) << ": " << (a) << "\n" );
+    LOGDEB("" << what << ": " << a << "\n");
 }
 #else
 #define LOGABS LOGDEB2
 static void listList(const string&, const vector<string>&)
 {
@@ -65,27 +65,27 @@
 // result in general.
 static const bool prune_prefixed_terms = true;
 static void noPrefixList(const vector<string>& in, vector<string>& out)
 {
     for (vector<string>::const_iterator qit = in.begin();
          qit != in.end(); qit++) {
         if (prune_prefixed_terms) {
             if (has_prefix(*qit))
                 continue;
         }
         out.push_back(strip_prefix(*qit));
     }
     sort(out.begin(), out.end());
     vector<string>::iterator it = unique(out.begin(), out.end());
     out.resize(it - out.begin());
 }

 bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
 {
     if (!xenquire) {
-        LOGERR("Query::getMatchTerms: no query opened\n" );
+        LOGERR("Query::getMatchTerms: no query opened\n");
         return false;
     }

     terms.clear();
     Xapian::TermIterator it;
     Xapian::docid id = Xapian::docid(xdocid);
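Note: the sort()/unique()/resize() sequence closing noPrefixList() above is the standard C++ deduplication idiom: unique() only removes adjacent duplicates, hence the prior sort(), and it returns the new logical end which resize() then makes effective. A minimal self-contained sketch of the same idiom, for reference only (not part of the patch):

    #include <algorithm>
    #include <string>
    #include <vector>
    using namespace std;

    // Sort, squeeze out adjacent duplicates, then shrink to the new size.
    static void sortUnique(vector<string>& v)
    {
        sort(v.begin(), v.end());
        vector<string>::iterator it = unique(v.begin(), v.end());
        v.resize(it - v.begin());
    }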
@@ -93,12 +93,12 @@
     XAPTRY(iterms.insert(iterms.begin(),
                          xenquire->get_matching_terms_begin(id),
                          xenquire->get_matching_terms_end(id)),
            m_q->m_db->m_ndb->xrdb, m_q->m_reason);
     if (!m_q->m_reason.empty()) {
-        LOGERR("getMatchTerms: xapian error: " << (m_q->m_reason) << "\n" );
+        LOGERR("getMatchTerms: xapian error: " << m_q->m_reason << "\n");
         return false;
     }
     noPrefixList(iterms, terms);
     return true;
 }

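Note: XAPTRY is a Recoll convenience macro (defined elsewhere in the tree) which runs a Xapian call sequence inside a try/catch and leaves any error message in its last argument, which is why getMatchTerms() above can simply test m_q->m_reason afterwards. A simplified sketch of the idea, an assumption rather than the actual definition:

    // Hypothetical simplification: the real macro also uses the db
    // argument, e.g. to reopen and retry on Xapian::DatabaseModifiedError.
    #define XAPTRY(stmt, db, reason)                \
        try {                                       \
            stmt;                                   \
        } catch (const Xapian::Error& e) {          \
            (reason) = e.get_msg();                 \
        } catch (...) {                             \
            (reason) = "unknown exception";         \
        }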
@@ -107,29 +107,30 @@
 // while computing abstracts for the different result documents.
 void Query::Native::setDbWideQTermsFreqs()
 {
     // Do it once only for a given query.
     if (!termfreqs.empty())
         return;

     vector<string> qterms;
     {
         vector<string> iqterms;
         m_q->getQueryTerms(iqterms);
         noPrefixList(iqterms, qterms);
     }
     // listList("Query terms: ", qterms);
     Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;

     double doccnt = xrdb.get_doccount();
     if (doccnt == 0)
         doccnt = 1;

     for (vector<string>::const_iterator qit = qterms.begin();
          qit != qterms.end(); qit++) {
         termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
-        LOGABS("setDbWideQTermFreqs: [" << (qit) << "] db freq " << (termfreqs[*qit]) << "\n" );
+        LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " <<
+               termfreqs[*qit] << "\n");
     }
 }

 // Compute matched terms quality coefficients for a matched document by
 // retrieving the Within Document Frequencies and multiplying by
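Note: setDbWideQTermsFreqs() above caches, for each query term, get_termfreq(term) / get_doccount(), i.e. the fraction of indexed documents containing the term. With illustrative numbers: in a 10000-document index, a term present in 100 documents gets a stored frequency of 100 / 10000 = 0.01, while one present in only 10 documents gets 0.001; the smaller value marks the rarer, more informative term in the quality computation below.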
@@ -141,191 +142,192 @@
 // common stem, which seems wrong, we group the terms by
 // root, compute a frequency for the group from the sum of member
 // occurrences, and let the frequency for each group member be the
 // aggregated frequency.
 double Query::Native::qualityTerms(Xapian::docid docid,
                                    const vector<string>& terms,
                                    multimap<double, vector<string> >& byQ)
 {
-    LOGABS("qualityTerms\n" );
+    LOGABS("qualityTerms\n");
     setDbWideQTermsFreqs();

     map<string, double> termQcoefs;
     double totalweight = 0;

     Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
     double doclen = xrdb.get_doclength(docid);
     if (doclen == 0)
         doclen = 1;
     HighlightData hld;
     if (m_q->m_sd) {
         m_q->m_sd->getTerms(hld);
     }

 #ifdef DEBUGABSTRACT
     {
         string deb;
         hld.toString(deb);
-        LOGABS("qualityTerms: hld: " << (deb) << "\n" );
+        LOGABS("qualityTerms: hld: " << deb << "\n");
     }
 #endif

     // Group the input terms by the user term they were possibly expanded from
     map<string, vector<string> > byRoot;
     for (vector<string>::const_iterator qit = terms.begin();
          qit != terms.end(); qit++) {
         map<string, string>::const_iterator eit = hld.terms.find(*qit);
         if (eit != hld.terms.end()) {
             byRoot[eit->second].push_back(*qit);
         } else {
-            LOGDEB0("qualityTerms: [" << ((*qit)) << "] not found in hld\n" );
+            LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n");
             byRoot[*qit].push_back(*qit);
         }
     }

 #ifdef DEBUGABSTRACT
     {
         string byRootstr;
         for (map<string, vector<string> >::const_iterator debit =
                  byRoot.begin(); debit != byRoot.end(); debit++) {
             byRootstr.append("[").append(debit->first).append("]->");
             for (vector<string>::const_iterator it = debit->second.begin();
                  it != debit->second.end(); it++) {
                 byRootstr.append("[").append(*it).append("] ");
             }
             byRootstr.append("\n");
         }
-        LOGABS("\nqualityTerms: uterms to terms: " << (byRootstr) << "\n" );
+        LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n");
     }
 #endif

     // Compute in-document and global frequencies for the groups.
     map<string, double> grpwdfs;
     map<string, double> grptfreqs;
     for (map<string, vector<string> >::const_iterator git = byRoot.begin();
          git != byRoot.end(); git++) {
         for (vector<string>::const_iterator qit = git->second.begin();
              qit != git->second.end(); qit++) {
             Xapian::TermIterator term = xrdb.termlist_begin(docid);
             term.skip_to(*qit);
             if (term != xrdb.termlist_end(docid) && *term == *qit) {
                 if (grpwdfs.find(git->first) != grpwdfs.end()) {
                     grpwdfs[git->first] = term.get_wdf() / doclen;
                     grptfreqs[git->first] = termfreqs[*qit];
                 } else {
                     grpwdfs[git->first] += term.get_wdf() / doclen;
                     grptfreqs[git->first] += termfreqs[*qit];
                 }
             }
         }
     }

     // Build a sorted by quality container for the groups
     for (map<string, vector<string> >::const_iterator git = byRoot.begin();
          git != byRoot.end(); git++) {
         double q = (grpwdfs[git->first]) * grptfreqs[git->first];
         q = -log10(q);
         if (q < 3) {
             q = 0.05;
         } else if (q < 4) {
             q = 0.3;
         } else if (q < 5) {
             q = 0.7;
         } else if (q < 6) {
             q = 0.8;
         } else {
             q = 1;
         }
         totalweight += q;
         byQ.insert(pair<double, vector<string> >(q, git->second));
     }

 #ifdef DEBUGABSTRACT
     for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
          mit != byQ.rend(); mit++) {
-        LOGABS("qualityTerms: group\n" );
+        LOGABS("qualityTerms: group\n");
         for (vector<string>::const_iterator qit = mit->second.begin();
              qit != mit->second.end(); qit++) {
-            LOGABS("" << (mit->first) << "->[" << (qit) << "]\n" );
+            LOGABS("" << mit->first << "->[" << *qit << "]\n");
         }
     }
 #endif
     return totalweight;
 }

 // Return page number for first match of "significant" term.
 int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
 {
     LOGDEB("Query::Native::getFirstMatchPage\n");
     if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
-        LOGERR("Query::getFirstMatchPage: no db\n" );
+        LOGERR("Query::getFirstMatchPage: no db\n");
         return -1;
     }
     Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
     Xapian::Database& xrdb(ndb->xrdb);

     vector<string> terms;
     getMatchTerms(docid, terms);

     if (terms.empty()) {
-        LOGDEB("getFirstMatchPage: empty match term list (field match?)\n" );
+        LOGDEB("getFirstMatchPage: empty match term list (field match?)\n");
         return -1;
     }

     vector<int> pagepos;
     ndb->getPagePositions(docid, pagepos);
     if (pagepos.empty())
         return -1;

     setDbWideQTermsFreqs();

     // We try to use a page which matches the "best" term. Get a sorted list
     multimap<double, vector<string> > byQ;
     qualityTerms(docid, terms, byQ);

     for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
          mit != byQ.rend(); mit++) {
         for (vector<string>::const_iterator qit = mit->second.begin();
              qit != mit->second.end(); qit++) {
             string qterm = *qit;
             Xapian::PositionIterator pos;
             string emptys;
             try {
                 for (pos = xrdb.positionlist_begin(docid, qterm);
                      pos != xrdb.positionlist_end(docid, qterm); pos++) {
                     int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
                     if (pagenum > 0) {
                         term = qterm;
                         return pagenum;
                     }
                 }
             } catch (...) {
                 // Term does not occur. No problem.
             }
         }
     }
     return -1;
 }

 // Build a document abstract by extracting text chunks around the query terms
 // This uses the db termlists, not the original document.
 //
 // DatabaseModified and other general exceptions are catched and
 // possibly retried by our caller
 int Query::Native::makeAbstract(Xapian::docid docid,
                                 vector<Snippet>& vabs,
                                 int imaxoccs, int ictxwords)
 {
     Chrono chron;
-    LOGABS("makeAbstract: docid " << (long(docid)) << " imaxoccs " << (imaxoccs) << " ictxwords " << (ictxwords) << "\n" );
+    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
+           imaxoccs << " ictxwords " << ictxwords << "\n");

     // The (unprefixed) terms matched by this document
     vector<string> matchedTerms;
     getMatchTerms(docid, matchedTerms);
     if (matchedTerms.empty()) {
-        LOGDEB("makeAbstract::Empty term list\n" );
+        LOGDEB("makeAbstract:"<<chron.millis()<<"mS:Empty term list\n");
         return ABSRES_ERROR;
     }

     listList("Match terms: ", matchedTerms);

     // Retrieve the term frequencies for the query terms. This is
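Note: in qualityTerms() above, a group's raw quality is its within-document frequency times its collection frequency, (sum of wdf / doclen) * termfreq, squashed through -log10() and bucketed into a coarse coefficient. Worked example with made-up numbers: a term occurring 5 times in a 1000-term document (wdf/doclen = 0.005) and present in 1% of the collection (termfreq = 0.01) gives -log10(0.005 * 0.01) = -log10(5e-05) ≈ 4.3, which lands in the [4, 5) bucket and gets coefficient 0.7. Rarer terms yield larger -log10() values, hence coefficients closer to 1, and are favoured when snippet slots are distributed.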
@@ -337,15 +339,15 @@
     // going to try and show text around the less common search terms.
     // Terms issued from an original one by stem expansion are
     // aggregated by the qualityTerms() routine.
     multimap<double, vector<string> > byQ;
     double totalweight = qualityTerms(docid, matchedTerms, byQ);
-    LOGABS("makeAbstract:" << (chron.ms()) << ": computed Qcoefs.\n" );
+    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
     // This can't happen, but would crash us
     if (totalweight == 0.0) {
-        LOGERR("makeAbstract: totalweight == 0.0 !\n" );
+        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
         return ABSRES_ERROR;
     }

     Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
     Xapian::Database& xrdb(ndb->xrdb);

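Note: at the top of the next hunk, when the caller passes no explicit imaxoccs, the occurrence budget is derived from the configured abstract size as getAbsLen() / (7 * (getAbsCtxLen() + 1)): the abstract length in characters divided by an assumed 7-character average word times the size of one excerpt (the matched term plus its context words). With illustrative values getAbsLen() = 250 and getAbsCtxLen() = 4, that allows 250 / (7 * 5) = 7 term occurrences in total.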
@@ -372,243 +374,251 @@
     // average word size. It was a mistake to have the user max
     // abstract size parameter in characters, we basically only deal
     // with words. We used to limit the character size at the end, but
     // this damaged our careful selection of terms
     const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
         m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
     int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
-    LOGABS("makeAbstract:" << (chron.ms()) << ": mxttloccs " << (maxtotaloccs) << " ctxwords " << (ctxwords) << "\n" );
+    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
+           maxtotaloccs << " ctxwords " << ctxwords << "\n");

     int ret = ABSRES_OK;

     // Let's go populate
     for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
          mit != byQ.rend(); mit++) {
         unsigned int maxgrpoccs;
         double q;
         if (byQ.size() == 1) {
             maxgrpoccs = maxtotaloccs;
             q = 1.0;
         } else {
             // We give more slots to the better term groups
             q = mit->first / totalweight;
             maxgrpoccs = int(ceil(maxtotaloccs * q));
         }
         unsigned int grpoccs = 0;

         for (vector<string>::const_iterator qit = mit->second.begin();
              qit != mit->second.end(); qit++) {

             // Group done ?
             if (grpoccs >= maxgrpoccs)
                 break;

             string qterm = *qit;

-            LOGABS("makeAbstract: [" << (qterm) << "] " << (maxgrpoccs) << " max grp occs (coef " << (q) << ")\n" );
+            LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
+                   " max grp occs (coef " << q << ")\n");

             // The match term may span several words
             int qtrmwrdcnt =
                 TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);

             Xapian::PositionIterator pos;
             // There may be query terms not in this doc. This raises an
             // exception when requesting the position list, we catch it ??
             // Not clear how this can happen because we are walking the
             // match list returned by Xapian. Maybe something with the
             // fields?
             string emptys;
             try {
                 for (pos = xrdb.positionlist_begin(docid, qterm);
                      pos != xrdb.positionlist_end(docid, qterm); pos++) {
                     int ipos = *pos;
                     if (ipos < int(baseTextPosition)) // Not in text body
                         continue;
-                    LOGABS("makeAbstract: [" << (qterm) << "] at pos " << (ipos) << " grpoccs " << (grpoccs) << " maxgrpoccs " << (maxgrpoccs) << "\n" );
+                    LOGABS("makeAbstract: [" << qterm << "] at pos " <<
+                           ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
+                           maxgrpoccs << "\n");

                     totaloccs++;
                     grpoccs++;

                     // Add adjacent slots to the set to populate at next
                     // step by inserting empty strings. Special provisions
                     // for adding ellipsis and for positions overlapped by
                     // the match term.
                     unsigned int sta = MAX(int(baseTextPosition),
                                            ipos - ctxwords);
                     unsigned int sto = ipos + qtrmwrdcnt-1 +
                         m_q->m_db->getAbsCtxLen();
                     for (unsigned int ii = sta; ii <= sto; ii++) {
                         if (ii == (unsigned int)ipos) {
                             sparseDoc[ii] = qterm;
                             searchTermPositions.insert(ii);
                             if (ii > maxpos)
                                 maxpos = ii;
                         } else if (ii > (unsigned int)ipos &&
                                    ii < (unsigned int)ipos + qtrmwrdcnt) {
                             sparseDoc[ii] = occupiedmarker;
                         } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
                             // For an empty slot, the test has a side
                             // effect of inserting an empty string which
                             // is what we want.
                             sparseDoc[ii] = emptys;
                         }
                     }
                     // Add ellipsis at the end. This may be replaced later by
                     // an overlapping extract. Take care not to replace an
                     // empty string here, we really want an empty slot,
                     // use find()
                     if (sparseDoc.find(sto+1) == sparseDoc.end()) {
                         sparseDoc[sto+1] = cstr_ellipsis;
                     }

                     // Group done ?
                     if (grpoccs >= maxgrpoccs) {
                         ret |= ABSRES_TRUNC;
-                        LOGABS("Db::makeAbstract: max group occs cutoff\n" );
+                        LOGABS("Db::makeAbstract: max group occs cutoff\n");
                         break;
                     }
                     // Global done ?
                     if (totaloccs >= maxtotaloccs) {
                         ret |= ABSRES_TRUNC;
-                        LOGABS("Db::makeAbstract: max occurrences cutoff\n" );
+                        LOGABS("Db::makeAbstract: max occurrences cutoff\n");
                         break;
                     }
                 }
             } catch (...) {
                 // Term does not occur. No problem.
             }

             if (totaloccs >= maxtotaloccs) {
                 ret |= ABSRES_TRUNC;
-                LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );
+                LOGABS("Db::makeAbstract: max1 occurrences cutoff\n");
                 break;
             }
         }
     }
     maxpos += ctxwords + 1;

-    LOGABS("makeAbstract:" << (chron.millis()) << ":chosen number of positions " << (totaloccs) << "\n" );
+    LOGABS("makeAbstract:" << chron.millis() <<
+           "mS:chosen number of positions " << totaloccs << "\n");
     // This can happen if there are term occurences in the keywords
     // etc. but not elsewhere ?
     if (totaloccs == 0) {
-        LOGDEB("makeAbstract: no occurrences\n" );
+        LOGDEB("makeAbstract: no occurrences\n");
         return ABSRES_OK;
     }

     // Walk all document's terms position lists and populate slots
     // around the query terms. We arbitrarily truncate the list to
     // avoid taking forever. If we do cutoff, the abstract may be
     // inconsistant (missing words, potentially altering meaning),
     // which is bad.
     {
         Xapian::TermIterator term;
         int cutoff = m_q->m_snipMaxPosWalk;
         for (term = xrdb.termlist_begin(docid);
              term != xrdb.termlist_end(docid); term++) {
             // Ignore prefixed terms
             if (has_prefix(*term))
                 continue;
             if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                 ret |= ABSRES_TERMMISS;
-                LOGDEB0("makeAbstract: max term count cutoff " << (m_q->m_snipMaxPosWalk) << "\n" );
+                LOGDEB0("makeAbstract: max term count cutoff " <<
+                        m_q->m_snipMaxPosWalk << "\n");
                 break;
             }

             map<unsigned int, string>::iterator vit;
             Xapian::PositionIterator pos;
             for (pos = xrdb.positionlist_begin(docid, *term);
                  pos != xrdb.positionlist_end(docid, *term); pos++) {
                 if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
                     ret |= ABSRES_TERMMISS;
-                    LOGDEB0("makeAbstract: max term count cutoff " << (m_q->m_snipMaxPosWalk) << "\n" );
+                    LOGDEB0("makeAbstract: max term count cutoff " <<
+                            m_q->m_snipMaxPosWalk << "\n");
                     break;
                 }
                 // If we are beyond the max possible position, stop
                 // for this term
                 if (*pos > maxpos) {
                     break;
                 }
                 if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
                     // Don't replace a term: the terms list is in
                     // alphabetic order, and we may have several terms
                     // at the same position, we want to keep only the
                     // first one (ie: dockes and dockes@wanadoo.fr)
                     if (vit->second.empty()) {
-                        LOGDEB2("makeAbstract: populating: [" << ((*term)) << "] at " << (*pos) << "\n" );
+                        LOGDEB2("makeAbstract: populating: [" << *term <<
+                                "] at " << *pos << "\n");
                         sparseDoc[*pos] = *term;
                     }
                 }
             }
         }
     }
+    LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");

 #if 0
     // Debug only: output the full term[position] vector
     bool epty = false;
     int ipos = 0;
     for (map<unsigned int, string>::iterator it = sparseDoc.begin();
          it != sparseDoc.end();
          it++, ipos++) {
         if (it->empty()) {
             if (!epty)
-                LOGDEB("makeAbstract:vec[" << (ipos) << "]: [" << (it) << "]\n" );
+                LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
             epty=true;
         } else {
             epty = false;
-            LOGDEB("makeAbstract:vec[" << (ipos) << "]: [" << (it) << "]\n" );
+            LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
         }
     }
 #endif

     vector<int> vpbreaks;
     ndb->getPagePositions(docid, vpbreaks);

-    LOGABS("makeAbstract:" << (chron.millis()) << ": extracting. Got " << (vpbreaks.size()) << " pages\n" );
+    LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
+           vpbreaks.size() << " pages\n");
     // Finally build the abstract by walking the map (in order of position)
     vabs.clear();
     string chunk;
     bool incjk = false;
     int page = 0;
     string term;
     for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
          it != sparseDoc.end(); it++) {
-        LOGDEB2("Abtract:output " << (it->first) << " -> [" << (it->second) << "]\n" );
+        LOGDEB2("Abtract:output " << it->first << " -> [" << it->second <<
+                "]\n");
         if (!occupiedmarker.compare(it->second)) {
-            LOGDEB("Abstract: qtrm position not filled ??\n" );
+            LOGDEB("Abstract: qtrm position not filled ??\n");
             continue;
         }
         if (chunk.empty() && !vpbreaks.empty()) {
             page = ndb->getPageNumberForPosition(vpbreaks, it->first);
             if (page < 0)
                 page = 0;
             term.clear();
         }
         Utf8Iter uit(it->second);
         bool newcjk = false;
         if (TextSplit::isCJK(*uit))
             newcjk = true;
         if (!incjk || (incjk && !newcjk))
             chunk += " ";
         incjk = newcjk;
         if (searchTermPositions.find(it->first) != searchTermPositions.end())
             term = it->second;
         if (it->second == cstr_ellipsis) {
             vabs.push_back(Snippet(page, chunk).setTerm(term));
             chunk.clear();
         } else {
             if (it->second.compare(end_of_field_term) &&
                 it->second.compare(start_of_field_term))
                 chunk += it->second;
         }
     }
     if (!chunk.empty())
         vabs.push_back(Snippet(page, chunk).setTerm(term));

-    LOGDEB2("makeAbtract: done in " << (chron.millis()) << " mS\n" );
+    LOGABS("makeAbtract: done in " << chron.millis() << " mS\n");
     return ret;
 }


 }
-
-
-