--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,7 +1,7 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.118 2007-06-22 06:14:04 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.119 2007-06-25 10:25:39 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -54,10 +54,14 @@
 #endif
 #ifndef MIN
 #define MIN(A,B) (A<B?A:B)
 #endif
 
+// This is the word position offset at which we index the body text
+// (abstract, keywords, etc.. are stored before this)
+static const unsigned int baseTextPosition = 100000;
+
 #undef MTIME_IN_VALUE
 #ifdef MTIME_IN_VALUE
 // Omega compatible values
 #define enum value_slot {
     VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
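Note on the hunk above: body-text words are now indexed at positions at or above baseTextPosition, while field terms (abstract, keywords, etc.) keep the low positions, so a single position comparison can tell a body match from a metadata match. A minimal illustrative sketch of the scheme, using plain Xapian calls and a hypothetical helper name (this is not the actual recoll splitter code):

#include <xapian.h>
#include <string>
#include <vector>

static const unsigned int baseTextPosition = 100000; // value from the patch

// Index metadata terms below the base position and body terms above it,
// so a later position test can tell the two apart.
void indexWithOffset(Xapian::Document& xdoc,
                     const std::vector<std::string>& fieldTerms,
                     const std::vector<std::string>& bodyTerms)
{
    unsigned int pos = 1;
    for (std::vector<std::string>::const_iterator it = fieldTerms.begin();
         it != fieldTerms.end(); it++)
        xdoc.add_posting(*it, pos++);   // stays below baseTextPosition
    pos = baseTextPosition;
    for (std::vector<std::string>::const_iterator it = bodyTerms.begin();
         it != bodyTerms.end(); it++)
        xdoc.add_posting(*it, pos++);   // body words start at the offset
}

This is what makeAbstract() relies on further down when it skips positions below baseTextPosition.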
@@ -101,12 +105,12 @@
     Xapian::Query query;   // query descriptor: terms and subqueries
                            // joined by operators (or/and etc...)
     Xapian::Enquire *enquire; // Open query descriptor.
     Xapian::MSet     mset;    // Partial result set
 
-    // Term frequencies for current query. See makeAbstract, not used yet.
-    map<string, int> m_termfreqs;
+    // Term frequencies for current query. See makeAbstract, setQuery
+    map<string, double> m_termfreqs;
 
     Native(Db *db)
         : m_db(db),
           m_isopen(false), m_iswritable(false), enquire(0)
     { }
@@ -230,107 +234,129 @@
         }
     }
     return out;
 }
 
+//#define DEBUGABSTRACT
+#ifdef DEBUGABSTRACT
+#define LOGABS LOGDEB
+#else
+#define LOGABS LOGDEB2
+#endif
+
 // Build a document abstract by extracting text chunks around the query terms
 // This uses the db termlists, not the original document.
 string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
 {
     Chrono chron;
-    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
+    LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
             m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
 
     list<string> terms = noPrefixList(iterms);
     if (terms.empty()) {
        return "";
     }
 
-    // We may want to use the db-wide freqs to tune the abstracts one
-    // day but we currently don't
-#if 0
+    // Retrieve db-wide frequencies for the query terms
     if (m_termfreqs.empty()) {
+       double doccnt = db.get_doccount();
+       if (doccnt == 0) doccnt = 1;
        for (list<string>::const_iterator qit = terms.begin();
             qit != terms.end(); qit++) {
-           m_termfreqs[*qit] = db.get_termfreq(*qit);
-           LOGDEB(("makeAbstract: [%s] db freq %d\n", qit->c_str(),
+           m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
+           LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
                    m_termfreqs[*qit]));
        }
-       LOGDEB(("makeAbstract:%d: got termfreqs\n", chron.ms()));
+       LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
     }
-#endif
 
-    // Retrieve the term Within Document Frequencies. We are going to try
+    // Compute a term quality coefficient by retrieving the term
+    // Within Document Frequencies and multiplying by overal term
+    // frequency, then using log-based thresholds. We are going to try
     // and show text around the less common search terms.
-    map<string, int> termwdfs;
-    int totalqtermoccs = 0;
+    map<string, double> termQcoefs;
+    double totalweight = 0;
+    double doclen = db.get_doclength(docid);
+    if (doclen == 0) doclen = 1;
     for (list<string>::const_iterator qit = terms.begin();
         qit != terms.end(); qit++) {
        Xapian::TermIterator term = db.termlist_begin(docid);
        term.skip_to(*qit);
        if (term != db.termlist_end(docid) && *term == *qit) {
-           int f = term.get_wdf();
-           termwdfs[*qit] = f;
-           totalqtermoccs += f;
-           LOGDEB2(("makeAbstract: [%s] wdf %d\n", qit->c_str(),
-                    termwdfs[*qit]));
+           double q = (term.get_wdf() / doclen) * m_termfreqs[*qit];
+           q = -log10(q);
+           if (q < 3) {
+               q = 0.05;
+           } else if (q < 4) {
+               q = 0.3;
+           } else if (q < 5) {
+               q = 0.7;
+           } else if (q < 6) {
+               q = 0.8;
+           } else {
+               q = 1;
+           }
+           termQcoefs[*qit] = q;
+           totalweight += q;
        }
     }
-    LOGDEB2(("makeAbstract:%d: got wdfs totalqtermoccs %d\n",
-            chron.ms(), totalqtermoccs));
-    if (totalqtermoccs == 0) {
-       LOGERR(("makeAbstract: no term occurrences !\n"));
-       return "";
-    }
+    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
 
-    // Build a sorted by frequency term list: it seems reasonable to
-    // prefer sampling around the less frequent terms:
-    multimap<int, string> bywdf;
+    // Build a sorted by quality term list.
+    multimap<double, string> byQ;
    for (list<string>::const_iterator qit = terms.begin();
         qit != terms.end(); qit++) {
-       if (termwdfs.find(*qit) != termwdfs.end())
-           bywdf.insert(pair<int,string>(termwdfs[*qit], *qit));
+       if (termQcoefs.find(*qit) != termQcoefs.end())
+           byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
    }
 
-    // For each of the query terms, query xapian for its positions
-    // list in the document. For each position entry, remember it in qtermposs
-    // and insert it and its neighbours in the set of 'interesting' positions
+#ifdef DEBUGABSTRACT
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
+        qit != byQ.rend(); qit++) {
+       LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
+    }
+#endif
+
+
+    // For each of the query terms, ask xapian for its positions list
+    // in the document. For each position entry, remember it in
+    // qtermposs and insert it and its neighbours in the set of
+    // 'interesting' positions
 
     // The terms 'array' that we partially populate with the document
     // terms, at their positions around the search terms positions:
     map<unsigned int, string> sparseDoc;
 
-    // All the query term positions. We remember this mainly because we are
-    // going to random-shuffle it for selecting the chunks that we actually
-    // print.
+    // All the chosen query term positions.
     vector<unsigned int> qtermposs;
 
-    // Limit the total number of slots we populate.
+    // Limit the total number of slots we populate. The 7 is taken as
+    // average word size. It was a mistake to have the user max
+    // abstract size parameter in characters, we basically only deal
+    // with words. We used to limit the character size at the end, but
+    // this damaged our careful selection of terms
     const unsigned int maxtotaloccs =
-       MAX(50, m_db->m_synthAbsLen /(4 * (m_db->m_synthAbsWordCtxLen+1)));
-    LOGDEB2(("makeAbstract:%d: ttlqtrms %d mxttloccs %d\n",
-            chron.ms(), totalqtermoccs, maxtotaloccs));
-#if 0
-    for (multimap<int, string>::iterator qit = bywdf.begin();
-        qit != bywdf.end(); qit++) {
-       LOGDEB(("%d->[%s]\n", qit->first, qit->second.c_str()));
-    }
-#endif
-
-    // Find the text positions which we will have to fill with terms
-    unsigned int totaloccs = 0;
-    for (multimap<int, string>::iterator qit = bywdf.begin();
-        qit != bywdf.end(); qit++) {
+       m_db->m_synthAbsLen /(7 * (m_db->m_synthAbsWordCtxLen+1));
+    LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
+    // This can't happen, but would crash us
+    if (totalweight == 0.0) {
+       LOGERR(("makeAbstract: 0 totalweight!\n"));
+       return "";
+    }
+
+    // Let's go populate
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
+        qit != byQ.rend(); qit++) {
        string qterm = qit->second;
        unsigned int maxoccs;
-       if (bywdf.size() == 1) {
+       if (byQ.size() == 1) {
           maxoccs = maxtotaloccs;
        } else {
-           float q = (1 - float(termwdfs[qterm]) / float(totalqtermoccs)) /
-               (bywdf.size() - 1);
+           // We give more slots to the better terms
+           float q = qit->first / totalweight;
           maxoccs = int(ceil(maxtotaloccs * q));
-           LOGDEB2(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
+           LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
                   qterm.c_str(), maxoccs, q));
        }
 
        Xapian::PositionIterator pos;
        // There may be query terms not in this doc. This raises an
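The key change in the hunk above: term selection now uses a quality coefficient instead of the raw within-document frequency. The coefficient multiplies the term's in-document rate (wdf/doclen) by its database-wide rate (termfreq/doccount), then buckets -log10 of the product, so rarer terms score higher. A standalone sketch with made-up numbers, just to show the scale of the buckets:

#include <cmath>
#include <cstdio>

// Same thresholds as the patch; the input rates below are hypothetical.
static double qualityCoef(double wdf, double doclen,
                          double termfreq, double doccnt)
{
    double q = (wdf / doclen) * (termfreq / doccnt);
    q = -log10(q);              // larger q <=> rarer term
    if (q < 3)      return 0.05;
    else if (q < 4) return 0.3;
    else if (q < 5) return 0.7;
    else if (q < 6) return 0.8;
    else            return 1.0;
}

int main()
{
    // Common word: 50 occurrences in a 1000-word doc, present in 9000
    // of 10000 docs: -log10(0.05 * 0.9) ~ 1.3 -> lowest bucket.
    printf("common: %.2f\n", qualityCoef(50, 1000, 9000, 10000));
    // Rare word: 1 occurrence in the same doc, present in 10 docs:
    // -log10(0.001 * 0.001) = 6 -> highest bucket.
    printf("rare:   %.2f\n", qualityCoef(1, 1000, 10, 10000));
    return 0;
}

Each term's share of the maxtotaloccs slots is then proportional to its coefficient (q / totalweight), which is why the populate loop walks byQ in reverse (best terms first).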
@@ -339,11 +365,14 @@
        try {
            unsigned int occurrences = 0;
            for (pos = db.positionlist_begin(docid, qterm);
                 pos != db.positionlist_end(docid, qterm); pos++) {
                unsigned int ipos = *pos;
-               LOGDEB2(("makeAbstract: [%s] at %d\n", qit->c_str(), ipos));
+               if (ipos < baseTextPosition) // Not in text body
+                   continue;
+               LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n",
+                       qterm.c_str(), ipos, occurrences, maxoccs));
                // Remember the term position
                qtermposs.push_back(ipos);
                // Add adjacent slots to the set to populate at next step
                unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
                unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
@@ -351,30 +380,34 @@
                    if (ii == ipos)
                        sparseDoc[ii] = qterm;
                    else
                        sparseDoc[ii] = emptys;
                }
-               // Limit the number of occurences we keep for each
-               // term. The abstract has a finite length anyway !
-               if (occurrences++ > maxoccs)
+               // Limit to allocated occurences and total size
+               if (++occurrences >= maxoccs ||
+                   qtermposs.size() >= maxtotaloccs)
                    break;
            }
        } catch (...) {
            // Term does not occur. No problem.
        }
-       // Limit total size
-       if (totaloccs++ > maxtotaloccs)
+       if (qtermposs.size() >= maxtotaloccs)
           break;
     }
-
-    LOGDEB2(("makeAbstract:%d:chosen number of positions %d\n",
+    LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
           chron.millis(), qtermposs.size()));
 
-    // Walk the full document position list (for each term walk
-    // position list) and populate slots around the query terms. We
-    // arbitrarily truncate the list to avoid taking forever. If we do
-    // cutoff, the abstract may be inconsistant, which is bad...
+    // This can happen if there are term occurences in the keywords
+    // etc. but not elsewhere ?
+    if (qtermposs.size() == 0)
+       return "";
+
+    // Walk all document's terms position lists and populate slots
+    // around the query terms. We arbitrarily truncate the list to
+    // avoid taking forever. If we do cutoff, the abstract may be
+    // inconsistant (missing words, potentially altering meaning),
+    // which is bad...
     {
       Xapian::TermIterator term;
       int cutoff = 500 * 1000;
 
       for (term = db.termlist_begin(docid);
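Structure of the two passes above, for reference: each query-term position reserves its neighbouring slots in sparseDoc as empty entries, and the single walk over the document's termlist then fills only those reserved slots. A simplified sketch with hypothetical names (the real code additionally handles prefixed terms, per-term caps, and the cutoff):

#include <map>
#include <string>

typedef std::map<unsigned int, std::string> SparseDoc;

// Mark a query term at 'pos' and reserve 'ctx' empty slots on each side;
// the empty slots are filled later from the document's termlist.
static void reserveAround(SparseDoc& doc, unsigned int pos,
                          unsigned int ctx, const std::string& qterm)
{
    unsigned int sta = pos > ctx ? pos - ctx : 0;
    for (unsigned int ii = sta; ii <= pos + ctx; ii++) {
        if (ii == pos)
            doc[ii] = qterm;
        else if (doc.find(ii) == doc.end())
            doc[ii] = "";               // reserved, not yet filled
    }
}

// Fill pass: only previously reserved, still-empty slots accept a term,
// so text far from any query term is never pulled into the abstract.
static void fillSlot(SparseDoc& doc, unsigned int pos,
                     const std::string& term)
{
    SparseDoc::iterator vit = doc.find(pos);
    if (vit != doc.end() && vit->second.empty())
        vit->second = term;
}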
@@ -399,11 +432,11 @@
                // Don't replace a term: the terms list is in
                // alphabetic order, and we may have several terms
                // at the same position, we want to keep only the
                // first one (ie: dockes and dockes@wanadoo.fr)
                if (vit->second.empty()) {
-                   LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
+                   LOGABS(("makeAbstract: populating: [%s] at %d\n",
                           (*term).c_str(), *pos));
                    sparseDoc[*pos] = *term;
                }
            }
        }
@@ -426,65 +459,33 @@
            LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
        }
     }
 #endif
 
-    LOGDEB2(("makeAbstract:%d: randomizing and extracting\n", chron.millis()));
+    LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
 
-    // We randomize the selection of term positions, from which we
-    // shall pull, starting at the beginning, until the abstract is
-    // big enough. The abstract is finally built in correct position
-    // order, thanks to the position map.
-    random_shuffle(qtermposs.begin(), qtermposs.end());
-    map<unsigned int, string> mabs;
-    unsigned int abslen = 0;
-
-    // Extract data around the N first (in random order) query term
-    // positions, and store the terms in the map. Don't concatenate
-    // immediately into chunks because there might be overlaps
+    // Add "..." at ends of chunks
     for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
         pos != qtermposs.end(); pos++) {
-
-       if (int(abslen) > m_db->m_synthAbsLen)
-           break;
-
-       unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
        unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
-
-       LOGDEB2(("makeAbstract: %d<-%d->%d\n", sta, *pos, sto));
-
-       for (unsigned int ii = sta; ii <= sto; ii++) {
-
-           if (int(abslen) > m_db->m_synthAbsLen)
-               break;
-           map<unsigned int, string>::const_iterator vit =
-               sparseDoc.find(ii);
-           if (vit != sparseDoc.end() && !vit->second.empty()) {
-               LOGDEB2(("makeAbstract: position %d -> [%s]\n",
-                        ii, vit->second.c_str()));
-               mabs[ii] = vit->second;
-               abslen += vit->second.length();
-           } else {
-               LOGDEB2(("makeAbstract: empty position at %d\n", ii));
-           }
-       }
-
        // Possibly add a ... at the end of chunk if it's not
        // overlapping
-       if (mabs.find(sto+1) == mabs.end())
-           mabs[sto+1] = "...";
+       if (sparseDoc.find(sto) != sparseDoc.end() &&
+           sparseDoc.find(sto+1) == sparseDoc.end())
+           sparseDoc[sto+1] = "...";
     }
 
-    // Build the abstract by walking the map (in order of position)
+    // Finally build the abstract by walking the map (in order of position)
     string abstract;
-    for (map<unsigned int, string>::const_iterator it = mabs.begin();
-        it != mabs.end(); it++) {
+    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
+        it != sparseDoc.end(); it++) {
        LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
        abstract += it->second + " ";
     }
 
-    // This happens for docs with no terms (only filename) indexed. I'll fix
+    // This happens for docs with no terms (only filename) indexed? I'll fix
     // one day (yeah)
     if (!abstract.compare("... "))
        abstract.clear();
 
     LOGDEB(("makeAbtract: done in %d mS\n", chron.millis()));
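With the random_shuffle/mabs stage removed, the abstract is now read straight out of sparseDoc: a std::map already iterates its keys in order, so the slots come back sorted by document position, and the "..." entries inserted at chunk ends become the visible separators. A tiny self-contained example of that final walk, with hypothetical positions and words:

#include <cstdio>
#include <map>
#include <string>

int main()
{
    std::map<unsigned int, std::string> sparseDoc;
    sparseDoc[100010] = "recoll"; sparseDoc[100011] = "indexes";
    sparseDoc[100012] = "text";   sparseDoc[100013] = "...";
    sparseDoc[100200] = "query";  sparseDoc[100201] = "terms";

    std::string abstract;
    for (std::map<unsigned int, std::string>::const_iterator it =
             sparseDoc.begin(); it != sparseDoc.end(); it++)
        abstract += it->second + " ";

    // Prints: "recoll indexes text ... query terms "
    printf("%s\n", abstract.c_str());
    return 0;
}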
@@ -971,20 +972,22 @@
            splitData.setprefix(emptystring);
            splitData.basepos += splitData.curpos + 100;
        }
     }
 
+    if (splitData.curpos < baseTextPosition)
+       splitData.basepos = baseTextPosition;
+    else
+       splitData.basepos += splitData.curpos + 100;
 
-    // Split and index body text
+    // Finally: split and index body text
     LOGDEB2(("Db::add: split body\n"));
     if (!dumb_string(doc.text, noacc)) {
        LOGERR(("Db::add: dumb_string failed\n"));
        return false;
     }
     splitter.text_to_words(noacc);
-    splitData.basepos += splitData.curpos + 100;
-
 
     ////// Special terms for other metadata. No positions for these.
     // Mime type
     newdocument.add_term("T" + doc.mimetype);
 
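The added test in Db::add guarantees that body-text splitting starts at baseTextPosition no matter how many positions the metadata fields consumed, replacing the old unconditional bump after the body. A sketch of the invariant, using the names from the patch but simplified to a pure function (assuming curpos counts the positions used so far):

static const unsigned int baseTextPosition = 100000;

// Choose the base position for the body-text split: jump to the fixed
// offset in the normal case, keep moving forward in the pathological
// case where metadata already used that many positions.
static unsigned int bodyBasePos(unsigned int basepos, unsigned int curpos)
{
    if (curpos < baseTextPosition)
        return baseTextPosition;
    return basepos + curpos + 100;
}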
@@ -1423,11 +1426,11 @@
        names.push_back("XIMPOSSIBLE");
     }
     return true;
 }
 
-// Prepare query out of "advanced search" data
+// Prepare query out of user search data
 bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
                  const string& stemlang)
 {
     if (!m_ndb) {
        LOGERR(("Db::setQuery: no db!\n"));
@@ -1445,11 +1448,10 @@
     if (!sdata->toNativeQuery(*this, &xq,
                              (opts & Db::QO_STEM) ? stemlang : "")) {
        m_reason += sdata->getReason();
        return false;
     }
-
     m_ndb->query = xq;
     delete m_ndb->enquire;
     m_ndb->enquire = new Xapian::Enquire(m_ndb->db);
     m_ndb->enquire->set_query(m_ndb->query);
     m_ndb->mset = Xapian::MSet();
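For context, setQuery() follows the ordinary Xapian lifecycle: a fresh Enquire object bound to the open database, the query installed on it, and the cached MSet reset until results are actually requested. A minimal standalone equivalent, omitting error handling and paging options:

#include <xapian.h>

// Bind a query to a database and fetch the first page of results.
Xapian::MSet runQuery(Xapian::Database& db, const Xapian::Query& query)
{
    Xapian::Enquire enquire(db);
    enquire.set_query(query);
    return enquire.get_mset(0, 10);     // results 0..9
}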