Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.118 2007-06-22 06:14:04 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.119 2007-06-25 10:25:39 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
54
#endif
54
#endif
55
#ifndef MIN
55
#ifndef MIN
56
#define MIN(A,B) (A<B?A:B)
56
#define MIN(A,B) (A<B?A:B)
57
#endif
57
#endif
58
58
59
// This is the word position offset at which we index the body text
60
// (abstract, keywords, etc.. are stored before this)
61
static const unsigned int baseTextPosition = 100000;
62
59
#undef MTIME_IN_VALUE
63
#undef MTIME_IN_VALUE
60
#ifdef MTIME_IN_VALUE
64
#ifdef MTIME_IN_VALUE
61
// Omega compatible values
65
// Omega compatible values
62
#define enum value_slot {
66
#define enum value_slot {
63
    VALUE_LASTMOD = 0,  // 4 byte big endian value - seconds since 1970.
67
    VALUE_LASTMOD = 0,  // 4 byte big endian value - seconds since 1970.
...
...
101
    Xapian::Query    query; // query descriptor: terms and subqueries
105
    Xapian::Query    query; // query descriptor: terms and subqueries
102
                // joined by operators (or/and etc...)
106
                // joined by operators (or/and etc...)
103
    Xapian::Enquire *enquire; // Open query descriptor.
107
    Xapian::Enquire *enquire; // Open query descriptor.
104
    Xapian::MSet     mset;    // Partial result set
108
    Xapian::MSet     mset;    // Partial result set
105
109
106
    // Term frequencies for current query. See makeAbstract, not used yet.
110
    // Term frequencies for current query. See makeAbstract, setQuery
107
    map<string, int>  m_termfreqs; 
111
    map<string, double>  m_termfreqs; 
108
    
112
    
109
    Native(Db *db) 
113
    Native(Db *db) 
110
    : m_db(db),
114
    : m_db(db),
111
      m_isopen(false), m_iswritable(false), enquire(0) 
115
      m_isopen(false), m_iswritable(false), enquire(0) 
112
    { }
116
    { }
...
...
230
    }
234
    }
231
    }
235
    }
232
    return out;
236
    return out;
233
}
237
}
234
238
239
//#define DEBUGABSTRACT 
240
#ifdef DEBUGABSTRACT
241
#define LOGABS LOGDEB
242
#else
243
#define LOGABS LOGDEB2
244
#endif
245
235
// Build a document abstract by extracting text chunks around the query terms
246
// Build a document abstract by extracting text chunks around the query terms
236
// This uses the db termlists, not the original document.
247
// This uses the db termlists, not the original document.
237
string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
248
string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
238
{
249
{
239
    Chrono chron;
250
    Chrono chron;
240
    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
251
    LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
241
         m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
252
         m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
242
253
243
    list<string> terms = noPrefixList(iterms);
254
    list<string> terms = noPrefixList(iterms);
244
    if (terms.empty()) {
255
    if (terms.empty()) {
245
    return "";
256
    return "";
246
    }
257
    }
247
258
248
    // We may want to use the db-wide freqs to tune the abstracts one
259
    // Retrieve db-wide frequencies for the query terms
249
    // day but we currently don't
250
#if 0
251
    if (m_termfreqs.empty()) {
260
    if (m_termfreqs.empty()) {
261
  double doccnt = db.get_doccount();
262
  if (doccnt == 0) doccnt = 1;
252
    for (list<string>::const_iterator qit = terms.begin(); 
263
    for (list<string>::const_iterator qit = terms.begin(); 
253
         qit != terms.end(); qit++) {
264
         qit != terms.end(); qit++) {
254
        m_termfreqs[*qit] = db.get_termfreq(*qit);
265
        m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
255
        LOGDEB(("makeAbstract: [%s] db freq %d\n", qit->c_str(), 
266
        LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), 
256
             m_termfreqs[*qit]));
267
             m_termfreqs[*qit]));
257
    }
268
    }
258
    LOGDEB(("makeAbstract:%d: got termfreqs\n", chron.ms()));
269
    LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
259
    }
270
    }
260
#endif
261
271
262
    // Retrieve the term Within Document Frequencies. We are going to try 
272
    // Compute a term quality coefficient by retrieving the term
273
    // Within Document Frequencies and multiplying by overall term
274
    // frequency, then using log-based thresholds. We are going to try
263
    // and show text around the less common search terms.
275
    // and show text around the less common search terms.
264
    map<string, int> termwdfs;
276
    map<string, double> termQcoefs;
265
    int totalqtermoccs = 0;
277
    double totalweight = 0;
278
    double doclen = db.get_doclength(docid);
279
    if (doclen == 0) doclen = 1;
266
    for (list<string>::const_iterator qit = terms.begin(); 
280
    for (list<string>::const_iterator qit = terms.begin(); 
267
     qit != terms.end(); qit++) {
281
     qit != terms.end(); qit++) {
268
    Xapian::TermIterator term = db.termlist_begin(docid);
282
    Xapian::TermIterator term = db.termlist_begin(docid);
269
    term.skip_to(*qit);
283
    term.skip_to(*qit);
270
    if (term != db.termlist_end(docid) && *term == *qit) {
284
    if (term != db.termlist_end(docid) && *term == *qit) {
271
      int f = term.get_wdf();
285
      double q = (term.get_wdf() / doclen) * m_termfreqs[*qit];
286
      q = -log10(q);
287
      if (q < 3) {
288
      q = 0.05;
289
      } else if (q < 4) {
290
      q = 0.3;
291
      } else if (q < 5) {
292
      q = 0.7;
293
      } else if (q < 6) {
294
      q = 0.8;
295
      } else {
296
      q = 1;
297
      }
272
        termwdfs[*qit] = f;
298
        termQcoefs[*qit] = q;
273
      totalqtermoccs += f;
299
      totalweight += q;
274
      LOGDEB2(("makeAbstract: [%s] wdf %d\n", qit->c_str(), 
275
           termwdfs[*qit]));
276
    }
300
    }
277
    }    
301
    }    
278
    LOGDEB2(("makeAbstract:%d: got wdfs totalqtermoccs %d\n", 
302
    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
279
      chron.ms(), totalqtermoccs));
280
    if (totalqtermoccs == 0) {
281
  LOGERR(("makeAbstract: no term occurrences !\n"));
282
  return "";
283
    }
284
303
285
    // Build a sorted by frequency term list: it seems reasonable to
304
    // Build a sorted by quality term list.
286
    // prefer sampling around the less frequent terms:
287
    multimap<int, string> bywdf;
305
    multimap<double, string> byQ;
288
    for (list<string>::const_iterator qit = terms.begin(); 
306
    for (list<string>::const_iterator qit = terms.begin(); 
289
     qit != terms.end(); qit++) {
307
     qit != terms.end(); qit++) {
290
    if (termwdfs.find(*qit) != termwdfs.end())
308
    if (termQcoefs.find(*qit) != termQcoefs.end())
291
        bywdf.insert(pair<int,string>(termwdfs[*qit], *qit));
309
        byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
292
    }
310
    }
293
311
312
#ifdef DEBUGABSTRACT
313
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
314
   qit != byQ.rend(); qit++) {
315
  LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
316
    }
317
#endif
318
319
294
    // For each of the query terms, query xapian for its positions
320
    // For each of the query terms, ask xapian for its positions list
295
    // list in the document. For each position entry, remember it in qtermposs
321
    // in the document. For each position entry, remember it in
296
    // and insert it and its neighbours in the set of 'interesting' positions
322
    // qtermposs and insert it and its neighbours in the set of
323
    // 'interesting' positions
297
324
298
    // The terms 'array' that we partially populate with the document
325
    // The terms 'array' that we partially populate with the document
299
    // terms, at their positions around the search terms positions:
326
    // terms, at their positions around the search terms positions:
300
    map<unsigned int, string> sparseDoc;
327
    map<unsigned int, string> sparseDoc;
301
328
302
    // All the query term positions. We remember this mainly because we are
329
    // All the chosen query term positions. 
303
    // going to random-shuffle it for selecting the chunks that we actually 
304
    // print.
305
    vector<unsigned int> qtermposs; 
330
    vector<unsigned int> qtermposs; 
306
331
307
    // Limit the total number of slots we populate.
332
    // Limit the total number of slots we populate. The 7 is taken as
333
    // average word size. It was a mistake to have the user max
334
    // abstract size parameter in characters, we basically only deal
335
    // with words. We used to limit the character size at the end, but
336
    // this damaged our careful selection of terms
308
    const unsigned int maxtotaloccs = 
337
    const unsigned int maxtotaloccs = 
309
    MAX(50, m_db->m_synthAbsLen /(4 * (m_db->m_synthAbsWordCtxLen+1)));
338
    m_db->m_synthAbsLen /(7 * (m_db->m_synthAbsWordCtxLen+1));
310
    LOGDEB2(("makeAbstract:%d: ttlqtrms %d mxttloccs %d\n", 
339
    LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
311
      chron.ms(), totalqtermoccs,  maxtotaloccs));
340
    // This can't happen, but would crash us
312
#if 0
341
    if (totalweight == 0.0) {
342
  LOGERR(("makeAbstract: 0 totalweight!\n"));
343
  return "";
344
    }
345
346
    // Let's go populate
313
    for (multimap<int, string>::iterator qit = bywdf.begin(); 
347
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
314
     qit != bywdf.end(); qit++) {
348
     qit != byQ.rend(); qit++) {
315
  LOGDEB(("%d->[%s]\n", qit->first, qit->second.c_str()));
316
    }
317
#endif
318
319
    // Find the text positions which we will have to fill with terms
320
    unsigned int totaloccs = 0;
321
    for (multimap<int, string>::iterator qit = bywdf.begin(); 
322
   qit != bywdf.end(); qit++) {
323
    string qterm = qit->second;
349
    string qterm = qit->second;
324
    unsigned int maxoccs;
350
    unsigned int maxoccs;
325
    if (bywdf.size() == 1) {
351
    if (byQ.size() == 1) {
326
        maxoccs = maxtotaloccs;
352
        maxoccs = maxtotaloccs;
327
    } else {
353
    } else {
328
      float q = (1 - float(termwdfs[qterm]) / float(totalqtermoccs)) /
354
      // We give more slots to the better terms
329
      (bywdf.size() - 1);
355
      float q = qit->first / totalweight;
330
        maxoccs = int(ceil(maxtotaloccs * q));
356
        maxoccs = int(ceil(maxtotaloccs * q));
331
        LOGDEB2(("makeAbstract: [%s] %d max occs (coef %.2f)\n", 
357
        LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n", 
332
            qterm.c_str(), maxoccs, q));
358
            qterm.c_str(), maxoccs, q));
333
    }
359
    }
334
        
360
        
335
    Xapian::PositionIterator pos;
361
    Xapian::PositionIterator pos;
336
    // There may be query terms not in this doc. This raises an
362
    // There may be query terms not in this doc. This raises an
...
...
339
    try {
365
    try {
340
        unsigned int occurrences = 0;
366
        unsigned int occurrences = 0;
341
        for (pos = db.positionlist_begin(docid, qterm); 
367
        for (pos = db.positionlist_begin(docid, qterm); 
342
         pos != db.positionlist_end(docid, qterm); pos++) {
368
         pos != db.positionlist_end(docid, qterm); pos++) {
343
        unsigned int ipos = *pos;
369
        unsigned int ipos = *pos;
344
      LOGDEB2(("makeAbstract: [%s] at %d\n", qit->c_str(), ipos));
370
      if (ipos < baseTextPosition) // Not in text body
371
          continue;
372
      LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n",
373
          qterm.c_str(), ipos, occurrences, maxoccs));
345
        // Remember the term position
374
        // Remember the term position
346
        qtermposs.push_back(ipos);
375
        qtermposs.push_back(ipos);
347
        // Add adjacent slots to the set to populate at next step
376
        // Add adjacent slots to the set to populate at next step
348
        unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
377
        unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
349
        unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
378
        unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
...
...
351
            if (ii == ipos)
380
            if (ii == ipos)
352
            sparseDoc[ii] = qterm;
381
            sparseDoc[ii] = qterm;
353
            else
382
            else
354
            sparseDoc[ii] = emptys;
383
            sparseDoc[ii] = emptys;
355
        }
384
        }
356
      // Limit the number of occurrences we keep for each
385
      // Limit to allocated occurrences and total size
357
      // term. The abstract has a finite length anyway !
358
        if (occurrences++ > maxoccs)
386
        if (++occurrences >= maxoccs || 
387
          qtermposs.size() >= maxtotaloccs)
359
            break;
388
            break;
360
        }
389
        }
361
    } catch (...) {
390
    } catch (...) {
362
        // Term does not occur. No problem.
391
        // Term does not occur. No problem.
363
    }
392
    }
364
  // Limit total size
393
  if (qtermposs.size() >= maxtotaloccs)
365
  if (totaloccs++ > maxtotaloccs)
366
        break;
394
        break;
367
    }
395
    }
368
369
    LOGDEB2(("makeAbstract:%d:chosen number of positions %d\n", 
396
    LOGABS(("makeAbstract:%d:chosen number of positions %d\n", 
370
        chron.millis(), qtermposs.size()));
397
        chron.millis(), qtermposs.size()));
371
398
372
    // Walk the full document position list (for each term walk
399
    // This can happen if there are term occurrences in the keywords
373
    // position list) and populate slots around the query terms. We
400
    // etc. but not elsewhere ?
374
    // arbitrarily truncate the list to avoid taking forever. If we do
401
    if (qtermposs.size() == 0) 
375
    // cutoff, the abstract may be inconsistent, which is bad...
402
  return "";
403
404
    // Walk all document's terms position lists and populate slots
405
    // around the query terms. We arbitrarily truncate the list to
406
    // avoid taking forever. If we do cutoff, the abstract may be
407
    // inconsistent (missing words, potentially altering meaning),
408
    // which is bad...
376
    { 
409
    { 
377
    Xapian::TermIterator term;
410
    Xapian::TermIterator term;
378
    int cutoff = 500 * 1000;
411
    int cutoff = 500 * 1000;
379
412
380
    for (term = db.termlist_begin(docid);
413
    for (term = db.termlist_begin(docid);
...
...
399
            // Don't replace a term: the terms list is in
432
            // Don't replace a term: the terms list is in
400
            // alphabetic order, and we may have several terms
433
            // alphabetic order, and we may have several terms
401
            // at the same position, we want to keep only the
434
            // at the same position, we want to keep only the
402
            // first one (ie: dockes and dockes@wanadoo.fr)
435
            // first one (ie: dockes and dockes@wanadoo.fr)
403
            if (vit->second.empty()) {
436
            if (vit->second.empty()) {
404
            LOGDEB2(("makeAbstract: populating: [%s] at %d\n", 
437
            LOGABS(("makeAbstract: populating: [%s] at %d\n", 
405
                (*term).c_str(), *pos));
438
                (*term).c_str(), *pos));
406
            sparseDoc[*pos] = *term;
439
            sparseDoc[*pos] = *term;
407
            }
440
            }
408
        }
441
        }
409
        }
442
        }
...
...
426
        LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
459
        LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
427
    }
460
    }
428
    }
461
    }
429
#endif
462
#endif
430
463
431
    LOGDEB2(("makeAbstract:%d: randomizing and extracting\n", chron.millis()));
464
    LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
432
465
433
    // We randomize the selection of term positions, from which we
466
    // Add "..." at ends of chunks
434
    // shall pull, starting at the beginning, until the abstract is
435
    // big enough. The abstract is finally built in correct position
436
    // order, thanks to the position map.
437
    random_shuffle(qtermposs.begin(), qtermposs.end());
438
    map<unsigned int, string> mabs;
439
    unsigned int abslen = 0;
440
441
    // Extract data around the N first (in random order) query term
442
    // positions, and store the terms in the map. Don't concatenate
443
    // immediately into chunks because there might be overlaps
444
    for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
467
    for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
445
     pos != qtermposs.end(); pos++) {
468
     pos != qtermposs.end(); pos++) {
446
447
  if (int(abslen) > m_db->m_synthAbsLen)
448
      break;
449
450
  unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
451
    unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
469
    unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
452
453
  LOGDEB2(("makeAbstract: %d<-%d->%d\n", sta, *pos, sto));
454
455
  for (unsigned int ii = sta; ii <= sto; ii++) {
456
457
      if (int(abslen) > m_db->m_synthAbsLen)
458
      break;
459
      map<unsigned int, string>::const_iterator vit = 
460
      sparseDoc.find(ii);
461
      if (vit != sparseDoc.end() && !vit->second.empty()) {
462
      LOGDEB2(("makeAbstract: position %d -> [%s]\n", 
463
           ii, vit->second.c_str()));
464
      mabs[ii] = vit->second;
465
      abslen += vit->second.length();
466
      } else {
467
      LOGDEB2(("makeAbstract: empty position at %d\n", ii));
468
      }
469
  }
470
470
471
    // Possibly add a ... at the end of chunk if it's not
471
    // Possibly add a ... at the end of chunk if it's not
472
    // overlapping
472
    // overlapping
473
  if (mabs.find(sto+1) == mabs.end())
473
  if (sparseDoc.find(sto) != sparseDoc.end() && 
474
      sparseDoc.find(sto+1) == sparseDoc.end())
474
        mabs[sto+1] = "...";
475
        sparseDoc[sto+1] = "...";
475
    }
476
    }
476
477
477
    // Build the abstract by walking the map (in order of position)
478
    // Finally build the abstract by walking the map (in order of position)
478
    string abstract;
479
    string abstract;
479
    for (map<unsigned int, string>::const_iterator it = mabs.begin();
480
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
480
     it != mabs.end(); it++) {
481
     it != sparseDoc.end(); it++) {
481
    LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
482
    LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
482
    abstract += it->second + " ";
483
    abstract += it->second + " ";
483
    }
484
    }
484
485
485
    // This happens for docs with no terms (only filename) indexed. I'll fix 
486
    // This happens for docs with no terms (only filename) indexed? I'll fix 
486
    // one day (yeah)
487
    // one day (yeah)
487
    if (!abstract.compare("... "))
488
    if (!abstract.compare("... "))
488
    abstract.clear();
489
    abstract.clear();
489
490
490
    LOGDEB(("makeAbtract: done in %d mS\n", chron.millis()));
491
    LOGDEB(("makeAbtract: done in %d mS\n", chron.millis()));
...
...
971
        splitData.setprefix(emptystring);
972
        splitData.setprefix(emptystring);
972
        splitData.basepos += splitData.curpos + 100;
973
        splitData.basepos += splitData.curpos + 100;
973
    }
974
    }
974
    }
975
    }
975
976
977
    if (splitData.curpos < baseTextPosition)
978
  splitData.basepos = baseTextPosition;
979
    else
980
  splitData.basepos += splitData.curpos + 100;
976
981
977
    // Split and index body text
982
    // Finally: split and index body text
978
    LOGDEB2(("Db::add: split body\n"));
983
    LOGDEB2(("Db::add: split body\n"));
979
    if (!dumb_string(doc.text, noacc)) {
984
    if (!dumb_string(doc.text, noacc)) {
980
    LOGERR(("Db::add: dumb_string failed\n"));
985
    LOGERR(("Db::add: dumb_string failed\n"));
981
    return false;
986
    return false;
982
    }
987
    }
983
    splitter.text_to_words(noacc);
988
    splitter.text_to_words(noacc);
984
    splitData.basepos += splitData.curpos + 100;
985
986
989
987
    ////// Special terms for other metadata. No positions for these.
990
    ////// Special terms for other metadata. No positions for these.
988
    // Mime type
991
    // Mime type
989
    newdocument.add_term("T" + doc.mimetype);
992
    newdocument.add_term("T" + doc.mimetype);
990
993
...
...
1423
    names.push_back("XIMPOSSIBLE");
1426
    names.push_back("XIMPOSSIBLE");
1424
    }
1427
    }
1425
    return true;
1428
    return true;
1426
}
1429
}
1427
1430
1428
// Prepare query out of "advanced search" data
1431
// Prepare query out of user search data
1429
bool Db::setQuery(RefCntr<SearchData> sdata, int opts, 
1432
bool Db::setQuery(RefCntr<SearchData> sdata, int opts, 
1430
          const string& stemlang)
1433
          const string& stemlang)
1431
{
1434
{
1432
    if (!m_ndb) {
1435
    if (!m_ndb) {
1433
    LOGERR(("Db::setQuery: no db!\n"));
1436
    LOGERR(("Db::setQuery: no db!\n"));
...
...
1445
    if (!sdata->toNativeQuery(*this, &xq, 
1448
    if (!sdata->toNativeQuery(*this, &xq, 
1446
                  (opts & Db::QO_STEM) ? stemlang : "")) {
1449
                  (opts & Db::QO_STEM) ? stemlang : "")) {
1447
    m_reason += sdata->getReason();
1450
    m_reason += sdata->getReason();
1448
    return false;
1451
    return false;
1449
    }
1452
    }
1450
1451
    m_ndb->query = xq;
1453
    m_ndb->query = xq;
1452
    delete m_ndb->enquire;
1454
    delete m_ndb->enquire;
1453
    m_ndb->enquire = new Xapian::Enquire(m_ndb->db);
1455
    m_ndb->enquire = new Xapian::Enquire(m_ndb->db);
1454
    m_ndb->enquire->set_query(m_ndb->query);
1456
    m_ndb->enquire->set_query(m_ndb->query);
1455
    m_ndb->mset = Xapian::MSet();
1457
    m_ndb->mset = Xapian::MSet();