Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.75 2006-05-09 10:15:14 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.76 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
54
#define MIN(A,B) (A<B?A:B)
54
#define MIN(A,B) (A<B?A:B)
55
#endif
55
#endif
56
#ifndef NO_NAMESPACES
56
#ifndef NO_NAMESPACES
57
namespace Rcl {
57
namespace Rcl {
58
#endif
58
#endif
59
// This is how long an abstract we keep or build from beginning of text when
60
// indexing. It only has an influence on the size of the db as we are free
61
// to shorten it again when displaying
62
#define INDEX_ABSTRACT_SIZE 250
63
64
// This is the size of the abstract that we synthesize out of query
65
// term contexts at query time
66
#define MA_ABSTRACT_SIZE 250
67
// This is how many words (context size) we keep around query terms
68
// when building the abstract
69
#define MA_EXTRACT_WIDTH 4
70
59
71
// Truncate longer path and uniquize with hash . The goal for this is
60
// Truncate longer path and uniquize with hash . The goal for this is
72
// to avoid xapian max term length limitations, not to gain space (we
61
// to avoid xapian max term length limitations, not to gain space (we
73
// gain very little even with very short maxlens like 30)
62
// gain very little even with very short maxlens like 30)
74
#define PATHHASHLEN 150
63
#define PATHHASHLEN 150
...
...
79
68
80
// Data for a xapian database. There could actually be 2 different
69
// Data for a xapian database. There could actually be 2 different
81
// ones for indexing or query as there is not much in common.
70
// ones for indexing or query as there is not much in common.
82
class Native {
71
class Native {
83
 public:
72
 public:
73
    Db *m_db;
84
    bool m_isopen;
74
    bool m_isopen;
85
    bool m_iswritable;
75
    bool m_iswritable;
86
    Db::OpenMode m_mode;
76
    Db::OpenMode m_mode;
87
    string m_basedir;
77
    string m_basedir;
88
78
...
...
104
    bool dbDataToRclDoc(std::string &data, Doc &doc, 
94
    bool dbDataToRclDoc(std::string &data, Doc &doc, 
105
            int qopts,
95
            int qopts,
106
            Xapian::docid docid,
96
            Xapian::docid docid,
107
            const list<string>& terms);
97
            const list<string>& terms);
108
98
109
    Native() 
99
    Native(Db *db) 
100
  : m_db(db),
110
    : m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) 
101
      m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) 
111
    { }
102
    { }
112
    ~Native() {
103
    ~Native() {
113
    delete enquire;
104
    delete enquire;
114
    }
105
    }
115
    bool filterMatch(Db *rdb, Xapian::Document &xdoc) {
106
    bool filterMatch(Db *rdb, Xapian::Document &xdoc) {
...
...
147
    }
138
    }
148
139
149
};
140
};
150
141
151
Db::Db() 
142
Db::Db() 
152
    : m_qOpts(QO_NONE)
143
    : m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250),
144
      m_synthAbsWordCtxLen(4)
153
{
145
{
154
    m_ndb = new Native;
146
    m_ndb = new Native(this);
155
}
147
}
156
148
157
Db::~Db()
149
Db::~Db()
158
{
150
{
159
    LOGDEB1(("Db::~Db\n"));
151
    LOGDEB1(("Db::~Db\n"));
...
...
280
    if (m_ndb->m_iswritable == true) {
272
    if (m_ndb->m_iswritable == true) {
281
        m_ndb->wdb.flush();
273
        m_ndb->wdb.flush();
282
        LOGDEB(("Rcl:Db: Called xapian flush\n"));
274
        LOGDEB(("Rcl:Db: Called xapian flush\n"));
283
    }
275
    }
284
    delete m_ndb;
276
    delete m_ndb;
285
    m_ndb = new Native;
277
    m_ndb = new Native(this);
286
    if (m_ndb)
278
    if (m_ndb)
287
        return true;
279
        return true;
288
    } catch (const Xapian::Error &e) {
280
    } catch (const Xapian::Error &e) {
289
    ermsg = e.get_msg().c_str();
281
    ermsg = e.get_msg().c_str();
290
    } catch (const string &s) {
282
    } catch (const string &s) {
...
...
440
    return true;
432
    return true;
441
    }
433
    }
442
    return true;
434
    return true;
443
}
435
}
444
436
437
// Let our user set the parameters for abstract processing.
//
// Each argument is applied independently, and only when it falls inside
// its accepted range; an out-of-range value is silently ignored and the
// corresponding member keeps its previous value (the constructor
// initializes them to 250, 250 and 4).
//
// @param idxtrunc   length passed to truncate_to_word() for the abstract
//                   stored at indexing time; accepted if 0 < idxtrunc < 2000
// @param syntlen    length limit for the abstract built at query time;
//                   accepted if 0 < syntlen < 2000
// @param syntctxlen number of positions kept on each side of a query term
//                   when building the abstract; accepted if
//                   0 < syntctxlen < 20
438
void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
439
{
440
    // Log the raw requested values, before any range check is applied
    LOGDEB(("Db::setAbstractParams: trunc %d syntlen %d ctxlen %d\n",
441
      idxtrunc, syntlen, syntctxlen));
442
    // Index-time abstract truncation length
    if (idxtrunc > 0 && idxtrunc < 2000)
443
  m_idxAbsTruncLen = idxtrunc;
444
    // Query-time synthetic abstract length
    if (syntlen > 0 && syntlen < 2000)
445
  m_synthAbsLen = syntlen;
446
    // Context width around each query term
    if (syntctxlen > 0 && syntctxlen < 20)
447
  m_synthAbsWordCtxLen = syntctxlen;
448
}
449
445
// Add document in internal form to the database: index the terms in
450
// Add document in internal form to the database: index the terms in
446
// the title abstract and body and add special terms for file name,
451
// the title abstract and body and add special terms for file name,
447
// date, mime type ... , create the document data record (more
452
// date, mime type ... , create the document data record (more
448
// metadata), and update database
453
// metadata), and update database
449
bool Db::add(const string &fn, const Doc &idoc, 
454
bool Db::add(const string &fn, const Doc &idoc, 
...
...
455
460
456
    Doc doc = idoc;
461
    Doc doc = idoc;
457
462
458
    // Truncate abstract, title and keywords to reasonable lengths. If
463
    // Truncate abstract, title and keywords to reasonable lengths. If
459
    // abstract is currently empty, we make up one with the beginning
464
    // abstract is currently empty, we make up one with the beginning
460
    // of the document.
465
    // of the document. This is then not indexed, but part of the doc
466
    // data so that we can return it to a query without having to
467
    // decode the original file.
461
    bool syntabs = false;
468
    bool syntabs = false;
462
    if (doc.abstract.empty()) {
469
    if (doc.abstract.empty()) {
463
    syntabs = true;
470
    syntabs = true;
464
    doc.abstract = rclSyntAbs + 
471
    doc.abstract = rclSyntAbs + 
465
      truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE);
472
      truncate_to_word(doc.text, m_idxAbsTruncLen);
466
    } else {
473
    } else {
467
    doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
474
    doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
468
    }
475
    }
469
    doc.abstract = neutchars(doc.abstract, "\n\r");
476
    doc.abstract = neutchars(doc.abstract, "\n\r");
470
    doc.title = truncate_to_word(doc.title, 100);
477
    doc.title = truncate_to_word(doc.title, 100);
471
    doc.keywords = truncate_to_word(doc.keywords, 300);
478
    doc.keywords = truncate_to_word(doc.keywords, 300);
472
479
...
...
511
    return false;
518
    return false;
512
    }
519
    }
513
    splitter.text_to_words(noacc);
520
    splitter.text_to_words(noacc);
514
    splitData.basepos += splitData.curpos + 100;
521
    splitData.basepos += splitData.curpos + 100;
515
522
516
    // Split and index abstract
523
    // Split and index abstract. We don't do this if it is synthetic
524
    // any more (this used to give a relevance boost to the beginning
525
    // of text, why ?)
517
    LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
526
    LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
527
    if (!syntabs) {
528
  // syntabs indicator test kept here in case we want to go back
529
  // to indexing synthetic abstracts one day
518
    if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : 
530
  if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : 
519
             doc.abstract, noacc)) {
531
           doc.abstract, noacc)) {
520
    LOGERR(("Db::add: dumb_string failed\n"));
532
        LOGERR(("Db::add: dumb_string failed\n"));
521
    return false;
533
        return false;
522
    }
534
  }
523
    splitter.text_to_words(noacc);
535
  splitter.text_to_words(noacc);
536
    }
524
    splitData.basepos += splitData.curpos + 100;
537
    splitData.basepos += splitData.curpos + 100;
525
538
526
    ////// Special terms for metadata
539
    ////// Special terms for metadata
527
    // Mime type
540
    // Mime type
528
    newdocument.add_term("T" + doc.mimetype);
541
    newdocument.add_term("T" + doc.mimetype);
...
...
1180
    parms.get(string("dmtime"), doc.dmtime);
1193
    parms.get(string("dmtime"), doc.dmtime);
1181
    parms.get(string("origcharset"), doc.origcharset);
1194
    parms.get(string("origcharset"), doc.origcharset);
1182
    parms.get(string("caption"), doc.title);
1195
    parms.get(string("caption"), doc.title);
1183
    parms.get(string("keywords"), doc.keywords);
1196
    parms.get(string("keywords"), doc.keywords);
1184
    parms.get(string("abstract"), doc.abstract);
1197
    parms.get(string("abstract"), doc.abstract);
1198
    // Possibly remove synthetic abstract indicator (if it's there, we
1199
    // used to index the beginning of the text as abstract).
1185
    bool syntabs = false;
1200
    bool syntabs = false;
1186
    if (doc.abstract.find(rclSyntAbs) == 0) {
1201
    if (doc.abstract.find(rclSyntAbs) == 0) {
1187
    doc.abstract = doc.abstract.substr(rclSyntAbs.length());
1202
    doc.abstract = doc.abstract.substr(rclSyntAbs.length());
1188
    syntabs = true;
1203
    syntabs = true;
1189
    }
1204
    }
1205
    // If the option is set and the abstract is synthetic or empty (or
1206
    // abstract replacement is requested), build abstract from position data.
1190
    if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
1207
    if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
1191
    LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n"));
1208
    LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
1192
    if (doc.abstract.empty() || syntabs || 
1209
    if (doc.abstract.empty() || syntabs || 
1193
        (qopts & Db::QO_REPLACE_ABSTRACT))
1210
        (qopts & Db::QO_REPLACE_ABSTRACT))
1194
        doc.abstract = makeAbstract(docid, terms);
1211
        doc.abstract = makeAbstract(docid, terms);
1195
    }
1212
    } 
1196
    parms.get(string("ipath"), doc.ipath);
1213
    parms.get(string("ipath"), doc.ipath);
1197
    parms.get(string("fbytes"), doc.fbytes);
1214
    parms.get(string("fbytes"), doc.fbytes);
1198
    parms.get(string("dbytes"), doc.dbytes);
1215
    parms.get(string("dbytes"), doc.dbytes);
1199
    doc.xdocid = docid;
1216
    doc.xdocid = docid;
1200
    return true;
1217
    return true;
...
...
1395
    // Go through the list of query terms. For each entry in each
1412
    // Go through the list of query terms. For each entry in each
1396
    // position list, populate the slot in the document buffer, and
1413
    // position list, populate the slot in the document buffer, and
1397
    // remember the position and its neighbours
1414
    // remember the position and its neighbours
1398
    vector<unsigned int> qtermposs; // The term positions
1415
    vector<unsigned int> qtermposs; // The term positions
1399
    set<unsigned int> chunkposs; // All the positions we shall populate
1416
    set<unsigned int> chunkposs; // All the positions we shall populate
1417
    int totaloccs = 0;
1400
    for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
1418
    for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
1401
     qit++) {
1419
     qit++) {
1402
    Xapian::PositionIterator pos;
1420
    Xapian::PositionIterator pos;
1403
    // There may be query terms not in this doc. This raises an
1421
    // There may be query terms not in this doc. This raises an
1404
    // exception when requesting the position list, we just catch it.
1422
    // exception when requesting the position list, we just catch it.
...
...
1407
        for (pos = db.positionlist_begin(docid, *qit); 
1425
        for (pos = db.positionlist_begin(docid, *qit); 
1408
         pos != db.positionlist_end(docid, *qit); pos++) {
1426
         pos != db.positionlist_end(docid, *qit); pos++) {
1409
        unsigned int ipos = *pos;
1427
        unsigned int ipos = *pos;
1410
        LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
1428
        LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
1411
        // Possibly extend the array. Do it in big chunks
1429
        // Possibly extend the array. Do it in big chunks
1412
      if (ipos + MA_EXTRACT_WIDTH >= buf.size()) {
1430
      if (ipos + m_db->m_synthAbsWordCtxLen >= buf.size()) {
1413
          buf.resize(ipos + MA_EXTRACT_WIDTH + 1000);
1431
          buf.resize(ipos + m_db->m_synthAbsWordCtxLen + 1000);
1414
        }
1432
        }
1415
        buf[ipos] = *qit;
1433
        buf[ipos] = *qit;
1416
        // Remember the term position
1434
        // Remember the term position
1417
        qtermposs.push_back(ipos);
1435
        qtermposs.push_back(ipos);
1418
        // Add adjacent slots to the set to populate at next step
1436
        // Add adjacent slots to the set to populate at next step
1419
      for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH); 
1437
      for (unsigned int ii = MAX(0, ipos-m_db->m_synthAbsWordCtxLen); 
1420
           ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) {
1438
           ii <= MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1); ii++) {
1421
            chunkposs.insert(ii);
1439
            chunkposs.insert(ii);
1422
        }
1440
        }
1423
        // Limit the number of occurrences we keep for each
1441
        // Limit the number of occurrences we keep for each
1424
        // term. The abstract has a finite length anyway !
1442
        // term. The abstract has a finite length anyway !
1425
        if (occurrences++ > 10)
1443
        if (occurrences++ > 10)
1426
            break;
1444
            break;
1427
        }
1445
        }
1428
    } catch (...) {
1446
    } catch (...) {
1429
    }
1447
    }
1448
  // Limit total size (this counts query terms processed, not occurrences)
1449
  if (totaloccs++ > 100)
1450
      break;
1430
    }
1451
    }
1431
1452
1432
    LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n", 
1453
    LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n", 
1433
        chron.millis(), qtermposs.size()));
1454
        chron.millis(), qtermposs.size()));
1434
1455
...
...
1468
    // Extract data around the first (in random order) term positions,
1489
    // Extract data around the first (in random order) term positions,
1469
    // and store the chunks in the map
1490
    // and store the chunks in the map
1470
    for (vector<unsigned int>::const_iterator it = qtermposs.begin();
1491
    for (vector<unsigned int>::const_iterator it = qtermposs.begin();
1471
     it != qtermposs.end(); it++) {
1492
     it != qtermposs.end(); it++) {
1472
    unsigned int ipos = *it;
1493
    unsigned int ipos = *it;
1473
  unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH);
1494
  unsigned int start = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
1474
  unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1);
1495
  unsigned int end = MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1);
1475
    string chunk;
1496
    string chunk;
1476
    for (unsigned int ii = start; ii <= end; ii++) {
1497
    for (unsigned int ii = start; ii <= end; ii++) {
1477
        if (!buf[ii].empty()) {
1498
        if (!buf[ii].empty()) {
1478
        chunk += buf[ii] + " ";
1499
        chunk += buf[ii] + " ";
1479
        abslen += buf[ii].length();
1500
        abslen += buf[ii].length();
1480
        }
1501
        }
1481
      if (abslen > MA_ABSTRACT_SIZE)
1502
      if (int(abslen) > m_db->m_synthAbsLen)
1482
        break;
1503
        break;
1483
    }
1504
    }
1484
    if (end != buf.size()-1)
1505
    if (end != buf.size()-1)
1485
        chunk += "... ";
1506
        chunk += "... ";
1486
    mabs[ipos] = chunk;
1507
    mabs[ipos] = chunk;
1487
  if (abslen > MA_ABSTRACT_SIZE)
1508
  if (int(abslen) > m_db->m_synthAbsLen)
1488
        break;
1509
        break;
1489
    }
1510
    }
1490
1511
1491
    // Build the abstract by walking the map (in order of position)
1512
    // Build the abstract by walking the map (in order of position)
1492
    string abstract;
1513
    string abstract;