Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.54 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.55 2006-01-27 13:34:42 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
49
#define MAX(A,B) (A>B?A:B)
49
#define MAX(A,B) (A>B?A:B)
50
#endif
50
#endif
51
#ifndef MIN
51
#ifndef MIN
52
#define MIN(A,B) (A<B?A:B)
52
#define MIN(A,B) (A<B?A:B)
53
#endif
53
#endif
54
55
// This is how long an abstract we keep or build from beginning of text when
56
// indexing. It only has an influence on the size of the db as we are free
57
// to shorten it again when displaying
58
#define INDEX_ABSTRACT_SIZE 250
59
60
// This is the size of the abstract that we synthesize out of query
61
// term contexts at query time
62
#define MA_ABSTRACT_SIZE 250
63
// This is how many words (context size) we keep around query terms
64
// when building the abstract
65
#define MA_EXTRACT_WIDTH 4
54
66
55
// Data for a xapian database. There could actually be 2 different
67
// Data for a xapian database. There could actually be 2 different
56
// ones for indexing or query as there is not much in common.
68
// ones for indexing or query as there is not much in common.
57
class Native {
69
class Native {
58
 public:
70
 public:
...
...
335
// Truncate longer paths and uniquize them with a hash. The goal for this is
347
// Truncate longer paths and uniquize them with a hash. The goal for this is
336
// to avoid xapian max term length limitations, not to gain space (we
348
// to avoid xapian max term length limitations, not to gain space (we
337
// gain very little even with very short maxlens like 30)
349
// gain very little even with very short maxlens like 30)
338
#define PATHHASHLEN 150
350
#define PATHHASHLEN 150
339
351
340
#define ABSTRACT_SIZE 200
341
const static string rclSyntAbs = "?!#@";
352
const static string rclSyntAbs = "?!#@";
342
353
343
// Add document in internal form to the database: index the terms in
354
// Add document in internal form to the database: index the terms in
344
// the title abstract and body and add special terms for file name,
355
// the title abstract and body and add special terms for file name,
345
// date, mime type ... , create the document data record (more
356
// date, mime type ... , create the document data record (more
...
...
357
    // Truncate abstract, title and keywords to reasonable lengths. If
368
    // Truncate abstract, title and keywords to reasonable lengths. If
358
    // abstract is currently empty, we make up one with the beginning
369
    // abstract is currently empty, we make up one with the beginning
359
    // of the document.
370
    // of the document.
360
    if (doc.abstract.empty()) {
371
    if (doc.abstract.empty()) {
361
    doc.abstract = rclSyntAbs + 
372
    doc.abstract = rclSyntAbs + 
362
        truncate_to_word(doc.text, ABSTRACT_SIZE);
373
        truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE);
363
    } else {
374
    } else {
364
    doc.abstract = truncate_to_word(doc.abstract, ABSTRACT_SIZE);
375
    doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
365
    }
376
    }
366
    doc.abstract = stripchars(doc.abstract, "\n\r");
377
    doc.abstract = stripchars(doc.abstract, "\n\r");
367
    doc.title = truncate_to_word(doc.title, 100);
378
    doc.title = truncate_to_word(doc.title, 100);
368
    doc.keywords = truncate_to_word(doc.keywords, 300);
379
    doc.keywords = truncate_to_word(doc.keywords, 300);
369
380
...
...
1350
// We build a possibly full size but sparsely populated (only around
1361
// We build a possibly full size but sparsely populated (only around
1351
// the search term) reconstruction of the document. It would be
1362
// the search term) reconstruction of the document. It would be
1352
// possible to compress the array, by having only multiple chunks
1363
// possible to compress the array, by having only multiple chunks
1353
// around the terms, but this would seriously complicate the data
1364
// around the terms, but this would seriously complicate the data
1354
// structure.
1365
// structure.
1355
#define EXTRACT_WIDTH 3
1356
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
1366
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
1357
{
1367
{
1358
    Chrono chron;
1368
    Chrono chron;
1359
    // A buffer that we populate with the document terms, at their position
1369
    // A buffer that we populate with the document terms, at their position
1360
    vector<string> buf;
1370
    vector<string> buf;
...
...
1374
        for (pos = db.positionlist_begin(docid, *qit); 
1384
        for (pos = db.positionlist_begin(docid, *qit); 
1375
         pos != db.positionlist_end(docid, *qit); pos++) {
1385
         pos != db.positionlist_end(docid, *qit); pos++) {
1376
        unsigned int ipos = *pos;
1386
        unsigned int ipos = *pos;
1377
        LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
1387
        LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
1378
        // Possibly extend the array. Do it in big chunks
1388
        // Possibly extend the array. Do it in big chunks
1379
        if (ipos + EXTRACT_WIDTH >= buf.size()) {
1389
        if (ipos + MA_EXTRACT_WIDTH >= buf.size()) {
1380
            buf.resize(ipos + EXTRACT_WIDTH + 1000);
1390
            buf.resize(ipos + MA_EXTRACT_WIDTH + 1000);
1381
        }
1391
        }
1382
        buf[ipos] = *qit;
1392
        buf[ipos] = *qit;
1383
        // Remember the term position
1393
        // Remember the term position
1384
        qtermposs.push_back(ipos);
1394
        qtermposs.push_back(ipos);
1385
        // Add adjacent slots to the set to populate at next step
1395
        // Add adjacent slots to the set to populate at next step
1386
        for (unsigned int ii = MAX(0, ipos-EXTRACT_WIDTH); 
1396
        for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH); 
1387
             ii <= MIN(ipos+EXTRACT_WIDTH, buf.size()-1); ii++) {
1397
             ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) {
1388
            chunkposs.insert(ii);
1398
            chunkposs.insert(ii);
1389
        }
1399
        }
1390
        // Limit the number of occurrences we keep for each
1400
        // Limit the number of occurrences we keep for each
1391
        // term. The abstract has a finite length anyway !
1401
        // term. The abstract has a finite length anyway !
1392
        if (occurrences++ > 10)
1402
        if (occurrences++ > 10)
...
...
1435
    // Extract data around the first (in random order) term positions,
1445
    // Extract data around the first (in random order) term positions,
1436
    // and store the chunks in the map
1446
    // and store the chunks in the map
1437
    for (vector<unsigned int>::const_iterator it = qtermposs.begin();
1447
    for (vector<unsigned int>::const_iterator it = qtermposs.begin();
1438
     it != qtermposs.end(); it++) {
1448
     it != qtermposs.end(); it++) {
1439
    unsigned int ipos = *it;
1449
    unsigned int ipos = *it;
1440
    unsigned int start = MAX(0, ipos-EXTRACT_WIDTH);
1450
    unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH);
1441
    unsigned int end = MIN(ipos+EXTRACT_WIDTH, buf.size()-1);
1451
    unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1);
1442
    string chunk;
1452
    string chunk;
1443
    for (unsigned int ii = start; ii <= end; ii++) {
1453
    for (unsigned int ii = start; ii <= end; ii++) {
1444
        if (!buf[ii].empty()) {
1454
        if (!buf[ii].empty()) {
1445
        chunk += buf[ii] + " ";
1455
        chunk += buf[ii] + " ";
1446
        abslen += buf[ii].length();
1456
        abslen += buf[ii].length();
1447
        }
1457
        }
1448
      if (abslen > 300)
1458
      if (abslen > MA_ABSTRACT_SIZE)
1449
        break;
1459
        break;
1450
    }
1460
    }
1451
    if (end != buf.size()-1)
1461
    if (end != buf.size()-1)
1452
        chunk += "... ";
1462
        chunk += "... ";
1453
    mabs[ipos] = chunk;
1463
    mabs[ipos] = chunk;
1454
  if (abslen > 300)
1464
  if (abslen > MA_ABSTRACT_SIZE)
1455
        break;
1465
        break;
1456
    }
1466
    }
1457
1467
1458
    // Build the abstract by walking the map (in order of position)
1468
    // Build the abstract by walking the map (in order of position)
1459
    string abstract;
1469
    string abstract;