|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.54 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.55 2006-01-27 13:34:42 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
49 |
// Larger of A and B. Every use of an argument is parenthesized so that
// operator expressions passed as arguments expand with the intended
// precedence. Each argument may be evaluated twice: avoid side effects.
// Note: MAX(0, a-b) does not clamp at zero when a and b are unsigned,
// because the subtraction wraps around before the comparison is made.
#define MAX(A,B) ((A)>(B)?(A):(B))
|
49 |
// Larger of A and B. Every use of an argument is parenthesized so that
// operator expressions passed as arguments expand with the intended
// precedence. Each argument may be evaluated twice: avoid side effects.
// Note: MAX(0, a-b) does not clamp at zero when a and b are unsigned,
// because the subtraction wraps around before the comparison is made.
#define MAX(A,B) ((A)>(B)?(A):(B))
|
50 |
#endif
|
50 |
#endif
|
51 |
#ifndef MIN
|
51 |
#ifndef MIN
|
52 |
// Smaller of A and B. Every use of an argument is parenthesized so that
// operator expressions passed as arguments expand with the intended
// precedence. Each argument may be evaluated twice: avoid side effects.
#define MIN(A,B) ((A)<(B)?(A):(B))
|
52 |
// Smaller of A and B. Every use of an argument is parenthesized so that
// operator expressions passed as arguments expand with the intended
// precedence. Each argument may be evaluated twice: avoid side effects.
#define MIN(A,B) ((A)<(B)?(A):(B))
|
53 |
#endif
|
53 |
#endif
|
|
|
54 |
|
|
|
55 |
// This is how long an abstract we keep or build from beginning of text when
|
|
|
56 |
// indexing. It only has an influence on the size of the db as we are free
|
|
|
57 |
// to shorten it again when displaying
|
|
|
58 |
#define INDEX_ABSTRACT_SIZE 250
|
|
|
59 |
|
|
|
60 |
// This is the size of the abstract that we synthesize out of query
|
|
|
61 |
// term contexts at query time
|
|
|
62 |
#define MA_ABSTRACT_SIZE 250
|
|
|
63 |
// This is how many words (context size) we keep around query terms
|
|
|
64 |
// when building the abstract
|
|
|
65 |
#define MA_EXTRACT_WIDTH 4
|
54 |
|
66 |
|
55 |
// Data for a xapian database. There could actually be 2 different
|
67 |
// Data for a xapian database. There could actually be 2 different
|
56 |
// ones for indexing or query as there is not much in common.
|
68 |
// ones for indexing or query as there is not much in common.
|
57 |
class Native {
|
69 |
class Native {
|
58 |
public:
|
70 |
public:
|
|
... |
|
... |
335 |
// Truncate longer path and uniquize with hash . The goal for this is
|
347 |
// Truncate longer path and uniquize with hash . The goal for this is
|
336 |
// to avoid xapian max term length limitations, not to gain space (we
|
348 |
// to avoid xapian max term length limitations, not to gain space (we
|
337 |
// gain very little even with very short maxlens like 30)
|
349 |
// gain very little even with very short maxlens like 30)
|
338 |
#define PATHHASHLEN 150
|
350 |
#define PATHHASHLEN 150
|
339 |
|
351 |
|
340 |
#define ABSTRACT_SIZE 200
|
|
|
341 |
// Marker prefix prepended to an abstract that is built from the beginning
// of the document text because the document came with an empty abstract.
const static string rclSyntAbs = "?!#@";
|
352 |
// Marker prefix prepended to an abstract that is built from the beginning
// of the document text because the document came with an empty abstract.
const static string rclSyntAbs = "?!#@";
|
342 |
|
353 |
|
343 |
// Add document in internal form to the database: index the terms in
|
354 |
// Add document in internal form to the database: index the terms in
|
344 |
// the title abstract and body and add special terms for file name,
|
355 |
// the title abstract and body and add special terms for file name,
|
345 |
// date, mime type ... , create the document data record (more
|
356 |
// date, mime type ... , create the document data record (more
|
|
... |
|
... |
357 |
// Truncate abstract, title and keywords to reasonable lengths. If
|
368 |
// Truncate abstract, title and keywords to reasonable lengths. If
|
358 |
// abstract is currently empty, we make up one with the beginning
|
369 |
// abstract is currently empty, we make up one with the beginning
|
359 |
// of the document.
|
370 |
// of the document.
|
360 |
if (doc.abstract.empty()) {
|
371 |
if (doc.abstract.empty()) {
|
361 |
doc.abstract = rclSyntAbs +
|
372 |
doc.abstract = rclSyntAbs +
|
362 |
truncate_to_word(doc.text, ABSTRACT_SIZE);
|
373 |
truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE);
|
363 |
} else {
|
374 |
} else {
|
364 |
doc.abstract = truncate_to_word(doc.abstract, ABSTRACT_SIZE);
|
375 |
doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
|
365 |
}
|
376 |
}
|
366 |
doc.abstract = stripchars(doc.abstract, "\n\r");
|
377 |
doc.abstract = stripchars(doc.abstract, "\n\r");
|
367 |
doc.title = truncate_to_word(doc.title, 100);
|
378 |
doc.title = truncate_to_word(doc.title, 100);
|
368 |
doc.keywords = truncate_to_word(doc.keywords, 300);
|
379 |
doc.keywords = truncate_to_word(doc.keywords, 300);
|
369 |
|
380 |
|
|
... |
|
... |
1350 |
// We build a possibly full size but sparsely populated (only around
|
1361 |
// We build a possibly full size but sparsely populated (only around
|
1351 |
// the search term) reconstruction of the document. It would be
|
1362 |
// the search term) reconstruction of the document. It would be
|
1352 |
// possible to compress the array, by having only multiple chunks
|
1363 |
// possible to compress the array, by having only multiple chunks
|
1353 |
// around the terms, but this would seriously complicate the data
|
1364 |
// around the terms, but this would seriously complicate the data
|
1354 |
// structure.
|
1365 |
// structure.
|
1355 |
#define EXTRACT_WIDTH 3
|
|
|
1356 |
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
1366 |
string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
|
1357 |
{
|
1367 |
{
|
1358 |
Chrono chron;
|
1368 |
Chrono chron;
|
1359 |
// A buffer that we populate with the document terms, at their position
|
1369 |
// A buffer that we populate with the document terms, at their position
|
1360 |
vector<string> buf;
|
1370 |
vector<string> buf;
|
|
... |
|
... |
1374 |
for (pos = db.positionlist_begin(docid, *qit);
|
1384 |
for (pos = db.positionlist_begin(docid, *qit);
|
1375 |
pos != db.positionlist_end(docid, *qit); pos++) {
|
1385 |
pos != db.positionlist_end(docid, *qit); pos++) {
|
1376 |
unsigned int ipos = *pos;
|
1386 |
unsigned int ipos = *pos;
|
1377 |
LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
1387 |
LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
|
1378 |
// Possibly extend the array. Do it in big chunks
|
1388 |
// Possibly extend the array. Do it in big chunks
|
1379 |
if (ipos + EXTRACT_WIDTH >= buf.size()) {
|
1389 |
if (ipos + MA_EXTRACT_WIDTH >= buf.size()) {
|
1380 |
buf.resize(ipos + EXTRACT_WIDTH + 1000);
|
1390 |
buf.resize(ipos + MA_EXTRACT_WIDTH + 1000);
|
1381 |
}
|
1391 |
}
|
1382 |
buf[ipos] = *qit;
|
1392 |
buf[ipos] = *qit;
|
1383 |
// Remember the term position
|
1393 |
// Remember the term position
|
1384 |
qtermposs.push_back(ipos);
|
1394 |
qtermposs.push_back(ipos);
|
1385 |
// Add adjacent slots to the set to populate at next step
|
1395 |
// Add adjacent slots to the set to populate at next step
|
1386 |
for (unsigned int ii = MAX(0, ipos-EXTRACT_WIDTH);
|
1396 |
for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH);
|
1387 |
ii <= MIN(ipos+EXTRACT_WIDTH, buf.size()-1); ii++) {
|
1397 |
ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) {
|
1388 |
chunkposs.insert(ii);
|
1398 |
chunkposs.insert(ii);
|
1389 |
}
|
1399 |
}
|
1390 |
// Limit the number of occurrences we keep for each
|
1400 |
// Limit the number of occurrences we keep for each
|
1391 |
// term. The abstract has a finite length anyway !
|
1401 |
// term. The abstract has a finite length anyway !
|
1392 |
if (occurrences++ > 10)
|
1402 |
if (occurrences++ > 10)
|
|
... |
|
... |
1435 |
// Extract data around the first (in random order) term positions,
|
1445 |
// Extract data around the first (in random order) term positions,
|
1436 |
// and store the chunks in the map
|
1446 |
// and store the chunks in the map
|
1437 |
for (vector<unsigned int>::const_iterator it = qtermposs.begin();
|
1447 |
for (vector<unsigned int>::const_iterator it = qtermposs.begin();
|
1438 |
it != qtermposs.end(); it++) {
|
1448 |
it != qtermposs.end(); it++) {
|
1439 |
unsigned int ipos = *it;
|
1449 |
unsigned int ipos = *it;
|
1440 |
unsigned int start = MAX(0, ipos-EXTRACT_WIDTH);
|
1450 |
unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH);
|
1441 |
unsigned int end = MIN(ipos+EXTRACT_WIDTH, buf.size()-1);
|
1451 |
unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1);
|
1442 |
string chunk;
|
1452 |
string chunk;
|
1443 |
for (unsigned int ii = start; ii <= end; ii++) {
|
1453 |
for (unsigned int ii = start; ii <= end; ii++) {
|
1444 |
if (!buf[ii].empty()) {
|
1454 |
if (!buf[ii].empty()) {
|
1445 |
chunk += buf[ii] + " ";
|
1455 |
chunk += buf[ii] + " ";
|
1446 |
abslen += buf[ii].length();
|
1456 |
abslen += buf[ii].length();
|
1447 |
}
|
1457 |
}
|
1448 |
if (abslen > 300)
|
1458 |
if (abslen > MA_ABSTRACT_SIZE)
|
1449 |
break;
|
1459 |
break;
|
1450 |
}
|
1460 |
}
|
1451 |
if (end != buf.size()-1)
|
1461 |
if (end != buf.size()-1)
|
1452 |
chunk += "... ";
|
1462 |
chunk += "... ";
|
1453 |
mabs[ipos] = chunk;
|
1463 |
mabs[ipos] = chunk;
|
1454 |
if (abslen > 300)
|
1464 |
if (abslen > MA_ABSTRACT_SIZE)
|
1455 |
break;
|
1465 |
break;
|
1456 |
}
|
1466 |
}
|
1457 |
|
1467 |
|
1458 |
// Build the abstract by walking the map (in order of position)
|
1468 |
// Build the abstract by walking the map (in order of position)
|
1459 |
string abstract;
|
1469 |
string abstract;
|