recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [52aaa5] .. [ed449c]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.55 2006-01-27 13:34:42 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
// Larger of A and B. Both arguments are fully parenthesized so that
// operands containing operators of lower precedence than the comparison
// (for example `x & 1`) are compared as a whole.
// Each argument may be evaluated twice: do not pass expressions with side
// effects.
// NOTE(review): parenthesizing does not turn `MAX(0, ipos - N)` into a
// safe lower clamp when ipos is unsigned: the subtraction wraps around
// before the comparison happens. Such call sites need an explicit
// `ipos > N ? ipos - N : 0` test.
#define MAX(A,B) (((A)>(B))?(A):(B))
#endif
#ifndef MIN
// Smaller of A and B. Both arguments are fully parenthesized so that
// operands containing operators of lower precedence than the comparison
// (for example `x & 1`) are compared as a whole.
// Each argument may be evaluated twice: do not pass expressions with side
// effects.
#define MIN(A,B) (((A)<(B))?(A):(B))
#endif

// Maximum length of the abstract that we store at indexing time, whether
// it came with the document or was built from the beginning of the text.
// It only has an influence on the size of the db, as we are free to
// shorten the abstract again when displaying.
#define INDEX_ABSTRACT_SIZE 250

// This is the size of the abstract that we synthesize out of query
// term contexts at query time. Native::makeAbstract() stops collecting
// context once the accumulated length of the terms it kept exceeds
// this value.
#define MA_ABSTRACT_SIZE 250
// This is how many word positions (context size) we keep on each side
// of a query term when building the abstract
#define MA_EXTRACT_WIDTH 4

// Data for a xapian database. There could actually be 2 different
// ones for indexing or query as there is not much in common.
class Native {
 public:
...
// Maximum length for paths: longer ones are truncated and made unique
// again with a hash. The goal for this is to avoid xapian max term
// length limitations, not to gain space (we gain very little even with
// very short maxlens like 30)
#define PATHHASHLEN 150


// Marker prepended to an abstract that was made up from the beginning of
// the document text because the document came with an empty abstract.
static const string rclSyntAbs("?!#@");

// Add document in internal form to the database: index the terms in
// the title abstract and body and add special terms for file name,
// date, mime type ... , create the document data record (more
...
    // Truncate abstract, title and keywords to reasonable lengths. If
    // abstract is currently empty, we make up one with the beginning
    // of the document.
    if (doc.abstract.empty()) {
    doc.abstract = rclSyntAbs + 
        truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE);
    } else {
    doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
    }
    doc.abstract = stripchars(doc.abstract, "\n\r");
    doc.title = truncate_to_word(doc.title, 100);
    doc.keywords = truncate_to_word(doc.keywords, 300);

...
// We build a possibly full size but sparsely populated (only around
// the search term) reconstruction of the document. It would be
// possible to compress the array, by having only multiple chunks
// around the terms, but this would seriously complicate the data
// structure.

string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
{
    Chrono chron;
    // A buffer that we populate with the document terms, at their position
    vector<string> buf;
...
        for (pos = db.positionlist_begin(docid, *qit); 
         pos != db.positionlist_end(docid, *qit); pos++) {
        unsigned int ipos = *pos;
        LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
        // Possibly extend the array. Do it in big chunks
        if (ipos + MA_EXTRACT_WIDTH >= buf.size()) {
            buf.resize(ipos + MA_EXTRACT_WIDTH + 1000);
        }
        buf[ipos] = *qit;
        // Remember the term position
        qtermposs.push_back(ipos);
        // Add adjacent slots to the set to populate at next step
        for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH); 
             ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) {
            chunkposs.insert(ii);
        }
        // Limit the number of occurrences we keep for each
        // term. The abstract has a finite length anyway !
        if (occurrences++ > 10)
...
    // Extract data around the first (in random order) term positions,
    // and store the chunks in the map
    for (vector<unsigned int>::const_iterator it = qtermposs.begin();
     it != qtermposs.end(); it++) {
    unsigned int ipos = *it;
    unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH);
    unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1);
    string chunk;
    for (unsigned int ii = start; ii <= end; ii++) {
        if (!buf[ii].empty()) {
        chunk += buf[ii] + " ";
        abslen += buf[ii].length();
        }
      if (abslen > MA_ABSTRACT_SIZE)
        break;
    }
    if (end != buf.size()-1)
        chunk += "... ";
    mabs[ipos] = chunk;
  if (abslen > MA_ABSTRACT_SIZE)
        break;
    }

    // Build the abstract by walking the map (in order of position)
    string abstract;

	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.54 2006-01-26 12:28:50 dockes Exp $ (C) 2004 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.55 2006-01-27 13:34:42 dockes Exp $ (C) 2004 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
49	#define MAX(A,B) (A>B?A:B)	49	#define MAX(A,B) (A>B?A:B)
50	#endif	50	#endif
51	#ifndef MIN	51	#ifndef MIN
52	#define MIN(A,B) (A<B?A:B)	52	#define MIN(A,B) (A<B?A:B)
53	#endif	53	#endif
		54
		55	// This is how long an abstract we keep or build from beginning of text when
		56	// indexing. It only has an influence on the size of the db as we are free
		57	// to shorten it again when displaying
		58	#define INDEX_ABSTRACT_SIZE 250
		59
		60	// This is the size of the abstract that we synthetize out of query
		61	// term contexts at query time
		62	#define MA_ABSTRACT_SIZE 250
		63	// This is how many words (context size) we keep around query terms
		64	// when building the abstract
		65	#define MA_EXTRACT_WIDTH 4
54		66
55	// Data for a xapian database. There could actually be 2 different	67	// Data for a xapian database. There could actually be 2 different
56	// ones for indexing or query as there is not much in common.	68	// ones for indexing or query as there is not much in common.
57	class Native {	69	class Native {
58	public:	70	public:
	...		...
335	// Truncate longer path and uniquize with hash . The goal for this is	347	// Truncate longer path and uniquize with hash . The goal for this is
336	// to avoid xapian max term length limitations, not to gain space (we	348	// to avoid xapian max term length limitations, not to gain space (we
337	// gain very little even with very short maxlens like 30)	349	// gain very little even with very short maxlens like 30)
338	#define PATHHASHLEN 150	350	#define PATHHASHLEN 150
339		351
340	#define ABSTRACT_SIZE 200
341	const static string rclSyntAbs = "?!#@";	352	const static string rclSyntAbs = "?!#@";
342		353
343	// Add document in internal form to the database: index the terms in	354	// Add document in internal form to the database: index the terms in
344	// the title abstract and body and add special terms for file name,	355	// the title abstract and body and add special terms for file name,
345	// date, mime type ... , create the document data record (more	356	// date, mime type ... , create the document data record (more
	...		...
357	// Truncate abstract, title and keywords to reasonable lengths. If	368	// Truncate abstract, title and keywords to reasonable lengths. If
358	// abstract is currently empty, we make up one with the beginning	369	// abstract is currently empty, we make up one with the beginning
359	// of the document.	370	// of the document.
360	if (doc.abstract.empty()) {	371	if (doc.abstract.empty()) {
361	doc.abstract = rclSyntAbs +	372	doc.abstract = rclSyntAbs +
362	truncate_to_word(doc.text, ABSTRACT_SIZE);	373	truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE);
363	} else {	374	} else {
364	doc.abstract = truncate_to_word(doc.abstract, ABSTRACT_SIZE);	375	doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
365	}	376	}
366	doc.abstract = stripchars(doc.abstract, "\n\r");	377	doc.abstract = stripchars(doc.abstract, "\n\r");
367	doc.title = truncate_to_word(doc.title, 100);	378	doc.title = truncate_to_word(doc.title, 100);
368	doc.keywords = truncate_to_word(doc.keywords, 300);	379	doc.keywords = truncate_to_word(doc.keywords, 300);
369		380
	...		...
1350	// We build a possibly full size but sparsely populated (only around	1361	// We build a possibly full size but sparsely populated (only around
1351	// the search term) reconstruction of the document. It would be	1362	// the search term) reconstruction of the document. It would be
1352	// possible to compress the array, by having only multiple chunks	1363	// possible to compress the array, by having only multiple chunks
1353	// around the terms, but this would seriously complicate the data	1364	// around the terms, but this would seriously complicate the data
1354	// structure.	1365	// structure.
1355	#define EXTRACT_WIDTH 3
1356	string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)	1366	string Native::makeAbstract(Xapian::docid docid, const list<string>& terms)
1357	{	1367	{
1358	Chrono chron;	1368	Chrono chron;
1359	// A buffer that we populate with the document terms, at their position	1369	// A buffer that we populate with the document terms, at their position
1360	vector<string> buf;	1370	vector<string> buf;
	...		...
1374	for (pos = db.positionlist_begin(docid, *qit);	1384	for (pos = db.positionlist_begin(docid, *qit);
1375	pos != db.positionlist_end(docid, *qit); pos++) {	1385	pos != db.positionlist_end(docid, *qit); pos++) {
1376	unsigned int ipos = *pos;	1386	unsigned int ipos = *pos;
1377	LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));	1387	LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
1378	// Possibly extend the array. Do it in big chunks	1388	// Possibly extend the array. Do it in big chunks
1379	if (ipos + EXTRACT_WIDTH >= buf.size()) {	1389	if (ipos + MA_EXTRACT_WIDTH >= buf.size()) {
1380	buf.resize(ipos + EXTRACT_WIDTH + 1000);	1390	buf.resize(ipos + MA_EXTRACT_WIDTH + 1000);
1381	}	1391	}
1382	buf[ipos] = *qit;	1392	buf[ipos] = *qit;
1383	// Remember the term position	1393	// Remember the term position
1384	qtermposs.push_back(ipos);	1394	qtermposs.push_back(ipos);
1385	// Add adjacent slots to the set to populate at next step	1395	// Add adjacent slots to the set to populate at next step
1386	for (unsigned int ii = MAX(0, ipos-EXTRACT_WIDTH);	1396	for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH);
1387	ii <= MIN(ipos+EXTRACT_WIDTH, buf.size()-1); ii++) {	1397	ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) {
1388	chunkposs.insert(ii);	1398	chunkposs.insert(ii);
1389	}	1399	}
1390	// Limit the number of occurences we keep for each	1400	// Limit the number of occurences we keep for each
1391	// term. The abstract has a finite length anyway !	1401	// term. The abstract has a finite length anyway !
1392	if (occurrences++ > 10)	1402	if (occurrences++ > 10)
	...		...
1435	// Extract data around the first (in random order) term positions,	1445	// Extract data around the first (in random order) term positions,
1436	// and store the chunks in the map	1446	// and store the chunks in the map
1437	for (vector<unsigned int>::const_iterator it = qtermposs.begin();	1447	for (vector<unsigned int>::const_iterator it = qtermposs.begin();
1438	it != qtermposs.end(); it++) {	1448	it != qtermposs.end(); it++) {
1439	unsigned int ipos = *it;	1449	unsigned int ipos = *it;
1440	unsigned int start = MAX(0, ipos-EXTRACT_WIDTH);	1450	unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH);
1441	unsigned int end = MIN(ipos+EXTRACT_WIDTH, buf.size()-1);	1451	unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1);
1442	string chunk;	1452	string chunk;
1443	for (unsigned int ii = start; ii <= end; ii++) {	1453	for (unsigned int ii = start; ii <= end; ii++) {
1444	if (!buf[ii].empty()) {	1454	if (!buf[ii].empty()) {
1445	chunk += buf[ii] + " ";	1455	chunk += buf[ii] + " ";
1446	abslen += buf[ii].length();	1456	abslen += buf[ii].length();
1447	}	1457	}
1448	if (abslen > 300)	1458	if (abslen > MA_ABSTRACT_SIZE)
1449	break;	1459	break;
1450	}	1460	}
1451	if (end != buf.size()-1)	1461	if (end != buf.size()-1)
1452	chunk += "... ";	1462	chunk += "... ";
1453	mabs[ipos] = chunk;	1463	mabs[ipos] = chunk;
1454	if (abslen > 300)	1464	if (abslen > MA_ABSTRACT_SIZE)
1455	break;	1465	break;
1456	}	1466	}
1457		1467
1458	// Build the abstract by walking the map (in order of position)	1468	// Build the abstract by walking the map (in order of position)
1459	string abstract;	1469	string abstract;