
--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.75 2006-05-09 10:15:14 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.76 2006-09-13 13:53:35 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -56,17 +56,6 @@
 #ifndef NO_NAMESPACES
 namespace Rcl {
 #endif
-// This is how long an abstract we keep or build from beginning of text when
-// indexing. It only has an influence on the size of the db as we are free
-// to shorten it again when displaying
-#define INDEX_ABSTRACT_SIZE 250
-
-// This is the size of the abstract that we synthetize out of query
-// term contexts at query time
-#define MA_ABSTRACT_SIZE 250
-// This is how many words (context size) we keep around query terms
-// when building the abstract
-#define MA_EXTRACT_WIDTH 4
 
 // Truncate longer path and uniquize with hash . The goal for this is
 // to avoid xapian max term length limitations, not to gain space (we
@@ -81,6 +70,7 @@
 // ones for indexing or query as there is not much in common.
 class Native {
  public:
+    Db *m_db;
     bool m_isopen;
     bool m_iswritable;
     Db::OpenMode m_mode;
@@ -106,8 +96,9 @@
 			Xapian::docid docid,
 			const list<string>& terms);
 
-    Native() 
-	: m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) 
+    Native(Db *db) 
+	: m_db(db),
+	  m_isopen(false), m_iswritable(false), m_mode(Db::DbRO), enquire(0) 
     { }
     ~Native() {
 	delete enquire;
@@ -149,9 +140,10 @@
 };
 
 Db::Db() 
-    : m_qOpts(QO_NONE)
-{
-    m_ndb = new Native;
+    : m_qOpts(QO_NONE), m_idxAbsTruncLen(250), m_synthAbsLen(250),
+      m_synthAbsWordCtxLen(4)
+{
+    m_ndb = new Native(this);
 }
 
 Db::~Db()
@@ -282,7 +274,7 @@
 	    LOGDEB(("Rcl:Db: Called xapian flush\n"));
 	}
 	delete m_ndb;
-	m_ndb = new Native;
+	m_ndb = new Native(this);
 	if (m_ndb)
 	    return true;
     } catch (const Xapian::Error &e) {
@@ -442,6 +434,19 @@
     return true;
 }
 
+// Let our user set the parameters for abstract processing
+void Db::setAbstractParams(int idxtrunc, int syntlen, int syntctxlen)
+{
+    LOGDEB(("Db::setAbstractParams: trunc %d syntlen %d ctxlen %d\n",
+	    idxtrunc, syntlen, syntctxlen));
+    if (idxtrunc > 0 && idxtrunc < 2000)
+	m_idxAbsTruncLen = idxtrunc;
+    if (syntlen > 0 && syntlen < 2000)
+	m_synthAbsLen = syntlen;
+    if (syntctxlen > 0 && syntctxlen < 20)
+	m_synthAbsWordCtxLen = syntctxlen;
+}
+
 // Add document in internal form to the database: index the terms in
 // the title abstract and body and add special terms for file name,
 // date, mime type ... , create the document data record (more
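For illustration, a caller now passes the configured lengths through this entry point instead of relying on the compile-time defines removed above. A minimal caller-side sketch, with illustrative variable names that are not part of this change:

    // Hypothetical caller-side sketch: forward configured abstract
    // parameters to Db::setAbstractParams(). Out-of-range values are
    // silently ignored by the setter, so raw configuration values can
    // be passed as-is.
    Rcl::Db rcldb;
    int idxtrunc = 250;    // truncation length for abstracts stored at index time
    int syntlen = 250;     // target size of query-time synthetic abstracts
    int syntctxlen = 4;    // words of context kept around each query term
    rcldb.setAbstractParams(idxtrunc, syntlen, syntctxlen);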
@@ -457,14 +462,16 @@
 
     // Truncate abstract, title and keywords to reasonable lengths. If
     // abstract is currently empty, we make up one with the beginning
-    // of the document.
+    // of the document. This is not indexed, but stored as part of the
+    // doc data, so that we can return it with query results without
+    // having to decode the original file.
     bool syntabs = false;
     if (doc.abstract.empty()) {
 	syntabs = true;
 	doc.abstract = rclSyntAbs + 
-	    truncate_to_word(doc.text, INDEX_ABSTRACT_SIZE);
+	    truncate_to_word(doc.text, m_idxAbsTruncLen);
     } else {
-	doc.abstract = truncate_to_word(doc.abstract, INDEX_ABSTRACT_SIZE);
+	doc.abstract = truncate_to_word(doc.abstract, m_idxAbsTruncLen);
     }
     doc.abstract = neutchars(doc.abstract, "\n\r");
     doc.title = truncate_to_word(doc.title, 100);
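The synthetic abstract is simply the start of the document text, cut at a word boundary and tagged with the rclSyntAbs prefix so it can be recognized later. As a rough idea of the word-boundary truncation relied on here, a simplified standalone sketch (not the actual truncate_to_word() from the Recoll utilities):

    #include <string>

    // Simplified sketch: cut the string at maxlen, then back up to the
    // last whitespace so we do not stop in the middle of a word. The
    // real truncate_to_word() handles more cases than this.
    static std::string truncateToWordSketch(const std::string& in,
                                            std::string::size_type maxlen)
    {
        if (in.size() <= maxlen)
            return in;
        std::string out = in.substr(0, maxlen);
        std::string::size_type pos = out.find_last_of(" \t\n\r");
        if (pos != std::string::npos)
            out.erase(pos);
        return out;
    }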
@@ -513,14 +520,20 @@
     splitter.text_to_words(noacc);
     splitData.basepos += splitData.curpos + 100;
 
-    // Split and index abstract
+    // Split and index the abstract. We no longer do this when it is
+    // synthetic (this used to give a relevance boost to the beginning
+    // of the text, for no obvious reason).
     LOGDEB2(("Db::add: split abstract [%s]\n", doc.abstract.c_str()));
-    if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : 
-		     doc.abstract, noacc)) {
-	LOGERR(("Db::add: dumb_string failed\n"));
-	return false;
-    }
-    splitter.text_to_words(noacc);
+    if (!syntabs) {
+	// syntabs indicator test kept here in case we want to go back
+	// to indexing synthetic abstracts one day
+	if (!dumb_string(syntabs ? doc.abstract.substr(rclSyntAbs.length()) : 
+			 doc.abstract, noacc)) {
+	    LOGERR(("Db::add: dumb_string failed\n"));
+	    return false;
+	}
+	splitter.text_to_words(noacc);
+    }
     splitData.basepos += splitData.curpos + 100;
 
     ////// Special terms for metadata
@@ -1182,17 +1195,21 @@
     parms.get(string("caption"), doc.title);
     parms.get(string("keywords"), doc.keywords);
     parms.get(string("abstract"), doc.abstract);
+    // Possibly remove the synthetic abstract indicator (if it is there,
+    // the abstract was made up from the beginning of the text).
     bool syntabs = false;
     if (doc.abstract.find(rclSyntAbs) == 0) {
 	doc.abstract = doc.abstract.substr(rclSyntAbs.length());
 	syntabs = true;
     }
+    // If the option is set and the abstract is synthetic or empty, build
+    // the abstract from position data.
     if ((qopts & Db::QO_BUILD_ABSTRACT) && !terms.empty()) {
-	LOGDEB1(("dbDataToRclDoc:: building abstract from position data\n"));
+	LOGDEB(("dbDataToRclDoc:: building abstract from position data\n"));
 	if (doc.abstract.empty() || syntabs || 
 	    (qopts & Db::QO_REPLACE_ABSTRACT))
 	    doc.abstract = makeAbstract(docid, terms);
-    }
+    } 
     parms.get(string("ipath"), doc.ipath);
     parms.get(string("fbytes"), doc.fbytes);
     parms.get(string("dbytes"), doc.dbytes);
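To restate the condition above: the abstract stored in the document data is kept unless the query options ask for built abstracts and the stored one is empty, synthetic, or explicitly marked for replacement. A hypothetical helper expressing the same predicate, for clarity only:

    // Illustrative restatement of the decision made above in
    // dbDataToRclDoc(): rebuild the abstract from position data only
    // when the query asked for it and the stored abstract is not worth
    // keeping as-is.
    static bool shouldRebuildAbstract(bool qoBuildAbstract, bool qoReplaceAbstract,
                                      bool abstractEmpty, bool syntabs,
                                      bool haveTerms)
    {
        return qoBuildAbstract && haveTerms &&
            (abstractEmpty || syntabs || qoReplaceAbstract);
    }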
@@ -1397,6 +1414,7 @@
     // remember the position and its neighbours
     vector<unsigned int> qtermposs; // The term positions
     set<unsigned int> chunkposs; // All the positions we shall populate
+    int totaloccs = 0;
     for (list<string>::const_iterator qit = terms.begin(); qit != terms.end();
 	 qit++) {
 	Xapian::PositionIterator pos;
@@ -1409,15 +1427,15 @@
 		unsigned int ipos = *pos;
 		LOGDEB1(("Abstract: [%s] at %d\n", qit->c_str(), ipos));
 		// Possibly extend the array. Do it in big chunks
-		if (ipos + MA_EXTRACT_WIDTH >= buf.size()) {
-		    buf.resize(ipos + MA_EXTRACT_WIDTH + 1000);
+		if (ipos + m_db->m_synthAbsWordCtxLen >= buf.size()) {
+		    buf.resize(ipos + m_db->m_synthAbsWordCtxLen + 1000);
 		}
 		buf[ipos] = *qit;
 		// Remember the term position
 		qtermposs.push_back(ipos);
 		// Add adjacent slots to the set to populate at next step
-		for (unsigned int ii = MAX(0, ipos-MA_EXTRACT_WIDTH); 
-		     ii <= MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1); ii++) {
+		for (unsigned int ii = MAX(0, ipos-m_db->m_synthAbsWordCtxLen); 
+		     ii <= MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1); ii++) {
 		    chunkposs.insert(ii);
 		}
 		// Limit the number of occurrences we keep for each
@@ -1427,6 +1445,9 @@
 	    }
 	} catch (...) {
 	}
+	// Limit total size
+	if (totaloccs++ > 100)
+	    break;
     }
 
     LOGDEB1(("Abstract:%d:chosen number of positions %d. Populating\n", 
@@ -1470,21 +1491,21 @@
     for (vector<unsigned int>::const_iterator it = qtermposs.begin();
 	 it != qtermposs.end(); it++) {
 	unsigned int ipos = *it;
-	unsigned int start = MAX(0, ipos-MA_EXTRACT_WIDTH);
-	unsigned int end = MIN(ipos+MA_EXTRACT_WIDTH, buf.size()-1);
+	unsigned int start = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
+	unsigned int end = MIN(ipos+m_db->m_synthAbsWordCtxLen, buf.size()-1);
 	string chunk;
 	for (unsigned int ii = start; ii <= end; ii++) {
 	    if (!buf[ii].empty()) {
 		chunk += buf[ii] + " ";
 		abslen += buf[ii].length();
 	    }
-	    if (abslen > MA_ABSTRACT_SIZE)
+	    if (int(abslen) > m_db->m_synthAbsLen)
 		break;
 	}
 	if (end != buf.size()-1)
 	    chunk += "... ";
 	mabs[ipos] = chunk;
-	if (abslen > MA_ABSTRACT_SIZE)
+	if (int(abslen) > m_db->m_synthAbsLen)
 	    break;
     }