recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [aeda84] .. [89c3df]

Switch to side-by-side view

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.142 2008-09-05 10:34:17 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.143 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 /*
  *   This program is free software; you can redistribute it and/or modify
@@ -129,6 +129,13 @@
     return false;
 }
 
+static const string keycap("caption");
+static const string keymtp("mtype");
+static const string keyfmt("fmtime");
+static const string keydmt("dmtime");
+static const string keyoc("origcharset");
+static const string keyurl("url");
+
 // Turn data record from db into document fields
 bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, 
 				Doc &doc, int percent)
@@ -137,30 +144,37 @@
     ConfSimple parms(&data);
     if (!parms.ok())
 	return false;
-    parms.get(string("url"), doc.url);
-    parms.get(string("mtype"), doc.mimetype);
-    parms.get(string("fmtime"), doc.fmtime);
-    parms.get(string("dmtime"), doc.dmtime);
-    parms.get(string("origcharset"), doc.origcharset);
-    parms.get(string("caption"), doc.meta["title"]);
-    parms.get(string("keywords"), doc.meta["keywords"]);
-    parms.get(string("abstract"), doc.meta["abstract"]);
-    parms.get(string("author"), doc.meta["author"]);
+    parms.get(keyurl, doc.url);
+    parms.get(keymtp, doc.mimetype);
+    parms.get(keyfmt, doc.fmtime);
+    parms.get(keydmt, doc.dmtime);
+    parms.get(keyoc, doc.origcharset);
+    parms.get(keycap, doc.meta[Doc::keytt]);
+    parms.get(Doc::keykw, doc.meta[Doc::keykw]);
+    parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
     // Possibly remove synthetic abstract indicator (if it's there, we
     // used to index the beginning of the text as abstract).
     doc.syntabs = false;
-    if (doc.meta["abstract"].find(rclSyntAbs) == 0) {
-	doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length());
+    if (doc.meta[Doc::keyabs].find(rclSyntAbs) == 0) {
+	doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(rclSyntAbs.length());
 	doc.syntabs = true;
     }
     char buf[20];
     sprintf(buf,"%.2f", float(percent) / 100.0);
-    doc.meta["relevancyrating"] = buf;
+    doc.meta[Doc::keyrr] = buf;
     parms.get(string("ipath"), doc.ipath);
     parms.get(string("fbytes"), doc.fbytes);
     parms.get(string("dbytes"), doc.dbytes);
     parms.get(string("sig"), doc.sig);
     doc.xdocid = docid;
+
+    // Other, not predefined meta fields:
+    list<string> keys = parms.getNames(string());
+    for (list<string>::const_iterator it = keys.begin(); 
+	 it != keys.end(); it++) {
+	if (doc.meta.find(*it) == doc.meta.end()) 
+	    parms.get(*it, doc.meta[*it]);
+    }
     return true;
 }
 
@@ -680,21 +694,21 @@
     // This is the default table
     static map<string, string> fldToPrefs;
     if (fldToPrefs.empty()) {
-	fldToPrefs["abstract"] = string();
+	fldToPrefs[Doc::keyabs] = string();
 	fldToPrefs["ext"] = "XE";
 	fldToPrefs["filename"] = "XSFN";
 
 	fldToPrefs["title"] = "S";
-	fldToPrefs["caption"] = "S";
+	fldToPrefs[keycap] = "S";
 	fldToPrefs["subject"] = "S";
 
-	fldToPrefs["author"] = "A";
+	fldToPrefs[Doc::keyau] = "A";
 	fldToPrefs["creator"] = "A";
 	fldToPrefs["from"] = "A";
 
 	fldToPrefs["keyword"] = "K";
 	fldToPrefs["tag"] = "K";
-	fldToPrefs["keywords"] = "K";
+	fldToPrefs[Doc::keykw] = "K";
 	fldToPrefs["tags"] = "K";
     }
 
@@ -803,6 +817,7 @@
 }
 
 static const int MB = 1024 * 1024;
+static const string nc("\n\r\x0c");
 
 // Add document in internal form to the database: index the terms in
 // the title abstract and body and add special terms for file name,
@@ -831,35 +846,6 @@
 
     Doc doc = idoc;
 
-    // The title, author, abstract and keywords fields are special, they
-    // get stored in the document data record.
-    // Truncate abstract, title and keywords to reasonable lengths. If
-    // abstract is currently empty, we make up one with the beginning
-    // of the document. This is then not indexed, but part of the doc
-    // data so that we can return it to a query without having to
-    // decode the original file.
-    bool syntabs = false;
-    // Note that the map accesses by operator[] create empty entries if they
-    // don't exist yet.
-    if (doc.meta["abstract"].empty()) {
-	syntabs = true;
-	doc.meta["abstract"] = rclSyntAbs + 
-	    neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r");
-    } else {
-	doc.meta["abstract"] = 
-	    neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen),
-		      "\n\r");
-    }
-    if (doc.meta["title"].empty())
-	doc.meta["title"] = doc.utf8fn;
-    doc.meta["title"] = 
-	neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r");
-    doc.meta["author"] = 
-	neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r");
-    doc.meta["keywords"] = 
-	neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r");
-
-
     Xapian::Document newdocument;
     mySplitterCB splitData(newdocument, m_stops);
     TextSplit splitter(&splitData);
@@ -882,11 +868,9 @@
     string pfx;
     for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
 	if (!meta_it->second.empty()) {
-	    if (meta_it->first == "abstract" && syntabs)
-		continue;
 	    if (!fieldToPrefix(meta_it->first, pfx)) {
 		LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
-			meta_it->first.c_str()));
+			 meta_it->first.c_str()));
 		continue;
 	    }
 	    LOGDEB1(("Db::add: field [%s] pfx [%s]: [%s]\n", 
@@ -908,7 +892,7 @@
     else
 	splitData.basepos += splitData.curpos + 100;
 
-    // Finally: split and index body text
+    // Split and index body text
     LOGDEB2(("Db::add: split body\n"));
     if (!dumb_string(doc.text, noacc)) {
 	LOGERR(("Db::add: dumb_string failed\n"));
@@ -958,11 +942,22 @@
     buf[4] = '\0';
     newdocument.add_term("Y" + string(buf)); // Year (YYYY)
 
+
+    //////////////////////////////////////////////////////////////////
     // Document data record. omindex has the following nl separated fields:
     // - url
     // - sample
     // - caption (title limited to 100 chars)
     // - mime type 
+    //
+    // The title, author, abstract and keywords fields are special,
+    // they always get stored in the document data
+    // record. Configurable other fields can be, too.
+    //
+    // We truncate stored fields abstract, title and keywords to
+    // reasonable lengths and suppress newlines (so that the data
+    // record can keep a simple syntax)
+
     string record = "url=" + doc.url;
     record += "\nmtype=" + doc.mimetype;
     record += "\nfmtime=" + doc.fmtime;
@@ -982,20 +977,55 @@
     sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
     record += string("\ndbytes=") + sizebuf;
 
-    if (!doc.ipath.empty()) {
+    if (!doc.ipath.empty())
 	record += "\nipath=" + doc.ipath;
-    }
-    if (!doc.meta["title"].empty())
-	record += "\ncaption=" + doc.meta["title"];
-    if (!doc.meta["keywords"].empty())
-	record += "\nkeywords=" + doc.meta["keywords"];
-    if (!doc.meta["abstract"].empty())
-	record += "\nabstract=" + doc.meta["abstract"];
-    if (!doc.meta["author"].empty()) {
-	record += "\nauthor=" + doc.meta["author"];
+
+    if (doc.meta[Doc::keytt].empty())
+	doc.meta[Doc::keytt] = doc.utf8fn;
+    doc.meta[Doc::keytt] = 
+	neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc);
+    if (!doc.meta[Doc::keytt].empty())
+	record += "\n" + keycap + "=" + doc.meta[Doc::keytt];
+
+    doc.meta[Doc::keykw] = 
+	neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc);
+    if (!doc.meta[Doc::keykw].empty())
+	record += "\n" + Doc::keykw + "=" + doc.meta[Doc::keykw];
+
+    // If abstract is empty, we make up one with the beginning of the
+    // document. This is then not indexed, but part of the doc data so
+    // that we can return it to a query without having to decode the
+    // original file.
+    bool syntabs = false;
+    // Note that the map accesses by operator[] create empty entries if they
+    // don't exist yet.
+    if (doc.meta[Doc::keyabs].empty()) {
+	syntabs = true;
+	if (!doc.text.empty())
+	    doc.meta[Doc::keyabs] = rclSyntAbs + 
+		neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), nc);
+    } else {
+	doc.meta[Doc::keyabs] = 
+	    neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
+		      nc);
+    }
+    if (!doc.meta[Doc::keyabs].empty())
+	record += "\n" + Doc::keyabs + "=" + doc.meta[Doc::keyabs];
+
+    RclConfig *config = RclConfig::getMainConfig();
+    if (config) {
+	const set<string>& stored = config->getStoredFields();
+	for (set<string>::const_iterator it = stored.begin();
+	     it != stored.end(); it++) {
+	    if (!doc.meta[*it].empty()) {
+		string value = 
+		    neutchars(truncate_to_word(doc.meta[*it], 150), nc);
+		record += "\n" + *it + "=" + value;
+	    }
+	}
     }
     record += "\n";
-    LOGDEB1(("Newdocument data: %s\n", record.c_str()));
+    LOGDEB(("Rcl::Db::add: new doc record:\n %s\n", record.c_str()));
     newdocument.set_data(record);
 
     const char *fnc = udi.c_str();