--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -263,6 +263,110 @@
     return false;
 }
 
+// Remove a term from the document if its wdf is 0. This should
+// probably be done by Xapian itself when removing a posting brings
+// the frequency to 0, but we have to do it ourselves.
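+// (wdf: the term's within-document frequency, which is what
+// remove_posting() decrements by its wdfdec parameter).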
+bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
+{
+    LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));
+
+    // Find the term
+    Xapian::TermIterator xit;
+    XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
+	   xrdb, m_rcldb->m_reason);
+    if (!m_rcldb->m_reason.empty()) {
+	LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n", 
+		term.c_str(), m_rcldb->m_reason.c_str()));
+	return false;
+    }
+    if (xit == xdoc.termlist_end() || term.compare(*xit)) {
+	LOGDEB0(("Db::clearDocTermIFWdf0: term [%s] not found. xit: [%s]\n", 
+		 term.c_str(), xit == xdoc.termlist_end() ? "EOL":(*xit).c_str()));
+	return false;
+    }
+
+    // Clear the term if its frequency is 0
+    if (xit.get_wdf() == 0) {
+	LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
+	XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
+	if (!m_rcldb->m_reason.empty()) {
+	    LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n", 
+		     term.c_str(), m_rcldb->m_reason.c_str()));
+	}
+    }
+    return true;
+}
+
+// Holder for term + pos
+struct DocPosting {
+    DocPosting(const string& t, Xapian::termpos ps)
+	: term(t), pos(ps) {}
+    string term;
+    Xapian::termpos pos;
+};
+
+// Clear all terms for given field for given document.
+// The terms to be cleared are all those with the appropriate
+// prefix. We also remove the postings for the unprefixed terms (that
+// is, we undo what we did when indexing).
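+// For example (hypothetical prefix): with pfx "XY", the postings for
+// a term stored as "XYword" are erased together with the postings for
+// the bare "word" at the same positions.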
+bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
+			    Xapian::termcount wdfdec)
+{
+    LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
+	     pfx.c_str(), unsigned(xdoc.get_docid())));
+
+    vector<DocPosting> eraselist;
+
+    string wrapd = wrap_prefix(pfx);
+
+    m_rcldb->m_reason.clear();
+    for (int tries = 0; tries < 2; tries++) {
+	try {
+	    Xapian::TermIterator xit;
+	    xit = xdoc.termlist_begin();
+	    xit.skip_to(wrapd);
+	    while (xit != xdoc.termlist_end() && 
+		!(*xit).compare(0, wrapd.size(), wrapd)) {
+		LOGDEB1(("Db::clearField: erasing for [%s]\n", (*xit).c_str()));
+		Xapian::PositionIterator posit;
+		for (posit = xit.positionlist_begin();
+		     posit != xit.positionlist_end(); posit++) {
+		    eraselist.push_back(DocPosting(*xit, *posit));
+		    eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
+		}
+		xit++;
+	    }
+	} catch (const Xapian::DatabaseModifiedError &e) {
+	    m_rcldb->m_reason = e.get_msg();
+	    xrdb.reopen();
+	    continue;
+	} XCATCHERROR(m_rcldb->m_reason);
+	break;
+    }
+    if (!m_rcldb->m_reason.empty()) {
+	LOGERR(("Db::clearField: failed building erase list: %s\n", 
+		m_rcldb->m_reason.c_str()));
+	return false;
+    }
+
+    // Now remove the found postings, and the terms whose wdf drops to 0
+    for (vector<DocPosting>::const_iterator it = eraselist.begin();
+	 it != eraselist.end(); it++) {
+	LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n", 
+		 it->term.c_str(), int(it->pos)));
+	XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);, 
+	       xwdb, m_rcldb->m_reason);
+	if (!m_rcldb->m_reason.empty()) {
+	    // Note that this normally fails for the non-prefixed XXST
+	    // and ND terms, so don't make a fuss
+	    LOGDEB1(("Db::clearField: remove_posting failed for [%s],%d: %s\n",
+		     it->term.c_str(),int(it->pos), m_rcldb->m_reason.c_str()));
+	}
+	clearDocTermIfWdf0(xdoc, it->term);
+    }
+    return true;
+}
+
 // Check if doc given by udi is indexed by term
 bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
 {
@@ -460,11 +564,7 @@
 {
 #ifdef IDX_THREADS
     Chrono chron;
-    // In the case where there is a separate (single) db update
-    // thread, we only need to protect the update map update below
-    // (against interaction with threads calling needUpdate()). Else,
-    // all threads from above need to synchronize here
-    PTMutexLocker lock(m_mutex, m_havewriteq);
+    PTMutexLocker lock(m_mutex);
 #endif
 
     // Check file system full every mbyte of indexed text. It's a bit wasteful
@@ -491,11 +591,6 @@
     try {
 	Xapian::docid did = 
 	    xwdb.replace_document(uniterm, newdocument);
-#ifdef IDX_THREADS
-	// Need to protect against interaction with the up-to-date checks
-	// which also update the existence map
-	PTMutexLocker lock(m_mutex, !m_havewriteq);
-#endif
 	if (did < m_rcldb->updated.size()) {
 	    m_rcldb->updated[did] = true;
 	    LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
@@ -934,7 +1029,6 @@
     return false;
 }
 
-
 // The splitter breaks text into words and adds postings to the Xapian
 // document. We use a single object to split all of the document
 // fields and position jumps to separate fields
@@ -1151,7 +1245,7 @@
 	return false;
 
     Xapian::Document newdocument;
-
+    
     // The term processing pipeline:
     TermProcIdx tpidx;
     TermProc *nxt = &tpidx;
@@ -1165,276 +1259,287 @@
     TextSplitDb splitter(newdocument, nxt);
     tpidx.setTSD(&splitter);
 
-    // If the ipath is like a path, index the last element. This is
-    // for compound documents like zip and chm for which the filter
-    // uses the file path as ipath. 
-    if (!doc.ipath.empty() && 
-	doc.ipath.find_first_not_of("0123456789") != string::npos) {
-	string utf8ipathlast;
-	// There is no way in hell we could have an idea of the
-	// charset here, so let's hope it's ascii or utf-8. We call
-	// transcode to strip the bad chars and pray
-	if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
-		      "UTF-8", "UTF-8")) {
-	    splitter.text_to_words(utf8ipathlast);
-	}
-    }
-
-    // Split and index the path from the url for path-based filtering
-    {
-	string path = url_gpath(doc.url);
-	vector<string> vpath;
-	stringToTokens(path, vpath, "/");
-	// If vpath is not /, the last elt is the file/dir name, not a
-	// part of the path.
-	if (vpath.size())
-	    vpath.resize(vpath.size()-1);
-	splitter.curpos = 0;
-	newdocument.add_posting(wrap_prefix(pathelt_prefix),
-				splitter.basepos + splitter.curpos++);
-	for (vector<string>::iterator it = vpath.begin(); 
-	     it != vpath.end(); it++){
-	    if (it->length() > 230) {
-		// Just truncate it. May still be useful because of wildcards
-		*it = it->substr(0, 230);
-	    }
-	    newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
-				    splitter.basepos + splitter.curpos++);
-	}
-    }
-
-    // Index textual metadata.  These are all indexed as text with
-    // positions, as we may want to do phrase searches with them (this
-    // makes no sense for keywords by the way).
-    //
-    // The order has no importance, and we set a position gap of 100
-    // between fields to avoid false proximity matches.
-    map<string, string>::iterator meta_it;
-    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
-	if (!meta_it->second.empty()) {
-	    const FieldTraits *ftp;
-	    // We don't test for an empty prefix here. Some fields are part
-	    // of the internal conf with an empty prefix (ie: abstract).
-	    if (!fieldToTraits(meta_it->first, &ftp)) {
-		LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
-			 meta_it->first.c_str()));
-		continue;
-	    }
-	    LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
-		     meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
-		     meta_it->second.c_str()));
-	    splitter.setprefix(ftp->pfx);
-	    splitter.setwdfinc(ftp->wdfinc);
-	    if (!splitter.text_to_words(meta_it->second))
-                LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
-                        meta_it->first.c_str()));
-	}
-    }
-    splitter.setprefix(string());
-    splitter.setwdfinc(1);
-
-    if (splitter.curpos < baseTextPosition)
-	splitter.basepos = baseTextPosition;
-
-    // Split and index body text
-    LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
-
-#ifdef TEXTSPLIT_STATS
-    splitter.resetStats();
-#endif
-    if (!splitter.text_to_words(doc.text))
-        LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
-
-#ifdef TEXTSPLIT_STATS
-    // Reject bad data. unrecognized base64 text is characterized by
-    // high avg word length and high variation (because there are
-    // word-splitters like +/ inside the data).
-    TextSplit::Stats::Values v = splitter.getStats();
-    // v.avglen > 15 && v.sigma > 12 
-    if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
-	LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
-	 "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
-		 v.count, v.avglen, v.sigma, doc.url.c_str(), 
-		 doc.ipath.c_str(), doc.text.c_str()));
-	return true;
-    }
-#endif
-
-    ////// Special terms for other metadata. No positions for these.
-    // Mime type
-    newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
-
-    // Simple file name indexed unsplit for specific "file name"
-    // searches. This is not the same as a filename: clause inside the
-    // query language.
-    // We also add a term for the filename extension if any.
-    string utf8fn;
-    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
-	string fn;
-	if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
-	    // We should truncate after extracting the extension, but this is
-	    // a pathological case anyway
-	    if (fn.size() > 230)
-		utf8truncate(fn, 230);
-	    string::size_type pos = fn.rfind('.');
-	    if (pos != string::npos && pos != fn.length() - 1) {
-		newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
-					     fn.substr(pos + 1));
-	    }
-	    newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
-	}
-    }
-
     // Udi unique term: this is used for file existence/uptodate
     // checks, and unique id for the replace_document() call.
     string uniterm = make_uniterm(udi);
-    newdocument.add_boolean_term(uniterm);
-    // Parent term. This is used to find all descendents, mostly to delete them 
-    // when the parent goes away
-    if (!parent_udi.empty()) {
-	newdocument.add_boolean_term(make_parentterm(parent_udi));
-    }
-    // Dates etc.
-    time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : 
-			 doc.dmtime.c_str());
-    struct tm *tm = localtime(&mtime);
-    char buf[9];
-    snprintf(buf, 9, "%04d%02d%02d",
-	    tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
-    // Date (YYYYMMDD)
-    newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); 
-    // Month (YYYYMM)
-    buf[6] = '\0';
-    newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
-    // Year (YYYY)
-    buf[4] = '\0';
-    newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); 
-
-
-    //////////////////////////////////////////////////////////////////
-    // Document data record. omindex has the following nl separated fields:
-    // - url
-    // - sample
-    // - caption (title limited to 100 chars)
-    // - mime type 
-    //
-    // The title, author, abstract and keywords fields are special,
-    // they always get stored in the document data
-    // record. Configurable other fields can be, too.
-    //
-    // We truncate stored fields abstract, title and keywords to
-    // reasonable lengths and suppress newlines (so that the data
-    // record can keep a simple syntax)
-
-    string record;
-    RECORD_APPEND(record, Doc::keyurl, doc.url);
-    RECORD_APPEND(record, Doc::keytp, doc.mimetype);
-    // We left-zero-pad the times so that they are lexico-sortable
-    leftzeropad(doc.fmtime, 11);
-    RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
-    if (!doc.dmtime.empty()) {
-	leftzeropad(doc.dmtime, 11);
-	RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
-    }
-    RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
-
-    if (doc.fbytes.empty())
-	doc.fbytes = doc.pcbytes;
-
-    if (!doc.fbytes.empty()) {
-	RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
-	leftzeropad(doc.fbytes, 12);
-	newdocument.add_value(VALUE_SIZE, doc.fbytes);
-    }
-    if (doc.haschildren) {
-	newdocument.add_boolean_term(has_children_term);
-    }	
-    if (!doc.pcbytes.empty())
-	RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
-    char sizebuf[30]; 
-    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
-    RECORD_APPEND(record, Doc::keyds, sizebuf);
-
-    // Note that we add the signature both as a value and in the data record
-    if (!doc.sig.empty()) {
-	RECORD_APPEND(record, Doc::keysig, doc.sig);
-	newdocument.add_value(VALUE_SIG, doc.sig);
-    }
-
-    if (!doc.ipath.empty())
-	RECORD_APPEND(record, Doc::keyipt, doc.ipath);
-
-    doc.meta[Doc::keytt] = 
-	neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
-    if (!doc.meta[Doc::keytt].empty())
-	RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
-
-    trimstring(doc.meta[Doc::keykw], " \t\r\n");
-    doc.meta[Doc::keykw] = 
-	neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
-    // No need to explicitly append the keywords, this will be done by 
-    // the "stored" loop
-
-    // If abstract is empty, we make up one with the beginning of the
-    // document. This is then not indexed, but part of the doc data so
-    // that we can return it to a query without having to decode the
-    // original file.
-    bool syntabs = false;
-    // Note that the map accesses by operator[] create empty entries if they
-    // don't exist yet.
-    trimstring(doc.meta[Doc::keyabs], " \t\r\n");
-    if (doc.meta[Doc::keyabs].empty()) {
-	syntabs = true;
-	if (!doc.text.empty())
-	    doc.meta[Doc::keyabs] = cstr_syntAbs + 
-		neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
+
+    if (doc.onlyxattr) {
+	// Only updating an existing doc with new extended attribute
+	// data. We need to read the old doc and its data record
+	// first. This is so different from the normal processing that
+	// it uses a fully separate code path (with some duplication,
+	// unfortunately).
+	if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
+	    return false;
     } else {
-	doc.meta[Doc::keyabs] = 
-	    neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
-		      cstr_nc);
-    }
-
-    const set<string>& stored = m_config->getStoredFields();
-    for (set<string>::const_iterator it = stored.begin();
-	 it != stored.end(); it++) {
-	string nm = m_config->fieldCanon(*it);
-	if (!doc.meta[nm].empty()) {
-	    string value = 
-		neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
-	    RECORD_APPEND(record, nm, value);
-	}
-    }
-
-    // If empty pages (multiple break at same pos) were recorded, save
-    // them (this is because we have no way to record them in the
-    // Xapian list
-    if (!tpidx.m_pageincrvec.empty()) {
-	ostringstream multibreaks;
-	for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
-	    if (i != 0)
-		multibreaks << ",";
-	    multibreaks << tpidx.m_pageincrvec[i].first << "," << 
-		tpidx.m_pageincrvec[i].second;
-	}
-	RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
-    }
+
+	// If the ipath is like a path, index the last element. This is
+	// for compound documents like zip and chm for which the filter
+	// uses the file path as ipath. 
+	if (!doc.ipath.empty() && 
+	    doc.ipath.find_first_not_of("0123456789") != string::npos) {
+	    string utf8ipathlast;
+	    // There is no way in hell we could have an idea of the
+	    // charset here, so let's hope it's ascii or utf-8. We call
+	    // transcode to strip the bad chars and pray
+	    if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
+			  "UTF-8", "UTF-8")) {
+		splitter.text_to_words(utf8ipathlast);
+	    }
+	}
+
+	// Split and index the path from the url for path-based filtering
+	{
+	    string path = url_gpath(doc.url);
+	    vector<string> vpath;
+	    stringToTokens(path, vpath, "/");
+	    // If vpath is not /, the last elt is the file/dir name, not a
+	    // part of the path.
+	    if (vpath.size())
+		vpath.resize(vpath.size()-1);
+	    splitter.curpos = 0;
+	    newdocument.add_posting(wrap_prefix(pathelt_prefix),
+				    splitter.basepos + splitter.curpos++);
+	    for (vector<string>::iterator it = vpath.begin(); 
+		 it != vpath.end(); it++){
+		if (it->length() > 230) {
+		    // Just truncate it. May still be useful because of wildcards
+		    *it = it->substr(0, 230);
+		}
+		newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
+					splitter.basepos + splitter.curpos++);
+	    }
+	}
+
+	// Index textual metadata.  These are all indexed as text with
+	// positions, as we may want to do phrase searches with them (this
+	// makes no sense for keywords by the way).
+	//
+	// The order has no importance, and we set a position gap of 100
+	// between fields to avoid false proximity matches.
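+	// (With the gap, words from two different fields end up at
+	// least 100 positions apart and can never match as a phrase.)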
+	map<string, string>::iterator meta_it;
+	for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+	    if (!meta_it->second.empty()) {
+		const FieldTraits *ftp;
+		// We don't test for an empty prefix here. Some fields are part
+		// of the internal conf with an empty prefix (ie: abstract).
+		if (!fieldToTraits(meta_it->first, &ftp)) {
+		    LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
+			     meta_it->first.c_str()));
+		    continue;
+		}
+		LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
+			 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
+			 meta_it->second.c_str()));
+		splitter.setprefix(ftp->pfx);
+		splitter.setwdfinc(ftp->wdfinc);
+		if (!splitter.text_to_words(meta_it->second))
+		    LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
+			    meta_it->first.c_str()));
+	    }
+	}
+	splitter.setprefix(string());
+	splitter.setwdfinc(1);
+
+	if (splitter.curpos < baseTextPosition)
+	    splitter.basepos = baseTextPosition;
+
+	// Split and index body text
+	LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
+
+#ifdef TEXTSPLIT_STATS
+	splitter.resetStats();
+#endif
+	if (!splitter.text_to_words(doc.text))
+	    LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
+
+#ifdef TEXTSPLIT_STATS
+	// Reject bad data. Unrecognized base64 text is characterized by
+	// a high average word length and high variation (because there
+	// are word-splitters like +/ inside the data).
+	TextSplit::Stats::Values v = splitter.getStats();
+	// v.avglen > 15 && v.sigma > 12 
+	if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
+	    LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
+		     "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
+		     v.count, v.avglen, v.sigma, doc.url.c_str(), 
+		     doc.ipath.c_str(), doc.text.c_str()));
+	    return true;
+	}
+#endif
+
+	////// Special terms for other metadata. No positions for these.
+	// Mime type
+	newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
+
+	// Simple file name indexed unsplit for specific "file name"
+	// searches. This is not the same as a filename: clause inside the
+	// query language.
+	// We also add a term for the filename extension if any.
+	string utf8fn;
+	if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
+	    string fn;
+	    if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
+		// We should truncate after extracting the extension, but this is
+		// a pathological case anyway
+		if (fn.size() > 230)
+		    utf8truncate(fn, 230);
+		string::size_type pos = fn.rfind('.');
+		if (pos != string::npos && pos != fn.length() - 1) {
+		    newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
+						 fn.substr(pos + 1));
+		}
+		newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
+	    }
+	}
+
+	newdocument.add_boolean_term(uniterm);
+	// Parent term. This is used to find all descendants, mostly
+	// to delete them when the parent goes away.
+	if (!parent_udi.empty()) {
+	    newdocument.add_boolean_term(make_parentterm(parent_udi));
+	}
+	// Dates etc.
+	time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : 
+			     doc.dmtime.c_str());
+	struct tm *tm = localtime(&mtime);
+	char buf[9];
+	snprintf(buf, 9, "%04d%02d%02d",
+		 tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
+	// Date (YYYYMMDD)
+	newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); 
+	// Month (YYYYMM)
+	buf[6] = '\0';
+	newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
+	// Year (YYYY)
+	buf[4] = '\0';
+	newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); 
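+	// E.g. an mtime falling on 2014-02-27 (hypothetical date)
+	// yields the terms 20140227, 201402 and 2014, each with its
+	// respective prefix.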
+
+
+	//////////////////////////////////////////////////////////////////
+	// Document data record. omindex has the following
+	// newline-separated fields:
+	// - url
+	// - sample
+	// - caption (title limited to 100 chars)
+	// - mime type
+	//
+	// The title, author, abstract and keywords fields are special:
+	// they always get stored in the document data record. Other
+	// fields can be configured to be stored, too.
+	//
+	// We truncate the stored abstract, title and keywords fields to
+	// reasonable lengths and suppress newlines (so that the data
+	// record can keep a simple syntax)
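+	//
+	// Each RECORD_APPEND() adds one "name=value" line, so a
+	// resulting record might look like this (hypothetical values):
+	//   url=file:///home/me/doc.txt
+	//   mtype=text/plain
+	//   fmtime=00001393515000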
+
+	string record;
+	RECORD_APPEND(record, Doc::keyurl, doc.url);
+	RECORD_APPEND(record, Doc::keytp, doc.mimetype);
+	// We left-zero-pad the times so that they are lexico-sortable
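+	// (Without padding, "999" would sort after "1000" as a string;
+	// padded to width 11 both compare correctly.)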
+	leftzeropad(doc.fmtime, 11);
+	RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
+	if (!doc.dmtime.empty()) {
+	    leftzeropad(doc.dmtime, 11);
+	    RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
+	}
+	RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
+
+	if (doc.fbytes.empty())
+	    doc.fbytes = doc.pcbytes;
+
+	if (!doc.fbytes.empty()) {
+	    RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
+	    leftzeropad(doc.fbytes, 12);
+	    newdocument.add_value(VALUE_SIZE, doc.fbytes);
+	}
+	if (doc.haschildren) {
+	    newdocument.add_boolean_term(has_children_term);
+	}	
+	if (!doc.pcbytes.empty())
+	    RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
+	char sizebuf[30]; 
+	sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
+	RECORD_APPEND(record, Doc::keyds, sizebuf);
+
+	// Note that we add the signature both as a value and in the data record
+	if (!doc.sig.empty()) {
+	    RECORD_APPEND(record, Doc::keysig, doc.sig);
+	    newdocument.add_value(VALUE_SIG, doc.sig);
+	}
+
+	if (!doc.ipath.empty())
+	    RECORD_APPEND(record, Doc::keyipt, doc.ipath);
+
+	doc.meta[Doc::keytt] = 
+	    neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
+	if (!doc.meta[Doc::keytt].empty())
+	    RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
+
+	trimstring(doc.meta[Doc::keykw], " \t\r\n");
+	doc.meta[Doc::keykw] = 
+	    neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
+	// No need to explicitly append the keywords: this will be done
+	// by the "stored" loop.
+
+	// If abstract is empty, we make up one with the beginning of the
+	// document. This is then not indexed, but part of the doc data so
+	// that we can return it to a query without having to decode the
+	// original file.
+	bool syntabs = false;
+	// Note that the map accesses by operator[] create empty entries if they
+	// don't exist yet.
+	trimstring(doc.meta[Doc::keyabs], " \t\r\n");
+	if (doc.meta[Doc::keyabs].empty()) {
+	    syntabs = true;
+	    if (!doc.text.empty())
+		doc.meta[Doc::keyabs] = cstr_syntAbs + 
+		    neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
+	} else {
+	    doc.meta[Doc::keyabs] = 
+		neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
+			  cstr_nc);
+	}
+
+	const set<string>& stored = m_config->getStoredFields();
+	for (set<string>::const_iterator it = stored.begin();
+	     it != stored.end(); it++) {
+	    string nm = m_config->fieldCanon(*it);
+	    if (!doc.meta[nm].empty()) {
+		string value = 
+		    neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
+		RECORD_APPEND(record, nm, value);
+	    }
+	}
+
+	// If empty pages (multiple breaks at the same pos) were
+	// recorded, save them (this is because we have no way to
+	// record them in the Xapian list).
+	if (!tpidx.m_pageincrvec.empty()) {
+	    ostringstream multibreaks;
+	    for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
+		if (i != 0)
+		    multibreaks << ",";
+		multibreaks << tpidx.m_pageincrvec[i].first << "," << 
+		    tpidx.m_pageincrvec[i].second;
+	    }
+	    RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
+	}
     
-    // If the file's md5 was computed, add value and term. 
-    // The value is optionally used for query result duplicate elimination, 
-    // and the term to find the duplicates.
-    // We don't do this for empty docs.
-    const string *md5;
-    if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
-	md5->compare(cstr_md5empty)) {
-	string digest;
-	MD5HexScan(*md5, digest);
-	newdocument.add_value(VALUE_MD5, digest);
-	newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
-    }
-
-    LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
-    newdocument.set_data(record);
-
+	// If the file's md5 was computed, add value and term. 
+	// The value is optionally used for query result duplicate elimination, 
+	// and the term to find the duplicates.
+	// We don't do this for empty docs.
+	const string *md5;
+	if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
+	    md5->compare(cstr_md5empty)) {
+	    string digest;
+	    MD5HexScan(*md5, digest);
+	    newdocument.add_value(VALUE_MD5, digest);
+	    newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
+	}
+
+	LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
+	newdocument.set_data(record);
+    }
 #ifdef IDX_THREADS
     if (m_ndb->m_havewriteq) {
 	DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, 
@@ -1450,6 +1555,81 @@
 
     return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument, 
 				   doc.text.length());
+}
+
+bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
+				    Doc &doc, Xapian::Document& xdoc)
+{
+    LOGDEB0(("Db::docToXdocXattrOnly\n"));
+    PTMutexLocker lock(m_mutex);
+
+    // Read existing document and its data record
+    if (getDoc(udi, 0, xdoc) == 0) {
+	LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
+	return false;
+    }
+    string data;
+    XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
+    if (!m_rcldb->m_reason.empty()) {
+        LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
+        return false;
+    }
+
+    // Clear the term lists for the incoming fields and index the new values
+    map<string, string>::iterator meta_it;
+    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+	const FieldTraits *ftp;
+	if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) {
+	    LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
+		     meta_it->first.c_str()));
+	    continue;
+	}
+	// Clear the previous terms for the field
+	clearField(xdoc, ftp->pfx, ftp->wdfinc);
+	LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n", 
+		 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
+		 meta_it->second.c_str()));
+	splitter->setprefix(ftp->pfx);
+	splitter->setwdfinc(ftp->wdfinc);
+	if (!splitter->text_to_words(meta_it->second))
+	    LOGDEB(("Db::xattrOnly: split failed for %s\n", 
+		    meta_it->first.c_str()));
+    }
+    xdoc.add_value(VALUE_SIG, doc.sig);
+
+    // Parse current data record into a dict for ease of processing
+    ConfSimple datadic(data);
+    if (!datadic.ok()) {
+	LOGERR(("db::docToXdocXattrOnly: failed turning data rec to dict\n"));
+	return false;
+    }
+
+    // For each "stored" field, check if set in doc metadata and
+    // update the value if it is
+    const set<string>& stored = m_rcldb->m_config->getStoredFields();
+    for (set<string>::const_iterator it = stored.begin();
+	 it != stored.end(); it++) {
+	string nm = m_rcldb->m_config->fieldCanon(*it);
+	if (doc.getmeta(nm, 0)) {
+	    string value = 
+		neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
+	    datadic.set(nm, value, "");
+	}
+    }
+
+    // Recreate the record. We want to do this with the local RECORD_APPEND
+    // method for consistency in format, instead of using ConfSimple print
+    vector<string> names = datadic.getNames("");
+    data.clear();
+    for (vector<string>::const_iterator it = names.begin(); 
+	 it != names.end(); it++) {
+	string value;
+	datadic.get(*it, value, "");
+	RECORD_APPEND(data, *it, value);
+    }
+    RECORD_APPEND(data, Doc::keysig, doc.sig);
+    xdoc.set_data(data);
+    return true;
 }
 
 #ifdef IDX_THREADS