recoll / Code / Diff of /src/index/indexer.cpp

Diff of /src/index/indexer.cpp [69dcb9] .. [d14601]

Switch to side-by-side view

--- a/src/index/indexer.cpp
+++ b/src/index/indexer.cpp
@@ -25,569 +25,13 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #include <errno.h>
-#include <cstring>
-#include <fnmatch.h>
 
-#include <iostream>
-#include <list>
-#include <map>
-#include <algorithm>
-
-#include "pathut.h"
-#include "conftree.h"
-#include "rclconfig.h"
-#include "fstreewalk.h"
-#include "rcldb.h"
-#include "readfile.h"
+#include "debuglog.h"
 #include "indexer.h"
-#include "csguess.h"
-#include "transcode.h"
-#include "debuglog.h"
-#include "internfile.h"
-#include "smallut.h"
-#include "wipedir.h"
-#include "fileudi.h"
-
-#ifdef RCL_USE_ASPELL
-#include "rclaspell.h"
-#endif
-
-// When using extended attributes, we have to use the ctime. 
-// This is quite an expensive price to pay...
-#ifdef RCL_USE_XATTR
-#define RCL_STTIME st_ctime
-#else
-#define RCL_STTIME st_mtime
-#endif // RCL_USE_XATTR
-
-#ifndef NO_NAMESPACES
-using namespace std;
-#endif /* NO_NAMESPACES */
-
-#ifndef deleteZ
-#define deleteZ(X) {delete X;X = 0;}
-#endif
-
-DbIndexer::~DbIndexer() {
-    // Maybe clean up temporary directory
-    if (m_tmpdir.length()) {
-	wipedir(m_tmpdir);
-	if (rmdir(m_tmpdir.c_str()) < 0) {
-	    LOGERR(("DbIndexer::~DbIndexer: cannot clear temp dir %s\n",
-		    m_tmpdir.c_str()));
-	}
-    }
-    m_db.close();
-}
-
-list<string> DbIndexer::getStemmerNames()
-{
-    return Rcl::Db::getStemmerNames();
-}
-
-// Index each directory in the topdirs for a given db
-bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
-{
-    if (!init(resetbefore))
-	return false;
-
-    if (m_updater) {
-	m_updater->status.reset();
-	m_updater->status.dbtotdocs = m_db.docCnt();
-    }
-
-    m_walker.setSkippedPaths(m_config->getSkippedPaths());
-
-    for (list<string>::const_iterator it = topdirs->begin();
-	 it != topdirs->end(); it++) {
-	LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(), 
-		getDbDir().c_str()));
-
-	// Set the current directory in config so that subsequent
-	// getConfParams() will get local values
-	m_config->setKeyDir(*it);
-
-	// Adjust the "follow symlinks" option
-	bool follow;
-	if (m_config->getConfParam("followLinks", &follow) && follow) {
-	    m_walker.setOpts(FsTreeWalker::FtwFollow);
-	} else {
-	    m_walker.setOpts(FsTreeWalker::FtwOptNone);
-	}	    
-
-	int abslen;
-	if (m_config->getConfParam("idxabsmlen", &abslen))
-	    m_db.setAbstractParams(abslen, -1, -1);
-
-	// Set up skipped patterns for this subtree. This probably should be
-	// done in the directory change code in processone() instead.
-	m_walker.setSkippedNames(m_config->getSkippedNames());
-
-	// Walk the directory tree
-	if (m_walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
-	    LOGERR(("DbIndexer::index: error while indexing %s: %s\n", 
-		    it->c_str(), m_walker.getReason().c_str()));
-	    return false;
-	}
-    }
-    if (m_updater) {
-	m_updater->status.fn.erase();
-	m_updater->status.phase = DbIxStatus::DBIXS_PURGE;
-	m_updater->update();
-    }
-
-    // Get rid of all database entries that don't exist in the
-    // filesystem anymore.
-    m_db.purge();
-
-    createStemmingDatabases();
-    createAspellDict();
-
-    if (m_updater) {
-	m_updater->status.phase = DbIxStatus::DBIXS_CLOSING;
-	m_updater->status.fn.erase();
-	m_updater->update();
-    }
-    // The close would be done in our destructor, but we want status here
-    if (!m_db.close()) {
-	LOGERR(("DbIndexer::index: error closing database in %s\n", 
-		getDbDir().c_str()));
-	return false;
-    }
-    string missing;
-    FileInterner::getMissingDescription(missing);
-    if (!missing.empty()) {
-	LOGINFO(("DbIndexer::index missing helper program(s):\n%s\n", 
-		 missing.c_str()));
-    }
-    m_config->storeMissingHelperDesc(missing);
-    return true;
-}
-
-// Create stemming databases. We also remove those which are not
-// configured. 
-bool DbIndexer::createStemmingDatabases()
-{
-    string slangs;
-    if (m_config->getConfParam("indexstemminglanguages", slangs)) {
-	list<string> langs;
-	stringToStrings(slangs, langs);
-
-	// Get the list of existing stem dbs from the database (some may have 
-	// been manually created, we just keep those from the config
-	list<string> dblangs = m_db.getStemLangs();
-	list<string>::const_iterator it;
-	for (it = dblangs.begin(); it != dblangs.end(); it++) {
-	    if (find(langs.begin(), langs.end(), *it) == langs.end())
-		m_db.deleteStemDb(*it);
-	}
-	for (it = langs.begin(); it != langs.end(); it++) {
-	    if (m_updater) {
-		m_updater->status.phase = DbIxStatus::DBIXS_STEMDB;
-		m_updater->status.fn = *it;
-		m_updater->update();
-	    }
-	    m_db.createStemDb(*it);
-	}
-    }
-    return true;
-}
-
-bool DbIndexer::init(bool resetbefore, bool rdonly)
-{
-    if (!rdonly && (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0)) {
-	string reason;
-	if (!maketmpdir(m_tmpdir, reason)) {
-	    LOGERR(("DbIndexer: cannot create temporary directory: %s\n",
-		    reason.c_str()));
-	    return false;
-	}
-    }
-    Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
-	resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
-    if (!m_db.open(mode)) {
-	LOGERR(("DbIndexer: error opening database %s\n", getDbDir().c_str()));
-	return false;
-    }
-
-    return true;
-}
-
-bool DbIndexer::createStemDb(const string &lang)
-{
-    if (!init(false, true))
-	return false;
-    return m_db.createStemDb(lang);
-}
-
-// The language for the aspell dictionary is handled internally by the aspell
-// module, either from a configuration variable or the NLS environment.
-bool DbIndexer::createAspellDict()
-{
-    LOGDEB2(("DbIndexer::createAspellDict()\n"));
-#ifdef RCL_USE_ASPELL
-    // For the benefit of the real-time indexer, we only initialize
-    // noaspell from the configuration once. It can then be set to
-    // true if dictionary generation fails, which avoids retrying
-    // it forever.
-    static int noaspell = -12345;
-    if (noaspell == -12345) {
-	noaspell = false;
-	m_config->getConfParam("noaspell", &noaspell);
-    }
-    if (noaspell)
-	return true;
-
-    if (!init(false, true))
-	return false;
-    Aspell aspell(m_config);
-    string reason;
-    if (!aspell.init(reason)) {
-	LOGERR(("DbIndexer::createAspellDict: aspell init failed: %s\n", 
-		reason.c_str()));
-	noaspell = true;
-	return false;
-    }
-    LOGDEB(("DbIndexer::createAspellDict: creating dictionary\n"));
-    if (!aspell.buildDict(m_db, reason)) {
-	LOGERR(("DbIndexer::createAspellDict: aspell buildDict failed: %s\n", 
-		reason.c_str()));
-	noaspell = true;
-	return false;
-    }
-#endif
-    return true;
-}
-
-/** 
- * Index individual files, out of a full tree run. No database purging
- */
-bool DbIndexer::indexFiles(const list<string> &filenames)
-{
-    bool called_init = false;
-
-    list<string>::const_iterator it;
-    for (it = filenames.begin(); it != filenames.end(); it++) {
-	string dir = path_getfather(*it);
-	m_config->setKeyDir(dir);
-	int abslen;
-	if (m_config->getConfParam("idxabsmlen", &abslen))
-	    m_db.setAbstractParams(abslen, -1, -1);
-	struct stat stb;
-	if (lstat(it->c_str(), &stb) != 0) {
-	    LOGERR(("DbIndexer::indexFiles: lstat(%s): %s", it->c_str(),
-		    strerror(errno)));
-	    continue;
-	}
-
-	// If we get to indexing directory names one day, will need to test 
-	// against dbdir here to avoid modification loops (with rclmon).
-	if (!S_ISREG(stb.st_mode)) {
-	    LOGDEB2(("DbIndexer::indexFiles: %s: not a regular file\n", 
-		    it->c_str()));
-	    continue;
-	}
-
-	static string lstdir;
-	static list<string> skpl;
-	if (lstdir.compare(dir)) {
-	    LOGDEB(("Recomputing list of skipped names\n"));
-	    skpl = m_config->getSkippedNames();
-	    lstdir = dir;
-	}
-	if (!skpl.empty()) {
-	    list<string>::const_iterator skit;
-	    string fn = path_getsimple(*it);
-	    for (skit = skpl.begin(); skit != skpl.end(); skit++) {
-		if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
-		    LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str()));
-		    goto skipped;
-		}
-	    }
-	}
-	// Defer opening db until really needed.
-	if (!called_init) {
-	    if (!init())
-		return false;
-	    called_init = true;
-	}
-	if (processone(*it, &stb, FsTreeWalker::FtwRegular) != 
-	    FsTreeWalker::FtwOk) {
-	    LOGERR(("DbIndexer::indexFiles: processone failed\n"));
-	    return false;
-	}
-    skipped: 
-	false; // Need a statement here to make compiler happy ??
-    }
-
-    // The close would be done in our destructor, but we want status here
-    if (!m_db.close()) {
-	LOGERR(("DbIndexer::indexfiles: error closing database in %s\n", 
-		getDbDir().c_str()));
-	return false;
-    }
-    return true;
-}
-
-
-/** Purge docs for given files out of the database */
-bool DbIndexer::purgeFiles(const list<string> &filenames)
-{
-    if (!init())
-	return false;
-
-    list<string>::const_iterator it;
-    for (it = filenames.begin(); it != filenames.end(); it++) {
-	string udi;
-	make_udi(*it, "", udi);
-	if (!m_db.purgeFile(udi)) {
-	    LOGERR(("DbIndexer::purgeFiles: Database error\n"));
-	    return false;
-	}
-    }
-
-    // The close would be done in our destructor, but we want status here
-    if (!m_db.close()) {
-	LOGERR(("DbIndexer::purgefiles: error closing database in %s\n", 
-		getDbDir().c_str()));
-	return false;
-    }
-    return true;
-}
-
-// Local fields can be set for fs subtrees in the configuration file 
-void DbIndexer::localfieldsfromconf()
-{
-    LOGDEB(("DbIndexer::localfieldsfromconf\n"));
-    m_localfields.clear();
-    string sfields;
-    if (!m_config->getConfParam("localfields", sfields))
-        return;
-    list<string> lfields;
-    if (!stringToStrings(sfields, lfields)) {
-        LOGERR(("DbIndexer::localfieldsfromconf: bad syntax for [%s]\n", 
-                sfields.c_str()));
-        return;
-    }
-    for (list<string>::const_iterator it = lfields.begin();
-         it != lfields.end(); it++) {
-        ConfSimple conf(*it, 1, true);
-        list<string> nmlst = conf.getNames("");
-        for (list<string>::const_iterator it1 = nmlst.begin();
-             it1 != nmlst.end(); it1++) {
-            conf.get(*it1, m_localfields[*it1]);
-            LOGDEB2(("DbIndexer::localfieldsfromconf: [%s] => [%s]\n",
-                    (*it1).c_str(), m_localfields[*it1].c_str()));
-        }
-    }
-}
-
-// 
-void DbIndexer::setlocalfields(Rcl::Doc& doc)
-{
-    for (map<string, string>::const_iterator it = m_localfields.begin();
-         it != m_localfields.end(); it++) {
-        // Should local fields override those coming from the document
-        // ? I think not, but not too sure
-        if (doc.meta.find(it->second) == doc.meta.end()) {
-            doc.meta[it->first] = it->second;
-        }
-    }
-}
-
-
-/// This method gets called for every file and directory found by the
-/// tree walker. 
-///
-/// It checks with the db if the file has changed and needs to be
-/// reindexed. If so, it calls internfile() which will identify the
-/// file type and call an appropriate handler to convert the document into
-/// internal format, which we then add to the database.
-///
-/// Accent and majuscule handling are performed by the db module when doing
-/// the actual indexing work. The Rcl::Doc created by internfile()
-/// mostly contains pretty raw utf8 data.
-FsTreeWalker::Status 
-DbIndexer::processone(const std::string &fn, const struct stat *stp, 
-		      FsTreeWalker::CbFlag flg)
-{
-    if (m_updater && !m_updater->update()) {
-        return FsTreeWalker::FtwStop;
-    }
-
-    // If we're changing directories, possibly adjust parameters (set
-    // the current directory in configuration object)
-    if (flg == FsTreeWalker::FtwDirEnter || 
-	flg == FsTreeWalker::FtwDirReturn) {
-	m_config->setKeyDir(fn);
-
-	int abslen;
-	if (m_config->getConfParam("idxabsmlen", &abslen))
-	    m_db.setAbstractParams(abslen, -1, -1);
-
-        // Adjust local fields from config for this subtree
-        if (m_havelocalfields)
-            localfieldsfromconf();
-
-	if (flg == FsTreeWalker::FtwDirReturn)
-	    return FsTreeWalker::FtwOk;
-    }
-
-    ////////////////////
-    // Check db up to date ? Doing this before file type
-    // identification means that, if usesystemfilecommand is switched
-    // from on to off it may happen that some files which are now
-    // without mime type will not be purged from the db, resulting
-    // in possible 'cannot intern file' messages at query time...
-
-    // Document signature. This is based on m/ctime and size and used
-    // for the uptodate check (the value computed here is checked
-    // against the stored one). Changing the computation forces a full
-    // reindex of course.
-    char cbuf[100]; 
-    sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
-    string sig = cbuf;
-    string udi;
-    make_udi(fn, "", udi);
-    if (!m_db.needUpdate(udi, sig)) {
-	LOGDEB(("processone: up to date: %s\n", fn.c_str()));
-	if (m_updater) {
-	    // Status bar update, abort request etc.
-	    m_updater->status.fn = fn;
-	    if (!m_updater->update()) {
-		return FsTreeWalker::FtwStop;
-	    }
-	}
-	return FsTreeWalker::FtwOk;
-    }
-
-    LOGDEB0(("processone: processing: [%s] %s\n", 
-             displayableBytes(stp->st_size).c_str(), fn.c_str()));
-
-    FileInterner interner(fn, stp, m_config, m_tmpdir, FileInterner::FIF_none);
-
-    // File name transcoded to utf8 for indexation. 
-    string charset = m_config->getDefCharset(true);
-    // If this fails, the file name won't be indexed, no big deal
-    // Note that we used to do the full path here, but I ended up believing
-    // that it made more sense to use only the file name
-    string utf8fn; int ercnt;
-    if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) {
-	LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n",
-		charset.c_str(), path_getsimple(fn).c_str()));
-    } else if (ercnt) {
-	LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n",
-		ercnt, charset.c_str(), path_getsimple(fn).c_str()));
-    }
-    LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
-	     path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), 
-	     "UTF-8"));
-
-    string parent_udi;
-    make_udi(fn, "", parent_udi);
-    Rcl::Doc doc;
-    const string plus("+");
-    char ascdate[20];
-    sprintf(ascdate, "%ld", long(stp->st_mtime));
-
-    FileInterner::Status fis = FileInterner::FIAgain;
-    bool hadNullIpath = false;
-    while (fis == FileInterner::FIAgain) {
-	doc.erase();
-	string ipath;
-	fis = interner.internfile(doc, ipath);
-
-        // Index at least the file name even if there was an error.
-        // We'll change the signature to ensure that the indexing will
-        // be retried every time.
-
-
-	// Internal access path for multi-document files
-	if (ipath.empty())
-	    hadNullIpath = true;
-	else
-	    doc.ipath = ipath;
-
-	// Set file name, mod time and url if not done by filter
-	if (doc.fmtime.empty())
-	    doc.fmtime = ascdate;
-        if (doc.url.empty())
-            doc.url = string("file://") + fn;
-	if (doc.utf8fn.empty())
-	    doc.utf8fn = utf8fn;
-
-	char cbuf[100]; 
-	sprintf(cbuf, "%ld", (long)stp->st_size);
-	doc.fbytes = cbuf;
-	// Document signature for up to date checks: concatenate
-	// m/ctime and size. Looking for changes only, no need to
-	// parseback so no need for reversible formatting. Also set,
-	// but never used, for subdocs.
-	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
-	doc.sig = cbuf;
-	// If there was an error, ensure indexing will be
-	// retried. This is for the once missing, later installed
-	// filter case. It can make indexing much slower (if there are
-	// myriads of such files, the ext script is executed for them
-	// and fails every time)
-	if (fis == FileInterner::FIError) {
-	    doc.sig += plus;
-	}
-
-        // Possibly add fields from local config
-        if (m_havelocalfields) 
-            setlocalfields(doc);
-	// Add document to database. If there is an ipath, add it as a children
-	// of the file document.
-	string udi;
-	make_udi(fn, ipath, udi);
-	if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc)) 
-	    return FsTreeWalker::FtwError;
-
-	// Tell what we are doing and check for interrupt request
-	if (m_updater) {
-	    ++(m_updater->status.docsdone);
-            m_updater->status.fn = fn;
-            if (!ipath.empty())
-                m_updater->status.fn += "|" + ipath;
-            if (!m_updater->update()) {
-                return FsTreeWalker::FtwStop;
-            }
-	}
-    }
-
-    // If we had no instance with a null ipath, we create an empty
-    // document to stand for the file itself, to be used mainly for up
-    // to date checks. Typically this happens for an mbox file.
-    if (hadNullIpath == false) {
-	LOGDEB1(("Creating empty doc for file\n"));
-	Rcl::Doc fileDoc;
-	fileDoc.fmtime = ascdate;
-	fileDoc.utf8fn = utf8fn;
-	fileDoc.mimetype = interner.getMimetype();
-	fileDoc.url = string("file://") + fn;
-
-	char cbuf[100]; 
-	sprintf(cbuf, "%ld", (long)stp->st_size);
-	fileDoc.fbytes = cbuf;
-	// Document signature for up to date checks.
-	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
-	fileDoc.sig = cbuf;
-	if (!m_db.addOrUpdate(parent_udi, "", fileDoc)) 
-	    return FsTreeWalker::FtwError;
-    }
-
-    return FsTreeWalker::FtwOk;
-}
-
-////////////////////////////////////////////////////////////////////////////
-// ConIndexer methods: ConfIndexer is the top-level object, that could
-// in theory index multiple directories to multiple databases. In practise we
-// have a single database per configuration.
 
 ConfIndexer::~ConfIndexer()
 {
-     deleteZ(m_dbindexer);
+     deleteZ(m_fsindexer);
 }
 
 bool ConfIndexer::index(bool resetbefore)
@@ -634,13 +78,13 @@
     // The dbmap now has dbdir as key and directory lists as values.
     // Index each directory group in turn
     for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {
-	m_dbindexer = new DbIndexer(m_config, m_updater);
-	if (!m_dbindexer->indexDb(resetbefore, &dbit->second)) {
-	    deleteZ(m_dbindexer);
+	m_fsindexer = new FsIndexer(m_config, m_updater);
+	if (!m_fsindexer->indexTrees(resetbefore, &dbit->second)) {
+	    deleteZ(m_fsindexer);
 	    m_reason = "Failed indexing in " + dbit->first;
 	    return false;
 	}
-	deleteZ(m_dbindexer);
+	deleteZ(m_fsindexer);
     }
     return true;
 }