recoll / Code / [c562f8] /src/index/fsindexer.cpp

[c562f8]: src / index / fsindexer.cpp History

fsindexer.cpp 482 lines (424 with data), 15.3 kB

#ifndef lint
static char rcsid[] = "@(#$Id: $ (C) 2009 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
#ifdef HAVE_CONFIG_H
#include "autoconfig.h"
#endif

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>
#include <cstring>
#include <fnmatch.h>

#include <iostream>
#include <list>
#include <map>
#include <algorithm>

#include "pathut.h"
#include "conftree.h"
#include "rclconfig.h"
#include "fstreewalk.h"
#include "rcldb.h"
#include "readfile.h"
#include "indexer.h"
#include "fsindexer.h"
#include "csguess.h"
#include "transcode.h"
#include "debuglog.h"
#include "internfile.h"
#include "smallut.h"
#include "wipedir.h"
#include "fileudi.h"
#include "cancelcheck.h"

// When using extended attributes, we have to use the ctime. 
// This is quite an expensive price to pay...
#ifdef RCL_USE_XATTR
#define RCL_STTIME st_ctime
#else
#define RCL_STTIME st_mtime
#endif // RCL_USE_XATTR

#ifndef NO_NAMESPACES
using namespace std;
#endif /* NO_NAMESPACES */

#ifndef deleteZ
#define deleteZ(X) {delete X;X = 0;}
#endif

FsIndexer::~FsIndexer() {
}

bool FsIndexer::init()
{
    if (m_tdl.empty()) {
        m_tdl = m_config->getTopdirs();
        if (m_tdl.empty()) {
            LOGERR(("FsIndexers: no topdirs list defined\n"));
            return false;
        }
    }
    return true;
}

// Recursively index each directory in the topdirs:
bool FsIndexer::index()
{
    if (!init())
	return false;

    if (m_updater) {
	m_updater->status.reset();
	m_updater->status.dbtotdocs = m_db->docCnt();
    }

    m_walker.setSkippedPaths(m_config->getSkippedPaths());
    m_walker.addSkippedPath(path_tildexpand("~/.beagle"));
    for (list<string>::const_iterator it = m_tdl.begin();
	 it != m_tdl.end(); it++) {
	LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(), 
		getDbDir().c_str()));

	// Set the current directory in config so that subsequent
	// getConfParams() will get local values
	m_config->setKeyDir(*it);

	// Adjust the "follow symlinks" option
	bool follow;
	if (m_config->getConfParam("followLinks", &follow) && follow) {
	    m_walker.setOpts(FsTreeWalker::FtwFollow);
	} else {
	    m_walker.setOpts(FsTreeWalker::FtwOptNone);
	}	    

	int abslen;
	if (m_config->getConfParam("idxabsmlen", &abslen))
	    m_db->setAbstractParams(abslen, -1, -1);

	// Walk the directory tree
	if (m_walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
	    LOGERR(("FsIndexer::index: error while indexing %s: %s\n", 
		    it->c_str(), m_walker.getReason().c_str()));
	    return false;
	}
    }

    string missing;
    FileInterner::getMissingDescription(missing);
    if (!missing.empty()) {
	LOGINFO(("FsIndexer::index missing helper program(s):\n%s\n", 
		 missing.c_str()));
    }
    m_config->storeMissingHelperDesc(missing);
    return true;
}

static bool matchesSkipped(const list<string>& tdl,
                           FsTreeWalker& walker,
                           const string& path)
{
    // First check what (if any) topdir this is in:
    string td;
    for (list<string>::const_iterator it = tdl.begin(); it != tdl.end(); it++) {
        if (path.find(*it) == 0) {
            td = *it;
            break;
        }
    }
    if (td.empty()) {
        LOGDEB(("FsIndexer::indexFiles: skipping [%s] (ntd)\n", path.c_str()));
        return true;
    }

    // Check path against skippedPaths. 
    if (walker.inSkippedPaths(path)) {
        LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpp)\n", path.c_str()));
        return true;
    }

    // Then check all path components up to the topdir against skippedNames
    string mpath = path;
    while (mpath.length() >= td.length() && mpath.length() > 1) {
        string fn = path_getsimple(mpath);
        if (walker.inSkippedNames(fn)) {
            LOGDEB(("FsIndexer::indexFiles: skipping [%s] (skpn)\n", 
                    path.c_str()));
            return true;
        }

        string::size_type len = mpath.length();
        mpath = path_getfather(mpath);
        // getfather normally returns a path ending with /, getsimple 
        // would then return ''
        if (!mpath.empty() && mpath[mpath.size()-1] == '/')
            mpath.erase(mpath.size()-1);
        // should not be necessary, but lets be prudent. If the
        // path did not shorten, something is seriously amiss
        // (could be an assert actually)
        if (mpath.length() >= len)
            return true;
    }
    return false;
}

/** 
 * Index individual files, out of a full tree run. No database purging
 */
bool FsIndexer::indexFiles(list<string>& files)
{
    if (!init())
        return false;

    // We use an FsTreeWalker just for handling the skipped path/name lists
    FsTreeWalker walker;
    walker.setSkippedPaths(m_config->getSkippedPaths());
    m_walker.addSkippedPath(path_tildexpand("~/.beagle"));

    for (list<string>::iterator it = files.begin(); it != files.end(); ) {
        LOGDEB2(("FsIndexer::indexFiles: [%s]\n", it->c_str()));

        m_config->setKeyDir(path_getfather(*it));
        if (m_havelocalfields)
            localfieldsfromconf();
        walker.setSkippedNames(m_config->getSkippedNames());

	// Check path against indexed areas and skipped names/paths
        if (matchesSkipped(m_tdl, walker, *it)) {
            it++; continue;
        }

	struct stat stb;
	if (lstat(it->c_str(), &stb) != 0) {
	    LOGERR(("FsIndexer::indexFiles: lstat(%s): %s", it->c_str(),
		    strerror(errno)));
            it++; continue;
	}
	// If we get to indexing directory names one day, will need to test 
	// against dbdir here to avoid modification loops (with rclmon).
	if (!S_ISREG(stb.st_mode)) {
	    LOGDEB(("FsIndexer::indexFiles: skipping [%s] (nr)\n", 
		    it->c_str()));
            it++; continue;
	}

	int abslen;
	if (m_config->getConfParam("idxabsmlen", &abslen))
	    m_db->setAbstractParams(abslen, -1, -1);
        
	if (processone(*it, &stb, FsTreeWalker::FtwRegular) != 
	    FsTreeWalker::FtwOk) {
	    LOGERR(("FsIndexer::indexFiles: processone failed\n"));
	    return false;
	}
        it = files.erase(it);
    }

    return true;
}


/** Purge docs for given files out of the database */
bool FsIndexer::purgeFiles(list<string>& files)
{
    if (!init())
	return false;
    for (list<string>::iterator it = files.begin(); it != files.end(); ) {
	string udi;
	make_udi(*it, "", udi);
        // rcldb::purgefile returns true if the udi was either not
        // found or deleted, false only in case of actual error
        bool existed;
	if (!m_db->purgeFile(udi, &existed)) {
	    LOGERR(("FsIndexer::purgeFiles: Database error\n"));
	    return false;
	}
        // If we actually deleted something, take it off the list
        if (existed) {
            it = files.erase(it);
        } else {
            it++;
        }
    }

    return true;
}

// Local fields can be set for fs subtrees in the configuration file 
void FsIndexer::localfieldsfromconf()
{
    LOGDEB0(("FsIndexer::localfieldsfromconf\n"));
    m_localfields.clear();
    m_config->addLocalFields(&m_localfields);
}

// 
void FsIndexer::setlocalfields(Rcl::Doc& doc)
{
    for (map<string, string>::const_iterator it = m_localfields.begin();
         it != m_localfields.end(); it++) {
        // Should local fields override those coming from the document
        // ? I think not, but not too sure
        if (doc.meta.find(it->second) == doc.meta.end()) {
            doc.meta[it->first] = it->second;
        }
    }
}


/// This method gets called for every file and directory found by the
/// tree walker. 
///
/// It checks with the db if the file has changed and needs to be
/// reindexed. If so, it calls internfile() which will identify the
/// file type and call an appropriate handler to convert the document into
/// internal format, which we then add to the database.
///
/// Accent and majuscule handling are performed by the db module when doing
/// the actual indexing work. The Rcl::Doc created by internfile()
/// mostly contains pretty raw utf8 data.
FsTreeWalker::Status 
FsIndexer::processone(const std::string &fn, const struct stat *stp, 
		      FsTreeWalker::CbFlag flg)
{
    if (m_updater && !m_updater->update()) {
        return FsTreeWalker::FtwStop;
    }

    // If we're changing directories, possibly adjust parameters (set
    // the current directory in configuration object)
    if (flg == FsTreeWalker::FtwDirEnter || 
	flg == FsTreeWalker::FtwDirReturn) {
	m_config->setKeyDir(fn);

	// Set up skipped patterns for this subtree. 
	m_walker.setSkippedNames(m_config->getSkippedNames());

	int abslen;
	if (m_config->getConfParam("idxabsmlen", &abslen))
	    m_db->setAbstractParams(abslen, -1, -1);

        // Adjust local fields from config for this subtree
        if (m_havelocalfields)
            localfieldsfromconf();

	if (flg == FsTreeWalker::FtwDirReturn)
	    return FsTreeWalker::FtwOk;
    }

    ////////////////////
    // Check db up to date ? Doing this before file type
    // identification means that, if usesystemfilecommand is switched
    // from on to off it may happen that some files which are now
    // without mime type will not be purged from the db, resulting
    // in possible 'cannot intern file' messages at query time...

    // Document signature. This is based on m/ctime and size and used
    // for the uptodate check (the value computed here is checked
    // against the stored one). Changing the computation forces a full
    // reindex of course.
    char cbuf[100]; 
    sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
    string sig = cbuf;
    string udi;
    make_udi(fn, "", udi);
    if (!m_db->needUpdate(udi, sig)) {
	LOGDEB(("processone: up to date: %s\n", fn.c_str()));
	if (m_updater) {
	    // Status bar update, abort request etc.
	    m_updater->status.fn = fn;
	    if (!m_updater->update()) {
		return FsTreeWalker::FtwStop;
	    }
	}
	return FsTreeWalker::FtwOk;
    }

    LOGDEB0(("processone: processing: [%s] %s\n", 
             displayableBytes(stp->st_size).c_str(), fn.c_str()));

    FileInterner interner(fn, stp, m_config, m_tmpdir, FileInterner::FIF_none);
    if (!interner.ok()) {
        // no indexing whatsoever in this case. This typically means that
        // indexallfilenames is not set
        return FsTreeWalker::FtwOk;
    }

    // File name transcoded to utf8 for indexing. 
    string charset = m_config->getDefCharset(true);
    // If this fails, the file name won't be indexed, no big deal
    // Note that we used to do the full path here, but I ended up believing
    // that it made more sense to use only the file name
    string utf8fn; int ercnt;
    if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) {
	LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n",
		charset.c_str(), path_getsimple(fn).c_str()));
    } else if (ercnt) {
	LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n",
		ercnt, charset.c_str(), path_getsimple(fn).c_str()));
    }
    LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
	     path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), 
	     "UTF-8"));

    string parent_udi;
    make_udi(fn, "", parent_udi);
    Rcl::Doc doc;
    const string plus("+");
    char ascdate[30];
    sprintf(ascdate, "%ld", long(stp->st_mtime));

    FileInterner::Status fis = FileInterner::FIAgain;
    bool hadNullIpath = false;
    while (fis == FileInterner::FIAgain) {
	doc.erase();
        try {
            fis = interner.internfile(doc);
        } catch (CancelExcept) {
            LOGERR(("fsIndexer::processone: interrupted\n"));
            return FsTreeWalker::FtwStop;
        }

        // Index at least the file name even if there was an error.
        // We'll change the signature to ensure that the indexing will
        // be retried every time.

	// Internal access path for multi-document files
	if (doc.ipath.empty())
	    hadNullIpath = true;

	// Set file name, mod time and url if not done by filter
	if (doc.fmtime.empty())
	    doc.fmtime = ascdate;
        if (doc.url.empty())
            doc.url = string("file://") + fn;
	if (doc.utf8fn.empty())
	    doc.utf8fn = utf8fn;

	char cbuf[100]; 
	sprintf(cbuf, "%ld", (long)stp->st_size);
	doc.fbytes = cbuf;
	// Document signature for up to date checks: concatenate
	// m/ctime and size. Looking for changes only, no need to
	// parseback so no need for reversible formatting. Also set,
	// but never used, for subdocs.
	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
	doc.sig = cbuf;
	// If there was an error, ensure indexing will be
	// retried. This is for the once missing, later installed
	// filter case. It can make indexing much slower (if there are
	// myriads of such files, the ext script is executed for them
	// and fails every time)
	if (fis == FileInterner::FIError) {
	    doc.sig += plus;
	}

        // Possibly add fields from local config
        if (m_havelocalfields) 
            setlocalfields(doc);
	// Add document to database. If there is an ipath, add it as a children
	// of the file document.
	string udi;
	make_udi(fn, doc.ipath, udi);
	if (!m_db->addOrUpdate(udi, doc.ipath.empty() ? "" : parent_udi, doc)) 
	    return FsTreeWalker::FtwError;

	// Tell what we are doing and check for interrupt request
	if (m_updater) {
	    ++(m_updater->status.docsdone);
            if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
                m_updater->status.dbtotdocs = m_updater->status.docsdone;
            m_updater->status.fn = fn;
            if (!doc.ipath.empty())
                m_updater->status.fn += "|" + doc.ipath;
            if (!m_updater->update()) {
                return FsTreeWalker::FtwStop;
            }
	}
    }

    // If we had no instance with a null ipath, we create an empty
    // document to stand for the file itself, to be used mainly for up
    // to date checks. Typically this happens for an mbox file.
    if (hadNullIpath == false) {
	LOGDEB1(("Creating empty doc for file\n"));
	Rcl::Doc fileDoc;
	fileDoc.fmtime = ascdate;
	fileDoc.utf8fn = utf8fn;
	fileDoc.mimetype = interner.getMimetype();
	fileDoc.url = string("file://") + fn;

	char cbuf[100]; 
	sprintf(cbuf, "%ld", (long)stp->st_size);
	fileDoc.fbytes = cbuf;
	// Document signature for up to date checks.
	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
	fileDoc.sig = cbuf;
	if (!m_db->addOrUpdate(parent_udi, "", fileDoc)) 
	    return FsTreeWalker::FtwError;
    }

    return FsTreeWalker::FtwOk;
}