recoll / Code / [f20fae] /src/rcldb/stemdb.cpp

[f20fae]: src / rcldb / stemdb.cpp History

stemdb.cpp 369 lines (329 with data), 11.4 kB

/* Copyright (C) 2005 J.F.Dockes 
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

/**
 * Management of the auxiliary databases listing stems and their expansion 
 * terms
 */
#include <unistd.h>

#include <algorithm>
#include <map>

#include <xapian.h>

#include "stemdb.h"
#include "wipedir.h"
#include "pathut.h"
#include "debuglog.h"
#include "smallut.h"
#include "utf8iter.h"
#include "textsplit.h"

using namespace std;

namespace Rcl {
namespace StemDb {


static const string cstr_stemdirstem = "stem_";

/// Compute name of stem db for given base database and language
static string stemdbname(const string& dbdir, const string& lang)
{
    return path_cat(dbdir, cstr_stemdirstem + lang);
}

list<string> getLangs(const string& dbdir)
{
    string pattern = cstr_stemdirstem + "*";
    list<string> dirs = path_dirglob(dbdir, pattern);
    for (list<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
	*it = path_basename(*it);
	*it = it->substr(cstr_stemdirstem.length(), string::npos);
    }
    return dirs;
}

bool deleteDb(const string& dbdir, const string& lang)
{
    string dir = stemdbname(dbdir, lang);
    if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
	return true;
    return false;
}

// Autoclean/delete directory 
class DirWiper {
 public:
    string dir;
    bool do_it;
    DirWiper(string d) : dir(d), do_it(true) {}
    ~DirWiper() {
	if (do_it) {
	    wipedir(dir);
	    rmdir(dir.c_str());
	}
    }
};

inline static bool
p_notlowerascii(unsigned int c)
{
    if (c < 'a' || (c > 'z' && c < 128))
	return true;
    return false;
}

static bool addAssoc(Xapian::WritableDatabase &sdb, const string& stem,
                     const list<string>& derivs)
{
    Xapian::Document newdocument;
    newdocument.add_term(stem);
    // The doc data is just parents=blank-separated-list
    string record = "parents=";
    for (list<string>::const_iterator it = derivs.begin(); 
         it != derivs.end(); it++) {
        record += *it + " ";
    }
    record += "\n";
    LOGDEB2(("createStemDb: stmdoc data: [%s]\n", record.c_str()));
    newdocument.set_data(record);
    try {
        sdb.replace_document(stem, newdocument);
    } catch (...) {
        LOGERR(("Db::createstemdb(addAssoc): replace failed\n"));
        return false;
    }
    return true;
}


/**
 * Create database of stem to parents associations for a given language.
 * We walk the list of all terms, stem them, and create another Xapian db
 * with documents indexed by a single term (the stem), and with the list of
 * parent terms in the document data.
 */
bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
{
    LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
    Chrono cron;

    // First build the in-memory stem database:
    // We walk the list of all terms, and stem each. 
    //   If the stem is identical to the term, no need to create an entry
    // Else, we add an entry to the multimap.
    // At the end, we only save stem-terms associations with several terms, the
    // others are not useful
    // Note: a map<string, list<string> > would probably be more efficient
    multimap<string, string> assocs;
    // Statistics
    int nostem=0; // Dont even try: not-alphanum (incomplete for now)
    int stemconst=0; // Stem == term
    int stemdiff=0;  // Count of all different stems
    int stemmultiple = 0; // Count of stems with multiple derivatives
    try {
        Xapian::Stem stemmer(lang);
        Xapian::TermIterator it;
        for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
            // Deciding if we try to stem the term. 

	    // If it has any
            // non-lowercase 7bit char (that is, numbers, capitals and
            // punctuation) dont. 
            string::iterator sit = (*it).begin(), eit = sit + (*it).length();
            if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
                ++nostem;
                LOGDEB1(("stemskipped: [%s], because of 0x%x\n", 
                         (*it).c_str(), *sit));
                continue;
            }

	    // Detect and skip CJK terms.
	    // We're still sending all other multibyte utf-8 chars to
            // the stemmer, which is not too well defined for
            // xapian<1.0 (very obsolete now), but seems to work
            // anyway. There shouldnt be too many in any case because
            // accents are stripped at this point. Effect of stripping
            // accents on stemming unknown, hopefuly none, there is
            // nothing we can do about it.
	    Utf8Iter utfit(*it);
	    if (TextSplit::isCJK(*utfit)) {
		// LOGDEB(("stemskipped: Skipping CJK\n"));
		continue;
	    }

            string stem = stemmer(*it);
            LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
                     stem.c_str()));
            if (stem == *it) {
                ++stemconst;
                continue;
            }
            assocs.insert(pair<string,string>(stem, *it));
        }
    } catch (const Xapian::Error &e) {
        LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
        return false;
    } catch (...) {
        LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", 
                lang.c_str()));
        return false;
    }
    LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n", 
             lang.c_str(), cron.secs()));

    // Create xapian database for stem relations
    string stemdbdir = stemdbname(dbdir, lang);
    // We want to get rid of the db dir in case of error. This gets disarmed
    // just before success return.
    DirWiper wiper(stemdbdir);
    string ermsg;
    Xapian::WritableDatabase sdb;
    try {
        sdb = Xapian::WritableDatabase(stemdbdir, 
                                       Xapian::DB_CREATE_OR_OVERWRITE);
    } catch (const Xapian::Error &e) {
        ermsg = e.get_msg();
    } catch (const string &s) {
        ermsg = s;
    } catch (const char *s) {
        ermsg = s;
    } catch (...) {
        ermsg = "Caught unknown exception";
    }
    if (!ermsg.empty()) {
        LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n", 
                stemdbdir.c_str(), ermsg.c_str()));
        return false;
    }

    // Enter pseud-docs in db by walking the multimap.
    string stem;
    list<string> derivs;
    for (multimap<string,string>::const_iterator it = assocs.begin();
         it != assocs.end(); it++) {
        if (stem == it->first) {
            // Staying with same stem
            derivs.push_back(it->second);
            // cerr << " " << it->second << endl;
        } else {
            // Changing stems 
            ++stemdiff;
            LOGDEB2(("createStemDb: stem [%s]\n", stem.c_str()));

            // We need an entry even if there is only one derivative
            // so that it is possible to search by entering the stem
            // even if it doesnt exist as a term
            if (!derivs.empty()) {

                if (derivs.size() > 1)
                    ++stemmultiple;
                    
                if (!addAssoc(sdb, stem, derivs)) {
                    return false;
                }
                derivs.clear();
            }
            stem = it->first;
            derivs.push_back(it->second);
            //	    cerr << "\n" << stem << " " << it->second;
        }
    }
    if (!derivs.empty()) {
        if (derivs.size() > 1)
            ++stemmultiple;
                    
        if (!addAssoc(sdb, stem, derivs)) {
            return false;
        }
    }

    LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n", 
             lang.c_str(), cron.secs()));
    LOGDEB(("Stem map size: %d stems %d mult %d no %d const %d\n", 
	    assocs.size(), stemdiff, stemmultiple, nostem, stemconst));
    wiper.do_it = false;
    return true;
}

static string stringlistdisp(const list<string>& sl)
{
    string s;
    for (list<string>::const_iterator it = sl.begin(); it!= sl.end(); it++)
	s += "[" + *it + "] ";
    if (!s.empty())
	s.erase(s.length()-1);
    return s;
}

/**
 * Expand term to list of all terms which stem to the same term, for one
 * expansion language
 */
static bool stemExpandOne(const std::string& dbdir, 
			  const std::string& lang,
			  const std::string& term,
			  list<string>& result)
{
    try {
	Xapian::Stem stemmer(lang);
	string stem = stemmer(term);
	LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n", 
                lang.c_str(), term.c_str(), stem.c_str()));

	// Open stem database
	string stemdbdir = stemdbname(dbdir, lang);
	Xapian::Database sdb(stemdbdir);
	LOGDEB0(("stemExpand: %s lastdocid: %d\n", 
		stemdbdir.c_str(), sdb.get_lastdocid()));

	// Try to fetch the doc from the stem db
	if (!sdb.term_exists(stem)) {
	    LOGDEB0(("Db::stemExpand: no term for %s\n", stem.c_str()));
	} else {
	    Xapian::PostingIterator did = sdb.postlist_begin(stem);
	    if (did == sdb.postlist_end(stem)) {
		LOGDEB0(("stemExpand: no term(1) for %s\n",stem.c_str()));
	    } else {
		Xapian::Document doc = sdb.get_document(*did);
		string data = doc.get_data();

		// Build expansion list from database data No need for
		// a conftree, but we need to massage the data a
		// little
		string::size_type pos = data.find('=');
		string::size_type pos1 = data.rfind('\n');
		if (pos == string::npos || pos1 == string::npos || 
		    pos1 <= pos+1) {
		    LOGERR(("stemExpand: bad data in db: [%s]\n", 
			    data.c_str()));
		} else {
		    ++pos;
		    stringToStrings(data.substr(pos, pos1-pos), result);
		}
	    }
	}

	// If the user term or stem are not in the list, add them
	if (find(result.begin(), result.end(), term) == result.end()) {
	    result.push_back(term);
	}
	if (find(result.begin(), result.end(), stem) == result.end()) {
	    result.push_back(stem);
	}
	LOGDEB0(("stemExpand:%s: %s ->  %s\n", lang.c_str(), stem.c_str(),
		 stringlistdisp(result).c_str()));

    } catch (...) {
	LOGERR(("stemExpand: error accessing stem db. dbdir [%s] lang [%s]\n",
		dbdir.c_str(), lang.c_str()));
	result.push_back(term);
	return false;
    }

    return true;
}
    
/**
 * Expand term to list of all terms which stem to the same term, add the
 * expansion sets for possibly multiple expansion languages
 */
bool stemExpand(const std::string& dbdir, 
		const std::string& langs,
		const std::string& term,
		list<string>& result)
{

    list<string> llangs;
    stringToStrings(langs, llangs);
    for (list<string>::const_iterator it = llangs.begin();
	 it != llangs.end(); it++) {
	list<string> oneexp;
	stemExpandOne(dbdir, *it, term, oneexp);
	result.insert(result.end(), oneexp.begin(), oneexp.end());
    }
    result.sort();
    result.unique();
    return true;
}


}
}