recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [ac09a9] .. [cecd1b]

Switch to unified view


/* Copyright (C) 2004-2018 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
...
#include "chrono.h"
#include "utf8iter.h"
#include "searchdata.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "rclvalues.h"
#include "md5ut.h"
#include "rclversion.h"
#include "cancelcheck.h"
#include "termproc.h"
#include "expansiondbs.h"
...
#include "utf8fn.h"
#include "wipedir.h"
#ifdef RCL_USE_ASPELL
#include "rclaspell.h"
#endif
#include "zlibut.h"

#ifndef XAPIAN_AT_LEAST
// Added in Xapian 1.4.2. Define it here for older versions.
// Evaluates to true when the XAPIAN_MAJOR_VERSION / XAPIAN_MINOR_VERSION /
// XAPIAN_REVISION triple of the headers we are compiled against is greater
// than or equal to A.B.C. It only involves integer comparisons, so it can
// be used inside #if directives.
#define XAPIAN_AT_LEAST(A,B,C) \
 (XAPIAN_MAJOR_VERSION > (A) || \
 (XAPIAN_MAJOR_VERSION == (A) && \
 (XAPIAN_MINOR_VERSION > (B) || \
 (XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
#endif


// Recoll index format version is stored in user metadata. When this changes,
// we can't open the db and will have to reindex.
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
static const string cstr_RCL_IDX_VERSION("1");
// User metadata key for the index descriptor: a small "name=value" text
// record written when an index is created and parsed when it is opened
// read-only. It currently holds the "storetext" flag.
static const string cstr_RCL_IDX_DESCRIPTOR_KEY("RCL_IDX_DESCRIPTOR_KEY");

static const string cstr_mbreaks("rclmbreaks");

namespace Rcl {

...
    }
    bool status = false;
    switch (tsk->op) {
    case DbUpdTask::AddOrUpdate:
        LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
        status = ndbp->addOrUpdateWrite(
                tsk->udi, tsk->uniterm, tsk->doc, tsk->txtlen, tsk->rawztext);
        break;
    case DbUpdTask::Delete:
        LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
        status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm);
        break;
...
           writeqlen << " wqts " << writethreads << "\n");
}

#endif // IDX_THREADS

void Db::Native::openWrite(const string& dir, Db::OpenMode mode)
{
    int action = (mode == Db::DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
        Xapian::DB_CREATE_OR_OVERWRITE;

#ifdef _WIN32
    // Xapian is quite bad at erasing partial db which can
    // occur because of open file deletion errors on
    // Windows. 
    if (mode == DbTrunc) {
        if (path_exists(path_cat(dir, "iamchert"))) {
            wipedir(dir);
            unlink(dir.c_str());
        }
    }
#endif
    
    if (::access(dir.c_str(), 0) == 0) {
        // Existing index
        xwdb = Xapian::WritableDatabase(dir, action);
    } else {
        // New index. If possible, and depending on config, use a stub
        // to force using Chert. No sense in doing this if we are
        // storing the text anyway.
#if XAPIAN_AT_LEAST(1,3,0) && XAPIAN_HAS_CHERT_BACKEND
        // Xapian with Glass and Chert support. If storedoctext is
        // specified in the configuration, use the default backend
        // (Glass), else force Chert. There might be reasons why
        // someone would want to use Chert and store text anyway, but
        // it's an exotic case, and things are complicated enough
        // already.
        if (o_index_storedoctext) {
            xwdb = Xapian::WritableDatabase(dir, action);
            m_storetext = true;
        } else {
            // Force Chert format, don't store the text.
            string stub = path_cat(m_rcldb->m_config->getConfDir(),
                                   "xapian.stub");
            FILE *fp = fopen(stub.c_str(), "w");
            if (nullptr == fp) {
                throw(string("Can't create ") + stub);
            }
            fprintf(fp, "chert %s\n", dir.c_str());
            fclose(fp);
            xwdb = Xapian::WritableDatabase(stub, action);
            m_storetext = false;
        }
#elif (! XAPIAN_AT_LEAST(1,3,0)) || XAPIAN_AT_LEAST(1,5,0)
        // Old Xapian (chert only) or newer (no chert). Use the
        // default index backend and let the user decide of the
        // abstract generation method. The configured default is to
        // store the text.
        xwdb = Xapian::WritableDatabase(dir, action);
        m_storetext = o_index_storedoctext;
#endif
        // Set the storetext value inside the index descriptor (new
        // with recoll 1.24, maybe we'll have other stuff to store in
        // there in the future).
        string desc = string("storetext=") + (m_storetext ? "1" : "0") + "\n";
        xwdb.set_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY, desc);
    }
    
    // If the index is empty, write the data format version at once
    // to avoid stupid error messages:
    if (xwdb.get_doccount() == 0) {
        xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
    }

    m_iswritable = true;

#ifdef IDX_THREADS
    maybeStartThreads();
#endif
}

void Db::Native::openRead(const string& dir)
{
    m_iswritable = false;
    xrdb = Xapian::Database(dir);
    string desc = xrdb.get_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY);
    ConfSimple cf(desc, 1);
    string val;
    m_storetext = false;
    if (cf.get("storetext", val) && stringToBool(val)) {
        m_storetext = true;
    }
    LOGDEB("Db::openRead: index " << (m_storetext?"stores":"does not store") <<
           " document text\n");
}

/* See comment in class declaration: return all subdocuments of a
 * document given by its unique id. */

bool Db::Native::subDocs(const string &udi, int idxi, 
             vector<Xapian::docid>& docids) 
{
    LOGDEB2("subDocs: [" << uniterm << "]\n");
    string pterm = make_parentterm(udi);
...
    return 0;
}

// Turn data record from db into document fields
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, 
              Doc &doc, bool fetchtext)
{
    LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n");
    ConfSimple parms(data);
    if (!parms.ok())
    return false;
...
    if (doc.meta.find(*it) == doc.meta.end())
        parms.get(*it, doc.meta[*it]);
    }
    doc.meta[Doc::keyurl] = doc.url;
    doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
    if (fetchtext) {
        getRawText(docid, doc.text);
    }
    return true;
}

bool Db::Native::hasPages(Xapian::docid docid)
{
...
    vector<int>::const_iterator it = 
    upper_bound(pbreaks.begin(), pbreaks.end(), pos);
    return int(it - pbreaks.begin() + 1);
}

// Fetch the stored text of document 'docid' into 'rawtext'.
//
// The compressed text is read from the metadata entry named by
// rawtextMetaKey(docid), then inflated in place.
//
// @return false if the index does not store document text (rawtext is
//     left untouched) or if the metadata read failed. True otherwise,
//     including when the stored value is empty.
bool Db::Native::getRawText(Xapian::docid docid, string& rawtext)
{
    if (m_storetext) {
        string xaperr;
        XAPTRY(rawtext = xrdb.get_metadata(rawtextMetaKey(docid)), xrdb, xaperr);
        if (xaperr.empty()) {
            if (!rawtext.empty()) {
                // NOTE(review): the result of inflateToBuf() is not
                // checked -- confirm what the buffer holds on failure.
                ZLibUtBuf inflated;
                inflateToBuf(rawtext.c_str(), rawtext.size(), inflated);
                rawtext.assign(inflated.getBuf(), inflated.getCnt());
            }
            return true;
        }
        LOGERR("Rcl::Db::getRawText: could not get value: " << xaperr << endl);
        return false;
    }

    LOGDEB("Db::Native::getRawText: document text not stored in index\n");
    return false;
}

// Note: we're passed a Xapian::Document* because Xapian
// reference-counting is not mt-safe. We take ownership and need
// to delete it before returning.
bool Db::Native::addOrUpdateWrite(
    const string& udi, const string& uniterm, Xapian::Document *newdocument_ptr, 
    size_t textlen, const string& rawztext)
{
#ifdef IDX_THREADS
    Chrono chron;
    std::unique_lock<std::mutex> lock(m_mutex);
#endif
    std::unique_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);

    // Check file system full every mbyte of indexed text. It's a bit wasteful
    // to do this after having prepared the document, but it needs to be in
    // the single-threaded section.
    if (m_rcldb->m_maxFsOccupPc > 0 && 
...

    const char *fnc = udi.c_str();
    string ermsg;

    // Add db entry or update existing entry:
    Xapian::docid did = 0;
    try {

    did = xwdb.replace_document(uniterm, *newdocument_ptr);
    if (did < m_rcldb->updated.size()) {
            // This is necessary because only the file-level docs are tested
            // by needUpdate(), so the subdocs existence flags are only set
            // here.
        m_rcldb->updated[did] = true;
        LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n");
    } else {
        LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
    }
    } XCATCHERROR(ermsg);

    if (!ermsg.empty()) {
    LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
    ermsg.erase();
    // FIXME: is this ever actually needed?
    try {
...
        LOGERR("Db::add: add_document failed: " << ermsg << "\n");
        return false;
    }
    }

    XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext),
           xwdb, m_rcldb->m_reason);
    if (!m_rcldb->m_reason.empty()) {
        LOGERR("Db::addOrUpdate: set_metadata error: " <<
               m_rcldb->m_reason << "\n");
        // This only affects snippets, so let's say not fatal
    }
    
    // Test if we're over the flush threshold (limit memory usage):
    bool ret = m_rcldb->maybeflush(textlen);
#ifdef IDX_THREADS
    m_totalworkns += chron.nanos();
#endif
...
        LOGINFO("purgeFileWrite: got empty sig\n");
        return false;
        }
    } else {
        LOGDEB("purgeFile: delete docid " << *docid << "\n");
            deleteDocument(*docid);
    }
    vector<Xapian::docid> docids;
    subDocs(udi, 0, docids);
    LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n");
    for (vector<Xapian::docid>::iterator it = docids.begin();
...
        }
        }
        
        if (!orphansOnly || sig != subdocsig) {
        LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
        deleteDocument(*it);
        }
    }
    return true;
    } XCATCHERROR(ermsg);
    if (!ermsg.empty()) {
...
    vector<string> res;
    stringToStrings(Xapian::Stem::get_available_languages(), res);
    return res;
}


bool Db::open(OpenMode mode, OpenError *error)
{
    if (error)
    *error = DbOpenMainDb;

...
    string ermsg;
    try {
    switch (mode) {
    case DbUpd:
    case DbTrunc: 
            m_ndb->openWrite(dir, mode);
            updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);

































            // We used to open a readonly object in addition to the
            // r/w one because some operations were faster when
            // performed through a Database: no forced flushes on
            // allterms_begin(), used in subDocs(). This issue has
            // been gone for a long time (now: Xapian 1.2) and the
            // separate objects seem to trigger other Xapian issues,

            // so the query db is now a clone of the update one.
            m_ndb->xrdb = m_ndb->xwdb;
            LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");






        break;
    case DbRO:
    default:
            m_ndb->openRead(dir);
            for (auto& db : m_extraDbs) {


        if (error)
            *error = DbOpenExtraDb;
        LOGDEB("Db::Open: adding query db [" << &db << "]\n");
                // An error here used to be non-fatal (1.13 and older)
                // but I can't see why
                m_ndb->xrdb.add_database(Xapian::Database(db));
        }
        break;
    }
    if (error)
        *error = DbOpenMainDb;
...
    string aerr;
    bool mstripped = true;
    LOGDEB("Db::testDbDir: [" << dir << "]\n");
    try {
    Xapian::Database db(dir);
  // If the prefix for mimetype is wrapped, it's an unstripped
  // index. T has been in use in recoll since the beginning and
  // all documents have a T field (possibly empty).
    Xapian::TermIterator term = db.allterms_begin(":T:");
    if (term == db.allterms_end()) {
        mstripped = true;
        } else {
        mstripped = false;
        }
        LOGDEB("testDbDir: " << dir << " is a " <<
               (mstripped ? "stripped" : "raw") << " index\n");
    } XCATCHERROR(aerr);
    if (!aerr.empty()) {
    LOGERR("Db::Open: error while trying to open database from [" <<
               dir << "]: " << aerr << "\n");
    return false;
...
    tpidx.setTSD(&splitter);

    // Udi unique term: this is used for file existence/uptodate
    // checks, and unique id for the replace_document() call.
    string uniterm = make_uniterm(udi);
    string rawztext; // Doc compressed text

    if (doc.onlyxattr) {
    // Only updating an existing doc with new extended attributes
    // data.  Need to read the old doc and its data record
    // first. This is so different from the normal processing that
...
        newdocument.add_posting(wrap_prefix(pathelt_prefix),
                    splitter.basepos + splitter.curpos++);
        for (vector<string>::iterator it = vpath.begin(); 
         it != vpath.end(); it++){
        if (it->length() > 230) {
            // Just truncate it. May still be useful because
          // of wildcards
            *it = it->substr(0, 230);
        }
        newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
                    splitter.basepos + splitter.curpos++);
        }
...
    // positions, as we may want to do phrase searches with them (this
    // makes no sense for keywords by the way).
    //
    // The order has no importance, and we set a position gap of 100
    // between fields to avoid false proximity matches.
  for (const auto& entry: doc.meta) {

        if (entry.second.empty()) {
                continue;
            }
            const FieldTraits *ftp{nullptr};
            fieldToTraits(entry.first, &ftp);
            if (ftp && ftp->valueslot) {
                LOGDEB("Adding value: for field " << entry.first << " slot "
                       << ftp->valueslot << endl);
                add_field_value(newdocument, *ftp, entry.second);
            }

            // There was an old comment here about not testing for
            // empty prefix, and we indeed did not test. I don't think
            // that it makes sense any more (and was in disagreement
            // with the LOG message. Really now: no prefix: no
            // indexing.
            if (ftp && !ftp->pfx.empty()) {
                LOGDEB0("Db::add: field [" << entry.first << "] pfx [" <<
                        ftp->pfx << "] inc " << ftp->wdfinc << ": [" <<
                        entry.second << "]\n");
                splitter.setTraits(*ftp);
                if (!splitter.text_to_words(entry.second)) {
                    LOGDEB("Db::addOrUpdate: split failed for " <<
                           entry.first << "\n");
                }
            } else {
                LOGDEB0("Db::add: no prefix for field [" <<
                        entry.first << "], no indexing\n");
            }
    }

        // Reset to no prefix and default params
        splitter.setTraits(FieldTraits());

...
    LOGDEB2("Db::add: split body: [" << doc.text << "]\n");

#ifdef TEXTSPLIT_STATS
    splitter.resetStats();
#endif
    if (!splitter.text_to_words(doc.text)) {
        LOGDEB("Db::addOrUpdate: split failed for main text\n");
        } else {
            if (m_ndb->m_storetext) {
                ZLibUtBuf buf;
                deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
                rawztext.assign(buf.getBuf(), buf.getCnt());
            }
        }

#ifdef TEXTSPLIT_STATS
    // Reject bad data. unrecognized base64 text is characterized by
    // high avg word length and high variation (because there are
    // word-splitters like +/ inside the data).
...
    // We also add a term for the filename extension if any.
    string utf8fn;
    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
        string fn;
        if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
        // We should truncate after extracting the extension,
        // but this is a pathological case anyway
        if (fn.size() > 230)
            utf8truncate(fn, 230);
        string::size_type pos = fn.rfind('.');
        if (pos != string::npos && pos != fn.length() - 1) {
            newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
                         fn.substr(pos + 1));
        }
        newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn,0);
        }
    }

    newdocument.add_boolean_term(uniterm);
    // Parent term. This is used to find all descendents, mostly
...
    LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
    newdocument.set_data(record);
    }
#ifdef IDX_THREADS
    if (m_ndb->m_havewriteq) {
  DbUpdTask *tp = new DbUpdTask(
            DbUpdTask::AddOrUpdate, udi, uniterm, newdocument_ptr,
            doc.text.length(), rawztext);
    if (!m_ndb->m_wqueue.put(tp)) {
        LOGERR("Db::addOrUpdate:Cant queue task\n");
            delete newdocument_ptr;
        return false;
    } else {
...
    }
    }
#endif

    return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument_ptr,
                   doc.text.length(), rawztext);
}

bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
                    Doc &doc, Xapian::Document& xdoc)
{
...
            // size from the data record, but this would be
            // bad for performance.
            Xapian::termcount trms = m_ndb->xwdb.get_doclength(docid);
            maybeflush(trms * 5);
        }
        m_ndb->deleteDocument(docid);
        LOGDEB("Db::purge: deleted document #" << docid << "\n");
        } catch (const Xapian::DocNotFoundError &) {
        LOGDEB0("Db::purge: document #" << docid << " not found\n");
        } catch (const Xapian::Error &e) {
        LOGERR("Db::purge: document #" << docid << ": " <<
...
    if (!exists)
    return true;

#ifdef IDX_THREADS
    if (m_ndb->m_havewriteq) {
        string rztxt;
    DbUpdTask *tp = new DbUpdTask(DbUpdTask::Delete, udi, uniterm, 
                      0, (size_t)-1, rztxt);
    if (!m_ndb->m_wqueue.put(tp)) {
        LOGERR("Db::purgeFile:Cant queue task\n");
        return false;
    } else {
        return true;
...

    string uniterm = make_uniterm(udi);

#ifdef IDX_THREADS
    if (m_ndb->m_havewriteq) {
        string rztxt;
    DbUpdTask *tp = new DbUpdTask(DbUpdTask::PurgeOrphans, udi, uniterm, 
                      0, (size_t)-1, rztxt);
    if (!m_ndb->m_wqueue.put(tp)) {
        LOGERR("Db::purgeFile:Cant queue task\n");
        return false;
    } else {
        return true;

	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
1	/* Copyright (C) 2004 J.F.Dockes	1	/* Copyright (C) 2004-2018 J.F.Dockes
2	* This program is free software; you can redistribute it and/or modify	2	* This program is free software; you can redistribute it and/or modify
3	* it under the terms of the GNU General Public License as published by	3	* it under the terms of the GNU General Public License as published by
4	* the Free Software Foundation; either version 2 of the License, or	4	* the Free Software Foundation; either version 2 of the License, or
5	* (at your option) any later version.	5	* (at your option) any later version.
6	*	6	*
	...		...
48	#include "chrono.h"	48	#include "chrono.h"
49	#include "utf8iter.h"	49	#include "utf8iter.h"
50	#include "searchdata.h"	50	#include "searchdata.h"
51	#include "rclquery.h"	51	#include "rclquery.h"
52	#include "rclquery_p.h"	52	#include "rclquery_p.h"
		53	#include "rclvalues.h"
53	#include "md5ut.h"	54	#include "md5ut.h"
54	#include "rclversion.h"	55	#include "rclversion.h"
55	#include "cancelcheck.h"	56	#include "cancelcheck.h"
56	#include "termproc.h"	57	#include "termproc.h"
57	#include "expansiondbs.h"	58	#include "expansiondbs.h"
	...		...
60	#include "utf8fn.h"	61	#include "utf8fn.h"
61	#include "wipedir.h"	62	#include "wipedir.h"
62	#ifdef RCL_USE_ASPELL	63	#ifdef RCL_USE_ASPELL
63	#include "rclaspell.h"	64	#include "rclaspell.h"
64	#endif	65	#endif
		66	#include "zlibut.h"
		67
		68	#ifndef XAPIAN_AT_LEAST
		69	// Added in Xapian 1.4.2. Define it here for older versions
		70	#define XAPIAN_AT_LEAST(A,B,C) \
		71	(XAPIAN_MAJOR_VERSION > (A) \|\| \
		72	(XAPIAN_MAJOR_VERSION == (A) && \
		73	(XAPIAN_MINOR_VERSION > (B) \|\| \
		74	(XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
		75	#endif
		76
65		77
66	// Recoll index format version is stored in user metadata. When this change,	78	// Recoll index format version is stored in user metadata. When this change,
67	// we can't open the db and will have to reindex.	79	// we can't open the db and will have to reindex.
68	static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");	80	static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
69	static const string cstr_RCL_IDX_VERSION("1");	81	static const string cstr_RCL_IDX_VERSION("1");
		82	static const string cstr_RCL_IDX_DESCRIPTOR_KEY("RCL_IDX_DESCRIPTOR_KEY");
70		83
71	static const string cstr_mbreaks("rclmbreaks");	84	static const string cstr_mbreaks("rclmbreaks");
72		85
73	namespace Rcl {	86	namespace Rcl {
74		87
	...		...
188	}	201	}
189	bool status = false;	202	bool status = false;
190	switch (tsk->op) {	203	switch (tsk->op) {
191	case DbUpdTask::AddOrUpdate:	204	case DbUpdTask::AddOrUpdate:
192	LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");	205	LOGDEB("DbUpdWorker: got add/update task, ql " << qsz << "\n");
193	status = ndbp->addOrUpdateWrite(tsk->udi, tsk->uniterm,	206	status = ndbp->addOrUpdateWrite(
194	tsk->doc, tsk->txtlen);	207	tsk->udi, tsk->uniterm, tsk->doc, tsk->txtlen, tsk->rawztext);
195	break;	208	break;
196	case DbUpdTask::Delete:	209	case DbUpdTask::Delete:
197	LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");	210	LOGDEB("DbUpdWorker: got delete task, ql " << qsz << "\n");
198	status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm);	211	status = ndbp->purgeFileWrite(false, tsk->udi, tsk->uniterm);
199	break;	212	break;
	...		...
236	writeqlen << " wqts " << writethreads << "\n");	249	writeqlen << " wqts " << writethreads << "\n");
237	}	250	}
238		251
239	#endif // IDX_THREADS	252	#endif // IDX_THREADS
240		253
		254	void Db::Native::openWrite(const string& dir, Db::OpenMode mode)
		255	{
		256	int action = (mode == Db::DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
		257	Xapian::DB_CREATE_OR_OVERWRITE;
		258
		259	#ifdef _WIN32
		260	// Xapian is quite bad at erasing partial db which can
		261	// occur because of open file deletion errors on
		262	// Windows.
		263	if (mode == DbTrunc) {
		264	if (path_exists(path_cat(dir, "iamchert"))) {
		265	wipedir(dir);
		266	unlink(dir.c_str());
		267	}
		268	}
		269	#endif
		270
		271	if (::access(dir.c_str(), 0) == 0) {
		272	// Existing index
		273	xwdb = Xapian::WritableDatabase(dir, action);
		274	} else {
		275	// New index. If possible, and depending on config, use a stub
		276	// to force using Chert. No sense in doing this if we are
		277	// storing the text anyway.
		278	#if XAPIAN_AT_LEAST(1,3,0) && XAPIAN_HAS_CHERT_BACKEND
		279	// Xapian with Glass and Chert support. If storedoctext is
		280	// specified in the configuration, use the default backend
		281	// (Glass), else force Chert. There might be reasons why
		282	// someone would want to use Chert and store text anyway, but
		283	// it's an exotic case, and things are complicated enough
		284	// already.
		285	if (o_index_storedoctext) {
		286	xwdb = Xapian::WritableDatabase(dir, action);
		287	m_storetext = true;
		288	} else {
		289	// Force Chert format, don't store the text.
		290	string stub = path_cat(m_rcldb->m_config->getConfDir(),
		291	"xapian.stub");
		292	FILE *fp = fopen(stub.c_str(), "w");
		293	if (nullptr == fp) {
		294	throw(string("Can't create ") + stub);
		295	}
		296	fprintf(fp, "chert %s\n", dir.c_str());
		297	fclose(fp);
		298	xwdb = Xapian::WritableDatabase(stub, action);
		299	m_storetext = false;
		300	}
		301	#elif (! XAPIAN_AT_LEAST(1,3,0)) \|\| XAPIAN_AT_LEAST(1,5,0)
		302	// Old Xapian (chert only) or newer (no chert). Use the
		303	// default index backend and let the user decide of the
		304	// abstract generation method. The configured default is to
		305	// store the text.
		306	xwdb = Xapian::WritableDatabase(dir, action);
		307	m_storetext = o_index_storedoctext;
		308	#endif
		309	// Set the storetext value inside the index descriptor (new
		310	// with recoll 1.24, maybe we'll have other stuff to store in
		311	// there in the future).
		312	string desc = string("storetext=") + (m_storetext ? "1" : "0") + "\n";
		313	xwdb.set_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY, desc);
		314	}
		315
		316	// If the index is empty, write the data format version at once
		317	// to avoid stupid error messages:
		318	if (xwdb.get_doccount() == 0) {
		319	xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
		320	}
		321
		322	m_iswritable = true;
		323
		324	#ifdef IDX_THREADS
		325	maybeStartThreads();
		326	#endif
		327	}
		328
		329	void Db::Native::openRead(const string& dir)
		330	{
		331	m_iswritable = false;
		332	xrdb = Xapian::Database(dir);
		333	string desc = xrdb.get_metadata(cstr_RCL_IDX_DESCRIPTOR_KEY);
		334	ConfSimple cf(desc, 1);
		335	string val;
		336	m_storetext = false;
		337	if (cf.get("storetext", val) && stringToBool(val)) {
		338	m_storetext = true;
		339	}
		340	LOGDEB("Db::openRead: index " << (m_storetext?"stores":"does not store") <<
		341	" document text\n");
		342	}
		343
241	/* See comment in class declaration: return all subdocuments of a	344	/* See comment in class declaration: return all subdocuments of a
242	* document given by its unique id.	345	* document given by its unique id. */
243	*/
244	bool Db::Native::subDocs(const string &udi, int idxi,	346	bool Db::Native::subDocs(const string &udi, int idxi,
245	vector<Xapian::docid>& docids)	347	vector<Xapian::docid>& docids)
246	{	348	{
247	LOGDEB2("subDocs: [" << uniterm << "]\n");	349	LOGDEB2("subDocs: [" << uniterm << "]\n");
248	string pterm = make_parentterm(udi);	350	string pterm = make_parentterm(udi);
	...		...
439	return 0;	541	return 0;
440	}	542	}
441		543
442	// Turn data record from db into document fields	544	// Turn data record from db into document fields
443	bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,	545	bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
444	Doc &doc)	546	Doc &doc, bool fetchtext)
445	{	547	{
446	LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n");	548	LOGDEB2("Db::dbDataToRclDoc: data:\n" << data << "\n");
447	ConfSimple parms(data);	549	ConfSimple parms(data);
448	if (!parms.ok())	550	if (!parms.ok())
449	return false;	551	return false;
	...		...
499	if (doc.meta.find(*it) == doc.meta.end())	601	if (doc.meta.find(*it) == doc.meta.end())
500	parms.get(it, doc.meta[it]);	602	parms.get(it, doc.meta[it]);
501	}	603	}
502	doc.meta[Doc::keyurl] = doc.url;	604	doc.meta[Doc::keyurl] = doc.url;
503	doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;	605	doc.meta[Doc::keymt] = doc.dmtime.empty() ? doc.fmtime : doc.dmtime;
		606	if (fetchtext) {
		607	getRawText(docid, doc.text);
		608	}
504	return true;	609	return true;
505	}	610	}
506		611
507	bool Db::Native::hasPages(Xapian::docid docid)	612	bool Db::Native::hasPages(Xapian::docid docid)
508	{	613	{
	...		...
578	vector<int>::const_iterator it =	683	vector<int>::const_iterator it =
579	upper_bound(pbreaks.begin(), pbreaks.end(), pos);	684	upper_bound(pbreaks.begin(), pbreaks.end(), pos);
580	return int(it - pbreaks.begin() + 1);	685	return int(it - pbreaks.begin() + 1);
581	}	686	}
582		687
		688	bool Db::Native::getRawText(Xapian::docid docid, string& rawtext)
		689	{
		690	if (!m_storetext) {
		691	LOGDEB("Db::Native::getRawText: document text not stored in index\n");
		692	return false;
		693	}
		694	string reason;
		695	XAPTRY(rawtext = xrdb.get_metadata(rawtextMetaKey(docid)), xrdb, reason);
		696	if (!reason.empty()) {
		697	LOGERR("Rcl::Db::getRawText: could not get value: " << reason << endl);
		698	return false;
		699	}
		700	if (rawtext.empty()) {
		701	return true;
		702	}
		703	ZLibUtBuf cbuf;
		704	inflateToBuf(rawtext.c_str(), rawtext.size(), cbuf);
		705	rawtext.assign(cbuf.getBuf(), cbuf.getCnt());
		706	return true;
		707	}
		708
583	// Note: we're passed a Xapian::Document* because Xapian	709	// Note: we're passed a Xapian::Document* because Xapian
584	// reference-counting is not mt-safe. We take ownership and need	710	// reference-counting is not mt-safe. We take ownership and need
585	// to delete it before returning.	711	// to delete it before returning.
586	bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,	712	bool Db::Native::addOrUpdateWrite(
587	Xapian::Document *newdocument_ptr,	713	const string& udi, const string& uniterm, Xapian::Document *newdocument_ptr,
588	size_t textlen)	714	size_t textlen, const string& rawztext)
589	{	715	{
590	#ifdef IDX_THREADS	716	#ifdef IDX_THREADS
591	Chrono chron;	717	Chrono chron;
592	std::unique_lock<std::mutex> lock(m_mutex);	718	std::unique_lock<std::mutex> lock(m_mutex);
593	#endif	719	#endif
594	std::shared_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);	720	std::unique_ptr<Xapian::Document> doc_cleaner(newdocument_ptr);
595		721
596	// Check file system full every mbyte of indexed text. It's a bit wasteful	722	// Check file system full every mbyte of indexed text. It's a bit wasteful
597	// to do this after having prepared the document, but it needs to be in	723	// to do this after having prepared the document, but it needs to be in
598	// the single-threaded section.	724	// the single-threaded section.
599	if (m_rcldb->m_maxFsOccupPc > 0 &&	725	if (m_rcldb->m_maxFsOccupPc > 0 &&
	...		...
612		738
613	const char *fnc = udi.c_str();	739	const char *fnc = udi.c_str();
614	string ermsg;	740	string ermsg;
615		741
616	// Add db entry or update existing entry:	742	// Add db entry or update existing entry:
		743	Xapian::docid did = 0;
617	try {	744	try {
618	Xapian::docid did =
619	xwdb.replace_document(uniterm, *newdocument_ptr);	745	did = xwdb.replace_document(uniterm, *newdocument_ptr);
620	if (did < m_rcldb->updated.size()) {	746	if (did < m_rcldb->updated.size()) {
621	// This is necessary because only the file-level docs are tested	747	// This is necessary because only the file-level docs are tested
622	// by needUpdate(), so the subdocs existence flags are only set	748	// by needUpdate(), so the subdocs existence flags are only set
623	// here.	749	// here.
624	m_rcldb->updated[did] = true;	750	m_rcldb->updated[did] = true;
625	LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n");	751	LOGINFO("Db::add: docid " << did << " updated [" << fnc << "]\n");
626	} else {	752	} else {
627	LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");	753	LOGINFO("Db::add: docid " << did << " added [" << fnc << "]\n");
628	}	754	}
629	} XCATCHERROR(ermsg);	755	} XCATCHERROR(ermsg);
630
631	if (!ermsg.empty()) {	756	if (!ermsg.empty()) {
632	LOGERR("Db::add: replace_document failed: " << ermsg << "\n");	757	LOGERR("Db::add: replace_document failed: " << ermsg << "\n");
633	ermsg.erase();	758	ermsg.erase();
634	// FIXME: is this ever actually needed?	759	// FIXME: is this ever actually needed?
635	try {	760	try {
	...		...
641	LOGERR("Db::add: add_document failed: " << ermsg << "\n");	766	LOGERR("Db::add: add_document failed: " << ermsg << "\n");
642	return false;	767	return false;
643	}	768	}
644	}	769	}
645		770
		771	XAPTRY(xwdb.set_metadata(rawtextMetaKey(did), rawztext),
		772	xwdb, m_rcldb->m_reason);
		773	if (!m_rcldb->m_reason.empty()) {
		774	LOGERR("Db::addOrUpdate: set_metadata error: " <<
		775	m_rcldb->m_reason << "\n");
		776	// This only affects snippets, so let's say not fatal
		777	}
		778
646	// Test if we're over the flush threshold (limit memory usage):	779	// Test if we're over the flush threshold (limit memory usage):
647	bool ret = m_rcldb->maybeflush(textlen);	780	bool ret = m_rcldb->maybeflush(textlen);
648	#ifdef IDX_THREADS	781	#ifdef IDX_THREADS
649	m_totalworkns += chron.nanos();	782	m_totalworkns += chron.nanos();
650	#endif	783	#endif
	...		...
680	LOGINFO("purgeFileWrite: got empty sig\n");	813	LOGINFO("purgeFileWrite: got empty sig\n");
681	return false;	814	return false;
682	}	815	}
683	} else {	816	} else {
684	LOGDEB("purgeFile: delete docid " << *docid << "\n");	817	LOGDEB("purgeFile: delete docid " << *docid << "\n");
685	xwdb.delete_document(*docid);	818	deleteDocument(*docid);
686	}	819	}
687	vector<Xapian::docid> docids;	820	vector<Xapian::docid> docids;
688	subDocs(udi, 0, docids);	821	subDocs(udi, 0, docids);
689	LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n");	822	LOGDEB("purgeFile: subdocs cnt " << docids.size() << "\n");
690	for (vector<Xapian::docid>::iterator it = docids.begin();	823	for (vector<Xapian::docid>::iterator it = docids.begin();
	...		...
703	}	836	}
704	}	837	}
705		838
706	if (!orphansOnly \|\| sig != subdocsig) {	839	if (!orphansOnly \|\| sig != subdocsig) {
707	LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");	840	LOGDEB("Db::purgeFile: delete subdoc " << *it << "\n");
708	xwdb.delete_document(*it);	841	deleteDocument(*it);
709	}	842	}
710	}	843	}
711	return true;	844	return true;
712	} XCATCHERROR(ermsg);	845	} XCATCHERROR(ermsg);
713	if (!ermsg.empty()) {	846	if (!ermsg.empty()) {
	...		...
765	vector<string> res;	898	vector<string> res;
766	stringToStrings(Xapian::Stem::get_available_languages(), res);	899	stringToStrings(Xapian::Stem::get_available_languages(), res);
767	return res;	900	return res;
768	}	901	}
769		902
		903
770	bool Db::open(OpenMode mode, OpenError *error)	904	bool Db::open(OpenMode mode, OpenError *error)
771	{	905	{
772	if (error)	906	if (error)
773	*error = DbOpenMainDb;	907	*error = DbOpenMainDb;
774		908
	...		...
791	string ermsg;	925	string ermsg;
792	try {	926	try {
793	switch (mode) {	927	switch (mode) {
794	case DbUpd:	928	case DbUpd:
795	case DbTrunc:	929	case DbTrunc:
796	{	930	m_ndb->openWrite(dir, mode);
797	// Xapian is quite bad at erasing partial db which can	931	updated = vector<bool>(m_ndb->xwdb.get_lastdocid() + 1, false);
798	// occur because of open file deletion errors on
799	// Windows.
800	if (mode == DbTrunc) {
801	if (path_exists(path_cat(dir, "iamchert"))) {
802	wipedir(dir);
803	unlink(dir.c_str());
804	}
805	}
806	int action = (mode == DbUpd) ? Xapian::DB_CREATE_OR_OPEN :
807	Xapian::DB_CREATE_OR_OVERWRITE;
808	if (::access(dir.c_str(), 0) != 0) {
809	// New index. use a stub to force using Chert
810	string stub = path_cat(m_config->getConfDir(),
811	"xapian.stub");
812	FILE *fp = fopen(stub.c_str(), "w");
813	if (nullptr == fp) {
814	throw(string("Can't create ") + stub);
815	}
816	fprintf(fp, "chert %s\n", dir.c_str());
817	fclose(fp);
818	m_ndb->xwdb = Xapian::WritableDatabase(stub, action);
819	} else {
820	m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
821	}
822	// If db is empty, write the data format version at once
823	// to avoid stupid error messages:
824	if (m_ndb->xwdb.get_doccount() == 0)
825	m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY,
826	cstr_RCL_IDX_VERSION);
827	m_ndb->m_iswritable = true;
828	#ifdef IDX_THREADS
829	m_ndb->maybeStartThreads();
830	#endif
831	// We used to open a readonly object in addition to	932	// We used to open a readonly object in addition to the
832	// the r/w one because some operations were faster	933	// r/w one because some operations were faster when
833	// when performed through a Database: no forced	934	// performed through a Database: no forced flushes on
834	// flushes on allterms_begin(), used in	935	// allterms_begin(), used in subDocs(). This issue has
835	// subDocs(). This issue has been gone for a long time	936	// been gone for a long time (now: Xapian 1.2) and the
836	// (now: Xapian 1.2) and the separate objects seem to	937	// separate objects seem to trigger other Xapian issues,
837	// trigger other Xapian issues, so the query db is now
838	// a clone of the update one.	938	// so the query db is now a clone of the update one.
839	m_ndb->xrdb = m_ndb->xwdb;	939	m_ndb->xrdb = m_ndb->xwdb;
840	LOGDEB("Db::open: lastdocid: " << m_ndb->xwdb.get_lastdocid() <<	940	LOGDEB("Db::open: lastdocid: " <<m_ndb->xwdb.get_lastdocid()<<"\n");
841	"\n");
842	LOGDEB2("Db::open: resetting updated\n");
843	updated.resize(m_ndb->xwdb.get_lastdocid() + 1);
844	for (unsigned int i = 0; i < updated.size(); i++)
845	updated[i] = false;
846	}
847	break;	941	break;
848	case DbRO:	942	case DbRO:
849	default:	943	default:
850	m_ndb->m_iswritable = false;	944	m_ndb->openRead(dir);
851	m_ndb->xrdb = Xapian::Database(dir);	945	for (auto& db : m_extraDbs) {
852	for (vector<string>::iterator it = m_extraDbs.begin();
853	it != m_extraDbs.end(); it++) {
854	if (error)	946	if (error)
855	*error = DbOpenExtraDb;	947	*error = DbOpenExtraDb;
856	LOGDEB("Db::Open: adding query db [" << &(*it) << "]\n");	948	LOGDEB("Db::Open: adding query db [" << &db << "]\n");
857	// An error here used to be non-fatal (1.13 and older)	949	// An error here used to be non-fatal (1.13 and older)
858	// but I can't see why	950	// but I can't see why
859	m_ndb->xrdb.add_database(Xapian::Database(*it));	951	m_ndb->xrdb.add_database(Xapian::Database(db));
860	}	952	}
861	break;	953	break;
862	}	954	}
863	if (error)	955	if (error)
864	*error = DbOpenMainDb;	956	*error = DbOpenMainDb;
	...		...
1049	string aerr;	1141	string aerr;
1050	bool mstripped = true;	1142	bool mstripped = true;
1051	LOGDEB("Db::testDbDir: [" << dir << "]\n");	1143	LOGDEB("Db::testDbDir: [" << dir << "]\n");
1052	try {	1144	try {
1053	Xapian::Database db(dir);	1145	Xapian::Database db(dir);
1054	// If we have terms with a leading ':' it's an	1146	// If the prefix for mimetype is wrapped, it's an unstripped
1055	// unstripped index	1147	// index. T has been in use in recoll since the beginning and
		1148	// all documents have a T field (possibly empty).
1056	Xapian::TermIterator term = db.allterms_begin(":");	1149	Xapian::TermIterator term = db.allterms_begin(":T:");
1057	if (term == db.allterms_end())	1150	if (term == db.allterms_end()) {
1058	mstripped = true;	1151	mstripped = true;
1059	else	1152	} else {
1060	mstripped = false;	1153	mstripped = false;
		1154	}
		1155	LOGDEB("testDbDir: " << dir << " is a " <<
		1156	(mstripped ? "stripped" : "raw") << " index\n");
1061	} XCATCHERROR(aerr);	1157	} XCATCHERROR(aerr);
1062	if (!aerr.empty()) {	1158	if (!aerr.empty()) {
1063	LOGERR("Db::Open: error while trying to open database from [" <<	1159	LOGERR("Db::Open: error while trying to open database from [" <<
1064	dir << "]: " << aerr << "\n");	1160	dir << "]: " << aerr << "\n");
1065	return false;	1161	return false;
	...		...
1368	tpidx.setTSD(&splitter);	1464	tpidx.setTSD(&splitter);
1369		1465
1370	// Udi unique term: this is used for file existence/uptodate	1466	// Udi unique term: this is used for file existence/uptodate
1371	// checks, and unique id for the replace_document() call.	1467	// checks, and unique id for the replace_document() call.
1372	string uniterm = make_uniterm(udi);	1468	string uniterm = make_uniterm(udi);
		1469	string rawztext; // Doc compressed text
1373		1470
1374	if (doc.onlyxattr) {	1471	if (doc.onlyxattr) {
1375	// Only updating an existing doc with new extended attributes	1472	// Only updating an existing doc with new extended attributes
1376	// data. Need to read the old doc and its data record	1473	// data. Need to read the old doc and its data record
1377	// first. This is so different from the normal processing that	1474	// first. This is so different from the normal processing that
	...		...
1419	newdocument.add_posting(wrap_prefix(pathelt_prefix),	1516	newdocument.add_posting(wrap_prefix(pathelt_prefix),
1420	splitter.basepos + splitter.curpos++);	1517	splitter.basepos + splitter.curpos++);
1421	for (vector<string>::iterator it = vpath.begin();	1518	for (vector<string>::iterator it = vpath.begin();
1422	it != vpath.end(); it++){	1519	it != vpath.end(); it++){
1423	if (it->length() > 230) {	1520	if (it->length() > 230) {
1424	// Just truncate it. May still be useful because of wildcards	1521	// Just truncate it. May still be useful because
		1522	// of wildcards
1425	*it = it->substr(0, 230);	1523	*it = it->substr(0, 230);
1426	}	1524	}
1427	newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,	1525	newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
1428	splitter.basepos + splitter.curpos++);	1526	splitter.basepos + splitter.curpos++);
1429	}	1527	}
	...		...
1434	// positions, as we may want to do phrase searches with them (this	1532	// positions, as we may want to do phrase searches with them (this
1435	// makes no sense for keywords by the way).	1533	// makes no sense for keywords by the way).
1436	//	1534	//
1437	// The order has no importance, and we set a position gap of 100	1535	// The order has no importance, and we set a position gap of 100
1438	// between fields to avoid false proximity matches.	1536	// between fields to avoid false proximity matches.
1439	map<string, string>::iterator meta_it;	1537	for (const auto& entry: doc.meta) {
1440	for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
1441	if (!meta_it->second.empty()) {	1538	if (entry.second.empty()) {
1442	const FieldTraits *ftp;	1539	continue;
1443	// We don't test for an empty prefix here. Some fields are part	1540	}
1444	// of the internal conf with an empty prefix (ie: abstract).	1541	const FieldTraits *ftp{nullptr};
1445	if (!fieldToTraits(meta_it->first, &ftp)) {	1542	fieldToTraits(entry.first, &ftp);
1446	LOGDEB0("Db::add: no prefix for field [" <<	1543	if (ftp && ftp->valueslot) {
1447	meta_it->first << "], no indexing\n");	1544	LOGDEB("Adding value: for field " << entry.first << " slot "
1448	continue;	1545	<< ftp->valueslot << endl);
1449	}	1546	add_field_value(newdocument, *ftp, entry.second);
		1547	}
		1548
		1549	// There was an old comment here about not testing for
		1550	// empty prefix, and we indeed did not test. I don't think
		1551	// that it makes sense any more (and was in disagreement
		1552	// with the LOG message). Really now: no prefix: no
		1553	// indexing.
		1554	if (ftp && !ftp->pfx.empty()) {
1450	LOGDEB0("Db::add: field [" << meta_it->first << "] pfx [" <<	1555	LOGDEB0("Db::add: field [" << entry.first << "] pfx [" <<
1451	ftp->pfx << "] inc " << ftp->wdfinc << ": [" <<	1556	ftp->pfx << "] inc " << ftp->wdfinc << ": [" <<
1452	meta_it->second << "]\n");	1557	entry.second << "]\n");
1453	splitter.setTraits(*ftp);	1558	splitter.setTraits(*ftp);
1454	if (!splitter.text_to_words(meta_it->second)) {	1559	if (!splitter.text_to_words(entry.second)) {
1455	LOGDEB("Db::addOrUpdate: split failed for " <<	1560	LOGDEB("Db::addOrUpdate: split failed for " <<
1456	meta_it->first << "\n");	1561	entry.first << "\n");
1457	}	1562	}
1458	}	1563	} else {
		1564	LOGDEB0("Db::add: no prefix for field [" <<
		1565	entry.first << "], no indexing\n");
		1566	}
1459	}	1567	}
1460		1568
1461	// Reset to no prefix and default params	1569	// Reset to no prefix and default params
1462	splitter.setTraits(FieldTraits());	1570	splitter.setTraits(FieldTraits());
1463		1571
	...		...
1468	LOGDEB2("Db::add: split body: [" << doc.text << "]\n");	1576	LOGDEB2("Db::add: split body: [" << doc.text << "]\n");
1469		1577
1470	#ifdef TEXTSPLIT_STATS	1578	#ifdef TEXTSPLIT_STATS
1471	splitter.resetStats();	1579	splitter.resetStats();
1472	#endif	1580	#endif
1473	if (!splitter.text_to_words(doc.text))	1581	if (!splitter.text_to_words(doc.text)) {
1474	LOGDEB("Db::addOrUpdate: split failed for main text\n");	1582	LOGDEB("Db::addOrUpdate: split failed for main text\n");
		1583	} else {
		1584	if (m_ndb->m_storetext) {
		1585	ZLibUtBuf buf;
		1586	deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
		1587	rawztext.assign(buf.getBuf(), buf.getCnt());
		1588	}
		1589	}
1475		1590
1476	#ifdef TEXTSPLIT_STATS	1591	#ifdef TEXTSPLIT_STATS
1477	// Reject bad data. unrecognized base64 text is characterized by	1592	// Reject bad data. unrecognized base64 text is characterized by
1478	// high avg word length and high variation (because there are	1593	// high avg word length and high variation (because there are
1479	// word-splitters like +/ inside the data).	1594	// word-splitters like +/ inside the data).
	...		...
1499	// We also add a term for the filename extension if any.	1614	// We also add a term for the filename extension if any.