* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
// Version identification string, compiled in unless `lint` is defined.
// NOTE(review): the matching #endif is not visible in this extract -- confirm
// it is present in the real file.
#ifndef lint
static char rcsid[] = "@(#$Id: $ (C) 2005 J.F.Dockes";
#include "autoconfig.h"
#include <sys/types.h>
#include <string.h>
#include "autoconfig.h"
#include "pathut.h"
#include "debuglog.h"
#include "fstreewalk.h"
#include "beaglequeue.h"
#include "beaglequeuecache.h"
#include "circache.h"
#include "smallut.h"
#include "fileudi.h"
#include "internfile.h"
#include "wipedir.h"
#include "indexer.h"
#include "readfile.h"
#include "conftree.h"
#include "transcode.h"
#include "cancelcheck.h"
#include <vector>
#include <fstream>
#include <sstream>
using namespace std;
#include <sys/stat.h>
// Beagle creates a file named .xxx (where xxx is the name of the main file
// in the queue), to hold external metadata (http or created by Beagle).
// This class reads that dot file and turns its contents into an Rcl::Doc.
// It also keeps a ConfSimple copy of the resulting fields in m_fields.
//
// NOTE(review): this extract seems to have lost brace-only lines, access
// specifiers and some short statements (e.g. loop break/continue). The
// comments below describe only what the visible statements do; places where
// something looks missing are flagged.
class BeagleDotFile {
// Store the configuration pointer and the path of the dot file. The file
// itself is opened by toDoc().
BeagleDotFile(RclConfig *conf, const string& fn)
: m_conf(conf), m_fn(fn)
// Read one input line from m_input, strip it of eol and return it as a
// c++ string in 'line'.
// Returns false if the stream is not good() after the read (end of file or
// error), true otherwise.
// Note: istream::getline() sets failbit when a line does not fit in the
// buffer, so an overlong line is also reported as a failed read.
bool readLine(string& line)
static const int LL = 2048;
char cline[LL];
cline[0] = 0;
m_input.getline(cline, LL-1);
if (!m_input.good()) {
// Only a hard stream error (badbit) is logged
if (m_input.bad()) {
LOGERR(("beagleDotFileRead: input.bad()\n"));
return false;
// Strip trailing LF / CR characters.
// NOTE(review): no decrement of 'll' is visible in this loop. If it is
// really absent, the assign() below would copy the NUL written here into
// 'line' -- confirm against the complete file.
int ll = strlen(cline);
while (ll > 0 && (cline[ll-1] == '\n' || cline[ll-1] == '\r')) {
cline[ll-1] = 0;
line.assign(cline, ll);
LOGDEB2(("BeagleDotFile:readLine: [%s]\n", line.c_str()));
return true;
// Process a beagle dot file and set interesting stuff in the doc.
// Fills doc.url, doc.mimetype and doc.meta[], then mirrors these values
// into m_fields.
// Returns false if the file cannot be opened or if any of the three header
// lines cannot be read; the last visible statement returns true.
bool toDoc(Rcl::Doc& doc)
string line;
m_input.open(m_fn.c_str(), ios::in);
if (!m_input.good()) {
LOGERR(("BeagleDotFile: open failed for [%s]\n", m_fn.c_str()));
return false;
// Read the 3 first lines:
// - url
// - hit type: we only know about Bookmark and WebHistory for now
// - content-type.
if (!readLine(line))
return false;
doc.url = line;
if (!readLine(line))
return false;
doc.meta[Rcl::Doc::keybght] = line;
if (!readLine(line))
return false;
doc.mimetype = line;
// We set the bookmarks mtype as html (the text is empty
// anyway), so that the html viewer will be called on 'Open'.
// stringlowercmp() is used here as "zero means match" (presumably a
// case-insensitive compare -- confirm in smallut.h).
bool isbookmark = false;
if (!stringlowercmp("bookmark", doc.meta[Rcl::Doc::keybght])) {
isbookmark = true;
doc.mimetype = "text/html";
string confstr;
string ss(" ");
// Read the rest: fields and keywords. We do a little
// massaging of the input lines, then use a ConfSimple to
// parse, and finally insert the key/value pairs into the doc
// meta[] array
for (;;) {
if (!readLine(line)) {
// Eof hopefully
// NOTE(review): the loop exit and the statement controlled by the
// "t:" test below are not visible in this extract. Presumably the
// loop breaks on a failed read and skips lines not starting with
// "t:" -- confirm.
if (line.find("t:") != 0)
// Drop the 2-character prefix and accumulate the remainder, one
// entry per line, as input text for ConfSimple
line = line.substr(2);
confstr += line + "\n";
// Parse the accumulated text (meaning of the second constructor
// argument is defined by ConfSimple, not visible here) and walk the
// names returned for the "" section key.
ConfSimple fields(confstr, 1);
list<string> names = fields.getNames("");
for (list<string>::iterator it = names.begin();
it != names.end(); it++) {
string value;
fields.get(*it, value, "");
// NOTE(review): the statement controlled by this test is not visible;
// presumably "undefined" / "null" values are skipped -- confirm.
if (!value.compare("undefined") || !value.compare("null"))
string *valuep = &value;
string cvalue;
if (isbookmark) {
// It appears that bookmarks are stored in the users'
// locale charset (not too sure). No idea what to do
// for other types, would have to check the plugin.
// The value is transcoded from the configured default charset
// to UTF-8 and the converted copy is the one stored.
string charset = m_conf->getDefCharset(true);
transcode(value, cvalue, charset, "UTF-8");
valuep = &cvalue;
// Store under the canonical field name. Values for the same name are
// accumulated, each one preceded by a space.
string caname = m_conf->fieldCanon(*it);
doc.meta[caname].append(ss + *valuep);
// Finally build the confsimple that we will save to the
// cache, from the doc fields. This could also be done in
// parallel with the doc.meta build above, but simpler this
// way. We need it because not all interesting doc fields are
// in the meta array (ie: mimetype, url), and we want
// something homogenous and easy to save.
for (map<string,string>::const_iterator it = doc.meta.begin();
it != doc.meta.end(); it++) {
m_fields.set((*it).first, (*it).second, "");
m_fields.set("url", doc.url, "");
m_fields.set("mimetype", doc.mimetype, "");
return true;
// Configuration object, used for the default charset and field name
// canonicalization
RclConfig *m_conf;
// Key/value copy of the document fields, filled by toDoc(). Accessed
// directly by BeagleQueueIndexer::processone() when saving to the cache.
ConfSimple m_fields;
// Path of the dot file
string m_fn;
// Input stream on the dot file, opened by toDoc()
ifstream m_input;
// Sentinel value stored in m_tmpdir when the temporary directory could not
// be created. The cleanup code compares m_tmpdir against it before calling
// rmdir().
const string badtmpdirname = "/no/such/dir/really/can/exist";
// Initialize. Compute paths and create a temporary directory that will be
// used by internfile()
//
// cnf: configuration, read for "beaglequeuedir" and passed to the cache.
// db: index database; may be null, in which case no temporary directory is
//     created here.
// updfunc: status updater, stored in m_updater.
//
// NOTE(review): the initializer list ends with a trailing comma and one log
// call is cut short in this extract; some lines are evidently missing.
BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
DbIxStatusUpdater *updfunc)
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
// Queue directory: configured value, or the default Beagle location
if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
m_queuedir = path_tildexpand("~/.beagle/ToIndex/");
// Create the temporary directory if there is a db and m_tmpdir is unset or
// not accessible (access() mode 0, presumably F_OK -- confirm).
if (m_db && (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0)) {
string reason;
if (!maketmpdir(m_tmpdir, reason)) {
LOGERR(("DbIndexer: cannot create temporary directory: %s\n",
// Remember the failure with the sentinel name so that no removal is
// attempted later
m_tmpdir = badtmpdirname;
// The cache object is allocated with new and held through a raw pointer
m_cache = new BeagleQueueCache(cnf);
// NOTE(review): the header of the enclosing function is not visible in this
// extract; the log text below suggests this is the destructor body.
// Remove the temporary directory, unless none was set or its creation had
// failed (sentinel name). rmdir() only removes an empty directory; any
// emptying step would be in lines not visible here.
if (m_tmpdir.length() && m_tmpdir.compare(badtmpdirname)) {
if (rmdir(m_tmpdir.c_str()) < 0) {
LOGERR(("BeagleQueueIndexer::~: cannot clear temp dir %s\n",
// Index document stored in the cache.
//
// udi: unique document identifier, used both as cache key and as index key.
// Returns false if there is no db or cache, if the entry cannot be fetched,
// if it has no hit type, or if the file interner is interrupted or reports a
// status other than FIDone. Returns true for entries which are skipped
// because of their hit type or mime type. Otherwise returns the result of
// Rcl::Db::addOrUpdate().
bool BeagleQueueIndexer::indexFromCache(const string& udi)
if (!m_db)
return false;
Rcl::Doc dotdoc;
string data;
string hittype;
// Fetch the stored metadata (dotdoc), the document data and the hit type
if (!m_cache || !m_cache->getFromCache(udi, dotdoc, data, &hittype))
return false;
if (hittype.empty()) {
LOGERR(("BeagleIndexer::index: cc entry has no hit type\n"));
return false;
if (!stringlowercmp("bookmark", hittype)) {
// Just index the dotdoc
dotdoc.meta[Rcl::Doc::keybcknd] = "BGL";
return m_db->addOrUpdate(udi, "", dotdoc);
// Anything which is not a webhistory entry of type text/html or
// text/plain is skipped.
// NOTE(review): this test reads dotdoc.meta[keybght] whereas the bookmark
// test above uses 'hittype' -- presumably the same value; confirm.
} else if (stringlowercmp("webhistory", dotdoc.meta[Rcl::Doc::keybght]) ||
(dotdoc.mimetype.compare("text/html") &&
dotdoc.mimetype.compare("text/plain"))) {
LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
dotdoc.meta[Rcl::Doc::keybght].c_str(), dotdoc.mimetype.c_str()));
return true;
} else {
// Web page: run the cached data through the file interner.
// NOTE(review): the end of the constructor call below is not visible
// in this extract.
Rcl::Doc doc;
FileInterner interner(data, m_config, m_tmpdir,
string ipath;
FileInterner::Status fis;
try {
fis = interner.internfile(doc, ipath);
} catch (CancelExcept) {
LOGERR(("BeagleQueueIndexer: interrupted\n"));
return false;
if (fis != FileInterner::FIDone) {
LOGERR(("BeagleQueueIndexer: bad status from internfile\n"));
return false;
// Values saved with the cache entry override what the interner produced
doc.mimetype = dotdoc.mimetype;
doc.fmtime = dotdoc.fmtime;
doc.url = dotdoc.url;
doc.fbytes = dotdoc.fbytes;
// No signature is stored for these documents
doc.sig = "";
doc.meta[Rcl::Doc::keybcknd] = "BGL";
return m_db->addOrUpdate(udi, "", doc);
// Update the indexing status data, if an updater was supplied.
// Keeps dbtotdocs from being smaller than docsdone and records 'udi' as the
// current file name.
// NOTE(review): no increment of docsdone and no call notifying the updater
// are visible in this extract -- check the complete file.
void BeagleQueueIndexer::updstatus(const string& udi)
if (m_updater) {
if (m_updater->status.dbtotdocs < m_updater->status.docsdone)
m_updater->status.dbtotdocs = m_updater->status.docsdone;
m_updater->status.fn = udi;
// Full indexing pass: first go through the entries already in the cache
// (unless m_nocacheindex is set), then walk the queue directory.
// Returns false if there is no db, if the cache is not usable, if the cache
// rewind fails for a reason other than being at eof, or on interruption.
// The last visible statement returns true; the walker status is only logged.
bool BeagleQueueIndexer::index()
if (!m_db)
return false;
LOGDEB(("BeagleQueueIndexer::processqueue: [%s]\n", m_queuedir.c_str()));
if (!m_cache || !m_cache->cc()) {
LOGERR(("BeagleQueueIndexer: cache initialization failed\n"));
return false;
CirCache *cc = m_cache->cc();
// First check/index files found in the cache. If the index was reset,
// this actually does work, else it sets the existence flags (avoid
// purging). We don't do this when called from indexFiles
if (!m_nocacheindex) {
bool eof;
if (!cc->rewind(eof)) {
// rewind can return eof if the cache is empty
if (!eof)
return false;
while (cc->next(eof)) {
string udi;
if (!cc->getCurrentUdi(udi)) {
LOGERR(("BeagleQueueIndexer:: cache file damaged\n"));
// NOTE(review): the statements controlled by the damaged-cache test above
// and by the empty-udi test below are not visible in this extract.
if (udi.empty())
if (m_db->needUpdate(udi, "")) {
try {
// indexFromCache does a CirCache::get(). We could
// arrange to use a getCurrent() instead, would be more
// efficient
// NOTE(review): the call(s) inside this try block are not visible here;
// the comment above suggests indexFromCache(udi) -- confirm.
} catch (CancelExcept) {
LOGERR(("BeagleQueueIndexer: interrupted\n"));
return false;
// Finally index the queue: non-recursive walk of the queue directory, with
// this object passed as the walker callback
FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
FsTreeWalker::Status status =walker.walk(m_queuedir, *this);
LOGDEB(("BeagleQueueIndexer::processqueue: done: status %d\n", status));
return true;
// Index a list of files (sent by the real time monitor)
//
// files: in/out list of paths. Entries which get processed are erased from
//        the list; all others (empty, outside the queue directory, dot
//        files, not stat-able, not regular files) are left in place.
// Returns false if there is no db; the last visible statement returns true.
// The result of processone() is not checked.
bool BeagleQueueIndexer::indexFiles(list<string>& files)
if (!m_db) {
LOGERR(("BeagleQueueIndexer::indexfiles no db??\n"));
return false;
// The iterator is advanced explicitly in each branch because processed
// entries are removed with erase()
for (list<string>::iterator it = files.begin(); it != files.end();) {
if (it->empty()) {//??
it++; continue;
// Only files located directly in the queue directory are of interest.
// NOTE(review): this is an exact string comparison of the parent path
// with m_queuedir, so both need the same form (e.g. trailing slash) --
// confirm what path_getfather() returns.
string father = path_getfather(*it);
if (father.compare(m_queuedir)) {
LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nq)\n",
it++; continue;
// Pb: we are often called with the dot file, before the
// normal file exists, and sometimes never called for the
// normal file afterwards (ie for bookmarks where the normal
// file is empty). So we perform a normal queue run at the end
// of the function to catch older stuff. Still this is not
// perfect, sometimes some files will not be indexed before
// the next run.
// NOTE(review): the queue run mentioned above is not visible in this
// extract.
string fn = path_getsimple(*it);
if (fn.empty() || fn.at(0) == '.') {
it++; continue;
// lstat(): a symbolic link is not followed, so it fails the regular
// file test below
struct stat st;
if (lstat(it->c_str(), &st) != 0) {
LOGERR(("BeagleQueueIndexer::indexfiles: cant stat [%s]\n",
it++; continue;
if (!S_ISREG(st.st_mode)) {
LOGDEB(("BeagleQueueIndexer::indexfiles: skipping [%s] (nr)\n",
it++; continue;
processone(*it, &st, FsTreeWalker::FtwRegular);
it = files.erase(it);
// Makes index() skip its pass over the cache contents from now on
m_nocacheindex = true;
// Note: no need to reset nocacheindex, we're in the monitor now
return true;
// Process one file from the queue directory. Called directly by
// indexFiles(), and presumably by FsTreeWalker as the callback for the walk
// started in index() -- confirm.
//
// path: path of the main (non-dot) file; the metadata is read from the
//       sibling file with the same name prefixed with '.'.
// stp:  stat data for 'path', used for the modification time and size.
// flg:  entry type; anything other than FtwRegular is ignored.
// Visible returns: FtwError if there is no db or if the db update fails,
// FtwOk otherwise (including all the paths which jump to 'out').
//
// NOTE(review): the return type line, the 'out:' label and the code acting
// on 'dounlink' are not visible in this extract.
BeagleQueueIndexer::processone(const string &path,
const struct stat *stp,
FsTreeWalker::CbFlag flg)
if (!m_db) //??
return FsTreeWalker::FtwError;
// Set once the files have been handled and may be removed from the queue
bool dounlink = false;
if (flg != FsTreeWalker::FtwRegular)
return FsTreeWalker::FtwOk;
// Metadata file: same directory, name prefixed with a dot
string dotpath = path_cat(path_getfather(path),
string(".") + path_getsimple(path));
LOGDEB(("BeagleQueueIndexer: prc1: [%s]\n", path.c_str()));
BeagleDotFile dotfile(m_config, dotpath);
Rcl::Doc dotdoc;
string udi, udipath;
if (!dotfile.toDoc(dotdoc))
goto out;
// Have to use the hit type for the udi, because the same url can exist
// as a bookmark or a page.
udipath = path_cat(dotdoc.meta[Rcl::Doc::keybght], url_gpath(dotdoc.url));
make_udi(udipath, "", udi);
LOGDEB(("BeagleQueueIndexer: prc1: udi [%s]\n", udi.c_str()));
// File modification time as a decimal string, used when the document has
// no time of its own.
// NOTE(review): unbounded sprintf() into a 20 byte buffer; snprintf() would
// be safer.
char ascdate[20];
sprintf(ascdate, "%ld", long(stp->st_mtime));
// We only process bookmarks or text/html and text/plain files.
if (!stringlowercmp("bookmark", dotdoc.meta[Rcl::Doc::keybght])) {
// For bookmarks, we just index the doc that was built from the
// metadata.
if (dotdoc.fmtime.empty())
dotdoc.fmtime = ascdate;
char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size);
dotdoc.fbytes = cbuf;
// Document signature for up to date checks: none.
dotdoc.sig = "";
dotdoc.meta[Rcl::Doc::keybcknd] = "BGL";
if (!m_db->addOrUpdate(udi, "", dotdoc))
return FsTreeWalker::FtwError;
// Not a bookmark: only webhistory entries of type text/html or text/plain
// are indexed, everything else is skipped
} else if (stringlowercmp("webhistory", dotdoc.meta[Rcl::Doc::keybght]) ||
(dotdoc.mimetype.compare("text/html") &&
dotdoc.mimetype.compare("text/plain"))) {
LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
dotdoc.meta[Rcl::Doc::keybght].c_str(), dotdoc.mimetype.c_str()));
// Unlink them anyway
dounlink = true;
goto out;
} else {
Rcl::Doc doc;
// Store the dotdoc fields in the future doc. In case someone wants
// to use beagle-generated fields like beagle:inurl
doc.meta = dotdoc.meta;
// NOTE(review): the end of the constructor call below is not visible in
// this extract.
FileInterner interner(path, stp, m_config, m_tmpdir,
string ipath;
FileInterner::Status fis;
try {
fis = interner.internfile(doc, ipath);
} catch (CancelExcept) {
LOGERR(("BeagleQueueIndexer: interrupted\n"));
goto out;
if (fis != FileInterner::FIDone && fis != FileInterner::FIAgain) {
LOGERR(("BeagleQueueIndexer: bad status from internfile\n"));
// TOBEDONE: internfile can return FIAgain here if it is
// paging a big text file, we should loop. Means we're
// only indexing the first page for text/plain files
// bigger than the page size (dlft: 1MB) for now.
goto out;
if (doc.fmtime.empty())
doc.fmtime = ascdate;
char cbuf[100];
sprintf(cbuf, "%ld", (long)stp->st_size);
doc.fbytes = cbuf;
// Document signature for up to date checks: none.
doc.sig = "";
// The url comes from the dot file
doc.url = dotdoc.url;
doc.meta[Rcl::Doc::keybcknd] = "BGL";
if (!m_db->addOrUpdate(udi, "", doc))
return FsTreeWalker::FtwError;
// Copy to cache
// doc fields not in meta, needing saving to the cache
// NOTE(review): the values saved are dotdoc.fmtime / dotdoc.fbytes. The
// visible code sets these only in the bookmark branch; the web page branch
// sets them on 'doc' -- check whether this is intended.
dotfile.m_fields.set("fmtime", dotdoc.fmtime, "");
dotfile.m_fields.set("fbytes", dotdoc.fbytes, "");
dotfile.m_fields.set("udi", udi, "");
// Read the whole main file; the return value of file_to_string() is not
// checked
string fdata;
file_to_string(path, fdata);
if (!m_cache || !m_cache->cc()) {
LOGERR(("BeagleQueueIndexer: cache initialization failed\n"));
goto out;
if (!m_cache->cc()->put(udi, &dotfile.m_fields, fdata, 0)) {
LOGERR(("BeagleQueueIndexer::prc1: cache_put failed; %s\n",
goto out;
// Indexed and saved to the cache: the queue files can go
dounlink = true;
if (dounlink) {
return FsTreeWalker::FtwOk;