Switch to side-by-side view

--- a/src/index/beaglequeue.cpp
+++ b/src/index/beaglequeue.cpp
@@ -18,6 +18,10 @@
 static char rcsid[] = "@(#$Id: $ (C) 2005 J.F.Dockes";
 #endif
 #include "autoconfig.h"
+
+#include <sys/types.h>
+
+#include "autoconfig.h"
 #include "pathut.h"
 #include "debuglog.h"
 #include "fstreewalk.h"
@@ -27,9 +31,14 @@
 #include "internfile.h"
 #include "wipedir.h"
 #include "circache.h"
+#include "indexer.h"
+#include "readfile.h"
+#include "conftree.h"
+#include "transcode.h"
 
 #include <vector>
 #include <fstream>
+#include <sstream>
 using namespace std;
 
 #include <sys/stat.h>
@@ -42,9 +51,7 @@
 public:
     BeagleDotFile(RclConfig *conf, const string& fn)
         : m_conf(conf), m_fn(fn)
-    {
-
-    }
+    { }
 
     bool readLine(string& line)
     {
@@ -92,13 +99,20 @@
             return false;
         doc.mimetype = line;
 
-        if (doc.mimetype.empty() && 
-            !stringlowercmp("bookmark", doc.meta[keybght]))
-            doc.mimetype = "text/plain";
+        // We set the bookmarks mtype as html, the text is empty
+        // anyway, so that the html viewer will be called on 'Open'
+        bool isbookmark = false;
+        if (!stringlowercmp("bookmark", doc.meta[keybght])) {
+            isbookmark = true;
+            doc.mimetype = "text/html";
+        }
 
         string confstr;
         string ss(" ");
-        // Read the rest: fields and keywords
+        // Read the rest: fields and keywords. We do a little
+        // massaging of the input lines, then use a ConfSimple to
+        // parse, and finally insert the key/value pairs into the doc
+        // meta[] array
         for (;;) {
             if (!readLine(line)) {
                 // Eof hopefully
@@ -109,7 +123,6 @@
             line = line.substr(2);
             confstr += line + "\n";
         }
-
         ConfSimple fields(confstr, 1);
         list<string> names = fields.getNames("");
         for (list<string>::iterator it = names.begin();
@@ -118,23 +131,50 @@
             fields.get(*it, value, "");
             if (!value.compare("undefined") || !value.compare("null"))
                 continue;
+
+            string *valuep = &value;
+            string cvalue;
+            if (isbookmark) {
+                // It appears that bookmarks are stored in the users'
+                // locale charset (not too sure). No idea what to do
+                // for other types, would have to check the plugin.
+                string charset = m_conf->getDefCharset(true);
+                transcode(value, cvalue, charset,  "UTF-8"); 
+                valuep = &cvalue;
+            }
+                
             string caname = m_conf->fieldCanon(*it);
-            doc.meta[caname].append(ss + value);
-        }
+            doc.meta[caname].append(ss + *valuep);
+        }
+
+        // Finally build the confsimple that we will save to the
+        // cache, out of document fields. This could also be done in
+        // parallel with the doc.meta build above, but simpler this way.
+        for (map<string,string>::const_iterator it = doc.meta.begin();
+             it != doc.meta.end(); it++) {
+            m_fields.set((*it).first, (*it).second, "");
+        }
+        m_fields.set("url", doc.url, "");
+        m_fields.set("mimetype", doc.mimetype, "");
+
         return true;
     }    
 
     RclConfig *m_conf;
+    ConfSimple m_fields;
     string m_fn;
     ifstream m_input;
 };
 
 const string badtmpdirname = "/no/such/dir/really/can/exist";
-BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf)
-    : m_config(cnf), m_db(cnf)
+BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
+                                       DbIxStatusUpdater *updfunc)
+    : m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc)
 {
+
     if (!m_config->getConfParam("beaglequeuedir", m_queuedir))
         m_queuedir = path_tildexpand("~/.beagle/ToIndex");
+
     if (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0) {
 	string reason;
         if (!maketmpdir(m_tmpdir, reason)) {
@@ -143,12 +183,20 @@
             m_tmpdir = badtmpdirname;
 	}
     }
-    Rcl::Db::OpenMode mode = Rcl::Db::DbUpd;
-    if (!m_db.open(mode)) {
-	LOGERR(("BeagleQueueIndexer: error opening database %s\n", 
-                m_config->getDbDir().c_str()));
-	return;
-    }
+
+    string ccdir;
+    m_config->getConfParam("webcachedir", ccdir);
+    if (ccdir.empty())
+        ccdir = "webcache";
+    ccdir = path_tildexpand(ccdir);
+    // If not an absolute path, compute relative to config dir
+    if (ccdir.at(0) != '/')
+        ccdir = path_cat(m_config->getConfDir(), ccdir);
+
+    int maxmbs = 20;
+    m_config->getConfParam("webcachemaxmbs", &maxmbs);
+    m_cache = new CirCache(ccdir);
+    m_cache->create(off_t(maxmbs)*1000*1024, true);
 }
 
 BeagleQueueIndexer::~BeagleQueueIndexer()
@@ -161,13 +209,106 @@
 		    m_tmpdir.c_str()));
 	}
     }
-    m_db.close();
+    deleteZ(m_cache);
 }
 
-bool BeagleQueueIndexer::processqueue()
+bool BeagleQueueIndexer::indexFromCache(const string& udi)
+{
+    string dict, data;
+
+    // This is horribly inefficient and needs fixing either by saving
+    // the offsets during the forward scan, or using an auxiliary isam
+    // map
+    if (!m_cache->get(udi, dict, data))
+        return false;
+
+    ConfSimple cf(dict, 1);
+
+    string hittype;
+    if (!cf.get(keybght, hittype, "")) {
+        LOGERR(("BeagleIndexer::index: cc entry has no hit type\n"));
+        return false;
+    }
+
+    // Build a doc from saved metadata 
+    Rcl::Doc dotdoc;
+    cf.get("url", dotdoc.url, "");
+    cf.get("mimetype", dotdoc.mimetype, "");
+    cf.get("fmtime", dotdoc.fmtime, "");
+    cf.get("fbytes", dotdoc.fbytes, "");
+    dotdoc.sig = "";
+    list<string> names = cf.getNames("");
+    for (list<string>::const_iterator it = names.begin();
+         it != names.end(); it++) {
+        cf.get(*it, dotdoc.meta[*it], "");
+    }
+
+    if (!stringlowercmp("bookmark", hittype)) {
+        // Just index the dotdoc
+        return m_db->addOrUpdate(udi, "", dotdoc);
+    } else if (stringlowercmp("webhistory", dotdoc.meta[keybght]) ||
+               (dotdoc.mimetype.compare("text/html") &&
+                dotdoc.mimetype.compare("text/plain"))) {
+        LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
+                dotdoc.meta[keybght].c_str(), dotdoc.mimetype.c_str()));
+        return true;
+    } else {
+        Rcl::Doc doc;
+        FileInterner interner(data, m_config, m_tmpdir, 
+                              FileInterner::FIF_doUseInputMimetype,
+                              dotdoc.mimetype);
+        string ipath;
+        FileInterner::Status fis = interner.internfile(doc, ipath);
+        if (fis != FileInterner::FIDone) {
+            LOGERR(("BeagleQueueIndexer: bad status from internfile\n"));
+            return false;
+        }
+
+        doc.mimetype = dotdoc.mimetype;
+        doc.fmtime = dotdoc.fmtime;
+        doc.url = dotdoc.url;
+        doc.fbytes = dotdoc.fbytes;
+        doc.sig = "";
+
+        return m_db->addOrUpdate(udi, "", doc);
+    }
+}
+
+bool BeagleQueueIndexer::index()
 {
     LOGDEB(("BeagleQueueIndexer::processqueue: dir: [%s]\n", 
             m_queuedir.c_str()));
+    m_config->setKeyDir(m_queuedir);
+
+    // First walk the cache to set the existence flags. We do not
+    // actually check uptodateness because all files in the cache are
+    // supposedly already indexed.
+    //TBD: change this as the cache needs reindexing after an index reset!
+    // Also, we need to read the cache backwards so that the newest
+    // version of each file gets indexed? Or find a way to index
+    // multiple versions ?
+    bool eof;
+    if (!m_cache->rewind(eof)) {
+        if (!eof)
+            return false;
+    }
+    vector<string> alludis;
+    alludis.reserve(20000);
+    while (m_cache->next(eof)) {
+        string dict;
+        m_cache->getcurrentdict(dict);
+        ConfSimple cf(dict, 1);
+        string udi;
+        if (!cf.get("udi", udi, ""))
+            continue;
+        alludis.push_back(udi);
+    }
+    for (vector<string>::reverse_iterator it = alludis.rbegin();
+         it != alludis.rend(); it++) {
+        if (m_db->needUpdate(*it, "")) {
+            indexFromCache(*it);
+        }
+    }
 
     FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
     walker.addSkippedName(".*");
@@ -181,12 +322,15 @@
                                const struct stat *stp,
                                FsTreeWalker::CbFlag flg)
 {
+    bool dounlink = false;
+
     if (flg != FsTreeWalker::FtwRegular) 
         return FsTreeWalker::FtwOk;
 
     string dotpath = path_cat(path_getfather(path), 
                               string(".") + path_getsimple(path));
     LOGDEB(("BeagleQueueIndexer: prc1: [%s]\n", path.c_str()));
+
     BeagleDotFile dotfile(m_config, dotpath);
     Rcl::Doc dotdoc;
     string udi, udipath;
@@ -205,12 +349,32 @@
 
     // We only process bookmarks or text/html and text/plain files.
     if (!stringlowercmp("bookmark", dotdoc.meta[keybght])) {
+        // For bookmarks, we just index the doc that was built from the
+        // metadata.
+        if (dotdoc.fmtime.empty())
+            dotdoc.fmtime = ascdate;
+
+        char cbuf[100]; 
+        sprintf(cbuf, "%ld", (long)stp->st_size);
+        dotdoc.fbytes = cbuf;
+
+        // Document signature for up to date checks: none. 
+        dotdoc.sig = "";
+        
+        // doc fields not in meta, needing saving to the cache
+        dotfile.m_fields.set("fmtime", dotdoc.fmtime, "");
+        dotfile.m_fields.set("fbytes", dotdoc.fbytes, "");
+
+        if (!m_db->addOrUpdate(udi, "", dotdoc)) 
+            return FsTreeWalker::FtwError;
 
     } else if (stringlowercmp("webhistory", dotdoc.meta[keybght]) ||
                (dotdoc.mimetype.compare("text/html") &&
                 dotdoc.mimetype.compare("text/plain"))) {
         LOGDEB(("BeagleQueueIndexer: skipping: hittype %s mimetype %s\n",
                 dotdoc.meta[keybght].c_str(), dotdoc.mimetype.c_str()));
+        // Unlink them anyway
+        dounlink = true;
         goto out;
     } else {
         Rcl::Doc doc;
@@ -230,17 +394,34 @@
         char cbuf[100]; 
         sprintf(cbuf, "%ld", (long)stp->st_size);
         doc.fbytes = cbuf;
-        // Document signature for up to date checks: none. The file is
-        // going to be deleted anyway. We always reindex what comes in
-        // the queue.  It would probably be possible to extract some
-        // http data to avoid this.
+        // Document signature for up to date checks: none. 
         doc.sig = "";
         doc.url = dotdoc.url;
-        if (!m_db.addOrUpdate(udi, "", doc)) 
+
+        // doc fields not in meta, needing saving to the cache
+        dotfile.m_fields.set("fmtime", dotdoc.fmtime, "");
+        dotfile.m_fields.set("fbytes", dotdoc.fbytes, "");
+
+        if (!m_db->addOrUpdate(udi, "", doc)) 
             return FsTreeWalker::FtwError;
-    }
+
+    }
+
+    // Copy to cache
+    {
+        stringstream o;
+        dotfile.m_fields.write(o);
+        string fdata;
+        file_to_string(path, fdata);
+        if (!m_cache->put(udi, o.str(), fdata))
+            goto out;
+    }
+
+    dounlink = true;
 out:
-//    unlink(path.c_str());
-//    unlink(dotpath.c_str());
+    if (dounlink) {
+        unlink(path.c_str());
+        unlink(dotpath.c_str());
+    }
     return FsTreeWalker::FtwOk;
 }