--- a/src/index/beaglequeue.cpp
+++ b/src/index/beaglequeue.cpp
@@ -47,16 +47,20 @@
const string keybght("beagleHitType");
-#define LL 2048
-
+
+// Beagle creates a file named .xxx (where xxx is the name for the main file
+// in the queue), to hold external metadata (http or created by Beagle).
+// This class reads the .xxx dotfile and turns it into an Rcl::Doc holder
class BeagleDotFile {
public:
BeagleDotFile(RclConfig *conf, const string& fn)
: m_conf(conf), m_fn(fn)
- { }
-
+ {}
+
+ // Read input line, strip it of eol and return as c++ string
bool readLine(string& line)
{
+ static const int LL = 2048;
char cline[LL];
cline[0] = 0;
m_input.getline(cline, LL-1);
@@ -101,8 +105,8 @@
return false;
doc.mimetype = line;
- // We set the bookmarks mtype as html, the text is empty
- // anyway, so that the html viewer will be called on 'Open'
+ // We set the bookmarks mtype as html (the text is empty
+ // anyway), so that the html viewer will be called on 'Open'
bool isbookmark = false;
if (!stringlowercmp("bookmark", doc.meta[keybght])) {
isbookmark = true;
@@ -150,8 +154,11 @@
}
// Finally build the confsimple that we will save to the
- // cache, out of document fields. This could also be done in
- // parallel with the doc.meta build above, but simpler this way.
+ // cache, from the doc fields. This could also be done in
+ // parallel with the doc.meta build above, but simpler this
+ // way. We need it because not all interesting doc fields are
+ // in the meta array (e.g. mimetype, url), and we want
+ // something homogeneous and easy to save.
for (map<string,string>::const_iterator it = doc.meta.begin();
it != doc.meta.end(); it++) {
m_fields.set((*it).first, (*it).second, "");
@@ -169,6 +176,9 @@
};
const string badtmpdirname = "/no/such/dir/really/can/exist";
+
+// Initialize. Compute paths and create a temporary directory that will be
+// used by internfile()
BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
DbIxStatusUpdater *updfunc)
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
@@ -216,6 +226,8 @@
deleteZ(m_cache);
}
+// Read document from cache. Return the metadata as an Rcl::Doc
+// @param htt Beagle Hit Type
bool BeagleQueueIndexer::getFromCache(const string& udi, Rcl::Doc &dotdoc,
string& data, string *htt)
{
@@ -243,6 +255,7 @@
return true;
}
+// Index document stored in the cache.
bool BeagleQueueIndexer::indexFromCache(const string& udi)
{
if (!m_db)
@@ -304,18 +317,31 @@
{
if (!m_db)
return false;
- LOGDEB(("BeagleQueueIndexer::processqueue: dir: [%s]\n",
- m_queuedir.c_str()));
+ LOGDEB(("BeagleQueueIndexer::processqueue: [%s]\n", m_queuedir.c_str()));
m_config->setKeyDir(m_queuedir);
- // First check that files in the cache are in the index, in case this
- // has been reset. We don't do this when called from indexFiles
+ // First check/index files found in the cache. If the index was reset,
+ // this does real indexing work, else it just sets the existence flags (to
+ // avoid purging). We don't do this when called from indexFiles
if (!m_nocacheindex) {
bool eof;
if (!m_cache->rewind(eof)) {
+ // rewind can return eof if the cache is empty
if (!eof)
return false;
}
+
+ // The cache is walked in chronological order, but we want to
+ // index the newest files first (there can be several versions
+ // of a given file in the cache). We have to reverse the
+ // list. This would be a problem with a big cache, because the
+ // udis can be big (e.g. 150 chars). It would be more
+ // efficiently performed by the cache, which could use the
+ // smaller offsets.
+ //
+ // Another approach would be to just walk chronologically and
+ // reindex all versions: would waste processing but save
+ // memory
vector<string> alludis;
alludis.reserve(20000);
while (m_cache->next(eof)) {
@@ -340,6 +366,7 @@
}
}
+ // Finally index the queue
FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
walker.addSkippedName(".*");
FsTreeWalker::Status status =walker.walk(m_queuedir, *this);
@@ -347,6 +374,7 @@
return true;
}
+// Index a list of files (sent by the real time monitor)
bool BeagleQueueIndexer::indexFiles(list<string>& files)
{
LOGDEB(("BeagleQueueIndexer::indexFiles\n"));
@@ -489,7 +517,6 @@
return FsTreeWalker::FtwError;
}
-
// Copy to cache
{
// doc fields not in meta, needing saving to the cache