--- a/src/index/recollindex.cpp
+++ b/src/index/recollindex.cpp
@@ -1,6 +1,8 @@
#ifndef lint
-static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.2 2004-12-14 17:54:16 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: recollindex.cpp,v 1.3 2004-12-15 15:00:37 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
+
+#include <sys/stat.h>
#include <strings.h>
@@ -14,24 +16,49 @@
#include "rcldb.h"
#include "readfile.h"
#include "indexer.h"
+#include "csguess.h"
+#include "transcode.h"
using namespace std;
-Rcl::Doc* textPlainToDoc(RclConfig *conf, const string &fn,
- const string &mtype)
-{
- return 0;
-}
-
+/**
+ * Turn a text/plain file into a document.
+ *
+ * Reads the whole file, picks the source charset (guessed from the data
+ * when conf->guesscharset is set, conf->defcharset otherwise), converts the
+ * text to UTF-8 and stores the result in docout.
+ *
+ * @param conf   configuration; guesscharset and defcharset are read
+ * @param fn     name of the file to read
+ * @param mtype  mime type of the file (not used by this handler)
+ * @param docout receives the source charset and the UTF-8 text; it is
+ *               assigned only on success
+ * @return true if the document was built, false if the file could not be
+ *         read or its contents could not be transcoded
+ */
+bool textPlainToDoc(RclConfig *conf, const string &fn,
+                    const string &mtype, Rcl::Doc &docout)
+{
+    string otext;
+    if (!file_to_string(fn, otext))
+        return false;
+
+    // Try to guess charset, then convert to utf-8, and fill document fields
+    string charset;
+    if (conf->guesscharset) {
+        charset = csguess(otext, conf->defcharset);
+    } else {
+        charset = conf->defcharset;
+    }
+
+    // NOTE(review): assumes transcode() returns true on success, like the
+    // other helpers called in this file -- confirm against transcode.h.
+    string utf8;
+    if (!transcode(otext, charset, utf8, "UTF-8"))
+        return false;
+
+    // Build a fresh document so no stale field of docout survives.
+    Rcl::Doc out;
+    out.origcharset = charset;
+    out.text = utf8;
+    docout = out;
+    return true;
+}
+
+// Map from mime type to the internal handler function for that type. This
+// could just as well be an if / else-if chain inside getMimeHandler(), but
+// a map reads better.
static map<string, MimeHandlerFunc> ihandlers;
+// Static object whose constructor fills ihandlers, so that the map gets
+// initialized at program start. ihandlers is defined first, above, so it
+// already exists when this constructor runs.
class IHandler_Init {
public:
IHandler_Init() {
ihandlers["text/plain"] = textPlainToDoc;
+ // Add new mime type -> handler associations here when needed
}
};
static IHandler_Init ihandleriniter;
+
/**
* Return handler function for given mime type
@@ -75,6 +102,9 @@
}
}
+/**
+ * Groups the data used while indexing a directory tree
+ */
class DirIndexer {
FsTreeWalker walker;
RclConfig *config;
@@ -95,23 +125,23 @@
+/**
+ * Index the tree rooted at topdir into the database in dbdir.
+ *
+ * Opens the database with Rcl::Db::DbUpd, walks topdir with indexfile() as
+ * the per-entry callback (this object is handed to it as callback data),
+ * then closes the database. Open and close failures are reported on cerr
+ * and end the function; the result of the walk itself is not checked.
+ */
void DirIndexer::index()
{
-#if 0
if (!db.open(dbdir, Rcl::Db::DbUpd)) {
cerr << "Error opening database in " << dbdir << " for " <<
topdir << endl;
return;
}
-#endif
walker.walk(topdir, indexfile, this);
-#if 0
if (!db.close()) {
cerr << "Error closing database in " << dbdir << " for " <<
topdir << endl;
return;
}
-#endif
-}
-
+}
+
+/**
+ * This function gets called for every file and directory found by the
+ * tree walker. Adjust parameters and index files if/when needed.
+ */
FsTreeWalker::Status
indexfile(void *cdata, const std::string &fn, const struct stat *stp,
FsTreeWalker::CbFlag flg)
@@ -144,24 +174,23 @@
return FsTreeWalker::FtwOk;
}
- // Check if file has already been indexed, and has changed since
- // - Make path term,
- // - query db: postlist_begin->docid
- // - fetch doc (get_document(docid)
- // - check date field, maybe skip
+ if (!me->db.needUpdate(fn, stp))
+ return FsTreeWalker::FtwOk;
// Turn file into a document. The document has fields for title, body
// etc., all text converted to utf8
- Rcl::Doc *doc = fun(me->config, fn, mime);
-
-#if 0
+ Rcl::Doc doc;
+ if (!fun(me->config, fn, mime, doc))
+ return FsTreeWalker::FtwOk;
+
// Set up xapian document, add postings and misc fields,
// add to or update database.
- dbadd(doc);
-#endif
+ if (!me->db.add(fn, doc))
+ return FsTreeWalker::FtwError;
return FsTreeWalker::FtwOk;
}
+
int main(int argc, const char **argv)
@@ -180,7 +209,7 @@
}
vector<string> tdl;
if (ConfTree::stringToStrings(topdirs, tdl)) {
- for (int i = 0; i < tdl.size(); i++) {
+ for (unsigned int i = 0; i < tdl.size(); i++) {
string topdir = tdl[i];
cout << topdir << endl;
string dbdir;