--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@@ -1,5 +1,5 @@
#ifndef lint
-static char rcsid[] = "@(#$Id: internfile.cpp,v 1.18 2006-12-13 09:13:18 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: internfile.cpp,v 1.19 2006-12-15 12:40:02 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
* This program is free software; you can redistribute it and/or modify
@@ -32,12 +32,14 @@
#endif /* NO_NAMESPACES */
#include "internfile.h"
+#include "rcldoc.h"
#include "mimetype.h"
#include "debuglog.h"
#include "mimehandler.h"
#include "execmd.h"
#include "pathut.h"
#include "wipedir.h"
+#include "rclconfig.h"
// Execute the command to uncompress a file into a temporary one.
static bool uncompressfile(RclConfig *conf, const string& ifn,
@@ -106,98 +108,262 @@
// internfile
FileInterner::FileInterner(const std::string &f, RclConfig *cnf,
const string& td, const string *imime)
- : m_fn(f), m_cfg(cnf), m_tdir(td), m_handler(0)
-{
- // We are actually going to access the file, so it's ok
- // performancewise to check this config variable at every call
- // even if it can only change when we change directories
- string usfc;
- int usfci;
- if (!cnf->getConfParam("usesystemfilecommand", usfc))
- usfci = 0;
- else
- usfci = atoi(usfc.c_str()) ? 1 : 0;
+ : m_cfg(cnf), m_fn(f), m_forPreview(imime?true:false), m_tdir(td)
+{
+ bool usfci = false;
+ cnf->getConfParam("usesystemfilecommand", &usfci);
LOGDEB1(("FileInterner::FileInterner: usfci now %d\n", usfci));
-
- bool forPreview = imime ? true : false;
// We need to run mime type identification in any case to check
// for a compressed file.
- m_mime = mimetype(m_fn, m_cfg, usfci);
+ string l_mime = mimetype(m_fn, m_cfg, usfci);
// If identification fails, try to use the input parameter. This
// is then normally not a compressed type (it's the mime type from
// the db), and is only set when previewing, not for indexing
- if (m_mime.empty() && imime)
- m_mime = *imime;
-
- if (!m_mime.empty()) {
+ if (l_mime.empty() && imime)
+ l_mime = *imime;
+
+ if (!l_mime.empty()) {
// Has mime: check for a compressed file. If so, create a
// temporary uncompressed file, and rerun the mime type
// identification, then do the rest with the temp file.
list<string>ucmd;
- if (m_cfg->getUncompressor(m_mime, ucmd)) {
+ if (m_cfg->getUncompressor(l_mime, ucmd)) {
if (!uncompressfile(m_cfg, m_fn, ucmd, m_tdir, m_tfile)) {
return;
}
LOGDEB(("internfile: after ucomp: m_tdir %s, tfile %s\n",
m_tdir.c_str(), m_tfile.c_str()));
m_fn = m_tfile;
- m_mime = mimetype(m_fn, m_cfg, usfci);
- if (m_mime.empty() && imime)
- m_mime = *imime;
- }
- }
-
- if (m_mime.empty()) {
+ l_mime = mimetype(m_fn, m_cfg, usfci);
+ if (l_mime.empty() && imime)
+ l_mime = *imime;
+ }
+ }
+
+ if (l_mime.empty()) {
// No mime type. We let it through as config may warrant that
// we index all file names
LOGDEB(("internfile: (no mime) [%s]\n", m_fn.c_str()));
}
// Look for appropriate handler (might still return empty)
- m_handler = getMimeHandler(m_mime, m_cfg);
-
- if (!m_handler) {
+ Dijon::Filter *df = getMimeHandler(l_mime, m_cfg);
+
+ if (!df) {
// No handler for this type, for now :( if indexallfilenames
// is set in the config, this normally wont happen (we get mh_unknown)
- LOGDEB(("FileInterner::FileInterner: %s: no handler\n",
- m_mime.c_str()));
+ LOGDEB(("FileInterner:: no handler for %s\n", l_mime.c_str()));
return;
}
- m_handler->setForPreview(forPreview);
- LOGDEB(("FileInterner::FileInterner: %s [%s]\n", m_mime.c_str(),
+ df->set_property(Dijon::Filter::OPERATING_MODE,
+ m_forPreview ? "view" : "index");
+
+ string charset = m_cfg->getDefCharset();
+ df->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
+ if (!df->set_document_file(m_fn)) {
+ LOGERR(("FileInterner:: error parsing %s\n", m_fn.c_str()));
+ return;
+ }
+ m_handlers.reserve(20);
+ m_handlers.push_back(df);
+ LOGDEB(("FileInterner::FileInterner: %s [%s]\n", l_mime.c_str(),
m_fn.c_str()));
}
+static const unsigned int MAXHANDLERS = 20;
+
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
{
- if (!m_handler) {
- LOGERR(("FileInterner::internfile: no handler !!\n"));
+ if (m_handlers.size() != 1) {
+ LOGERR(("FileInterner::internfile: bad stack size %d !!\n",
+ m_handlers.size()));
return FIError;
}
- // Turn file into a document. The document has fields for title, body
- // etc., all text converted to utf8
- MimeHandler::Status mhs =
- m_handler->mkDoc(m_cfg, m_fn, m_mime, doc, ipath);
- FileInterner::Status ret = FIError;
- switch (mhs) {
- case MimeHandler::MHError:
- LOGERR(("FileInterner::internfile: error parsing %s\n", m_fn.c_str()));
- break;
- case MimeHandler::MHDone: ret = FIDone;break;
- case MimeHandler::MHAgain: ret = FIAgain;break;
- }
-
- doc.mimetype = m_mime;
- return ret;
+ // Note that the vector is big enough for the maximum stack. All values
+ // over the last significant one are ""
+ vector<string> vipath(MAXHANDLERS);
+ int vipathidx = 0;
+ if (!ipath.empty()) {
+ list<string> lipath;
+ stringToTokens(ipath, lipath, "|", true);
+ vipath.insert(vipath.begin(), lipath.begin(), lipath.end());
+ if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
+ LOGERR(("FileInterner::internfile: can't skip\n"));
+ return FIError;
+ }
+ }
+
+
+ /* Try to get doc from the topmost filter */
+ while (!m_handlers.empty()) {
+ if (!vipath.empty()) {
+
+ }
+ if (!m_handlers.back()->has_documents()) {
+ // No docs at the current top level. Pop and see if there
+ // is something at the previous one
+ delete m_handlers.back();
+ m_handlers.pop_back();
+ continue;
+ }
+
+ if (!m_handlers.back()->next_document()) {
+ LOGERR(("FileInterner::internfile: next_document failed\n"));
+ return FIError;
+ }
+
+ // Look at what we've got
+ const std::map<std::string, std::string> *docdata =
+ &m_handlers.back()->get_meta_data();
+ map<string,string>::const_iterator it;
+ string charset;
+ it = docdata->find("charset");
+ if (it != docdata->end())
+ charset = it->second;
+ string mimetype;
+ it = docdata->find("mimetype");
+ if (it != docdata->end())
+ mimetype = it->second;
+
+ LOGDEB(("FileInterner::internfile:next_doc is %s\n",mimetype.c_str()));
+ // If we find a text/plain doc, we're done
+ if (!strcmp(mimetype.c_str(), "text/plain"))
+ break;
+
+ // Got a non text/plain doc. We need to stack another
+ // filter. Check current size
+ if (m_handlers.size() > MAXHANDLERS) {
+ // Stack too big. Skip this and go on to check if there is
+ // something else in the current back()
+ LOGDEB(("FileInterner::internfile: stack too high\n"));
+ continue;
+ }
+
+ Dijon::Filter *again = getMimeHandler(mimetype, m_cfg);
+ if (!again) {
+ // If we can't find a filter, this doc can't be handled
+ // but there can be other ones so we go on
+ LOGERR(("FileInterner::internfile: no filter for [%s]\n",
+ mimetype.c_str()));
+ continue;
+ }
+ again->set_property(Dijon::Filter::OPERATING_MODE,
+ m_forPreview ? "view" : "index");
+ again->set_property(Dijon::Filter::DEFAULT_CHARSET,
+ charset);
+ string ns;
+ const string *txt = &ns;
+ it = docdata->find("content");
+ if (it != docdata->end())
+ txt = &it->second;
+ if (!again->set_document_string(*txt)) {
+ LOGERR(("FileInterner::internfile: error reparsing for %s\n",
+ m_fn.c_str()));
+ delete again;
+ continue;
+ }
+ // add filter and go on
+ m_handlers.push_back(again);
+ if (!m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
+ LOGERR(("FileInterner::internfile: can't skip\n"));
+ return FIError;
+ }
+ }
+
+ if (m_handlers.empty()) {
+ LOGERR(("FileInterner::internfile: stack empty\n"));
+ return FIError;
+ }
+ if (!m_forPreview) {
+ string &ipath = doc.ipath;
+ bool hasipath = false;
+ for (vector<Dijon::Filter*>::const_iterator it = m_handlers.begin();
+ it != m_handlers.end(); it++) {
+ map<string,string>::const_iterator iti =
+ (*it)->get_meta_data().find("ipath");
+ if (iti != (*it)->get_meta_data().end()) {
+ if (!iti->second.empty())
+ hasipath = true;
+ ipath += iti->second + "|";
+ } else {
+ ipath += "|";
+ }
+ }
+ if (hasipath) {
+ LOGDEB(("IPATH [%s]\n", ipath.c_str()));
+ string::size_type sit = ipath.find_last_not_of("|");
+ if (sit == string::npos)
+ ipath.erase();
+ else if (sit < ipath.length() -1)
+ ipath.erase(sit+1);
+ } else {
+ ipath.erase();
+ }
+ }
+
+ dijontorcl(m_handlers.back(), doc);
+
+ // Destack what can be
+ while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
+ delete m_handlers.back();
+ m_handlers.pop_back();
+ }
+ if (m_handlers.empty() || !m_handlers.back()->has_documents())
+ return FIDone;
+ else
+ return FIAgain;
+}
+
+
+bool FileInterner::dijontorcl(Dijon::Filter *df, Rcl::Doc& doc)
+{
+ const std::map<std::string, std::string> *docdata = &df->get_meta_data();
+ map<string,string>::const_iterator it;
+
+ it = docdata->find("mimetype");
+ if (it != docdata->end())
+ doc.mimetype = it->second;
+
+ it = docdata->find("origcharset");
+ if (it != docdata->end())
+ doc.origcharset = it->second;
+
+ it = docdata->find("content");
+ if (it != docdata->end())
+ doc.text = it->second;
+
+ it = docdata->find("title");
+ if (it != docdata->end())
+ doc.title = it->second;
+
+ it = docdata->find("keywords");
+ if (it != docdata->end())
+ doc.keywords = it->second;
+
+ it = docdata->find("modificationdate");
+ if (it != docdata->end())
+ doc.dmtime = it->second;
+
+ it = docdata->find("abstract");
+ if (it != docdata->end()) {
+ doc.abstract = it->second;
+ } else {
+ it = docdata->find("sample");
+ if (it != docdata->end())
+ doc.abstract = it->second;
+ }
+ return true;
}
FileInterner::~FileInterner()
{
- delete m_handler;
- m_handler = 0;
+ while (!m_handlers.empty()) {
+ delete m_handlers.back();
+ m_handlers.pop_back();
+ }
tmpcleanup();
}
@@ -212,6 +378,8 @@
#include "debuglog.h"
#include "rclinit.h"
#include "internfile.h"
+#include "rclconfig.h"
+#include "rcldoc.h"
static string thisprog;