--- a/src/internfile/internfile.cpp
+++ b/src/internfile/internfile.cpp
@@ -118,7 +118,6 @@
// Split into "constructor calls init()" to allow use from other constructor
FileInterner::FileInterner(const string &fn, const struct stat *stp,
RclConfig *cnf, int flags, const string *imime)
- : m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
{
LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n");
if (fn.empty()) {
@@ -219,8 +218,18 @@
LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n");
}
+ // Get fields computed from extended attributes. We use the
+ // original file, not the m_fn which may be the uncompressed temp
+ // file
+ if (!m_noxattrs)
+ reapXAttrs(m_cfg, f, m_XAttrsFields);
+
+ // Gather metadata from external commands as configured.
+ reapMetaCmds(m_cfg, f, m_cmdFields);
+
+ m_mimetype = l_mime;
+
// Look for appropriate handler (might still return empty)
- m_mimetype = l_mime;
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
if (!df || df->is_unknown()) {
@@ -234,15 +243,6 @@
m_forPreview ? "view" : "index");
df->set_property(Dijon::Filter::DJF_UDI, udi);
- // Get fields computed from extended attributes. We use the
- // original file, not the m_fn which may be the uncompressed temp
- // file
- if (!m_noxattrs)
- reapXAttrs(m_cfg, f, m_XAttrsFields);
-
- // Gather metadata from external commands as configured.
- reapMetaCmds(m_cfg, f, m_cmdFields);
-
df->set_docsize(docsize);
if (!df->set_document_file(l_mime, m_fn)) {
delete df;
@@ -258,7 +258,6 @@
// Setup from memory data (ie: out of the web cache). imime needs to be set.
FileInterner::FileInterner(const string &data, RclConfig *cnf,
int flags, const string& imime)
- : m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
{
LOGDEB0("FileInterner::FileInterner(data)\n");
initcommon(cnf, flags);
@@ -313,7 +312,7 @@
void FileInterner::initcommon(RclConfig *cnf, int flags)
{
m_cfg = cnf;
- m_forPreview = ((flags & FIF_forPreview) != 0);
+ m_uncomp = m_forPreview = ((flags & FIF_forPreview) != 0);
// Initialize handler stack.
m_handlers.reserve(MAXHANDLERS);
for (unsigned int i = 0; i < MAXHANDLERS; i++)
@@ -324,7 +323,6 @@
}
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
- : m_ok(false), m_missingdatap(0), m_uncomp(((flags & FIF_forPreview) != 0))
{
LOGDEB0("FileInterner::FileInterner(idoc)\n");
initcommon(cnf, flags);
@@ -347,6 +345,9 @@
init(rawdoc.data, cnf, flags, idoc.mimetype);
break;
case DocFetcher::RawDoc::RDK_DATADIRECT:
+ // Note: only used for demo with the sample python external
+ // mbox indexer at this point. The external program is
+ // responsible for all the extraction process.
init(rawdoc.data, cnf, flags, idoc.mimetype);
m_direct = true;
break;
@@ -735,8 +736,8 @@
}
}
if (!setres) {
- LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
- " for mtype " << mimetype << "\n");
+ LOGINFO("FileInterner::addHandler: set_doc failed inside [" << m_fn <<
+ "] for mtype " << mimetype << "\n");
delete newflt;
if (m_forPreview)
return ADD_ERROR;
@@ -918,36 +919,24 @@
TempFile temp(new TempFileInternal(
cnf->getSuffixFromMimeType(mimetype)));
if (!temp->ok()) {
- LOGERR("FileInterner::interntofile: can't create temp file\n");
+ LOGERR("FileInterner::tempFileForMT: can't create temp file\n");
return false;
}
otemp = temp;
return true;
}
-// Extract document (typically subdoc of multidoc) into temporary file.
-// We do the usual internfile stuff: create a temporary directory,
-// then create an interner and call internfile. The target mtype is set to
-// the input mtype, so that no data conversion is performed.
-// We then write the data out of the resulting document into the output file.
-// There are two temporary objects:
-// - The internfile temporary directory gets destroyed by its destructor
-// - The output temporary file which is held in a reference-counted
-// object and will be deleted when done with.
-//
-// If the ipath is null, maybe we're called because the file is not
-// stored in the regular file system. We use the docfetcher to get a
-// copy (in topdocToFile())
-//
-// We currently don't handle the case of an internal doc of a non-fs document.
-
-bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
- RclConfig *cnf, const Rcl::Doc& idoc)
+// Static method, creates a FileInterner object to do the job.
+bool FileInterner::idocToFile(
+ TempFile& otemp, const string& tofile, RclConfig *cnf,
+ const Rcl::Doc& idoc, bool uncompress)
{
LOGDEB("FileInterner::idocToFile\n");
if (idoc.ipath.empty()) {
- return topdocToFile(otemp, tofile, cnf, idoc);
+ // Because of the mandatory first conversion in the
+ // FileInterner constructor, need to use a specific method.
+ return topdocToFile(otemp, tofile, cnf, idoc, uncompress);
}
// We set FIF_forPreview for consistency with the previous version
@@ -958,17 +947,21 @@
return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype);
}
-bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile,
- RclConfig *cnf, const Rcl::Doc& idoc)
+// This is only needed because the FileInterner constructor always performs
+// the first conversion, so that we need another approach for accessing the
+// original document (targetmtype won't do).
+bool FileInterner::topdocToFile(
+ TempFile& otemp, const string& tofile,
+ RclConfig *cnf, const Rcl::Doc& idoc, bool uncompress)
{
DocFetcher *fetcher = docFetcherMake(cnf, idoc);
if (fetcher == 0) {
- LOGERR("FileInterner::idocToFile no backend\n");
+ LOGERR("FileInterner::topdocToFile no backend\n");
return false;
}
DocFetcher::RawDoc rawdoc;
if (!fetcher->fetch(cnf, idoc, rawdoc)) {
- LOGERR("FileInterner::idocToFile fetcher failed\n");
+ LOGERR("FileInterner::topdocToFile fetcher failed\n");
return false;
}
const char *filename = "";
@@ -983,13 +976,24 @@
}
string reason;
switch (rawdoc.kind) {
- case DocFetcher::RawDoc::RDK_FILENAME:
- if (!copyfile(rawdoc.data.c_str(), filename, reason)) {
+ case DocFetcher::RawDoc::RDK_FILENAME: {
+ string fn(rawdoc.data);
+ TempFile temp;
+ if (uncompress && isCompressed(fn, cnf)) {
+ if (!maybeUncompressToTemp(temp, fn, cnf, idoc)) {
+ LOGERR("FileInterner::idocToFile: uncompress failed\n");
+ return false;
+ }
+ }
+ fn = temp ? temp->filename() : rawdoc.data;
+ if (!copyfile(fn.c_str(), filename, reason)) {
LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
return false;
}
+ }
break;
case DocFetcher::RawDoc::RDK_DATA:
+ case DocFetcher::RawDoc::RDK_DATADIRECT:
if (!stringtofile(rawdoc.data, filename, reason)) {
LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n");
return false;
@@ -1019,11 +1023,12 @@
}
// Specialcase text/html. This is to work around a bug that will
- // get fixed some day: internfile initialisation does not check
- // targetmtype, so that at least one conversion is always
- // performed. A common case would be an "Open" on an html file
- // (we'd end up with text/plain content). As the html version is
- // saved in this case, use it.
+ // get fixed some day: the internfile constructor always loads the
+ // first handler so that at least one conversion is always
+ // performed (and the access to the original data may be lost). A
+ // common case is an "Open" on an HTML file (we end up
+ // with text/plain content). As the HTML version is saved in this
+ // case, use it.
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
doc.text = get_html();
doc.mimetype = cstr_texthtml;