|
a/src/internfile/internfile.cpp |
|
b/src/internfile/internfile.cpp |
|
... |
|
... |
116 |
// Empty handler on return says that we're in error, this will be
|
116 |
// Empty handler on return says that we're in error, this will be
|
117 |
// processed by the first call to internfile().
|
117 |
// processed by the first call to internfile().
|
118 |
// Split into "constructor calls init()" to allow use from other constructor
|
118 |
// Split into "constructor calls init()" to allow use from other constructor
|
119 |
FileInterner::FileInterner(const string &fn, const struct stat *stp,
|
119 |
FileInterner::FileInterner(const string &fn, const struct stat *stp,
|
120 |
RclConfig *cnf, int flags, const string *imime)
|
120 |
RclConfig *cnf, int flags, const string *imime)
|
121 |
: m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
|
|
|
122 |
{
|
121 |
{
|
123 |
LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n");
|
122 |
LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n");
|
124 |
if (fn.empty()) {
|
123 |
if (fn.empty()) {
|
125 |
LOGERR("FileInterner::FileInterner: empty file name!\n");
|
124 |
LOGERR("FileInterner::FileInterner: empty file name!\n");
|
126 |
return;
|
125 |
return;
|
|
... |
|
... |
217 |
// No mime type. We let it through as config may warrant that
|
216 |
// No mime type. We let it through as config may warrant that
|
218 |
// we index all file names
|
217 |
// we index all file names
|
219 |
LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n");
|
218 |
LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n");
|
220 |
}
|
219 |
}
|
221 |
|
220 |
|
|
|
221 |
// Get fields computed from extended attributes. We use the
|
|
|
222 |
// original file, not the m_fn which may be the uncompressed temp
|
|
|
223 |
// file
|
|
|
224 |
if (!m_noxattrs)
|
|
|
225 |
reapXAttrs(m_cfg, f, m_XAttrsFields);
|
|
|
226 |
|
|
|
227 |
// Gather metadata from external commands as configured.
|
|
|
228 |
reapMetaCmds(m_cfg, f, m_cmdFields);
|
|
|
229 |
|
|
|
230 |
m_mimetype = l_mime;
|
|
|
231 |
|
222 |
// Look for appropriate handler (might still return empty)
|
232 |
// Look for appropriate handler (might still return empty)
|
223 |
m_mimetype = l_mime;
|
|
|
224 |
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
|
233 |
RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
|
225 |
|
234 |
|
226 |
if (!df || df->is_unknown()) {
|
235 |
if (!df || df->is_unknown()) {
|
227 |
// No real handler for this type, for now :(
|
236 |
// No real handler for this type, for now :(
|
228 |
LOGDEB("FileInterner:: unprocessed mime: [" << l_mime << "] [" << f <<
|
237 |
LOGDEB("FileInterner:: unprocessed mime: [" << l_mime << "] [" << f <<
|
|
... |
|
... |
232 |
}
|
241 |
}
|
233 |
df->set_property(Dijon::Filter::OPERATING_MODE,
|
242 |
df->set_property(Dijon::Filter::OPERATING_MODE,
|
234 |
m_forPreview ? "view" : "index");
|
243 |
m_forPreview ? "view" : "index");
|
235 |
df->set_property(Dijon::Filter::DJF_UDI, udi);
|
244 |
df->set_property(Dijon::Filter::DJF_UDI, udi);
|
236 |
|
245 |
|
237 |
// Get fields computed from extended attributes. We use the
|
|
|
238 |
// original file, not the m_fn which may be the uncompressed temp
|
|
|
239 |
// file
|
|
|
240 |
if (!m_noxattrs)
|
|
|
241 |
reapXAttrs(m_cfg, f, m_XAttrsFields);
|
|
|
242 |
|
|
|
243 |
// Gather metadata from external commands as configured.
|
|
|
244 |
reapMetaCmds(m_cfg, f, m_cmdFields);
|
|
|
245 |
|
|
|
246 |
df->set_docsize(docsize);
|
246 |
df->set_docsize(docsize);
|
247 |
if (!df->set_document_file(l_mime, m_fn)) {
|
247 |
if (!df->set_document_file(l_mime, m_fn)) {
|
248 |
delete df;
|
248 |
delete df;
|
249 |
LOGERR("FileInterner:: error converting " << m_fn << "\n");
|
249 |
LOGERR("FileInterner:: error converting " << m_fn << "\n");
|
250 |
return;
|
250 |
return;
|
|
... |
|
... |
256 |
}
|
256 |
}
|
257 |
|
257 |
|
258 |
// Setup from memory data (ie: out of the web cache). imime needs to be set.
|
258 |
// Setup from memory data (ie: out of the web cache). imime needs to be set.
|
259 |
FileInterner::FileInterner(const string &data, RclConfig *cnf,
|
259 |
FileInterner::FileInterner(const string &data, RclConfig *cnf,
|
260 |
int flags, const string& imime)
|
260 |
int flags, const string& imime)
|
261 |
: m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
|
|
|
262 |
{
|
261 |
{
|
263 |
LOGDEB0("FileInterner::FileInterner(data)\n");
|
262 |
LOGDEB0("FileInterner::FileInterner(data)\n");
|
264 |
initcommon(cnf, flags);
|
263 |
initcommon(cnf, flags);
|
265 |
init(data, cnf, flags, imime);
|
264 |
init(data, cnf, flags, imime);
|
266 |
}
|
265 |
}
|
|
... |
|
... |
311 |
}
|
310 |
}
|
312 |
|
311 |
|
313 |
void FileInterner::initcommon(RclConfig *cnf, int flags)
|
312 |
void FileInterner::initcommon(RclConfig *cnf, int flags)
|
314 |
{
|
313 |
{
|
315 |
m_cfg = cnf;
|
314 |
m_cfg = cnf;
|
316 |
m_forPreview = ((flags & FIF_forPreview) != 0);
|
315 |
m_uncomp = m_forPreview = ((flags & FIF_forPreview) != 0);
|
317 |
// Initialize handler stack.
|
316 |
// Initialize handler stack.
|
318 |
m_handlers.reserve(MAXHANDLERS);
|
317 |
m_handlers.reserve(MAXHANDLERS);
|
319 |
for (unsigned int i = 0; i < MAXHANDLERS; i++)
|
318 |
for (unsigned int i = 0; i < MAXHANDLERS; i++)
|
320 |
m_tmpflgs[i] = false;
|
319 |
m_tmpflgs[i] = false;
|
321 |
m_targetMType = cstr_textplain;
|
320 |
m_targetMType = cstr_textplain;
|
322 |
m_cfg->getConfParam("noxattrfields", &m_noxattrs);
|
321 |
m_cfg->getConfParam("noxattrfields", &m_noxattrs);
|
323 |
m_direct = false;
|
322 |
m_direct = false;
|
324 |
}
|
323 |
}
|
325 |
|
324 |
|
326 |
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
|
325 |
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
|
327 |
: m_ok(false), m_missingdatap(0), m_uncomp(((flags & FIF_forPreview) != 0))
|
|
|
328 |
{
|
326 |
{
|
329 |
LOGDEB0("FileInterner::FileInterner(idoc)\n");
|
327 |
LOGDEB0("FileInterner::FileInterner(idoc)\n");
|
330 |
initcommon(cnf, flags);
|
328 |
initcommon(cnf, flags);
|
331 |
|
329 |
|
332 |
DocFetcher *fetcher = docFetcherMake(cnf, idoc);
|
330 |
DocFetcher *fetcher = docFetcherMake(cnf, idoc);
|
|
... |
|
... |
345 |
break;
|
343 |
break;
|
346 |
case DocFetcher::RawDoc::RDK_DATA:
|
344 |
case DocFetcher::RawDoc::RDK_DATA:
|
347 |
init(rawdoc.data, cnf, flags, idoc.mimetype);
|
345 |
init(rawdoc.data, cnf, flags, idoc.mimetype);
|
348 |
break;
|
346 |
break;
|
349 |
case DocFetcher::RawDoc::RDK_DATADIRECT:
|
347 |
case DocFetcher::RawDoc::RDK_DATADIRECT:
|
|
|
348 |
// Note: only used for demo with the sample python external
|
|
|
349 |
// mbox indexer at this point. The external program is
|
|
|
350 |
// responsible for all the extraction process.
|
350 |
init(rawdoc.data, cnf, flags, idoc.mimetype);
|
351 |
init(rawdoc.data, cnf, flags, idoc.mimetype);
|
351 |
m_direct = true;
|
352 |
m_direct = true;
|
352 |
break;
|
353 |
break;
|
353 |
default:
|
354 |
default:
|
354 |
LOGERR("FileInterner::FileInterner(idoc): bad rawdoc kind ??\n");
|
355 |
LOGERR("FileInterner::FileInterner(idoc): bad rawdoc kind ??\n");
|
|
... |
|
... |
733 |
m_imgtmp = m_tempfiles.back();
|
734 |
m_imgtmp = m_tempfiles.back();
|
734 |
}
|
735 |
}
|
735 |
}
|
736 |
}
|
736 |
}
|
737 |
}
|
737 |
if (!setres) {
|
738 |
if (!setres) {
|
738 |
LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
|
739 |
LOGINFO("FileInterner::addHandler: set_doc failed inside [" << m_fn <<
|
739 |
" for mtype " << mimetype << "\n");
|
740 |
"] for mtype " << mimetype << "\n");
|
740 |
delete newflt;
|
741 |
delete newflt;
|
741 |
if (m_forPreview)
|
742 |
if (m_forPreview)
|
742 |
return ADD_ERROR;
|
743 |
return ADD_ERROR;
|
743 |
return ADD_CONTINUE;
|
744 |
return ADD_CONTINUE;
|
744 |
}
|
745 |
}
|
|
... |
|
... |
916 |
const string& mimetype)
|
917 |
const string& mimetype)
|
917 |
{
|
918 |
{
|
918 |
TempFile temp(new TempFileInternal(
|
919 |
TempFile temp(new TempFileInternal(
|
919 |
cnf->getSuffixFromMimeType(mimetype)));
|
920 |
cnf->getSuffixFromMimeType(mimetype)));
|
920 |
if (!temp->ok()) {
|
921 |
if (!temp->ok()) {
|
921 |
LOGERR("FileInterner::interntofile: can't create temp file\n");
|
922 |
LOGERR("FileInterner::tempFileForMT: can't create temp file\n");
|
922 |
return false;
|
923 |
return false;
|
923 |
}
|
924 |
}
|
924 |
otemp = temp;
|
925 |
otemp = temp;
|
925 |
return true;
|
926 |
return true;
|
926 |
}
|
927 |
}
|
927 |
|
928 |
|
928 |
// Extract document (typically subdoc of multidoc) into temporary file.
|
929 |
// Static method, creates a FileInterner object to do the job.
|
929 |
// We do the usual internfile stuff: create a temporary directory,
|
930 |
bool FileInterner::idocToFile(
|
930 |
// then create an interner and call internfile. The target mtype is set to
|
931 |
TempFile& otemp, const string& tofile, RclConfig *cnf,
|
931 |
// the input mtype, so that no data conversion is performed.
|
932 |
const Rcl::Doc& idoc, bool uncompress)
|
932 |
// We then write the data out of the resulting document into the output file.
|
|
|
933 |
// There are two temporary objects:
|
|
|
934 |
// - The internfile temporary directory gets destroyed by its destructor
|
|
|
935 |
// - The output temporary file which is held in a reference-counted
|
|
|
936 |
// object and will be deleted when done with.
|
|
|
937 |
//
|
|
|
938 |
// If the ipath is null, maybe we're called because the file is not
|
|
|
939 |
// stored in the regular file system. We use the docfetcher to get a
|
|
|
940 |
// copy (in topdocToFile())
|
|
|
941 |
//
|
|
|
942 |
// We currently don't handle the case of an internal doc of a non-fs document.
|
|
|
943 |
|
|
|
944 |
bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
|
|
|
945 |
RclConfig *cnf, const Rcl::Doc& idoc)
|
|
|
946 |
{
|
933 |
{
|
947 |
LOGDEB("FileInterner::idocToFile\n");
|
934 |
LOGDEB("FileInterner::idocToFile\n");
|
948 |
|
935 |
|
949 |
if (idoc.ipath.empty()) {
|
936 |
if (idoc.ipath.empty()) {
|
|
|
937 |
// Because of the mandatory first conversion in the
|
|
|
938 |
// FileInterner constructor, need to use a specific method.
|
950 |
return topdocToFile(otemp, tofile, cnf, idoc);
|
939 |
return topdocToFile(otemp, tofile, cnf, idoc, uncompress);
|
951 |
}
|
940 |
}
|
952 |
|
941 |
|
953 |
// We set FIF_forPreview for consistency with the previous version
|
942 |
// We set FIF_forPreview for consistency with the previous version
|
954 |
// which determined this by looking at mtype!=null. Probably
|
943 |
// which determined this by looking at mtype!=null. Probably
|
955 |
// doesn't change anything in this case.
|
944 |
// doesn't change anything in this case.
|
956 |
FileInterner interner(idoc, cnf, FIF_forPreview);
|
945 |
FileInterner interner(idoc, cnf, FIF_forPreview);
|
957 |
interner.setTargetMType(idoc.mimetype);
|
946 |
interner.setTargetMType(idoc.mimetype);
|
958 |
return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype);
|
947 |
return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype);
|
959 |
}
|
948 |
}
|
960 |
|
949 |
|
961 |
bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile,
|
950 |
// This is only needed because the FileInterner constructor always performs
|
962 |
RclConfig *cnf, const Rcl::Doc& idoc)
|
951 |
// the first conversion, so that we need another approach for accessing the
|
|
|
952 |
// original document (targetmtype won't do).
|
|
|
953 |
bool FileInterner::topdocToFile(
|
|
|
954 |
TempFile& otemp, const string& tofile,
|
|
|
955 |
RclConfig *cnf, const Rcl::Doc& idoc, bool uncompress)
|
963 |
{
|
956 |
{
|
964 |
DocFetcher *fetcher = docFetcherMake(cnf, idoc);
|
957 |
DocFetcher *fetcher = docFetcherMake(cnf, idoc);
|
965 |
if (fetcher == 0) {
|
958 |
if (fetcher == 0) {
|
966 |
LOGERR("FileInterner::idocToFile no backend\n");
|
959 |
LOGERR("FileInterner::topdocToFile no backend\n");
|
967 |
return false;
|
960 |
return false;
|
968 |
}
|
961 |
}
|
969 |
DocFetcher::RawDoc rawdoc;
|
962 |
DocFetcher::RawDoc rawdoc;
|
970 |
if (!fetcher->fetch(cnf, idoc, rawdoc)) {
|
963 |
if (!fetcher->fetch(cnf, idoc, rawdoc)) {
|
971 |
LOGERR("FileInterner::idocToFile fetcher failed\n");
|
964 |
LOGERR("FileInterner::topdocToFile fetcher failed\n");
|
972 |
return false;
|
965 |
return false;
|
973 |
}
|
966 |
}
|
974 |
const char *filename = "";
|
967 |
const char *filename = "";
|
975 |
TempFile temp;
|
968 |
TempFile temp;
|
976 |
if (tofile.empty()) {
|
969 |
if (tofile.empty()) {
|
|
... |
|
... |
981 |
} else {
|
974 |
} else {
|
982 |
filename = tofile.c_str();
|
975 |
filename = tofile.c_str();
|
983 |
}
|
976 |
}
|
984 |
string reason;
|
977 |
string reason;
|
985 |
switch (rawdoc.kind) {
|
978 |
switch (rawdoc.kind) {
|
986 |
case DocFetcher::RawDoc::RDK_FILENAME:
|
979 |
case DocFetcher::RawDoc::RDK_FILENAME: {
|
|
|
980 |
string fn(rawdoc.data);
|
|
|
981 |
TempFile temp;
|
|
|
982 |
if (uncompress && isCompressed(fn, cnf)) {
|
|
|
983 |
if (!maybeUncompressToTemp(temp, fn, cnf, idoc)) {
|
|
|
984 |
LOGERR("FileInterner::idocToFile: uncompress failed\n");
|
|
|
985 |
return false;
|
|
|
986 |
}
|
|
|
987 |
}
|
|
|
988 |
fn = temp ? temp->filename() : rawdoc.data;
|
987 |
if (!copyfile(rawdoc.data.c_str(), filename, reason)) {
|
989 |
if (!copyfile(fn.c_str(), filename, reason)) {
|
988 |
LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
|
990 |
LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
|
989 |
return false;
|
991 |
return false;
|
990 |
}
|
992 |
}
|
|
|
993 |
}
|
991 |
break;
|
994 |
break;
|
992 |
case DocFetcher::RawDoc::RDK_DATA:
|
995 |
case DocFetcher::RawDoc::RDK_DATA:
|
|
|
996 |
case DocFetcher::RawDoc::RDK_DATADIRECT:
|
993 |
if (!stringtofile(rawdoc.data, filename, reason)) {
|
997 |
if (!stringtofile(rawdoc.data, filename, reason)) {
|
994 |
LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n");
|
998 |
LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n");
|
995 |
return false;
|
999 |
return false;
|
996 |
}
|
1000 |
}
|
997 |
break;
|
1001 |
break;
|
|
... |
|
... |
1017 |
LOGERR("FileInterner::interntofile: internfile() failed\n");
|
1021 |
LOGERR("FileInterner::interntofile: internfile() failed\n");
|
1018 |
return false;
|
1022 |
return false;
|
1019 |
}
|
1023 |
}
|
1020 |
|
1024 |
|
1021 |
// Specialcase text/html. This is to work around a bug that will
|
1025 |
// Specialcase text/html. This is to work around a bug that will
|
1022 |
// get fixed some day: internfile initialisation does not check
|
1026 |
// get fixed some day: the internfile constructor always loads the
|
1023 |
// targetmtype, so that at least one conversion is always
|
1027 |
// first handler so that at least one conversion is always
|
1024 |
// performed. A common case would be an "Open" on an html file
|
1028 |
// performed (and the access to the original data may be lost). A
|
1025 |
// (we'd end up with text/plain content). As the html version is
|
1029 |
// common case is an "Open" on an HTML file (we end up
|
1026 |
// saved in this case, use it.
|
1030 |
// with text/plain content). As the HTML version is saved in this
|
|
|
1031 |
// case, use it.
|
1027 |
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
|
1032 |
if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
|
1028 |
doc.text = get_html();
|
1033 |
doc.text = get_html();
|
1029 |
doc.mimetype = cstr_texthtml;
|
1034 |
doc.mimetype = cstr_texthtml;
|
1030 |
}
|
1035 |
}
|
1031 |
|
1036 |
|