Switch to unified view

a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp
...
...
116
// Empty handler on return says that we're in error, this will be
116
// Empty handler on return says that we're in error, this will be
117
// processed by the first call to internfile().
117
// processed by the first call to internfile().
118
// Split into "constructor calls init()" to allow use from other constructor
118
// Split into "constructor calls init()" to allow use from other constructor
119
FileInterner::FileInterner(const string &fn, const struct stat *stp,
119
FileInterner::FileInterner(const string &fn, const struct stat *stp,
120
               RclConfig *cnf, int flags, const string *imime)
120
               RclConfig *cnf, int flags, const string *imime)
121
    : m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
122
{
121
{
123
    LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n");
122
    LOGDEB0("FileInterner::FileInterner(fn=" << fn << ")\n");
124
    if (fn.empty()) {
123
    if (fn.empty()) {
125
    LOGERR("FileInterner::FileInterner: empty file name!\n");
124
    LOGERR("FileInterner::FileInterner: empty file name!\n");
126
    return;
125
    return;
...
...
217
    // No mime type. We let it through as config may warrant that
216
    // No mime type. We let it through as config may warrant that
218
    // we index all file names
217
    // we index all file names
219
    LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n");
218
    LOGDEB0("FileInterner:: no mime: [" << m_fn << "]\n");
220
    }
219
    }
221
220
221
    // Get fields computed from extended attributes. We use the
222
    // original file, not the m_fn which may be the uncompressed temp
223
    // file
224
    if (!m_noxattrs)
225
  reapXAttrs(m_cfg, f, m_XAttrsFields);
226
227
    // Gather metadata from external commands as configured.
228
    reapMetaCmds(m_cfg, f, m_cmdFields);
229
230
    m_mimetype = l_mime;
231
222
    // Look for appropriate handler (might still return empty)
232
    // Look for appropriate handler (might still return empty)
223
    m_mimetype = l_mime;
224
    RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
233
    RecollFilter *df = getMimeHandler(l_mime, m_cfg, !m_forPreview);
225
234
226
    if (!df || df->is_unknown()) {
235
    if (!df || df->is_unknown()) {
227
    // No real handler for this type, for now :( 
236
    // No real handler for this type, for now :( 
228
    LOGDEB("FileInterner:: unprocessed mime: [" << l_mime << "] [" << f <<
237
    LOGDEB("FileInterner:: unprocessed mime: [" << l_mime << "] [" << f <<
...
...
232
    }
241
    }
233
    df->set_property(Dijon::Filter::OPERATING_MODE, 
242
    df->set_property(Dijon::Filter::OPERATING_MODE, 
234
             m_forPreview ? "view" : "index");
243
             m_forPreview ? "view" : "index");
235
    df->set_property(Dijon::Filter::DJF_UDI, udi);
244
    df->set_property(Dijon::Filter::DJF_UDI, udi);
236
245
237
    // Get fields computed from extended attributes. We use the
238
    // original file, not the m_fn which may be the uncompressed temp
239
    // file
240
    if (!m_noxattrs)
241
  reapXAttrs(m_cfg, f, m_XAttrsFields);
242
243
    // Gather metadata from external commands as configured.
244
    reapMetaCmds(m_cfg, f, m_cmdFields);
245
246
    df->set_docsize(docsize);
246
    df->set_docsize(docsize);
247
    if (!df->set_document_file(l_mime, m_fn)) {
247
    if (!df->set_document_file(l_mime, m_fn)) {
248
    delete df;
248
    delete df;
249
    LOGERR("FileInterner:: error converting " << m_fn << "\n");
249
    LOGERR("FileInterner:: error converting " << m_fn << "\n");
250
    return;
250
    return;
...
...
256
}
256
}
257
257
258
// Setup from memory data (ie: out of the web cache). imime needs to be set.
258
// Setup from memory data (ie: out of the web cache). imime needs to be set.
259
FileInterner::FileInterner(const string &data, RclConfig *cnf, 
259
FileInterner::FileInterner(const string &data, RclConfig *cnf, 
260
                           int flags, const string& imime)
260
                           int flags, const string& imime)
261
    : m_ok(false), m_missingdatap(0), m_uncomp((flags & FIF_forPreview) != 0)
262
{
261
{
263
    LOGDEB0("FileInterner::FileInterner(data)\n");
262
    LOGDEB0("FileInterner::FileInterner(data)\n");
264
    initcommon(cnf, flags);
263
    initcommon(cnf, flags);
265
    init(data, cnf, flags, imime);
264
    init(data, cnf, flags, imime);
266
}
265
}
...
...
311
}
310
}
312
311
313
void FileInterner::initcommon(RclConfig *cnf, int flags)
312
void FileInterner::initcommon(RclConfig *cnf, int flags)
314
{
313
{
315
    m_cfg = cnf;
314
    m_cfg = cnf;
316
    m_forPreview = ((flags & FIF_forPreview) != 0);
315
    m_uncomp = m_forPreview = ((flags & FIF_forPreview) != 0);
317
    // Initialize handler stack.
316
    // Initialize handler stack.
318
    m_handlers.reserve(MAXHANDLERS);
317
    m_handlers.reserve(MAXHANDLERS);
319
    for (unsigned int i = 0; i < MAXHANDLERS; i++)
318
    for (unsigned int i = 0; i < MAXHANDLERS; i++)
320
    m_tmpflgs[i] = false;
319
    m_tmpflgs[i] = false;
321
    m_targetMType = cstr_textplain;
320
    m_targetMType = cstr_textplain;
322
    m_cfg->getConfParam("noxattrfields", &m_noxattrs);
321
    m_cfg->getConfParam("noxattrfields", &m_noxattrs);
323
    m_direct = false;
322
    m_direct = false;
324
}
323
}
325
324
326
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
325
FileInterner::FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags)
327
    : m_ok(false), m_missingdatap(0), m_uncomp(((flags & FIF_forPreview) != 0))
328
{
326
{
329
    LOGDEB0("FileInterner::FileInterner(idoc)\n");
327
    LOGDEB0("FileInterner::FileInterner(idoc)\n");
330
    initcommon(cnf, flags);
328
    initcommon(cnf, flags);
331
329
332
    DocFetcher *fetcher = docFetcherMake(cnf, idoc);
330
    DocFetcher *fetcher = docFetcherMake(cnf, idoc);
...
...
345
    break;
343
    break;
346
    case DocFetcher::RawDoc::RDK_DATA:
344
    case DocFetcher::RawDoc::RDK_DATA:
347
        init(rawdoc.data, cnf, flags, idoc.mimetype);
345
        init(rawdoc.data, cnf, flags, idoc.mimetype);
348
    break;
346
    break;
349
    case DocFetcher::RawDoc::RDK_DATADIRECT:
347
    case DocFetcher::RawDoc::RDK_DATADIRECT:
348
        // Note: only used for demo with the sample python external
349
        // mbox indexer at this point. The external program is
350
        // responsible for all the extraction process.
350
        init(rawdoc.data, cnf, flags, idoc.mimetype);
351
        init(rawdoc.data, cnf, flags, idoc.mimetype);
351
        m_direct = true;
352
        m_direct = true;
352
        break;
353
        break;
353
    default:
354
    default:
354
    LOGERR("FileInterner::FileInterner(idoc): bad rawdoc kind ??\n");
355
    LOGERR("FileInterner::FileInterner(idoc): bad rawdoc kind ??\n");
...
...
733
        m_imgtmp = m_tempfiles.back();
734
        m_imgtmp = m_tempfiles.back();
734
        }
735
        }
735
    }
736
    }
736
    }
737
    }
737
    if (!setres) {
738
    if (!setres) {
738
    LOGINFO("FileInterner::addHandler: set_doc failed inside " << m_fn <<
739
    LOGINFO("FileInterner::addHandler: set_doc failed inside [" << m_fn <<
739
                "  for mtype " << mimetype << "\n");
740
                "]  for mtype " << mimetype << "\n");
740
    delete newflt;
741
    delete newflt;
741
    if (m_forPreview)
742
    if (m_forPreview)
742
        return ADD_ERROR;
743
        return ADD_ERROR;
743
    return ADD_CONTINUE;
744
    return ADD_CONTINUE;
744
    }
745
    }
...
...
916
                                 const string& mimetype)
917
                                 const string& mimetype)
917
{
918
{
918
    TempFile temp(new TempFileInternal(
919
    TempFile temp(new TempFileInternal(
919
                      cnf->getSuffixFromMimeType(mimetype)));
920
                      cnf->getSuffixFromMimeType(mimetype)));
920
    if (!temp->ok()) {
921
    if (!temp->ok()) {
921
        LOGERR("FileInterner::interntofile: can't create temp file\n");
922
        LOGERR("FileInterner::tempFileForMT: can't create temp file\n");
922
        return false;
923
        return false;
923
    }
924
    }
924
    otemp = temp;
925
    otemp = temp;
925
    return true;
926
    return true;
926
}
927
}
927
928
928
// Extract document (typically subdoc of multidoc) into temporary file. 
929
// Static method, creates a FileInterner object to do the job.
929
// We do the usual internfile stuff: create a temporary directory,
930
bool FileInterner::idocToFile(
930
// then create an interner and call internfile. The target mtype is set to
931
    TempFile& otemp, const string& tofile, RclConfig *cnf,
931
// the input mtype, so that no data conversion is performed.
932
    const Rcl::Doc& idoc, bool uncompress)
932
// We then write the data out of the resulting document into the output file.
933
// There are two temporary objects:
934
// - The internfile temporary directory gets destroyed by its destructor
935
// - The output temporary file which is held in a reference-counted
936
//   object and will be deleted when done with.
937
//
938
// If the ipath is null, maybe we're called because the file is not
939
// stored in the regular file system. We use the docfetcher to get a
940
// copy (in topdocToFile())
941
// 
942
// We currently don't handle the case of an internal doc of a non-fs document.
943
944
bool FileInterner::idocToFile(TempFile& otemp, const string& tofile,
945
                RclConfig *cnf, const Rcl::Doc& idoc)
946
{
933
{
947
    LOGDEB("FileInterner::idocToFile\n");
934
    LOGDEB("FileInterner::idocToFile\n");
948
935
949
    if (idoc.ipath.empty()) {
936
    if (idoc.ipath.empty()) {
937
        // Because of the mandatory first conversion in the
938
        // FileInterner constructor, need to use a specific method.
950
    return topdocToFile(otemp, tofile, cnf, idoc);
939
    return topdocToFile(otemp, tofile, cnf, idoc, uncompress);
951
    }
940
    }
952
941
953
    // We set FIF_forPreview for consistency with the previous version
942
    // We set FIF_forPreview for consistency with the previous version
954
    // which determined this by looking at mtype!=null. Probably
943
    // which determined this by looking at mtype!=null. Probably
955
    // doesn't change anything in this case.
944
    // doesn't change anything in this case.
956
    FileInterner interner(idoc, cnf, FIF_forPreview);
945
    FileInterner interner(idoc, cnf, FIF_forPreview);
957
    interner.setTargetMType(idoc.mimetype);
946
    interner.setTargetMType(idoc.mimetype);
958
    return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype);
947
    return interner.interntofile(otemp, tofile, idoc.ipath, idoc.mimetype);
959
}
948
}
960
949
961
bool FileInterner::topdocToFile(TempFile& otemp, const string& tofile,
950
// This is only needed because the FileInterner constructor always performs
962
                                RclConfig *cnf, const Rcl::Doc& idoc)
951
// the first conversion, so that we need another approach for accessing the
952
// original document (targetmtype won't do).
953
bool FileInterner::topdocToFile(
954
    TempFile& otemp, const string& tofile,
955
    RclConfig *cnf, const Rcl::Doc& idoc, bool uncompress)
963
{
956
{
964
    DocFetcher *fetcher = docFetcherMake(cnf, idoc);
957
    DocFetcher *fetcher = docFetcherMake(cnf, idoc);
965
    if (fetcher == 0) {
958
    if (fetcher == 0) {
966
        LOGERR("FileInterner::idocToFile no backend\n");
959
        LOGERR("FileInterner::topdocToFile no backend\n");
967
        return false;
960
        return false;
968
    }
961
    }
969
    DocFetcher::RawDoc rawdoc;
962
    DocFetcher::RawDoc rawdoc;
970
    if (!fetcher->fetch(cnf, idoc, rawdoc)) {
963
    if (!fetcher->fetch(cnf, idoc, rawdoc)) {
971
        LOGERR("FileInterner::idocToFile fetcher failed\n");
964
        LOGERR("FileInterner::topdocToFile fetcher failed\n");
972
        return false;
965
        return false;
973
    }
966
    }
974
    const char *filename = "";
967
    const char *filename = "";
975
    TempFile temp;
968
    TempFile temp;
976
    if (tofile.empty()) {
969
    if (tofile.empty()) {
...
...
981
    } else {
974
    } else {
982
        filename = tofile.c_str();
975
        filename = tofile.c_str();
983
    }
976
    }
984
    string reason;
977
    string reason;
985
    switch (rawdoc.kind) {
978
    switch (rawdoc.kind) {
986
    case DocFetcher::RawDoc::RDK_FILENAME:
979
    case DocFetcher::RawDoc::RDK_FILENAME: {
980
        string fn(rawdoc.data);
981
        TempFile temp;
982
        if (uncompress && isCompressed(fn, cnf)) {
983
            if (!maybeUncompressToTemp(temp, fn, cnf, idoc)) {
984
                LOGERR("FileInterner::idocToFile: uncompress failed\n");
985
                return false;
986
            }
987
        }
988
        fn = temp ? temp->filename() : rawdoc.data;
987
        if (!copyfile(rawdoc.data.c_str(), filename, reason)) {
989
        if (!copyfile(fn.c_str(), filename, reason)) {
988
            LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
990
            LOGERR("FileInterner::idocToFile: copyfile: " << reason << "\n");
989
            return false;
991
            return false;
990
        }
992
        }
993
    }
991
        break;
994
        break;
992
    case DocFetcher::RawDoc::RDK_DATA:
995
    case DocFetcher::RawDoc::RDK_DATA:
996
    case DocFetcher::RawDoc::RDK_DATADIRECT:
993
        if (!stringtofile(rawdoc.data, filename, reason)) {
997
        if (!stringtofile(rawdoc.data, filename, reason)) {
994
            LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n");
998
            LOGERR("FileInterner::idocToFile: stringtofile: " << reason <<"\n");
995
            return false;
999
            return false;
996
        }
1000
        }
997
        break;
1001
        break;
...
...
1017
    LOGERR("FileInterner::interntofile: internfile() failed\n");
1021
    LOGERR("FileInterner::interntofile: internfile() failed\n");
1018
    return false;
1022
    return false;
1019
    }
1023
    }
1020
1024
1021
    // Specialcase text/html. This is to work around a bug that will
1025
    // Specialcase text/html. This is to work around a bug that will
1022
    // get fixed some day: internfile initialisation does not check
1026
    // get fixed some day: the internfile constructor always loads the
1023
    // targetmtype, so that at least one conversion is always
1027
    // first handler so that at least one conversion is always
1024
    // performed. A common case would be an "Open" on an html file
1028
    // performed (and the access to the original data may be lost). A
1025
    // (we'd end up with text/plain content). As the html version is
1029
    // common case is an "Open" on an HTML file (we end up
1026
    // saved in this case, use it.  
1030
    // with text/plain content). As the HTML version is saved in this
1031
    // case, use it.
1027
    if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
1032
    if (!stringlowercmp(cstr_texthtml, mimetype) && !get_html().empty()) {
1028
        doc.text = get_html();
1033
        doc.text = get_html();
1029
        doc.mimetype = cstr_texthtml;
1034
        doc.mimetype = cstr_texthtml;
1030
    }
1035
    }
1031
1036