Switch to unified view

a/src/internfile/internfile.cpp b/src/internfile/internfile.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.39 2008-08-26 07:33:05 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: internfile.cpp,v 1.40 2008-09-05 10:36:06 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
45
45
46
// The internal path element separator. This can't be the same as the rcldb 
46
// The internal path element separator. This can't be the same as the rcldb 
47
// file to ipath separator : "|"
47
// file to ipath separator : "|"
48
static const string isep(":");
48
static const string isep(":");
49
49
50
// This is used when the user wants to retrieve a search result doc's parent
51
// (ie message having a given attachment)
50
bool FileInterner::getEnclosing(const string &url, const string &ipath,
52
bool FileInterner::getEnclosing(const string &url, const string &ipath,
51
                string &eurl, string &eipath)
53
                string &eurl, string &eipath)
52
{
54
{
53
    eurl = url;
55
    eurl = url;
54
    eipath = ipath;
56
    eipath = ipath;
...
...
63
    }
65
    }
64
    LOGDEB(("FileInterner::getEnclosing() after: [%s]\n", eipath.c_str()));
66
    LOGDEB(("FileInterner::getEnclosing() after: [%s]\n", eipath.c_str()));
65
    return true;
67
    return true;
66
}
68
}
67
69
68
// Execute the command to uncompress a file into a temporary one.
70
// Uncompress input file into a temporary one, by executing the appropriate
71
// script.
69
static bool uncompressfile(RclConfig *conf, const string& ifn, 
72
static bool uncompressfile(RclConfig *conf, const string& ifn, 
70
               const list<string>& cmdv, const string& tdir, 
73
               const list<string>& cmdv, const string& tdir, 
71
               string& tfile)
74
               string& tfile)
72
{
75
{
73
    // Make sure tmp dir is empty. we guarantee this to filters
76
    // Make sure tmp dir is empty. we guarantee this to filters
...
...
101
    if (tfile[tfile.length() - 1] == '\n')
104
    if (tfile[tfile.length() - 1] == '\n')
102
    tfile.erase(tfile.length() - 1, 1);
105
    tfile.erase(tfile.length() - 1, 1);
103
    return true;
106
    return true;
104
}
107
}
105
108
109
// Delete temporary uncompressed file
106
void FileInterner::tmpcleanup()
110
void FileInterner::tmpcleanup()
107
{
111
{
108
    if (m_tdir.empty() || m_tfile.empty())
112
    if (m_tdir.empty() || m_tfile.empty())
109
    return;
113
    return;
110
    if (unlink(m_tfile.c_str()) < 0) {
114
    if (unlink(m_tfile.c_str()) < 0) {
...
...
112
        m_tfile.c_str(), errno));
116
        m_tfile.c_str(), errno));
113
    return;
117
    return;
114
    }
118
    }
115
}
119
}
116
120
117
// Handler==0 on return says we're in error, will be handled when calling
121
// Constructor: identify the input file, possibly create an
118
// internfile
122
// uncompressed temporary copy, and create the top filter for the
123
// uncompressed file type.
124
//
125
// Empty handler on return says that we're in error, this will be
126
// processed by the first call to internfile().
119
FileInterner::FileInterner(const std::string &f, const struct stat *stp,
127
FileInterner::FileInterner(const std::string &f, const struct stat *stp,
120
               RclConfig *cnf, 
128
               RclConfig *cnf, 
121
               const string& td, const string *imime)
129
               const string& td, const string *imime)
122
    : m_cfg(cnf), m_fn(f), m_forPreview(imime?true:false), m_tdir(td)
130
    : m_cfg(cnf), m_fn(f), m_forPreview(imime?true:false), m_tdir(td)
123
{
131
{
...
...
196
     it != m_handlers.end(); it++)
204
     it != m_handlers.end(); it++)
197
    delete *it;
205
    delete *it;
198
    // m_tempfiles will take care of itself
206
    // m_tempfiles will take care of itself
199
}
207
}
200
208
209
// Create a temporary file for a block of data (ie: attachment) found
210
// while walking the internal document tree, with a type for which the
211
// handler needs an actual file (ie : external script).
201
bool FileInterner::dataToTempFile(const string& dt, const string& mt, 
212
bool FileInterner::dataToTempFile(const string& dt, const string& mt, 
202
                  string& fn)
213
                  string& fn)
203
{
214
{
204
    // Find appropriate suffix for mime type
215
    // Find appropriate suffix for mime type
205
    TempFile temp(new TempFileInternal(m_cfg->getSuffixFromMimeType(mt)));
216
    TempFile temp(new TempFileInternal(m_cfg->getSuffixFromMimeType(mt)));
...
...
229
    return true;
240
    return true;
230
}
241
}
231
242
232
// See if the error string is formatted as a missing helper message,
243
// See if the error string is formatted as a missing helper message,
233
// accumulate helper name if it is
244
// accumulate helper name if it is
234
void FileInterner::maybeExternalMissing(const string& msg)
245
void FileInterner::checkExternalMissing(const string& msg)
235
{
246
{
236
    if (msg.find("RECFILTERROR") == 0) {
247
    if (msg.find("RECFILTERROR") == 0) {
237
    list<string> lerr;
248
    list<string> lerr;
238
    stringToStrings(msg, lerr);
249
    stringToStrings(msg, lerr);
239
    if (lerr.size() > 2) {
250
    if (lerr.size() > 2) {
...
...
245
        }
256
        }
246
    }           
257
    }           
247
    }
258
    }
248
}
259
}
249
260
261
// Return the list of missing external helper apps that we saw while
262
// working
250
const list<string>& FileInterner::getMissingExternal() 
263
const list<string>& FileInterner::getMissingExternal() 
251
{
264
{
252
    m_missingExternal.sort();
265
    m_missingExternal.sort();
253
    m_missingExternal.unique();
266
    m_missingExternal.unique();
254
    return m_missingExternal;
267
    return m_missingExternal;
...
...
258
    m_missingExternal.sort();
271
    m_missingExternal.sort();
259
    m_missingExternal.unique();
272
    m_missingExternal.unique();
260
    stringsToString(m_missingExternal, out);
273
    stringsToString(m_missingExternal, out);
261
}
274
}
262
275
276
// Helper for extracting a value from a map.
263
static inline bool getKeyValue(const map<string, string>& docdata, 
277
static inline bool getKeyValue(const map<string, string>& docdata, 
264
                   const string& key, string& value)
278
                   const string& key, string& value)
265
{
279
{
266
    map<string,string>::const_iterator it;
280
    map<string,string>::const_iterator it;
267
    it = docdata.find(key);
281
    it = docdata.find(key);
...
...
308
    doc.meta.erase(keyds);
322
    doc.meta.erase(keyds);
309
    }
323
    }
310
    return true;
324
    return true;
311
}
325
}
312
326
313
// Collect the ipath stack. 
327
// Collect the ipath from the current path in the document tree.
314
// While we're at it, we also set the mimetype and filename, which are special 
328
// While we're at it, we also set the mimetype and filename, which are special 
315
// properties: we want to get them from the topmost doc
329
// properties: we want to get them from the topmost doc
316
// with an ipath, not the last one which is usually text/plain
330
// with an ipath, not the last one which is usually text/plain
317
// We also set the author and modification time from the last doc
331
// We also set the author and modification time from the last doc
318
// which has them.
332
// which has them.
...
...
368
    }
382
    }
369
    delete m_handlers.back();
383
    delete m_handlers.back();
370
    m_handlers.pop_back();
384
    m_handlers.pop_back();
371
}
385
}
372
386
387
// Result codes returned by FileInterner::addHandler() and dispatched on by
// the document loop in FileInterner::internfile():
//   ADD_OK       a new handler has been stacked, go on and use it
//   ADD_CONTINUE forget this document and fetch the next one from the
//                current handler
//   ADD_BREAK    document type is ok: stop looping and finish processing it
//   ADD_ERROR    fatal, internfile() returns FIError
enum addResols {ADD_OK, ADD_CONTINUE, ADD_BREAK, ADD_ERROR};
388
389
// Just got document from current top handler. See what type it is,
390
// and possibly add a filter/handler to the stack
391
int FileInterner::addHandler()
392
{
393
    const std::map<std::string, std::string>& docdata = 
394
  m_handlers.back()->get_meta_data();
395
    string charset, mimetype;
396
    getKeyValue(docdata, keycs, charset);
397
    getKeyValue(docdata, keymt, mimetype);
398
399
    LOGDEB(("FileInterner::addHandler: next_doc is %s\n", mimetype.c_str()));
400
    // If we find a document of the target type (text/plain in
401
    // general), we're done decoding
402
    if (!stringicmp(mimetype, m_targetMType)) {
403
  LOGDEB1(("FileInterner::addHandler: target reached\n"));
404
  return ADD_BREAK;
405
    }
406
407
    // We need to stack another handler. Check stack size
408
    if (m_handlers.size() > MAXHANDLERS) {
409
  // Stack too big. Skip this and go on to check if there is
410
  // something else in the current back()
411
  LOGERR(("FileInterner::addHandler: stack too high\n"));
412
  return ADD_CONTINUE;
413
    }
414
415
    Dijon::Filter *newflt = getMimeHandler(mimetype, m_cfg);
416
    if (!newflt) {
417
  // If we can't find a handler, this doc can't be handled
418
  // but there can be other ones so we go on
419
  LOGINFO(("FileInterner::addHandler: no filter for [%s]\n",
420
       mimetype.c_str()));
421
  return ADD_CONTINUE;
422
    }
423
    newflt->set_property(Dijon::Filter::OPERATING_MODE, 
424
          m_forPreview ? "view" : "index");
425
    newflt->set_property(Dijon::Filter::DEFAULT_CHARSET, charset);
426
427
    // Get content: we don't use getkeyvalue() here to avoid copying
428
    // the text, which may be big.
429
    string ns;
430
    const string *txt = &ns;
431
    {
432
  map<string,string>::const_iterator it;
433
  it = docdata.find(keyct);
434
  if (it != docdata.end())
435
      txt = &it->second;
436
    }
437
    bool setres = false;
438
    if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
439
  setres = newflt->set_document_string(*txt);
440
    } else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
441
  setres = newflt->set_document_data(txt->c_str(), txt->length());
442
    } else if (newflt->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) {
443
  string filename;
444
  if (dataToTempFile(*txt, mimetype, filename)) {
445
      if (!(setres = newflt->set_document_file(filename))) {
446
      m_tmpflgs[m_handlers.size()-1] = false;
447
      m_tempfiles.pop_back();
448
      }
449
  }
450
    }
451
    if (!setres) {
452
  LOGINFO(("FileInterner::addHandler: set_doc failed inside %s "
453
       " for mtype %s\n", m_fn.c_str(), mimetype.c_str()));
454
  delete newflt;
455
  if (m_forPreview)
456
      return ADD_ERROR;
457
  return ADD_CONTINUE;
458
    }
459
    // add handler and go on, maybe this one will give us text...
460
    m_handlers.push_back(newflt);
461
    LOGDEB1(("FileInterner::addHandler: added\n"));
462
    return ADD_OK;
463
}
464
465
// Information and debug after a next_document error
466
void FileInterner::processNextDocError()
467
{
468
    Rcl::Doc doc; string ipath;
469
    collectIpathAndMT(doc, ipath);
470
    m_reason = m_handlers.back()->get_error();
471
    checkExternalMissing(m_reason);
472
    LOGERR(("FileInterner::internfile: next_document error "
473
      "[%s%s%s] %s\n", m_fn.c_str(), ipath.empty() ? "" : "|", 
474
      ipath.c_str(), m_reason.c_str()));
475
}
476
373
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
477
FileInterner::Status FileInterner::internfile(Rcl::Doc& doc, string& ipath)
374
{
478
{
375
    LOGDEB(("FileInterner::internfile. ipath [%s]\n", ipath.c_str()));
479
    LOGDEB(("FileInterner::internfile. ipath [%s]\n", ipath.c_str()));
376
    if (m_handlers.size() < 1) {
480
    if (m_handlers.size() < 1) {
377
    // Just means the constructor failed
481
    // Just means the constructor failed
378
    LOGERR(("FileInterner::internfile: constructor failed\n"));
482
    LOGERR(("FileInterner::internfile: constructor failed\n"));
379
    return FIError;
483
    return FIError;
380
    }
484
    }
381
485
382
    // Ipath vector.
486
    // Input Ipath vector when retrieving a given subdoc for previewing
383
    // Note that the vector is big enough for the maximum stack. All values
487
    // Note that the vector is big enough for the maximum stack. All values
384
    // over the last significant one are ""
488
    // over the last significant one are ""
385
    // We set the ipath for the first handler here, others are set
489
    // We set the ipath for the first handler here, others are set
386
    // when they're pushed on the stack
490
    // when they're pushed on the stack
387
    vector<string> vipath(MAXHANDLERS);
491
    vector<string> vipath(MAXHANDLERS);
...
...
394
        LOGERR(("FileInterner::internfile: can't skip\n"));
498
        LOGERR(("FileInterner::internfile: can't skip\n"));
395
        return FIError;
499
        return FIError;
396
    }
500
    }
397
    }
501
    }
398
502
399
    /* Try to get doc from the topmost filter */
503
    // Try to get doc from the topmost handler
400
    // Security counter: we try not to loop but ...
504
    // Security counter: looping happens when we stack one other 
505
    // handler or when walking the file document tree without finding
506
    // something to index (typical example: email with multiple image
507
    // attachments and no image filter installed). So we need to be
508
    // quite generous here, especially because there is another
509
    // security in the form of a maximum handler stack size.
401
    int loop = 0;
510
    int loop = 0;
402
    while (!m_handlers.empty()) {
511
    while (!m_handlers.empty()) {
403
    if (loop++ > 30) {
512
    if (loop++ > 1000) {
404
        LOGERR(("FileInterner:: looping!\n"));
513
        LOGERR(("FileInterner:: looping!\n"));
405
        return FIError;
514
        return FIError;
406
    }
515
    }
516
  // If there are no more docs at the current top level we pop and
517
  // see if there is something at the previous one
407
    if (!m_handlers.back()->has_documents()) {
518
    if (!m_handlers.back()->has_documents()) {
408
      // No docs at the current top level. Pop and see if there
409
      // is something at the previous one
410
        popHandler();
519
        popHandler();
411
        continue;
520
        continue;
412
    }
521
    }
413
522
414
  // Don't stop on next_document() error. There might be ie an
523
  // While indexing, don't stop on next_document() error. There
415
    // error while decoding an attachment, but we still want to
524
    // might be ie an error while decoding an attachment, but we
416
  // process the rest of the mbox!
525
  // still want to process the rest of the mbox! For preview: fatal.
417
    if (!m_handlers.back()->next_document()) {
526
    if (!m_handlers.back()->next_document()) {
418
      Rcl::Doc doc; string ipath;
527
      processNextDocError(); // Debug etc.
419
      collectIpathAndMT(doc, ipath);
420
      m_reason = m_handlers.back()->get_error();
421
      maybeExternalMissing(m_reason);
422
      LOGERR(("FileInterner::internfile: next_document error [%s%s%s] %s\n",
423
          m_fn.c_str(), ipath.empty()?"":"|", ipath.c_str(), 
424
          m_reason.c_str()));
425
      // If fetching a specific document, this is fatal
426
        if (m_forPreview) {
528
        if (m_forPreview) 
427
        return FIError;
529
        return FIError;
428
      }
429
        popHandler();
530
        popHandler();
430
        continue;
531
        continue;
431
    }
532
    }
432
533
433
  // Look at what we've got
534
  // Look at the type for the next document and possibly add
434
  const std::map<std::string, std::string>& docdata = 
535
  // handler to stack.
435
      m_handlers.back()->get_meta_data();
536
  switch (addHandler()) {
436
  string charset, mimetype;
537
  case ADD_OK: // Just go through: handler has been stacked, use it
437
  getKeyValue(docdata, keycs, charset);
438
  getKeyValue(docdata, keymt, mimetype);
439
440
  LOGDEB(("FileInterner::internfile: next_doc is %s\n",
441
      mimetype.c_str()));
442
  // If we find a text/plain doc, we're done
443
  if (!stringicmp(mimetype, m_targetMType))
444
        break;
538
        break; 
445
539
  case ADD_CONTINUE: 
446
  // Got a non text/plain doc. We need to stack another
540
      // forget this doc and retrieve next from current handler
447
  // filter. Check current size
541
      // (ipath stays same)
448
  if (m_handlers.size() > MAXHANDLERS) {
449
      // Stack too big. Skip this and go on to check if there is
450
      // something else in the current back()
451
      LOGINFO(("FileInterner::internfile: stack too high\n"));
452
        continue;
542
        continue;
453
  }
543
  case ADD_BREAK: 
454
544
      // Stop looping: doc type ok, need complete its processing
455
  Dijon::Filter *again = getMimeHandler(mimetype, m_cfg);
545
      // and return it
456
  if (!again) {
546
      goto breakloop; // when you have to you have to
457
      // If we can't find a filter, this doc can't be handled
547
  case ADD_ERROR: return FIError;
458
      // but there can be other ones so we go on
459
      LOGINFO(("FileInterner::internfile: no filter for [%s]\n",
460
          mimetype.c_str()));
461
      continue;
462
  }
463
  again->set_property(Dijon::Filter::OPERATING_MODE, 
464
              m_forPreview ? "view" : "index");
465
  again->set_property(Dijon::Filter::DEFAULT_CHARSET, 
466
              charset);
467
  string ns;
468
  const string *txt = &ns;
469
  map<string,string>::const_iterator it;
470
  it = docdata.find("content");
471
  if (it != docdata.end())
472
      txt = &it->second;
473
474
  bool setres = false;
475
  if (again->is_data_input_ok(Dijon::Filter::DOCUMENT_STRING)) {
476
      setres = again->set_document_string(*txt);
477
  } else if (again->is_data_input_ok(Dijon::Filter::DOCUMENT_DATA)) {
478
      setres = again->set_document_data(txt->c_str(), txt->length());
479
  }else if(again->is_data_input_ok(Dijon::Filter::DOCUMENT_FILE_NAME)) {
480
      string filename;
481
      if (dataToTempFile(*txt, mimetype, filename)) {
482
      if (!(setres = again->set_document_file(filename))) {
483
          m_tmpflgs[m_handlers.size()-1] = false;
484
          m_tempfiles.pop_back();
485
     }
548
    }
486
      }
549
487
  }
488
  if (!setres) {
489
      LOGINFO(("FileInterner::internfile: set_doc failed inside %s\n", 
490
          m_fn.c_str()));
491
      delete again;
492
      if (m_forPreview)
493
      return FIError;
494
      continue;
495
  }
496
  // add filter and go on, maybe this one will give us text...
497
  m_handlers.push_back(again);
498
    if (!ipath.empty() &&
550
    if (!ipath.empty() &&
499
        !m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
551
        !m_handlers.back()->skip_to_document(vipath[m_handlers.size()-1])){
500
        LOGERR(("FileInterner::internfile: can't skip\n"));
552
        LOGERR(("FileInterner::internfile: can't skip\n"));
501
        return FIError;
553
        return FIError;
502
    }
554
    }
503
    }
555
    }
556
 breakloop:
504
557
505
    if (m_handlers.empty()) {
558
    if (m_handlers.empty()) {
506
    LOGERR(("FileInterner::internfile: conversion ended with no doc\n"));
559
    LOGERR(("FileInterner::internfile: conversion ended with no doc\n"));
507
    return FIError;
560
    return FIError;
508
    }
561
    }
509
562
510
    // If indexing compute ipath and significant mimetype Note that
563
    // If indexing compute ipath and significant mimetype.
511
    // ipath is returned through the parameter not doc.ipath We also
564
    // ipath is returned through the parameter not doc.ipath We also
512
    // retrieve some metadata fields from the ancestors (like date or
565
    // retrieve some metadata fields from the ancestors (like date or
513
    // author). This is useful for email attachments. The values will
566
    // author). This is useful for email attachments. The values will
514
    // be replaced by those found by dijontorcl if any, so the order
567
    // be replaced by those found by dijontorcl if any, so the order
515
    // of calls is important.
568
    // of calls is important.
516
    if (!m_forPreview)
569
    if (!m_forPreview)
517
    collectIpathAndMT(doc, ipath);
570
    collectIpathAndMT(doc, ipath);
518
    // Keep this AFTER collectIpathAndMT
571
    // Keep this AFTER collectIpathAndMT
519
    dijontorcl(doc);
572
    dijontorcl(doc);
520
573
521
    // Destack what can be
574
    // Possibly destack so that we can test for FIDone.
522
    while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
575
    while (!m_handlers.empty() && !m_handlers.back()->has_documents()) {
523
    popHandler();
576
    popHandler();
524
    }
577
    }
525
    if (m_handlers.empty())
578
    if (m_handlers.empty())
526
    return FIDone;
579
    return FIDone;