Parent: [d630cb] (diff)

Child: [f70c92] (diff)

Download this file

internfile.h    298 lines (263 with data), 11.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
/* Copyright (C) 2004 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _INTERNFILE_H_INCLUDED_
#define _INTERNFILE_H_INCLUDED_
#include "autoconfig.h"
#include <string>
#include <vector>
#include <map>
#include <set>
using std::string;
using std::vector;
using std::map;
using std::set;
#include "mimehandler.h"
#include "uncomp.h"
#include "pathut.h"
// Forward declarations. This header only refers to these types through
// pointers and references, so their full definitions are not needed here.
struct stat;
class RclConfig;
namespace Rcl { class Doc; }
/**
 * Accumulator for "missing helper program" information.
 *
 * This lives outside of FileInterner on purpose: the data is typically
 * gathered by several FileInterner objects, and a static member would not
 * do either, because independent users of the class must not see their
 * data mixed together.
 */
class FIMissingStore {
public:
    /** Create an empty store. */
    FIMissingStore() {}

    /**
     * Create a store from a string.
     * NOTE(review): the expected format of @p in is defined by the
     * implementation file, presumably a previously produced description
     * string -- confirm there.
     */
    FIMissingStore(const std::string& in);

    virtual ~FIMissingStore() {}

    /**
     * Record that helper program @p prog is missing and that it would have
     * been needed for mime type @p mt. Duplicates are naturally absorbed
     * by the per-program set.
     */
    virtual void addMissing(const std::string& prog, const std::string& mt)
    {
        std::set<std::string>& mtypes = m_typesForMissing[prog];
        mtypes.insert(mt);
    }

    /** Retrieve the plain list of missing programs as a string. */
    virtual void getMissingExternal(std::string& out);

    /** Retrieve the missing programs with their associated mime types. */
    virtual void getMissingDescription(std::string& out);

    /** Missing external programs: program name -> mime types needing it. */
    std::map<std::string, std::set<std::string> > m_typesForMissing;
};
/**
 * Convert data from file-serialized form (either an actual File
 * System file or a memory image) into one or several documents in
 * internal representation (Rcl::Doc). This can be used for indexing,
 * or viewing at query time (GUI preview), or extracting an internal
 * document out of a compound file into a simple one.
 *
 * Things work a little differently when indexing or previewing:
 *  - When indexing, all data has to come from the datastore, and it is
 *    normally desired that all found subdocuments be returned (ie:
 *    all messages and attachments out of a single file mail folder)
 *  - When previewing, some data is taken from the index (ie: the mime type
 *    is already known), and a single document usually needs to be
 *    processed, so that the full doc identifier is passed in: high level
 *    url (ie: file path) and internal identifier: ipath, ie: message and
 *    attachment number.
 *
 * Internfile is the part of the code which knows about ipath structure.
 *
 * The class has a number of static helper methods which could just as well
 * not be members and are in there just for namespace reasons.
 *
 * NOTE(review): the class declares a destructor and stores raw
 * RecollFilter pointers but declares no copy constructor or assignment
 * operator. Presumably instances are not meant to be copied -- confirm
 * against the implementation before copying one.
 */
class FileInterner {
public:
    /**
     * Operation modifier flags, passed to the constructors as an int.
     * The values are spelled out explicitly (they are the same ones the
     * compiler would assign) so that adding an enumerator can not silently
     * yield a value which overlaps the existing ones.
     * NOTE(review): presumably tested as bit masks by the implementation;
     * any new flag should then use the next free bit (4).
     */
    enum Flags {
        FIF_none = 0,
        FIF_forPreview = 1,
        FIF_doUseInputMimetype = 2
    };

    /** Return values for internfile() */
    enum Status {FIError, FIDone, FIAgain};

    /** Constructors take the initial step to preprocess the data object and
     *  create the top filter */

    /**
     * Identify and possibly decompress file, and create the top filter.
     * - The mtype parameter is not always set (it is when the object is
     *   created for previewing a file).
     * - Filter output may be different for previewing and indexing.
     *
     * This constructor is now only used for indexing, the form with
     * an Rcl::Doc parameter to identify the data is always used
     * at query time.
     *
     * @param fn file name.
     * @param stp pointer to updated stat struct.
     * @param cnf Recoll configuration.
     * @param flags operation modifiers, see the Flags enum.
     * @param mtype mime type if known. For a compressed file this is the
     *   mime type for the uncompressed version.
     */
    FileInterner(const string &fn, const struct stat *stp,
                 RclConfig *cnf, int flags, const string *mtype = 0);

    /**
     * Alternate constructor for the case where the data is in memory.
     * This is mainly for data extracted from the web cache. The mime type
     * must be set, input must be already uncompressed.
     *
     * @param data the document data.
     * @param cnf Recoll configuration.
     * @param flags operation modifiers, see the Flags enum.
     * @param mtype mime type for the data.
     */
    FileInterner(const string &data, RclConfig *cnf,
                 int flags, const string& mtype);

    /**
     * Alternate constructor used at query time. We don't know where
     * the data was stored, this is determined from the Rcl::Doc data
     *
     * @param idoc Rcl::Doc object built from index data. The back-end
     *   storage identifier (rclbes field) is used to build the
     *   appropriate fetcher which uses the rest of the Doc fields (url,
     *   ipath...) to retrieve the file or a file reference, which we
     *   then process normally.
     * @param cnf Recoll configuration.
     * @param flags operation modifiers, see the Flags enum.
     */
    FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, int flags);

    ~FileInterner();

    /** Set the store used to record missing helper programs. Only the
     *  pointer is stored here. */
    void setMissingStore(FIMissingStore *st)
    {
        m_missingdatap = st;
    }

    /**
     * Turn file or file part into Recoll document.
     *
     * For multidocument files (ie: mail folder), this must be called
     * multiple times to retrieve the subdocuments.
     *
     * @param doc output document
     * @param ipath internal path. If set by caller, the specified subdoc will
     *   be returned. Else the next document according to current state will
     *   be returned, and doc.ipath will be set on output.
     * @return FIError and FIDone are self-explanatory. If FIAgain is returned,
     *   this is a multi-document file, with more subdocs, and internfile()
     *   should be called again to get the following one(s).
     */
    Status internfile(Rcl::Doc& doc, const string &ipath = "");

    /** Extract subdoc defined by ipath in idoc to file. See params for
     *  idocToFile() */
    bool interntofile(TempFile& otemp, const string& tofile,
                      const string& ipath, const string& mimetype);

    /** Return the file's (top level object) mimetype (useful for
     *  creating the pseudo-doc for container files).
     *  Const like the other read-only accessors (getReason(), ok()), so
     *  that it can be called through a const reference.
     */
    const string& getMimetype() const {return m_mimetype;}

    /** We normally always return text/plain data. A caller can request
     *  that we stop conversion at the native document type (ie: extracting
     *  an email attachment in its native form for an external viewer)
     */
    void setTargetMType(const string& tp) {m_targetMType = tp;}

    /** In case we see an html version while converting, it is set aside
     *  and can be recovered here. Const for the same reason as
     *  getMimetype().
     */
    const string& get_html() const {return m_html;}

    /** If we happen to be processing an image file and need a temp file,
     *  we keep it around to save work for our caller, which can get it
     *  here */
    TempFile get_imgtmp() {return m_imgtmp;}

    /** Return the stored error description (m_reason). */
    const string& getReason() const
    {
        return m_reason;
    }

    /** Return the construction status (m_ok), to be checked after
     *  creating the object. */
    bool ok() const
    {
        return m_ok;
    }

    /**
     * Get UDI for immediate parent for document.
     *
     * This is not in general the same as the "parent" document used
     * with Rcl::Db::addOrUpdate(). The latter is the enclosing file,
     * this would be for example the email containing the attachment.
     * This is in internfile because of the ipath computation.
     */
    static bool getEnclosingUDI(const Rcl::Doc &doc, string& udi);

    /** Return last element in ipath, like basename */
    static std::string getLastIpathElt(const std::string& ipath);

    /**
     * Build sig for doc coming from rcldb. This is here because we know how
     * to query the right backend. Used to check up-to-dateness at query
     * time */
    static bool makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig);

    /** Extract internal document into temporary file.
     *
     * This is used mainly for starting an external viewer for a
     * subdocument (ie: mail attachment). This really would not need to be
     * a member. It creates a FileInterner object to do the actual work
     *
     * @return true for success.
     * @param temp output reference-counted temp file object (goes
     *   away magically). Only used if tofile.empty()
     * @param tofile output file if not empty.
     * @param cnf The recoll config
     * @param doc Doc data taken from the index. We use it to construct a
     *   FileInterner object.
     */
    static bool idocToFile(TempFile& temp, const string& tofile,
                           RclConfig *cnf, const Rcl::Doc& doc);

    /**
     * Does file appear to be the compressed version of a document?
     */
    static bool isCompressed(const string& fn, RclConfig *cnf);

    /**
     * Check input compressed, allocate temp file and uncompress if it is.
     * @return true if ok, false for error. Actual decompression is indicated
     *   by the TempFile status (!isNull())
     */
    static bool maybeUncompressToTemp(TempFile& temp, const string& fn,
                                      RclConfig *cnf, const Rcl::Doc& doc);

private:
    // Size of the m_tmpflgs array below.
    static const unsigned int MAXHANDLERS = 20;

    RclConfig *m_cfg;
    string m_fn;
    string m_mimetype; // Mime type for [uncompressed] file
    bool m_forPreview;
    string m_html; // Possibly set-aside html text for preview
    TempFile m_imgtmp; // Possible reference to an image temp file
    string m_targetMType;
    string m_reachedMType; // target or text/plain
    string m_tfile;
    bool m_ok; // Set after construction if ok

    // Fields found in file extended attributes. This is kept here,
    // not in the file-level handler because we are only interested in
    // the top-level file, not any temp file necessitated by
    // processing the internal doc hierarchy.
    map<string, string> m_XAttrsFields;

    // Fields gathered by executing configured external commands
    map<string, string> m_cmdFields;

    // Filter stack, path to the current document from which we're
    // fetching subdocs
    vector<RecollFilter*> m_handlers;

    // Temporary files used for decoding the current stack
    bool m_tmpflgs[MAXHANDLERS];
    vector<TempFile> m_tempfiles;

    // Error data if any
    string m_reason;
    FIMissingStore *m_missingdatap;
    Uncomp m_uncomp;
    bool m_noxattrs; // disable xattrs usage

    // Pseudo-constructors
    void init(const string &fn, const struct stat *stp,
              RclConfig *cnf, int flags, const string *mtype = 0);
    void init(const string &data, RclConfig *cnf, int flags,
              const string& mtype);
    void initcommon(RclConfig *cnf, int flags);

    bool dijontorcl(Rcl::Doc&);
    void collectIpathAndMT(Rcl::Doc&) const;
    TempFile dataToTempFile(const string& data, const string& mt);
    void popHandler();
    int addHandler();
    void checkExternalMissing(const string& msg, const string& mt);
    void processNextDocError(Rcl::Doc &doc);
    static bool tempFileForMT(TempFile& otemp, RclConfig *cnf,
                              const std::string& mimetype);
    static bool topdocToFile(TempFile& otemp, const std::string& tofile,
                             RclConfig *cnf, const Rcl::Doc& idoc);
};
#endif /* _INTERNFILE_H_INCLUDED_ */