|
a/src/internfile/internfile.h |
|
b/src/internfile/internfile.h |
|
... |
|
... |
24 |
using std::string;
|
24 |
using std::string;
|
25 |
using std::vector;
|
25 |
using std::vector;
|
26 |
using std::map;
|
26 |
using std::map;
|
27 |
using std::set;
|
27 |
using std::set;
|
28 |
|
28 |
|
|
|
29 |
#include "Filter.h"
|
|
|
30 |
// The class changes according to RCL_USE_XATTR
|
|
|
31 |
#include "autoconfig.h"
|
29 |
#include "pathut.h"
|
32 |
#include "pathut.h"
|
30 |
#include "Filter.h"
|
|
|
31 |
// Beware: the class changes according to RCL_USE_XATTR, so any file
|
|
|
32 |
// including this needs autoconfig.h
|
|
|
33 |
#include "autoconfig.h"
|
|
|
34 |
|
33 |
|
35 |
class RclConfig;
|
34 |
class RclConfig;
|
36 |
namespace Rcl {
|
35 |
namespace Rcl {
|
37 |
class Doc;
|
36 |
class Doc;
|
38 |
}
|
37 |
}
|
|
... |
|
... |
62 |
// Missing external programs
|
61 |
// Missing external programs
|
63 |
map<string, set<string> > m_typesForMissing;
|
62 |
map<string, set<string> > m_typesForMissing;
|
64 |
};
|
63 |
};
|
65 |
|
64 |
|
66 |
/**
|
65 |
/**
|
67 |
* A class to convert data from a datastore (file-system, firefox
|
66 |
* Convert data from file-serialized form (either an actual File
|
68 |
* history, etc.) into possibly one or several documents in internal
|
67 |
* System file or a memory image) into one or several documents in
|
69 |
* representation, either for indexing or viewing at query time (gui preview).
|
68 |
* internal representation (Rcl::Doc). This can be used for indexing,
|
|
|
69 |
* or viewing at query time (GUI preview), or extracting an internal
|
|
|
70 |
* document out of a compound file into a simple one.
|
|
|
71 |
*
|
70 |
* Things work a little differently when indexing or previewing:
|
72 |
* Things work a little differently when indexing or previewing:
|
71 |
* - When indexing, all data has to come from the datastore, and it is
|
73 |
* - When indexing, all data has to come from the datastore, and it is
|
72 |
* normally desired that all found subdocuments be returned (ie:
|
74 |
* normally desired that all found subdocuments be returned (ie:
|
73 |
* all messages and attachments out of a single file mail folder)
|
75 |
* all messages and attachments out of a single file mail folder)
|
74 |
* - When previewing, some data is taken from the index (ie: the mime type
|
76 |
* - When previewing, some data is taken from the index (ie: the mime type
|
75 |
* is already known, and a single document usually needs to be processed,
|
77 |
* is already known, and a single document usually needs to be processed,
|
76 |
* so that the full doc identifier is passed in: high level url
|
78 |
* so that the full doc identifier is passed in: high level url
|
77 |
* (ie: file path) and internal identifier: ipath, ie: message and
|
79 |
* (ie: file path) and internal identifier: ipath, ie: message and
|
78 |
* attachment number.
|
80 |
* attachment number.
|
|
|
81 |
*
|
|
|
82 |
* Internfile is the part of the code which knows about ipath structure.
|
|
|
83 |
*
|
|
|
84 |
* The class has a number of static helper methods which could just as well not
|
|
|
85 |
* be members and are in there just for namespace reasons.
|
|
|
86 |
*
|
79 |
*/
|
87 |
*/
|
80 |
class FileInterner {
|
88 |
class FileInterner {
|
81 |
public:
|
89 |
public:
|
82 |
/// Operation modifier flags
|
90 |
/** Operation modifier flags */
|
83 |
enum Flags {FIF_none, FIF_forPreview, FIF_doUseInputMimetype};
|
91 |
enum Flags {FIF_none, FIF_forPreview, FIF_doUseInputMimetype};
|
84 |
/// Return values for internfile()
|
92 |
/** Return values for internfile() */
|
85 |
enum Status {FIError, FIDone, FIAgain};
|
93 |
enum Status {FIError, FIDone, FIAgain};
|
86 |
|
|
|
87 |
/**
|
|
|
88 |
* Get immediate parent for document.
|
|
|
89 |
*
|
|
|
90 |
* This is not in general the same as the "parent" document used
|
|
|
91 |
* with Rcl::Db::addOrUpdate(). The latter is generally the enclosing file,
|
|
|
92 |
* this would be for example the email containing the attachment.
|
|
|
93 |
*/
|
|
|
94 |
static bool getEnclosing(const string &url, const string &ipath,
|
|
|
95 |
string &eurl, string &eipath, string& udi);
|
|
|
96 |
|
|
|
97 |
/** Return last element in ipath, like basename */
|
|
|
98 |
static std::string getLastIpathElt(const std::string& ipath);
|
|
|
99 |
|
94 |
|
100 |
/** Constructors take the initial step to preprocess the data object and
|
95 |
/** Constructors take the initial step to preprocess the data object and
|
101 |
* create the top filter */
|
96 |
* create the top filter */
|
102 |
|
97 |
|
103 |
/**
|
98 |
/**
|
104 |
* Identify and possibly decompress file, and create the top filter.
|
99 |
* Identify and possibly decompress file, and create the top filter.
|
105 |
* - The mtype parameter is not always set (it is when the object is
|
100 |
* - The mtype parameter is not always set (it is when the object is
|
106 |
* created for previewing a file).
|
101 |
* created for previewing a file).
|
107 |
* - Filter output may be different for previewing and indexing.
|
102 |
* - Filter output may be different for previewing and indexing.
|
108 |
*
|
103 |
*
|
|
|
104 |
* This constructor is now only used for indexing, the form with
|
|
|
105 |
* an Rcl::Doc parameter to identify the data is always used
|
|
|
106 |
* at query time.
|
|
|
107 |
*
|
109 |
* @param fn file name
|
108 |
* @param fn file name.
|
110 |
* @param stp pointer to updated stat struct.
|
109 |
* @param stp pointer to updated stat struct.
|
111 |
* @param cnf Recoll configuration
|
110 |
* @param cnf Recoll configuration.
|
112 |
* @param td temporary directory to use as working space if
|
111 |
* @param td temporary directory to use as working space if
|
113 |
* decompression needed. Must be private and will be wiped clean.
|
112 |
* decompression needed. Must be private and will be wiped clean.
|
114 |
* @param mtype mime type if known. For a compressed file this is the
|
113 |
* @param mtype mime type if known. For a compressed file this is the
|
115 |
* mime type for the uncompressed version.
|
114 |
* mime type for the uncompressed version.
|
116 |
*/
|
115 |
*/
|
117 |
FileInterner(const string &fn, const struct stat *stp,
|
116 |
FileInterner(const string &fn, const struct stat *stp,
|
118 |
RclConfig *cnf, TempDir &td, int flags,
|
117 |
RclConfig *cnf, TempDir &td, int flags,
|
119 |
const string *mtype = 0);
|
118 |
const string *mtype = 0);
|
120 |
|
119 |
|
121 |
/**
|
120 |
/**
|
122 |
* Alternate constructor for the case where the data is in memory.
|
121 |
* Alternate constructor for the case where the data is in memory.
|
123 |
* This is mainly for data extracted from the web cache. The mime type
|
122 |
* This is mainly for data extracted from the web cache. The mime type
|
124 |
* must be set, input must be uncompressed.
|
123 |
* must be set, input must be already uncompressed.
|
125 |
*/
|
124 |
*/
|
126 |
FileInterner(const string &data, RclConfig *cnf, TempDir &td,
|
125 |
FileInterner(const string &data, RclConfig *cnf, TempDir &td,
|
127 |
int flags, const string& mtype);
|
126 |
int flags, const string& mtype);
|
128 |
|
127 |
|
129 |
/**
|
128 |
/**
|
130 |
* Alternate constructor for the case where it is not known where
|
129 |
* Alternate constructor used at query time. We don't know where
|
131 |
* the data will come from. We'll use the doc fields and try our
|
130 |
* the data was stored, this is determined from the Rcl::Doc data
|
132 |
* best. This is only used at query time, the idoc was built from index
|
|
|
133 |
* data.
|
131 |
*
|
|
|
132 |
* @param idoc Rcl::Doc object built from index data. The back-end
|
|
|
133 |
* storage identifier (rclbes field) is used to build the
|
|
|
134 |
* appropriate fetcher which uses the rest of the Doc fields (url,
|
|
|
135 |
* ipath...) to retrieve the file or a file reference, which we
|
|
|
136 |
* then process normally.
|
134 |
*/
|
137 |
*/
|
135 |
FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, TempDir &td,
|
138 |
FileInterner(const Rcl::Doc& idoc, RclConfig *cnf, TempDir &td,
|
136 |
int flags);
|
139 |
int flags);
|
137 |
|
140 |
|
138 |
/**
|
|
|
139 |
* Build sig for doc coming from rcldb. This is here because we know how
|
|
|
140 |
* to query the right backend */
|
|
|
141 |
static bool makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig);
|
|
|
142 |
|
|
|
143 |
~FileInterner();
|
141 |
~FileInterner();
|
144 |
|
142 |
|
145 |
void setMissingStore(FIMissingStore *st)
|
143 |
void setMissingStore(FIMissingStore *st)
|
146 |
{
|
144 |
{
|
147 |
m_missingdatap = st;
|
145 |
m_missingdatap = st;
|
148 |
}
|
146 |
}
|
149 |
|
147 |
|
150 |
/**
|
148 |
/**
|
151 |
* Turn file or file part into Recoll document.
|
149 |
* Turn file or file part into Recoll document.
|
152 |
*
|
150 |
*
|
153 |
* For multidocument files (ie: mail folder), this must be called multiple
|
151 |
* For multidocument files (ie: mail folder), this must be called
|
154 |
* times to retrieve the subdocuments
|
152 |
* multiple times to retrieve the subdocuments.
|
|
|
153 |
*
|
155 |
* @param doc output document
|
154 |
* @param doc output document
|
156 |
* @param ipath internal path. If set by caller, the specified subdoc will
|
155 |
* @param ipath internal path. If set by caller, the specified subdoc will
|
157 |
* be returned. Else the next document according to current state will
|
156 |
* be returned. Else the next document according to current state will
|
158 |
* be returned, and doc.ipath will be set on output.
|
157 |
* be returned, and doc.ipath will be set on output.
|
159 |
* @return FIError and FIDone are self-explanatory. If FIAgain is returned,
|
158 |
* @return FIError and FIDone are self-explanatory. If FIAgain is returned,
|
|
... |
|
... |
167 |
*/
|
166 |
*/
|
168 |
const string& getMimetype() {return m_mimetype;}
|
167 |
const string& getMimetype() {return m_mimetype;}
|
169 |
|
168 |
|
170 |
/** We normally always return text/plain data. A caller can request
|
169 |
/** We normally always return text/plain data. A caller can request
|
171 |
* that we stop conversion at the native document type (ie: extracting
|
170 |
* that we stop conversion at the native document type (ie: extracting
|
172 |
* an email attachment and starting an external viewer)
|
171 |
* an email attachment in its native form for an external viewer)
|
173 |
*/
|
172 |
*/
|
174 |
void setTargetMType(const string& tp) {m_targetMType = tp;}
|
173 |
void setTargetMType(const string& tp) {m_targetMType = tp;}
|
175 |
|
174 |
|
176 |
/** In case we see an html version while converting, it is set aside
|
175 |
/** In case we see an html version while converting, it is set aside
|
177 |
* and can be recovered
|
176 |
* and can be recovered
|
|
... |
|
... |
180 |
|
179 |
|
181 |
/** If we happen to be processing an image file and need a temp file,
|
180 |
/** If we happen to be processing an image file and need a temp file,
|
182 |
we keep it around to save work for our caller, which can get it here */
|
181 |
we keep it around to save work for our caller, which can get it here */
|
183 |
TempFile get_imgtmp() {return m_imgtmp;}
|
182 |
TempFile get_imgtmp() {return m_imgtmp;}
|
184 |
|
183 |
|
|
|
184 |
const string& getReason() const
|
|
|
185 |
{
|
|
|
186 |
return m_reason;
|
|
|
187 |
}
|
|
|
188 |
bool ok() const
|
|
|
189 |
{
|
|
|
190 |
return m_ok;
|
|
|
191 |
}
|
|
|
192 |
|
|
|
193 |
/**
|
|
|
194 |
* Get immediate parent for document.
|
|
|
195 |
*
|
|
|
196 |
* This is not in general the same as the "parent" document used
|
|
|
197 |
* with Rcl::Db::addOrUpdate(). The latter is the enclosing file,
|
|
|
198 |
* this would be for example the email containing the attachment.
|
|
|
199 |
*/
|
|
|
200 |
static bool getEnclosing(const string &url, const string &ipath,
|
|
|
201 |
string &eurl, string &eipath, string& udi);
|
|
|
202 |
|
|
|
203 |
/** Return last element in ipath, like basename */
|
|
|
204 |
static std::string getLastIpathElt(const std::string& ipath);
|
|
|
205 |
|
|
|
206 |
/**
|
|
|
207 |
* Build sig for doc coming from rcldb. This is here because we know how
|
|
|
208 |
* to query the right backend. Used to check up-to-dateness at query time */
|
|
|
209 |
static bool makesig(RclConfig *cnf, const Rcl::Doc& idoc, string& sig);
|
|
|
210 |
|
185 |
/** Extract internal document into temporary file.
|
211 |
/** Extract internal document into temporary file.
|
186 |
* This is used mainly for starting an external viewer for a
|
212 |
* This is used mainly for starting an external viewer for a
|
187 |
* subdocument (ie: mail attachment).
|
213 |
* subdocument (ie: mail attachment). This really would not need to be
|
|
|
214 |
* a member. It creates a FileInterner object to do the actual work
|
188 |
* @return true for success.
|
215 |
* @return true for success.
|
189 |
* @param temp output reference-counted temp file object (goes
|
216 |
* @param temp output reference-counted temp file object (goes
|
190 |
* away magically). Only used if tofile.empty()
|
217 |
* away magically). Only used if tofile.empty()
|
191 |
* @param tofile output file if not null
|
218 |
* @param tofile output file if not empty.
|
192 |
* @param cnf The recoll config
|
219 |
* @param cnf The recoll config
|
193 |
* @param doc Doc data taken from the index. We use it to access the
|
220 |
* @param doc Doc data taken from the index. We use it to construct a
|
194 |
* actual document (ie: use mtype, fn, ipath...).
|
221 |
* FileInterner object.
|
195 |
*/
|
222 |
*/
|
196 |
static bool idocToFile(TempFile& temp, const string& tofile,
|
223 |
static bool idocToFile(TempFile& temp, const string& tofile,
|
197 |
RclConfig *cnf, const Rcl::Doc& doc);
|
224 |
RclConfig *cnf, const Rcl::Doc& doc);
|
198 |
|
225 |
|
199 |
/**
|
226 |
/**
|
|
... |
|
... |
207 |
* by the TempFile status (!isNull())
|
234 |
* by the TempFile status (!isNull())
|
208 |
*/
|
235 |
*/
|
209 |
static bool maybeUncompressToTemp(TempFile& temp, const string& fn,
|
236 |
static bool maybeUncompressToTemp(TempFile& temp, const string& fn,
|
210 |
RclConfig *cnf, const Rcl::Doc& doc);
|
237 |
RclConfig *cnf, const Rcl::Doc& doc);
|
211 |
|
238 |
|
212 |
const string& getReason() const {return m_reason;}
|
|
|
213 |
static void getMissingExternal(FIMissingStore *st, string& missing);
|
239 |
static void getMissingExternal(FIMissingStore *st, string& missing);
|
214 |
static void getMissingDescription(FIMissingStore *st, string& desc);
|
240 |
static void getMissingDescription(FIMissingStore *st, string& desc);
|
215 |
// Parse "missing" file contents into memory struct
|
241 |
// Parse "missing" file contents into memory struct
|
216 |
static void getMissingFromDescription(FIMissingStore *st, const string& desc);
|
242 |
static void getMissingFromDescription(FIMissingStore *st, const string& desc);
|
217 |
bool ok() {return m_ok;}
|
|
|
218 |
|
243 |
|
219 |
private:
|
244 |
private:
|
220 |
static const unsigned int MAXHANDLERS = 20;
|
245 |
static const unsigned int MAXHANDLERS = 20;
|
221 |
RclConfig *m_cfg;
|
246 |
RclConfig *m_cfg;
|
222 |
string m_fn;
|
247 |
string m_fn;
|