Child: [9244e3] (diff)

Download this file

Filter.h    208 lines (174 with data), 7.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Library General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _DIJON_FILTER_H
#define _DIJON_FILTER_H
#include <stdint.h>
#include <string>
#include <set>
#include <map>
class RclConfig;
namespace Dijon
{
class Filter;
/** Provides the list of MIME types supported by the filter(s).
* The function receives a std::set<std::string> by reference and
* returns a bool; no raw character buffer is involved.
* NOTE(review): presumably the set is filled with the supported MIME
* types and the return value reports success - confirm against an
* actual filter library.
* This function is exported by dynamically loaded filter libraries.
*/
typedef bool (get_filter_types_func)(std::set<std::string> &);
/** Checks what data should be passed to the filter(s).
* The function takes an int and returns a bool.
* NOTE(review): presumably the argument is a Filter::DataInput value
* cast to int for convenience, and the result tells whether that kind
* of input is accepted - confirm against an actual filter library.
* This function is exported by dynamically loaded filter libraries.
* The aim is to let the client application know before-hand whether
* it should load documents or not.
*/
typedef bool (check_filter_data_input_func)(int);
/** Returns a Filter that handles the given MIME type.
* The single argument is the MIME type, passed as a std::string.
* The Filter object is allocated with new.
* NOTE(review): the caller presumably owns the returned object and is
* responsible for deleting it - confirm against the calling code.
* This function is exported by dynamically loaded filter libraries
* and serves as a factory for Filter objects, so that the client
* application doesn't have to know which Filter sub-types handle
* which MIME types.
*/
typedef Filter *(get_filter_func)(const std::string &);
/// Filter interface: abstract base class for document filters.
class Filter
{
public:
    // Enumerations.

    /** Kinds of input a filter may accept.
     * A document can be handed over as a raw data buffer, as a string,
     * by file name, or by URI (see the set_document_XXX() methods).
     */
    enum DataInput {
        DOCUMENT_DATA = 0,
        DOCUMENT_STRING,
        DOCUMENT_FILE_NAME,
        DOCUMENT_URI
    };

    /** Input properties supported by the filter.
     *
     * - DEFAULT_CHARSET: source encoding to use for reading/transcoding
     *   the original data when it cannot be determined any other way
     *   (ie: for text/plain files).
     * - OPERATING_MODE: can be set to either view or index.
     * - DJF_UDI: unique document identifier. Useful if the filter wants
     *   to manage a persistent cache.
     */
    enum Properties {
        DEFAULT_CHARSET = 0,
        OPERATING_MODE,
        DJF_UDI
    };

    // Construction / destruction.

    /// Builds a filter with an empty MIME type and no metadata.
    Filter() {}

    /// Destroys the filter.
    virtual ~Filter() {}

    /// Hands the configuration object over to the filter.
    virtual void setConfig(RclConfig *) = 0;

    // Information.

    /// Returns (a copy of) the MIME type handled by the filter.
    std::string get_mime_type() const
    {
        return m_mimeType;
    }

    /// Tells whether the filter accepts the given kind of input.
    virtual bool is_data_input_ok(DataInput input) const = 0;

    // Initialization.

    /** Sets a property; to be called before set_document_XXX().
     * Returns false if the property is not supported.
     */
    virtual bool set_property(Properties prop_name,
                              const std::string &prop_value) = 0;

    /** (Re)initializes the filter with the given data buffer.
     * The caller should keep the pointer valid until the Filter object
     * is destroyed, as some filters may not make a deep copy of the data.
     * Call next_document() to position the filter onto the first document.
     * Returns false if this input is not supported or an error occurred.
     */
    virtual bool set_document_data(const std::string& mtype,
                                   const char *data_ptr,
                                   size_t data_length) = 0;

    /** (Re)initializes the filter with the given string data.
     * Call next_document() to position the filter onto the first document.
     * Returns false if this input is not supported or an error occurred.
     */
    virtual bool set_document_string(const std::string& mtype,
                                     const std::string &data_str) = 0;

    /** (Re)initializes the filter with the given file.
     * Call next_document() to position the filter onto the first document.
     * Returns false if this input is not supported or an error occurred.
     */
    virtual bool set_document_file(const std::string& mtype,
                                   const std::string &file_path) = 0;

    /** (Re)initializes the filter with the given URI.
     * Call next_document() to position the filter onto the first document.
     * Returns false if this input is not supported or an error occurred.
     */
    virtual bool set_document_uri(const std::string& mtype,
                                  const std::string &uri) = 0;

    /** Sets the document size metadata element.
     * This is the size of the immediate containing file (ie, a .doc,
     * a .odt), not the size of, ie, a containing archive or .gz, nor
     * the size of the extracted text. It is set externally because the
     * surrounding code quite often has a better idea about it (having
     * created a temp file, etc.), which also saves stat() calls.
     * The value is meant to be stored in the metadata under the
     * docsize key.
     */
    virtual void set_docsize(int64_t size) = 0;

    // Going from one nested document to the next.

    /** Returns true if there are nested documents left to extract.
     * Returns false if the end of the parent document was reached
     * or an error occurred.
     */
    virtual bool has_documents() const = 0;

    /** Moves to the next nested document.
     * Returns false if there are none left.
     */
    virtual bool next_document() = 0;

    /** Skips to the nested document with the given ipath.
     * Returns false if no such document exists.
     */
    virtual bool skip_to_document(const std::string &ipath) = 0;

    // Accessing documents' contents.

    /// Returns the message for the most recent error that has occurred.
    virtual std::string get_error() const = 0;

    /** Returns the dictionary of metadata extracted from the current
     * document, as a reference to the filter's own map.
     * Metadata fields may include one or more of the following:
     * content, title, ipath, mimetype, language, charset, author, creator,
     * publisher, modificationdate, creationdate, size
     * Special considerations apply:
     * - content may contain binary data, watch out!
     * - ipath is an internal path to the nested document that can later
     *   be passed to skip_to_document(). It may be empty if the parent
     *   document's type doesn't allow embedding, in which case the
     *   filter should only return one document.
     * - mimetype should be text/plain if the document could be handled
     *   internally, empty if unknown. For any other value, the client
     *   application is expected to pass the nested document's content
     *   to another filter that supports this particular type.
     */
    virtual const std::map<std::string, std::string> &get_meta_data() const
    {
        return m_metaData;
    }

    /// Empties the metadata dictionary.
    virtual void clear()
    {
        m_metaData.clear();
    }

    /// Returns false in this base implementation; subclasses may override.
    virtual bool is_unknown()
    {
        return false;
    }

protected:
    /// The MIME type handled by the filter.
    std::string m_mimeType;

    /// Metadata dictionary.
    std::map<std::string, std::string> m_metaData;

private:
    // Filter objects cannot be copied: both operations are declared
    // private and no definition is given here.
    Filter(const Filter &other);
    Filter& operator=(const Filter& other);
};
}
#endif // _DIJON_FILTER_H