|
a/src/rcldb/rcldoc.h |
|
b/src/rcldb/rcldoc.h |
|
... |
|
... |
17 |
#ifndef _RCLDOC_H_INCLUDED_
|
17 |
#ifndef _RCLDOC_H_INCLUDED_
|
18 |
#define _RCLDOC_H_INCLUDED_
|
18 |
#define _RCLDOC_H_INCLUDED_
|
19 |
|
19 |
|
20 |
#include <string>
|
20 |
#include <string>
|
21 |
#include <map>
|
21 |
#include <map>
|
22 |
using std::string;
|
22 |
#include <vector>
|
23 |
using std::map;
|
|
|
24 |
|
23 |
|
25 |
#include "smallut.h"
|
24 |
#include "smallut.h"
|
26 |
|
25 |
|
27 |
namespace Rcl {
|
26 |
namespace Rcl {
|
28 |
|
27 |
|
|
... |
|
... |
44 |
// We indicate the routine that sets them up during indexing
|
43 |
// We indicate the routine that sets them up during indexing
|
45 |
|
44 |
|
46 |
// Binary or url-encoded url. No transcoding: this is used to access files
|
45 |
// Binary or url-encoded url. No transcoding: this is used to access files
|
47 |
// Index: computed by Db::add caller.
|
46 |
// Index: computed by Db::add caller.
|
48 |
// Query: from doc data.
|
47 |
// Query: from doc data.
|
49 |
string url;
|
48 |
std::string url;
|
50 |
|
49 |
|
51 |
// When we do path translation for documents from external indexes, we
|
50 |
// When we do path translation for documents from external indexes, we
|
52 |
// save the original path:
|
51 |
// save the original path:
|
53 |
string idxurl;
|
52 |
std::string idxurl;
|
54 |
// And the originating db. 0 is base, 1 first external etc.
|
53 |
// And the originating db. 0 is base, 1 first external etc.
|
55 |
int idxi;
|
54 |
int idxi;
|
56 |
|
55 |
|
57 |
// Internal path for multi-doc files. Ascii
|
56 |
// Internal path for multi-doc files. Ascii
|
58 |
// Set by FsIndexer::processone
|
57 |
// Set by FsIndexer::processone
|
59 |
string ipath;
|
58 |
std::string ipath;
|
60 |
|
59 |
|
61 |
// Mime type. Set by FileInterner::internfile
|
60 |
// Mime type. Set by FileInterner::internfile
|
62 |
string mimetype;
|
61 |
std::string mimetype;
|
63 |
|
62 |
|
64 |
// File modification time as decimal ascii unix time
|
63 |
// File modification time as decimal ascii unix time
|
65 |
// Set by FsIndexer::processone
|
64 |
// Set by FsIndexer::processone
|
66 |
string fmtime;
|
65 |
std::string fmtime;
|
67 |
|
66 |
|
68 |
// Data reference date (same format), e.g. the mail date
|
67 |
// Data reference date (same format), e.g. the mail date
|
69 |
// Possibly set by mimetype-specific handler
|
68 |
// Possibly set by mimetype-specific handler
|
70 |
// Filter::metaData["modificationdate"]
|
69 |
// Filter::metaData["modificationdate"]
|
71 |
string dmtime;
|
70 |
std::string dmtime;
|
72 |
|
71 |
|
73 |
// Charset we transcoded the 'text' field from (in case we want to convert back)
|
72 |
// Charset we transcoded the 'text' field from (in case we want to convert back)
|
74 |
// Possibly set by handler
|
73 |
// Possibly set by handler
|
75 |
string origcharset;
|
74 |
std::string origcharset;
|
76 |
|
75 |
|
77 |
// A map for textual metadata like author, keywords, abstract,
|
76 |
// A map for textual metadata like author, keywords, abstract,
|
78 |
// title. The entries are possibly set by the mimetype-specific
|
77 |
// title. The entries are possibly set by the mimetype-specific
|
79 |
// handler. If a fieldname-to-prefix translation exists, the
|
78 |
// handler. If a fieldname-to-prefix translation exists, the
|
80 |
// terms in the value will be indexed with a prefix.
|
79 |
// terms in the value will be indexed with a prefix.
|
81 |
// Only some predefined fields are stored in the data record:
|
80 |
// Only some predefined fields are stored in the data record:
|
82 |
// "title", "keywords", "abstract", "author", but if a field name is
|
81 |
// "title", "keywords", "abstract", "author", but if a field name is
|
83 |
// in the "stored" configuration list, it will be stored too.
|
82 |
// in the "stored" configuration list, it will be stored too.
|
84 |
map<string, string> meta;
|
83 |
std::map<std::string, std::string> meta;
|
85 |
|
84 |
|
86 |
// Attribute for the "abstract" entry. true if it is just the top
|
85 |
// Attribute for the "abstract" entry. true if it is just the top
|
87 |
// of doc, not a native document attribute. Not stored directly, but
|
86 |
// of doc, not a native document attribute. Not stored directly, but
|
88 |
// as an indicative prefix at the beginning of the abstract (ugly hack)
|
87 |
// as an indicative prefix at the beginning of the abstract (ugly hack)
|
89 |
bool syntabs;
|
88 |
bool syntabs;
|
90 |
|
89 |
|
91 |
// File size. This is the size of the compressed file or of the
|
90 |
// File size. This is the size of the compressed file or of the
|
92 |
// external containing archive.
|
91 |
// external containing archive.
|
93 |
// Index: Set by caller prior to Db::Add.
|
92 |
// Index: Set by caller prior to Db::Add.
|
94 |
// Query: Set from data record
|
93 |
// Query: Set from data record
|
95 |
string pcbytes;
|
94 |
std::string pcbytes;
|
96 |
|
95 |
|
97 |
// Document size, ie, size of the .odt or .xls.
|
96 |
// Document size, ie, size of the .odt or .xls.
|
98 |
// Index: Set in internfile from the filter stack
|
97 |
// Index: Set in internfile from the filter stack
|
99 |
// Query: set from data record
|
98 |
// Query: set from data record
|
100 |
string fbytes;
|
99 |
std::string fbytes;
|
101 |
|
100 |
|
102 |
// Doc text size.
|
101 |
// Doc text size.
|
103 |
// Index: from text.length().
|
102 |
// Index: from text.length().
|
104 |
// Query: set by rcldb from index data record
|
103 |
// Query: set by rcldb from index data record
|
105 |
string dbytes;
|
104 |
std::string dbytes;
|
106 |
|
105 |
|
107 |
// Doc signature. Used for up to date checks.
|
106 |
// Doc signature. Used for up to date checks.
|
108 |
// Index: set by Db::Add caller. Query: set from doc data.
|
107 |
// Index: set by Db::Add caller. Query: set from doc data.
|
109 |
// This is opaque to rcldb, and could just as well be ctime, size,
|
108 |
// This is opaque to rcldb, and could just as well be ctime, size,
|
110 |
// ctime+size, md5, whatever.
|
109 |
// ctime+size, md5, whatever.
|
111 |
string sig;
|
110 |
std::string sig;
|
112 |
|
111 |
|
113 |
/////////////////////////////////////////////////
|
112 |
/////////////////////////////////////////////////
|
114 |
// The following fields don't go to the db record, so they can't
|
113 |
// The following fields don't go to the db record, so they can't
|
115 |
// be retrieved at query time
|
114 |
// be retrieved at query time
|
116 |
|
115 |
|
117 |
// Main document text. This is plaintext utf-8 text to be split
|
116 |
// Main document text. This is plaintext utf-8 text to be split
|
118 |
// and indexed
|
117 |
// and indexed
|
119 |
string text;
|
118 |
std::string text;
|
120 |
|
119 |
|
121 |
/////////////////////////////////////////////////
|
120 |
/////////////////////////////////////////////////
|
122 |
// Misc stuff
|
121 |
// Misc stuff
|
123 |
|
122 |
|
124 |
int pc; // relevancy percentage, used by sortseq, convenience
|
123 |
int pc; // relevancy percentage, used by sortseq, convenience
|
|
... |
|
... |
168 |
Doc()
|
167 |
Doc()
|
169 |
: idxi(0), syntabs(false), pc(0), xdocid(0),
|
168 |
: idxi(0), syntabs(false), pc(0), xdocid(0),
|
170 |
haspages(false), haschildren(false), onlyxattr(false) {
|
169 |
haspages(false), haschildren(false), onlyxattr(false) {
|
171 |
}
|
170 |
}
|
172 |
/** Get value for named field. If value pointer is 0, just test existence */
|
171 |
/** Get value for named field. If value pointer is 0, just test existence */
|
173 |
bool getmeta(const string& nm, string *value = 0) const
|
172 |
bool getmeta(const std::string& nm, std::string *value = 0) const
|
174 |
{
|
173 |
{
|
175 |
map<string,string>::const_iterator it = meta.find(nm);
|
174 |
const auto it = meta.find(nm);
|
176 |
if (it != meta.end()) {
|
175 |
if (it != meta.end()) {
|
177 |
if (value)
|
176 |
if (value)
|
178 |
*value = it->second;
|
177 |
*value = it->second;
|
179 |
return true;
|
178 |
return true;
|
180 |
} else {
|
179 |
} else {
|
181 |
return false;
|
180 |
return false;
|
182 |
}
|
181 |
}
|
183 |
}
|
182 |
}
|
184 |
/** Nocopy getvalue. sets pointer to entry value if exists */
|
183 |
/** Nocopy getvalue. sets pointer to entry value if exists */
|
185 |
bool peekmeta(const string& nm, const string **value = 0) const
|
184 |
bool peekmeta(const std::string& nm, const std::string **value = 0) const
|
186 |
{
|
185 |
{
|
187 |
map<string,string>::const_iterator it = meta.find(nm);
|
186 |
const auto it = meta.find(nm);
|
188 |
if (it != meta.end()) {
|
187 |
if (it != meta.end()) {
|
189 |
if (value)
|
188 |
if (value)
|
190 |
*value = &(it->second);
|
189 |
*value = &(it->second);
|
191 |
return true;
|
190 |
return true;
|
192 |
} else {
|
191 |
} else {
|
193 |
return false;
|
192 |
return false;
|
194 |
}
|
193 |
}
|
195 |
}
|
194 |
}
|
196 |
|
195 |
|
197 |
// Create entry or append text to existing entry.
|
196 |
// Create entry or append text to existing entry.
|
198 |
bool addmeta(const string& nm, const string& value)
|
197 |
bool addmeta(const std::string& nm, const std::string& value)
|
199 |
{
|
198 |
{
|
200 |
map<string,string>::iterator mit = meta.find(nm);
|
199 |
auto mit = meta.find(nm);
|
201 |
if (mit == meta.end()) {
|
200 |
if (mit == meta.end()) {
|
202 |
meta[nm] = value;
|
201 |
meta[nm] = value;
|
203 |
} else if (mit->second.empty()) {
|
202 |
} else if (mit->second.empty()) {
|
204 |
mit->second = value;
|
203 |
mit->second = value;
|
205 |
} else {
|
204 |
} else {
|
206 |
// It may happen that the same attr exists several times
|
205 |
// It may happen that the same attr exists several times
|
207 |
// in the internfile stack. Avoid duplicating values.
|
206 |
// in the internfile stack. Avoid duplicating values.
|
208 |
if (mit->second != value)
|
207 |
if (mit->second != value)
|
209 |
mit->second += string(" - ") + value;
|
208 |
mit->second += std::string(" - ") + value;
|
210 |
}
|
209 |
}
|
211 |
return true;
|
210 |
return true;
|
212 |
}
|
211 |
}
|
213 |
|
212 |
|
214 |
/* Is this document stored as a regular filesystem file ?
|
213 |
/* Is this document stored as a regular filesystem file ?
|
215 |
* (as opposed to e.g. a webcache file), not a subdoc,
|
214 |
* (as opposed to e.g. a webcache file), not a subdoc,
|
216 |
*/
|
215 |
*/
|
217 |
bool isFsFile() {
|
216 |
bool isFsFile() {
|
218 |
string backend;
|
217 |
std::string backend;
|
219 |
getmeta(keybcknd, &backend);
|
218 |
getmeta(keybcknd, &backend);
|
220 |
if (!backend.empty() && backend.compare("FS"))
|
219 |
if (!backend.empty() && backend.compare("FS"))
|
221 |
return false;
|
220 |
return false;
|
222 |
return true;
|
221 |
return true;
|
223 |
}
|
222 |
}
|
|
... |
|
... |
230 |
// fields in the meta array, these are the names used). Defined in
|
229 |
// fields in the meta array, these are the names used). Defined in
|
231 |
// rcldoc.cpp. Fields stored in the meta[] array (ie, title,
|
230 |
// rcldoc.cpp. Fields stored in the meta[] array (ie, title,
|
232 |
// author), _must_ use these canonical values, not aliases. This is
|
231 |
// author), _must_ use these canonical values, not aliases. This is
|
233 |
// enforced in internfile.cpp and misc other bits of metadata-gathering
|
232 |
// enforced in internfile.cpp and misc other bits of metadata-gathering
|
234 |
// code
|
233 |
// code
|
235 |
static const string keyurl; // url
|
234 |
static const std::string keyurl; // url
|
236 |
// childurl. This is set when working with the parent of the result, to hold
|
235 |
// childurl. This is set when working with the parent of the result, to hold
|
237 |
// the child of interest url, typically to highlight a directory entry
|
236 |
// the child of interest url, typically to highlight a directory entry
|
238 |
static const string keychildurl;
|
237 |
static const std::string keychildurl;
|
239 |
// file name. This is set for filesystem-level containers or
|
238 |
// file name. This is set for filesystem-level containers or
|
240 |
// documents, and not inherited by subdocuments (which can get a
|
239 |
// documents, and not inherited by subdocuments (which can get a
|
241 |
// keyfn anyway from, e.g, an attachment filename value). Subdocs
|
240 |
// keyfn anyway from, e.g, an attachment filename value). Subdocs
|
242 |
// used to inherit the file name, but this was undesirable (you
|
241 |
// used to inherit the file name, but this was undesirable (you
|
243 |
// usually don't want to see all subdocs when searching for the
|
242 |
// usually don't want to see all subdocs when searching for the
|
244 |
// file name). Instead the container file name is now set in the
|
243 |
// file name). Instead the container file name is now set in the
|
245 |
// document record but not indexed (see next entry).
|
244 |
// document record but not indexed (see next entry).
|
246 |
static const string keyfn;
|
245 |
static const std::string keyfn;
|
247 |
// Container file name. This is set for all subdocuments of a
|
246 |
// Container file name. This is set for all subdocuments of a
|
248 |
// given top level container. It is not indexed by default but
|
247 |
// given top level container. It is not indexed by default but
|
249 |
// stored in the document record keyfn field if this is still
|
248 |
// stored in the document record keyfn field if this is still
|
250 |
// empty when we create it, for display purposes.
|
249 |
// empty when we create it, for display purposes.
|
251 |
static const string keytcfn;
|
250 |
static const std::string keytcfn;
|
252 |
static const string keyipt; // ipath
|
251 |
static const std::string keyipt; // ipath
|
253 |
static const string keytp; // mime type
|
252 |
static const std::string keytp; // mime type
|
254 |
static const string keyfmt; // file mtime
|
253 |
static const std::string keyfmt; // file mtime
|
255 |
static const string keydmt; // document mtime
|
254 |
static const std::string keydmt; // document mtime
|
256 |
static const string keymt; // mtime dmtime if set else fmtime
|
255 |
static const std::string keymt; // mtime dmtime if set else fmtime
|
257 |
static const string keyoc; // original charset
|
256 |
static const std::string keyoc; // original charset
|
258 |
static const string keypcs; // document outer container size
|
257 |
static const std::string keypcs; // document outer container size
|
259 |
static const string keyfs; // document size
|
258 |
static const std::string keyfs; // document size
|
260 |
static const string keyds; // document text size
|
259 |
static const std::string keyds; // document text size
|
261 |
static const string keysz; // dbytes if set else fbytes else pcbytes
|
260 |
static const std::string keysz; // dbytes if set else fbytes else pcbytes
|
262 |
static const string keysig; // sig
|
261 |
static const std::string keysig; // sig
|
263 |
static const string keyrr; // relevancy rating
|
262 |
static const std::string keyrr; // relevancy rating
|
264 |
static const string keycc; // Collapse count
|
263 |
static const std::string keycc; // Collapse count
|
265 |
static const string keyabs; // abstract
|
264 |
static const std::string keyabs; // abstract
|
266 |
static const string keyau; // author
|
265 |
static const std::string keyau; // author
|
267 |
static const string keytt; // title
|
266 |
static const std::string keytt; // title
|
268 |
static const string keykw; // keywords
|
267 |
static const std::string keykw; // keywords
|
269 |
static const string keymd5; // file md5 checksum
|
268 |
static const std::string keymd5; // file md5 checksum
|
270 |
static const string keybcknd; // backend type for data not from the filesys
|
269 |
static const std::string keybcknd; // backend type for data not from the filesys
|
271 |
// udi back from index. Only set by Rcl::Query::getdoc().
|
270 |
// udi back from index. Only set by Rcl::Query::getdoc().
|
272 |
static const string keyudi;
|
271 |
static const std::string keyudi;
|
273 |
static const string keyapptg; // apptag. Set from localfields (fsindexer)
|
272 |
static const std::string keyapptg; // apptag. Set from localfields (fsindexer)
|
274 |
static const string keybght; // beagle hit type ("beagleHitType")
|
273 |
static const std::string keybght; // beagle hit type ("beagleHitType")
|
275 |
};
|
274 |
};
|
276 |
|
275 |
|
|
|
276 |
extern bool docsToPaths(std::vector<Doc> &docs,std::vector<std::string> &paths);
|
277 |
|
277 |
|
278 |
}
|
278 |
}
|
279 |
|
279 |
|
280 |
#endif /* _RCLDOC_H_INCLUDED_ */
|
280 |
#endif /* _RCLDOC_H_INCLUDED_ */
|