a/src/rcldb/rcldoc.h b/src/rcldb/rcldoc.h
...
...
17
#ifndef _RCLDOC_H_INCLUDED_
17
#ifndef _RCLDOC_H_INCLUDED_
18
#define _RCLDOC_H_INCLUDED_
18
#define _RCLDOC_H_INCLUDED_
19
19
20
#include <string>
20
#include <string>
21
#include <map>
21
#include <map>
22
using std::string;
22
#include <vector>
23
using std::map;
24
23
25
#include "smallut.h"
24
#include "smallut.h"
26
25
27
namespace Rcl {
26
namespace Rcl {
28
27
...
...
44
    // We indicate the routine that sets them up during indexing
43
    // We indicate the routine that sets them up during indexing
45
    
44
    
46
    // Binary or url-encoded url. No transcoding: this is used to access files
    // Index: computed by Db::add caller.
    // Query: from doc data.
    std::string url;

    // When we do path translation for documents from external indexes, we
    // save the original path:
    std::string idxurl;
    // And the originating db. 0 is base, 1 first external etc.
    int idxi;

    // Internal path for multi-doc files. Ascii
    // Set by FsIndexer::processone
    std::string ipath;

    // Mime type. Set by FileInterner::internfile
    std::string mimetype;

    // File modification time as decimal ascii unix time
    // Set by FsIndexer::processone
    std::string fmtime;

    // Data reference date (same format as fmtime), e.g. mail date.
    // Possibly set by mimetype-specific handler
    // Filter::metaData["modificationdate"]
    std::string dmtime;

    // Charset we transcoded the 'text' field from (in case we want back)
    // Possibly set by handler
    std::string origcharset;

    // A map for textual metadata like, author, keywords, abstract,
    // title.  The entries are possibly set by the mimetype-specific
    // handler. If a fieldname-to-prefix translation exists, the
    // terms in the value will be indexed with a prefix.
    // Only some predefined fields are stored in the data record:
    // "title", "keywords", "abstract", "author", but if a field name is
    // in the "stored" configuration list, it will be stored too.
    std::map<std::string, std::string> meta;

    // Attribute for the "abstract" entry. true if it is just the top
    // of doc, not a native document attribute. Not stored directly, but
    // as an indicative prefix at the beginning of the abstract (ugly hack)
    bool syntabs;

    // File size. This is the size of the compressed file or of the
    // external containing archive.
    // Index: Set by caller prior to Db::Add.
    // Query: Set from data record
    std::string pcbytes;

    // Document size, ie, size of the .odt or .xls.
    // Index: Set in internfile from the filter stack
    // Query: set from data record
    std::string fbytes;

    // Doc text size.
    // Index: from text.length().
    // Query: set by rcldb from index data record
    std::string dbytes;

    // Doc signature. Used for up to date checks.
    // Index: set by Db::Add caller. Query: set from doc data.
    // This is opaque to rcldb, and could just as well be ctime, size,
    // ctime+size, md5, whatever.
    std::string sig;

    /////////////////////////////////////////////////
    // The following fields don't go to the db record, so they can't
    // be retrieved at query time

    // Main document text. This is plaintext utf-8 text to be split
    // and indexed
    std::string text;

    /////////////////////////////////////////////////
    // Misc stuff

    int pc; // relevancy percentage, used by sortseq, convenience
...
...
168
    Doc()
167
    Doc()
169
    : idxi(0), syntabs(false), pc(0), xdocid(0),
168
    : idxi(0), syntabs(false), pc(0), xdocid(0),
170
      haspages(false), haschildren(false), onlyxattr(false) {
169
      haspages(false), haschildren(false), onlyxattr(false) {
171
    }
170
    }
172
    /** Get value for named field. If value pointer is 0, just test existence */
171
    /** Get value for named field. If value pointer is 0, just test existence */
173
    bool getmeta(const string& nm, string *value = 0) const
172
    bool getmeta(const std::string& nm, std::string *value = 0) const
174
    {
173
    {
175
  map<string,string>::const_iterator it = meta.find(nm);
174
  const auto it = meta.find(nm);
176
    if (it != meta.end()) {
175
    if (it != meta.end()) {
177
        if (value)
176
        if (value)
178
        *value = it->second;
177
        *value = it->second;
179
        return true;
178
        return true;
180
    } else {
179
    } else {
181
        return false;
180
        return false;
182
    }
181
    }
183
    }
182
    }
184
    /** Nocopy getvalue. sets pointer to entry value if exists */
183
    /** Nocopy getvalue. sets pointer to entry value if exists */
185
    bool peekmeta(const string& nm, const string **value = 0) const
184
    bool peekmeta(const std::string& nm, const std::string **value = 0) const
186
    {
185
    {
187
  map<string,string>::const_iterator it = meta.find(nm);
186
  const auto it = meta.find(nm);
188
    if (it != meta.end()) {
187
    if (it != meta.end()) {
189
        if (value)
188
        if (value)
190
        *value = &(it->second);
189
        *value = &(it->second);
191
        return true;
190
        return true;
192
    } else {
191
    } else {
193
        return false;
192
        return false;
194
    }
193
    }
195
    }
194
    }
196
195
197
    // Create entry or append text to existing entry.
196
    // Create entry or append text to existing entry.
198
    bool addmeta(const string& nm, const string& value) 
197
    bool addmeta(const std::string& nm, const std::string& value) 
199
    {
198
    {
200
  map<string,string>::iterator mit = meta.find(nm);
199
  auto mit = meta.find(nm);
201
    if (mit == meta.end()) {
200
    if (mit == meta.end()) {
202
        meta[nm] = value;
201
        meta[nm] = value;
203
    } else if (mit->second.empty()) {
202
    } else if (mit->second.empty()) {
204
        mit->second = value;
203
        mit->second = value;
205
    } else {
204
    } else {
206
        // It may happen that the same attr exists several times
205
        // It may happen that the same attr exists several times
207
        // in the internfile stack. Avoid duplicating values.
206
        // in the internfile stack. Avoid duplicating values.
208
        if (mit->second != value)
207
        if (mit->second != value)
209
        mit->second += string(" - ") + value;
208
        mit->second += std::string(" - ") + value;
210
    }
209
    }
211
    return true;
210
    return true;
212
    }
211
    }
213
212
214
    /* Is this document stored as a regular filesystem file ?
213
    /* Is this document stored as a regular filesystem file ?
215
     * (as opposed to e.g. a webcache file), not a subdoc, 
214
     * (as opposed to e.g. a webcache file), not a subdoc, 
216
     */
215
     */
217
    bool isFsFile() {
216
    bool isFsFile() {
218
        string backend;
217
        std::string backend;
219
        getmeta(keybcknd, &backend);
218
        getmeta(keybcknd, &backend);
220
        if (!backend.empty() && backend.compare("FS"))
219
        if (!backend.empty() && backend.compare("FS"))
221
            return false;
220
            return false;
222
        return true;
221
        return true;
223
    }
222
    }
...
...
230
    // fields in the meta array, these are the names used). Defined in
229
    // fields in the meta array, these are the names used). Defined in
231
    // rcldoc.cpp. Fields stored in the meta[] array (ie, title,
230
    // rcldoc.cpp. Fields stored in the meta[] array (ie, title,
232
    // author), _must_ use these canonical values, not aliases. This is 
231
    // author), _must_ use these canonical values, not aliases. This is 
233
    // enforced in internfile.cpp and misc other bits of metadata-gathering 
232
    // enforced in internfile.cpp and misc other bits of metadata-gathering 
234
    // code
233
    // code
235
    static const std::string keyurl; // url
    // childurl. This is set when working with the parent of the result, to hold
    // the child of interest url, typically to highlight a directory entry
    static const std::string keychildurl;
    // file name. This is set for filesystem-level containers or
    // documents, and not inherited by subdocuments (which can get a
    // keyfn anyway from, e.g, an attachment filename value).  Subdocs
    // used to inherit the file name, but this was undesirable (you
    // usually don't want to see all subdocs when searching for the
    // file name). Instead the container file name is now set in the
    // document record but not indexed (see next entry).
    static const std::string keyfn;
    // Container file name. This is set for all subdocuments of a
    // given top level container. It is not indexed by default but
    // stored in the document record keyfn field if this is still
    // empty when we create it, for display purposes.
    static const std::string keytcfn;
    static const std::string keyipt; // ipath
    static const std::string keytp;  // mime type
    static const std::string keyfmt; // file mtime
    static const std::string keydmt; // document mtime
    static const std::string keymt;  // mtime dmtime if set else fmtime
    static const std::string keyoc;  // original charset
    static const std::string keypcs; // document outer container size
    static const std::string keyfs;  // document size
    static const std::string keyds;  // document text size
    static const std::string keysz;  // dbytes if set else fbytes else pcbytes
    static const std::string keysig; // sig
    static const std::string keyrr;  // relevancy rating
    static const std::string keycc;  // Collapse count
    static const std::string keyabs; // abstract
    static const std::string keyau;  // author
    static const std::string keytt;  // title
    static const std::string keykw;  // keywords
    static const std::string keymd5; // file md5 checksum
    static const std::string keybcknd; // backend type for data not from the filesys
    // udi back from index. Only set by Rcl::Query::getdoc().
    static const std::string keyudi;
    static const std::string keyapptg; // apptag. Set from localfields (fsindexer)
    static const std::string keybght;  // beagle hit type ("beagleHitType")
275
};
274
};
276
275
276
// Declaration only; the definition is not visible in this header.
// NOTE(review): presumably fills `paths` from the entries of `docs` and
// returns a success flag -- confirm against the implementation.
extern bool docsToPaths(std::vector<Doc> &docs,std::vector<std::string> &paths);
277
277
278
}
278
}
279
279
280
#endif /* _RCLDOC_H_INCLUDED_ */
280
#endif /* _RCLDOC_H_INCLUDED_ */