Parent: [a4498c] (diff)

Child: [7d3048] (diff)

Download this file

rcldoc.h    151 lines (127 with data), 5.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _RCLDOC_H_INCLUDED_
#define _RCLDOC_H_INCLUDED_
/* @(#$Id: rcldoc.h,v 1.9 2008-09-08 16:49:10 dockes Exp $ (C) 2006 J.F.Dockes */
#include <string>
#include <map>
#ifndef NO_NAMESPACES
using std::string;
using std::map;
namespace Rcl {
#endif
/**
* Dumb holder for document attributes and data.
*
* This is used both for indexing, where fields are filled-up by the
* indexer prior to adding to the index, and for querying, where
* fields are filled from data stored in the index. Not all fields are
* in use at both index and query times, and not all field data is
* stored at index time (for example the "text" field is split and
* indexed, but not stored as such)
*/
class Doc {
public:
////////////////////////////////////////////////////////////
// The following fields are stored into the document data record (so they
// can be accessed after a query without fetching the actual document).
// We indicate the routine that sets them up during indexing
// This is just "file://" + binary filename. No transcoding: this
// is used to access files
// Index: computed from fn by Db::add caller. Query: from doc data.
string url;
// Transcoded version of the simple file name for SFN-prefixed
// specific file name indexation
// Index: set by DbIndexer::processone
string utf8fn;
// Internal path for multi-doc files. Ascii
// Set by DbIndexer::processone
string ipath;
// Mime type. Set by FileInterner::internfile
string mimetype;
// File modification time as decimal ascii unix time
// Set by DbIndexer::processone
string fmtime;
// Data reference date (same format). Ie: mail date
// Possibly set by mimetype-specific handler
string dmtime;
// Charset we transcoded the 'text' field from (in case we want back)
// Possibly set by handler
string origcharset;
// A map for textual metadata like, author, keywords, abstract,
// title. The entries are possibly set by the mimetype-specific
// handler. If a fieldname-to-prefix translation exists, the
// terms in the value will be indexed with a prefix.
// Only some predefined fields are stored in the data record:
// "title", "keywords", "abstract", "author", but if a field name is
// in the "stored" configuration list, it will be stored too.
map<string, string> meta;
// Attribute for the "abstract" entry. true if it is just the top
// of doc, not a native document attribute. Not stored directly, but
// as an indicative prefix at the beginning of the abstract (ugly hack)
bool syntabs;
// File size. Index: Set by caller prior to Db::Add. Query: set by
// rcldb from index doc data. Historically this always has
// represented the whole file size (as from stat()), but there
// would be a need for a 3rd value for multidoc files (file
// size/doc size/ doc text size)
string fbytes;
// Doc text size. Index: from text.length(). Query: set by rcldb from
// index doc data.
string dbytes;
// Doc signature. Used for up to date checks.
// Index: set by Db::Add caller. Query: set from doc data.
// This is opaque to rcldb, and could just as well be ctime, size,
// ctime+size, md5, whatever.
string sig;
/////////////////////////////////////////////////
// The following fields don't go to the db record, so they can't
// be retrieved at query time
// Main document text. This is plaintext utf-8 text to be split
// and indexed
string text;
int pc; // relevancy percentage, used by sortseq, convenience
unsigned long xdocid; // Opaque: rcldb doc identifier.
///////////////////////////////////////////////////////////////////
void erase() {
url.erase();
utf8fn.erase();
ipath.erase();
mimetype.erase();
fmtime.erase();
dmtime.erase();
origcharset.erase();
meta.clear();
syntabs = false;
fbytes.erase();
dbytes.erase();
sig.erase();
text.erase();
pc = 0;
xdocid = 0;
}
static const string keyfn;
static const string keyrr;
static const string keyabs;
static const string keyau;
static const string keytt;
static const string keykw;
};
#ifndef NO_NAMESPACES
}
#endif
#endif /* _RCLDOC_H_INCLUDED_ */