|
a/src/index/beaglequeue.cpp |
|
b/src/index/beaglequeue.cpp |
|
... |
|
... |
45 |
|
45 |
|
46 |
#include <sys/stat.h>
|
46 |
#include <sys/stat.h>
|
47 |
|
47 |
|
48 |
const string keybght("beagleHitType");
|
48 |
const string keybght("beagleHitType");
|
49 |
|
49 |
|
50 |
#define LL 2048
|
|
|
51 |
|
50 |
|
|
|
51 |
// Beagle creates a file named .xxx (where xxx is the name for the main file
|
|
|
52 |
// in the queue), to hold external metadata (http or created by Beagle).
|
|
|
53 |
// This class reads the .xxx, dotfile, and turns it into an Rcl::Doc holder
|
52 |
class BeagleDotFile {
|
54 |
class BeagleDotFile {
|
53 |
public:
|
55 |
public:
|
54 |
BeagleDotFile(RclConfig *conf, const string& fn)
|
56 |
BeagleDotFile(RclConfig *conf, const string& fn)
|
55 |
: m_conf(conf), m_fn(fn)
|
57 |
: m_conf(conf), m_fn(fn)
|
56 |
{ }
|
58 |
{}
|
57 |
|
59 |
|
|
|
60 |
// Read input line, strip it of eol and return as c++ string
|
58 |
bool readLine(string& line)
|
61 |
bool readLine(string& line)
|
59 |
{
|
62 |
{
|
|
|
63 |
static const int LL = 2048;
|
60 |
char cline[LL];
|
64 |
char cline[LL];
|
61 |
cline[0] = 0;
|
65 |
cline[0] = 0;
|
62 |
m_input.getline(cline, LL-1);
|
66 |
m_input.getline(cline, LL-1);
|
63 |
if (!m_input.good()) {
|
67 |
if (!m_input.good()) {
|
64 |
if (m_input.bad()) {
|
68 |
if (m_input.bad()) {
|
|
... |
|
... |
99 |
doc.meta[keybght] = line;
|
103 |
doc.meta[keybght] = line;
|
100 |
if (!readLine(line))
|
104 |
if (!readLine(line))
|
101 |
return false;
|
105 |
return false;
|
102 |
doc.mimetype = line;
|
106 |
doc.mimetype = line;
|
103 |
|
107 |
|
104 |
// We set the bookmarks mtype as html, the text is empty
|
108 |
// We set the bookmarks mtype as html (the text is empty
|
105 |
// anyway, so that the html viewer will be called on 'Open'
|
109 |
// anyway), so that the html viewer will be called on 'Open'
|
106 |
bool isbookmark = false;
|
110 |
bool isbookmark = false;
|
107 |
if (!stringlowercmp("bookmark", doc.meta[keybght])) {
|
111 |
if (!stringlowercmp("bookmark", doc.meta[keybght])) {
|
108 |
isbookmark = true;
|
112 |
isbookmark = true;
|
109 |
doc.mimetype = "text/html";
|
113 |
doc.mimetype = "text/html";
|
110 |
}
|
114 |
}
|
|
... |
|
... |
148 |
string caname = m_conf->fieldCanon(*it);
|
152 |
string caname = m_conf->fieldCanon(*it);
|
149 |
doc.meta[caname].append(ss + *valuep);
|
153 |
doc.meta[caname].append(ss + *valuep);
|
150 |
}
|
154 |
}
|
151 |
|
155 |
|
152 |
// Finally build the confsimple that we will save to the
|
156 |
// Finally build the confsimple that we will save to the
|
153 |
// cache, out of document fields. This could also be done in
|
157 |
// cache, from the doc fields. This could also be done in
|
154 |
// parallel with the doc.meta build above, but simpler this way.
|
158 |
// parallel with the doc.meta build above, but simpler this
|
|
|
159 |
// way. We need it because not all interesting doc fields are
|
|
|
160 |
// in the meta array (e.g.: mimetype, url), and we want
|
|
|
161 |
// something homogeneous and easy to save.
|
155 |
for (map<string,string>::const_iterator it = doc.meta.begin();
|
162 |
for (map<string,string>::const_iterator it = doc.meta.begin();
|
156 |
it != doc.meta.end(); it++) {
|
163 |
it != doc.meta.end(); it++) {
|
157 |
m_fields.set((*it).first, (*it).second, "");
|
164 |
m_fields.set((*it).first, (*it).second, "");
|
158 |
}
|
165 |
}
|
159 |
m_fields.set("url", doc.url, "");
|
166 |
m_fields.set("url", doc.url, "");
|
|
... |
|
... |
167 |
string m_fn;
|
174 |
string m_fn;
|
168 |
ifstream m_input;
|
175 |
ifstream m_input;
|
169 |
};
|
176 |
};
|
170 |
|
177 |
|
171 |
const string badtmpdirname = "/no/such/dir/really/can/exist";
|
178 |
const string badtmpdirname = "/no/such/dir/really/can/exist";
|
|
|
179 |
|
|
|
180 |
// Initialize. Compute paths and create a temporary directory that will be
|
|
|
181 |
// used by internfile()
|
172 |
BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
182 |
BeagleQueueIndexer::BeagleQueueIndexer(RclConfig *cnf, Rcl::Db *db,
|
173 |
DbIxStatusUpdater *updfunc)
|
183 |
DbIxStatusUpdater *updfunc)
|
174 |
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
|
184 |
: m_config(cnf), m_db(db), m_cache(0), m_updater(updfunc),
|
175 |
m_nocacheindex(false)
|
185 |
m_nocacheindex(false)
|
176 |
{
|
186 |
{
|
|
... |
|
... |
214 |
}
|
224 |
}
|
215 |
}
|
225 |
}
|
216 |
deleteZ(m_cache);
|
226 |
deleteZ(m_cache);
|
217 |
}
|
227 |
}
|
218 |
|
228 |
|
|
|
229 |
// Read document from cache. Return the metadata as an Rcl::Doc
|
|
|
230 |
// @param htt Beagle Hit Type
|
219 |
bool BeagleQueueIndexer::getFromCache(const string& udi, Rcl::Doc &dotdoc,
|
231 |
bool BeagleQueueIndexer::getFromCache(const string& udi, Rcl::Doc &dotdoc,
|
220 |
string& data, string *htt)
|
232 |
string& data, string *htt)
|
221 |
{
|
233 |
{
|
222 |
string dict;
|
234 |
string dict;
|
223 |
|
235 |
|
|
... |
|
... |
241 |
cf.get(*it, dotdoc.meta[*it], "");
|
253 |
cf.get(*it, dotdoc.meta[*it], "");
|
242 |
}
|
254 |
}
|
243 |
return true;
|
255 |
return true;
|
244 |
}
|
256 |
}
|
245 |
|
257 |
|
|
|
258 |
// Index document stored in the cache.
|
246 |
bool BeagleQueueIndexer::indexFromCache(const string& udi)
|
259 |
bool BeagleQueueIndexer::indexFromCache(const string& udi)
|
247 |
{
|
260 |
{
|
248 |
if (!m_db)
|
261 |
if (!m_db)
|
249 |
return false;
|
262 |
return false;
|
250 |
|
263 |
|
|
... |
|
... |
302 |
|
315 |
|
303 |
bool BeagleQueueIndexer::index()
|
316 |
bool BeagleQueueIndexer::index()
|
304 |
{
|
317 |
{
|
305 |
if (!m_db)
|
318 |
if (!m_db)
|
306 |
return false;
|
319 |
return false;
|
307 |
LOGDEB(("BeagleQueueIndexer::processqueue: dir: [%s]\n",
|
320 |
LOGDEB(("BeagleQueueIndexer::processqueue: [%s]\n", m_queuedir.c_str()));
|
308 |
m_queuedir.c_str()));
|
|
|
309 |
m_config->setKeyDir(m_queuedir);
|
321 |
m_config->setKeyDir(m_queuedir);
|
310 |
|
322 |
|
311 |
// First check that files in the cache are in the index, in case this
|
323 |
// First check/index files found in the cache. If the index was reset,
|
|
|
324 |
// this actually does work, else it sets the existence flags (avoid
|
312 |
// has been reset. We don't do this when called from indexFiles
|
325 |
// purging). We don't do this when called from indexFiles
|
313 |
if (!m_nocacheindex) {
|
326 |
if (!m_nocacheindex) {
|
314 |
bool eof;
|
327 |
bool eof;
|
315 |
if (!m_cache->rewind(eof)) {
|
328 |
if (!m_cache->rewind(eof)) {
|
|
|
329 |
// rewind can return eof if the cache is empty
|
316 |
if (!eof)
|
330 |
if (!eof)
|
317 |
return false;
|
331 |
return false;
|
318 |
}
|
332 |
}
|
|
|
333 |
|
|
|
334 |
// The cache is walked in chronological order, but we want to
|
|
|
335 |
// index the newest files first (there can be several versions
|
|
|
336 |
// of a given file in the cache). Have to reverse the
|
|
|
337 |
// list. This would be a problem with a big cache, because the
|
|
|
338 |
// udis can be big (e.g. 150 chars), and would be more
|
|
|
339 |
// efficiently performed by the cache, which could use the
|
|
|
340 |
// smaller offsets.
|
|
|
341 |
//
|
|
|
342 |
// Another approach would be to just walk chronologically and
|
|
|
343 |
// reindex all versions: would waste processing but save
|
|
|
344 |
// memory
|
319 |
vector<string> alludis;
|
345 |
vector<string> alludis;
|
320 |
alludis.reserve(20000);
|
346 |
alludis.reserve(20000);
|
321 |
while (m_cache->next(eof)) {
|
347 |
while (m_cache->next(eof)) {
|
322 |
string dict;
|
348 |
string dict;
|
323 |
m_cache->getcurrentdict(dict);
|
349 |
m_cache->getcurrentdict(dict);
|
|
... |
|
... |
338 |
}
|
364 |
}
|
339 |
}
|
365 |
}
|
340 |
}
|
366 |
}
|
341 |
}
|
367 |
}
|
342 |
|
368 |
|
|
|
369 |
// Finally index the queue
|
343 |
FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
|
370 |
FsTreeWalker walker(FsTreeWalker::FtwNoRecurse);
|
344 |
walker.addSkippedName(".*");
|
371 |
walker.addSkippedName(".*");
|
345 |
FsTreeWalker::Status status =walker.walk(m_queuedir, *this);
|
372 |
FsTreeWalker::Status status =walker.walk(m_queuedir, *this);
|
346 |
LOGDEB(("BeagleQueueIndexer::processqueue: done: status %d\n", status));
|
373 |
LOGDEB(("BeagleQueueIndexer::processqueue: done: status %d\n", status));
|
347 |
return true;
|
374 |
return true;
|
348 |
}
|
375 |
}
|
349 |
|
376 |
|
|
|
377 |
// Index a list of files (sent by the real time monitor)
|
350 |
bool BeagleQueueIndexer::indexFiles(list<string>& files)
|
378 |
bool BeagleQueueIndexer::indexFiles(list<string>& files)
|
351 |
{
|
379 |
{
|
352 |
LOGDEB(("BeagleQueueIndexer::indexFiles\n"));
|
380 |
LOGDEB(("BeagleQueueIndexer::indexFiles\n"));
|
353 |
|
381 |
|
354 |
if (!m_db) {
|
382 |
if (!m_db) {
|
|
... |
|
... |
487 |
doc.meta[Rcl::Doc::keybcknd] = "BGL";
|
515 |
doc.meta[Rcl::Doc::keybcknd] = "BGL";
|
488 |
if (!m_db->addOrUpdate(udi, "", doc))
|
516 |
if (!m_db->addOrUpdate(udi, "", doc))
|
489 |
return FsTreeWalker::FtwError;
|
517 |
return FsTreeWalker::FtwError;
|
490 |
}
|
518 |
}
|
491 |
|
519 |
|
492 |
|
|
|
493 |
// Copy to cache
|
520 |
// Copy to cache
|
494 |
{
|
521 |
{
|
495 |
// doc fields not in meta, needing saving to the cache
|
522 |
// doc fields not in meta, needing saving to the cache
|
496 |
dotfile.m_fields.set("fmtime", dotdoc.fmtime, "");
|
523 |
dotfile.m_fields.set("fmtime", dotdoc.fmtime, "");
|
497 |
dotfile.m_fields.set("fbytes", dotdoc.fbytes, "");
|
524 |
dotfile.m_fields.set("fbytes", dotdoc.fbytes, "");
|