|
a/src/index/indexer.cpp |
|
b/src/index/indexer.cpp |
|
... |
|
... |
23 |
|
23 |
|
24 |
#include <stdio.h>
|
24 |
#include <stdio.h>
|
25 |
#include <sys/stat.h>
|
25 |
#include <sys/stat.h>
|
26 |
#include <unistd.h>
|
26 |
#include <unistd.h>
|
27 |
#include <errno.h>
|
27 |
#include <errno.h>
|
28 |
#include <cstring>
|
|
|
29 |
#include <fnmatch.h>
|
|
|
30 |
|
28 |
|
31 |
#include <iostream>
|
|
|
32 |
#include <list>
|
|
|
33 |
#include <map>
|
|
|
34 |
#include <algorithm>
|
|
|
35 |
|
|
|
36 |
#include "pathut.h"
|
|
|
37 |
#include "conftree.h"
|
|
|
38 |
#include "rclconfig.h"
|
29 |
#include "debuglog.h"
|
39 |
#include "fstreewalk.h"
|
|
|
40 |
#include "rcldb.h"
|
|
|
41 |
#include "readfile.h"
|
|
|
42 |
#include "indexer.h"
|
30 |
#include "indexer.h"
|
43 |
#include "csguess.h"
|
|
|
44 |
#include "transcode.h"
|
|
|
45 |
#include "debuglog.h"
|
|
|
46 |
#include "internfile.h"
|
|
|
47 |
#include "smallut.h"
|
|
|
48 |
#include "wipedir.h"
|
|
|
49 |
#include "fileudi.h"
|
|
|
50 |
|
|
|
51 |
#ifdef RCL_USE_ASPELL
|
|
|
52 |
#include "rclaspell.h"
|
|
|
53 |
#endif
|
|
|
54 |
|
|
|
55 |
// When using extended attributes, we have to use the ctime.
|
|
|
56 |
// This is quite an expensive price to pay...
|
|
|
57 |
#ifdef RCL_USE_XATTR
|
|
|
58 |
#define RCL_STTIME st_ctime
|
|
|
59 |
#else
|
|
|
60 |
#define RCL_STTIME st_mtime
|
|
|
61 |
#endif // RCL_USE_XATTR
|
|
|
62 |
|
|
|
63 |
#ifndef NO_NAMESPACES
|
|
|
64 |
using namespace std;
|
|
|
65 |
#endif /* NO_NAMESPACES */
|
|
|
66 |
|
|
|
67 |
// deleteZ(X): delete the object pointed to by X, then reset the pointer to 0.
// The body is wrapped in do { } while (0) so that "deleteZ(p);" expands to
// exactly one statement and stays valid inside an unbraced if/else. The
// argument is parenthesized so that any pointer lvalue expression works.
#ifndef deleteZ
#define deleteZ(X) do {delete (X); (X) = 0;} while (0)
#endif
|
|
|
70 |
|
|
|
71 |
/**
 * Destructor: clean up the temporary directory, if one was set, then close
 * the database.
 */
DbIndexer::~DbIndexer()
{
    const bool haveTmpDir = !m_tmpdir.empty();
    if (haveTmpDir) {
        // wipedir() is called first, presumably to empty the directory so
        // that rmdir() can succeed; only the rmdir() failure is reported.
        wipedir(m_tmpdir);
        const int rmstatus = rmdir(m_tmpdir.c_str());
        if (rmstatus < 0) {
            LOGERR(("DbIndexer::~DbIndexer: cannot clear temp dir %s\n",
                    m_tmpdir.c_str()));
        }
    }

    // The close status is not checked here: nobody could use it.
    m_db.close();
}
|
|
|
82 |
|
|
|
83 |
/**
 * Return the list of stemmer names, as reported by
 * Rcl::Db::getStemmerNames().
 */
list<string> DbIndexer::getStemmerNames()
{
    list<string> names = Rcl::Db::getStemmerNames();
    return names;
}
|
|
|
87 |
|
|
|
88 |
// Index each directory in the topdirs for a given db
|
|
|
89 |
bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
|
|
|
90 |
{
|
|
|
91 |
if (!init(resetbefore))
|
|
|
92 |
return false;
|
|
|
93 |
|
|
|
94 |
if (m_updater) {
|
|
|
95 |
m_updater->status.reset();
|
|
|
96 |
m_updater->status.dbtotdocs = m_db.docCnt();
|
|
|
97 |
}
|
|
|
98 |
|
|
|
99 |
m_walker.setSkippedPaths(m_config->getSkippedPaths());
|
|
|
100 |
|
|
|
101 |
for (list<string>::const_iterator it = topdirs->begin();
|
|
|
102 |
it != topdirs->end(); it++) {
|
|
|
103 |
LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(),
|
|
|
104 |
getDbDir().c_str()));
|
|
|
105 |
|
|
|
106 |
// Set the current directory in config so that subsequent
|
|
|
107 |
// getConfParams() will get local values
|
|
|
108 |
m_config->setKeyDir(*it);
|
|
|
109 |
|
|
|
110 |
// Adjust the "follow symlinks" option
|
|
|
111 |
bool follow;
|
|
|
112 |
if (m_config->getConfParam("followLinks", &follow) && follow) {
|
|
|
113 |
m_walker.setOpts(FsTreeWalker::FtwFollow);
|
|
|
114 |
} else {
|
|
|
115 |
m_walker.setOpts(FsTreeWalker::FtwOptNone);
|
|
|
116 |
}
|
|
|
117 |
|
|
|
118 |
int abslen;
|
|
|
119 |
if (m_config->getConfParam("idxabsmlen", &abslen))
|
|
|
120 |
m_db.setAbstractParams(abslen, -1, -1);
|
|
|
121 |
|
|
|
122 |
// Set up skipped patterns for this subtree. This probably should be
|
|
|
123 |
// done in the directory change code in processone() instead.
|
|
|
124 |
m_walker.setSkippedNames(m_config->getSkippedNames());
|
|
|
125 |
|
|
|
126 |
// Walk the directory tree
|
|
|
127 |
if (m_walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
|
|
|
128 |
LOGERR(("DbIndexer::index: error while indexing %s: %s\n",
|
|
|
129 |
it->c_str(), m_walker.getReason().c_str()));
|
|
|
130 |
return false;
|
|
|
131 |
}
|
|
|
132 |
}
|
|
|
133 |
if (m_updater) {
|
|
|
134 |
m_updater->status.fn.erase();
|
|
|
135 |
m_updater->status.phase = DbIxStatus::DBIXS_PURGE;
|
|
|
136 |
m_updater->update();
|
|
|
137 |
}
|
|
|
138 |
|
|
|
139 |
// Get rid of all database entries that don't exist in the
|
|
|
140 |
// filesystem anymore.
|
|
|
141 |
m_db.purge();
|
|
|
142 |
|
|
|
143 |
createStemmingDatabases();
|
|
|
144 |
createAspellDict();
|
|
|
145 |
|
|
|
146 |
if (m_updater) {
|
|
|
147 |
m_updater->status.phase = DbIxStatus::DBIXS_CLOSING;
|
|
|
148 |
m_updater->status.fn.erase();
|
|
|
149 |
m_updater->update();
|
|
|
150 |
}
|
|
|
151 |
// The close would be done in our destructor, but we want status here
|
|
|
152 |
if (!m_db.close()) {
|
|
|
153 |
LOGERR(("DbIndexer::index: error closing database in %s\n",
|
|
|
154 |
getDbDir().c_str()));
|
|
|
155 |
return false;
|
|
|
156 |
}
|
|
|
157 |
string missing;
|
|
|
158 |
FileInterner::getMissingDescription(missing);
|
|
|
159 |
if (!missing.empty()) {
|
|
|
160 |
LOGINFO(("DbIndexer::index missing helper program(s):\n%s\n",
|
|
|
161 |
missing.c_str()));
|
|
|
162 |
}
|
|
|
163 |
m_config->storeMissingHelperDesc(missing);
|
|
|
164 |
return true;
|
|
|
165 |
}
|
|
|
166 |
|
|
|
167 |
// Create stemming databases. We also remove those which are not
|
|
|
168 |
// configured.
|
|
|
169 |
bool DbIndexer::createStemmingDatabases()
|
|
|
170 |
{
|
|
|
171 |
string slangs;
|
|
|
172 |
if (m_config->getConfParam("indexstemminglanguages", slangs)) {
|
|
|
173 |
list<string> langs;
|
|
|
174 |
stringToStrings(slangs, langs);
|
|
|
175 |
|
|
|
176 |
// Get the list of existing stem dbs from the database (some may have
|
|
|
177 |
// been manually created, we just keep those from the config
|
|
|
178 |
list<string> dblangs = m_db.getStemLangs();
|
|
|
179 |
list<string>::const_iterator it;
|
|
|
180 |
for (it = dblangs.begin(); it != dblangs.end(); it++) {
|
|
|
181 |
if (find(langs.begin(), langs.end(), *it) == langs.end())
|
|
|
182 |
m_db.deleteStemDb(*it);
|
|
|
183 |
}
|
|
|
184 |
for (it = langs.begin(); it != langs.end(); it++) {
|
|
|
185 |
if (m_updater) {
|
|
|
186 |
m_updater->status.phase = DbIxStatus::DBIXS_STEMDB;
|
|
|
187 |
m_updater->status.fn = *it;
|
|
|
188 |
m_updater->update();
|
|
|
189 |
}
|
|
|
190 |
m_db.createStemDb(*it);
|
|
|
191 |
}
|
|
|
192 |
}
|
|
|
193 |
return true;
|
|
|
194 |
}
|
|
|
195 |
|
|
|
196 |
bool DbIndexer::init(bool resetbefore, bool rdonly)
|
|
|
197 |
{
|
|
|
198 |
if (!rdonly && (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0)) {
|
|
|
199 |
string reason;
|
|
|
200 |
if (!maketmpdir(m_tmpdir, reason)) {
|
|
|
201 |
LOGERR(("DbIndexer: cannot create temporary directory: %s\n",
|
|
|
202 |
reason.c_str()));
|
|
|
203 |
return false;
|
|
|
204 |
}
|
|
|
205 |
}
|
|
|
206 |
Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
|
|
|
207 |
resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
|
|
|
208 |
if (!m_db.open(mode)) {
|
|
|
209 |
LOGERR(("DbIndexer: error opening database %s\n", getDbDir().c_str()));
|
|
|
210 |
return false;
|
|
|
211 |
}
|
|
|
212 |
|
|
|
213 |
return true;
|
|
|
214 |
}
|
|
|
215 |
|
|
|
216 |
bool DbIndexer::createStemDb(const string &lang)
|
|
|
217 |
{
|
|
|
218 |
if (!init(false, true))
|
|
|
219 |
return false;
|
|
|
220 |
return m_db.createStemDb(lang);
|
|
|
221 |
}
|
|
|
222 |
|
|
|
223 |
// The language for the aspell dictionary is handled internally by the aspell
|
|
|
224 |
// module, either from a configuration variable or the NLS environment.
|
|
|
225 |
bool DbIndexer::createAspellDict()
|
|
|
226 |
{
|
|
|
227 |
LOGDEB2(("DbIndexer::createAspellDict()\n"));
|
|
|
228 |
#ifdef RCL_USE_ASPELL
|
|
|
229 |
// For the benefit of the real-time indexer, we only initialize
|
|
|
230 |
// noaspell from the configuration once. It can then be set to
|
|
|
231 |
// true if dictionary generation fails, which avoids retrying
|
|
|
232 |
// it forever.
|
|
|
233 |
static int noaspell = -12345;
|
|
|
234 |
if (noaspell == -12345) {
|
|
|
235 |
noaspell = false;
|
|
|
236 |
m_config->getConfParam("noaspell", &noaspell);
|
|
|
237 |
}
|
|
|
238 |
if (noaspell)
|
|
|
239 |
return true;
|
|
|
240 |
|
|
|
241 |
if (!init(false, true))
|
|
|
242 |
return false;
|
|
|
243 |
Aspell aspell(m_config);
|
|
|
244 |
string reason;
|
|
|
245 |
if (!aspell.init(reason)) {
|
|
|
246 |
LOGERR(("DbIndexer::createAspellDict: aspell init failed: %s\n",
|
|
|
247 |
reason.c_str()));
|
|
|
248 |
noaspell = true;
|
|
|
249 |
return false;
|
|
|
250 |
}
|
|
|
251 |
LOGDEB(("DbIndexer::createAspellDict: creating dictionary\n"));
|
|
|
252 |
if (!aspell.buildDict(m_db, reason)) {
|
|
|
253 |
LOGERR(("DbIndexer::createAspellDict: aspell buildDict failed: %s\n",
|
|
|
254 |
reason.c_str()));
|
|
|
255 |
noaspell = true;
|
|
|
256 |
return false;
|
|
|
257 |
}
|
|
|
258 |
#endif
|
|
|
259 |
return true;
|
|
|
260 |
}
|
|
|
261 |
|
|
|
262 |
/**
|
|
|
263 |
* Index individual files, out of a full tree run. No database purging
|
|
|
264 |
*/
|
|
|
265 |
bool DbIndexer::indexFiles(const list<string> &filenames)
|
|
|
266 |
{
|
|
|
267 |
bool called_init = false;
|
|
|
268 |
|
|
|
269 |
list<string>::const_iterator it;
|
|
|
270 |
for (it = filenames.begin(); it != filenames.end(); it++) {
|
|
|
271 |
string dir = path_getfather(*it);
|
|
|
272 |
m_config->setKeyDir(dir);
|
|
|
273 |
int abslen;
|
|
|
274 |
if (m_config->getConfParam("idxabsmlen", &abslen))
|
|
|
275 |
m_db.setAbstractParams(abslen, -1, -1);
|
|
|
276 |
struct stat stb;
|
|
|
277 |
if (lstat(it->c_str(), &stb) != 0) {
|
|
|
278 |
LOGERR(("DbIndexer::indexFiles: lstat(%s): %s", it->c_str(),
|
|
|
279 |
strerror(errno)));
|
|
|
280 |
continue;
|
|
|
281 |
}
|
|
|
282 |
|
|
|
283 |
// If we get to indexing directory names one day, will need to test
|
|
|
284 |
// against dbdir here to avoid modification loops (with rclmon).
|
|
|
285 |
if (!S_ISREG(stb.st_mode)) {
|
|
|
286 |
LOGDEB2(("DbIndexer::indexFiles: %s: not a regular file\n",
|
|
|
287 |
it->c_str()));
|
|
|
288 |
continue;
|
|
|
289 |
}
|
|
|
290 |
|
|
|
291 |
static string lstdir;
|
|
|
292 |
static list<string> skpl;
|
|
|
293 |
if (lstdir.compare(dir)) {
|
|
|
294 |
LOGDEB(("Recomputing list of skipped names\n"));
|
|
|
295 |
skpl = m_config->getSkippedNames();
|
|
|
296 |
lstdir = dir;
|
|
|
297 |
}
|
|
|
298 |
if (!skpl.empty()) {
|
|
|
299 |
list<string>::const_iterator skit;
|
|
|
300 |
string fn = path_getsimple(*it);
|
|
|
301 |
for (skit = skpl.begin(); skit != skpl.end(); skit++) {
|
|
|
302 |
if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
|
|
|
303 |
LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str()));
|
|
|
304 |
goto skipped;
|
|
|
305 |
}
|
|
|
306 |
}
|
|
|
307 |
}
|
|
|
308 |
// Defer opening db until really needed.
|
|
|
309 |
if (!called_init) {
|
|
|
310 |
if (!init())
|
|
|
311 |
return false;
|
|
|
312 |
called_init = true;
|
|
|
313 |
}
|
|
|
314 |
if (processone(*it, &stb, FsTreeWalker::FtwRegular) !=
|
|
|
315 |
FsTreeWalker::FtwOk) {
|
|
|
316 |
LOGERR(("DbIndexer::indexFiles: processone failed\n"));
|
|
|
317 |
return false;
|
|
|
318 |
}
|
|
|
319 |
skipped:
|
|
|
320 |
false; // Need a statement here to make compiler happy ??
|
|
|
321 |
}
|
|
|
322 |
|
|
|
323 |
// The close would be done in our destructor, but we want status here
|
|
|
324 |
if (!m_db.close()) {
|
|
|
325 |
LOGERR(("DbIndexer::indexfiles: error closing database in %s\n",
|
|
|
326 |
getDbDir().c_str()));
|
|
|
327 |
return false;
|
|
|
328 |
}
|
|
|
329 |
return true;
|
|
|
330 |
}
|
|
|
331 |
|
|
|
332 |
|
|
|
333 |
/** Purge docs for given files out of the database */
|
|
|
334 |
bool DbIndexer::purgeFiles(const list<string> &filenames)
|
|
|
335 |
{
|
|
|
336 |
if (!init())
|
|
|
337 |
return false;
|
|
|
338 |
|
|
|
339 |
list<string>::const_iterator it;
|
|
|
340 |
for (it = filenames.begin(); it != filenames.end(); it++) {
|
|
|
341 |
string udi;
|
|
|
342 |
make_udi(*it, "", udi);
|
|
|
343 |
if (!m_db.purgeFile(udi)) {
|
|
|
344 |
LOGERR(("DbIndexer::purgeFiles: Database error\n"));
|
|
|
345 |
return false;
|
|
|
346 |
}
|
|
|
347 |
}
|
|
|
348 |
|
|
|
349 |
// The close would be done in our destructor, but we want status here
|
|
|
350 |
if (!m_db.close()) {
|
|
|
351 |
LOGERR(("DbIndexer::purgefiles: error closing database in %s\n",
|
|
|
352 |
getDbDir().c_str()));
|
|
|
353 |
return false;
|
|
|
354 |
}
|
|
|
355 |
return true;
|
|
|
356 |
}
|
|
|
357 |
|
|
|
358 |
// Local fields can be set for fs subtrees in the configuration file
|
|
|
359 |
void DbIndexer::localfieldsfromconf()
|
|
|
360 |
{
|
|
|
361 |
LOGDEB(("DbIndexer::localfieldsfromconf\n"));
|
|
|
362 |
m_localfields.clear();
|
|
|
363 |
string sfields;
|
|
|
364 |
if (!m_config->getConfParam("localfields", sfields))
|
|
|
365 |
return;
|
|
|
366 |
list<string> lfields;
|
|
|
367 |
if (!stringToStrings(sfields, lfields)) {
|
|
|
368 |
LOGERR(("DbIndexer::localfieldsfromconf: bad syntax for [%s]\n",
|
|
|
369 |
sfields.c_str()));
|
|
|
370 |
return;
|
|
|
371 |
}
|
|
|
372 |
for (list<string>::const_iterator it = lfields.begin();
|
|
|
373 |
it != lfields.end(); it++) {
|
|
|
374 |
ConfSimple conf(*it, 1, true);
|
|
|
375 |
list<string> nmlst = conf.getNames("");
|
|
|
376 |
for (list<string>::const_iterator it1 = nmlst.begin();
|
|
|
377 |
it1 != nmlst.end(); it1++) {
|
|
|
378 |
conf.get(*it1, m_localfields[*it1]);
|
|
|
379 |
LOGDEB2(("DbIndexer::localfieldsfromconf: [%s] => [%s]\n",
|
|
|
380 |
(*it1).c_str(), m_localfields[*it1].c_str()));
|
|
|
381 |
}
|
|
|
382 |
}
|
|
|
383 |
}
|
|
|
384 |
|
|
|
385 |
//
|
|
|
386 |
void DbIndexer::setlocalfields(Rcl::Doc& doc)
|
|
|
387 |
{
|
|
|
388 |
for (map<string, string>::const_iterator it = m_localfields.begin();
|
|
|
389 |
it != m_localfields.end(); it++) {
|
|
|
390 |
// Should local fields override those coming from the document
|
|
|
391 |
// ? I think not, but not too sure
|
|
|
392 |
if (doc.meta.find(it->second) == doc.meta.end()) {
|
|
|
393 |
doc.meta[it->first] = it->second;
|
|
|
394 |
}
|
|
|
395 |
}
|
|
|
396 |
}
|
|
|
397 |
|
|
|
398 |
|
|
|
399 |
/// This method gets called for every file and directory found by the
|
|
|
400 |
/// tree walker.
|
|
|
401 |
///
|
|
|
402 |
/// It checks with the db if the file has changed and needs to be
|
|
|
403 |
/// reindexed. If so, it calls internfile() which will identify the
|
|
|
404 |
/// file type and call an appropriate handler to convert the document into
|
|
|
405 |
/// internal format, which we then add to the database.
|
|
|
406 |
///
|
|
|
407 |
/// Accent and majuscule handling are performed by the db module when doing
|
|
|
408 |
/// the actual indexing work. The Rcl::Doc created by internfile()
|
|
|
409 |
/// mostly contains pretty raw utf8 data.
|
|
|
410 |
FsTreeWalker::Status
|
|
|
411 |
DbIndexer::processone(const std::string &fn, const struct stat *stp,
|
|
|
412 |
FsTreeWalker::CbFlag flg)
|
|
|
413 |
{
|
|
|
414 |
if (m_updater && !m_updater->update()) {
|
|
|
415 |
return FsTreeWalker::FtwStop;
|
|
|
416 |
}
|
|
|
417 |
|
|
|
418 |
// If we're changing directories, possibly adjust parameters (set
|
|
|
419 |
// the current directory in configuration object)
|
|
|
420 |
if (flg == FsTreeWalker::FtwDirEnter ||
|
|
|
421 |
flg == FsTreeWalker::FtwDirReturn) {
|
|
|
422 |
m_config->setKeyDir(fn);
|
|
|
423 |
|
|
|
424 |
int abslen;
|
|
|
425 |
if (m_config->getConfParam("idxabsmlen", &abslen))
|
|
|
426 |
m_db.setAbstractParams(abslen, -1, -1);
|
|
|
427 |
|
|
|
428 |
// Adjust local fields from config for this subtree
|
|
|
429 |
if (m_havelocalfields)
|
|
|
430 |
localfieldsfromconf();
|
|
|
431 |
|
|
|
432 |
if (flg == FsTreeWalker::FtwDirReturn)
|
|
|
433 |
return FsTreeWalker::FtwOk;
|
|
|
434 |
}
|
|
|
435 |
|
|
|
436 |
////////////////////
|
|
|
437 |
// Check db up to date ? Doing this before file type
|
|
|
438 |
// identification means that, if usesystemfilecommand is switched
|
|
|
439 |
// from on to off it may happen that some files which are now
|
|
|
440 |
// without mime type will not be purged from the db, resulting
|
|
|
441 |
// in possible 'cannot intern file' messages at query time...
|
|
|
442 |
|
|
|
443 |
// Document signature. This is based on m/ctime and size and used
|
|
|
444 |
// for the uptodate check (the value computed here is checked
|
|
|
445 |
// against the stored one). Changing the computation forces a full
|
|
|
446 |
// reindex of course.
|
|
|
447 |
char cbuf[100];
|
|
|
448 |
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
|
|
|
449 |
string sig = cbuf;
|
|
|
450 |
string udi;
|
|
|
451 |
make_udi(fn, "", udi);
|
|
|
452 |
if (!m_db.needUpdate(udi, sig)) {
|
|
|
453 |
LOGDEB(("processone: up to date: %s\n", fn.c_str()));
|
|
|
454 |
if (m_updater) {
|
|
|
455 |
// Status bar update, abort request etc.
|
|
|
456 |
m_updater->status.fn = fn;
|
|
|
457 |
if (!m_updater->update()) {
|
|
|
458 |
return FsTreeWalker::FtwStop;
|
|
|
459 |
}
|
|
|
460 |
}
|
|
|
461 |
return FsTreeWalker::FtwOk;
|
|
|
462 |
}
|
|
|
463 |
|
|
|
464 |
LOGDEB0(("processone: processing: [%s] %s\n",
|
|
|
465 |
displayableBytes(stp->st_size).c_str(), fn.c_str()));
|
|
|
466 |
|
|
|
467 |
FileInterner interner(fn, stp, m_config, m_tmpdir, FileInterner::FIF_none);
|
|
|
468 |
|
|
|
469 |
// File name transcoded to utf8 for indexation.
|
|
|
470 |
string charset = m_config->getDefCharset(true);
|
|
|
471 |
// If this fails, the file name won't be indexed, no big deal
|
|
|
472 |
// Note that we used to do the full path here, but I ended up believing
|
|
|
473 |
// that it made more sense to use only the file name
|
|
|
474 |
string utf8fn; int ercnt;
|
|
|
475 |
if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) {
|
|
|
476 |
LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n",
|
|
|
477 |
charset.c_str(), path_getsimple(fn).c_str()));
|
|
|
478 |
} else if (ercnt) {
|
|
|
479 |
LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n",
|
|
|
480 |
ercnt, charset.c_str(), path_getsimple(fn).c_str()));
|
|
|
481 |
}
|
|
|
482 |
LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
|
|
|
483 |
path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(),
|
|
|
484 |
"UTF-8"));
|
|
|
485 |
|
|
|
486 |
string parent_udi;
|
|
|
487 |
make_udi(fn, "", parent_udi);
|
|
|
488 |
Rcl::Doc doc;
|
|
|
489 |
const string plus("+");
|
|
|
490 |
char ascdate[20];
|
|
|
491 |
sprintf(ascdate, "%ld", long(stp->st_mtime));
|
|
|
492 |
|
|
|
493 |
FileInterner::Status fis = FileInterner::FIAgain;
|
|
|
494 |
bool hadNullIpath = false;
|
|
|
495 |
while (fis == FileInterner::FIAgain) {
|
|
|
496 |
doc.erase();
|
|
|
497 |
string ipath;
|
|
|
498 |
fis = interner.internfile(doc, ipath);
|
|
|
499 |
|
|
|
500 |
// Index at least the file name even if there was an error.
|
|
|
501 |
// We'll change the signature to ensure that the indexing will
|
|
|
502 |
// be retried every time.
|
|
|
503 |
|
|
|
504 |
|
|
|
505 |
// Internal access path for multi-document files
|
|
|
506 |
if (ipath.empty())
|
|
|
507 |
hadNullIpath = true;
|
|
|
508 |
else
|
|
|
509 |
doc.ipath = ipath;
|
|
|
510 |
|
|
|
511 |
// Set file name, mod time and url if not done by filter
|
|
|
512 |
if (doc.fmtime.empty())
|
|
|
513 |
doc.fmtime = ascdate;
|
|
|
514 |
if (doc.url.empty())
|
|
|
515 |
doc.url = string("file://") + fn;
|
|
|
516 |
if (doc.utf8fn.empty())
|
|
|
517 |
doc.utf8fn = utf8fn;
|
|
|
518 |
|
|
|
519 |
char cbuf[100];
|
|
|
520 |
sprintf(cbuf, "%ld", (long)stp->st_size);
|
|
|
521 |
doc.fbytes = cbuf;
|
|
|
522 |
// Document signature for up to date checks: concatenate
|
|
|
523 |
// m/ctime and size. Looking for changes only, no need to
|
|
|
524 |
// parseback so no need for reversible formatting. Also set,
|
|
|
525 |
// but never used, for subdocs.
|
|
|
526 |
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
|
|
|
527 |
doc.sig = cbuf;
|
|
|
528 |
// If there was an error, ensure indexing will be
|
|
|
529 |
// retried. This is for the once missing, later installed
|
|
|
530 |
// filter case. It can make indexing much slower (if there are
|
|
|
531 |
// myriads of such files, the ext script is executed for them
|
|
|
532 |
// and fails every time)
|
|
|
533 |
if (fis == FileInterner::FIError) {
|
|
|
534 |
doc.sig += plus;
|
|
|
535 |
}
|
|
|
536 |
|
|
|
537 |
// Possibly add fields from local config
|
|
|
538 |
if (m_havelocalfields)
|
|
|
539 |
setlocalfields(doc);
|
|
|
540 |
// Add document to database. If there is an ipath, add it as a children
|
|
|
541 |
// of the file document.
|
|
|
542 |
string udi;
|
|
|
543 |
make_udi(fn, ipath, udi);
|
|
|
544 |
if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc))
|
|
|
545 |
return FsTreeWalker::FtwError;
|
|
|
546 |
|
|
|
547 |
// Tell what we are doing and check for interrupt request
|
|
|
548 |
if (m_updater) {
|
|
|
549 |
++(m_updater->status.docsdone);
|
|
|
550 |
m_updater->status.fn = fn;
|
|
|
551 |
if (!ipath.empty())
|
|
|
552 |
m_updater->status.fn += "|" + ipath;
|
|
|
553 |
if (!m_updater->update()) {
|
|
|
554 |
return FsTreeWalker::FtwStop;
|
|
|
555 |
}
|
|
|
556 |
}
|
|
|
557 |
}
|
|
|
558 |
|
|
|
559 |
// If we had no instance with a null ipath, we create an empty
|
|
|
560 |
// document to stand for the file itself, to be used mainly for up
|
|
|
561 |
// to date checks. Typically this happens for an mbox file.
|
|
|
562 |
if (hadNullIpath == false) {
|
|
|
563 |
LOGDEB1(("Creating empty doc for file\n"));
|
|
|
564 |
Rcl::Doc fileDoc;
|
|
|
565 |
fileDoc.fmtime = ascdate;
|
|
|
566 |
fileDoc.utf8fn = utf8fn;
|
|
|
567 |
fileDoc.mimetype = interner.getMimetype();
|
|
|
568 |
fileDoc.url = string("file://") + fn;
|
|
|
569 |
|
|
|
570 |
char cbuf[100];
|
|
|
571 |
sprintf(cbuf, "%ld", (long)stp->st_size);
|
|
|
572 |
fileDoc.fbytes = cbuf;
|
|
|
573 |
// Document signature for up to date checks.
|
|
|
574 |
sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
|
|
|
575 |
fileDoc.sig = cbuf;
|
|
|
576 |
if (!m_db.addOrUpdate(parent_udi, "", fileDoc))
|
|
|
577 |
return FsTreeWalker::FtwError;
|
|
|
578 |
}
|
|
|
579 |
|
|
|
580 |
return FsTreeWalker::FtwOk;
|
|
|
581 |
}
|
|
|
582 |
|
|
|
583 |
////////////////////////////////////////////////////////////////////////////
|
|
|
584 |
// ConfIndexer methods: ConfIndexer is the top-level object, which could
|
|
|
585 |
// in theory index multiple directories to multiple databases. In practice we
|
|
|
586 |
// have a single database per configuration.
|
|
|
587 |
|
31 |
|
588 |
ConfIndexer::~ConfIndexer()
|
32 |
ConfIndexer::~ConfIndexer()
|
589 |
{
|
33 |
{
|
590 |
deleteZ(m_dbindexer);
|
34 |
deleteZ(m_fsindexer);
|
591 |
}
|
35 |
}
|
592 |
|
36 |
|
593 |
bool ConfIndexer::index(bool resetbefore)
|
37 |
bool ConfIndexer::index(bool resetbefore)
|
594 |
{
|
38 |
{
|
595 |
list<string> tdl = m_config->getTopdirs();
|
39 |
list<string> tdl = m_config->getTopdirs();
|
|
... |
|
... |
632 |
m_config->setKeyDir("");
|
76 |
m_config->setKeyDir("");
|
633 |
|
77 |
|
634 |
// The dbmap now has dbdir as key and directory lists as values.
|
78 |
// The dbmap now has dbdir as key and directory lists as values.
|
635 |
// Index each directory group in turn
|
79 |
// Index each directory group in turn
|
636 |
for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {
|
80 |
for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {
|
637 |
m_dbindexer = new DbIndexer(m_config, m_updater);
|
81 |
m_fsindexer = new FsIndexer(m_config, m_updater);
|
638 |
if (!m_dbindexer->indexDb(resetbefore, &dbit->second)) {
|
82 |
if (!m_fsindexer->indexTrees(resetbefore, &dbit->second)) {
|
639 |
deleteZ(m_dbindexer);
|
83 |
deleteZ(m_fsindexer);
|
640 |
m_reason = "Failed indexing in " + dbit->first;
|
84 |
m_reason = "Failed indexing in " + dbit->first;
|
641 |
return false;
|
85 |
return false;
|
642 |
}
|
86 |
}
|
643 |
deleteZ(m_dbindexer);
|
87 |
deleteZ(m_fsindexer);
|
644 |
}
|
88 |
}
|
645 |
return true;
|
89 |
return true;
|
646 |
}
|
90 |
}
|