Switch to unified view

a/src/index/indexer.cpp b/src/index/indexer.cpp
...
...
23
23
24
#include <stdio.h>
24
#include <stdio.h>
25
#include <sys/stat.h>
25
#include <sys/stat.h>
26
#include <unistd.h>
26
#include <unistd.h>
27
#include <errno.h>
27
#include <errno.h>
28
#include <cstring>
29
#include <fnmatch.h>
30
28
31
#include <iostream>
32
#include <list>
33
#include <map>
34
#include <algorithm>
35
36
#include "pathut.h"
37
#include "conftree.h"
38
#include "rclconfig.h"
29
#include "debuglog.h"
39
#include "fstreewalk.h"
40
#include "rcldb.h"
41
#include "readfile.h"
42
#include "indexer.h"
30
#include "indexer.h"
43
#include "csguess.h"
44
#include "transcode.h"
45
#include "debuglog.h"
46
#include "internfile.h"
47
#include "smallut.h"
48
#include "wipedir.h"
49
#include "fileudi.h"
50
51
#ifdef RCL_USE_ASPELL
52
#include "rclaspell.h"
53
#endif
54
55
// When using extended attributes, we have to use the ctime. 
56
// This is quite an expensive price to pay...
57
#ifdef RCL_USE_XATTR
58
#define RCL_STTIME st_ctime
59
#else
60
#define RCL_STTIME st_mtime
61
#endif // RCL_USE_XATTR
62
63
#ifndef NO_NAMESPACES
64
using namespace std;
65
#endif /* NO_NAMESPACES */
66
67
#ifndef deleteZ
68
#define deleteZ(X) {delete X;X = 0;}
69
#endif
70
71
// Destructor: get rid of the temporary directory if one was set up, then
// close the database.
DbIndexer::~DbIndexer()
{
    // Maybe clean up temporary directory
    if (!m_tmpdir.empty()) {
        // rmdir() only works on an empty directory, so wipedir() is called
        // first (presumably to remove the contents).
        wipedir(m_tmpdir);
        const int rmstatus = rmdir(m_tmpdir.c_str());
        if (rmstatus < 0) {
            LOGERR(("DbIndexer::~DbIndexer: cannot clear temp dir %s\n",
                    m_tmpdir.c_str()));
        }
    }
    m_db.close();
}
82
83
// Return the stemmer names as reported by Rcl::Db::getStemmerNames().
list<string> DbIndexer::getStemmerNames()
{
    list<string> names = Rcl::Db::getStemmerNames();
    return names;
}
87
88
// Index each directory in the topdirs for a given db
89
bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
90
{
91
    if (!init(resetbefore))
92
  return false;
93
94
    if (m_updater) {
95
  m_updater->status.reset();
96
  m_updater->status.dbtotdocs = m_db.docCnt();
97
    }
98
99
    m_walker.setSkippedPaths(m_config->getSkippedPaths());
100
101
    for (list<string>::const_iterator it = topdirs->begin();
102
   it != topdirs->end(); it++) {
103
  LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(), 
104
      getDbDir().c_str()));
105
106
  // Set the current directory in config so that subsequent
107
  // getConfParams() will get local values
108
  m_config->setKeyDir(*it);
109
110
  // Adjust the "follow symlinks" option
111
  bool follow;
112
  if (m_config->getConfParam("followLinks", &follow) && follow) {
113
      m_walker.setOpts(FsTreeWalker::FtwFollow);
114
  } else {
115
      m_walker.setOpts(FsTreeWalker::FtwOptNone);
116
  }       
117
118
  int abslen;
119
  if (m_config->getConfParam("idxabsmlen", &abslen))
120
      m_db.setAbstractParams(abslen, -1, -1);
121
122
  // Set up skipped patterns for this subtree. This probably should be
123
  // done in the directory change code in processone() instead.
124
  m_walker.setSkippedNames(m_config->getSkippedNames());
125
126
  // Walk the directory tree
127
  if (m_walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
128
      LOGERR(("DbIndexer::index: error while indexing %s: %s\n", 
129
          it->c_str(), m_walker.getReason().c_str()));
130
      return false;
131
  }
132
    }
133
    if (m_updater) {
134
  m_updater->status.fn.erase();
135
  m_updater->status.phase = DbIxStatus::DBIXS_PURGE;
136
  m_updater->update();
137
    }
138
139
    // Get rid of all database entries that don't exist in the
140
    // filesystem anymore.
141
    m_db.purge();
142
143
    createStemmingDatabases();
144
    createAspellDict();
145
146
    if (m_updater) {
147
  m_updater->status.phase = DbIxStatus::DBIXS_CLOSING;
148
  m_updater->status.fn.erase();
149
  m_updater->update();
150
    }
151
    // The close would be done in our destructor, but we want status here
152
    if (!m_db.close()) {
153
  LOGERR(("DbIndexer::index: error closing database in %s\n", 
154
      getDbDir().c_str()));
155
  return false;
156
    }
157
    string missing;
158
    FileInterner::getMissingDescription(missing);
159
    if (!missing.empty()) {
160
  LOGINFO(("DbIndexer::index missing helper program(s):\n%s\n", 
161
       missing.c_str()));
162
    }
163
    m_config->storeMissingHelperDesc(missing);
164
    return true;
165
}
166
167
// Create stemming databases. We also remove those which are not
168
// configured. 
169
bool DbIndexer::createStemmingDatabases()
170
{
171
    string slangs;
172
    if (m_config->getConfParam("indexstemminglanguages", slangs)) {
173
  list<string> langs;
174
  stringToStrings(slangs, langs);
175
176
  // Get the list of existing stem dbs from the database (some may have 
177
  // been manually created, we just keep those from the config
178
  list<string> dblangs = m_db.getStemLangs();
179
  list<string>::const_iterator it;
180
  for (it = dblangs.begin(); it != dblangs.end(); it++) {
181
      if (find(langs.begin(), langs.end(), *it) == langs.end())
182
      m_db.deleteStemDb(*it);
183
  }
184
  for (it = langs.begin(); it != langs.end(); it++) {
185
      if (m_updater) {
186
      m_updater->status.phase = DbIxStatus::DBIXS_STEMDB;
187
      m_updater->status.fn = *it;
188
      m_updater->update();
189
      }
190
      m_db.createStemDb(*it);
191
  }
192
    }
193
    return true;
194
}
195
196
bool DbIndexer::init(bool resetbefore, bool rdonly)
197
{
198
    if (!rdonly && (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0)) {
199
  string reason;
200
  if (!maketmpdir(m_tmpdir, reason)) {
201
      LOGERR(("DbIndexer: cannot create temporary directory: %s\n",
202
          reason.c_str()));
203
      return false;
204
  }
205
    }
206
    Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
207
  resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
208
    if (!m_db.open(mode)) {
209
  LOGERR(("DbIndexer: error opening database %s\n", getDbDir().c_str()));
210
  return false;
211
    }
212
213
    return true;
214
}
215
216
bool DbIndexer::createStemDb(const string &lang)
217
{
218
    if (!init(false, true))
219
  return false;
220
    return m_db.createStemDb(lang);
221
}
222
223
// The language for the aspell dictionary is handled internally by the aspell
224
// module, either from a configuration variable or the NLS environment.
225
bool DbIndexer::createAspellDict()
226
{
227
    LOGDEB2(("DbIndexer::createAspellDict()\n"));
228
#ifdef RCL_USE_ASPELL
229
    // For the benefit of the real-time indexer, we only initialize
230
    // noaspell from the configuration once. It can then be set to
231
    // true if dictionary generation fails, which avoids retrying
232
    // it forever.
233
    static int noaspell = -12345;
234
    if (noaspell == -12345) {
235
  noaspell = false;
236
  m_config->getConfParam("noaspell", &noaspell);
237
    }
238
    if (noaspell)
239
  return true;
240
241
    if (!init(false, true))
242
  return false;
243
    Aspell aspell(m_config);
244
    string reason;
245
    if (!aspell.init(reason)) {
246
  LOGERR(("DbIndexer::createAspellDict: aspell init failed: %s\n", 
247
      reason.c_str()));
248
  noaspell = true;
249
  return false;
250
    }
251
    LOGDEB(("DbIndexer::createAspellDict: creating dictionary\n"));
252
    if (!aspell.buildDict(m_db, reason)) {
253
  LOGERR(("DbIndexer::createAspellDict: aspell buildDict failed: %s\n", 
254
      reason.c_str()));
255
  noaspell = true;
256
  return false;
257
    }
258
#endif
259
    return true;
260
}
261
262
/** 
263
 * Index individual files, out of a full tree run. No database purging
264
 */
265
bool DbIndexer::indexFiles(const list<string> &filenames)
266
{
267
    bool called_init = false;
268
269
    list<string>::const_iterator it;
270
    for (it = filenames.begin(); it != filenames.end(); it++) {
271
  string dir = path_getfather(*it);
272
  m_config->setKeyDir(dir);
273
  int abslen;
274
  if (m_config->getConfParam("idxabsmlen", &abslen))
275
      m_db.setAbstractParams(abslen, -1, -1);
276
  struct stat stb;
277
  if (lstat(it->c_str(), &stb) != 0) {
278
      LOGERR(("DbIndexer::indexFiles: lstat(%s): %s", it->c_str(),
279
          strerror(errno)));
280
      continue;
281
  }
282
283
  // If we get to indexing directory names one day, will need to test 
284
  // against dbdir here to avoid modification loops (with rclmon).
285
  if (!S_ISREG(stb.st_mode)) {
286
      LOGDEB2(("DbIndexer::indexFiles: %s: not a regular file\n", 
287
          it->c_str()));
288
      continue;
289
  }
290
291
  static string lstdir;
292
  static list<string> skpl;
293
  if (lstdir.compare(dir)) {
294
      LOGDEB(("Recomputing list of skipped names\n"));
295
      skpl = m_config->getSkippedNames();
296
      lstdir = dir;
297
  }
298
  if (!skpl.empty()) {
299
      list<string>::const_iterator skit;
300
      string fn = path_getsimple(*it);
301
      for (skit = skpl.begin(); skit != skpl.end(); skit++) {
302
      if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
303
          LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str()));
304
          goto skipped;
305
      }
306
      }
307
  }
308
  // Defer opening db until really needed.
309
  if (!called_init) {
310
      if (!init())
311
      return false;
312
      called_init = true;
313
  }
314
  if (processone(*it, &stb, FsTreeWalker::FtwRegular) != 
315
      FsTreeWalker::FtwOk) {
316
      LOGERR(("DbIndexer::indexFiles: processone failed\n"));
317
      return false;
318
  }
319
    skipped: 
320
  false; // Need a statement here to make compiler happy ??
321
    }
322
323
    // The close would be done in our destructor, but we want status here
324
    if (!m_db.close()) {
325
  LOGERR(("DbIndexer::indexfiles: error closing database in %s\n", 
326
      getDbDir().c_str()));
327
  return false;
328
    }
329
    return true;
330
}
331
332
333
/** Purge docs for given files out of the database */
334
bool DbIndexer::purgeFiles(const list<string> &filenames)
335
{
336
    if (!init())
337
  return false;
338
339
    list<string>::const_iterator it;
340
    for (it = filenames.begin(); it != filenames.end(); it++) {
341
  string udi;
342
  make_udi(*it, "", udi);
343
  if (!m_db.purgeFile(udi)) {
344
      LOGERR(("DbIndexer::purgeFiles: Database error\n"));
345
      return false;
346
  }
347
    }
348
349
    // The close would be done in our destructor, but we want status here
350
    if (!m_db.close()) {
351
  LOGERR(("DbIndexer::purgefiles: error closing database in %s\n", 
352
      getDbDir().c_str()));
353
  return false;
354
    }
355
    return true;
356
}
357
358
// Local fields can be set for fs subtrees in the configuration file 
359
void DbIndexer::localfieldsfromconf()
360
{
361
    LOGDEB(("DbIndexer::localfieldsfromconf\n"));
362
    m_localfields.clear();
363
    string sfields;
364
    if (!m_config->getConfParam("localfields", sfields))
365
        return;
366
    list<string> lfields;
367
    if (!stringToStrings(sfields, lfields)) {
368
        LOGERR(("DbIndexer::localfieldsfromconf: bad syntax for [%s]\n", 
369
                sfields.c_str()));
370
        return;
371
    }
372
    for (list<string>::const_iterator it = lfields.begin();
373
         it != lfields.end(); it++) {
374
        ConfSimple conf(*it, 1, true);
375
        list<string> nmlst = conf.getNames("");
376
        for (list<string>::const_iterator it1 = nmlst.begin();
377
             it1 != nmlst.end(); it1++) {
378
            conf.get(*it1, m_localfields[*it1]);
379
            LOGDEB2(("DbIndexer::localfieldsfromconf: [%s] => [%s]\n",
380
                    (*it1).c_str(), m_localfields[*it1].c_str()));
381
        }
382
    }
383
}
384
385
// 
386
void DbIndexer::setlocalfields(Rcl::Doc& doc)
387
{
388
    for (map<string, string>::const_iterator it = m_localfields.begin();
389
         it != m_localfields.end(); it++) {
390
        // Should local fields override those coming from the document
391
        // ? I think not, but not too sure
392
        if (doc.meta.find(it->second) == doc.meta.end()) {
393
            doc.meta[it->first] = it->second;
394
        }
395
    }
396
}
397
398
399
/// This method gets called for every file and directory found by the
400
/// tree walker. 
401
///
402
/// It checks with the db if the file has changed and needs to be
403
/// reindexed. If so, it calls internfile() which will identify the
404
/// file type and call an appropriate handler to convert the document into
405
/// internal format, which we then add to the database.
406
///
407
/// Accent and majuscule handling are performed by the db module when doing
408
/// the actual indexing work. The Rcl::Doc created by internfile()
409
/// mostly contains pretty raw utf8 data.
410
FsTreeWalker::Status 
411
DbIndexer::processone(const std::string &fn, const struct stat *stp, 
412
            FsTreeWalker::CbFlag flg)
413
{
414
    if (m_updater && !m_updater->update()) {
415
        return FsTreeWalker::FtwStop;
416
    }
417
418
    // If we're changing directories, possibly adjust parameters (set
419
    // the current directory in configuration object)
420
    if (flg == FsTreeWalker::FtwDirEnter || 
421
  flg == FsTreeWalker::FtwDirReturn) {
422
  m_config->setKeyDir(fn);
423
424
  int abslen;
425
  if (m_config->getConfParam("idxabsmlen", &abslen))
426
      m_db.setAbstractParams(abslen, -1, -1);
427
428
        // Adjust local fields from config for this subtree
429
        if (m_havelocalfields)
430
            localfieldsfromconf();
431
432
  if (flg == FsTreeWalker::FtwDirReturn)
433
      return FsTreeWalker::FtwOk;
434
    }
435
436
    ////////////////////
437
    // Check db up to date ? Doing this before file type
438
    // identification means that, if usesystemfilecommand is switched
439
    // from on to off it may happen that some files which are now
440
    // without mime type will not be purged from the db, resulting
441
    // in possible 'cannot intern file' messages at query time...
442
443
    // Document signature. This is based on m/ctime and size and used
444
    // for the uptodate check (the value computed here is checked
445
    // against the stored one). Changing the computation forces a full
446
    // reindex of course.
447
    char cbuf[100]; 
448
    sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
449
    string sig = cbuf;
450
    string udi;
451
    make_udi(fn, "", udi);
452
    if (!m_db.needUpdate(udi, sig)) {
453
  LOGDEB(("processone: up to date: %s\n", fn.c_str()));
454
  if (m_updater) {
455
      // Status bar update, abort request etc.
456
      m_updater->status.fn = fn;
457
      if (!m_updater->update()) {
458
      return FsTreeWalker::FtwStop;
459
      }
460
  }
461
  return FsTreeWalker::FtwOk;
462
    }
463
464
    LOGDEB0(("processone: processing: [%s] %s\n", 
465
             displayableBytes(stp->st_size).c_str(), fn.c_str()));
466
467
    FileInterner interner(fn, stp, m_config, m_tmpdir, FileInterner::FIF_none);
468
469
    // File name transcoded to utf8 for indexation. 
470
    string charset = m_config->getDefCharset(true);
471
    // If this fails, the file name won't be indexed, no big deal
472
    // Note that we used to do the full path here, but I ended up believing
473
    // that it made more sense to use only the file name
474
    string utf8fn; int ercnt;
475
    if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) {
476
  LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n",
477
      charset.c_str(), path_getsimple(fn).c_str()));
478
    } else if (ercnt) {
479
  LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n",
480
      ercnt, charset.c_str(), path_getsimple(fn).c_str()));
481
    }
482
    LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
483
       path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), 
484
       "UTF-8"));
485
486
    string parent_udi;
487
    make_udi(fn, "", parent_udi);
488
    Rcl::Doc doc;
489
    const string plus("+");
490
    char ascdate[20];
491
    sprintf(ascdate, "%ld", long(stp->st_mtime));
492
493
    FileInterner::Status fis = FileInterner::FIAgain;
494
    bool hadNullIpath = false;
495
    while (fis == FileInterner::FIAgain) {
496
  doc.erase();
497
  string ipath;
498
  fis = interner.internfile(doc, ipath);
499
500
        // Index at least the file name even if there was an error.
501
        // We'll change the signature to ensure that the indexing will
502
        // be retried every time.
503
504
505
  // Internal access path for multi-document files
506
  if (ipath.empty())
507
      hadNullIpath = true;
508
  else
509
      doc.ipath = ipath;
510
511
  // Set file name, mod time and url if not done by filter
512
  if (doc.fmtime.empty())
513
      doc.fmtime = ascdate;
514
        if (doc.url.empty())
515
            doc.url = string("file://") + fn;
516
  if (doc.utf8fn.empty())
517
      doc.utf8fn = utf8fn;
518
519
  char cbuf[100]; 
520
  sprintf(cbuf, "%ld", (long)stp->st_size);
521
  doc.fbytes = cbuf;
522
  // Document signature for up to date checks: concatenate
523
  // m/ctime and size. Looking for changes only, no need to
524
  // parseback so no need for reversible formatting. Also set,
525
  // but never used, for subdocs.
526
  sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
527
  doc.sig = cbuf;
528
  // If there was an error, ensure indexing will be
529
  // retried. This is for the once missing, later installed
530
  // filter case. It can make indexing much slower (if there are
531
  // myriads of such files, the ext script is executed for them
532
  // and fails every time)
533
  if (fis == FileInterner::FIError) {
534
      doc.sig += plus;
535
  }
536
537
        // Possibly add fields from local config
538
        if (m_havelocalfields) 
539
            setlocalfields(doc);
540
  // Add document to database. If there is an ipath, add it as a children
541
  // of the file document.
542
  string udi;
543
  make_udi(fn, ipath, udi);
544
  if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc)) 
545
      return FsTreeWalker::FtwError;
546
547
  // Tell what we are doing and check for interrupt request
548
  if (m_updater) {
549
      ++(m_updater->status.docsdone);
550
            m_updater->status.fn = fn;
551
            if (!ipath.empty())
552
                m_updater->status.fn += "|" + ipath;
553
            if (!m_updater->update()) {
554
                return FsTreeWalker::FtwStop;
555
            }
556
  }
557
    }
558
559
    // If we had no instance with a null ipath, we create an empty
560
    // document to stand for the file itself, to be used mainly for up
561
    // to date checks. Typically this happens for an mbox file.
562
    if (hadNullIpath == false) {
563
  LOGDEB1(("Creating empty doc for file\n"));
564
  Rcl::Doc fileDoc;
565
  fileDoc.fmtime = ascdate;
566
  fileDoc.utf8fn = utf8fn;
567
  fileDoc.mimetype = interner.getMimetype();
568
  fileDoc.url = string("file://") + fn;
569
570
  char cbuf[100]; 
571
  sprintf(cbuf, "%ld", (long)stp->st_size);
572
  fileDoc.fbytes = cbuf;
573
  // Document signature for up to date checks.
574
  sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
575
  fileDoc.sig = cbuf;
576
  if (!m_db.addOrUpdate(parent_udi, "", fileDoc)) 
577
      return FsTreeWalker::FtwError;
578
    }
579
580
    return FsTreeWalker::FtwOk;
581
}
582
583
////////////////////////////////////////////////////////////////////////////
584
// ConfIndexer methods: ConfIndexer is the top-level object, that could
585
// in theory index multiple directories to multiple databases. In practice we
586
// have a single database per configuration.
587
31
588
ConfIndexer::~ConfIndexer()
32
ConfIndexer::~ConfIndexer()
589
{
33
{
590
     deleteZ(m_dbindexer);
34
     deleteZ(m_fsindexer);
591
}
35
}
592
36
593
bool ConfIndexer::index(bool resetbefore)
37
bool ConfIndexer::index(bool resetbefore)
594
{
38
{
595
    list<string> tdl = m_config->getTopdirs();
39
    list<string> tdl = m_config->getTopdirs();
...
...
632
    m_config->setKeyDir("");
76
    m_config->setKeyDir("");
633
77
634
    // The dbmap now has dbdir as key and directory lists as values.
78
    // The dbmap now has dbdir as key and directory lists as values.
635
    // Index each directory group in turn
79
    // Index each directory group in turn
636
    for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {
80
    for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {
637
    m_dbindexer = new DbIndexer(m_config, m_updater);
81
    m_fsindexer = new FsIndexer(m_config, m_updater);
638
    if (!m_dbindexer->indexDb(resetbefore, &dbit->second)) {
82
    if (!m_fsindexer->indexTrees(resetbefore, &dbit->second)) {
639
        deleteZ(m_dbindexer);
83
        deleteZ(m_fsindexer);
640
        m_reason = "Failed indexing in " + dbit->first;
84
        m_reason = "Failed indexing in " + dbit->first;
641
        return false;
85
        return false;
642
    }
86
    }
643
    deleteZ(m_dbindexer);
87
    deleteZ(m_fsindexer);
644
    }
88
    }
645
    return true;
89
    return true;
646
}
90
}