Switch to unified view

a b/src/index/fsindexer.cpp
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: $ (C) 2009 J.F.Dockes";
3
#endif
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
8
 *   (at your option) any later version.
9
 *
10
 *   This program is distributed in the hope that it will be useful,
11
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13
 *   GNU General Public License for more details.
14
 *
15
 *   You should have received a copy of the GNU General Public License
16
 *   along with this program; if not, write to the
17
 *   Free Software Foundation, Inc.,
18
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
19
 */
20
#ifdef HAVE_CONFIG_H
21
#include "autoconfig.h"
22
#endif
23
24
#include <stdio.h>
25
#include <sys/stat.h>
26
#include <unistd.h>
27
#include <errno.h>
28
#include <cstring>
29
#include <fnmatch.h>
30
31
#include <iostream>
32
#include <list>
33
#include <map>
34
#include <algorithm>
35
36
#include "pathut.h"
37
#include "conftree.h"
38
#include "rclconfig.h"
39
#include "fstreewalk.h"
40
#include "rcldb.h"
41
#include "readfile.h"
42
#include "indexer.h"
43
#include "fsindexer.h"
44
#include "csguess.h"
45
#include "transcode.h"
46
#include "debuglog.h"
47
#include "internfile.h"
48
#include "smallut.h"
49
#include "wipedir.h"
50
#include "fileudi.h"
51
52
#ifdef RCL_USE_ASPELL
53
#include "rclaspell.h"
54
#endif
55
56
// When using extended attributes, we have to use the ctime, because
// changing an extended attribute updates the ctime but not the mtime.
// This is quite an expensive price to pay...
58
#ifdef RCL_USE_XATTR
59
#define RCL_STTIME st_ctime
60
#else
61
#define RCL_STTIME st_mtime
62
#endif // RCL_USE_XATTR
63
64
#ifndef NO_NAMESPACES
65
using namespace std;
66
#endif /* NO_NAMESPACES */
67
68
#ifndef deleteZ
69
#define deleteZ(X) {delete X;X = 0;}
70
#endif
71
72
// Destructor: if a temporary directory name is set, clear the directory
// and remove it, then close the database.
FsIndexer::~FsIndexer()
{
    if (!m_tmpdir.empty()) {
        // rmdir() only succeeds on an empty directory, so the contents
        // are cleared first (wipedir() is presumably doing that).
        wipedir(m_tmpdir);
        const int rmstatus = rmdir(m_tmpdir.c_str());
        if (rmstatus < 0) {
            LOGERR(("FsIndexer::~FsIndexer: cannot clear temp dir %s\n",
                    m_tmpdir.c_str()));
        }
    }

    // Return value ignored: nothing useful can be done about it here.
    m_db.close();
}
83
84
// Forward the request to the database class, which owns the stemmer list.
list<string> FsIndexer::getStemmerNames()
{
    list<string> names = Rcl::Db::getStemmerNames();
    return names;
}
88
89
// Index each directory in the topdirs for a given db
90
bool FsIndexer::indexTrees(bool resetbefore, list<string> *topdirs)
91
{
92
    if (!init(resetbefore))
93
  return false;
94
95
    if (m_updater) {
96
  m_updater->status.reset();
97
  m_updater->status.dbtotdocs = m_db.docCnt();
98
    }
99
100
    m_walker.setSkippedPaths(m_config->getSkippedPaths());
101
102
    for (list<string>::const_iterator it = topdirs->begin();
103
   it != topdirs->end(); it++) {
104
  LOGDEB(("FsIndexer::index: Indexing %s into %s\n", it->c_str(), 
105
      getDbDir().c_str()));
106
107
  // Set the current directory in config so that subsequent
108
  // getConfParams() will get local values
109
  m_config->setKeyDir(*it);
110
111
  // Adjust the "follow symlinks" option
112
  bool follow;
113
  if (m_config->getConfParam("followLinks", &follow) && follow) {
114
      m_walker.setOpts(FsTreeWalker::FtwFollow);
115
  } else {
116
      m_walker.setOpts(FsTreeWalker::FtwOptNone);
117
  }       
118
119
  int abslen;
120
  if (m_config->getConfParam("idxabsmlen", &abslen))
121
      m_db.setAbstractParams(abslen, -1, -1);
122
123
  // Set up skipped patterns for this subtree. This probably should be
124
  // done in the directory change code in processone() instead.
125
  m_walker.setSkippedNames(m_config->getSkippedNames());
126
127
  // Walk the directory tree
128
  if (m_walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
129
      LOGERR(("FsIndexer::index: error while indexing %s: %s\n", 
130
          it->c_str(), m_walker.getReason().c_str()));
131
      return false;
132
  }
133
    }
134
    if (m_updater) {
135
  m_updater->status.fn.erase();
136
  m_updater->status.phase = DbIxStatus::DBIXS_PURGE;
137
  m_updater->update();
138
    }
139
140
    // Get rid of all database entries that don't exist in the
141
    // filesystem anymore.
142
    m_db.purge();
143
144
    createStemmingDatabases();
145
    createAspellDict();
146
147
    if (m_updater) {
148
  m_updater->status.phase = DbIxStatus::DBIXS_CLOSING;
149
  m_updater->status.fn.erase();
150
  m_updater->update();
151
    }
152
    // The close would be done in our destructor, but we want status here
153
    if (!m_db.close()) {
154
  LOGERR(("FsIndexer::index: error closing database in %s\n", 
155
      getDbDir().c_str()));
156
  return false;
157
    }
158
    string missing;
159
    FileInterner::getMissingDescription(missing);
160
    if (!missing.empty()) {
161
  LOGINFO(("FsIndexer::index missing helper program(s):\n%s\n", 
162
       missing.c_str()));
163
    }
164
    m_config->storeMissingHelperDesc(missing);
165
    return true;
166
}
167
168
// Create stemming databases. We also remove those which are not
169
// configured. 
170
bool FsIndexer::createStemmingDatabases()
171
{
172
    string slangs;
173
    if (m_config->getConfParam("indexstemminglanguages", slangs)) {
174
  list<string> langs;
175
  stringToStrings(slangs, langs);
176
177
  // Get the list of existing stem dbs from the database (some may have 
178
  // been manually created, we just keep those from the config
179
  list<string> dblangs = m_db.getStemLangs();
180
  list<string>::const_iterator it;
181
  for (it = dblangs.begin(); it != dblangs.end(); it++) {
182
      if (find(langs.begin(), langs.end(), *it) == langs.end())
183
      m_db.deleteStemDb(*it);
184
  }
185
  for (it = langs.begin(); it != langs.end(); it++) {
186
      if (m_updater) {
187
      m_updater->status.phase = DbIxStatus::DBIXS_STEMDB;
188
      m_updater->status.fn = *it;
189
      m_updater->update();
190
      }
191
      m_db.createStemDb(*it);
192
  }
193
    }
194
    return true;
195
}
196
197
bool FsIndexer::init(bool resetbefore, bool rdonly)
198
{
199
    if (!rdonly && (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0)) {
200
  string reason;
201
  if (!maketmpdir(m_tmpdir, reason)) {
202
      LOGERR(("FsIndexer: cannot create temporary directory: %s\n",
203
          reason.c_str()));
204
      return false;
205
  }
206
    }
207
    Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
208
  resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
209
    if (!m_db.open(mode)) {
210
  LOGERR(("FsIndexer: error opening database %s\n", getDbDir().c_str()));
211
  return false;
212
    }
213
214
    return true;
215
}
216
217
bool FsIndexer::createStemDb(const string &lang)
218
{
219
    if (!init(false, true))
220
  return false;
221
    return m_db.createStemDb(lang);
222
}
223
224
// The language for the aspell dictionary is handled internally by the aspell
225
// module, either from a configuration variable or the NLS environment.
226
bool FsIndexer::createAspellDict()
227
{
228
    LOGDEB2(("FsIndexer::createAspellDict()\n"));
229
#ifdef RCL_USE_ASPELL
230
    // For the benefit of the real-time indexer, we only initialize
231
    // noaspell from the configuration once. It can then be set to
232
    // true if dictionary generation fails, which avoids retrying
233
    // it forever.
234
    static int noaspell = -12345;
235
    if (noaspell == -12345) {
236
  noaspell = false;
237
  m_config->getConfParam("noaspell", &noaspell);
238
    }
239
    if (noaspell)
240
  return true;
241
242
    if (!init(false, true))
243
  return false;
244
    Aspell aspell(m_config);
245
    string reason;
246
    if (!aspell.init(reason)) {
247
  LOGERR(("FsIndexer::createAspellDict: aspell init failed: %s\n", 
248
      reason.c_str()));
249
  noaspell = true;
250
  return false;
251
    }
252
    LOGDEB(("FsIndexer::createAspellDict: creating dictionary\n"));
253
    if (!aspell.buildDict(m_db, reason)) {
254
  LOGERR(("FsIndexer::createAspellDict: aspell buildDict failed: %s\n", 
255
      reason.c_str()));
256
  noaspell = true;
257
  return false;
258
    }
259
#endif
260
    return true;
261
}
262
263
/** 
264
 * Index individual files, out of a full tree run. No database purging
265
 */
266
bool FsIndexer::indexFiles(const list<string> &filenames)
267
{
268
    bool called_init = false;
269
270
    list<string>::const_iterator it;
271
    for (it = filenames.begin(); it != filenames.end(); it++) {
272
  string dir = path_getfather(*it);
273
  m_config->setKeyDir(dir);
274
  int abslen;
275
  if (m_config->getConfParam("idxabsmlen", &abslen))
276
      m_db.setAbstractParams(abslen, -1, -1);
277
  struct stat stb;
278
  if (lstat(it->c_str(), &stb) != 0) {
279
      LOGERR(("FsIndexer::indexFiles: lstat(%s): %s", it->c_str(),
280
          strerror(errno)));
281
      continue;
282
  }
283
284
  // If we get to indexing directory names one day, will need to test 
285
  // against dbdir here to avoid modification loops (with rclmon).
286
  if (!S_ISREG(stb.st_mode)) {
287
      LOGDEB2(("FsIndexer::indexFiles: %s: not a regular file\n", 
288
          it->c_str()));
289
      continue;
290
  }
291
292
  static string lstdir;
293
  static list<string> skpl;
294
  if (lstdir.compare(dir)) {
295
      LOGDEB(("Recomputing list of skipped names\n"));
296
      skpl = m_config->getSkippedNames();
297
      lstdir = dir;
298
  }
299
  if (!skpl.empty()) {
300
      list<string>::const_iterator skit;
301
      string fn = path_getsimple(*it);
302
      for (skit = skpl.begin(); skit != skpl.end(); skit++) {
303
      if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
304
          LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str()));
305
          goto skipped;
306
      }
307
      }
308
  }
309
  // Defer opening db until really needed.
310
  if (!called_init) {
311
      if (!init())
312
      return false;
313
      called_init = true;
314
  }
315
  if (processone(*it, &stb, FsTreeWalker::FtwRegular) != 
316
      FsTreeWalker::FtwOk) {
317
      LOGERR(("FsIndexer::indexFiles: processone failed\n"));
318
      return false;
319
  }
320
    skipped: 
321
  false; // Need a statement here to make compiler happy ??
322
    }
323
324
    // The close would be done in our destructor, but we want status here
325
    if (!m_db.close()) {
326
  LOGERR(("FsIndexer::indexfiles: error closing database in %s\n", 
327
      getDbDir().c_str()));
328
  return false;
329
    }
330
    return true;
331
}
332
333
334
/** Purge docs for given files out of the database */
335
bool FsIndexer::purgeFiles(const list<string> &filenames)
336
{
337
    if (!init())
338
  return false;
339
340
    list<string>::const_iterator it;
341
    for (it = filenames.begin(); it != filenames.end(); it++) {
342
  string udi;
343
  make_udi(*it, "", udi);
344
  if (!m_db.purgeFile(udi)) {
345
      LOGERR(("FsIndexer::purgeFiles: Database error\n"));
346
      return false;
347
  }
348
    }
349
350
    // The close would be done in our destructor, but we want status here
351
    if (!m_db.close()) {
352
  LOGERR(("FsIndexer::purgefiles: error closing database in %s\n", 
353
      getDbDir().c_str()));
354
  return false;
355
    }
356
    return true;
357
}
358
359
// Local fields can be set for fs subtrees in the configuration file 
360
void FsIndexer::localfieldsfromconf()
361
{
362
    LOGDEB(("FsIndexer::localfieldsfromconf\n"));
363
    m_localfields.clear();
364
    string sfields;
365
    if (!m_config->getConfParam("localfields", sfields))
366
        return;
367
    list<string> lfields;
368
    if (!stringToStrings(sfields, lfields)) {
369
        LOGERR(("FsIndexer::localfieldsfromconf: bad syntax for [%s]\n", 
370
                sfields.c_str()));
371
        return;
372
    }
373
    for (list<string>::const_iterator it = lfields.begin();
374
         it != lfields.end(); it++) {
375
        ConfSimple conf(*it, 1, true);
376
        list<string> nmlst = conf.getNames("");
377
        for (list<string>::const_iterator it1 = nmlst.begin();
378
             it1 != nmlst.end(); it1++) {
379
            conf.get(*it1, m_localfields[*it1]);
380
            LOGDEB2(("FsIndexer::localfieldsfromconf: [%s] => [%s]\n",
381
                    (*it1).c_str(), m_localfields[*it1].c_str()));
382
        }
383
    }
384
}
385
386
// 
387
void FsIndexer::setlocalfields(Rcl::Doc& doc)
388
{
389
    for (map<string, string>::const_iterator it = m_localfields.begin();
390
         it != m_localfields.end(); it++) {
391
        // Should local fields override those coming from the document
392
        // ? I think not, but not too sure
393
        if (doc.meta.find(it->second) == doc.meta.end()) {
394
            doc.meta[it->first] = it->second;
395
        }
396
    }
397
}
398
399
400
/// This method gets called for every file and directory found by the
401
/// tree walker. 
402
///
403
/// It checks with the db if the file has changed and needs to be
404
/// reindexed. If so, it calls internfile() which will identify the
405
/// file type and call an appropriate handler to convert the document into
406
/// internal format, which we then add to the database.
407
///
408
/// Accent and majuscule handling are performed by the db module when doing
409
/// the actual indexing work. The Rcl::Doc created by internfile()
410
/// mostly contains pretty raw utf8 data.
411
FsTreeWalker::Status 
412
FsIndexer::processone(const std::string &fn, const struct stat *stp, 
413
            FsTreeWalker::CbFlag flg)
414
{
415
    if (m_updater && !m_updater->update()) {
416
        return FsTreeWalker::FtwStop;
417
    }
418
419
    // If we're changing directories, possibly adjust parameters (set
420
    // the current directory in configuration object)
421
    if (flg == FsTreeWalker::FtwDirEnter || 
422
  flg == FsTreeWalker::FtwDirReturn) {
423
  m_config->setKeyDir(fn);
424
425
  int abslen;
426
  if (m_config->getConfParam("idxabsmlen", &abslen))
427
      m_db.setAbstractParams(abslen, -1, -1);
428
429
        // Adjust local fields from config for this subtree
430
        if (m_havelocalfields)
431
            localfieldsfromconf();
432
433
  if (flg == FsTreeWalker::FtwDirReturn)
434
      return FsTreeWalker::FtwOk;
435
    }
436
437
    ////////////////////
438
    // Check db up to date ? Doing this before file type
439
    // identification means that, if usesystemfilecommand is switched
440
    // from on to off it may happen that some files which are now
441
    // without mime type will not be purged from the db, resulting
442
    // in possible 'cannot intern file' messages at query time...
443
444
    // Document signature. This is based on m/ctime and size and used
445
    // for the uptodate check (the value computed here is checked
446
    // against the stored one). Changing the computation forces a full
447
    // reindex of course.
448
    char cbuf[100]; 
449
    sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
450
    string sig = cbuf;
451
    string udi;
452
    make_udi(fn, "", udi);
453
    if (!m_db.needUpdate(udi, sig)) {
454
  LOGDEB(("processone: up to date: %s\n", fn.c_str()));
455
  if (m_updater) {
456
      // Status bar update, abort request etc.
457
      m_updater->status.fn = fn;
458
      if (!m_updater->update()) {
459
      return FsTreeWalker::FtwStop;
460
      }
461
  }
462
  return FsTreeWalker::FtwOk;
463
    }
464
465
    LOGDEB0(("processone: processing: [%s] %s\n", 
466
             displayableBytes(stp->st_size).c_str(), fn.c_str()));
467
468
    FileInterner interner(fn, stp, m_config, m_tmpdir, FileInterner::FIF_none);
469
470
    // File name transcoded to utf8 for indexation. 
471
    string charset = m_config->getDefCharset(true);
472
    // If this fails, the file name won't be indexed, no big deal
473
    // Note that we used to do the full path here, but I ended up believing
474
    // that it made more sense to use only the file name
475
    string utf8fn; int ercnt;
476
    if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) {
477
  LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n",
478
      charset.c_str(), path_getsimple(fn).c_str()));
479
    } else if (ercnt) {
480
  LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n",
481
      ercnt, charset.c_str(), path_getsimple(fn).c_str()));
482
    }
483
    LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
484
       path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(), 
485
       "UTF-8"));
486
487
    string parent_udi;
488
    make_udi(fn, "", parent_udi);
489
    Rcl::Doc doc;
490
    const string plus("+");
491
    char ascdate[20];
492
    sprintf(ascdate, "%ld", long(stp->st_mtime));
493
494
    FileInterner::Status fis = FileInterner::FIAgain;
495
    bool hadNullIpath = false;
496
    while (fis == FileInterner::FIAgain) {
497
  doc.erase();
498
  string ipath;
499
  fis = interner.internfile(doc, ipath);
500
501
        // Index at least the file name even if there was an error.
502
        // We'll change the signature to ensure that the indexing will
503
        // be retried every time.
504
505
506
  // Internal access path for multi-document files
507
  if (ipath.empty())
508
      hadNullIpath = true;
509
  else
510
      doc.ipath = ipath;
511
512
  // Set file name, mod time and url if not done by filter
513
  if (doc.fmtime.empty())
514
      doc.fmtime = ascdate;
515
        if (doc.url.empty())
516
            doc.url = string("file://") + fn;
517
  if (doc.utf8fn.empty())
518
      doc.utf8fn = utf8fn;
519
520
  char cbuf[100]; 
521
  sprintf(cbuf, "%ld", (long)stp->st_size);
522
  doc.fbytes = cbuf;
523
  // Document signature for up to date checks: concatenate
524
  // m/ctime and size. Looking for changes only, no need to
525
  // parseback so no need for reversible formatting. Also set,
526
  // but never used, for subdocs.
527
  sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
528
  doc.sig = cbuf;
529
  // If there was an error, ensure indexing will be
530
  // retried. This is for the once missing, later installed
531
  // filter case. It can make indexing much slower (if there are
532
  // myriads of such files, the ext script is executed for them
533
  // and fails every time)
534
  if (fis == FileInterner::FIError) {
535
      doc.sig += plus;
536
  }
537
538
        // Possibly add fields from local config
539
        if (m_havelocalfields) 
540
            setlocalfields(doc);
541
  // Add document to database. If there is an ipath, add it as a children
542
  // of the file document.
543
  string udi;
544
  make_udi(fn, ipath, udi);
545
  if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc)) 
546
      return FsTreeWalker::FtwError;
547
548
  // Tell what we are doing and check for interrupt request
549
  if (m_updater) {
550
      ++(m_updater->status.docsdone);
551
            m_updater->status.fn = fn;
552
            if (!ipath.empty())
553
                m_updater->status.fn += "|" + ipath;
554
            if (!m_updater->update()) {
555
                return FsTreeWalker::FtwStop;
556
            }
557
  }
558
    }
559
560
    // If we had no instance with a null ipath, we create an empty
561
    // document to stand for the file itself, to be used mainly for up
562
    // to date checks. Typically this happens for an mbox file.
563
    if (hadNullIpath == false) {
564
  LOGDEB1(("Creating empty doc for file\n"));
565
  Rcl::Doc fileDoc;
566
  fileDoc.fmtime = ascdate;
567
  fileDoc.utf8fn = utf8fn;
568
  fileDoc.mimetype = interner.getMimetype();
569
  fileDoc.url = string("file://") + fn;
570
571
  char cbuf[100]; 
572
  sprintf(cbuf, "%ld", (long)stp->st_size);
573
  fileDoc.fbytes = cbuf;
574
  // Document signature for up to date checks.
575
  sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
576
  fileDoc.sig = cbuf;
577
  if (!m_db.addOrUpdate(parent_udi, "", fileDoc)) 
578
      return FsTreeWalker::FtwError;
579
    }
580
581
    return FsTreeWalker::FtwOk;
582
}