recoll / Code / Diff of /src/index/indexer.cpp

Diff of /src/index/indexer.cpp [69dcb9] .. [d14601]

Switch to unified view


...

#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>
#include <errno.h>










#include "debuglog.h"



#include "indexer.h"

































































































































































































































































































































































































































































































































































// Destructor: hands the m_fsindexer member to deleteZ so that an indexer
// object still held by this ConfIndexer is disposed of.
// NOTE(review): deleteZ is not defined in the visible part of this file;
// presumably it deletes the pointer and resets it to 0 -- confirm against
// the header that provides it.
ConfIndexer::~ConfIndexer()
{
     deleteZ(m_fsindexer);
}

// Run an indexing pass over every directory group held in dbmap.
//
// resetbefore is passed unchanged to FsIndexer::indexTrees() for each group.
//
// Returns true when indexTrees() succeeded for every group. On the first
// failure, m_reason is set to "Failed indexing in " followed by the map key
// of the failing group, and false is returned without processing the
// remaining groups.
//
// NOTE(review): the code that builds dbmap from the top directories is
// elided ("...") in this view, so only the visible part is described here.
bool ConfIndexer::index(bool resetbefore)
{
    // Top-level directories, as listed by the configuration
    list<string> tdl = m_config->getTopdirs();
...
    // Reset the configuration key directory to the empty string
    m_config->setKeyDir("");

    // The dbmap now has dbdir as key and directory lists as values.
    // Index each directory group in turn
    for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {
    // A fresh FsIndexer is allocated for each group and released through
    // deleteZ on both the failure and the success path below.
    m_fsindexer = new FsIndexer(m_config, m_updater);
    if (!m_fsindexer->indexTrees(resetbefore, &dbit->second)) {
        deleteZ(m_fsindexer);
        // Record which map key (dbit->first) the failure occurred for
        m_reason = "Failed indexing in " + dbit->first;
        return false;
    }
    deleteZ(m_fsindexer);
    }
    return true;
}

	a/src/index/indexer.cpp		b/src/index/indexer.cpp
	...		...
23		23
24	#include <stdio.h>	24	#include <stdio.h>
25	#include <sys/stat.h>	25	#include <sys/stat.h>
26	#include <unistd.h>	26	#include <unistd.h>
27	#include <errno.h>	27	#include <errno.h>
28	#include <cstring>
29	#include <fnmatch.h>
30		28
31	#include <iostream>
32	#include <list>
33	#include <map>
34	#include <algorithm>
35
36	#include "pathut.h"
37	#include "conftree.h"
38	#include "rclconfig.h"	29	#include "debuglog.h"
39	#include "fstreewalk.h"
40	#include "rcldb.h"
41	#include "readfile.h"
42	#include "indexer.h"	30	#include "indexer.h"
43	#include "csguess.h"
44	#include "transcode.h"
45	#include "debuglog.h"
46	#include "internfile.h"
47	#include "smallut.h"
48	#include "wipedir.h"
49	#include "fileudi.h"
50
51	#ifdef RCL_USE_ASPELL
52	#include "rclaspell.h"
53	#endif
54
55	// When using extended attributes, we have to use the ctime.
56	// This is quite an expensive price to pay...
57	#ifdef RCL_USE_XATTR
58	#define RCL_STTIME st_ctime
59	#else
60	#define RCL_STTIME st_mtime
61	#endif // RCL_USE_XATTR
62
63	#ifndef NO_NAMESPACES
64	using namespace std;
65	#endif /* NO_NAMESPACES */
66
67	#ifndef deleteZ
68	#define deleteZ(X) {delete X;X = 0;}
69	#endif
70
71	DbIndexer::~DbIndexer() {
72	// Maybe clean up temporary directory
73	if (m_tmpdir.length()) {
74	wipedir(m_tmpdir);
75	if (rmdir(m_tmpdir.c_str()) < 0) {
76	LOGERR(("DbIndexer::~DbIndexer: cannot clear temp dir %s\n",
77	m_tmpdir.c_str()));
78	}
79	}
80	m_db.close();
81	}
82
83	list<string> DbIndexer::getStemmerNames()
84	{
85	return Rcl::Db::getStemmerNames();
86	}
87
88	// Index each directory in the topdirs for a given db
89	bool DbIndexer::indexDb(bool resetbefore, list<string> *topdirs)
90	{
91	if (!init(resetbefore))
92	return false;
93
94	if (m_updater) {
95	m_updater->status.reset();
96	m_updater->status.dbtotdocs = m_db.docCnt();
97	}
98
99	m_walker.setSkippedPaths(m_config->getSkippedPaths());
100
101	for (list<string>::const_iterator it = topdirs->begin();
102	it != topdirs->end(); it++) {
103	LOGDEB(("DbIndexer::index: Indexing %s into %s\n", it->c_str(),
104	getDbDir().c_str()));
105
106	// Set the current directory in config so that subsequent
107	// getConfParams() will get local values
108	m_config->setKeyDir(*it);
109
110	// Adjust the "follow symlinks" option
111	bool follow;
112	if (m_config->getConfParam("followLinks", &follow) && follow) {
113	m_walker.setOpts(FsTreeWalker::FtwFollow);
114	} else {
115	m_walker.setOpts(FsTreeWalker::FtwOptNone);
116	}
117
118	int abslen;
119	if (m_config->getConfParam("idxabsmlen", &abslen))
120	m_db.setAbstractParams(abslen, -1, -1);
121
122	// Set up skipped patterns for this subtree. This probably should be
123	// done in the directory change code in processone() instead.
124	m_walker.setSkippedNames(m_config->getSkippedNames());
125
126	// Walk the directory tree
127	if (m_walker.walk(*it, *this) != FsTreeWalker::FtwOk) {
128	LOGERR(("DbIndexer::index: error while indexing %s: %s\n",
129	it->c_str(), m_walker.getReason().c_str()));
130	return false;
131	}
132	}
133	if (m_updater) {
134	m_updater->status.fn.erase();
135	m_updater->status.phase = DbIxStatus::DBIXS_PURGE;
136	m_updater->update();
137	}
138
139	// Get rid of all database entries that don't exist in the
140	// filesystem anymore.
141	m_db.purge();
142
143	createStemmingDatabases();
144	createAspellDict();
145
146	if (m_updater) {
147	m_updater->status.phase = DbIxStatus::DBIXS_CLOSING;
148	m_updater->status.fn.erase();
149	m_updater->update();
150	}
151	// The close would be done in our destructor, but we want status here
152	if (!m_db.close()) {
153	LOGERR(("DbIndexer::index: error closing database in %s\n",
154	getDbDir().c_str()));
155	return false;
156	}
157	string missing;
158	FileInterner::getMissingDescription(missing);
159	if (!missing.empty()) {
160	LOGINFO(("DbIndexer::index missing helper program(s):\n%s\n",
161	missing.c_str()));
162	}
163	m_config->storeMissingHelperDesc(missing);
164	return true;
165	}
166
167	// Create stemming databases. We also remove those which are not
168	// configured.
169	bool DbIndexer::createStemmingDatabases()
170	{
171	string slangs;
172	if (m_config->getConfParam("indexstemminglanguages", slangs)) {
173	list<string> langs;
174	stringToStrings(slangs, langs);
175
176	// Get the list of existing stem dbs from the database (some may have
177	// been manually created, we just keep those from the config
178	list<string> dblangs = m_db.getStemLangs();
179	list<string>::const_iterator it;
180	for (it = dblangs.begin(); it != dblangs.end(); it++) {
181	if (find(langs.begin(), langs.end(), *it) == langs.end())
182	m_db.deleteStemDb(*it);
183	}
184	for (it = langs.begin(); it != langs.end(); it++) {
185	if (m_updater) {
186	m_updater->status.phase = DbIxStatus::DBIXS_STEMDB;
187	m_updater->status.fn = *it;
188	m_updater->update();
189	}
190	m_db.createStemDb(*it);
191	}
192	}
193	return true;
194	}
195
196	bool DbIndexer::init(bool resetbefore, bool rdonly)
197	{
198	if (!rdonly && (m_tmpdir.empty() || access(m_tmpdir.c_str(), 0) < 0)) {
199	string reason;
200	if (!maketmpdir(m_tmpdir, reason)) {
201	LOGERR(("DbIndexer: cannot create temporary directory: %s\n",
202	reason.c_str()));
203	return false;
204	}
205	}
206	Rcl::Db::OpenMode mode = rdonly ? Rcl::Db::DbRO :
207	resetbefore ? Rcl::Db::DbTrunc : Rcl::Db::DbUpd;
208	if (!m_db.open(mode)) {
209	LOGERR(("DbIndexer: error opening database %s\n", getDbDir().c_str()));
210	return false;
211	}
212
213	return true;
214	}
215
216	bool DbIndexer::createStemDb(const string &lang)
217	{
218	if (!init(false, true))
219	return false;
220	return m_db.createStemDb(lang);
221	}
222
223	// The language for the aspell dictionary is handled internally by the aspell
224	// module, either from a configuration variable or the NLS environment.
225	bool DbIndexer::createAspellDict()
226	{
227	LOGDEB2(("DbIndexer::createAspellDict()\n"));
228	#ifdef RCL_USE_ASPELL
229	// For the benefit of the real-time indexer, we only initialize
230	// noaspell from the configuration once. It can then be set to
231	// true if dictionary generation fails, which avoids retrying
232	// it forever.
233	static int noaspell = -12345;
234	if (noaspell == -12345) {
235	noaspell = false;
236	m_config->getConfParam("noaspell", &noaspell);
237	}
238	if (noaspell)
239	return true;
240
241	if (!init(false, true))
242	return false;
243	Aspell aspell(m_config);
244	string reason;
245	if (!aspell.init(reason)) {
246	LOGERR(("DbIndexer::createAspellDict: aspell init failed: %s\n",
247	reason.c_str()));
248	noaspell = true;
249	return false;
250	}
251	LOGDEB(("DbIndexer::createAspellDict: creating dictionary\n"));
252	if (!aspell.buildDict(m_db, reason)) {
253	LOGERR(("DbIndexer::createAspellDict: aspell buildDict failed: %s\n",
254	reason.c_str()));
255	noaspell = true;
256	return false;
257	}
258	#endif
259	return true;
260	}
261
262	/**
263	* Index individual files, out of a full tree run. No database purging
264	*/
265	bool DbIndexer::indexFiles(const list<string> &filenames)
266	{
267	bool called_init = false;
268
269	list<string>::const_iterator it;
270	for (it = filenames.begin(); it != filenames.end(); it++) {
271	string dir = path_getfather(*it);
272	m_config->setKeyDir(dir);
273	int abslen;
274	if (m_config->getConfParam("idxabsmlen", &abslen))
275	m_db.setAbstractParams(abslen, -1, -1);
276	struct stat stb;
277	if (lstat(it->c_str(), &stb) != 0) {
278	LOGERR(("DbIndexer::indexFiles: lstat(%s): %s", it->c_str(),
279	strerror(errno)));
280	continue;
281	}
282
283	// If we get to indexing directory names one day, will need to test
284	// against dbdir here to avoid modification loops (with rclmon).
285	if (!S_ISREG(stb.st_mode)) {
286	LOGDEB2(("DbIndexer::indexFiles: %s: not a regular file\n",
287	it->c_str()));
288	continue;
289	}
290
291	static string lstdir;
292	static list<string> skpl;
293	if (lstdir.compare(dir)) {
294	LOGDEB(("Recomputing list of skipped names\n"));
295	skpl = m_config->getSkippedNames();
296	lstdir = dir;
297	}
298	if (!skpl.empty()) {
299	list<string>::const_iterator skit;
300	string fn = path_getsimple(*it);
301	for (skit = skpl.begin(); skit != skpl.end(); skit++) {
302	if (fnmatch(skit->c_str(), fn.c_str(), 0) == 0) {
303	LOGDEB(("Skipping [%s] :matches skip list\n", fn.c_str()));
304	goto skipped;
305	}
306	}
307	}
308	// Defer opening db until really needed.
309	if (!called_init) {
310	if (!init())
311	return false;
312	called_init = true;
313	}
314	if (processone(*it, &stb, FsTreeWalker::FtwRegular) !=
315	FsTreeWalker::FtwOk) {
316	LOGERR(("DbIndexer::indexFiles: processone failed\n"));
317	return false;
318	}
319	skipped:
320	false; // Need a statement here to make compiler happy ??
321	}
322
323	// The close would be done in our destructor, but we want status here
324	if (!m_db.close()) {
325	LOGERR(("DbIndexer::indexfiles: error closing database in %s\n",
326	getDbDir().c_str()));
327	return false;
328	}
329	return true;
330	}
331
332
333	/** Purge docs for given files out of the database */
334	bool DbIndexer::purgeFiles(const list<string> &filenames)
335	{
336	if (!init())
337	return false;
338
339	list<string>::const_iterator it;
340	for (it = filenames.begin(); it != filenames.end(); it++) {
341	string udi;
342	make_udi(*it, "", udi);
343	if (!m_db.purgeFile(udi)) {
344	LOGERR(("DbIndexer::purgeFiles: Database error\n"));
345	return false;
346	}
347	}
348
349	// The close would be done in our destructor, but we want status here
350	if (!m_db.close()) {
351	LOGERR(("DbIndexer::purgefiles: error closing database in %s\n",
352	getDbDir().c_str()));
353	return false;
354	}
355	return true;
356	}
357
358	// Local fields can be set for fs subtrees in the configuration file
359	void DbIndexer::localfieldsfromconf()
360	{
361	LOGDEB(("DbIndexer::localfieldsfromconf\n"));
362	m_localfields.clear();
363	string sfields;
364	if (!m_config->getConfParam("localfields", sfields))
365	return;
366	list<string> lfields;
367	if (!stringToStrings(sfields, lfields)) {
368	LOGERR(("DbIndexer::localfieldsfromconf: bad syntax for [%s]\n",
369	sfields.c_str()));
370	return;
371	}
372	for (list<string>::const_iterator it = lfields.begin();
373	it != lfields.end(); it++) {
374	ConfSimple conf(*it, 1, true);
375	list<string> nmlst = conf.getNames("");
376	for (list<string>::const_iterator it1 = nmlst.begin();
377	it1 != nmlst.end(); it1++) {
378	conf.get(*it1, m_localfields[*it1]);
379	LOGDEB2(("DbIndexer::localfieldsfromconf: [%s] => [%s]\n",
380	(*it1).c_str(), m_localfields[*it1].c_str()));
381	}
382	}
383	}
384
385	//
386	void DbIndexer::setlocalfields(Rcl::Doc& doc)
387	{
388	for (map<string, string>::const_iterator it = m_localfields.begin();
389	it != m_localfields.end(); it++) {
390	// Should local fields override those coming from the document
391	// ? I think not, but not too sure
392	if (doc.meta.find(it->second) == doc.meta.end()) {
393	doc.meta[it->first] = it->second;
394	}
395	}
396	}
397
398
399	/// This method gets called for every file and directory found by the
400	/// tree walker.
401	///
402	/// It checks with the db if the file has changed and needs to be
403	/// reindexed. If so, it calls internfile() which will identify the
404	/// file type and call an appropriate handler to convert the document into
405	/// internal format, which we then add to the database.
406	///
407	/// Accent and majuscule handling are performed by the db module when doing
408	/// the actual indexing work. The Rcl::Doc created by internfile()
409	/// mostly contains pretty raw utf8 data.
410	FsTreeWalker::Status
411	DbIndexer::processone(const std::string &fn, const struct stat *stp,
412	FsTreeWalker::CbFlag flg)
413	{
414	if (m_updater && !m_updater->update()) {
415	return FsTreeWalker::FtwStop;
416	}
417
418	// If we're changing directories, possibly adjust parameters (set
419	// the current directory in configuration object)
420	if (flg == FsTreeWalker::FtwDirEnter ||
421	flg == FsTreeWalker::FtwDirReturn) {
422	m_config->setKeyDir(fn);
423
424	int abslen;
425	if (m_config->getConfParam("idxabsmlen", &abslen))
426	m_db.setAbstractParams(abslen, -1, -1);
427
428	// Adjust local fields from config for this subtree
429	if (m_havelocalfields)
430	localfieldsfromconf();
431
432	if (flg == FsTreeWalker::FtwDirReturn)
433	return FsTreeWalker::FtwOk;
434	}
435
436	////////////////////
437	// Check db up to date ? Doing this before file type
438	// identification means that, if usesystemfilecommand is switched
439	// from on to off it may happen that some files which are now
440	// without mime type will not be purged from the db, resulting
441	// in possible 'cannot intern file' messages at query time...
442
443	// Document signature. This is based on m/ctime and size and used
444	// for the uptodate check (the value computed here is checked
445	// against the stored one). Changing the computation forces a full
446	// reindex of course.
447	char cbuf[100];
448	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
449	string sig = cbuf;
450	string udi;
451	make_udi(fn, "", udi);
452	if (!m_db.needUpdate(udi, sig)) {
453	LOGDEB(("processone: up to date: %s\n", fn.c_str()));
454	if (m_updater) {
455	// Status bar update, abort request etc.
456	m_updater->status.fn = fn;
457	if (!m_updater->update()) {
458	return FsTreeWalker::FtwStop;
459	}
460	}
461	return FsTreeWalker::FtwOk;
462	}
463
464	LOGDEB0(("processone: processing: [%s] %s\n",
465	displayableBytes(stp->st_size).c_str(), fn.c_str()));
466
467	FileInterner interner(fn, stp, m_config, m_tmpdir, FileInterner::FIF_none);
468
469	// File name transcoded to utf8 for indexation.
470	string charset = m_config->getDefCharset(true);
471	// If this fails, the file name won't be indexed, no big deal
472	// Note that we used to do the full path here, but I ended up believing
473	// that it made more sense to use only the file name
474	string utf8fn; int ercnt;
475	if (!transcode(path_getsimple(fn), utf8fn, charset, "UTF-8", &ercnt)) {
476	LOGERR(("processone: fn transcode failure from [%s] to UTF-8: %s\n",
477	charset.c_str(), path_getsimple(fn).c_str()));
478	} else if (ercnt) {
479	LOGDEB(("processone: fn transcode %d errors from [%s] to UTF-8: %s\n",
480	ercnt, charset.c_str(), path_getsimple(fn).c_str()));
481	}
482	LOGDEB2(("processone: fn transcoded from [%s] to [%s] (%s->%s)\n",
483	path_getsimple(fn).c_str(), utf8fn.c_str(), charset.c_str(),
484	"UTF-8"));
485
486	string parent_udi;
487	make_udi(fn, "", parent_udi);
488	Rcl::Doc doc;
489	const string plus("+");
490	char ascdate[20];
491	sprintf(ascdate, "%ld", long(stp->st_mtime));
492
493	FileInterner::Status fis = FileInterner::FIAgain;
494	bool hadNullIpath = false;
495	while (fis == FileInterner::FIAgain) {
496	doc.erase();
497	string ipath;
498	fis = interner.internfile(doc, ipath);
499
500	// Index at least the file name even if there was an error.
501	// We'll change the signature to ensure that the indexing will
502	// be retried every time.
503
504
505	// Internal access path for multi-document files
506	if (ipath.empty())
507	hadNullIpath = true;
508	else
509	doc.ipath = ipath;
510
511	// Set file name, mod time and url if not done by filter
512	if (doc.fmtime.empty())
513	doc.fmtime = ascdate;
514	if (doc.url.empty())
515	doc.url = string("file://") + fn;
516	if (doc.utf8fn.empty())
517	doc.utf8fn = utf8fn;
518
519	char cbuf[100];
520	sprintf(cbuf, "%ld", (long)stp->st_size);
521	doc.fbytes = cbuf;
522	// Document signature for up to date checks: concatenate
523	// m/ctime and size. Looking for changes only, no need to
524	// parseback so no need for reversible formatting. Also set,
525	// but never used, for subdocs.
526	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
527	doc.sig = cbuf;
528	// If there was an error, ensure indexing will be
529	// retried. This is for the once missing, later installed
530	// filter case. It can make indexing much slower (if there are
531	// myriads of such files, the ext script is executed for them
532	// and fails every time)
533	if (fis == FileInterner::FIError) {
534	doc.sig += plus;
535	}
536
537	// Possibly add fields from local config
538	if (m_havelocalfields)
539	setlocalfields(doc);
540	// Add document to database. If there is an ipath, add it as a children
541	// of the file document.
542	string udi;
543	make_udi(fn, ipath, udi);
544	if (!m_db.addOrUpdate(udi, ipath.empty() ? "" : parent_udi, doc))
545	return FsTreeWalker::FtwError;
546
547	// Tell what we are doing and check for interrupt request
548	if (m_updater) {
549	++(m_updater->status.docsdone);
550	m_updater->status.fn = fn;
551	if (!ipath.empty())
552	m_updater->status.fn += "|" + ipath;
553	if (!m_updater->update()) {
554	return FsTreeWalker::FtwStop;
555	}
556	}
557	}
558
559	// If we had no instance with a null ipath, we create an empty
560	// document to stand for the file itself, to be used mainly for up
561	// to date checks. Typically this happens for an mbox file.
562	if (hadNullIpath == false) {
563	LOGDEB1(("Creating empty doc for file\n"));
564	Rcl::Doc fileDoc;
565	fileDoc.fmtime = ascdate;
566	fileDoc.utf8fn = utf8fn;
567	fileDoc.mimetype = interner.getMimetype();
568	fileDoc.url = string("file://") + fn;
569
570	char cbuf[100];
571	sprintf(cbuf, "%ld", (long)stp->st_size);
572	fileDoc.fbytes = cbuf;
573	// Document signature for up to date checks.
574	sprintf(cbuf, "%ld%ld", (long)stp->st_size, (long)stp->RCL_STTIME);
575	fileDoc.sig = cbuf;
576	if (!m_db.addOrUpdate(parent_udi, "", fileDoc))
577	return FsTreeWalker::FtwError;
578	}
579
580	return FsTreeWalker::FtwOk;
581	}
582
583	////////////////////////////////////////////////////////////////////////////
584	// ConIndexer methods: ConfIndexer is the top-level object, that could
585	// in theory index multiple directories to multiple databases. In practise we
586	// have a single database per configuration.
587		31
588	ConfIndexer::~ConfIndexer()	32	ConfIndexer::~ConfIndexer()
589	{	33	{
590	deleteZ(m_dbindexer);	34	deleteZ(m_fsindexer);
591	}	35	}
592		36
593	bool ConfIndexer::index(bool resetbefore)	37	bool ConfIndexer::index(bool resetbefore)
594	{	38	{
595	list<string> tdl = m_config->getTopdirs();	39	list<string> tdl = m_config->getTopdirs();
	...		...
632	m_config->setKeyDir("");	76	m_config->setKeyDir("");
633		77
634	// The dbmap now has dbdir as key and directory lists as values.	78	// The dbmap now has dbdir as key and directory lists as values.
635	// Index each directory group in turn	79	// Index each directory group in turn
636	for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {	80	for (dbit = dbmap.begin(); dbit != dbmap.end(); dbit++) {
637	m_dbindexer = new DbIndexer(m_config, m_updater);	81	m_fsindexer = new FsIndexer(m_config, m_updater);
638	if (!m_dbindexer->indexDb(resetbefore, &dbit->second)) {	82	if (!m_fsindexer->indexTrees(resetbefore, &dbit->second)) {
639	deleteZ(m_dbindexer);	83	deleteZ(m_fsindexer);
640	m_reason = "Failed indexing in " + dbit->first;	84	m_reason = "Failed indexing in " + dbit->first;
641	return false;	85	return false;
642	}	86	}
643	deleteZ(m_dbindexer);	87	deleteZ(m_fsindexer);
644	}	88	}
645	return true;	89	return true;
646	}	90	}