recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [81c171] .. [b4493e]

Switch to unified view


...
#include "internfile.h"
#include "utf8fn.h"
#ifdef RCL_USE_ASPELL
#include "rclaspell.h"
#endif
#include "zlibut.h"

// Recoll index format version is stored in user metadata. When this changes,
// we can't open the db and will have to reindex.
// NOTE(review): presumably the first constant is the metadata key and the
// second is the current version value stored under it -- their use is not
// visible in this excerpt, confirm against the open/create code.
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
static const string cstr_RCL_IDX_VERSION("1");
...
    LOGDEB2("Db::add: split body: [" << doc.text << "]\n");

#ifdef TEXTSPLIT_STATS
    splitter.resetStats();
#endif
    if (!splitter.text_to_words(doc.text)) {
        LOGDEB("Db::addOrUpdate: split failed for main text\n");
        } else {
#ifdef RAWTEXT_IN_VALUE
            if (o_index_storerawtext) {
                ZLibUtBuf buf;
                deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
                string tt;
                tt.assign(buf.getBuf(), buf.getCnt());
                newdocument.add_value(VALUE_RAWTEXT, tt);
            }
#endif
        }

#ifdef TEXTSPLIT_STATS
    // Reject bad data. Unrecognized base64 text is characterized by
    // high avg word length and high variation (because there are
    // word-splitters like +/ inside the data).
...
        MD5HexScan(*md5, digest);
        newdocument.add_value(VALUE_MD5, digest);
        newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
    }

#ifdef RAWTEXT_IN_DATA
        if (o_index_storerawtext) {
            RECORD_APPEND(record, string("RAWTEXT"),
                          neutchars(doc.text, cstr_nc));
        }
#endif
    LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
    newdocument.set_data(record);
    }
#ifdef IDX_THREADS
    if (m_ndb->m_havewriteq) {

	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
	...		...
59	#include "internfile.h"	59	#include "internfile.h"
60	#include "utf8fn.h"	60	#include "utf8fn.h"
61	#ifdef RCL_USE_ASPELL	61	#ifdef RCL_USE_ASPELL
62	#include "rclaspell.h"	62	#include "rclaspell.h"
63	#endif	63	#endif
		64	#include "zlibut.h"
64		65
65	// Recoll index format version is stored in user metadata. When this change,	66	// Recoll index format version is stored in user metadata. When this change,
66	// we can't open the db and will have to reindex.	67	// we can't open the db and will have to reindex.
67	static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");	68	static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
68	static const string cstr_RCL_IDX_VERSION("1");	69	static const string cstr_RCL_IDX_VERSION("1");
	...		...
1456	LOGDEB2("Db::add: split body: [" << doc.text << "]\n");	1457	LOGDEB2("Db::add: split body: [" << doc.text << "]\n");
1457		1458
1458	#ifdef TEXTSPLIT_STATS	1459	#ifdef TEXTSPLIT_STATS
1459	splitter.resetStats();	1460	splitter.resetStats();
1460	#endif	1461	#endif
1461	if (!splitter.text_to_words(doc.text))	1462	if (!splitter.text_to_words(doc.text)) {
1462	LOGDEB("Db::addOrUpdate: split failed for main text\n");	1463	LOGDEB("Db::addOrUpdate: split failed for main text\n");
		1464	} else {
		1465	#ifdef RAWTEXT_IN_VALUE
		1466	if (o_index_storerawtext) {
		1467	ZLibUtBuf buf;
		1468	deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
		1469	string tt;
		1470	tt.assign(buf.getBuf(), buf.getCnt());
		1471	newdocument.add_value(VALUE_RAWTEXT, tt);
		1472	}
		1473	#endif
		1474	}
1463		1475
1464	#ifdef TEXTSPLIT_STATS	1476	#ifdef TEXTSPLIT_STATS
1465	// Reject bad data. unrecognized base64 text is characterized by	1477	// Reject bad data. unrecognized base64 text is characterized by
1466	// high avg word length and high variation (because there are	1478	// high avg word length and high variation (because there are
1467	// word-splitters like +/ inside the data).	1479	// word-splitters like +/ inside the data).
	...		...
1668	MD5HexScan(*md5, digest);	1680	MD5HexScan(*md5, digest);
1669	newdocument.add_value(VALUE_MD5, digest);	1681	newdocument.add_value(VALUE_MD5, digest);
1670	newdocument.add_boolean_term(wrap_prefix("XM") + *md5);	1682	newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
1671	}	1683	}
1672		1684
		1685	#ifdef RAWTEXT_IN_DATA
		1686	if (o_index_storerawtext) {
		1687	RECORD_APPEND(record, string("RAWTEXT"),
		1688	neutchars(doc.text, cstr_nc));
		1689	}
		1690	#endif
1673	LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");	1691	LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
1674	newdocument.set_data(record);	1692	newdocument.set_data(record);
1675	}	1693	}
1676	#ifdef IDX_THREADS	1694	#ifdef IDX_THREADS
1677	if (m_ndb->m_havewriteq) {	1695	if (m_ndb->m_havewriteq) {