|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
|
... |
|
... |
59 |
#include "internfile.h"
|
59 |
#include "internfile.h"
|
60 |
#include "utf8fn.h"
|
60 |
#include "utf8fn.h"
|
61 |
#ifdef RCL_USE_ASPELL
|
61 |
#ifdef RCL_USE_ASPELL
|
62 |
#include "rclaspell.h"
|
62 |
#include "rclaspell.h"
|
63 |
#endif
|
63 |
#endif
|
|
|
64 |
#include "zlibut.h"
|
64 |
|
65 |
|
65 |
// Recoll index format version is stored in user metadata. When this changes,
|
66 |
// Recoll index format version is stored in user metadata. When this changes,
|
66 |
// we can't open the db and will have to reindex.
|
67 |
// we can't open the db and will have to reindex.
|
67 |
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
68 |
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
|
68 |
static const string cstr_RCL_IDX_VERSION("1");
|
69 |
static const string cstr_RCL_IDX_VERSION("1");
|
|
... |
|
... |
1456 |
LOGDEB2("Db::add: split body: [" << doc.text << "]\n");
|
1457 |
LOGDEB2("Db::add: split body: [" << doc.text << "]\n");
|
1457 |
|
1458 |
|
1458 |
#ifdef TEXTSPLIT_STATS
|
1459 |
#ifdef TEXTSPLIT_STATS
|
1459 |
splitter.resetStats();
|
1460 |
splitter.resetStats();
|
1460 |
#endif
|
1461 |
#endif
|
1461 |
if (!splitter.text_to_words(doc.text))
|
1462 |
if (!splitter.text_to_words(doc.text)) {
|
1462 |
LOGDEB("Db::addOrUpdate: split failed for main text\n");
|
1463 |
LOGDEB("Db::addOrUpdate: split failed for main text\n");
|
|
|
1464 |
} else {
|
|
|
1465 |
#ifdef RAWTEXT_IN_VALUE
|
|
|
1466 |
if (o_index_storerawtext) {
|
|
|
1467 |
ZLibUtBuf buf;
|
|
|
1468 |
deflateToBuf(doc.text.c_str(), doc.text.size(), buf);
|
|
|
1469 |
string tt;
|
|
|
1470 |
tt.assign(buf.getBuf(), buf.getCnt());
|
|
|
1471 |
newdocument.add_value(VALUE_RAWTEXT, tt);
|
|
|
1472 |
}
|
|
|
1473 |
#endif
|
|
|
1474 |
}
|
1463 |
|
1475 |
|
1464 |
#ifdef TEXTSPLIT_STATS
|
1476 |
#ifdef TEXTSPLIT_STATS
|
1465 |
// Reject bad data. Unrecognized base64 text is characterized by
|
1477 |
// Reject bad data. Unrecognized base64 text is characterized by
|
1466 |
// high avg word length and high variation (because there are
|
1478 |
// high avg word length and high variation (because there are
|
1467 |
// word-splitters like +/ inside the data).
|
1479 |
// word-splitters like +/ inside the data).
|
|
... |
|
... |
1668 |
MD5HexScan(*md5, digest);
|
1680 |
MD5HexScan(*md5, digest);
|
1669 |
newdocument.add_value(VALUE_MD5, digest);
|
1681 |
newdocument.add_value(VALUE_MD5, digest);
|
1670 |
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
|
1682 |
newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
|
1671 |
}
|
1683 |
}
|
1672 |
|
1684 |
|
|
|
1685 |
#ifdef RAWTEXT_IN_DATA
|
|
|
1686 |
if (o_index_storerawtext) {
|
|
|
1687 |
RECORD_APPEND(record, string("RAWTEXT"),
|
|
|
1688 |
neutchars(doc.text, cstr_nc));
|
|
|
1689 |
}
|
|
|
1690 |
#endif
|
1673 |
LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
|
1691 |
LOGDEB0("Rcl::Db::add: new doc record:\n" << record << "\n");
|
1674 |
newdocument.set_data(record);
|
1692 |
newdocument.set_data(record);
|
1675 |
}
|
1693 |
}
|
1676 |
#ifdef IDX_THREADS
|
1694 |
#ifdef IDX_THREADS
|
1677 |
if (m_ndb->m_havewriteq) {
|
1695 |
if (m_ndb->m_havewriteq) {
|