|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
|
... |
|
... |
12 |
* You should have received a copy of the GNU General Public License
|
12 |
* You should have received a copy of the GNU General Public License
|
13 |
* along with this program; if not, write to the
|
13 |
* along with this program; if not, write to the
|
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
|
|
17 |
#include "autoconfig.h"
|
|
|
18 |
|
17 |
#include <stdio.h>
|
19 |
#include <stdio.h>
|
18 |
#include <cstring>
|
20 |
#include <cstring>
|
19 |
#include <unistd.h>
|
21 |
#include <unistd.h>
|
20 |
#include <fnmatch.h>
|
22 |
#include <fnmatch.h>
|
21 |
#include <regex.h>
|
23 |
#include <regex.h>
|
|
... |
|
... |
51 |
#include "md5.h"
|
53 |
#include "md5.h"
|
52 |
#include "rclversion.h"
|
54 |
#include "rclversion.h"
|
53 |
#include "cancelcheck.h"
|
55 |
#include "cancelcheck.h"
|
54 |
#include "ptmutex.h"
|
56 |
#include "ptmutex.h"
|
55 |
#include "termproc.h"
|
57 |
#include "termproc.h"
|
|
|
58 |
#include "expansiondbs.h"
|
56 |
|
59 |
|
57 |
#ifndef MAX
|
60 |
#ifndef MAX
|
58 |
#define MAX(A,B) (A>B?A:B)
|
61 |
#define MAX(A,B) (A>B?A:B)
|
59 |
#endif
|
62 |
#endif
|
60 |
#ifndef MIN
|
63 |
#ifndef MIN
|
|
... |
|
... |
82 |
static const string mimetype_prefix = "T";
|
85 |
static const string mimetype_prefix = "T";
|
83 |
static const string xapday_prefix = "D";
|
86 |
static const string xapday_prefix = "D";
|
84 |
static const string xapmonth_prefix = "M";
|
87 |
static const string xapmonth_prefix = "M";
|
85 |
static const string xapyear_prefix = "Y";
|
88 |
static const string xapyear_prefix = "Y";
|
86 |
const string pathelt_prefix = "XP";
|
89 |
const string pathelt_prefix = "XP";
|
|
|
90 |
#ifdef RCL_INDEX_STRIPCHARS
|
87 |
const string start_of_field_term = "XXST";
|
91 |
const string start_of_field_term = "XXST";
|
88 |
const string end_of_field_term = "XXND";
|
92 |
const string end_of_field_term = "XXND";
|
89 |
static const string page_break_term = "XXPG";
|
93 |
static const string page_break_term = "XXPG";
|
|
|
94 |
#else
|
|
|
95 |
string start_of_field_term;
|
|
|
96 |
string end_of_field_term;
|
|
|
97 |
const string page_break_term = "XXPG/";
|
|
|
98 |
#endif
|
|
|
99 |
|
90 |
// Field name for the unsplit file name. Has to exist in the field file
|
100 |
// Field name for the unsplit file name. Has to exist in the field file
|
91 |
// because of usage in termmatch()
|
101 |
// because of usage in termmatch()
|
92 |
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
102 |
static const string unsplitFilenameFieldName = "rclUnsplitFN";
|
93 |
static const string unsplitfilename_prefix = "XSFS";
|
103 |
static const string unsplitfilename_prefix = "XSFS";
|
94 |
|
104 |
|
|
... |
|
... |
195 |
// un-prefixed, so this is simpler and better.
|
205 |
// un-prefixed, so this is simpler and better.
|
196 |
static void noPrefixList(const vector<string>& in, vector<string>& out)
|
206 |
static void noPrefixList(const vector<string>& in, vector<string>& out)
|
197 |
{
|
207 |
{
|
198 |
for (vector<string>::const_iterator qit = in.begin();
|
208 |
for (vector<string>::const_iterator qit = in.begin();
|
199 |
qit != in.end(); qit++) {
|
209 |
qit != in.end(); qit++) {
|
200 |
if (qit->size() && !('A' <= (*qit)[0] && (*qit)[0] <= 'Z'))
|
210 |
if (!has_prefix(*qit))
|
201 |
out.push_back(*qit);
|
211 |
out.push_back(*qit);
|
202 |
}
|
212 |
}
|
203 |
}
|
213 |
}
|
204 |
|
214 |
|
205 |
#undef DEBUGABSTRACT
|
215 |
#undef DEBUGABSTRACT
|
|
... |
|
... |
589 |
int cutoff = 500 * 1000;
|
599 |
int cutoff = 500 * 1000;
|
590 |
|
600 |
|
591 |
for (term = xrdb.termlist_begin(docid);
|
601 |
for (term = xrdb.termlist_begin(docid);
|
592 |
term != xrdb.termlist_end(docid); term++) {
|
602 |
term != xrdb.termlist_end(docid); term++) {
|
593 |
// Ignore prefixed terms
|
603 |
// Ignore prefixed terms
|
594 |
if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
|
604 |
if (has_prefix(*term))
|
595 |
continue;
|
605 |
continue;
|
596 |
if (cutoff-- < 0) {
|
606 |
if (cutoff-- < 0) {
|
597 |
ret = ABSRES_TRUNC;
|
607 |
ret = ABSRES_TRUNC;
|
598 |
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
608 |
LOGDEB0(("makeAbstract: max term count cutoff\n"));
|
599 |
break;
|
609 |
break;
|
|
... |
|
... |
670 |
incjk = newcjk;
|
680 |
incjk = newcjk;
|
671 |
if (it->second == cstr_ellipsis) {
|
681 |
if (it->second == cstr_ellipsis) {
|
672 |
vabs.push_back(pair<int,string>(page, chunk));
|
682 |
vabs.push_back(pair<int,string>(page, chunk));
|
673 |
chunk.clear();
|
683 |
chunk.clear();
|
674 |
} else {
|
684 |
} else {
|
|
|
685 |
if (it->second.compare(end_of_field_term) &&
|
|
|
686 |
it->second.compare(start_of_field_term))
|
675 |
chunk += it->second;
|
687 |
chunk += it->second;
|
676 |
}
|
688 |
}
|
677 |
}
|
689 |
}
|
678 |
if (!chunk.empty())
|
690 |
if (!chunk.empty())
|
679 |
vabs.push_back(pair<int, string>(page, chunk));
|
691 |
vabs.push_back(pair<int, string>(page, chunk));
|
680 |
|
692 |
|
|
... |
|
... |
690 |
: m_ndb(0), m_config(cfp), m_idxAbsTruncLen(250), m_synthAbsLen(250),
|
702 |
: m_ndb(0), m_config(cfp), m_idxAbsTruncLen(250), m_synthAbsLen(250),
|
691 |
m_synthAbsWordCtxLen(4), m_flushMb(-1),
|
703 |
m_synthAbsWordCtxLen(4), m_flushMb(-1),
|
692 |
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
704 |
m_curtxtsz(0), m_flushtxtsz(0), m_occtxtsz(0), m_occFirstCheck(1),
|
693 |
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
705 |
m_maxFsOccupPc(0), m_mode(Db::DbRO)
|
694 |
{
|
706 |
{
|
|
|
707 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
708 |
if (start_of_field_term.empty()) {
|
|
|
709 |
if (o_index_stripchars) {
|
|
|
710 |
start_of_field_term = "XXST";
|
|
|
711 |
end_of_field_term = "XXND";
|
|
|
712 |
} else {
|
|
|
713 |
start_of_field_term = "XXST/";
|
|
|
714 |
end_of_field_term = "XXND/";
|
|
|
715 |
}
|
|
|
716 |
}
|
|
|
717 |
#endif
|
|
|
718 |
|
695 |
m_ndb = new Native(this);
|
719 |
m_ndb = new Native(this);
|
696 |
if (m_config) {
|
720 |
if (m_config) {
|
697 |
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
721 |
m_config->getConfParam("maxfsoccuppc", &m_maxFsOccupPc);
|
698 |
m_config->getConfParam("idxflushmb", &m_flushMb);
|
722 |
m_config->getConfParam("idxflushmb", &m_flushMb);
|
699 |
}
|
723 |
}
|
|
... |
|
... |
892 |
{
|
916 |
{
|
893 |
int res = -1;
|
917 |
int res = -1;
|
894 |
if (!m_ndb || !m_ndb->m_isopen)
|
918 |
if (!m_ndb || !m_ndb->m_isopen)
|
895 |
return -1;
|
919 |
return -1;
|
896 |
|
920 |
|
897 |
string term;
|
921 |
string term = _term;
|
|
|
922 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
923 |
if (o_index_stripchars)
|
|
|
924 |
#endif
|
898 |
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
925 |
if (!unacmaybefold(_term, term, "UTF-8", UNACOP_UNACFOLD)) {
|
899 |
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
926 |
LOGINFO(("Db::termDocCnt: unac failed for [%s]\n", _term.c_str()));
|
900 |
return 0;
|
927 |
return 0;
|
901 |
}
|
928 |
}
|
902 |
|
929 |
|
903 |
if (m_stops.isStop(term)) {
|
930 |
if (m_stops.isStop(term)) {
|
904 |
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
|
931 |
LOGDEB1(("Db::termDocCnt [%s] in stop list\n", term.c_str()));
|
905 |
return 0;
|
932 |
return 0;
|
906 |
}
|
933 |
}
|
|
... |
|
... |
1012 |
: TextSplitP(prc),
|
1039 |
: TextSplitP(prc),
|
1013 |
doc(d), basepos(1), curpos(0), wdfinc(1)
|
1040 |
doc(d), basepos(1), curpos(0), wdfinc(1)
|
1014 |
{}
|
1041 |
{}
|
1015 |
// Reimplement text_to_words to add start and end special terms
|
1042 |
// Reimplement text_to_words to add start and end special terms
|
1016 |
virtual bool text_to_words(const string &in);
|
1043 |
virtual bool text_to_words(const string &in);
|
|
|
1044 |
|
1017 |
void setprefix(const string& pref) {prefix = pref;}
|
1045 |
void setprefix(const string& pref)
|
|
|
1046 |
{
|
|
|
1047 |
if (pref.empty())
|
|
|
1048 |
prefix.clear();
|
|
|
1049 |
else
|
|
|
1050 |
prefix = wrap_prefix(pref);
|
|
|
1051 |
}
|
|
|
1052 |
|
1018 |
void setwdfinc(int i) {wdfinc = i;}
|
1053 |
void setwdfinc(int i)
|
|
|
1054 |
{
|
|
|
1055 |
wdfinc = i;
|
|
|
1056 |
}
|
1019 |
|
1057 |
|
1020 |
friend class TermProcIdx;
|
1058 |
friend class TermProcIdx;
|
1021 |
|
1059 |
|
1022 |
private:
|
1060 |
private:
|
1023 |
// If prefix is set, we also add a posting for the prefixed terms
|
1061 |
// If prefix is set, we also add a posting for the prefixed terms
|
|
... |
|
... |
1145 |
#ifdef TESTING_XAPIAN_SPELL
|
1183 |
#ifdef TESTING_XAPIAN_SPELL
|
1146 |
string Db::getSpellingSuggestion(const string& word)
|
1184 |
string Db::getSpellingSuggestion(const string& word)
|
1147 |
{
|
1185 |
{
|
1148 |
if (m_ndb == 0)
|
1186 |
if (m_ndb == 0)
|
1149 |
return string();
|
1187 |
return string();
|
|
|
1188 |
|
1150 |
string term;
|
1189 |
string term = word;
|
|
|
1190 |
|
|
|
1191 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
1192 |
if (o_index_stripchars)
|
|
|
1193 |
#endif
|
1151 |
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
1194 |
if (!unacmaybefold(word, term, "UTF-8", UNACOP_UNACFOLD)) {
|
1152 |
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
1195 |
LOGINFO(("Db::getSpelling: unac failed for [%s]\n", word.c_str()));
|
1153 |
return string();
|
1196 |
return string();
|
1154 |
}
|
1197 |
}
|
|
|
1198 |
|
1155 |
if (!isSpellingCandidate(term))
|
1199 |
if (!isSpellingCandidate(term))
|
1156 |
return string();
|
1200 |
return string();
|
1157 |
return m_ndb->xrdb.get_spelling_suggestion(term);
|
1201 |
return m_ndb->xrdb.get_spelling_suggestion(term);
|
1158 |
}
|
1202 |
}
|
1159 |
#endif
|
1203 |
#endif
|
|
... |
|
... |
1257 |
|
1301 |
|
1258 |
// The term processing pipeline:
|
1302 |
// The term processing pipeline:
|
1259 |
TermProcIdx tpidx;
|
1303 |
TermProcIdx tpidx;
|
1260 |
TermProc *nxt = &tpidx;
|
1304 |
TermProc *nxt = &tpidx;
|
1261 |
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
1305 |
TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
|
1262 |
// TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
1306 |
//TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
|
|
|
1307 |
|
1263 |
TermProcPrep tpprep(nxt); nxt = &tpprep;
|
1308 |
TermProcPrep tpprep(nxt);
|
|
|
1309 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
1310 |
if (o_index_stripchars)
|
|
|
1311 |
#endif
|
|
|
1312 |
nxt = &tpprep;
|
1264 |
|
1313 |
|
1265 |
TextSplitDb splitter(newdocument, nxt);
|
1314 |
TextSplitDb splitter(newdocument, nxt);
|
1266 |
tpidx.setTSD(&splitter);
|
1315 |
tpidx.setTSD(&splitter);
|
1267 |
|
1316 |
|
1268 |
// If the ipath is like a path, index the last element. This is
|
1317 |
// If the ipath is like a path, index the last element. This is
|
|
... |
|
... |
1284 |
{
|
1333 |
{
|
1285 |
string path = url_gpath(doc.url);
|
1334 |
string path = url_gpath(doc.url);
|
1286 |
vector<string> vpath;
|
1335 |
vector<string> vpath;
|
1287 |
stringToTokens(path, vpath, "/");
|
1336 |
stringToTokens(path, vpath, "/");
|
1288 |
splitter.curpos = 0;
|
1337 |
splitter.curpos = 0;
|
1289 |
newdocument.add_posting(pathelt_prefix,
|
1338 |
newdocument.add_posting(wrap_prefix(pathelt_prefix),
|
1290 |
splitter.basepos + splitter.curpos++);
|
1339 |
splitter.basepos + splitter.curpos++);
|
1291 |
for (vector<string>::iterator it = vpath.begin();
|
1340 |
for (vector<string>::iterator it = vpath.begin();
|
1292 |
it != vpath.end(); it++){
|
1341 |
it != vpath.end(); it++){
|
1293 |
if (it->length() > 230) {
|
1342 |
if (it->length() > 230) {
|
1294 |
// Just truncate it. May still be useful because of wildcards
|
1343 |
// Just truncate it. May still be useful because of wildcards
|
1295 |
*it = it->substr(0, 230);
|
1344 |
*it = it->substr(0, 230);
|
1296 |
}
|
1345 |
}
|
1297 |
newdocument.add_posting(pathelt_prefix + *it,
|
1346 |
newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
|
1298 |
splitter.basepos + splitter.curpos++);
|
1347 |
splitter.basepos + splitter.curpos++);
|
1299 |
}
|
1348 |
}
|
1300 |
}
|
1349 |
}
|
1301 |
|
1350 |
|
1302 |
// Index textual metadata. These are all indexed as text with
|
1351 |
// Index textual metadata. These are all indexed as text with
|
|
... |
|
... |
1337 |
if (!splitter.text_to_words(doc.text))
|
1386 |
if (!splitter.text_to_words(doc.text))
|
1338 |
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
|
1387 |
LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
|
1339 |
|
1388 |
|
1340 |
////// Special terms for other metadata. No positions for these.
|
1389 |
////// Special terms for other metadata. No positions for these.
|
1341 |
// Mime type
|
1390 |
// Mime type
|
1342 |
newdocument.add_term(mimetype_prefix + doc.mimetype);
|
1391 |
newdocument.add_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
|
1343 |
|
1392 |
|
1344 |
// Simple file name indexed unsplit for specific "file name"
|
1393 |
// Simple file name indexed unsplit for specific "file name"
|
1345 |
// searches. This is not the same as a filename: clause inside the
|
1394 |
// searches. This is not the same as a filename: clause inside the
|
1346 |
// query language.
|
1395 |
// query language.
|
1347 |
// We also add a term for the filename extension if any.
|
1396 |
// We also add a term for the filename extension if any.
|
|
... |
|
... |
1353 |
// a pathological case anyway
|
1402 |
// a pathological case anyway
|
1354 |
if (fn.size() > 230)
|
1403 |
if (fn.size() > 230)
|
1355 |
utf8truncate(fn, 230);
|
1404 |
utf8truncate(fn, 230);
|
1356 |
string::size_type pos = fn.rfind('.');
|
1405 |
string::size_type pos = fn.rfind('.');
|
1357 |
if (pos != string::npos && pos != fn.length() - 1) {
|
1406 |
if (pos != string::npos && pos != fn.length() - 1) {
|
1358 |
newdocument.add_term(fileext_prefix + fn.substr(pos + 1));
|
1407 |
newdocument.add_term(wrap_prefix(fileext_prefix) +
|
|
|
1408 |
fn.substr(pos + 1));
|
1359 |
}
|
1409 |
}
|
1360 |
newdocument.add_term(unsplitfilename_prefix + fn);
|
1410 |
newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn);
|
1361 |
}
|
1411 |
}
|
1362 |
}
|
1412 |
}
|
1363 |
|
1413 |
|
1364 |
// Udi unique term: this is used for file existence/uptodate
|
1414 |
// Udi unique term: this is used for file existence/uptodate
|
1365 |
// checks, and unique id for the replace_document() call.
|
1415 |
// checks, and unique id for the replace_document() call.
|
|
... |
|
... |
1374 |
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
1424 |
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
1375 |
doc.dmtime.c_str());
|
1425 |
doc.dmtime.c_str());
|
1376 |
struct tm *tm = localtime(&mtime);
|
1426 |
struct tm *tm = localtime(&mtime);
|
1377 |
char buf[9];
|
1427 |
char buf[9];
|
1378 |
snprintf(buf, 9, "%04d%02d%02d",
|
1428 |
snprintf(buf, 9, "%04d%02d%02d",
|
1379 |
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
1429 |
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
|
|
1430 |
// Date (YYYYMMDD)
|
1380 |
newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
|
1431 |
newdocument.add_term(wrap_prefix(xapday_prefix) + string(buf));
|
|
|
1432 |
// Month (YYYYMM)
|
1381 |
buf[6] = '\0';
|
1433 |
buf[6] = '\0';
|
1382 |
newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
|
1434 |
newdocument.add_term(wrap_prefix(xapmonth_prefix) + string(buf));
|
|
|
1435 |
// Year (YYYY)
|
1383 |
buf[4] = '\0';
|
1436 |
buf[4] = '\0';
|
1384 |
newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
|
1437 |
newdocument.add_term(wrap_prefix(xapyear_prefix) + string(buf));
|
1385 |
|
1438 |
|
1386 |
|
1439 |
|
1387 |
//////////////////////////////////////////////////////////////////
|
1440 |
//////////////////////////////////////////////////////////////////
|
1388 |
// Document data record. omindex has the following nl separated fields:
|
1441 |
// Document data record. omindex has the following nl separated fields:
|
1389 |
// - url
|
1442 |
// - url
|
|
... |
|
... |
1854 |
bool Db::maxYearSpan(int *minyear, int *maxyear)
|
1907 |
bool Db::maxYearSpan(int *minyear, int *maxyear)
|
1855 |
{
|
1908 |
{
|
1856 |
*minyear = 1000000;
|
1909 |
*minyear = 1000000;
|
1857 |
*maxyear = -1000000;
|
1910 |
*maxyear = -1000000;
|
1858 |
TermMatchResult result;
|
1911 |
TermMatchResult result;
|
1859 |
if (!termMatch(ET_WILD, string(), "*", result, 5000, "xapyear"))
|
1912 |
if (!termMatch(ET_WILD, string(), "*", result, -1, "xapyear"))
|
1860 |
return false;
|
1913 |
return false;
|
1861 |
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
1914 |
for (vector<TermMatchEntry>::const_iterator it = result.entries.begin();
|
1862 |
it != result.entries.end(); it++) {
|
1915 |
it != result.entries.end(); it++) {
|
1863 |
if (!it->term.empty()) {
|
1916 |
if (!it->term.empty()) {
|
1864 |
int year = atoi(it->term.c_str()+1);
|
1917 |
int year = atoi(it->term.c_str()+1);
|
|
... |
|
... |
1919 |
// the input string prior to these chars.
|
1972 |
// the input string prior to these chars.
|
1920 |
const string cstr_wildSpecChars = "*?[";
|
1973 |
const string cstr_wildSpecChars = "*?[";
|
1921 |
const string cstr_regSpecChars = "(.[{";
|
1974 |
const string cstr_regSpecChars = "(.[{";
|
1922 |
|
1975 |
|
1923 |
// Find all index terms that match a wildcard or regular expression
|
1976 |
// Find all index terms that match a wildcard or regular expression
|
|
|
1977 |
// If field is set, we return a list of appropriately prefixed terms (which
|
|
|
1978 |
// are going to be used to build a Xapian query).
|
1924 |
bool Db::termMatch(MatchType typ, const string &lang,
|
1979 |
bool Db::termMatch(MatchType typ, const string &lang,
|
1925 |
const string &root,
|
1980 |
const string &root,
|
1926 |
TermMatchResult& res,
|
1981 |
TermMatchResult& res,
|
1927 |
int max,
|
1982 |
int max,
|
1928 |
const string& field,
|
1983 |
const string& field)
|
1929 |
string *prefixp
|
|
|
1930 |
)
|
|
|
1931 |
{
|
1984 |
{
|
1932 |
if (!m_ndb || !m_ndb->m_isopen)
|
1985 |
if (!m_ndb || !m_ndb->m_isopen)
|
1933 |
return false;
|
1986 |
return false;
|
1934 |
Xapian::Database xdb = m_ndb->xdb();
|
1987 |
Xapian::Database xdb = m_ndb->xdb();
|
1935 |
|
1988 |
|
1936 |
res.clear();
|
|
|
1937 |
XAPTRY(res.dbdoccount = xdb.get_doccount();
|
1989 |
XAPTRY(res.dbdoccount = xdb.get_doccount();
|
1938 |
res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
|
1990 |
res.dbavgdoclen = xdb.get_avlength(), xdb, m_reason);
|
1939 |
if (!m_reason.empty())
|
1991 |
if (!m_reason.empty())
|
1940 |
return false;
|
1992 |
return false;
|
1941 |
|
1993 |
|
1942 |
// Get rid of capitals and accents
|
1994 |
// Get rid of capitals and accents
|
|
|
1995 |
|
1943 |
string droot;
|
1996 |
string droot = root;
|
|
|
1997 |
|
|
|
1998 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
1999 |
if (o_index_stripchars)
|
|
|
2000 |
#endif
|
1944 |
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
2001 |
if (!unacmaybefold(root, droot, "UTF-8", UNACOP_UNACFOLD)) {
|
1945 |
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
2002 |
LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
|
1946 |
return false;
|
2003 |
return false;
|
1947 |
}
|
2004 |
}
|
|
|
2005 |
|
1948 |
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
|
2006 |
string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
|
1949 |
|
2007 |
|
1950 |
string prefix;
|
2008 |
string prefix;
|
1951 |
if (!field.empty()) {
|
2009 |
if (!field.empty()) {
|
1952 |
const FieldTraits *ftp = 0;
|
2010 |
const FieldTraits *ftp = 0;
|
1953 |
if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
|
2011 |
if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
|
1954 |
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
2012 |
LOGDEB(("Db::termMatch: field is not indexed (no prefix): [%s]\n",
|
1955 |
field.c_str()));
|
2013 |
field.c_str()));
|
1956 |
} else {
|
2014 |
} else {
|
1957 |
prefix = ftp->pfx;
|
2015 |
prefix = wrap_prefix(ftp->pfx);
|
1958 |
}
|
2016 |
}
|
1959 |
if (prefixp)
|
2017 |
}
|
1960 |
*prefixp = prefix;
|
2018 |
res.prefix = prefix;
|
1961 |
}
|
|
|
1962 |
|
2019 |
|
1963 |
if (typ == ET_STEM) {
|
2020 |
if (typ == ET_STEM) {
|
1964 |
if (!stemExpand(lang, root, res, max))
|
2021 |
if (!stemExpand(lang, root, res, max))
|
1965 |
return false;
|
2022 |
return false;
|
1966 |
sort(res.entries.begin(), res.entries.end());
|
|
|
1967 |
unique(res.entries.begin(), res.entries.end());
|
|
|
1968 |
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
2023 |
for (vector<TermMatchEntry>::iterator it = res.entries.begin();
|
1969 |
it != res.entries.end(); it++) {
|
2024 |
it != res.entries.end(); it++) {
|
1970 |
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
2025 |
XAPTRY(it->wcf = xdb.get_collection_freq(it->term);
|
1971 |
it->docs = xdb.get_termfreq(it->term),
|
2026 |
it->docs = xdb.get_termfreq(it->term),
|
1972 |
xdb, m_reason);
|
2027 |
xdb, m_reason);
|
|
... |
|
... |
2052 |
}
|
2107 |
}
|
2053 |
|
2108 |
|
2054 |
TermMatchCmpByTerm tcmp;
|
2109 |
TermMatchCmpByTerm tcmp;
|
2055 |
sort(res.entries.begin(), res.entries.end(), tcmp);
|
2110 |
sort(res.entries.begin(), res.entries.end(), tcmp);
|
2056 |
TermMatchTermEqual teq;
|
2111 |
TermMatchTermEqual teq;
|
|
|
2112 |
vector<TermMatchEntry>::iterator uit =
|
2057 |
unique(res.entries.begin(), res.entries.end(), teq);
|
2113 |
unique(res.entries.begin(), res.entries.end(), teq);
|
|
|
2114 |
res.entries.resize(uit - res.entries.begin());
|
2058 |
TermMatchCmpByWcf wcmp;
|
2115 |
TermMatchCmpByWcf wcmp;
|
2059 |
sort(res.entries.begin(), res.entries.end(), wcmp);
|
2116 |
sort(res.entries.begin(), res.entries.end(), wcmp);
|
2060 |
if (max > 0) {
|
2117 |
if (max > 0) {
|
2061 |
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
2118 |
res.entries.resize(MIN(res.entries.size(), (unsigned int)max));
|
2062 |
}
|
2119 |
}
|