|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
|
... |
|
... |
20 |
#include <fnmatch.h>
|
20 |
#include <fnmatch.h>
|
21 |
#include <regex.h>
|
21 |
#include <regex.h>
|
22 |
#include <math.h>
|
22 |
#include <math.h>
|
23 |
#include <time.h>
|
23 |
#include <time.h>
|
24 |
|
24 |
|
25 |
#include <iostream>
|
|
|
26 |
#include <string>
|
25 |
#include <string>
|
27 |
#include <vector>
|
26 |
#include <vector>
|
28 |
#include <algorithm>
|
27 |
#include <algorithm>
|
|
|
28 |
#include <sstream>
|
29 |
|
29 |
|
30 |
#ifndef NO_NAMESPACES
|
30 |
#ifndef NO_NAMESPACES
|
31 |
using namespace std;
|
31 |
using namespace std;
|
32 |
#endif /* NO_NAMESPACES */
|
32 |
#endif /* NO_NAMESPACES */
|
33 |
|
33 |
|
|
... |
|
... |
68 |
|
68 |
|
69 |
// This is the word position offset at which we index the body text
|
69 |
// This is the word position offset at which we index the body text
|
70 |
// (abstract, keywords, etc.. are stored before this)
|
70 |
// (abstract, keywords, etc.. are stored before this)
|
71 |
static const unsigned int baseTextPosition = 100000;
|
71 |
static const unsigned int baseTextPosition = 100000;
|
72 |
|
72 |
|
|
|
73 |
static const string cstr_mbreaks("rclmbreaks");
|
|
|
74 |
|
73 |
#ifndef NO_NAMESPACES
|
75 |
#ifndef NO_NAMESPACES
|
74 |
namespace Rcl {
|
76 |
namespace Rcl {
|
75 |
#endif
|
77 |
#endif
|
76 |
|
78 |
|
77 |
// Some prefixes that we could get from the fields file, but are not going
|
79 |
// Some prefixes that we could get from the fields file, but are not going
|
|
... |
|
... |
300 |
}
|
302 |
}
|
301 |
|
303 |
|
302 |
// Return the positions list for the page break term
|
304 |
// Return the positions list for the page break term
|
303 |
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
|
305 |
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
|
304 |
{
|
306 |
{
|
|
|
307 |
// Need to retrieve the document record to check for multiple page breaks
|
|
|
308 |
// that we store there for lack of better place
|
|
|
309 |
map<int, int> mbreaksmap;
|
|
|
310 |
try {
|
|
|
311 |
Xapian::Document xdoc = xrdb.get_document(docid);
|
|
|
312 |
string data = xdoc.get_data();
|
|
|
313 |
Doc doc;
|
|
|
314 |
string mbreaks;
|
|
|
315 |
if (dbDataToRclDoc(docid, data, doc) &&
|
|
|
316 |
doc.getmeta(cstr_mbreaks, &mbreaks)) {
|
|
|
317 |
vector<string> values;
|
|
|
318 |
stringToTokens(mbreaks, values, ",");
|
|
|
319 |
// The stored list is a flat sequence of (position, count) pairs. Walk it
// two tokens at a time and stop as soon as a full pair is no longer
// available, so that every pair is seen and a dangling trailing token is
// ignored instead of being read past the end of the vector.
for (vector<string>::size_type i = 0; i + 1 < values.size(); i += 2) {
    // Positions are stored relative to the start of the body text.
    int pos = atoi(values[i].c_str()) + baseTextPosition;
    int incr = atoi(values[i+1].c_str());
    mbreaksmap[pos] = incr;
}
|
|
|
324 |
}
|
|
|
325 |
} catch (...) {
|
|
|
326 |
}
|
|
|
327 |
|
305 |
string qterm = page_break_term;
|
328 |
string qterm = page_break_term;
|
306 |
Xapian::PositionIterator pos;
|
329 |
Xapian::PositionIterator pos;
|
307 |
try {
|
330 |
try {
|
308 |
for (pos = xrdb.positionlist_begin(docid, qterm);
|
331 |
for (pos = xrdb.positionlist_begin(docid, qterm);
|
309 |
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
332 |
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
310 |
int ipos = *pos;
|
333 |
int ipos = *pos;
|
311 |
if (ipos < int(baseTextPosition)) {
|
334 |
if (ipos < int(baseTextPosition)) {
|
312 |
// Not in text body. Strange...
|
335 |
// Not in text body. Strange...
|
313 |
continue;
|
336 |
continue;
|
|
|
337 |
}
|
|
|
338 |
map<int, int>::iterator it = mbreaksmap.find(ipos);
|
|
|
339 |
if (it != mbreaksmap.end()) {
|
|
|
340 |
LOGDEB1(("getPagePositions: found multibreak at %d incr %d\n",
|
|
|
341 |
ipos, it->second));
|
|
|
342 |
for (int i = 0 ; i < it->second; i++)
|
|
|
343 |
vpos.push_back(ipos);
|
314 |
}
|
344 |
}
|
315 |
vpos.push_back(ipos);
|
345 |
vpos.push_back(ipos);
|
316 |
}
|
346 |
}
|
317 |
} catch (...) {
|
347 |
} catch (...) {
|
318 |
// Term does not occur. No problem.
|
348 |
// Term does not occur. No problem.
|
|
... |
|
... |
355 |
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
385 |
pos != xrdb.positionlist_end(docid, qterm); pos++) {
|
356 |
int ipos = *pos;
|
386 |
int ipos = *pos;
|
357 |
if (ipos < int(baseTextPosition)) // Not in text body
|
387 |
if (ipos < int(baseTextPosition)) // Not in text body
|
358 |
continue;
|
388 |
continue;
|
359 |
// What page ?
|
389 |
// What page ?
|
360 |
LOGABS(("getFirstPageMatch: looking for match for [%s]\n",
|
390 |
LOGABS(("getFirstPageMatch: search match for [%s] pos %d\n",
|
361 |
qterm.c_str()));
|
391 |
qterm.c_str(), ipos));
|
362 |
vector<int>::const_iterator it =
|
392 |
vector<int>::const_iterator it =
|
363 |
lower_bound(pagepos.begin(), pagepos.end(), ipos);
|
393 |
upper_bound(pagepos.begin(), pagepos.end(), ipos);
|
364 |
if (it != pagepos.end())
|
|
|
365 |
return it - pagepos.begin() + 1;
|
394 |
return it - pagepos.begin() + 1;
|
366 |
}
|
395 |
}
|
367 |
} catch (...) {
|
396 |
} catch (...) {
|
368 |
// Term does not occur. No problem.
|
397 |
// Term does not occur. No problem.
|
369 |
}
|
398 |
}
|
370 |
}
|
399 |
}
|
|
... |
|
... |
1000 |
return true;
|
1029 |
return true;
|
1001 |
}
|
1030 |
}
|
1002 |
|
1031 |
|
1003 |
class TermProcIdx : public TermProc {
|
1032 |
class TermProcIdx : public TermProc {
|
1004 |
public:
|
1033 |
public:
|
1005 |
TermProcIdx() : TermProc(0), m_ts(0) {}
|
1034 |
TermProcIdx() : TermProc(0), m_ts(0), m_lastpagepos(0), m_pageincr(0) {}
|
1006 |
void setTSD(TextSplitDb *ts) {m_ts = ts;}
|
1035 |
void setTSD(TextSplitDb *ts) {m_ts = ts;}
|
1007 |
|
1036 |
|
1008 |
bool takeword(const std::string &term, int pos, int, int)
|
1037 |
bool takeword(const std::string &term, int pos, int, int)
|
1009 |
{
|
1038 |
{
|
1010 |
// Compute absolute position (pos is relative to current segment),
|
1039 |
// Compute absolute position (pos is relative to current segment),
|
|
... |
|
... |
1031 |
return false;
|
1060 |
return false;
|
1032 |
}
|
1061 |
}
|
1033 |
void newpage(int pos)
|
1062 |
void newpage(int pos)
|
1034 |
{
|
1063 |
{
|
1035 |
pos += m_ts->basepos;
|
1064 |
pos += m_ts->basepos;
|
|
|
1065 |
LOGDEB2(("newpage: %d\n", pos));
|
|
|
1066 |
if (pos < int(baseTextPosition))
|
|
|
1067 |
return;
|
|
|
1068 |
|
1036 |
m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
|
1069 |
m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
|
|
|
1070 |
if (pos == m_lastpagepos) {
|
|
|
1071 |
m_pageincr++;
|
|
|
1072 |
LOGDEB2(("newpage: same pos, pageincr %d lastpagepos %d\n",
|
|
|
1073 |
m_pageincr, m_lastpagepos));
|
|
|
1074 |
} else {
|
|
|
1075 |
LOGDEB2(("newpage: pos change, pageincr %d lastpagepos %d\n",
|
|
|
1076 |
m_pageincr, m_lastpagepos));
|
|
|
1077 |
if (m_pageincr > 0) {
|
|
|
1078 |
// Remember the multiple page break at this position
|
|
|
1079 |
m_pageincrvec.push_back(
|
|
|
1080 |
pair<int, int>(m_lastpagepos - baseTextPosition,
|
|
|
1081 |
m_pageincr));
|
1037 |
}
|
1082 |
}
|
1038 |
private:
|
1083 |
m_pageincr = 0;
|
|
|
1084 |
}
|
|
|
1085 |
m_lastpagepos = pos;
|
|
|
1086 |
}
|
|
|
1087 |
|
|
|
1088 |
virtual bool flush()
|
|
|
1089 |
{
|
|
|
1090 |
if (m_pageincr > 0) {
|
|
|
1091 |
m_pageincrvec.push_back(
|
|
|
1092 |
pair<int, int>(m_lastpagepos - baseTextPosition,
|
|
|
1093 |
m_pageincr));
|
|
|
1094 |
m_pageincr = 0;
|
|
|
1095 |
}
|
|
|
1096 |
return TermProc::flush();
|
|
|
1097 |
}
|
|
|
1098 |
|
1039 |
TextSplitDb *m_ts;
|
1099 |
TextSplitDb *m_ts;
|
|
|
1100 |
// Auxiliary page breaks data for positions with multiple page breaks.
|
|
|
1101 |
int m_lastpagepos;
|
|
|
1102 |
// Count of extra page breaks at the same pos. Normally 0; 1 or more when several
|
|
|
1103 |
// breaks at the same pos
|
|
|
1104 |
int m_pageincr;
|
|
|
1105 |
vector <pair<int, int> > m_pageincrvec;
|
1040 |
};
|
1106 |
};
|
1041 |
|
1107 |
|
1042 |
|
1108 |
|
1043 |
#ifdef TESTING_XAPIAN_SPELL
|
1109 |
#ifdef TESTING_XAPIAN_SPELL
|
1044 |
string Db::getSpellingSuggestion(const string& word)
|
1110 |
string Db::getSpellingSuggestion(const string& word)
|
|
... |
|
... |
1272 |
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
1338 |
time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() :
|
1273 |
doc.dmtime.c_str());
|
1339 |
doc.dmtime.c_str());
|
1274 |
struct tm *tm = localtime(&mtime);
|
1340 |
struct tm *tm = localtime(&mtime);
|
1275 |
char buf[9];
|
1341 |
char buf[9];
|
1276 |
snprintf(buf, 9, "%04d%02d%02d",
|
1342 |
snprintf(buf, 9, "%04d%02d%02d",
|
1277 |
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
1343 |
tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
|
1278 |
newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
|
1344 |
newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
|
1279 |
buf[6] = '\0';
|
1345 |
buf[6] = '\0';
|
1280 |
newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
|
1346 |
newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
|
1281 |
buf[4] = '\0';
|
1347 |
buf[4] = '\0';
|
1282 |
newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
|
1348 |
newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
|
|
... |
|
... |
1373 |
neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
|
1439 |
neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
|
1374 |
RECORD_APPEND(record, nm, value);
|
1440 |
RECORD_APPEND(record, nm, value);
|
1375 |
}
|
1441 |
}
|
1376 |
}
|
1442 |
}
|
1377 |
|
1443 |
|
|
|
1444 |
// If empty pages (multiple break at same pos) were recorded, save
|
|
|
1445 |
// them (this is because we have no way to record them in the
|
|
|
1446 |
// Xapian list
|
|
|
1447 |
if (!tpidx.m_pageincrvec.empty()) {
|
|
|
1448 |
ostringstream multibreaks;
|
|
|
1449 |
for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
|
|
|
1450 |
multibreaks << tpidx.m_pageincrvec[i].first << "," <<
|
|
|
1451 |
tpidx.m_pageincrvec[i].second;
|
|
|
1452 |
}
|
|
|
1453 |
RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
|
|
|
1454 |
}
|
|
|
1455 |
|
1378 |
// If the file's md5 was computed, add value. This is optionally
|
1456 |
// If the file's md5 was computed, add value. This is optionally
|
1379 |
// used for query result duplicate elimination.
|
1457 |
// used for query result duplicate elimination.
|
1380 |
string& md5 = doc.meta[Doc::keymd5];
|
1458 |
string& md5 = doc.meta[Doc::keymd5];
|
1381 |
if (!md5.empty()) {
|
1459 |
if (!md5.empty()) {
|
1382 |
string digest;
|
1460 |
string digest;
|