--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
...
         }
     }
     return false;
 }
 
+// Clear term from document if its frequency is 0. This should
+// probably be done by Xapian when the freq goes to 0 when removing a
+// posting, but we have to do it ourselves
+bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
+{
+    LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));
+
+    // Find the term
+    Xapian::TermIterator xit;
+    XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
+           xrdb, m_rcldb->m_reason);
+    if (!m_rcldb->m_reason.empty()) {
+        LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n",
+                term.c_str(), m_rcldb->m_reason.c_str()));
+        return false;
+    }
+    if (xit == xdoc.termlist_end() || term.compare(*xit)) {
+        LOGDEB0(("Db::clearDocTermIfWdf0: term [%s] not found. xit: [%s]\n",
+                 term.c_str(), xit == xdoc.termlist_end() ? "EOL" : (*xit).c_str()));
+        return false;
+    }
+
+    // Clear the term if its frequency is 0
+    if (xit.get_wdf() == 0) {
+        LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
+        XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
+        if (!m_rcldb->m_reason.empty()) {
+            LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n",
+                     term.c_str(), m_rcldb->m_reason.c_str()));
+        }
+    }
+    return true;
+}
+
+// Holder for term + pos
+struct DocPosting {
+    DocPosting(string t, Xapian::termpos ps)
+        : term(t), pos(ps) {}
+    string term;
+    Xapian::termpos pos;
+};
+
+// Clear all terms for given field for given document.
+// The terms to be cleared are all those with the appropriate
+// prefix. We also remove the postings for the unprefixed terms (that
+// is, we undo what we did when indexing).
+bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
+                            Xapian::termcount wdfdec)
+{
+    LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
+             pfx.c_str(), unsigned(xdoc.get_docid())));
+
+    vector<DocPosting> eraselist;
+
+    string wrapd = wrap_prefix(pfx);
+
+    m_rcldb->m_reason.clear();
+    for (int tries = 0; tries < 2; tries++) {
+        try {
+            Xapian::TermIterator xit;
+            xit = xdoc.termlist_begin();
+            xit.skip_to(wrapd);
+            while (xit != xdoc.termlist_end() &&
+                   !(*xit).compare(0, wrapd.size(), wrapd)) {
+                LOGDEB1(("Db::clearField: erasing for [%s]\n", (*xit).c_str()));
+                Xapian::PositionIterator posit;
+                for (posit = xit.positionlist_begin();
+                     posit != xit.positionlist_end(); posit++) {
+                    eraselist.push_back(DocPosting(*xit, *posit));
+                    eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
+                }
+                xit++;
+            }
+        } catch (const Xapian::DatabaseModifiedError &e) {
+            m_rcldb->m_reason = e.get_msg();
+            xrdb.reopen();
+            continue;
+        } XCATCHERROR(m_rcldb->m_reason);
+        break;
+    }
+    if (!m_rcldb->m_reason.empty()) {
+        LOGERR(("Db::clearField: failed building erase list: %s\n",
+                m_rcldb->m_reason.c_str()));
+        return false;
+    }
+
+    // Now remove the found positions, and the terms if the wdf is 0
+    for (vector<DocPosting>::const_iterator it = eraselist.begin();
+         it != eraselist.end(); it++) {
+        LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n",
+                 it->term.c_str(), int(it->pos)));
+        XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);,
+               xwdb, m_rcldb->m_reason);
+        if (!m_rcldb->m_reason.empty()) {
+            // Note that this normally fails for non-prefixed XXST and
+            // ND, don't make a fuss
+            LOGDEB1(("Db::clearField: remove_posting failed for [%s],%d: %s\n",
+                     it->term.c_str(), int(it->pos), m_rcldb->m_reason.c_str()));
+        }
+        clearDocTermIfWdf0(xdoc, it->term);
+    }
+    return true;
+}
+
 // Check if doc given by udi is indexed by term
 bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
 {
     LOGDEB2(("Native::hasTerm: udi [%s] term [%s]\n", udi.c_str(), term.c_str()));
     Xapian::Document xdoc;
...
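Both new helpers in the hunk above funnel every Xapian access through the XAPTRY macro (or an explicit retry loop in clearField()), leaving any error text in m_rcldb->m_reason for the caller to test instead of throwing. For readers without the macro in view, here is a minimal sketch of that pattern, assuming XAPTRY's usual semantics of reopening and retrying once on Xapian::DatabaseModifiedError; the xapianTry helper below is illustrative, not Recoll code:

```cpp
#include <string>
#include <xapian.h>

// Hypothetical helper (not Recoll's actual macro): run a Xapian operation,
// and on DatabaseModifiedError reopen the database once and retry. Any
// error message is left in 'reason', which callers test for emptiness.
// Usage sketch: xapianTry([&]{ data = xdoc.get_data(); }, xrdb, reason);
template <class Op>
static bool xapianTry(Op op, Xapian::Database& db, std::string& reason)
{
    for (int tries = 0; tries < 2; tries++) {
        try {
            reason.clear();
            op();
            return true;
        } catch (const Xapian::DatabaseModifiedError& e) {
            // The index changed under us: reopen and retry once.
            reason = e.get_msg();
            db.reopen();
            continue;
        } catch (const Xapian::Error& e) {
            reason = e.get_msg();
            break;
        }
    }
    return false;
}
```

This is why the code above checks m_reason.empty() after each call rather than relying on a return code.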
 bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm,
                                   Xapian::Document& newdocument, size_t textlen)
 {
 #ifdef IDX_THREADS
     Chrono chron;
-    // In the case where there is a separate (single) db update
-    // thread, we only need to protect the update map update below
-    // (against interaction with threads calling needUpdate()). Else,
-    // all threads from above need to synchronize here
-    PTMutexLocker lock(m_mutex, m_havewriteq);
+    PTMutexLocker lock(m_mutex);
 #endif
 
     // Check file system full every mbyte of indexed text. It's a bit wasteful
     // to do this after having prepared the document, but it needs to be in
     // the single-threaded section.
...
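The hunk above drops the conditional second argument to PTMutexLocker, so the mutex is now taken unconditionally on entry to addOrUpdateWrite(), and the next hunk removes the narrower inner lock that covered only the updated map. Assuming the second constructor argument turned the lock into a no-op when true — an assumption to verify against ptmutex.h — the whole change amounts to removing the condition from a RAII wrapper like this sketch (hypothetical class, not Recoll's):

```cpp
#include <pthread.h>

// Illustrative scoped pthread lock whose second constructor argument can
// make it a no-op. The patch drops that condition, so the writer path
// always serializes here whether or not a separate write queue exists.
class ScopedLock {
public:
    ScopedLock(pthread_mutex_t& m, bool noop = false)
        : m_mutex(m), m_noop(noop) {
        if (!m_noop)
            pthread_mutex_lock(&m_mutex);
    }
    ~ScopedLock() {
        if (!m_noop)
            pthread_mutex_unlock(&m_mutex);
    }
private:
    pthread_mutex_t& m_mutex;
    bool m_noop;
};
```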
 
     // Add db entry or update existing entry:
     try {
         Xapian::docid did =
             xwdb.replace_document(uniterm, newdocument);
-#ifdef IDX_THREADS
-        // Need to protect against interaction with the up-to-date checks
-        // which also update the existence map
-        PTMutexLocker lock(m_mutex, !m_havewriteq);
-#endif
         if (did < m_rcldb->updated.size()) {
             m_rcldb->updated[did] = true;
             LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
         } else {
             LOGINFO(("Db::add: docid %d added [%s]\n", did, fnc));
...
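The update path above hinges on Xapian::WritableDatabase::replace_document(): given a unique term, it replaces any document indexed by that term, or adds the document if none is, returning the docid either way. That is what lets the same code handle both the "added" and "updated" branches. A self-contained illustration of the idiom follows; the database path and the Q-prefixed term value are arbitrary examples, not values from this codebase:

```cpp
#include <iostream>
#include <string>
#include <xapian.h>

// Minimal demo of update-by-unique-term: the first run adds the document,
// subsequent runs with the same unique term replace it in place.
int main()
{
    Xapian::WritableDatabase db("/tmp/xapiandemo", Xapian::DB_CREATE_OR_OPEN);

    Xapian::Document doc;
    doc.set_data("hello");
    // Mirrors the uniterm convention: a boolean (wdf 0) term built from a
    // prefix plus a unique document identifier.
    std::string uniterm("Qexample-udi");
    doc.add_boolean_term(uniterm);

    Xapian::docid did = db.replace_document(uniterm, doc);
    std::cout << "docid: " << did << std::endl;
    return 0;
}
```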
 
     *ftpp = 0;
     return false;
 }
 
-
 // The splitter breaks text into words and adds postings to the Xapian
 // document. We use a single object to split all of the document
 // fields and position jumps to separate fields
 class TextSplitDb : public TextSplitP {
 public:
...
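The large hunk below moves the main indexing sequence of Db::addOrUpdate() into the else branch of a new if (doc.onlyxattr) test. One detail worth calling out while reading it is the TEXTSPLIT_STATS guard, which rejects documents whose token-length statistics look like undecoded base64: more than 200 tokens, average token length above 10, and a standard deviation close to the average (sigma/avglen > 0.8, i.e. a high coefficient of variation). A standalone sketch of that computation, not Recoll code:

```cpp
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

// Editorial sketch of the TEXTSPLIT_STATS heuristic: undecoded base64
// splits into long "words" of widely varying length, so a high mean
// token length combined with a high relative spread flags bad data.
int main()
{
    std::vector<std::string> words = {/* tokens from the splitter */};
    double n = 0, sum = 0, sumsq = 0;
    for (const std::string& w : words) {
        n += 1;
        sum += w.size();
        sumsq += double(w.size()) * w.size();
    }
    if (n == 0)
        return 0;
    double avglen = sum / n;
    double sigma = std::sqrt(sumsq / n - avglen * avglen);
    bool reject = n > 200 && avglen > 10 && sigma / avglen > 0.8;
    std::printf("count %.0f avglen %.4f sigma %.4f reject %d\n",
                n, avglen, sigma, int(reject));
    return 0;
}
```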
              udi.c_str(), parent_udi.c_str()));
     if (m_ndb == 0)
         return false;
 
     Xapian::Document newdocument;
 
     // The term processing pipeline:
     TermProcIdx tpidx;
     TermProc *nxt = &tpidx;
     TermProcStop tpstop(nxt, m_stops); nxt = &tpstop;
     //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
...
     nxt = &tpprep;
 
     TextSplitDb splitter(newdocument, nxt);
     tpidx.setTSD(&splitter);
 
-    // If the ipath is like a path, index the last element. This is
-    // for compound documents like zip and chm for which the filter
-    // uses the file path as ipath.
-    if (!doc.ipath.empty() &&
-        doc.ipath.find_first_not_of("0123456789") != string::npos) {
-        string utf8ipathlast;
-        // There is no way in hell we could have an idea of the
-        // charset here, so let's hope it's ascii or utf-8. We call
-        // transcode to strip the bad chars and pray
-        if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
-                      "UTF-8", "UTF-8")) {
-            splitter.text_to_words(utf8ipathlast);
-        }
-    }
-
-    // Split and index the path from the url for path-based filtering
-    {
-        string path = url_gpath(doc.url);
-        vector<string> vpath;
-        stringToTokens(path, vpath, "/");
-        // If vpath is not /, the last elt is the file/dir name, not a
-        // part of the path.
-        if (vpath.size())
-            vpath.resize(vpath.size()-1);
-        splitter.curpos = 0;
-        newdocument.add_posting(wrap_prefix(pathelt_prefix),
-                                splitter.basepos + splitter.curpos++);
-        for (vector<string>::iterator it = vpath.begin();
-             it != vpath.end(); it++) {
-            if (it->length() > 230) {
-                // Just truncate it. May still be useful because of wildcards
-                *it = it->substr(0, 230);
-            }
-            newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
-                                    splitter.basepos + splitter.curpos++);
-        }
-    }
-
-    // Index textual metadata. These are all indexed as text with
-    // positions, as we may want to do phrase searches with them (this
-    // makes no sense for keywords by the way).
-    //
-    // The order has no importance, and we set a position gap of 100
-    // between fields to avoid false proximity matches.
-    map<string, string>::iterator meta_it;
-    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
-        if (!meta_it->second.empty()) {
-            const FieldTraits *ftp;
-            // We don't test for an empty prefix here. Some fields are part
-            // of the internal conf with an empty prefix (ie: abstract).
-            if (!fieldToTraits(meta_it->first, &ftp)) {
-                LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
-                         meta_it->first.c_str()));
-                continue;
-            }
-            LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
-                     meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
-                     meta_it->second.c_str()));
-            splitter.setprefix(ftp->pfx);
-            splitter.setwdfinc(ftp->wdfinc);
-            if (!splitter.text_to_words(meta_it->second))
-                LOGDEB(("Db::addOrUpdate: split failed for %s\n",
-                        meta_it->first.c_str()));
-        }
-    }
-    splitter.setprefix(string());
-    splitter.setwdfinc(1);
-
-    if (splitter.curpos < baseTextPosition)
-        splitter.basepos = baseTextPosition;
-
-    // Split and index body text
-    LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
-
-#ifdef TEXTSPLIT_STATS
-    splitter.resetStats();
-#endif
-    if (!splitter.text_to_words(doc.text))
-        LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
-
-#ifdef TEXTSPLIT_STATS
-    // Reject bad data. unrecognized base64 text is characterized by
-    // high avg word length and high variation (because there are
-    // word-splitters like +/ inside the data).
-    TextSplit::Stats::Values v = splitter.getStats();
-    // v.avglen > 15 && v.sigma > 12
-    if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
-        LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
-                 "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
-                 v.count, v.avglen, v.sigma, doc.url.c_str(),
-                 doc.ipath.c_str(), doc.text.c_str()));
-        return true;
-    }
-#endif
-
-    ////// Special terms for other metadata. No positions for these.
-    // Mime type
-    newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
-
-    // Simple file name indexed unsplit for specific "file name"
-    // searches. This is not the same as a filename: clause inside the
-    // query language.
-    // We also add a term for the filename extension if any.
-    string utf8fn;
-    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
-        string fn;
-        if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
-            // We should truncate after extracting the extension, but this is
-            // a pathological case anyway
-            if (fn.size() > 230)
-                utf8truncate(fn, 230);
-            string::size_type pos = fn.rfind('.');
-            if (pos != string::npos && pos != fn.length() - 1) {
-                newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
-                                             fn.substr(pos + 1));
-            }
-            newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
-        }
-    }
-
     // Udi unique term: this is used for file existence/uptodate
     // checks, and unique id for the replace_document() call.
     string uniterm = make_uniterm(udi);
-    newdocument.add_boolean_term(uniterm);
-    // Parent term. This is used to find all descendents, mostly to delete them
-    // when the parent goes away
-    if (!parent_udi.empty()) {
-        newdocument.add_boolean_term(make_parentterm(parent_udi));
-    }
-    // Dates etc.
-    time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() :
-                         doc.dmtime.c_str());
-    struct tm *tm = localtime(&mtime);
-    char buf[9];
-    snprintf(buf, 9, "%04d%02d%02d",
-             tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
-    // Date (YYYYMMDD)
-    newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf));
-    // Month (YYYYMM)
-    buf[6] = '\0';
-    newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
-    // Year (YYYY)
-    buf[4] = '\0';
-    newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf));
 
-
-    //////////////////////////////////////////////////////////////////
-    // Document data record. omindex has the following nl separated fields:
-    // - url
-    // - sample
-    // - caption (title limited to 100 chars)
-    // - mime type
-    //
+    if (doc.onlyxattr) {
+        // Only updating an existing doc with new extended attributes
+        // data. Need to read the old doc and its data record
+        // first. This is so different from the normal processing that
+        // it uses a fully separate code path (with some duplication
+        // unfortunately)
+        if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
+            return false;
-    // The title, author, abstract and keywords fields are special,
-    // they always get stored in the document data
-    // record. Configurable other fields can be, too.
-    //
-    // We truncate stored fields abstract, title and keywords to
-    // reasonable lengths and suppress newlines (so that the data
-    // record can keep a simple syntax)
-
-    string record;
-    RECORD_APPEND(record, Doc::keyurl, doc.url);
-    RECORD_APPEND(record, Doc::keytp, doc.mimetype);
-    // We left-zero-pad the times so that they are lexico-sortable
-    leftzeropad(doc.fmtime, 11);
-    RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
-    if (!doc.dmtime.empty()) {
-        leftzeropad(doc.dmtime, 11);
-        RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
-    }
-    RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
-
-    if (doc.fbytes.empty())
-        doc.fbytes = doc.pcbytes;
-
-    if (!doc.fbytes.empty()) {
-        RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
-        leftzeropad(doc.fbytes, 12);
-        newdocument.add_value(VALUE_SIZE, doc.fbytes);
-    }
-    if (doc.haschildren) {
-        newdocument.add_boolean_term(has_children_term);
-    }
-    if (!doc.pcbytes.empty())
-        RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
-    char sizebuf[30];
-    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
-    RECORD_APPEND(record, Doc::keyds, sizebuf);
-
-    // Note that we add the signature both as a value and in the data record
-    if (!doc.sig.empty()) {
-        RECORD_APPEND(record, Doc::keysig, doc.sig);
-        newdocument.add_value(VALUE_SIG, doc.sig);
-    }
-
-    if (!doc.ipath.empty())
-        RECORD_APPEND(record, Doc::keyipt, doc.ipath);
-
-    doc.meta[Doc::keytt] =
-        neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
-    if (!doc.meta[Doc::keytt].empty())
-        RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
-
-    trimstring(doc.meta[Doc::keykw], " \t\r\n");
-    doc.meta[Doc::keykw] =
-        neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
-    // No need to explicitly append the keywords, this will be done by
-    // the "stored" loop
-
-    // If abstract is empty, we make up one with the beginning of the
-    // document. This is then not indexed, but part of the doc data so
-    // that we can return it to a query without having to decode the
-    // original file.
-    bool syntabs = false;
-    // Note that the map accesses by operator[] create empty entries if they
-    // don't exist yet.
-    trimstring(doc.meta[Doc::keyabs], " \t\r\n");
-    if (doc.meta[Doc::keyabs].empty()) {
-        syntabs = true;
-        if (!doc.text.empty())
-            doc.meta[Doc::keyabs] = cstr_syntAbs +
-                neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
     } else {
+
+        // If the ipath is like a path, index the last element. This is
+        // for compound documents like zip and chm for which the filter
+        // uses the file path as ipath.
+        if (!doc.ipath.empty() &&
+            doc.ipath.find_first_not_of("0123456789") != string::npos) {
+            string utf8ipathlast;
+            // There is no way in hell we could have an idea of the
+            // charset here, so let's hope it's ascii or utf-8. We call
+            // transcode to strip the bad chars and pray
+            if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
+                          "UTF-8", "UTF-8")) {
+                splitter.text_to_words(utf8ipathlast);
+            }
+        }
+
+        // Split and index the path from the url for path-based filtering
+        {
+            string path = url_gpath(doc.url);
+            vector<string> vpath;
+            stringToTokens(path, vpath, "/");
+            // If vpath is not /, the last elt is the file/dir name, not a
+            // part of the path.
+            if (vpath.size())
+                vpath.resize(vpath.size()-1);
+            splitter.curpos = 0;
+            newdocument.add_posting(wrap_prefix(pathelt_prefix),
+                                    splitter.basepos + splitter.curpos++);
+            for (vector<string>::iterator it = vpath.begin();
+                 it != vpath.end(); it++) {
+                if (it->length() > 230) {
+                    // Just truncate it. May still be useful because of wildcards
+                    *it = it->substr(0, 230);
+                }
+                newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it,
+                                        splitter.basepos + splitter.curpos++);
+            }
+        }
+
+        // Index textual metadata. These are all indexed as text with
+        // positions, as we may want to do phrase searches with them (this
+        // makes no sense for keywords by the way).
+        //
+        // The order has no importance, and we set a position gap of 100
+        // between fields to avoid false proximity matches.
+        map<string, string>::iterator meta_it;
+        for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+            if (!meta_it->second.empty()) {
+                const FieldTraits *ftp;
+                // We don't test for an empty prefix here. Some fields are part
+                // of the internal conf with an empty prefix (ie: abstract).
+                if (!fieldToTraits(meta_it->first, &ftp)) {
+                    LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
+                             meta_it->first.c_str()));
+                    continue;
+                }
+                LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n",
+                         meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
+                         meta_it->second.c_str()));
+                splitter.setprefix(ftp->pfx);
+                splitter.setwdfinc(ftp->wdfinc);
+                if (!splitter.text_to_words(meta_it->second))
+                    LOGDEB(("Db::addOrUpdate: split failed for %s\n",
+                            meta_it->first.c_str()));
+            }
+        }
+        splitter.setprefix(string());
+        splitter.setwdfinc(1);
+
+        if (splitter.curpos < baseTextPosition)
+            splitter.basepos = baseTextPosition;
+
+        // Split and index body text
+        LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
+
+#ifdef TEXTSPLIT_STATS
+        splitter.resetStats();
+#endif
+        if (!splitter.text_to_words(doc.text))
+            LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
+
+#ifdef TEXTSPLIT_STATS
+        // Reject bad data. Unrecognized base64 text is characterized by
+        // high avg word length and high variation (because there are
+        // word-splitters like +/ inside the data).
+        TextSplit::Stats::Values v = splitter.getStats();
+        // v.avglen > 15 && v.sigma > 12
+        if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
+            LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
+                     "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
+                     v.count, v.avglen, v.sigma, doc.url.c_str(),
+                     doc.ipath.c_str(), doc.text.c_str()));
+            return true;
+        }
+#endif
+
+        ////// Special terms for other metadata. No positions for these.
+        // Mime type
+        newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
+
+        // Simple file name indexed unsplit for specific "file name"
+        // searches. This is not the same as a filename: clause inside the
+        // query language.
+        // We also add a term for the filename extension if any.
+        string utf8fn;
+        if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
+            string fn;
+            if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
+                // We should truncate after extracting the extension, but this is
+                // a pathological case anyway
+                if (fn.size() > 230)
+                    utf8truncate(fn, 230);
+                string::size_type pos = fn.rfind('.');
+                if (pos != string::npos && pos != fn.length() - 1) {
+                    newdocument.add_boolean_term(wrap_prefix(fileext_prefix) +
+                                                 fn.substr(pos + 1));
+                }
+                newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
+            }
+        }
+
+        newdocument.add_boolean_term(uniterm);
+        // Parent term. This is used to find all descendants, mostly
+        // to delete them when the parent goes away
+        if (!parent_udi.empty()) {
+            newdocument.add_boolean_term(make_parentterm(parent_udi));
+        }
+        // Dates etc.
+        time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() :
+                             doc.dmtime.c_str());
+        struct tm *tm = localtime(&mtime);
+        char buf[9];
+        snprintf(buf, 9, "%04d%02d%02d",
+                 tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
+        // Date (YYYYMMDD)
+        newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf));
+        // Month (YYYYMM)
+        buf[6] = '\0';
+        newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
+        // Year (YYYY)
+        buf[4] = '\0';
+        newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf));
+
+
+        //////////////////////////////////////////////////////////////////
+        // Document data record. omindex has the following nl separated fields:
+        // - url
+        // - sample
+        // - caption (title limited to 100 chars)
+        // - mime type
+        //
+        // The title, author, abstract and keywords fields are special,
+        // they always get stored in the document data
+        // record. Configurable other fields can be, too.
+        //
+        // We truncate stored fields abstract, title and keywords to
+        // reasonable lengths and suppress newlines (so that the data
+        // record can keep a simple syntax)
+
+        string record;
+        RECORD_APPEND(record, Doc::keyurl, doc.url);
+        RECORD_APPEND(record, Doc::keytp, doc.mimetype);
+        // We left-zero-pad the times so that they are lexico-sortable
+        leftzeropad(doc.fmtime, 11);
+        RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
+        if (!doc.dmtime.empty()) {
+            leftzeropad(doc.dmtime, 11);
+            RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
+        }
+        RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
+
+        if (doc.fbytes.empty())
+            doc.fbytes = doc.pcbytes;
+
+        if (!doc.fbytes.empty()) {
+            RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
+            leftzeropad(doc.fbytes, 12);
+            newdocument.add_value(VALUE_SIZE, doc.fbytes);
+        }
+        if (doc.haschildren) {
+            newdocument.add_boolean_term(has_children_term);
+        }
+        if (!doc.pcbytes.empty())
+            RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
+        char sizebuf[30];
+        sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
+        RECORD_APPEND(record, Doc::keyds, sizebuf);
+
+        // Note that we add the signature both as a value and in the data record
+        if (!doc.sig.empty()) {
+            RECORD_APPEND(record, Doc::keysig, doc.sig);
+            newdocument.add_value(VALUE_SIG, doc.sig);
+        }
+
+        if (!doc.ipath.empty())
+            RECORD_APPEND(record, Doc::keyipt, doc.ipath);
+
+        doc.meta[Doc::keytt] =
+            neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
+        if (!doc.meta[Doc::keytt].empty())
+            RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
+
+        trimstring(doc.meta[Doc::keykw], " \t\r\n");
+        doc.meta[Doc::keykw] =
+            neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
+        // No need to explicitly append the keywords, this will be done by
+        // the "stored" loop
+
+        // If abstract is empty, we make up one with the beginning of the
+        // document. This is then not indexed, but part of the doc data so
+        // that we can return it to a query without having to decode the
+        // original file.
+        bool syntabs = false;
+        // Note that the map accesses by operator[] create empty entries if they
+        // don't exist yet.
+        trimstring(doc.meta[Doc::keyabs], " \t\r\n");
+        if (doc.meta[Doc::keyabs].empty()) {
+            syntabs = true;
+            if (!doc.text.empty())
+                doc.meta[Doc::keyabs] = cstr_syntAbs +
+                    neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
+        } else {
         doc.meta[Doc::keyabs] =
             neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
                       cstr_nc);
     }
 
     const set<string>& stored = m_config->getStoredFields();
     for (set<string>::const_iterator it = stored.begin();
          it != stored.end(); it++) {
         string nm = m_config->fieldCanon(*it);
         if (!doc.meta[nm].empty()) {
             string value =
                 neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
             RECORD_APPEND(record, nm, value);
-        }
         }
+        }
 
     // If empty pages (multiple break at same pos) were recorded, save
     // them (this is because we have no way to record them in the
     // Xapian list
     if (!tpidx.m_pageincrvec.empty()) {
         ostringstream multibreaks;
         for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
             if (i != 0)
                 multibreaks << ",";
             multibreaks << tpidx.m_pageincrvec[i].first << "," <<
                 tpidx.m_pageincrvec[i].second;
         }
         RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
     }
 
     // If the file's md5 was computed, add value and term.
     // The value is optionally used for query result duplicate elimination,
     // and the term to find the duplicates.
     // We don't do this for empty docs.
     const string *md5;
     if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
         md5->compare(cstr_md5empty)) {
         string digest;
         MD5HexScan(*md5, digest);
         newdocument.add_value(VALUE_MD5, digest);
         newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
     }
 
     LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
     newdocument.set_data(record);
-
+    }
 #ifdef IDX_THREADS
     if (m_ndb->m_havewriteq) {
         DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm,
                                       newdocument, doc.text.length());
         if (!m_ndb->m_wqueue.put(tp)) {
...
     }
 #endif
 
     return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument,
                                    doc.text.length());
+}
+
+bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi,
+                                    Doc &doc, Xapian::Document& xdoc)
+{
+    LOGDEB0(("Db::docToXdocXattrOnly\n"));
+    PTMutexLocker lock(m_mutex);
+
+    // Read existing document and its data record
+    if (getDoc(udi, 0, xdoc) == 0) {
+        LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
+        return false;
+    }
+    string data;
+    XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
+    if (!m_rcldb->m_reason.empty()) {
+        LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
+        return false;
+    }
+
+    // Clear the term lists for the incoming fields and index the new values
+    map<string, string>::iterator meta_it;
+    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+        const FieldTraits *ftp;
+        if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) {
+            LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
+                     meta_it->first.c_str()));
+            continue;
+        }
+        // Clear the previous terms for the field
+        clearField(xdoc, ftp->pfx, ftp->wdfinc);
+        LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n",
+                 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
+                 meta_it->second.c_str()));
+        splitter->setprefix(ftp->pfx);
+        splitter->setwdfinc(ftp->wdfinc);
+        if (!splitter->text_to_words(meta_it->second))
+            LOGDEB(("Db::xattrOnly: split failed for %s\n",
+                    meta_it->first.c_str()));
+    }
+    xdoc.add_value(VALUE_SIG, doc.sig);
+
+    // Parse current data record into a dict for ease of processing
+    ConfSimple datadic(data);
+    if (!datadic.ok()) {
+        LOGERR(("Db::docToXdocXattrOnly: failed turning data rec to dict\n"));
+        return false;
+    }
+
+    // For each "stored" field, check if set in doc metadata and
+    // update the value if it is
+    const set<string>& stored = m_rcldb->m_config->getStoredFields();
+    for (set<string>::const_iterator it = stored.begin();
+         it != stored.end(); it++) {
+        string nm = m_rcldb->m_config->fieldCanon(*it);
+        if (doc.getmeta(nm, 0)) {
+            string value =
+                neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
+            datadic.set(nm, value, "");
+        }
+    }
+
+    // Recreate the record. We want to do this with the local RECORD_APPEND
+    // method for consistency in format, instead of using ConfSimple print
+    vector<string> names = datadic.getNames("");
+    data.clear();
+    for (vector<string>::const_iterator it = names.begin();
+         it != names.end(); it++) {
+        string value;
+        datadic.get(*it, value, "");
+        RECORD_APPEND(data, *it, value);
+    }
+    RECORD_APPEND(data, Doc::keysig, doc.sig);
+    xdoc.set_data(data);
+    return true;
 }
 
 #ifdef IDX_THREADS
 void Db::waitUpdIdle()
 {
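A detail of the record-building code above that is easy to miss: fmtime, dmtime and fbytes go through leftzeropad() before being stored, because both the data record fields and Xapian value slots compare as plain strings. Fixed-width zero-padding makes lexicographic order agree with numeric order, which date- and size-sorted results depend on. A standalone illustration of the principle (the real leftzeropad() is a Recoll utility and likely modifies its argument in place; this signature is illustrative):

```cpp
#include <iostream>
#include <string>

// Pad a numeric string on the left with zeros so that string comparison
// matches numeric comparison for values up to the chosen width.
static std::string leftzeropad(std::string s, unsigned width)
{
    if (!s.empty() && s.length() < width)
        s.insert(0, width - s.length(), '0');
    return s;
}

int main()
{
    // Unpadded: "9" > "10" lexicographically, which is numerically wrong.
    std::cout << (std::string("9") < std::string("10")) << "\n";          // 0
    // Padded to a fixed width, string order is numeric order.
    std::cout << (leftzeropad("9", 12) < leftzeropad("10", 12)) << "\n";  // 1
    return 0;
}
```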