--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
...
     }
     }
     return false;
 }
 
+// Clear term from document if its frequency is 0. This should
+// probably be done by Xapian when the freq goes to 0 when removing a
+// posting, but we have to do it ourselves.
+bool Db::Native::clearDocTermIfWdf0(Xapian::Document& xdoc, const string& term)
+{
+    LOGDEB1(("Db::clearDocTermIfWdf0: [%s]\n", term.c_str()));
+
+    // Find the term
+    Xapian::TermIterator xit;
+    XAPTRY(xit = xdoc.termlist_begin(); xit.skip_to(term);,
+           xrdb, m_rcldb->m_reason);
+    if (!m_rcldb->m_reason.empty()) {
+        LOGERR(("Db::clearDocTerm...: [%s] skip failed: %s\n",
+                term.c_str(), m_rcldb->m_reason.c_str()));
+        return false;
+    }
+    if (xit == xdoc.termlist_end() || term.compare(*xit)) {
+        LOGDEB0(("Db::clearDocTermIfWdf0: term [%s] not found. xit: [%s]\n",
+                 term.c_str(), xit == xdoc.termlist_end() ? "EOL" : (*xit).c_str()));
+        return false;
+    }
+
+    // Clear the term if its frequency is 0
+    if (xit.get_wdf() == 0) {
+        LOGDEB1(("Db::clearDocTermIfWdf0: clearing [%s]\n", term.c_str()));
+        XAPTRY(xdoc.remove_term(term), xwdb, m_rcldb->m_reason);
+        if (!m_rcldb->m_reason.empty()) {
+            LOGDEB0(("Db::clearDocTermIfWdf0: failed [%s]: %s\n",
+                     term.c_str(), m_rcldb->m_reason.c_str()));
+        }
+    }
+    return true;
+}
+
+// Holder for term + pos
+struct DocPosting {
+    DocPosting(string t, Xapian::termpos ps)
+        : term(t), pos(ps) {}
+    string term;
+    Xapian::termpos pos;
+};
+
+// Clear all terms for given field for given document.
+// The terms to be cleared are all those with the appropriate
+// prefix. We also remove the postings for the unprefixed terms (that
+// is, we undo what we did when indexing).
+bool Db::Native::clearField(Xapian::Document& xdoc, const string& pfx,
+                            Xapian::termcount wdfdec)
+{
+    LOGDEB1(("Db::clearField: clearing prefix [%s] for docid %u\n",
+             pfx.c_str(), unsigned(xdoc.get_docid())));
+
+    vector<DocPosting> eraselist;
+
+    string wrapd = wrap_prefix(pfx);
+
+    m_rcldb->m_reason.clear();
+    for (int tries = 0; tries < 2; tries++) {
+        try {
+            Xapian::TermIterator xit;
+            xit = xdoc.termlist_begin();
+            xit.skip_to(wrapd);
+            while (xit != xdoc.termlist_end() &&
+                   !(*xit).compare(0, wrapd.size(), wrapd)) {
+                LOGDEB1(("Db::clearField: erasing for [%s]\n", (*xit).c_str()));
+                Xapian::PositionIterator posit;
+                for (posit = xit.positionlist_begin();
+                     posit != xit.positionlist_end(); posit++) {
+                    eraselist.push_back(DocPosting(*xit, *posit));
+                    eraselist.push_back(DocPosting(strip_prefix(*xit), *posit));
+                }
+                xit++;
+            }
+        } catch (const Xapian::DatabaseModifiedError &e) {
+            m_rcldb->m_reason = e.get_msg();
+            xrdb.reopen();
+            continue;
+        } XCATCHERROR(m_rcldb->m_reason);
+        break;
+    }
+    if (!m_rcldb->m_reason.empty()) {
+        LOGERR(("Db::clearField: failed building erase list: %s\n",
+                m_rcldb->m_reason.c_str()));
+        return false;
+    }
+
+    // Now remove the found positions, and the terms if the wdf is 0
+    for (vector<DocPosting>::const_iterator it = eraselist.begin();
+         it != eraselist.end(); it++) {
+        LOGDEB1(("Db::clearField: remove posting: [%s] pos [%d]\n",
+                 it->term.c_str(), int(it->pos)));
+        XAPTRY(xdoc.remove_posting(it->term, it->pos, wdfdec);,
+               xwdb, m_rcldb->m_reason);
+        if (!m_rcldb->m_reason.empty()) {
+            // Note that this normally fails for non-prefixed XXST and
+            // ND, don't make a fuss
+            LOGDEB1(("Db::clearField: remove_posting failed for [%s],%d: %s\n",
+                     it->term.c_str(), int(it->pos), m_rcldb->m_reason.c_str()));
+        }
+        clearDocTermIfWdf0(xdoc, it->term);
+    }
+    return true;
+}
+
 // Check if doc given by udi is indexed by term
 bool Db::Native::hasTerm(const string& udi, int idxi, const string& term)
 {
     LOGDEB2(("Native::hasTerm: udi [%s] term [%s]\n", udi.c_str(), term.c_str()));
     Xapian::Document xdoc;
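
Context for the new helpers above (commentary, not part of the patch): the comment on clearDocTermIfWdf0() relies on the fact that Xapian::Document::remove_posting() decrements the wdf but leaves the term in the document's term list even when the wdf reaches 0. A minimal standalone sketch of that behavior, assuming a scratch database path and an invented unique term; only generic Xapian API calls are used:

#include <xapian.h>
#include <iostream>

int main()
{
    // Scratch database path: an assumption for the example.
    Xapian::WritableDatabase db("/tmp/xdb", Xapian::DB_CREATE_OR_OPEN);
    Xapian::Document doc;
    doc.add_posting("hello", 1);           // wdf for "hello" becomes 1
    Xapian::docid did = db.replace_document("Qexampleudi", doc);

    doc.remove_posting("hello", 1, 1);     // wdf back to 0, term remains
    for (Xapian::TermIterator it = doc.termlist_begin();
         it != doc.termlist_end(); it++) {
        // Expected to print: hello wdf 0
        std::cout << *it << " wdf " << it.get_wdf() << "\n";
    }
    doc.remove_term("hello");              // what clearDocTermIfWdf0() does
    db.replace_document(did, doc);
    return 0;
}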
...
 bool Db::Native::addOrUpdateWrite(const string& udi, const string& uniterm, 
                                   Xapian::Document& newdocument, size_t textlen)
 {
 #ifdef IDX_THREADS
     Chrono chron;
-    // In the case where there is a separate (single) db update
-    // thread, we only need to protect the update map update below
-    // (against interaction with threads calling needUpdate()). Else,
-    // all threads from above need to synchronize here
-    PTMutexLocker lock(m_mutex, m_havewriteq);
+    PTMutexLocker lock(m_mutex);
 #endif
 
     // Check file system full every mbyte of indexed text. It's a bit wasteful
     // to do this after having prepared the document, but it needs to be in
     // the single-threaded section.
...
 
     // Add db entry or update existing entry:
     try {
         Xapian::docid did = 
             xwdb.replace_document(uniterm, newdocument);
-#ifdef IDX_THREADS
-        // Need to protect against interaction with the up-to-date checks
-        // which also update the existence map
-        PTMutexLocker lock(m_mutex, !m_havewriteq);
-#endif
         if (did < m_rcldb->updated.size()) {
             m_rcldb->updated[did] = true;
             LOGINFO(("Db::add: docid %d updated [%s]\n", did, fnc));
         } else {
             LOGINFO(("Db::add: docid %d added [%s]\n", did, fnc));
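
Note on the locking change above (commentary, not part of the patch): the deleted comment explains the old scheme. With a dedicated write-queue thread (m_havewriteq), only the updated[] map needed protection, so the broad lock was skipped and a narrow one was taken around the map update instead. Judging from the two call sites, PTMutexLocker's second argument appears to mean "don't actually take the lock"; the stand-in below is a hypothetical reconstruction of that idiom against plain pthreads, not the real Recoll helper:

#include <pthread.h>

class MaybeLocker {
public:
    MaybeLocker(pthread_mutex_t& m, bool nolock = false)
        : m_mutex(m), m_locked(!nolock) {
        if (m_locked)
            pthread_mutex_lock(&m_mutex);
    }
    ~MaybeLocker() {
        if (m_locked)
            pthread_mutex_unlock(&m_mutex);
    }
private:
    pthread_mutex_t& m_mutex;
    bool m_locked;
};

The patch replaces the conditional scheme with one unconditional lock, trading a little potential concurrency for simpler reasoning about who holds the mutex.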
...
 
     *ftpp = 0;
     return false;
 }
 
-
 // The splitter breaks text into words and adds postings to the Xapian
 // document. We use a single object to split all of the document
 // fields and position jumps to separate fields
 class TextSplitDb : public TextSplitP {
  public:
...
         udi.c_str(), parent_udi.c_str()));
1243
         udi.c_str(), parent_udi.c_str()));
1150
    if (m_ndb == 0)
1244
    if (m_ndb == 0)
1151
    return false;
1245
    return false;
1152
1246
1153
    Xapian::Document newdocument;
1247
    Xapian::Document newdocument;
1154
1248
    
1155
    // The term processing pipeline:
1249
    // The term processing pipeline:
1156
    TermProcIdx tpidx;
1250
    TermProcIdx tpidx;
1157
    TermProc *nxt = &tpidx;
1251
    TermProc *nxt = &tpidx;
1158
    TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
1252
    TermProcStop tpstop(nxt, m_stops);nxt = &tpstop;
1159
    //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
1253
    //TermProcCommongrams tpcommon(nxt, m_stops); nxt = &tpcommon;
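
The pipeline built here is a chain of small decorators: each stage holds a pointer to the next, and nxt always designates the current head so that a new stage can be pushed in front of it. The sketch below shows the idiom; the interface is guessed from the construction calls visible in this hunk, not copied from the real TermProc declarations:

#include <set>
#include <string>

// Hypothetical minimal version of the chaining pattern used above.
class TermProc {
public:
    explicit TermProc(TermProc *next) : m_next(next) {}
    virtual ~TermProc() {}
    // Forward a term to the next stage; stages override this to
    // filter, rewrite, or finally index the term.
    virtual bool takeword(const std::string& term, int pos) {
        return m_next ? m_next->takeword(term, pos) : true;
    }
private:
    TermProc *m_next;
};

class StopFilter : public TermProc {
public:
    StopFilter(TermProc *next, const std::set<std::string>& stops)
        : TermProc(next), m_stops(stops) {}
    virtual bool takeword(const std::string& term, int pos) {
        if (m_stops.count(term))
            return true; // swallow stop words, do not forward
        return TermProc::takeword(term, pos);
    }
private:
    const std::set<std::string>& m_stops;
};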
...
     nxt = &tpprep;
 
     TextSplitDb splitter(newdocument, nxt);
     tpidx.setTSD(&splitter);
 
-    // If the ipath is like a path, index the last element. This is
-    // for compound documents like zip and chm for which the filter
-    // uses the file path as ipath. 
-    if (!doc.ipath.empty() && 
-        doc.ipath.find_first_not_of("0123456789") != string::npos) {
-        string utf8ipathlast;
-        // There is no way in hell we could have an idea of the
-        // charset here, so let's hope it's ascii or utf-8. We call
-        // transcode to strip the bad chars and pray
-        if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
-                      "UTF-8", "UTF-8")) {
-            splitter.text_to_words(utf8ipathlast);
-        }
-    }
-
-    // Split and index the path from the url for path-based filtering
-    {
-        string path = url_gpath(doc.url);
-        vector<string> vpath;
-        stringToTokens(path, vpath, "/");
-        // If vpath is not /, the last elt is the file/dir name, not a
-        // part of the path.
-        if (vpath.size())
-            vpath.resize(vpath.size()-1);
-        splitter.curpos = 0;
-        newdocument.add_posting(wrap_prefix(pathelt_prefix),
-                                splitter.basepos + splitter.curpos++);
-        for (vector<string>::iterator it = vpath.begin(); 
-             it != vpath.end(); it++){
-            if (it->length() > 230) {
-                // Just truncate it. May still be useful because of wildcards
-                *it = it->substr(0, 230);
-            }
-            newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
-                                    splitter.basepos + splitter.curpos++);
-        }
-    }
-
-    // Index textual metadata.  These are all indexed as text with
-    // positions, as we may want to do phrase searches with them (this
-    // makes no sense for keywords by the way).
-    //
-    // The order has no importance, and we set a position gap of 100
-    // between fields to avoid false proximity matches.
-    map<string, string>::iterator meta_it;
-    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
-        if (!meta_it->second.empty()) {
-            const FieldTraits *ftp;
-            // We don't test for an empty prefix here. Some fields are part
-            // of the internal conf with an empty prefix (ie: abstract).
-            if (!fieldToTraits(meta_it->first, &ftp)) {
-                LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
-                         meta_it->first.c_str()));
-                continue;
-            }
-            LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
-                     meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
-                     meta_it->second.c_str()));
-            splitter.setprefix(ftp->pfx);
-            splitter.setwdfinc(ftp->wdfinc);
-            if (!splitter.text_to_words(meta_it->second))
-                LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
-                        meta_it->first.c_str()));
-        }
-    }
-    splitter.setprefix(string());
-    splitter.setwdfinc(1);
-
-    if (splitter.curpos < baseTextPosition)
-        splitter.basepos = baseTextPosition;
-
-    // Split and index body text
-    LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
-
-#ifdef TEXTSPLIT_STATS
-    splitter.resetStats();
-#endif
-    if (!splitter.text_to_words(doc.text))
-        LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
-
-#ifdef TEXTSPLIT_STATS
-    // Reject bad data. unrecognized base64 text is characterized by
-    // high avg word length and high variation (because there are
-    // word-splitters like +/ inside the data).
-    TextSplit::Stats::Values v = splitter.getStats();
-    // v.avglen > 15 && v.sigma > 12 
-    if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
-        LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
-                 "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
-                 v.count, v.avglen, v.sigma, doc.url.c_str(), 
-                 doc.ipath.c_str(), doc.text.c_str()));
-        return true;
-    }
-#endif
-
-    ////// Special terms for other metadata. No positions for these.
-    // Mime type
-    newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
-
-    // Simple file name indexed unsplit for specific "file name"
-    // searches. This is not the same as a filename: clause inside the
-    // query language.
-    // We also add a term for the filename extension if any.
-    string utf8fn;
-    if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
-        string fn;
-        if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
-            // We should truncate after extracting the extension, but this is
-            // a pathological case anyway
-            if (fn.size() > 230)
-                utf8truncate(fn, 230);
-            string::size_type pos = fn.rfind('.');
-            if (pos != string::npos && pos != fn.length() - 1) {
-                newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
-                                             fn.substr(pos + 1));
-            }
-            newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
-        }
-    }
-
     // Udi unique term: this is used for file existence/uptodate
     // checks, and unique id for the replace_document() call.
     string uniterm = make_uniterm(udi);
-    newdocument.add_boolean_term(uniterm);
-    // Parent term. This is used to find all descendents, mostly to delete them 
-    // when the parent goes away
-    if (!parent_udi.empty()) {
-        newdocument.add_boolean_term(make_parentterm(parent_udi));
-    }
-    // Dates etc.
-    time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : 
-                         doc.dmtime.c_str());
-    struct tm *tm = localtime(&mtime);
-    char buf[9];
-    snprintf(buf, 9, "%04d%02d%02d",
-             tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
-    // Date (YYYYMMDD)
-    newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); 
-    // Month (YYYYMM)
-    buf[6] = '\0';
-    newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
-    // Year (YYYY)
-    buf[4] = '\0';
-    newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); 
-
-
-    //////////////////////////////////////////////////////////////////
-    // Document data record. omindex has the following nl separated fields:
-    // - url
-    // - sample
-    // - caption (title limited to 100 chars)
-    // - mime type 
-    //
-    // The title, author, abstract and keywords fields are special,
-    // they always get stored in the document data
-    // record. Configurable other fields can be, too.
-    //
-    // We truncate stored fields abstract, title and keywords to
-    // reasonable lengths and suppress newlines (so that the data
-    // record can keep a simple syntax)
-
-    string record;
-    RECORD_APPEND(record, Doc::keyurl, doc.url);
-    RECORD_APPEND(record, Doc::keytp, doc.mimetype);
-    // We left-zero-pad the times so that they are lexico-sortable
-    leftzeropad(doc.fmtime, 11);
-    RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
-    if (!doc.dmtime.empty()) {
-        leftzeropad(doc.dmtime, 11);
-        RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
-    }
-    RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
-
-    if (doc.fbytes.empty())
-        doc.fbytes = doc.pcbytes;
-
-    if (!doc.fbytes.empty()) {
-        RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
-        leftzeropad(doc.fbytes, 12);
-        newdocument.add_value(VALUE_SIZE, doc.fbytes);
-    }
-    if (doc.haschildren) {
-        newdocument.add_boolean_term(has_children_term);
-    }
-    if (!doc.pcbytes.empty())
-        RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
-    char sizebuf[30]; 
-    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
-    RECORD_APPEND(record, Doc::keyds, sizebuf);
-
-    // Note that we add the signature both as a value and in the data record
-    if (!doc.sig.empty()) {
-        RECORD_APPEND(record, Doc::keysig, doc.sig);
-        newdocument.add_value(VALUE_SIG, doc.sig);
-    }
-
-    if (!doc.ipath.empty())
-        RECORD_APPEND(record, Doc::keyipt, doc.ipath);
-
-    doc.meta[Doc::keytt] = 
-        neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
-    if (!doc.meta[Doc::keytt].empty())
-        RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
-
-    trimstring(doc.meta[Doc::keykw], " \t\r\n");
-    doc.meta[Doc::keykw] = 
-        neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
-    // No need to explicitly append the keywords, this will be done by 
-    // the "stored" loop
-
-    // If abstract is empty, we make up one with the beginning of the
-    // document. This is then not indexed, but part of the doc data so
-    // that we can return it to a query without having to decode the
-    // original file.
-    bool syntabs = false;
-    // Note that the map accesses by operator[] create empty entries if they
-    // don't exist yet.
-    trimstring(doc.meta[Doc::keyabs], " \t\r\n");
-    if (doc.meta[Doc::keyabs].empty()) {
-        syntabs = true;
-        if (!doc.text.empty())
-            doc.meta[Doc::keyabs] = cstr_syntAbs + 
-                neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
+
+    if (doc.onlyxattr) {
+        // Only updating an existing doc with new extended attributes
+        // data.  Need to read the old doc and its data record
+        // first. This is so different from the normal processing that
+        // it uses a fully separate code path (with some duplication
+        // unfortunately)
+        if (!m_ndb->docToXdocXattrOnly(&splitter, udi, doc, newdocument))
+            return false;
     } else {
-        doc.meta[Doc::keyabs] = 
-            neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
-                      cstr_nc);
-    }
-
-    const set<string>& stored = m_config->getStoredFields();
-    for (set<string>::const_iterator it = stored.begin();
-         it != stored.end(); it++) {
-        string nm = m_config->fieldCanon(*it);
-        if (!doc.meta[nm].empty()) {
-            string value = 
-                neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
-            RECORD_APPEND(record, nm, value);
-        }
-    }
-
-    // If empty pages (multiple break at same pos) were recorded, save
-    // them (this is because we have no way to record them in the
-    // Xapian list
-    if (!tpidx.m_pageincrvec.empty()) {
-        ostringstream multibreaks;
-        for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
-            if (i != 0)
-                multibreaks << ",";
-            multibreaks << tpidx.m_pageincrvec[i].first << "," << 
-                tpidx.m_pageincrvec[i].second;
-        }
-        RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
-    }
-
-    // If the file's md5 was computed, add value and term. 
-    // The value is optionally used for query result duplicate elimination, 
-    // and the term to find the duplicates.
-    // We don't do this for empty docs.
-    const string *md5;
-    if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
-        md5->compare(cstr_md5empty)) {
-        string digest;
-        MD5HexScan(*md5, digest);
-        newdocument.add_value(VALUE_MD5, digest);
-        newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
-    }
-
-    LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
-    newdocument.set_data(record);
-
+
+        // If the ipath is like a path, index the last element. This is
+        // for compound documents like zip and chm for which the filter
+        // uses the file path as ipath. 
+        if (!doc.ipath.empty() && 
+            doc.ipath.find_first_not_of("0123456789") != string::npos) {
+            string utf8ipathlast;
+            // There is no way in hell we could have an idea of the
+            // charset here, so let's hope it's ascii or utf-8. We call
+            // transcode to strip the bad chars and pray
+            if (transcode(path_getsimple(doc.ipath), utf8ipathlast,
+                          "UTF-8", "UTF-8")) {
+                splitter.text_to_words(utf8ipathlast);
+            }
+        }
+
+        // Split and index the path from the url for path-based filtering
+        {
+            string path = url_gpath(doc.url);
+            vector<string> vpath;
+            stringToTokens(path, vpath, "/");
+            // If vpath is not /, the last elt is the file/dir name, not a
+            // part of the path.
+            if (vpath.size())
+                vpath.resize(vpath.size()-1);
+            splitter.curpos = 0;
+            newdocument.add_posting(wrap_prefix(pathelt_prefix),
+                                    splitter.basepos + splitter.curpos++);
+            for (vector<string>::iterator it = vpath.begin(); 
+                 it != vpath.end(); it++){
+                if (it->length() > 230) {
+                    // Just truncate it. May still be useful because of wildcards
+                    *it = it->substr(0, 230);
+                }
+                newdocument.add_posting(wrap_prefix(pathelt_prefix) + *it, 
+                                        splitter.basepos + splitter.curpos++);
+            }
+        }
+
+        // Index textual metadata.  These are all indexed as text with
+        // positions, as we may want to do phrase searches with them (this
+        // makes no sense for keywords by the way).
+        //
+        // The order has no importance, and we set a position gap of 100
+        // between fields to avoid false proximity matches.
+        map<string, string>::iterator meta_it;
+        for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+            if (!meta_it->second.empty()) {
+                const FieldTraits *ftp;
+                // We don't test for an empty prefix here. Some fields are part
+                // of the internal conf with an empty prefix (ie: abstract).
+                if (!fieldToTraits(meta_it->first, &ftp)) {
+                    LOGDEB0(("Db::add: no prefix for field [%s], no indexing\n",
+                             meta_it->first.c_str()));
+                    continue;
+                }
+                LOGDEB0(("Db::add: field [%s] pfx [%s] inc %d: [%s]\n", 
+                         meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
+                         meta_it->second.c_str()));
+                splitter.setprefix(ftp->pfx);
+                splitter.setwdfinc(ftp->wdfinc);
+                if (!splitter.text_to_words(meta_it->second))
+                    LOGDEB(("Db::addOrUpdate: split failed for %s\n", 
+                            meta_it->first.c_str()));
+            }
+        }
+        splitter.setprefix(string());
+        splitter.setwdfinc(1);
+
+        if (splitter.curpos < baseTextPosition)
+            splitter.basepos = baseTextPosition;
+
+        // Split and index body text
+        LOGDEB2(("Db::add: split body: [%s]\n", doc.text.c_str()));
+
+#ifdef TEXTSPLIT_STATS
+        splitter.resetStats();
+#endif
+        if (!splitter.text_to_words(doc.text))
+            LOGDEB(("Db::addOrUpdate: split failed for main text\n"));
+
+#ifdef TEXTSPLIT_STATS
+        // Reject bad data. unrecognized base64 text is characterized by
+        // high avg word length and high variation (because there are
+        // word-splitters like +/ inside the data).
+        TextSplit::Stats::Values v = splitter.getStats();
+        // v.avglen > 15 && v.sigma > 12 
+        if (v.count > 200 && (v.avglen > 10 && v.sigma / v.avglen > 0.8)) {
+            LOGINFO(("RclDb::addOrUpdate: rejecting doc for bad stats "
+                     "count %d avglen %.4f sigma %.4f url [%s] ipath [%s] text %s\n",
+                     v.count, v.avglen, v.sigma, doc.url.c_str(), 
+                     doc.ipath.c_str(), doc.text.c_str()));
+            return true;
+        }
+#endif
+
+        ////// Special terms for other metadata. No positions for these.
+        // Mime type
+        newdocument.add_boolean_term(wrap_prefix(mimetype_prefix) + doc.mimetype);
+
+        // Simple file name indexed unsplit for specific "file name"
+        // searches. This is not the same as a filename: clause inside the
+        // query language.
+        // We also add a term for the filename extension if any.
+        string utf8fn;
+        if (doc.getmeta(Doc::keyfn, &utf8fn) && !utf8fn.empty()) {
+            string fn;
+            if (unacmaybefold(utf8fn, fn, "UTF-8", UNACOP_UNACFOLD)) {
+                // We should truncate after extracting the extension, but this is
+                // a pathological case anyway
+                if (fn.size() > 230)
+                    utf8truncate(fn, 230);
+                string::size_type pos = fn.rfind('.');
+                if (pos != string::npos && pos != fn.length() - 1) {
+                    newdocument.add_boolean_term(wrap_prefix(fileext_prefix) + 
+                                                 fn.substr(pos + 1));
+                }
+                newdocument.add_term(wrap_prefix(unsplitfilename_prefix) + fn, 0);
+            }
+        }
+
+        newdocument.add_boolean_term(uniterm);
+        // Parent term. This is used to find all descendents, mostly
+        // to delete them when the parent goes away
+        if (!parent_udi.empty()) {
+            newdocument.add_boolean_term(make_parentterm(parent_udi));
+        }
+        // Dates etc.
+        time_t mtime = atoll(doc.dmtime.empty() ? doc.fmtime.c_str() : 
+                             doc.dmtime.c_str());
+        struct tm *tm = localtime(&mtime);
+        char buf[9];
+        snprintf(buf, 9, "%04d%02d%02d",
+                 tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
+        // Date (YYYYMMDD)
+        newdocument.add_boolean_term(wrap_prefix(xapday_prefix) + string(buf)); 
+        // Month (YYYYMM)
+        buf[6] = '\0';
+        newdocument.add_boolean_term(wrap_prefix(xapmonth_prefix) + string(buf));
+        // Year (YYYY)
+        buf[4] = '\0';
+        newdocument.add_boolean_term(wrap_prefix(xapyear_prefix) + string(buf)); 
+
+
+        //////////////////////////////////////////////////////////////////
+        // Document data record. omindex has the following nl separated fields:
+        // - url
+        // - sample
+        // - caption (title limited to 100 chars)
+        // - mime type 
+        //
+        // The title, author, abstract and keywords fields are special,
+        // they always get stored in the document data
+        // record. Configurable other fields can be, too.
+        //
+        // We truncate stored fields abstract, title and keywords to
+        // reasonable lengths and suppress newlines (so that the data
+        // record can keep a simple syntax)
+
+        string record;
+        RECORD_APPEND(record, Doc::keyurl, doc.url);
+        RECORD_APPEND(record, Doc::keytp, doc.mimetype);
+        // We left-zero-pad the times so that they are lexico-sortable
+        leftzeropad(doc.fmtime, 11);
+        RECORD_APPEND(record, Doc::keyfmt, doc.fmtime);
+        if (!doc.dmtime.empty()) {
+            leftzeropad(doc.dmtime, 11);
+            RECORD_APPEND(record, Doc::keydmt, doc.dmtime);
+        }
+        RECORD_APPEND(record, Doc::keyoc, doc.origcharset);
+
+        if (doc.fbytes.empty())
+            doc.fbytes = doc.pcbytes;
+
+        if (!doc.fbytes.empty()) {
+            RECORD_APPEND(record, Doc::keyfs, doc.fbytes);
+            leftzeropad(doc.fbytes, 12);
+            newdocument.add_value(VALUE_SIZE, doc.fbytes);
+        }
+        if (doc.haschildren) {
+            newdocument.add_boolean_term(has_children_term);
+        }
+        if (!doc.pcbytes.empty())
+            RECORD_APPEND(record, Doc::keypcs, doc.pcbytes);
+        char sizebuf[30]; 
+        sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
+        RECORD_APPEND(record, Doc::keyds, sizebuf);
+
+        // Note that we add the signature both as a value and in the data record
+        if (!doc.sig.empty()) {
+            RECORD_APPEND(record, Doc::keysig, doc.sig);
+            newdocument.add_value(VALUE_SIG, doc.sig);
+        }
+
+        if (!doc.ipath.empty())
+            RECORD_APPEND(record, Doc::keyipt, doc.ipath);
+
+        doc.meta[Doc::keytt] = 
+            neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
+        if (!doc.meta[Doc::keytt].empty())
+            RECORD_APPEND(record, cstr_caption, doc.meta[Doc::keytt]);
+
+        trimstring(doc.meta[Doc::keykw], " \t\r\n");
+        doc.meta[Doc::keykw] = 
+            neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
+        // No need to explicitly append the keywords, this will be done by 
+        // the "stored" loop
+
+        // If abstract is empty, we make up one with the beginning of the
+        // document. This is then not indexed, but part of the doc data so
+        // that we can return it to a query without having to decode the
+        // original file.
+        bool syntabs = false;
+        // Note that the map accesses by operator[] create empty entries if they
+        // don't exist yet.
+        trimstring(doc.meta[Doc::keyabs], " \t\r\n");
+        if (doc.meta[Doc::keyabs].empty()) {
+            syntabs = true;
+            if (!doc.text.empty())
+                doc.meta[Doc::keyabs] = cstr_syntAbs + 
+                    neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
+        } else {
+            doc.meta[Doc::keyabs] = 
+                neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
+                          cstr_nc);
+        }
+
+        const set<string>& stored = m_config->getStoredFields();
+        for (set<string>::const_iterator it = stored.begin();
+             it != stored.end(); it++) {
+            string nm = m_config->fieldCanon(*it);
+            if (!doc.meta[nm].empty()) {
+                string value = 
+                    neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
+                RECORD_APPEND(record, nm, value);
+            }
+        }
+
+        // If empty pages (multiple break at same pos) were recorded, save
+        // them (this is because we have no way to record them in the
+        // Xapian list
+        if (!tpidx.m_pageincrvec.empty()) {
+            ostringstream multibreaks;
+            for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
+                if (i != 0)
+                    multibreaks << ",";
+                multibreaks << tpidx.m_pageincrvec[i].first << "," << 
+                    tpidx.m_pageincrvec[i].second;
+            }
+            RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
+        }
+
+        // If the file's md5 was computed, add value and term. 
+        // The value is optionally used for query result duplicate elimination, 
+        // and the term to find the duplicates.
+        // We don't do this for empty docs.
+        const string *md5;
+        if (doc.peekmeta(Doc::keymd5, &md5) && !md5->empty() &&
+            md5->compare(cstr_md5empty)) {
+            string digest;
+            MD5HexScan(*md5, digest);
+            newdocument.add_value(VALUE_MD5, digest);
+            newdocument.add_boolean_term(wrap_prefix("XM") + *md5);
+        }
+
+        LOGDEB0(("Rcl::Db::add: new doc record:\n%s\n", record.c_str()));
+        newdocument.set_data(record);
+    }
 #ifdef IDX_THREADS
     if (m_ndb->m_havewriteq) {
         DbUpdTask *tp = new DbUpdTask(DbUpdTask::AddOrUpdate, udi, uniterm, 
                                       newdocument, doc.text.length());
         if (!m_ndb->m_wqueue.put(tp)) {
...
     }
 #endif
 
     return m_ndb->addOrUpdateWrite(udi, uniterm, newdocument, 
                                    doc.text.length());
+}
+
+bool Db::Native::docToXdocXattrOnly(TextSplitDb *splitter, const string &udi, 
+                                    Doc &doc, Xapian::Document& xdoc)
+{
+    LOGDEB0(("Db::docToXdocXattrOnly\n"));
+    PTMutexLocker lock(m_mutex);
+
+    // Read existing document and its data record
+    if (getDoc(udi, 0, xdoc) == 0) {
+        LOGERR(("docToXdocXattrOnly: existing doc not found\n"));
+        return false;
+    }
+    string data;
+    XAPTRY(data = xdoc.get_data(), xrdb, m_rcldb->m_reason);
+    if (!m_rcldb->m_reason.empty()) {
+        LOGERR(("Db::xattrOnly: got error: %s\n", m_rcldb->m_reason.c_str()));
+        return false;
+    }
+
+    // Clear the term lists for the incoming fields and index the new values
+    map<string, string>::iterator meta_it;
+    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
+        const FieldTraits *ftp;
+        if (!m_rcldb->fieldToTraits(meta_it->first, &ftp) || ftp->pfx.empty()) {
+            LOGDEB0(("Db::xattrOnly: no prefix for field [%s], skipped\n",
+                     meta_it->first.c_str()));
+            continue;
+        }
+        // Clear the previous terms for the field
+        clearField(xdoc, ftp->pfx, ftp->wdfinc);
+        LOGDEB0(("Db::xattrOnly: field [%s] pfx [%s] inc %d: [%s]\n", 
+                 meta_it->first.c_str(), ftp->pfx.c_str(), ftp->wdfinc,
+                 meta_it->second.c_str()));
+        splitter->setprefix(ftp->pfx);
+        splitter->setwdfinc(ftp->wdfinc);
+        if (!splitter->text_to_words(meta_it->second))
+            LOGDEB(("Db::xattrOnly: split failed for %s\n", 
+                    meta_it->first.c_str()));
+    }
+    xdoc.add_value(VALUE_SIG, doc.sig);
+
+    // Parse current data record into a dict for ease of processing
+    ConfSimple datadic(data);
+    if (!datadic.ok()) {
+        LOGERR(("Db::docToXdocXattrOnly: failed turning data rec to dict\n"));
+        return false;
+    }
+
+    // For each "stored" field, check if set in doc metadata and
+    // update the value if it is
+    const set<string>& stored = m_rcldb->m_config->getStoredFields();
+    for (set<string>::const_iterator it = stored.begin();
+         it != stored.end(); it++) {
+        string nm = m_rcldb->m_config->fieldCanon(*it);
+        if (doc.getmeta(nm, 0)) {
+            string value = 
+                neutchars(truncate_to_word(doc.meta[nm], 150), cstr_nc);
+            datadic.set(nm, value, "");
+        }
+    }
+
+    // Recreate the record. We want to do this with the local RECORD_APPEND
+    // method for consistency in format, instead of using ConfSimple print
+    vector<string> names = datadic.getNames("");
+    data.clear();
+    for (vector<string>::const_iterator it = names.begin(); 
+         it != names.end(); it++) {
+        string value;
+        datadic.get(*it, value, "");
+        RECORD_APPEND(data, *it, value);
+    }
+    RECORD_APPEND(data, Doc::keysig, doc.sig);
+    xdoc.set_data(data);
+    return true;
 }
 
 #ifdef IDX_THREADS
 void Db::waitUpdIdle()
 {
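
For readers landing on this change without the rest of rcldb.cpp: the document data record built with RECORD_APPEND, and rebuilt by docToXdocXattrOnly(), is the newline-separated name=value structure described in the comments above. The sketch below is a hedged reconstruction for illustration; the real macro and the Doc::key* constants are defined elsewhere in the file, and the sample values are invented:

// Plausible shape of the RECORD_APPEND helper: append "name=value\n".
// This is an assumption for illustration, not the actual macro text.
#define RECORD_APPEND(R, NM, VAL) ((R) += (NM) + string("=") + (VAL) + "\n")

// A resulting data record would then look something like:
//
//   url=file:///home/me/docs/report.odt
//   mtype=application/vnd.oasis.opendocument.text
//   fmtime=01294916365
//   fbytes=000000034523
//   sig=34523:1294916365
//   caption=Annual Report
//   abstract=...

docToXdocXattrOnly() parses this record with ConfSimple (which reads name=value lines), refreshes the stored fields from the extended-attribute metadata, and then rebuilds the record with RECORD_APPEND so the format stays identical to what the normal indexing path produces.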