Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
...
...
20
#include <fnmatch.h>
20
#include <fnmatch.h>
21
#include <regex.h>
21
#include <regex.h>
22
#include <math.h>
22
#include <math.h>
23
#include <time.h>
23
#include <time.h>
24
24
25
#include <iostream>
26
#include <string>
25
#include <string>
27
#include <vector>
26
#include <vector>
28
#include <algorithm>
27
#include <algorithm>
28
#include <sstream>
29
29
30
#ifndef NO_NAMESPACES
30
#ifndef NO_NAMESPACES
31
using namespace std;
31
using namespace std;
32
#endif /* NO_NAMESPACES */
32
#endif /* NO_NAMESPACES */
33
33
...
...
68
68
69
// This is the word position offset at which we index the body text
69
// This is the word position offset at which we index the body text
70
// (abstract, keywords, etc.. are stored before this)
70
// (abstract, keywords, etc.. are stored before this)
71
static const unsigned int baseTextPosition = 100000;
71
static const unsigned int baseTextPosition = 100000;
72
72
73
static const string cstr_mbreaks("rclmbreaks");
74
73
#ifndef NO_NAMESPACES
75
#ifndef NO_NAMESPACES
74
namespace Rcl {
76
namespace Rcl {
75
#endif
77
#endif
76
78
77
// Some prefixes that we could get from the fields file, but are not going
79
// Some prefixes that we could get from the fields file, but are not going
...
...
300
}
302
}
301
303
302
// Return the positions list for the page break term
304
// Return the positions list for the page break term
303
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
305
bool Db::Native::getPagePositions(Xapian::docid docid, vector<int>& vpos)
304
{
306
{
307
    // Need to retrieve the document record to check for multiple page breaks
308
    // that we store there for lack of better place
309
    map<int, int> mbreaksmap;
310
    try {
311
  Xapian::Document xdoc = xrdb.get_document(docid);
312
  string data = xdoc.get_data();
313
  Doc doc;
314
  string mbreaks;
315
  if (dbDataToRclDoc(docid, data, doc) && 
316
      doc.getmeta(cstr_mbreaks, &mbreaks)) {
317
      vector<string> values;
318
      stringToTokens(mbreaks, values, ",");
319
      for (unsigned int i = 0; i < values.size() / 2; i += 2) {
320
      int pos  = atoi(values[i].c_str()) + baseTextPosition;
321
      int incr = atoi(values[i+1].c_str());
322
      mbreaksmap[pos] = incr;
323
      }
324
  }
325
    } catch (...) {
326
    }
327
305
    string qterm = page_break_term;
328
    string qterm = page_break_term;
306
    Xapian::PositionIterator pos;
329
    Xapian::PositionIterator pos;
307
    try {
330
    try {
308
    for (pos = xrdb.positionlist_begin(docid, qterm); 
331
    for (pos = xrdb.positionlist_begin(docid, qterm); 
309
         pos != xrdb.positionlist_end(docid, qterm); pos++) {
332
         pos != xrdb.positionlist_end(docid, qterm); pos++) {
310
        int ipos = *pos;
333
        int ipos = *pos;
311
        if (ipos < int(baseTextPosition)) {
334
        if (ipos < int(baseTextPosition)) {
312
        // Not in text body. Strange...
335
        // Not in text body. Strange...
313
        continue;
336
        continue;
337
      }
338
      map<int, int>::iterator it = mbreaksmap.find(ipos);
339
      if (it != mbreaksmap.end()) {
340
      LOGDEB1(("getPagePositions: found multibreak at %d incr %d\n", 
341
           ipos, it->second));
342
      for (int i = 0 ; i < it->second; i++) 
343
          vpos.push_back(ipos);
314
        }
344
        }
315
        vpos.push_back(ipos);
345
        vpos.push_back(ipos);
316
    } 
346
    } 
317
    } catch (...) {
347
    } catch (...) {
318
    // Term does not occur. No problem.
348
    // Term does not occur. No problem.
...
...
355
         pos != xrdb.positionlist_end(docid, qterm); pos++) {
385
         pos != xrdb.positionlist_end(docid, qterm); pos++) {
356
        int ipos = *pos;
386
        int ipos = *pos;
357
        if (ipos < int(baseTextPosition)) // Not in text body
387
        if (ipos < int(baseTextPosition)) // Not in text body
358
            continue;
388
            continue;
359
        // What page ?
389
        // What page ?
360
        LOGABS(("getFirstPageMatch: looking for match for [%s]\n", 
390
        LOGABS(("getFirstPageMatch: search match for [%s] pos %d\n", 
361
            qterm.c_str()));
391
            qterm.c_str(), ipos));
362
        vector<int>::const_iterator it = 
392
        vector<int>::const_iterator it = 
363
            lower_bound(pagepos.begin(), pagepos.end(), ipos);
393
            upper_bound(pagepos.begin(), pagepos.end(), ipos);
364
      if (it != pagepos.end())
365
            return it - pagepos.begin() + 1;
394
        return it - pagepos.begin() + 1;
366
        }
395
        }
367
    } catch (...) {
396
    } catch (...) {
368
        // Term does not occur. No problem.
397
        // Term does not occur. No problem.
369
    }
398
    }
370
    }
399
    }
...
...
1000
    return true;
1029
    return true;
1001
}
1030
}
1002
1031
1003
class TermProcIdx : public TermProc {
1032
class TermProcIdx : public TermProc {
1004
public:
1033
public:
1005
    TermProcIdx() : TermProc(0), m_ts(0) {}
1034
    TermProcIdx() : TermProc(0), m_ts(0), m_lastpagepos(0), m_pageincr(0) {}
1006
    void setTSD(TextSplitDb *ts) {m_ts = ts;}
1035
    void setTSD(TextSplitDb *ts) {m_ts = ts;}
1007
1036
1008
    bool takeword(const std::string &term, int pos, int, int)
1037
    bool takeword(const std::string &term, int pos, int, int)
1009
    {
1038
    {
1010
    // Compute absolute position (pos is relative to current segment),
1039
    // Compute absolute position (pos is relative to current segment),
...
...
1031
    return false;
1060
    return false;
1032
    }
1061
    }
1033
    void newpage(int pos)
1062
    void newpage(int pos)
1034
    {
1063
    {
1035
    pos += m_ts->basepos;
1064
    pos += m_ts->basepos;
1065
  LOGDEB2(("newpage: %d\n", pos));
1066
  if (pos < int(baseTextPosition))
1067
      return;
1068
1036
    m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
1069
    m_ts->doc.add_posting(m_ts->prefix + page_break_term, pos);
1070
  if (pos == m_lastpagepos) {
1071
      m_pageincr++;
1072
      LOGDEB2(("newpage: same pos, pageincr %d lastpagepos %d\n", 
1073
           m_pageincr, m_lastpagepos));
1074
  } else {
1075
      LOGDEB2(("newpage: pos change, pageincr %d lastpagepos %d\n", 
1076
           m_pageincr, m_lastpagepos));
1077
      if (m_pageincr > 0) {
1078
      // Remember the multiple page break at this position
1079
      m_pageincrvec.push_back(
1080
          pair<int, int>(m_lastpagepos - baseTextPosition, 
1081
                 m_pageincr));
1037
    }
1082
      }
1038
private:
1083
      m_pageincr = 0;
1084
  }
1085
  m_lastpagepos = pos;
1086
    }
1087
1088
    virtual bool flush()
1089
    {
1090
  if (m_pageincr > 0) {
1091
      m_pageincrvec.push_back(
1092
      pair<int, int>(m_lastpagepos - baseTextPosition,  
1093
                 m_pageincr));
1094
      m_pageincr = 0;
1095
  }
1096
  return TermProc::flush();
1097
    }
1098
1039
    TextSplitDb *m_ts;
1099
    TextSplitDb *m_ts;
1100
    // Auxiliary page breaks data for positions with multiple page breaks.
1101
    int m_lastpagepos;
1102
    // Count of extra page breaks seen at m_lastpagepos: normally 0, 1 or
1103
    // more when several breaks occur at the same position
1104
    int m_pageincr; 
1105
    vector <pair<int, int> > m_pageincrvec;
1040
};
1106
};
1041
1107
1042
1108
1043
#ifdef TESTING_XAPIAN_SPELL
1109
#ifdef TESTING_XAPIAN_SPELL
1044
string Db::getSpellingSuggestion(const string& word)
1110
string Db::getSpellingSuggestion(const string& word)
...
...
1272
    time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : 
1338
    time_t mtime = atol(doc.dmtime.empty() ? doc.fmtime.c_str() : 
1273
            doc.dmtime.c_str());
1339
            doc.dmtime.c_str());
1274
    struct tm *tm = localtime(&mtime);
1340
    struct tm *tm = localtime(&mtime);
1275
    char buf[9];
1341
    char buf[9];
1276
    snprintf(buf, 9, "%04d%02d%02d",
1342
    snprintf(buf, 9, "%04d%02d%02d",
1277
        tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
1343
         tm->tm_year+1900, tm->tm_mon + 1, tm->tm_mday);
1278
    newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
1344
    newdocument.add_term(xapday_prefix + string(buf)); // Date (YYYYMMDD)
1279
    buf[6] = '\0';
1345
    buf[6] = '\0';
1280
    newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
1346
    newdocument.add_term(xapmonth_prefix + string(buf)); // Month (YYYYMM)
1281
    buf[4] = '\0';
1347
    buf[4] = '\0';
1282
    newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
1348
    newdocument.add_term(xapyear_prefix + string(buf)); // Year (YYYY)
...
...
1373
        neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
1439
        neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
1374
        RECORD_APPEND(record, nm, value);
1440
        RECORD_APPEND(record, nm, value);
1375
    }
1441
    }
1376
    }
1442
    }
1377
1443
1444
    // If empty pages (multiple break at same pos) were recorded, save
1445
    // them (this is because we have no way to record them in the
1446
    // Xapian list
1447
    if (!tpidx.m_pageincrvec.empty()) {
1448
  ostringstream multibreaks;
1449
  for (unsigned int i = 0; i < tpidx.m_pageincrvec.size(); i++) {
1450
      multibreaks << tpidx.m_pageincrvec[i].first << "," << 
1451
      tpidx.m_pageincrvec[i].second;
1452
  }
1453
  RECORD_APPEND(record, string(cstr_mbreaks), multibreaks.str());
1454
    }
1455
    
1378
    // If the file's md5 was computed, add value. This is optionally
1456
    // If the file's md5 was computed, add value. This is optionally
1379
    // used for query result duplicate elimination.
1457
    // used for query result duplicate elimination.
1380
    string& md5 = doc.meta[Doc::keymd5];
1458
    string& md5 = doc.meta[Doc::keymd5];
1381
    if (!md5.empty()) {
1459
    if (!md5.empty()) {
1382
    string digest;
1460
    string digest;