Switch to unified view

a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp
...
...
48
{
48
{
49
    string a;
49
    string a;
50
    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
50
    for (vector<string>::const_iterator it = l.begin(); it != l.end(); it++) {
51
        a = a + *it + " ";
51
        a = a + *it + " ";
52
    }
52
    }
53
    LOGDEB(""  << (what) << ": "  << (a) << "\n" );
53
    LOGDEB("" << what << ": " << a << "\n");
54
}
54
}
55
#else
55
#else
56
#define LOGABS LOGDEB2
56
#define LOGABS LOGDEB2
57
static void listList(const string&, const vector<string>&)
57
static void listList(const string&, const vector<string>&)
58
{
58
{
...
...
65
// result in general.
65
// result in general.
66
static const bool prune_prefixed_terms = true; 
66
static const bool prune_prefixed_terms = true; 
67
static void noPrefixList(const vector<string>& in, vector<string>& out) 
67
static void noPrefixList(const vector<string>& in, vector<string>& out) 
68
{
68
{
69
    for (vector<string>::const_iterator qit = in.begin(); 
69
    for (vector<string>::const_iterator qit = in.begin(); 
70
   qit != in.end(); qit++) {
70
         qit != in.end(); qit++) {
71
  if (prune_prefixed_terms) {
71
        if (prune_prefixed_terms) {
72
      if (has_prefix(*qit))
72
            if (has_prefix(*qit))
73
      continue;
73
                continue;
74
  }
74
        }
75
  out.push_back(strip_prefix(*qit));
75
        out.push_back(strip_prefix(*qit));
76
    }
76
    }
77
    sort(out.begin(), out.end());
77
    sort(out.begin(), out.end());
78
    vector<string>::iterator it = unique(out.begin(), out.end());
78
    vector<string>::iterator it = unique(out.begin(), out.end());
79
    out.resize(it - out.begin());
79
    out.resize(it - out.begin());
80
}
80
}
81
81
82
bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
82
bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
83
{
83
{
84
    if (!xenquire) {
84
    if (!xenquire) {
85
  LOGERR("Query::getMatchTerms: no query opened\n" );
85
        LOGERR("Query::getMatchTerms: no query opened\n");
86
  return false;
86
        return false;
87
    }
87
    }
88
88
89
    terms.clear();
89
    terms.clear();
90
    Xapian::TermIterator it;
90
    Xapian::TermIterator it;
91
    Xapian::docid id = Xapian::docid(xdocid);
91
    Xapian::docid id = Xapian::docid(xdocid);
...
...
93
    XAPTRY(iterms.insert(iterms.begin(),
93
    XAPTRY(iterms.insert(iterms.begin(),
94
                        xenquire->get_matching_terms_begin(id),
94
                        xenquire->get_matching_terms_begin(id),
95
                        xenquire->get_matching_terms_end(id)),
95
                        xenquire->get_matching_terms_end(id)),
96
           m_q->m_db->m_ndb->xrdb, m_q->m_reason);
96
           m_q->m_db->m_ndb->xrdb, m_q->m_reason);
97
    if (!m_q->m_reason.empty()) {
97
    if (!m_q->m_reason.empty()) {
98
  LOGERR("getMatchTerms: xapian error: "  << (m_q->m_reason) << "\n" );
98
        LOGERR("getMatchTerms: xapian error: " << m_q->m_reason << "\n");
99
  return false;
99
        return false;
100
    }
100
    }
101
    noPrefixList(iterms, terms);
101
    noPrefixList(iterms, terms);
102
    return true;
102
    return true;
103
}
103
}
104
104
...
...
107
// while computing abstracts for the different result documents.
107
// while computing abstracts for the different result documents.
108
void Query::Native::setDbWideQTermsFreqs()
108
void Query::Native::setDbWideQTermsFreqs()
109
{
109
{
110
    // Do it once only for a given query.
110
    // Do it once only for a given query.
111
    if (!termfreqs.empty())
111
    if (!termfreqs.empty())
112
  return;
112
        return;
113
113
114
    vector<string> qterms;
114
    vector<string> qterms;
115
    {
115
    {
116
  vector<string> iqterms;
116
        vector<string> iqterms;
117
  m_q->getQueryTerms(iqterms);
117
        m_q->getQueryTerms(iqterms);
118
  noPrefixList(iqterms, qterms);
118
        noPrefixList(iqterms, qterms);
119
    }
119
    }
120
    // listList("Query terms: ", qterms);
120
    // listList("Query terms: ", qterms);
121
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
121
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
122
122
123
    double doccnt = xrdb.get_doccount();
123
    double doccnt = xrdb.get_doccount();
124
    if (doccnt == 0) 
124
    if (doccnt == 0) 
125
  doccnt = 1;
125
        doccnt = 1;
126
126
127
    for (vector<string>::const_iterator qit = qterms.begin(); 
127
    for (vector<string>::const_iterator qit = qterms.begin(); 
128
   qit != qterms.end(); qit++) {
128
         qit != qterms.end(); qit++) {
129
  termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
129
        termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
130
  LOGABS("setDbWideQTermFreqs: ["  << (qit) << "] db freq "  << (termfreqs[*qit]) << "\n" );
130
        LOGABS("setDbWideQTermFreqs: [" << *qit << "] db freq " <<
131
               termfreqs[*qit] << "\n");
131
    }
132
    }
132
}
133
}
133
134
134
// Compute matched terms quality coefficients for a matched document by
135
// Compute matched terms quality coefficients for a matched document by
135
// retrieving the Within Document Frequencies and multiplying by
136
// retrieving the Within Document Frequencies and multiplying by
...
...
141
// common stem, which seems wrong, we group the terms by
142
// common stem, which seems wrong, we group the terms by
142
// root, compute a frequency for the group from the sum of member
143
// root, compute a frequency for the group from the sum of member
143
// occurrences, and let the frequency for each group member be the
144
// occurrences, and let the frequency for each group member be the
144
// aggregated frequency.
145
// aggregated frequency.
145
double Query::Native::qualityTerms(Xapian::docid docid, 
146
double Query::Native::qualityTerms(Xapian::docid docid, 
146
                 const vector<string>& terms,
147
                                   const vector<string>& terms,
147
                 multimap<double, vector<string> >& byQ)
148
                                   multimap<double, vector<string> >& byQ)
148
{
149
{
149
    LOGABS("qualityTerms\n" );
150
    LOGABS("qualityTerms\n");
150
    setDbWideQTermsFreqs();
151
    setDbWideQTermsFreqs();
151
152
152
    map<string, double> termQcoefs;
153
    map<string, double> termQcoefs;
153
    double totalweight = 0;
154
    double totalweight = 0;
154
155
155
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
156
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
156
    double doclen = xrdb.get_doclength(docid);
157
    double doclen = xrdb.get_doclength(docid);
157
    if (doclen == 0) 
158
    if (doclen == 0) 
158
  doclen = 1;
159
        doclen = 1;
159
    HighlightData hld;
160
    HighlightData hld;
160
    if (m_q->m_sd) {
161
    if (m_q->m_sd) {
161
  m_q->m_sd->getTerms(hld);
162
        m_q->m_sd->getTerms(hld);
162
    }
163
    }
163
164
164
#ifdef DEBUGABSTRACT
165
#ifdef DEBUGABSTRACT
165
    {
166
    {
166
  string deb;
167
        string deb;
167
  hld.toString(deb);
168
        hld.toString(deb);
168
  LOGABS("qualityTerms: hld: "  << (deb) << "\n" );
169
        LOGABS("qualityTerms: hld: " << deb << "\n");
169
    }
170
    }
170
#endif
171
#endif
171
172
172
    // Group the input terms by the user term they were possibly expanded from
173
    // Group the input terms by the user term they were possibly expanded from
173
    map<string, vector<string> > byRoot;
174
    map<string, vector<string> > byRoot;
174
    for (vector<string>::const_iterator qit = terms.begin(); 
175
    for (vector<string>::const_iterator qit = terms.begin(); 
175
   qit != terms.end(); qit++) {
176
         qit != terms.end(); qit++) {
176
  map<string, string>::const_iterator eit = hld.terms.find(*qit);
177
        map<string, string>::const_iterator eit = hld.terms.find(*qit);
177
  if (eit != hld.terms.end()) {
178
        if (eit != hld.terms.end()) {
178
      byRoot[eit->second].push_back(*qit);
179
            byRoot[eit->second].push_back(*qit);
179
  } else {
180
        } else {
180
      LOGDEB0("qualityTerms: ["  << ((*qit)) << "] not found in hld\n" );
181
            LOGDEB0("qualityTerms: [" << *qit << "] not found in hld\n");
181
      byRoot[*qit].push_back(*qit);
182
            byRoot[*qit].push_back(*qit);
182
  }
183
        }
183
    }
184
    }
184
185
185
#ifdef DEBUGABSTRACT
186
#ifdef DEBUGABSTRACT
186
    {
187
    {
187
  string byRootstr;
188
        string byRootstr;
188
  for (map<string, vector<string> >::const_iterator debit = 
189
        for (map<string, vector<string> >::const_iterator debit = 
189
       byRoot.begin();  debit != byRoot.end(); debit++) {
190
                 byRoot.begin();  debit != byRoot.end(); debit++) {
190
      byRootstr.append("[").append(debit->first).append("]->");
191
            byRootstr.append("[").append(debit->first).append("]->");
191
      for (vector<string>::const_iterator it = debit->second.begin();
192
            for (vector<string>::const_iterator it = debit->second.begin();
192
       it != debit->second.end(); it++) {
193
                 it != debit->second.end(); it++) {
193
      byRootstr.append("[").append(*it).append("] ");
194
                byRootstr.append("[").append(*it).append("] ");
194
      }
195
            }
195
      byRootstr.append("\n");
196
            byRootstr.append("\n");
196
  }
197
        }
197
  LOGABS("\nqualityTerms: uterms to terms: "  << (byRootstr) << "\n" );
198
        LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n");
198
    }
199
    }
199
#endif
200
#endif
200
201
201
    // Compute in-document and global frequencies for the groups.
202
    // Compute in-document and global frequencies for the groups.
202
    map<string, double> grpwdfs;
203
    map<string, double> grpwdfs;
203
    map<string, double> grptfreqs;
204
    map<string, double> grptfreqs;
204
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
205
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
205
   git != byRoot.end(); git++) {
206
         git != byRoot.end(); git++) {
206
  for (vector<string>::const_iterator qit = git->second.begin(); 
207
        for (vector<string>::const_iterator qit = git->second.begin(); 
207
       qit != git->second.end(); qit++) {
208
             qit != git->second.end(); qit++) {
208
      Xapian::TermIterator term = xrdb.termlist_begin(docid);
209
            Xapian::TermIterator term = xrdb.termlist_begin(docid);
209
      term.skip_to(*qit);
210
            term.skip_to(*qit);
210
      if (term != xrdb.termlist_end(docid) && *term == *qit) {
211
            if (term != xrdb.termlist_end(docid) && *term == *qit) {
211
      if (grpwdfs.find(git->first) != grpwdfs.end()) {
212
                if (grpwdfs.find(git->first) != grpwdfs.end()) {
212
          grpwdfs[git->first] = term.get_wdf() / doclen;
213
                    grpwdfs[git->first] = term.get_wdf() / doclen;
213
          grptfreqs[git->first] = termfreqs[*qit];
214
                    grptfreqs[git->first] = termfreqs[*qit];
214
      } else {
215
                } else {
215
          grpwdfs[git->first] += term.get_wdf() / doclen;
216
                    grpwdfs[git->first] += term.get_wdf() / doclen;
216
          grptfreqs[git->first] += termfreqs[*qit];
217
                    grptfreqs[git->first] += termfreqs[*qit];
217
      }
218
                }
218
      }
219
            }
219
  }    
220
        }    
220
    }
221
    }
221
222
222
    // Build a sorted by quality container for the groups
223
    // Build a sorted by quality container for the groups
223
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
224
    for (map<string, vector<string> >::const_iterator git = byRoot.begin();
224
   git != byRoot.end(); git++) {
225
         git != byRoot.end(); git++) {
225
  double q = (grpwdfs[git->first]) * grptfreqs[git->first];
226
        double q = (grpwdfs[git->first]) * grptfreqs[git->first];
226
  q = -log10(q);
227
        q = -log10(q);
227
  if (q < 3) {
228
        if (q < 3) {
228
      q = 0.05;
229
            q = 0.05;
229
  } else if (q < 4) {
230
        } else if (q < 4) {
230
      q = 0.3;
231
            q = 0.3;
231
  } else if (q < 5) {
232
        } else if (q < 5) {
232
      q = 0.7;
233
            q = 0.7;
233
  } else if (q < 6) {
234
        } else if (q < 6) {
234
      q = 0.8;
235
            q = 0.8;
235
  } else {
236
        } else {
236
      q = 1;
237
            q = 1;
237
  }
238
        }
238
  totalweight += q;
239
        totalweight += q;
239
  byQ.insert(pair<double, vector<string> >(q, git->second));
240
        byQ.insert(pair<double, vector<string> >(q, git->second));
240
    }
241
    }
241
242
242
#ifdef DEBUGABSTRACT
243
#ifdef DEBUGABSTRACT
243
    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); 
244
    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); 
244
   mit != byQ.rend(); mit++) {
245
         mit != byQ.rend(); mit++) {
245
  LOGABS("qualityTerms: group\n" );
246
        LOGABS("qualityTerms: group\n");
246
  for (vector<string>::const_iterator qit = mit->second.begin();
247
        for (vector<string>::const_iterator qit = mit->second.begin();
247
       qit != mit->second.end(); qit++) {
248
             qit != mit->second.end(); qit++) {
248
      LOGABS(""  << (mit->first) << "->["  << (qit) << "]\n" );
249
            LOGABS("" << mit->first << "->[" << *qit << "]\n");
249
  }
250
        }
250
    }
251
    }
251
#endif
252
#endif
252
    return totalweight;
253
    return totalweight;
253
}
254
}
254
255
255
// Return page number for first match of "significant" term.
256
// Return page number for first match of "significant" term.
256
int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
257
int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
257
{
258
{
258
    LOGDEB("Query::Native::getFirstMatchPage\n");
259
    LOGDEB("Query::Native::getFirstMatchPage\n");
259
    if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
260
    if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
260
  LOGERR("Query::getFirstMatchPage: no db\n" );
261
        LOGERR("Query::getFirstMatchPage: no db\n");
261
  return -1;
262
        return -1;
262
    }
263
    }
263
    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
264
    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
264
    Xapian::Database& xrdb(ndb->xrdb);
265
    Xapian::Database& xrdb(ndb->xrdb);
265
266
266
    vector<string> terms;
267
    vector<string> terms;
267
    getMatchTerms(docid, terms);
268
    getMatchTerms(docid, terms);
268
269
269
    if (terms.empty()) {
270
    if (terms.empty()) {
270
  LOGDEB("getFirstMatchPage: empty match term list (field match?)\n" );
271
        LOGDEB("getFirstMatchPage: empty match term list (field match?)\n");
271
  return -1;
272
        return -1;
272
    }
273
    }
273
274
274
    vector<int> pagepos;
275
    vector<int> pagepos;
275
    ndb->getPagePositions(docid, pagepos);
276
    ndb->getPagePositions(docid, pagepos);
276
    if (pagepos.empty())
277
    if (pagepos.empty())
277
  return -1;
278
        return -1;
278
  
279
        
279
    setDbWideQTermsFreqs();
280
    setDbWideQTermsFreqs();
280
281
281
    // We try to use a page which matches the "best" term. Get a sorted list
282
    // We try to use a page which matches the "best" term. Get a sorted list
282
    multimap<double, vector<string> > byQ;
283
    multimap<double, vector<string> > byQ;
283
    qualityTerms(docid, terms, byQ);
284
    qualityTerms(docid, terms, byQ);
284
285
285
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
286
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
286
   mit != byQ.rend(); mit++) {
287
         mit != byQ.rend(); mit++) {
287
  for (vector<string>::const_iterator qit = mit->second.begin();
288
        for (vector<string>::const_iterator qit = mit->second.begin();
288
       qit != mit->second.end(); qit++) {
289
             qit != mit->second.end(); qit++) {
289
      string qterm = *qit;
290
            string qterm = *qit;
290
      Xapian::PositionIterator pos;
291
            Xapian::PositionIterator pos;
291
      string emptys;
292
            string emptys;
292
      try {
293
            try {
293
      for (pos = xrdb.positionlist_begin(docid, qterm); 
294
                for (pos = xrdb.positionlist_begin(docid, qterm);
294
           pos != xrdb.positionlist_end(docid, qterm); pos++) {
295
                     pos != xrdb.positionlist_end(docid, qterm); pos++) {
295
          int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
296
                    int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
296
          if (pagenum > 0) {
297
                    if (pagenum > 0) {
297
          term = qterm;
298
                        term = qterm;
298
          return pagenum;
299
                        return pagenum;
299
          }
300
                    }
300
      }
301
                }
301
      } catch (...) {
302
            } catch (...) {
302
      // Term does not occur. No problem.
303
                // Term does not occur. No problem.
303
      }
304
            }
304
  }
305
        }
305
    }
306
    }
306
    return -1;
307
    return -1;
307
}
308
}
308
309
309
// Build a document abstract by extracting text chunks around the query terms
310
// Build a document abstract by extracting text chunks around the query terms
310
// This uses the db termlists, not the original document.
311
// This uses the db termlists, not the original document.
311
//
312
//
312
// DatabaseModified and other general exceptions are catched and
313
// DatabaseModified and other general exceptions are catched and
313
// possibly retried by our caller
314
// possibly retried by our caller
314
int Query::Native::makeAbstract(Xapian::docid docid,
315
int Query::Native::makeAbstract(Xapian::docid docid,
315
              vector<Snippet>& vabs, 
316
                                vector<Snippet>& vabs, 
316
              int imaxoccs, int ictxwords)
317
                                int imaxoccs, int ictxwords)
317
{
318
{
318
    Chrono chron;
319
    Chrono chron;
319
    LOGABS("makeAbstract: docid "  << (long(docid)) << " imaxoccs "  << (imaxoccs) << " ictxwords "  << (ictxwords) << "\n" );
320
    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
321
           imaxoccs << " ictxwords " << ictxwords << "\n");
320
322
321
    // The (unprefixed) terms matched by this document
323
    // The (unprefixed) terms matched by this document
322
    vector<string> matchedTerms;
324
    vector<string> matchedTerms;
323
    getMatchTerms(docid, matchedTerms);
325
    getMatchTerms(docid, matchedTerms);
324
    if (matchedTerms.empty()) {
326
    if (matchedTerms.empty()) {
325
  LOGDEB("makeAbstract::Empty term list\n" );
327
        LOGDEB("makeAbstract:"<<chron.millis()<<"mS:Empty term list\n");
326
  return ABSRES_ERROR;
328
        return ABSRES_ERROR;
327
    }
329
    }
328
330
329
    listList("Match terms: ", matchedTerms);
331
    listList("Match terms: ", matchedTerms);
330
332
331
    // Retrieve the term frequencies for the query terms. This is
333
    // Retrieve the term frequencies for the query terms. This is
...
...
337
    // going to try and show text around the less common search terms.
339
    // going to try and show text around the less common search terms.
338
    // Terms issued from an original one by stem expansion are
340
    // Terms issued from an original one by stem expansion are
339
    // aggregated by the qualityTerms() routine.
341
    // aggregated by the qualityTerms() routine.
340
    multimap<double, vector<string> > byQ;
342
    multimap<double, vector<string> > byQ;
341
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
343
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
342
    LOGABS("makeAbstract:"  << (chron.ms()) << ": computed Qcoefs.\n" );
344
    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
343
    // This can't happen, but would crash us
345
    // This can't happen, but would crash us
344
    if (totalweight == 0.0) {
346
    if (totalweight == 0.0) {
345
  LOGERR("makeAbstract: totalweight == 0.0 !\n" );
347
        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
346
  return ABSRES_ERROR;
348
        return ABSRES_ERROR;
347
    }
349
    }
348
350
349
    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
351
    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
350
    Xapian::Database& xrdb(ndb->xrdb);
352
    Xapian::Database& xrdb(ndb->xrdb);
351
353
...
...
372
    // average word size. It was a mistake to have the user max
374
    // average word size. It was a mistake to have the user max
373
    // abstract size parameter in characters, we basically only deal
375
    // abstract size parameter in characters, we basically only deal
374
    // with words. We used to limit the character size at the end, but
376
    // with words. We used to limit the character size at the end, but
375
    // this damaged our careful selection of terms
377
    // this damaged our careful selection of terms
376
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
378
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
377
  m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
379
        m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
378
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
380
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
379
    LOGABS("makeAbstract:"  << (chron.ms()) << ": mxttloccs "  << (maxtotaloccs) << " ctxwords "  << (ctxwords) << "\n" );
381
    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
382
           maxtotaloccs << " ctxwords " << ctxwords << "\n");
380
383
381
    int ret = ABSRES_OK;
384
    int ret = ABSRES_OK;
382
385
383
    // Let's go populate
386
    // Let's go populate
384
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
387
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
385
   mit != byQ.rend(); mit++) {
388
         mit != byQ.rend(); mit++) {
386
  unsigned int maxgrpoccs;
389
        unsigned int maxgrpoccs;
387
  double q;
390
        double q;
388
  if (byQ.size() == 1) {
391
        if (byQ.size() == 1) {
389
      maxgrpoccs = maxtotaloccs;
392
            maxgrpoccs = maxtotaloccs;
390
      q = 1.0;
393
            q = 1.0;
391
  } else {
394
        } else {
392
      // We give more slots to the better term groups
395
            // We give more slots to the better term groups
393
      q = mit->first / totalweight;
396
            q = mit->first / totalweight;
394
      maxgrpoccs = int(ceil(maxtotaloccs * q));
397
            maxgrpoccs = int(ceil(maxtotaloccs * q));
395
  }
398
        }
396
  unsigned int grpoccs = 0;
399
        unsigned int grpoccs = 0;
397
400
398
  for (vector<string>::const_iterator qit = mit->second.begin();
401
        for (vector<string>::const_iterator qit = mit->second.begin();
399
       qit != mit->second.end(); qit++) {
402
             qit != mit->second.end(); qit++) {
400
403
401
      // Group done ?
404
            // Group done ?
402
      if (grpoccs >= maxgrpoccs) 
405
            if (grpoccs >= maxgrpoccs) 
403
      break;
406
                break;
404
407
405
      string qterm = *qit;
408
            string qterm = *qit;
406
409
407
      LOGABS("makeAbstract: ["  << (qterm) << "] "  << (maxgrpoccs) << " max grp occs (coef "  << (q) << ")\n" );
410
            LOGABS("makeAbstract: [" << qterm << "] " << maxgrpoccs <<
411
                   " max grp occs (coef " << q << ")\n");
408
412
409
      // The match term may span several words
413
            // The match term may span several words
410
      int qtrmwrdcnt = 
414
            int qtrmwrdcnt = 
411
      TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
415
                TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
412
416
413
      Xapian::PositionIterator pos;
417
            Xapian::PositionIterator pos;
414
      // There may be query terms not in this doc. This raises an
418
            // There may be query terms not in this doc. This raises an
415
      // exception when requesting the position list, we catch it ??
419
            // exception when requesting the position list, we catch it ??
416
      // Not clear how this can happen because we are walking the
420
            // Not clear how this can happen because we are walking the
417
      // match list returned by Xapian. Maybe something with the
421
            // match list returned by Xapian. Maybe something with the
418
      // fields?
422
            // fields?
419
      string emptys;
423
            string emptys;
420
      try {
424
            try {
421
      for (pos = xrdb.positionlist_begin(docid, qterm); 
425
                for (pos = xrdb.positionlist_begin(docid, qterm);
422
           pos != xrdb.positionlist_end(docid, qterm); pos++) {
426
                     pos != xrdb.positionlist_end(docid, qterm); pos++) {
423
          int ipos = *pos;
427
                    int ipos = *pos;
424
          if (ipos < int(baseTextPosition)) // Not in text body
428
                    if (ipos < int(baseTextPosition)) // Not in text body
425
          continue;
429
                        continue;
426
          LOGABS("makeAbstract: ["  << (qterm) << "] at pos "  << (ipos) << " grpoccs "  << (grpoccs) << " maxgrpoccs "  << (maxgrpoccs) << "\n" );
430
                    LOGABS("makeAbstract: [" << qterm << "] at pos " <<
431
                           ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
432
                           maxgrpoccs << "\n");
427
433
428
          totaloccs++;
434
                    totaloccs++;
429
          grpoccs++;
435
                    grpoccs++;
430
436
431
          // Add adjacent slots to the set to populate at next
437
                    // Add adjacent slots to the set to populate at next
432
          // step by inserting empty strings. Special provisions
438
                    // step by inserting empty strings. Special provisions
433
          // for adding ellipsis and for positions overlapped by
439
                    // for adding ellipsis and for positions overlapped by
434
          // the match term.
440
                    // the match term.
435
          unsigned int sta = MAX(int(baseTextPosition), 
441
                    unsigned int sta = MAX(int(baseTextPosition), 
436
                     ipos - ctxwords);
442
                                           ipos - ctxwords);
437
          unsigned int sto = ipos + qtrmwrdcnt-1 + 
443
                    unsigned int sto = ipos + qtrmwrdcnt-1 + 
438
          m_q->m_db->getAbsCtxLen();
444
                        m_q->m_db->getAbsCtxLen();
439
          for (unsigned int ii = sta; ii <= sto;  ii++) {
445
                    for (unsigned int ii = sta; ii <= sto;  ii++) {
440
          if (ii == (unsigned int)ipos) {
446
                        if (ii == (unsigned int)ipos) {
441
              sparseDoc[ii] = qterm;
447
                            sparseDoc[ii] = qterm;
442
              searchTermPositions.insert(ii);
448
                            searchTermPositions.insert(ii);
443
              if (ii > maxpos)
449
                            if (ii > maxpos)
444
              maxpos = ii;
450
                                maxpos = ii;
445
          } else if (ii > (unsigned int)ipos && 
451
                        } else if (ii > (unsigned int)ipos && 
446
                 ii < (unsigned int)ipos + qtrmwrdcnt) {
452
                                   ii < (unsigned int)ipos + qtrmwrdcnt) {
447
              sparseDoc[ii] = occupiedmarker;
453
                            sparseDoc[ii] = occupiedmarker;
448
          } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
454
                        } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
449
              // For an empty slot, the test has a side
455
                            // For an empty slot, the test has a side
450
              // effect of inserting an empty string which
456
                            // effect of inserting an empty string which
451
              // is what we want.
457
                            // is what we want.
452
              sparseDoc[ii] = emptys;
458
                            sparseDoc[ii] = emptys;
453
          }
459
                        }
454
          }
460
                    }
455
          // Add ellipsis at the end. This may be replaced later by
461
                    // Add ellipsis at the end. This may be replaced later by
456
          // an overlapping extract. Take care not to replace an
462
                    // an overlapping extract. Take care not to replace an
457
          // empty string here, we really want an empty slot,
463
                    // empty string here, we really want an empty slot,
458
          // use find()
464
                    // use find()
459
          if (sparseDoc.find(sto+1) == sparseDoc.end()) {
465
                    if (sparseDoc.find(sto+1) == sparseDoc.end()) {
460
          sparseDoc[sto+1] = cstr_ellipsis;
466
                        sparseDoc[sto+1] = cstr_ellipsis;
461
          }
467
                    }
462
468
463
          // Group done ?
469
                    // Group done ?
464
          if (grpoccs >= maxgrpoccs) {
470
                    if (grpoccs >= maxgrpoccs) {
465
          ret |= ABSRES_TRUNC;
471
                        ret |= ABSRES_TRUNC;
466
          LOGABS("Db::makeAbstract: max group occs cutoff\n" );
472
                        LOGABS("Db::makeAbstract: max group occs cutoff\n");
467
          break;
473
                        break;
468
          }
474
                    }
469
          // Global done ?
475
                    // Global done ?
470
          if (totaloccs >= maxtotaloccs) {
476
                    if (totaloccs >= maxtotaloccs) {
471
          ret |= ABSRES_TRUNC;
477
                        ret |= ABSRES_TRUNC;
472
          LOGABS("Db::makeAbstract: max occurrences cutoff\n" );
478
                        LOGABS("Db::makeAbstract: max occurrences cutoff\n");
473
          break;
479
                        break;
474
          }
480
                    }
475
      }
481
                }
476
      } catch (...) {
482
            } catch (...) {
477
      // Term does not occur. No problem.
483
                // Term does not occur. No problem.
478
      }
484
            }
479
485
480
      if (totaloccs >= maxtotaloccs) {
486
            if (totaloccs >= maxtotaloccs) {
481
      ret |= ABSRES_TRUNC;
487
                ret |= ABSRES_TRUNC;
482
      LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );
488
                LOGABS("Db::makeAbstract: max1 occurrences cutoff\n");
483
      break;
489
                break;
484
      }
490
            }
485
  }
491
        }
486
    }
492
    }
487
    maxpos += ctxwords + 1;
493
    maxpos += ctxwords + 1;
488
494
489
    LOGABS("makeAbstract:"  << (chron.millis()) << ":chosen number of positions "  << (totaloccs) << "\n" );
495
    LOGABS("makeAbstract:" << chron.millis() <<
496
           "mS:chosen number of positions " << totaloccs << "\n");
490
    // This can happen if there are term occurences in the keywords
497
    // This can happen if there are term occurences in the keywords
491
    // etc. but not elsewhere ?
498
    // etc. but not elsewhere ?
492
    if (totaloccs == 0) {
499
    if (totaloccs == 0) {
493
  LOGDEB("makeAbstract: no occurrences\n" );
500
        LOGDEB("makeAbstract: no occurrences\n");
494
  return ABSRES_OK;
501
        return ABSRES_OK;
495
    }
502
    }
496
503
497
    // Walk all document's terms position lists and populate slots
504
    // Walk all document's terms position lists and populate slots
498
    // around the query terms. We arbitrarily truncate the list to
505
    // around the query terms. We arbitrarily truncate the list to
499
    // avoid taking forever. If we do cutoff, the abstract may be
506
    // avoid taking forever. If we do cutoff, the abstract may be
500
    // inconsistant (missing words, potentially altering meaning),
507
    // inconsistant (missing words, potentially altering meaning),
501
    // which is bad. 
508
    // which is bad. 
502
    { 
509
    { 
503
  Xapian::TermIterator term;
510
        Xapian::TermIterator term;
504
  int cutoff = m_q->m_snipMaxPosWalk;
511
        int cutoff = m_q->m_snipMaxPosWalk;
505
  for (term = xrdb.termlist_begin(docid);
512
        for (term = xrdb.termlist_begin(docid);
506
       term != xrdb.termlist_end(docid); term++) {
513
             term != xrdb.termlist_end(docid); term++) {
507
      // Ignore prefixed terms
514
            // Ignore prefixed terms
508
      if (has_prefix(*term))
515
            if (has_prefix(*term))
509
      continue;
516
                continue;
510
      if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
517
            if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
511
      ret |= ABSRES_TERMMISS;
518
                ret |= ABSRES_TERMMISS;
512
      LOGDEB0("makeAbstract: max term count cutoff "  << (m_q->m_snipMaxPosWalk) << "\n" );
519
                LOGDEB0("makeAbstract: max term count cutoff " <<
513
      break;
520
                        m_q->m_snipMaxPosWalk << "\n");
521
                break;
522
            }
523
524
            map<unsigned int, string>::iterator vit;
525
            Xapian::PositionIterator pos;
526
            for (pos = xrdb.positionlist_begin(docid, *term);
527
                 pos != xrdb.positionlist_end(docid, *term); pos++) {
528
                if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
529
                    ret |= ABSRES_TERMMISS;
530
                    LOGDEB0("makeAbstract: max term count cutoff " <<
531
                            m_q->m_snipMaxPosWalk << "\n");
532
                    break;
533
                }
534
                // If we are beyond the max possible position, stop
535
                // for this term
536
                if (*pos > maxpos) {
537
                    break;
538
                }
539
                if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
540
                    // Don't replace a term: the terms list is in
541
                    // alphabetic order, and we may have several terms
542
                    // at the same position, we want to keep only the
543
                    // first one (ie: dockes and dockes@wanadoo.fr)
544
                    if (vit->second.empty()) {
545
                        LOGDEB2("makeAbstract: populating: [" << *term <<
546
                                "] at " << *pos << "\n");
547
                        sparseDoc[*pos] = *term;
548
                    }
549
                }
550
            }
551
        }
514
      }
552
    }
515
553
    LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");
516
      map<unsigned int, string>::iterator vit;
517
      Xapian::PositionIterator pos;
518
      for (pos = xrdb.positionlist_begin(docid, *term); 
519
       pos != xrdb.positionlist_end(docid, *term); pos++) {
520
      if (m_q->m_snipMaxPosWalk > 0 && cutoff-- < 0) {
521
          ret |= ABSRES_TERMMISS;
522
          LOGDEB0("makeAbstract: max term count cutoff "  << (m_q->m_snipMaxPosWalk) << "\n" );
523
          break;
524
      }
525
      // If we are beyond the max possible position, stop
526
      // for this term
527
      if (*pos > maxpos) {
528
          break;
529
      }
530
      if ((vit = sparseDoc.find(*pos)) != sparseDoc.end()) {
531
          // Don't replace a term: the terms list is in
532
          // alphabetic order, and we may have several terms
533
          // at the same position, we want to keep only the
534
          // first one (ie: dockes and dockes@wanadoo.fr)
535
          if (vit->second.empty()) {
536
          LOGDEB2("makeAbstract: populating: ["  << ((*term)) << "] at "  << (*pos) << "\n" );
537
          sparseDoc[*pos] = *term;
538
          }
539
      }
540
      }
541
  }
542
    }
543
554
544
#if 0
555
#if 0
545
    // Debug only: output the full term[position] vector
556
    // Debug only: output the full term[position] vector
546
    bool epty = false;
557
    bool epty = false;
547
    int ipos = 0;
558
    int ipos = 0;
548
    for (map<unsigned int, string>::iterator it = sparseDoc.begin(); 
559
    for (map<unsigned int, string>::iterator it = sparseDoc.begin(); 
549
   it != sparseDoc.end();
560
         it != sparseDoc.end();
550
   it++, ipos++) {
561
         it++, ipos++) {
551
  if (it->empty()) {
562
        if (it->empty()) {
552
      if (!epty)
563
            if (!epty)
553
      LOGDEB("makeAbstract:vec["  << (ipos) << "]: ["  << (it) << "]\n" );
564
                LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
554
      epty=true;
565
            epty=true;
555
  } else {
566
        } else {
556
      epty = false;
567
            epty = false;
557
      LOGDEB("makeAbstract:vec["  << (ipos) << "]: ["  << (it) << "]\n" );
568
            LOGDEB("makeAbstract:vec[" << ipos << "]: [" << it << "]\n");
558
  }
569
        }
559
    }
570
    }
560
#endif
571
#endif
561
572
562
    vector<int> vpbreaks;
573
    vector<int> vpbreaks;
563
    ndb->getPagePositions(docid, vpbreaks);
574
    ndb->getPagePositions(docid, vpbreaks);
564
575
565
    LOGABS("makeAbstract:"  << (chron.millis()) << ": extracting. Got "  << (vpbreaks.size()) << " pages\n" );
576
    LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
577
           vpbreaks.size() << " pages\n");
566
    // Finally build the abstract by walking the map (in order of position)
578
    // Finally build the abstract by walking the map (in order of position)
567
    vabs.clear();
579
    vabs.clear();
568
    string chunk;
580
    string chunk;
569
    bool incjk = false;
581
    bool incjk = false;
570
    int page = 0;
582
    int page = 0;
571
    string term;
583
    string term;
572
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
584
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
573
   it != sparseDoc.end(); it++) {
585
         it != sparseDoc.end(); it++) {
574
  LOGDEB2("Abtract:output "  << (it->first) << " -> ["  << (it->second) << "]\n" );
586
        LOGDEB2("Abtract:output " << it->first << " -> [" << it->second <<
587
                "]\n");
575
  if (!occupiedmarker.compare(it->second)) {
588
        if (!occupiedmarker.compare(it->second)) {
576
      LOGDEB("Abstract: qtrm position not filled ??\n" );
589
            LOGDEB("Abstract: qtrm position not filled ??\n");
577
      continue;
590
            continue;
578
  }
591
        }
579
  if (chunk.empty() && !vpbreaks.empty()) {
592
        if (chunk.empty() && !vpbreaks.empty()) {
580
      page =  ndb->getPageNumberForPosition(vpbreaks, it->first);
593
            page =  ndb->getPageNumberForPosition(vpbreaks, it->first);
581
      if (page < 0) 
594
            if (page < 0) 
582
      page = 0;
595
                page = 0;
583
      term.clear();
596
            term.clear();
584
  }
597
        }
585
  Utf8Iter uit(it->second);
598
        Utf8Iter uit(it->second);
586
  bool newcjk = false;
599
        bool newcjk = false;
587
  if (TextSplit::isCJK(*uit))
600
        if (TextSplit::isCJK(*uit))
588
      newcjk = true;
601
            newcjk = true;
589
  if (!incjk || (incjk && !newcjk))
602
        if (!incjk || (incjk && !newcjk))
590
      chunk += " ";
603
            chunk += " ";
591
  incjk = newcjk;
604
        incjk = newcjk;
592
  if (searchTermPositions.find(it->first) != searchTermPositions.end())
605
        if (searchTermPositions.find(it->first) != searchTermPositions.end())
593
      term = it->second;
606
            term = it->second;
594
  if (it->second == cstr_ellipsis) {
607
        if (it->second == cstr_ellipsis) {
595
      vabs.push_back(Snippet(page, chunk).setTerm(term));
608
            vabs.push_back(Snippet(page, chunk).setTerm(term));
596
      chunk.clear();
609
            chunk.clear();
597
  } else {
610
        } else {
598
      if (it->second.compare(end_of_field_term) && 
611
            if (it->second.compare(end_of_field_term) && 
599
      it->second.compare(start_of_field_term))
612
                it->second.compare(start_of_field_term))
600
      chunk += it->second;
613
                chunk += it->second;
601
  }
614
        }
602
    }
615
    }
603
    if (!chunk.empty())
616
    if (!chunk.empty())
604
  vabs.push_back(Snippet(page, chunk).setTerm(term));
617
        vabs.push_back(Snippet(page, chunk).setTerm(term));
605
618
606
    LOGDEB2("makeAbtract: done in "  << (chron.millis()) << " mS\n" );
619
    LOGABS("makeAbtract: done in " << chron.millis() << " mS\n");
607
    return ret;
620
    return ret;
608
}
621
}
609
622
610
623
611
}
624
}
612
613
614