Switch to unified view

a/src/rcldb/rclabstract.cpp b/src/rcldb/rclabstract.cpp
...
...
125
    doccnt = 1;
125
    doccnt = 1;
126
126
127
    for (vector<string>::const_iterator qit = qterms.begin(); 
127
    for (vector<string>::const_iterator qit = qterms.begin(); 
128
     qit != qterms.end(); qit++) {
128
     qit != qterms.end(); qit++) {
129
    termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
129
    termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
130
  LOGABS(("setDbWideQTermFreqs: [%s] db freq %.1e\n", qit->c_str(), 
130
  LOGABS("setDbWideQTermFreqs: ["  << (qit) << "] db freq "  << (termfreqs[*qit]) << "\n" );
131
      termfreqs[*qit]));
132
    }
131
    }
133
}
132
}
134
133
135
// Compute matched terms quality coefficients for a matched document by
134
// Compute matched terms quality coefficients for a matched document by
136
// retrieving the Within Document Frequencies and multiplying by
135
// retrieving the Within Document Frequencies and multiplying by
...
...
145
// aggregated frequency.
144
// aggregated frequency.
146
double Query::Native::qualityTerms(Xapian::docid docid, 
145
double Query::Native::qualityTerms(Xapian::docid docid, 
147
                   const vector<string>& terms,
146
                   const vector<string>& terms,
148
                   multimap<double, vector<string> >& byQ)
147
                   multimap<double, vector<string> >& byQ)
149
{
148
{
150
    LOGABS(("qualityTerms\n"));
149
    LOGABS("qualityTerms\n" );
151
    setDbWideQTermsFreqs();
150
    setDbWideQTermsFreqs();
152
151
153
    map<string, double> termQcoefs;
152
    map<string, double> termQcoefs;
154
    double totalweight = 0;
153
    double totalweight = 0;
155
154
...
...
164
163
165
#ifdef DEBUGABSTRACT
164
#ifdef DEBUGABSTRACT
166
    {
165
    {
167
    string deb;
166
    string deb;
168
    hld.toString(deb);
167
    hld.toString(deb);
169
  LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));
168
  LOGABS("qualityTerms: hld: "  << (deb) << "\n" );
170
    }
169
    }
171
#endif
170
#endif
172
171
173
    // Group the input terms by the user term they were possibly expanded from
172
    // Group the input terms by the user term they were possibly expanded from
174
    map<string, vector<string> > byRoot;
173
    map<string, vector<string> > byRoot;
...
...
193
         it != debit->second.end(); it++) {
192
         it != debit->second.end(); it++) {
194
        byRootstr.append("[").append(*it).append("] ");
193
        byRootstr.append("[").append(*it).append("] ");
195
        }
194
        }
196
        byRootstr.append("\n");
195
        byRootstr.append("\n");
197
    }
196
    }
198
    LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));
197
    LOGABS("\nqualityTerms: uterms to terms: "  << (byRootstr) << "\n" );
199
    }
198
    }
200
#endif
199
#endif
201
200
202
    // Compute in-document and global frequencies for the groups.
201
    // Compute in-document and global frequencies for the groups.
203
    map<string, double> grpwdfs;
202
    map<string, double> grpwdfs;
...
...
241
    }
240
    }
242
241
243
#ifdef DEBUGABSTRACT
242
#ifdef DEBUGABSTRACT
244
    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); 
243
    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); 
245
     mit != byQ.rend(); mit++) {
244
     mit != byQ.rend(); mit++) {
246
    LOGABS(("qualityTerms: group\n"));
245
    LOGABS("qualityTerms: group\n" );
247
    for (vector<string>::const_iterator qit = mit->second.begin();
246
    for (vector<string>::const_iterator qit = mit->second.begin();
248
         qit != mit->second.end(); qit++) {
247
         qit != mit->second.end(); qit++) {
249
      LOGABS(("%.1e->[%s]\n", mit->first, qit->c_str()));
248
      LOGABS(""  << (mit->first) << "->["  << (qit) << "]\n" );
250
    }
249
    }
251
    }
250
    }
252
#endif
251
#endif
253
    return totalweight;
252
    return totalweight;
254
}
253
}
...
...
313
int Query::Native::makeAbstract(Xapian::docid docid,
312
int Query::Native::makeAbstract(Xapian::docid docid,
314
                vector<Snippet>& vabs, 
313
                vector<Snippet>& vabs, 
315
                int imaxoccs, int ictxwords)
314
                int imaxoccs, int ictxwords)
316
{
315
{
317
    Chrono chron;
316
    Chrono chron;
318
    LOGABS(("makeAbstract: docid %ld imaxoccs %d ictxwords %d\n", 
317
    LOGABS("makeAbstract: docid "  << (long(docid)) << " imaxoccs "  << (imaxoccs) << " ictxwords "  << (ictxwords) << "\n" );
319
      long(docid), imaxoccs, ictxwords));
320
318
321
    // The (unprefixed) terms matched by this document
319
    // The (unprefixed) terms matched by this document
322
    vector<string> matchedTerms;
320
    vector<string> matchedTerms;
323
    getMatchTerms(docid, matchedTerms);
321
    getMatchTerms(docid, matchedTerms);
324
    if (matchedTerms.empty()) {
322
    if (matchedTerms.empty()) {
...
...
337
    // going to try and show text around the less common search terms.
335
    // going to try and show text around the less common search terms.
338
    // Terms issued from an original one by stem expansion are
336
    // Terms issued from an original one by stem expansion are
339
    // aggregated by the qualityTerms() routine.
337
    // aggregated by the qualityTerms() routine.
340
    multimap<double, vector<string> > byQ;
338
    multimap<double, vector<string> > byQ;
341
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
339
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
342
    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
340
    LOGABS("makeAbstract:"  << (chron.ms()) << ": computed Qcoefs.\n" );
343
    // This can't happen, but would crash us
341
    // This can't happen, but would crash us
344
    if (totalweight == 0.0) {
342
    if (totalweight == 0.0) {
345
    LOGERR("makeAbstract: totalweight == 0.0 !\n" );
343
    LOGERR("makeAbstract: totalweight == 0.0 !\n" );
346
    return ABSRES_ERROR;
344
    return ABSRES_ERROR;
347
    }
345
    }
...
...
374
    // with words. We used to limit the character size at the end, but
372
    // with words. We used to limit the character size at the end, but
375
    // this damaged our careful selection of terms
373
    // this damaged our careful selection of terms
376
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
374
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
377
    m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
375
    m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
378
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
376
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
379
    LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n", 
377
    LOGABS("makeAbstract:"  << (chron.ms()) << ": mxttloccs "  << (maxtotaloccs) << " ctxwords "  << (ctxwords) << "\n" );
380
      chron.ms(), maxtotaloccs, ctxwords));
381
378
382
    int ret = ABSRES_OK;
379
    int ret = ABSRES_OK;
383
380
384
    // Let's go populate
381
    // Let's go populate
385
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
382
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
...
...
403
        if (grpoccs >= maxgrpoccs) 
400
        if (grpoccs >= maxgrpoccs) 
404
        break;
401
        break;
405
402
406
        string qterm = *qit;
403
        string qterm = *qit;
407
404
408
      LOGABS(("makeAbstract: [%s] %d max grp occs (coef %.2f)\n", 
405
      LOGABS("makeAbstract: ["  << (qterm) << "] "  << (maxgrpoccs) << " max grp occs (coef "  << (q) << ")\n" );
409
          qterm.c_str(), maxgrpoccs, q));
410
406
411
        // The match term may span several words
407
        // The match term may span several words
412
        int qtrmwrdcnt = 
408
        int qtrmwrdcnt = 
413
        TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
409
        TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
414
410
...
...
423
        for (pos = xrdb.positionlist_begin(docid, qterm); 
419
        for (pos = xrdb.positionlist_begin(docid, qterm); 
424
             pos != xrdb.positionlist_end(docid, qterm); pos++) {
420
             pos != xrdb.positionlist_end(docid, qterm); pos++) {
425
            int ipos = *pos;
421
            int ipos = *pos;
426
            if (ipos < int(baseTextPosition)) // Not in text body
422
            if (ipos < int(baseTextPosition)) // Not in text body
427
            continue;
423
            continue;
428
          LOGABS(("makeAbstract: [%s] at pos %d grpoccs %d maxgrpoccs"
424
          LOGABS("makeAbstract: ["  << (qterm) << "] at pos "  << (ipos) << " grpoccs "  << (grpoccs) << " maxgrpoccs "  << (maxgrpoccs) << "\n" );
429
              " %d\n", qterm.c_str(), ipos, grpoccs, maxgrpoccs));
430
425
431
            totaloccs++;
426
            totaloccs++;
432
            grpoccs++;
427
            grpoccs++;
433
428
434
            // Add adjacent slots to the set to populate at next
429
            // Add adjacent slots to the set to populate at next
...
...
464
            }
459
            }
465
460
466
            // Group done ?
461
            // Group done ?
467
            if (grpoccs >= maxgrpoccs) {
462
            if (grpoccs >= maxgrpoccs) {
468
            ret |= ABSRES_TRUNC;
463
            ret |= ABSRES_TRUNC;
469
            LOGABS(("Db::makeAbstract: max group occs cutoff\n"));
464
            LOGABS("Db::makeAbstract: max group occs cutoff\n" );
470
            break;
465
            break;
471
            }
466
            }
472
            // Global done ?
467
            // Global done ?
473
            if (totaloccs >= maxtotaloccs) {
468
            if (totaloccs >= maxtotaloccs) {
474
            ret |= ABSRES_TRUNC;
469
            ret |= ABSRES_TRUNC;
475
            LOGABS(("Db::makeAbstract: max occurrences cutoff\n"));
470
            LOGABS("Db::makeAbstract: max occurrences cutoff\n" );
476
            break;
471
            break;
477
            }
472
            }
478
        }
473
        }
479
        } catch (...) {
474
        } catch (...) {
480
        // Term does not occur. No problem.
475
        // Term does not occur. No problem.
481
        }
476
        }
482
477
483
        if (totaloccs >= maxtotaloccs) {
478
        if (totaloccs >= maxtotaloccs) {
484
        ret |= ABSRES_TRUNC;
479
        ret |= ABSRES_TRUNC;
485
        LOGABS(("Db::makeAbstract: max1 occurrences cutoff\n"));
480
        LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );
486
        break;
481
        break;
487
        }
482
        }
488
    }
483
    }
489
    }
484
    }
490
    maxpos += ctxwords + 1;
485
    maxpos += ctxwords + 1;
491
486
492
    LOGABS(("makeAbstract:%d:chosen number of positions %d\n", 
487
    LOGABS("makeAbstract:"  << (chron.millis()) << ":chosen number of positions "  << (totaloccs) << "\n" );
493
      chron.millis(), totaloccs));
494
    // This can happen if there are term occurrences in the keywords
488
    // This can happen if there are term occurrences in the keywords
495
    // etc. but not elsewhere ?
489
    // etc. but not elsewhere ?
496
    if (totaloccs == 0) {
490
    if (totaloccs == 0) {
497
    LOGDEB("makeAbstract: no occurrences\n" );
491
    LOGDEB("makeAbstract: no occurrences\n" );
498
    return ABSRES_OK;
492
    return ABSRES_OK;
...
...
564
#endif
558
#endif
565
559
566
    vector<int> vpbreaks;
560
    vector<int> vpbreaks;
567
    ndb->getPagePositions(docid, vpbreaks);
561
    ndb->getPagePositions(docid, vpbreaks);
568
562
569
    LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(),
563
    LOGABS("makeAbstract:"  << (chron.millis()) << ": extracting. Got "  << (vpbreaks.size()) << " pages\n" );
570
      vpbreaks.size()));
571
    // Finally build the abstract by walking the map (in order of position)
564
    // Finally build the abstract by walking the map (in order of position)
572
    vabs.clear();
565
    vabs.clear();
573
    string chunk;
566
    string chunk;
574
    bool incjk = false;
567
    bool incjk = false;
575
    int page = 0;
568
    int page = 0;
...
...
613
}
606
}
614
607
615
608
616
}
609
}
617
610
611
612