Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
...
...
242
    doccnt = 1;
242
    doccnt = 1;
243
243
244
    for (vector<string>::const_iterator qit = qterms.begin(); 
244
    for (vector<string>::const_iterator qit = qterms.begin(); 
245
     qit != qterms.end(); qit++) {
245
     qit != qterms.end(); qit++) {
246
    query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
246
    query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
247
    LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), 
247
    LOGABS(("set..QTermFreqs: [%s] db freq %.1e\n", qit->c_str(), 
248
        query->m_nq->termfreqs[*qit]));
248
        query->m_nq->termfreqs[*qit]));
249
    }
249
    }
250
}
250
}
251
251
252
// Compute query terms quality coefficients for a matched document by
252
// Compute query terms quality coefficients for a matched document by
...
...
296
    if (termQcoefs.find(*qit) != termQcoefs.end())
296
    if (termQcoefs.find(*qit) != termQcoefs.end())
297
        byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
297
        byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
298
    }
298
    }
299
299
300
#ifdef DEBUGABSTRACT
300
#ifdef DEBUGABSTRACT
301
    LOGDEB(("Db::qualityTerms:\n"));
301
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
302
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
302
     qit != byQ.rend(); qit++) {
303
     qit != byQ.rend(); qit++) {
303
    LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
304
    LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
304
    }
305
    }
305
#endif
306
#endif
...
...
413
// Build a document abstract by extracting text chunks around the query terms
414
// Build a document abstract by extracting text chunks around the query terms
414
// This uses the db termlists, not the original document.
415
// This uses the db termlists, not the original document.
415
//
416
//
416
// DatabaseModified and other general exceptions are caught and
417
// DatabaseModified and other general exceptions are caught and
417
// possibly retried by our caller
418
// possibly retried by our caller
418
bool Db::Native::makeAbstract(Xapian::docid docid, Query *query, 
419
abstract_result Db::Native::makeAbstract(Xapian::docid docid, Query *query, 
419
                  vector<pair<int, string> >& vabs)
420
                   vector<pair<int, string> >& vabs, 
421
                   int imaxoccs, int ictxwords)
420
{
422
{
421
    Chrono chron;
423
    Chrono chron;
422
    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
424
    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d imaxoccs %d\n", chron.ms(),
423
         m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
425
         m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen, imaxoccs));
424
426
425
    // The (unprefixed) terms matched by this document
427
    // The (unprefixed) terms matched by this document
426
    vector<string> matchedTerms;
428
    vector<string> matchedTerms;
427
    {
429
    {
428
        vector<string> iterms;
430
        vector<string> iterms;
429
        query->getMatchTerms(docid, iterms);
431
        query->getMatchTerms(docid, iterms);
430
        noPrefixList(iterms, matchedTerms);
432
        noPrefixList(iterms, matchedTerms);
431
        if (matchedTerms.empty()) {
433
        if (matchedTerms.empty()) {
432
            LOGDEB(("makeAbstract::Empty term list\n"));
434
            LOGDEB(("makeAbstract::Empty term list\n"));
433
            return false;
435
            return ABSRES_ERROR;
434
        }
436
        }
435
    }
437
    }
436
    listList("Match terms: ", matchedTerms);
438
    listList("Match terms: ", matchedTerms);
437
439
438
    // Retrieve the term frequencies for the query terms. This is
440
    // Retrieve the term frequencies for the query terms. This is
...
...
451
    double totalweight = qualityTerms(docid, query, matchedTerms, byQ);
453
    double totalweight = qualityTerms(docid, query, matchedTerms, byQ);
452
    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
454
    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
453
    // This can't happen, but would crash us
455
    // This can't happen, but would crash us
454
    if (totalweight == 0.0) {
456
    if (totalweight == 0.0) {
455
    LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
457
    LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
456
  return false;
458
  return ABSRES_ERROR;
457
    }
459
    }
458
460
459
    ///////////////////
461
    ///////////////////
460
    // For each of the query terms, ask xapian for its positions list
462
    // For each of the query terms, ask xapian for its positions list
461
    // in the document. For each position entry, remember it in
463
    // in the document. For each position entry, remember it in
...
...
472
    // Limit the total number of slots we populate. The 7 is taken as
474
    // Limit the total number of slots we populate. The 7 is taken as
473
    // average word size. It was a mistake to have the user max
475
    // average word size. It was a mistake to have the user max
474
    // abstract size parameter in characters, we basically only deal
476
    // abstract size parameter in characters, we basically only deal
475
    // with words. We used to limit the character size at the end, but
477
    // with words. We used to limit the character size at the end, but
476
    // this damaged our careful selection of terms
478
    // this damaged our careful selection of terms
477
    const unsigned int maxtotaloccs = 
479
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
478
    m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
480
    m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
479
    LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
481
    int ctxwords = ictxwords == -1 ? m_rcldb->m_synthAbsWordCtxLen : ictxwords;
482
    LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n", 
483
      chron.ms(), maxtotaloccs, ctxwords));
480
484
481
    // This is used to mark positions overlapped by a multi-word match term
485
    // This is used to mark positions overlapped by a multi-word match term
482
    const string occupiedmarker("?");
486
    const string occupiedmarker("?");
487
488
    abstract_result ret = ABSRES_OK;
483
489
484
    // Let's go populate
490
    // Let's go populate
485
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
491
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
486
     qit != byQ.rend(); qit++) {
492
     qit != byQ.rend(); qit++) {
487
    string qterm = qit->second;
493
    string qterm = qit->second;
...
...
520
526
521
        // Add adjacent slots to the set to populate at next
527
        // Add adjacent slots to the set to populate at next
522
        // step by inserting empty strings. Special provisions
528
        // step by inserting empty strings. Special provisions
523
        // for adding ellipsis and for positions overlapped by
529
        // for adding ellipsis and for positions overlapped by
524
        // the match term.
530
        // the match term.
525
      unsigned int sta = MAX(0, ipos-m_rcldb->m_synthAbsWordCtxLen);
531
      unsigned int sta = MAX(0, ipos - ctxwords);
526
        unsigned int sto = ipos + qtrmwrdcnt-1 + 
532
        unsigned int sto = ipos + qtrmwrdcnt-1 + 
527
            m_rcldb->m_synthAbsWordCtxLen;
533
            m_rcldb->m_synthAbsWordCtxLen;
528
        for (unsigned int ii = sta; ii <= sto;  ii++) {
534
        for (unsigned int ii = sta; ii <= sto;  ii++) {
529
            if (ii == (unsigned int)ipos) {
535
            if (ii == (unsigned int)ipos) {
530
            sparseDoc[ii] = qterm;
536
            sparseDoc[ii] = qterm;
...
...
546
            sparseDoc[sto+1] = cstr_ellipsis;
552
            sparseDoc[sto+1] = cstr_ellipsis;
547
        }
553
        }
548
554
549
        // Limit to allocated occurrences and total size
555
        // Limit to allocated occurrences and total size
550
        if (++occurrences >= maxoccs || 
556
        if (++occurrences >= maxoccs || 
551
            totaloccs >= maxtotaloccs)
557
            totaloccs >= maxtotaloccs) {
558
          ret = ABSRES_TRUNC;
559
          LOGDEB(("Db::makeAbstract: max occurrences cutoff\n"));
552
            break;
560
            break;
561
      }
553
        }
562
        }
554
    } catch (...) {
563
    } catch (...) {
555
        // Term does not occur. No problem.
564
        // Term does not occur. No problem.
556
    }
565
    }
557
    if (totaloccs >= maxtotaloccs)
566
    if (totaloccs >= maxtotaloccs) {
567
      ret = ABSRES_TRUNC;
568
      LOGDEB(("Db::makeAbstract: max1 occurrences cutoff\n"));
558
        break;
569
        break;
570
  }
559
    }
571
    }
560
    LOGABS(("makeAbstract:%d:chosen number of positions %d\n", 
572
    LOGABS(("makeAbstract:%d:chosen number of positions %d\n", 
561
        chron.millis(), totaloccs));
573
        chron.millis(), totaloccs));
562
574
563
    // This can happen if there are term occurrences in the keywords
575
    // This can happen if there are term occurrences in the keywords
564
    // etc. but not elsewhere ?
576
    // etc. but not elsewhere ?
565
    if (totaloccs == 0) {
577
    if (totaloccs == 0) {
566
    LOGDEB1(("makeAbstract: no occurrences\n"));
578
    LOGDEB1(("makeAbstract: no occurrences\n"));
567
  return false;
579
  return ABSRES_ERROR;
568
    }
580
    }
569
581
570
    // Walk all document's terms position lists and populate slots
582
    // Walk all document's terms position lists and populate slots
571
    // around the query terms. We arbitrarily truncate the list to
583
    // around the query terms. We arbitrarily truncate the list to
572
    // avoid taking forever. If we do cutoff, the abstract may be
584
    // avoid taking forever. If we do cutoff, the abstract may be
...
...
580
         term != xrdb.termlist_end(docid); term++) {
592
         term != xrdb.termlist_end(docid); term++) {
581
        // Ignore prefixed terms
593
        // Ignore prefixed terms
582
        if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
594
        if ('A' <= (*term).at(0) && (*term).at(0) <= 'Z')
583
        continue;
595
        continue;
584
        if (cutoff-- < 0) {
596
        if (cutoff-- < 0) {
597
      ret = ABSRES_TRUNC;
585
        LOGDEB0(("makeAbstract: max term count cutoff\n"));
598
        LOGDEB0(("makeAbstract: max term count cutoff\n"));
586
        break;
599
        break;
587
        }
600
        }
588
601
589
        Xapian::PositionIterator pos;
602
        Xapian::PositionIterator pos;
590
        for (pos = xrdb.positionlist_begin(docid, *term); 
603
        for (pos = xrdb.positionlist_begin(docid, *term); 
591
         pos != xrdb.positionlist_end(docid, *term); pos++) {
604
         pos != xrdb.positionlist_end(docid, *term); pos++) {
592
        if (cutoff-- < 0) {
605
        if (cutoff-- < 0) {
606
          ret = ABSRES_TRUNC;
593
            LOGDEB0(("makeAbstract: max term count cutoff\n"));
607
            LOGDEB0(("makeAbstract: max term count cutoff\n"));
594
            break;
608
            break;
595
        }
609
        }
596
        map<unsigned int, string>::iterator vit;
610
        map<unsigned int, string>::iterator vit;
597
        if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
611
        if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
598
            // Don't replace a term: the terms list is in
612
            // Don't replace a term: the terms list is in
599
            // alphabetic order, and we may have several terms
613
            // alphabetic order, and we may have several terms
600
            // at the same position, we want to keep only the
614
            // at the same position, we want to keep only the
601
            // first one (ie: dockes and dockes@wanadoo.fr)
615
            // first one (ie: dockes and dockes@wanadoo.fr)
602
            if (vit->second.empty()) {
616
            if (vit->second.empty()) {
603
            LOGABS(("makeAbstract: populating: [%s] at %d\n", 
617
            LOGDEB2(("makeAbstract: populating: [%s] at %d\n", 
604
                (*term).c_str(), *pos));
618
                 (*term).c_str(), *pos));
605
            sparseDoc[*pos] = *term;
619
            sparseDoc[*pos] = *term;
606
            }
620
            }
607
        }
621
        }
608
        }
622
        }
609
    }
623
    }
...
...
663
    }
677
    }
664
    if (!chunk.empty())
678
    if (!chunk.empty())
665
    vabs.push_back(pair<int, string>(page, chunk));
679
    vabs.push_back(pair<int, string>(page, chunk));
666
680
667
    LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
681
    LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
668
    return true;
682
    return ret;
669
}
683
}
670
684
671
/* Rcl::Db methods ///////////////////////////////// */
685
/* Rcl::Db methods ///////////////////////////////// */
672
686
673
bool Db::o_inPlaceReset;
687
bool Db::o_inPlaceReset;
...
...
2117
    return false;
2131
    return false;
2118
    }
2132
    }
2119
    return true;
2133
    return true;
2120
}
2134
}
2121
2135
2122
bool Db::makeDocAbstract(Doc &doc, Query *query, 
2136
abstract_result Db::makeDocAbstract(Doc &doc, Query *query, 
2123
             vector<pair<int, string> >& abstract)
2137
                  vector<pair<int, string> >& abstract, 
2138
                  int maxoccs, int ctxwords)
2124
{
2139
{
2140
    LOGDEB(("makeDocAbstract: maxoccs %d ctxwords %d\n", maxoccs, ctxwords));
2125
    if (!m_ndb || !m_ndb->m_isopen) {
2141
    if (!m_ndb || !m_ndb->m_isopen) {
2126
    LOGERR(("Db::makeDocAbstract: no db\n"));
2142
    LOGERR(("Db::makeDocAbstract: no db\n"));
2127
  return false;
2143
  return ABSRES_ERROR;
2128
    }
2144
    }
2129
    bool ret = false;
2145
    abstract_result ret = ABSRES_ERROR;
2130
    XAPTRY(ret = m_ndb->makeAbstract(doc.xdocid, query, abstract),
2146
    XAPTRY(ret = m_ndb->makeAbstract(doc.xdocid, query, abstract, 
2147
                   maxoccs, ctxwords),
2131
           m_ndb->xrdb, m_reason);
2148
           m_ndb->xrdb, m_reason);
2132
    return (ret && m_reason.empty()) ? true : false;
2149
    if (!m_reason.empty())
2150
  return ABSRES_ERROR;
2151
    return ret;
2133
}
2152
}
2134
2153
2135
bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract)
2154
bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract)
2136
{
2155
{
2137
    if (!m_ndb || !m_ndb->m_isopen) {
2156
    if (!m_ndb || !m_ndb->m_isopen) {