recoll / Code / Diff of /src/rcldb/rclabstract.cpp

Diff of /src/rcldb/rclabstract.cpp [5af2d7] .. [8f77b9]

Switch to unified view


...
    doccnt = 1;

    for (vector<string>::const_iterator qit = qterms.begin(); 
     qit != qterms.end(); qit++) {
    termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
  LOGABS("setDbWideQTermFreqs: ["  << (qit) << "] db freq "  << (termfreqs[*qit]) << "\n" );

    }
}

// Compute matched terms quality coefficients for a matched document by
// retrieving the Within Document Frequencies and multiplying by
...
// aggregated frequency.
double Query::Native::qualityTerms(Xapian::docid docid, 
                   const vector<string>& terms,
                   multimap<double, vector<string> >& byQ)
{
    LOGABS("qualityTerms\n" );
    setDbWideQTermsFreqs();

    map<string, double> termQcoefs;
    double totalweight = 0;

...

#ifdef DEBUGABSTRACT
    {
    string deb;
    hld.toString(deb);
  LOGABS("qualityTerms: hld: "  << (deb) << "\n" );
    }
#endif

    // Group the input terms by the user term they were possibly expanded from
    map<string, vector<string> > byRoot;
...
         it != debit->second.end(); it++) {
        byRootstr.append("[").append(*it).append("] ");
        }
        byRootstr.append("\n");
    }
    LOGABS("\nqualityTerms: uterms to terms: "  << (byRootstr) << "\n" );
    }
#endif

    // Compute in-document and global frequencies for the groups.
    map<string, double> grpwdfs;
...
    }

#ifdef DEBUGABSTRACT
    for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin(); 
     mit != byQ.rend(); mit++) {
    LOGABS("qualityTerms: group\n" );
    for (vector<string>::const_iterator qit = mit->second.begin();
         qit != mit->second.end(); qit++) {
      LOGABS(""  << (mit->first) << "->["  << (qit) << "]\n" );
    }
    }
#endif
    return totalweight;
}
...
int Query::Native::makeAbstract(Xapian::docid docid,
                vector<Snippet>& vabs, 
                int imaxoccs, int ictxwords)
{
    Chrono chron;
    LOGABS("makeAbstract: docid "  << (long(docid)) << " imaxoccs "  << (imaxoccs) << " ictxwords "  << (ictxwords) << "\n" );


    // The (unprefixed) terms matched by this document
    vector<string> matchedTerms;
    getMatchTerms(docid, matchedTerms);
    if (matchedTerms.empty()) {
...
    // going to try and show text around the less common search terms.
    // Terms issued from an original one by stem expansion are
    // aggregated by the qualityTerms() routine.
    multimap<double, vector<string> > byQ;
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
    LOGABS("makeAbstract:"  << (chron.ms()) << ": computed Qcoefs.\n" );
    // This can't happen, but would crash us
    if (totalweight == 0.0) {
    LOGERR("makeAbstract: totalweight == 0.0 !\n" );
    return ABSRES_ERROR;
    }
...
    // with words. We used to limit the character size at the end, but
    // this damaged our careful selection of terms
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
    m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
    LOGABS("makeAbstract:"  << (chron.ms()) << ": mxttloccs "  << (maxtotaloccs) << " ctxwords "  << (ctxwords) << "\n" );


    int ret = ABSRES_OK;

    // Let's go populate
    for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin(); 
...
        if (grpoccs >= maxgrpoccs) 
        break;

        string qterm = *qit;

      LOGABS("makeAbstract: ["  << (qterm) << "] "  << (maxgrpoccs) << " max grp occs (coef "  << (q) << ")\n" );


        // The match term may span several words
        int qtrmwrdcnt = 
        TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);

...
        for (pos = xrdb.positionlist_begin(docid, qterm); 
             pos != xrdb.positionlist_end(docid, qterm); pos++) {
            int ipos = *pos;
            if (ipos < int(baseTextPosition)) // Not in text body
            continue;
          LOGABS("makeAbstract: ["  << (qterm) << "] at pos "  << (ipos) << " grpoccs "  << (grpoccs) << " maxgrpoccs "  << (maxgrpoccs) << "\n" );


            totaloccs++;
            grpoccs++;

            // Add adjacent slots to the set to populate at next
...
            }

            // Group done ?
            if (grpoccs >= maxgrpoccs) {
            ret |= ABSRES_TRUNC;
            LOGABS("Db::makeAbstract: max group occs cutoff\n" );
            break;
            }
            // Global done ?
            if (totaloccs >= maxtotaloccs) {
            ret |= ABSRES_TRUNC;
            LOGABS("Db::makeAbstract: max occurrences cutoff\n" );
            break;
            }
        }
        } catch (...) {
        // Term does not occur. No problem.
        }

        if (totaloccs >= maxtotaloccs) {
        ret |= ABSRES_TRUNC;
        LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );
        break;
        }
    }
    }
    maxpos += ctxwords + 1;

    LOGABS("makeAbstract:"  << (chron.millis()) << ":chosen number of positions "  << (totaloccs) << "\n" );

    // This can happen if there are term occurrences in the keywords
    // etc. but not elsewhere ?
    if (totaloccs == 0) {
    LOGDEB("makeAbstract: no occurrences\n" );
    return ABSRES_OK;
...
#endif

    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);

    LOGABS("makeAbstract:"  << (chron.millis()) << ": extracting. Got "  << (vpbreaks.size()) << " pages\n" );

    // Finally build the abstract by walking the map (in order of position)
    vabs.clear();
    string chunk;
    bool incjk = false;
    int page = 0;
...
}


}




	a/src/rcldb/rclabstract.cpp		b/src/rcldb/rclabstract.cpp
	...		...
125	doccnt = 1;	125	doccnt = 1;
126		126
127	for (vector<string>::const_iterator qit = qterms.begin();	127	for (vector<string>::const_iterator qit = qterms.begin();
128	qit != qterms.end(); qit++) {	128	qit != qterms.end(); qit++) {
129	termfreqs[qit] = xrdb.get_termfreq(qit) / doccnt;	129	termfreqs[qit] = xrdb.get_termfreq(qit) / doccnt;
130	LOGABS(("setDbWideQTermFreqs: [%s] db freq %.1e\n", qit->c_str(),	130	LOGABS("setDbWideQTermFreqs: [" << (qit) << "] db freq " << (termfreqs[*qit]) << "\n" );
131	termfreqs[*qit]));
132	}	131	}
133	}	132	}
134		133
135	// Compute matched terms quality coefficients for a matched document by	134	// Compute matched terms quality coefficients for a matched document by
136	// retrieving the Within Document Frequencies and multiplying by	135	// retrieving the Within Document Frequencies and multiplying by
	...		...
145	// aggregated frequency.	144	// aggregated frequency.
146	double Query::Native::qualityTerms(Xapian::docid docid,	145	double Query::Native::qualityTerms(Xapian::docid docid,
147	const vector<string>& terms,	146	const vector<string>& terms,
148	multimap<double, vector<string> >& byQ)	147	multimap<double, vector<string> >& byQ)
149	{	148	{
150	LOGABS(("qualityTerms\n"));	149	LOGABS("qualityTerms\n" );
151	setDbWideQTermsFreqs();	150	setDbWideQTermsFreqs();
152		151
153	map<string, double> termQcoefs;	152	map<string, double> termQcoefs;
154	double totalweight = 0;	153	double totalweight = 0;
155		154
	...		...
164		163
165	#ifdef DEBUGABSTRACT	164	#ifdef DEBUGABSTRACT
166	{	165	{
167	string deb;	166	string deb;
168	hld.toString(deb);	167	hld.toString(deb);
169	LOGABS(("qualityTerms: hld: %s\n", deb.c_str()));	168	LOGABS("qualityTerms: hld: " << (deb) << "\n" );
170	}	169	}
171	#endif	170	#endif
172		171
173	// Group the input terms by the user term they were possibly expanded from	172	// Group the input terms by the user term they were possibly expanded from
174	map<string, vector<string> > byRoot;	173	map<string, vector<string> > byRoot;
	...		...
193	it != debit->second.end(); it++) {	192	it != debit->second.end(); it++) {
194	byRootstr.append("[").append(*it).append("] ");	193	byRootstr.append("[").append(*it).append("] ");
195	}	194	}
196	byRootstr.append("\n");	195	byRootstr.append("\n");
197	}	196	}
198	LOGABS(("\nqualityTerms: uterms to terms: %s\n", byRootstr.c_str()));	197	LOGABS("\nqualityTerms: uterms to terms: " << (byRootstr) << "\n" );
199	}	198	}
200	#endif	199	#endif
201		200
202	// Compute in-document and global frequencies for the groups.	201	// Compute in-document and global frequencies for the groups.
203	map<string, double> grpwdfs;	202	map<string, double> grpwdfs;
	...		...
241	}	240	}
242		241
243	#ifdef DEBUGABSTRACT	242	#ifdef DEBUGABSTRACT
244	for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();	243	for (multimap<double, vector<string> >::reverse_iterator mit= byQ.rbegin();
245	mit != byQ.rend(); mit++) {	244	mit != byQ.rend(); mit++) {
246	LOGABS(("qualityTerms: group\n"));	245	LOGABS("qualityTerms: group\n" );
247	for (vector<string>::const_iterator qit = mit->second.begin();	246	for (vector<string>::const_iterator qit = mit->second.begin();
248	qit != mit->second.end(); qit++) {	247	qit != mit->second.end(); qit++) {
249	LOGABS(("%.1e->[%s]\n", mit->first, qit->c_str()));	248	LOGABS("" << (mit->first) << "->[" << (qit) << "]\n" );
250	}	249	}
251	}	250	}
252	#endif	251	#endif
253	return totalweight;	252	return totalweight;
254	}	253	}
	...		...
313	int Query::Native::makeAbstract(Xapian::docid docid,	312	int Query::Native::makeAbstract(Xapian::docid docid,
314	vector<Snippet>& vabs,	313	vector<Snippet>& vabs,
315	int imaxoccs, int ictxwords)	314	int imaxoccs, int ictxwords)
316	{	315	{
317	Chrono chron;	316	Chrono chron;
318	LOGABS(("makeAbstract: docid %ld imaxoccs %d ictxwords %d\n",	317	LOGABS("makeAbstract: docid " << (long(docid)) << " imaxoccs " << (imaxoccs) << " ictxwords " << (ictxwords) << "\n" );
319	long(docid), imaxoccs, ictxwords));
320		318
321	// The (unprefixed) terms matched by this document	319	// The (unprefixed) terms matched by this document
322	vector<string> matchedTerms;	320	vector<string> matchedTerms;
323	getMatchTerms(docid, matchedTerms);	321	getMatchTerms(docid, matchedTerms);
324	if (matchedTerms.empty()) {	322	if (matchedTerms.empty()) {
	...		...
337	// going to try and show text around the less common search terms.	335	// going to try and show text around the less common search terms.
338	// Terms issued from an original one by stem expansion are	336	// Terms issued from an original one by stem expansion are
339	// aggregated by the qualityTerms() routine.	337	// aggregated by the qualityTerms() routine.
340	multimap<double, vector<string> > byQ;	338	multimap<double, vector<string> > byQ;
341	double totalweight = qualityTerms(docid, matchedTerms, byQ);	339	double totalweight = qualityTerms(docid, matchedTerms, byQ);
342	LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));	340	LOGABS("makeAbstract:" << (chron.ms()) << ": computed Qcoefs.\n" );
343	// This can't happen, but would crash us	341	// This can't happen, but would crash us
344	if (totalweight == 0.0) {	342	if (totalweight == 0.0) {
345	LOGERR("makeAbstract: totalweight == 0.0 !\n" );	343	LOGERR("makeAbstract: totalweight == 0.0 !\n" );
346	return ABSRES_ERROR;	344	return ABSRES_ERROR;
347	}	345	}
	...		...
374	// with words. We used to limit the character size at the end, but	372	// with words. We used to limit the character size at the end, but
375	// this damaged our careful selection of terms	373	// this damaged our careful selection of terms
376	const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :	374	const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
377	m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));	375	m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
378	int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;	376	int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
379	LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n",	377	LOGABS("makeAbstract:" << (chron.ms()) << ": mxttloccs " << (maxtotaloccs) << " ctxwords " << (ctxwords) << "\n" );
380	chron.ms(), maxtotaloccs, ctxwords));
381		378
382	int ret = ABSRES_OK;	379	int ret = ABSRES_OK;
383		380
384	// Let's go populate	381	// Let's go populate
385	for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();	382	for (multimap<double, vector<string> >::reverse_iterator mit = byQ.rbegin();
	...		...
403	if (grpoccs >= maxgrpoccs)	400	if (grpoccs >= maxgrpoccs)
404	break;	401	break;
405		402
406	string qterm = *qit;	403	string qterm = *qit;
407		404
408	LOGABS(("makeAbstract: [%s] %d max grp occs (coef %.2f)\n",	405	LOGABS("makeAbstract: [" << (qterm) << "] " << (maxgrpoccs) << " max grp occs (coef " << (q) << ")\n" );
409	qterm.c_str(), maxgrpoccs, q));
410		406
411	// The match term may span several words	407	// The match term may span several words
412	int qtrmwrdcnt =	408	int qtrmwrdcnt =
413	TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);	409	TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
414		410
	...		...
423	for (pos = xrdb.positionlist_begin(docid, qterm);	419	for (pos = xrdb.positionlist_begin(docid, qterm);
424	pos != xrdb.positionlist_end(docid, qterm); pos++) {	420	pos != xrdb.positionlist_end(docid, qterm); pos++) {
425	int ipos = *pos;	421	int ipos = *pos;
426	if (ipos < int(baseTextPosition)) // Not in text body	422	if (ipos < int(baseTextPosition)) // Not in text body
427	continue;	423	continue;
428	LOGABS(("makeAbstract: [%s] at pos %d grpoccs %d maxgrpoccs"	424	LOGABS("makeAbstract: [" << (qterm) << "] at pos " << (ipos) << " grpoccs " << (grpoccs) << " maxgrpoccs " << (maxgrpoccs) << "\n" );
429	" %d\n", qterm.c_str(), ipos, grpoccs, maxgrpoccs));
430		425
431	totaloccs++;	426	totaloccs++;
432	grpoccs++;	427	grpoccs++;
433		428
434	// Add adjacent slots to the set to populate at next	429	// Add adjacent slots to the set to populate at next
	...		...
464	}	459	}
465		460
466	// Group done ?	461	// Group done ?
467	if (grpoccs >= maxgrpoccs) {	462	if (grpoccs >= maxgrpoccs) {
468	ret \|= ABSRES_TRUNC;	463	ret \|= ABSRES_TRUNC;
469	LOGABS(("Db::makeAbstract: max group occs cutoff\n"));	464	LOGABS("Db::makeAbstract: max group occs cutoff\n" );
470	break;	465	break;
471	}	466	}
472	// Global done ?	467	// Global done ?
473	if (totaloccs >= maxtotaloccs) {	468	if (totaloccs >= maxtotaloccs) {
474	ret \|= ABSRES_TRUNC;	469	ret \|= ABSRES_TRUNC;
475	LOGABS(("Db::makeAbstract: max occurrences cutoff\n"));	470	LOGABS("Db::makeAbstract: max occurrences cutoff\n" );
476	break;	471	break;
477	}	472	}
478	}	473	}
479	} catch (...) {	474	} catch (...) {
480	// Term does not occur. No problem.	475	// Term does not occur. No problem.
481	}	476	}
482		477
483	if (totaloccs >= maxtotaloccs) {	478	if (totaloccs >= maxtotaloccs) {
484	ret \|= ABSRES_TRUNC;	479	ret \|= ABSRES_TRUNC;
485	LOGABS(("Db::makeAbstract: max1 occurrences cutoff\n"));	480	LOGABS("Db::makeAbstract: max1 occurrences cutoff\n" );
486	break;	481	break;
487	}	482	}
488	}	483	}
489	}	484	}
490	maxpos += ctxwords + 1;	485	maxpos += ctxwords + 1;
491		486
492	LOGABS(("makeAbstract:%d:chosen number of positions %d\n",	487	LOGABS("makeAbstract:" << (chron.millis()) << ":chosen number of positions " << (totaloccs) << "\n" );
493	chron.millis(), totaloccs));
494	// This can happen if there are term occurences in the keywords	488	// This can happen if there are term occurences in the keywords
495	// etc. but not elsewhere ?	489	// etc. but not elsewhere ?
496	if (totaloccs == 0) {	490	if (totaloccs == 0) {
497	LOGDEB("makeAbstract: no occurrences\n" );	491	LOGDEB("makeAbstract: no occurrences\n" );
498	return ABSRES_OK;	492	return ABSRES_OK;
	...		...
564	#endif	558	#endif
565		559
566	vector<int> vpbreaks;	560	vector<int> vpbreaks;
567	ndb->getPagePositions(docid, vpbreaks);	561	ndb->getPagePositions(docid, vpbreaks);
568		562
569	LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(),	563	LOGABS("makeAbstract:" << (chron.millis()) << ": extracting. Got " << (vpbreaks.size()) << " pages\n" );
570	vpbreaks.size()));
571	// Finally build the abstract by walking the map (in order of position)	564	// Finally build the abstract by walking the map (in order of position)
572	vabs.clear();	565	vabs.clear();
573	string chunk;	566	string chunk;
574	bool incjk = false;	567	bool incjk = false;
575	int page = 0;	568	int page = 0;
	...		...
613	}	606	}
614		607
615		608
616	}	609	}
617		610
		611
		612