--- a
+++ b/src/rcldb/rclabstract.cpp
@@ -0,0 +1,487 @@
+/* Copyright (C) 2004 J.F.Dockes
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the
+ * Free Software Foundation, Inc.,
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include "autoconfig.h"
+
+#include <math.h>
+
+#include <map>
+using namespace std;
+
+#include "debuglog.h"
+#include "rcldb.h"
+#include "rcldb_p.h"
+#include "rclquery.h"
+#include "rclquery_p.h"
+#include "textsplit.h"
+
+#include "utf8iter.h"
+
+namespace Rcl {
+// This is used as a marker inside the abstract frag lists, but
+// normally doesn't remain in final output (which is built with a
+// custom sep. by our caller).
+static const string cstr_ellipsis("...");
+
+#undef DEBUGABSTRACT
+#ifdef DEBUGABSTRACT
+#define LOGABS LOGDEB
+static void listList(const string& what, const vector<string>&l)
+{
+    // Dump a term vector to the debug log as one space-separated line,
+    // prefixed by the caller-supplied label.
+    string joined;
+    for (vector<string>::const_iterator it = l.begin(); it != l.end(); ++it) {
+        joined += *it;
+        joined += " ";
+    }
+    LOGDEB(("%s: %s\n", what.c_str(), joined.c_str()));
+}
+#else
+#define LOGABS LOGDEB2
+// Stub version for non-debug builds: dumping term vectors is only
+// useful when DEBUGABSTRACT is defined above.
+static void listList(const string&, const vector<string>&)
+{
+}
+#endif
+
+// Keep only non-prefixed terms. We used to remove the prefixes and
+// keep the terms instead, but field terms are normally also indexed
+// un-prefixed, so this is simpler and better.
+static void noPrefixList(const vector<string>& in, vector<string>& out)
+{
+    // Copy every term that carries no index prefix from 'in' to 'out'.
+    vector<string>::const_iterator term = in.begin();
+    while (term != in.end()) {
+        if (!has_prefix(*term)) {
+            out.push_back(*term);
+        }
+        ++term;
+    }
+}
+
+// Retrieve db-wide frequencies for the query terms and store them in
+// the query object. This is done at most once for a query, and the data is used
+// while computing abstracts for the different result documents.
+void Query::Native::setDbWideQTermsFreqs()
+{
+    // Do it once only for a given query: termfreqs is only filled
+    // here, so non-empty means the work was already done.
+    if (!termfreqs.empty())
+	return;
+
+    // Get the query terms, stripped of index-prefixed variants.
+    vector<string> qterms;
+    {
+	vector<string> iqterms;
+	m_q->getQueryTerms(iqterms);
+	noPrefixList(iqterms, qterms);
+    }
+    // listList("Query terms: ", qterms);
+    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
+
+    // Guard against division by zero on an empty index.
+    double doccnt = xrdb.get_doccount();
+    if (doccnt == 0)
+	doccnt = 1;
+
+    // Store each term's relative document frequency:
+    // (documents containing the term) / (total document count).
+    for (vector<string>::const_iterator qit = qterms.begin();
+	 qit != qterms.end(); qit++) {
+	termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
+	LOGABS(("setDbWideQTermFreqs: [%s] db freq %.1e\n", qit->c_str(),
+		termfreqs[*qit]));
+    }
+}
+
+// Compute query terms quality coefficients for a matched document by
+// retrieving the Within Document Frequencies and multiplying by
+// overall term frequency, then using log-based thresholds.
+// 2012: it's not too clear to me why exactly we do the log thresholds thing.
+// Preferring terms which are rare either or both in the db and the document
+// seems reasonable though
+double Query::Native::qualityTerms(Xapian::docid docid,
+                                   const vector<string>& terms,
+                                   multimap<double, string>& byQ)
+{
+    // Make sure the db-wide term frequencies are available.
+    setDbWideQTermsFreqs();
+
+    map<string, double> termQcoefs;
+    double totalweight = 0;
+
+    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
+    // Guard against division by zero for an empty document.
+    double doclen = xrdb.get_doclength(docid);
+    if (doclen == 0)
+	doclen = 1;
+
+    for (vector<string>::const_iterator qit = terms.begin();
+	 qit != terms.end(); qit++) {
+	// Look the term up in the document's term list (skip_to on a
+	// fresh iterator) to get its Within Document Frequency.
+	Xapian::TermIterator term = xrdb.termlist_begin(docid);
+	term.skip_to(*qit);
+	if (term != xrdb.termlist_end(docid) && *term == *qit) {
+	    // Rarity measure: relative in-document frequency times
+	    // db-wide relative frequency, mapped through -log10, then
+	    // bucketed into a few discrete quality levels (rarer
+	    // terms get bigger coefficients).
+	    double q = (term.get_wdf() / doclen) * termfreqs[*qit];
+	    q = -log10(q);
+	    if (q < 3) {
+		q = 0.05;
+	    } else if (q < 4) {
+		q = 0.3;
+	    } else if (q < 5) {
+		q = 0.7;
+	    } else if (q < 6) {
+		q = 0.8;
+	    } else {
+		q = 1;
+	    }
+	    termQcoefs[*qit] = q;
+	    totalweight += q;
+	}
+    }
+
+    // Build a sorted by quality term list. The multimap sorts by
+    // ascending coefficient; callers walk it in reverse for
+    // best-first order. Terms absent from the document are skipped.
+    for (vector<string>::const_iterator qit = terms.begin();
+	 qit != terms.end(); qit++) {
+	if (termQcoefs.find(*qit) != termQcoefs.end())
+	    byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
+    }
+
+#ifdef DEBUGABSTRACT
+    LOGDEB(("Db::qualityTerms:\n"));
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
+	 qit != byQ.rend(); qit++) {
+	LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
+    }
+#endif
+    // Sum of all coefficients: used by callers to apportion abstract
+    // slots among the terms.
+    return totalweight;
+}
+
+// Return page number for first match of "significant" term.
+// Return the page number holding the first occurrence of the
+// highest-quality ("most significant") matched query term, or -1 when
+// it cannot be determined: database not open, no match terms (pure
+// field match?), no page break data for the document, or no usable
+// position data.
+int Query::Native::getFirstMatchPage(Xapian::docid docid)
+{
+    if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
+        LOGERR(("Query::getFirstMatchPage: no db\n"));
+        // Bug fix: this used to 'return false' (i.e. 0) out of an int
+        // function whose failure value is -1 everywhere else; a
+        // caller testing for < 0 would have missed the error.
+        return -1;
+    }
+    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
+    Xapian::Database& xrdb(ndb->xrdb);
+
+    // Match terms for this document, stripped of index prefixes.
+    vector<string> terms;
+    {
+        vector<string> iterms;
+        m_q->getMatchTerms(docid, iterms);
+        noPrefixList(iterms, terms);
+    }
+    if (terms.empty()) {
+        LOGDEB(("getFirstMatchPage: empty match term list (field match?)\n"));
+        return -1;
+    }
+
+    // Page break positions for the document. Without them, no page
+    // number can be computed.
+    vector<int> pagepos;
+    ndb->getPagePositions(docid, pagepos);
+    if (pagepos.empty())
+        return -1;
+
+    // We try to use a page which matches the "best" term. Get a
+    // quality-sorted list. qualityTerms() calls setDbWideQTermsFreqs()
+    // itself, so the previous explicit call here was redundant and
+    // was removed; its total-weight return value is not needed here.
+    multimap<double, string> byQ;
+    qualityTerms(docid, terms, byQ);
+
+    // Walk the terms best-first and return the page of the first
+    // in-document position which maps to a valid page number.
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
+         qit != byQ.rend(); qit++) {
+        string qterm = qit->second;
+        Xapian::PositionIterator pos;
+        try {
+            for (pos = xrdb.positionlist_begin(docid, qterm);
+                 pos != xrdb.positionlist_end(docid, qterm); pos++) {
+                int pagenum = ndb->getPageNumberForPosition(pagepos, *pos);
+                if (pagenum > 0)
+                    return pagenum;
+            }
+        } catch (...) {
+            // Term has no position list. No problem, try the next one.
+        }
+    }
+    return -1;
+}
+
+// Build a document abstract by extracting text chunks around the query terms.
+// This uses the db termlists, not the original document.
+//
+// DatabaseModified and other general exceptions are caught and
+// possibly retried by our caller
+// docid: the Xapian document to abstract.
+// vabs: output, (page number, text chunk) pairs.
+// imaxoccs: cap on term occurrences used; <= 0 means compute a
+//   default from the configured abstract length.
+// ictxwords: context words around each occurrence; -1 means use the
+//   configured default.
+abstract_result Query::Native::makeAbstract(Xapian::docid docid,
+                       vector<pair<int, string> >& vabs,
+                       int imaxoccs, int ictxwords)
+{
+    Chrono chron;
+    // NOTE(review): this trace references m_rcldb while the rest of
+    // this file goes through m_q->m_db; if LOGDEB2 is ever compiled
+    // in, this line may not build — TODO confirm.
+    LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d imaxoccs %d\n", chron.ms(),
+	     m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen, imaxoccs));
+
+    // The (unprefixed) terms matched by this document
+    vector<string> matchedTerms;
+    {
+        vector<string> iterms;
+        m_q->getMatchTerms(docid, iterms);
+        noPrefixList(iterms, matchedTerms);
+        if (matchedTerms.empty()) {
+            LOGDEB(("makeAbstract::Empty term list\n"));
+            return ABSRES_ERROR;
+        }
+    }
+    listList("Match terms: ", matchedTerms);
+
+    // Retrieve the term freqencies for the query terms. This is
+    // actually computed only once for a query, and for all terms in
+    // the query (not only the matches for this doc)
+    setDbWideQTermsFreqs();
+
+    // Build a sorted by quality container for the match terms We are
+    // going to try and show text around the less common search terms.
+    // TOBEDONE: terms issued from an original one by stem expansion
+    // should be somehow aggregated here, else, it may happen that
+    // such a group prevents displaying matches for other terms (by
+    // removing its meaning from the maximum occurrences per term test
+    // used while walking the list below)
+    multimap<double, string> byQ;
+    double totalweight = qualityTerms(docid, matchedTerms, byQ);
+    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
+    // This can't happen, but would crash us
+    if (totalweight == 0.0) {
+	LOGERR(("makeAbstract: totalweight == 0.0 !\n"));
+	return ABSRES_ERROR;
+    }
+
+    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
+    Xapian::Database& xrdb(ndb->xrdb);
+
+    ///////////////////
+    // For each of the query terms, ask xapian for its positions list
+    // in the document. For each position entry, remember it in
+    // qtermposs and insert it and its neighbours in the set of
+    // 'interesting' positions
+
+    // The terms 'array' that we partially populate with the document
+    // terms, at their positions around the search terms positions:
+    map<unsigned int, string> sparseDoc;
+
+    // Total number of occurences for all terms. We stop when we have too much
+    unsigned int totaloccs = 0;
+
+    // Limit the total number of slots we populate. The 7 is taken as
+    // average word size. It was a mistake to have the user max
+    // abstract size parameter in characters, we basically only deal
+    // with words. We used to limit the character size at the end, but
+    // this damaged our careful selection of terms
+    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
+	m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
+    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
+    LOGABS(("makeAbstract:%d: mxttloccs %d ctxwords %d\n",
+	    chron.ms(), maxtotaloccs, ctxwords));
+
+    // This is used to mark positions overlapped by a multi-word match term
+    const string occupiedmarker("?");
+
+    abstract_result ret = ABSRES_OK;
+
+    // Let's go populate. Walk the match terms best-quality-first.
+    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
+	 qit != byQ.rend(); qit++) {
+	string qterm = qit->second;
+	unsigned int maxoccs;
+	if (byQ.size() == 1) {
+	    maxoccs = maxtotaloccs;
+	} else {
+	    // We give more slots to the better terms
+	    float q = qit->first / totalweight;
+	    maxoccs = int(ceil(maxtotaloccs * q));
+	    LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
+		    qterm.c_str(), maxoccs, q));
+	}
+
+	// The match term may span several words
+	int qtrmwrdcnt = TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);
+
+	Xapian::PositionIterator pos;
+	// There may be query terms not in this doc. This raises an
+	// exception when requesting the position list, we catch it ??
+	// Not clear how this can happen because we are walking the
+	// match list returned by Xapian. Maybe something with the
+	// fields?
+	string emptys;
+	try {
+	    unsigned int occurrences = 0;
+	    for (pos = xrdb.positionlist_begin(docid, qterm);
+		 pos != xrdb.positionlist_end(docid, qterm); pos++) {
+		int ipos = *pos;
+		if (ipos < int(baseTextPosition)) // Not in text body
+		    continue;
+		LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n",
+			qterm.c_str(), ipos, occurrences, maxoccs));
+
+		totaloccs++;
+
+		// Add adjacent slots to the set to populate at next
+		// step by inserting empty strings. Special provisions
+		// for adding ellipsis and for positions overlapped by
+		// the match term.
+		unsigned int sta = MAX(0, ipos - ctxwords);
+		unsigned int sto = ipos + qtrmwrdcnt-1 +
+		    m_q->m_db->getAbsCtxLen();
+		for (unsigned int ii = sta; ii <= sto; ii++) {
+		    if (ii == (unsigned int)ipos) {
+			sparseDoc[ii] = qterm;
+		    } else if (ii > (unsigned int)ipos &&
+			       ii < (unsigned int)ipos + qtrmwrdcnt) {
+			sparseDoc[ii] = occupiedmarker;
+		    } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
+			// For an empty slot, the test has a side
+			// effect of inserting an empty string which
+			// is what we want
+			sparseDoc[ii] = emptys;
+		    }
+		}
+		// Add ellipsis at the end. This may be replaced later by
+		// an overlapping extract. Take care not to replace an
+		// empty string here, we really want an empty slot,
+		// use find()
+		if (sparseDoc.find(sto+1) == sparseDoc.end()) {
+		    sparseDoc[sto+1] = cstr_ellipsis;
+		}
+
+		// Limit to allocated occurences and total size
+		if (++occurrences >= maxoccs ||
+		    totaloccs >= maxtotaloccs) {
+		    ret = ABSRES_TRUNC;
+		    LOGDEB(("Db::makeAbstract: max occurrences cutoff\n"));
+		    break;
+		}
+	    }
+	} catch (...) {
+	    // Term does not occur. No problem.
+	}
+	if (totaloccs >= maxtotaloccs) {
+	    ret = ABSRES_TRUNC;
+	    LOGDEB(("Db::makeAbstract: max1 occurrences cutoff\n"));
+	    break;
+	}
+    }
+    LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
+	    chron.millis(), totaloccs));
+
+    // This can happen if there are term occurences in the keywords
+    // etc. but not elsewhere ?
+    if (totaloccs == 0) {
+	LOGDEB1(("makeAbstract: no occurrences\n"));
+	return ABSRES_ERROR;
+    }
+
+    // Walk all document's terms position lists and populate slots
+    // around the query terms. We arbitrarily truncate the list to
+    // avoid taking forever. If we do cutoff, the abstract may be
+    // inconsistant (missing words, potentially altering meaning),
+    // which is bad.
+    {
+	Xapian::TermIterator term;
+	int cutoff = 500 * 1000;
+
+	for (term = xrdb.termlist_begin(docid);
+	     term != xrdb.termlist_end(docid); term++) {
+	    // Ignore prefixed terms
+	    if (has_prefix(*term))
+		continue;
+	    if (cutoff-- < 0) {
+		ret = ABSRES_TRUNC;
+		LOGDEB0(("makeAbstract: max term count cutoff\n"));
+		break;
+	    }
+
+	    Xapian::PositionIterator pos;
+	    for (pos = xrdb.positionlist_begin(docid, *term);
+		 pos != xrdb.positionlist_end(docid, *term); pos++) {
+		if (cutoff-- < 0) {
+		    ret = ABSRES_TRUNC;
+		    LOGDEB0(("makeAbstract: max term count cutoff\n"));
+		    break;
+		}
+		map<unsigned int, string>::iterator vit;
+		if ((vit=sparseDoc.find(*pos)) != sparseDoc.end()) {
+		    // Don't replace a term: the terms list is in
+		    // alphabetic order, and we may have several terms
+		    // at the same position, we want to keep only the
+		    // first one (ie: dockes and dockes@wanadoo.fr)
+		    if (vit->second.empty()) {
+			LOGDEB2(("makeAbstract: populating: [%s] at %d\n",
+				 (*term).c_str(), *pos));
+			sparseDoc[*pos] = *term;
+		    }
+		}
+	    }
+	}
+    }
+
+#if 0
+    // Debug only: output the full term[position] vector
+    // NOTE(review): 'it->empty()' / 'it->c_str()' below look like they
+    // should be 'it->second.empty()' / 'it->second.c_str()' (it is a
+    // map iterator). Harmless while compiled out; fix before enabling.
+    bool epty = false;
+    int ipos = 0;
+    for (map<unsigned int, string>::iterator it = sparseDoc.begin();
+	 it != sparseDoc.end();
+	 it++, ipos++) {
+	if (it->empty()) {
+	    if (!epty)
+		LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
+	    epty=true;
+	} else {
+	    epty = false;
+	    LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
+	}
+    }
+#endif
+
+    vector<int> vpbreaks;
+    ndb->getPagePositions(docid, vpbreaks);
+
+    LOGABS(("makeAbstract:%d: extracting. Got %u pages\n", chron.millis(),
+	    vpbreaks.size()));
+    // Finally build the abstract by walking the map (in order of position)
+    vabs.clear();
+    string chunk;
+    bool incjk = false;
+    int page = 0;
+    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
+	 it != sparseDoc.end(); it++) {
+	LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
+	// Positions overlapped by a multi-word term were only markers.
+	if (!occupiedmarker.compare(it->second))
+	    continue;
+	// Compute the page number for a starting chunk, when we have
+	// page break data.
+	if (chunk.empty() && !vpbreaks.empty()) {
+	    page = ndb->getPageNumberForPosition(vpbreaks, it->first);
+	    if (page < 0)
+		page = 0;
+	}
+	// Only insert a separating space when the previous and current
+	// words are not both CJK (CJK text is not space-separated).
+	Utf8Iter uit(it->second);
+	bool newcjk = false;
+	if (TextSplit::isCJK(*uit))
+	    newcjk = true;
+	if (!incjk || (incjk && !newcjk))
+	    chunk += " ";
+	incjk = newcjk;
+	if (it->second == cstr_ellipsis) {
+	    // Ellipsis marks a fragment boundary: flush the chunk.
+	    vabs.push_back(pair<int,string>(page, chunk));
+	    chunk.clear();
+	} else {
+	    // Skip the internal field boundary marker terms.
+	    if (it->second.compare(end_of_field_term) &&
+		it->second.compare(start_of_field_term))
+		chunk += it->second;
+	}
+    }
+    if (!chunk.empty())
+	vabs.push_back(pair<int, string>(page, chunk));
+
+    LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
+    return ret;
+}
+
+
+}