/* Copyright (C) 2004-2017 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <math.h>
#include <map>
#include <unordered_map>
#include <deque>
#include <algorithm>
#include "log.h"
#include "rcldb.h"
#include "rcldb_p.h"
#include "rclquery.h"
#include "rclquery_p.h"
#include "textsplit.h"
#include "searchdata.h"
#include "utf8iter.h"
#include "hldata.h"
#include "chrono.h"
using namespace std;
namespace Rcl {
// Marker stored inside the abstract fragment lists (the sparse
// position map) to signal a break between two extracts. It normally
// doesn't remain in final output (which is built with a custom
// separator by our caller).
static const string cstr_ellipsis("...");
// Empty string constant: the value given to sparse map slots which
// still have to be filled with a context word.
static const string emptys;
// This is used to mark positions overlapped by a multi-word match term
static const string occupiedmarker("?");
// DEBUGABSTRACT is unconditionally defined here, so LOGABS currently
// expands to LOGDEB, and the #ifdef DEBUGABSTRACT sections further down
// are compiled in. Remove the define to route the LOGABS traces to
// LOGDEB2 instead.
#define DEBUGABSTRACT
#ifdef DEBUGABSTRACT
#define LOGABS LOGDEB
#else
#define LOGABS LOGDEB2
#endif
// Unprefix terms. Actually it's not completely clear if we should
// remove prefixes and keep all terms or prune the prefixed
// ones. There is no good way to be sure what will provide the best
// result in general.
static const bool prune_prefixed_terms = true;
static void noPrefixList(const vector<string>& in, vector<string>& out)
{
for (const auto& term : in) {
if (prune_prefixed_terms) {
if (has_prefix(term))
continue;
}
out.push_back(strip_prefix(term));
}
sort(out.begin(), out.end());
vector<string>::iterator it = unique(out.begin(), out.end());
out.resize(it - out.begin());
}
bool Query::Native::getMatchTerms(unsigned long xdocid, vector<string>& terms)
{
if (!xenquire) {
LOGERR("Query::getMatchTerms: no query opened\n");
return false;
}
terms.clear();
Xapian::TermIterator it;
Xapian::docid id = Xapian::docid(xdocid);
vector<string> iterms;
XAPTRY(iterms.insert(iterms.begin(),
xenquire->get_matching_terms_begin(id),
xenquire->get_matching_terms_end(id)),
m_q->m_db->m_ndb->xrdb, m_q->m_reason);
if (!m_q->m_reason.empty()) {
LOGERR("getMatchTerms: xapian error: " << m_q->m_reason << "\n");
return false;
}
noPrefixList(iterms, terms);
return true;
}
// Retrieve db-wide frequencies for the query terms and store them in
// the query object. This is done at most once for a query, and the data is used
// while computing abstracts for the different result documents.
void Query::Native::setDbWideQTermsFreqs()
{
// Do it once only for a given query.
if (!termfreqs.empty())
return;
vector<string> qterms;
{
vector<string> iqterms;
m_q->getQueryTerms(iqterms);
noPrefixList(iqterms, qterms);
}
LOGDEB("Query terms: " << stringsToString(qterms) << endl);
Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
double doccnt = xrdb.get_doccount();
if (doccnt == 0)
doccnt = 1;
for (const auto& term : qterms) {
termfreqs[term] = xrdb.get_termfreq(term) / doccnt;
LOGABS("setDbWideQTermFreqs: [" << term << "] db freq " <<
termfreqs[term] << "\n");
}
}
// Compute quality coefficients for the terms matched in a document, by
// retrieving the Within Document Frequencies and multiplying by the
// overall (db-wide) term frequency, then using log-based thresholds.
// 2012: it's not too clear to me why exactly we do the log thresholds thing.
// Preferring terms which are rare either or both in the db and the document
// seems reasonable though.
// To avoid setting a high quality for a low frequency expansion of a
// common stem, which seems wrong, we group the terms by root, compute
// a frequency for the group from the sum of member occurrences, and
// let the frequency for each group member be the aggregated frequency.
//
// @param docid the document.
// @param terms the (unprefixed) terms matched in the document.
// @param[out] byQ receives one entry per term group: coefficient ->
//     group member terms. Entries are added, the container is not
//     cleared here.
// @return the sum of the group coefficients.
double Query::Native::qualityTerms(Xapian::docid docid,
                                   const vector<string>& terms,
                                   multimap<double, vector<string> >& byQ)
{
    LOGABS("qualityTerms\n");
    setDbWideQTermsFreqs();

    double totalweight = 0;
    Xapian::Database &xrdb = m_q->m_db->m_ndb->xrdb;
    double doclen = xrdb.get_doclength(docid);
    if (doclen == 0)
        doclen = 1;

    HighlightData hld;
    if (m_q->m_sd) {
        m_q->m_sd->getTerms(hld);
    }

    // Group the input terms by the user term they were possibly
    // expanded from (by stemming). A term unknown to the highlight
    // data forms its own group.
    map<string, vector<string> > byRoot;
    for (const auto& term : terms) {
        const auto eit = hld.terms.find(term);
        if (eit != hld.terms.end()) {
            byRoot[eit->second].push_back(term);
        } else {
            LOGDEB0("qualityTerms: [" << term << "] not found in hld\n");
            byRoot[term].push_back(term);
        }
    }

#ifdef DEBUGABSTRACT
    {
        string deb;
        hld.toString(deb);
        LOGABS("qualityTerms: hld: " << deb << "\n");
        string byRootstr;
        for (const auto& entry : byRoot) {
            byRootstr.append("[").append(entry.first).append("]->");
            for (const auto& term : entry.second) {
                byRootstr.append("[").append(term).append("] ");
            }
            byRootstr.append("\n");
        }
        LOGABS("\nqualityTerms: uterms to terms: " << byRootstr << "\n");
    }
#endif

    // Compute in-document and global frequencies for the groups: each
    // group value is the SUM over the members which are present in the
    // document. map::operator[] value-initializes a missing entry to
    // 0, so a plain += works for the first member as well as for the
    // following ones.
    map<string, double> grpwdfs;
    map<string, double> grptfreqs;
    for (const auto& group : byRoot) {
        for (const auto& term : group.second) {
            Xapian::TermIterator xtermit = xrdb.termlist_begin(docid);
            xtermit.skip_to(term);
            if (xtermit != xrdb.termlist_end(docid) && *xtermit == term) {
                grpwdfs[group.first] += xtermit.get_wdf() / doclen;
                grptfreqs[group.first] += termfreqs[term];
            }
        }
    }

    // Build a sorted by quality container for the groups. The rarer
    // the group (bigger -log10), the higher the coefficient. A group
    // with a zero product yields an infinite -log10 and so ends up in
    // the last (1) bucket.
    for (const auto& group : byRoot) {
        double q = (grpwdfs[group.first]) * grptfreqs[group.first];
        q = -log10(q);
        if (q < 3) {
            q = 0.05;
        } else if (q < 4) {
            q = 0.3;
        } else if (q < 5) {
            q = 0.7;
        } else if (q < 6) {
            q = 0.8;
        } else {
            q = 1;
        }
        totalweight += q;
        byQ.emplace(q, group.second);
    }

#ifdef DEBUGABSTRACT
    for (auto mit= byQ.rbegin(); mit != byQ.rend(); mit++) {
        LOGABS("qualityTerms: coef: " << mit->first << " group: " <<
               stringsToString(mit->second) << endl);
    }
#endif
    return totalweight;
}
// Return the page number for the first match of a "significant" term:
// the term groups are tried by decreasing quality coefficient, and the
// first term occurrence which maps to a page number > 0 wins.
//
// @param docid the document.
// @param[out] term set to the term which determined the page. Left
//     untouched when -1 is returned.
// @return the page number (> 0), or -1 if there is no usable db, no
//     match term, no page position data, or no occurrence mapping to a
//     page.
int Query::Native::getFirstMatchPage(Xapian::docid docid, string& term)
{
    LOGDEB("Query::Native::getFirstMatchPage\n");
    if (!m_q|| !m_q->m_db || !m_q->m_db->m_ndb || !m_q->m_db->m_ndb->m_isopen) {
        LOGERR("Query::getFirstMatchPage: no db\n");
        return -1;
    }
    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);
    Xapian::Database& xrdb(ndb->xrdb);

    vector<string> terms;
    if (!getMatchTerms(docid, terms)) {
        // The failure was already logged by getMatchTerms()
        return -1;
    }
    if (terms.empty()) {
        LOGDEB("getFirstMatchPage: empty match term list (field match?)\n");
        return -1;
    }

    vector<int> pagepos;
    ndb->getPagePositions(docid, pagepos);
    if (pagepos.empty())
        return -1;

    // We try to use a page which matches the "best" term. Get a list
    // sorted by quality. No need to call setDbWideQTermsFreqs()
    // ourselves: qualityTerms() does it.
    multimap<double, vector<string> > byQ;
    qualityTerms(docid, terms, byQ);

    for (auto mit = byQ.rbegin(); mit != byQ.rend(); ++mit) {
        for (const string& qterm : mit->second) {
            try {
                for (Xapian::PositionIterator pos =
                         xrdb.positionlist_begin(docid, qterm);
                     pos != xrdb.positionlist_end(docid, qterm); ++pos) {
                    const int pagenum =
                        ndb->getPageNumberForPosition(pagepos, *pos);
                    if (pagenum > 0) {
                        term = qterm;
                        return pagenum;
                    }
                }
            } catch (...) {
                // Term does not occur. No problem: deliberately
                // ignored, we just try the next term.
            }
        }
    }
    return -1;
}
// Creating the abstract from index position data: populate the sparse
// array with the positions for a given query term, and mark the
// neighbouring positions.
//
// @param xrdb the Xapian index.
// @param docid the document.
// @param qterm the query term to process.
// @param qtrmwrdcnt number of word positions covered by the term.
// @param ctxwords number of context slots to reserve on each side of
//     an occurrence.
// @param maxgrpoccs max number of occurrences for the term group.
// @param maxtotaloccs max number of occurrences for the whole abstract.
// @param sparseDoc position -> word map. Receives the term at its
//     positions, the "occupied" marker for the following positions of a
//     multi-word term, empty strings for the context slots, and an
//     ellipsis right after each extract.
// @param searchTermPositions receives the positions holding the term.
// @param maxpos raised to the highest term position seen.
// @param totaloccs global occurrence counter, incremented.
// @param grpoccs group occurrence counter, incremented.
// @param ret ABSRES_TRUNC is or'ed in if we stop because of a limit.
//
// Exceptions thrown by the Xapian calls are not caught here.
void Query::Native::abstractPopulateQTerm(
    Xapian::Database& xrdb,
    Xapian::docid docid,
    const string& qterm,
    int qtrmwrdcnt,
    int ctxwords,
    unsigned int maxgrpoccs,
    unsigned int maxtotaloccs,
    map<unsigned int, string>& sparseDoc,
    unordered_set<unsigned int>& searchTermPositions,
    unsigned int& maxpos,
    unsigned int& totaloccs,
    unsigned int& grpoccs,
    int& ret
    )
{
    // Walk the position list for this term.
    for (Xapian::PositionIterator pos = xrdb.positionlist_begin(docid, qterm);
         pos != xrdb.positionlist_end(docid, qterm); pos++) {
        int ipos = *pos;
        if (ipos < int(baseTextPosition)) // Not in text body
            continue;
        LOGABS("makeAbstract: [" << qterm << "] at pos " <<
               ipos << " grpoccs " << grpoccs << " maxgrpoccs " <<
               maxgrpoccs << "\n");

        totaloccs++;
        grpoccs++;

        // Add adjacent slots to the set to populate at next
        // step by inserting empty strings. Special provisions
        // for adding ellipsis and for positions overlapped by
        // the match term.
        // Both bounds are derived from the ctxwords parameter, so that
        // the right-hand context honours the value requested by the
        // caller, like the left-hand one, and stays consistent with
        // the max position which abstractFromIndex() computes as
        // maxpos + ctxwords + 1.
        unsigned int sta = MAX(int(baseTextPosition),
                               ipos - ctxwords);
        unsigned int sto = ipos + qtrmwrdcnt - 1 + ctxwords;
        for (unsigned int ii = sta; ii <= sto; ii++) {
            if (ii == (unsigned int)ipos) {
                sparseDoc[ii] = qterm;
                searchTermPositions.insert(ii);
                if (ii > maxpos)
                    maxpos = ii;
            } else if (ii > (unsigned int)ipos &&
                       ii < (unsigned int)ipos + qtrmwrdcnt) {
                // Position for another word of the multi-word term
                sparseDoc[ii] = occupiedmarker;
            } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
                // For an empty slot, the test above has a side
                // effect of inserting an empty string which
                // is what we want. Do it also if it was an ellipsis
                sparseDoc[ii] = emptys;
            }
        }

        // Add ellipsis at the end. This may be replaced later by
        // an overlapping extract. Take care not to replace an
        // empty string here, we really want an empty slot,
        // use find()
        if (sparseDoc.find(sto+1) == sparseDoc.end()) {
            sparseDoc[sto+1] = cstr_ellipsis;
        }

        // Group done ?
        if (grpoccs >= maxgrpoccs) {
            ret |= ABSRES_TRUNC;
            LOGABS("Db::makeAbstract: max group occs cutoff\n");
            break;
        }
        // Global done ?
        if (totaloccs >= maxtotaloccs) {
            ret |= ABSRES_TRUNC;
            LOGABS("Db::makeAbstract: max occurrences cutoff\n");
            break;
        }
    }
}
// Creating the abstract from index position data, second phase: the
// query terms are now at their place in the sparse array and the
// neighbouring slots are marked (empty strings). Fill these slots: for
// each unprefixed term of the document, walk its position list and
// store the term in any empty slot found at one of its positions.
// The walk is arbitrarily truncated (m_snipMaxPosWalk, if positive) to
// avoid taking forever. If we do cut off, the abstract may be
// inconsistent (missing words, potentially altering meaning), which is
// bad, and ABSRES_TERMMISS is or'ed into ret.
//
// @param xrdb the Xapian index.
// @param docid the document.
// @param maxpos positions beyond this value are not examined.
// @param sparseDoc position -> word map, empty slots get filled.
// @param ret result flags, updated on cutoff.
void Query::Native::abstractPopulateContextTerms(
    Xapian::Database& xrdb,
    Xapian::docid docid,
    unsigned int maxpos,
    map<unsigned int, string>& sparseDoc,
    int& ret
    )
{
    // One unit of budget is consumed for each term and for each
    // position examined (only when the limit is active).
    int budget = m_q->m_snipMaxPosWalk;
    auto budgetExhausted = [&]() -> bool {
        if (!(m_q->m_snipMaxPosWalk > 0 && budget-- < 0)) {
            return false;
        }
        ret |= ABSRES_TERMMISS;
        LOGDEB0("makeAbstract: max term count cutoff " <<
                m_q->m_snipMaxPosWalk << "\n");
        return true;
    };

    for (Xapian::TermIterator term = xrdb.termlist_begin(docid);
         term != xrdb.termlist_end(docid); term++) {
        // Ignore prefixed terms
        if (has_prefix(*term)) {
            continue;
        }
        if (budgetExhausted()) {
            break;
        }

        for (Xapian::PositionIterator pos =
                 xrdb.positionlist_begin(docid, *term);
             pos != xrdb.positionlist_end(docid, *term); pos++) {
            if (budgetExhausted()) {
                // Only leaves the position walk: the term loop stops
                // on its own budget test at the next unprefixed term.
                break;
            }
            // No slot of interest beyond the max position: done with
            // this term.
            if (*pos > maxpos) {
                break;
            }
            auto slot = sparseDoc.find(*pos);
            if (slot == sparseDoc.end()) {
                continue;
            }
            // Don't replace a term: the terms list is in alphabetic
            // order, and we may have several terms at the same
            // position, we want to keep only the first one
            // (ie: dockes and dockes@wanadoo.fr)
            if (slot->second.empty()) {
                LOGDEB2("makeAbstract: populating: [" << *term <<
                        "] at " << *pos << "\n");
                slot->second = *term;
            }
        }
    }
}
// Creating the abstract from position data, final phase: walk the
// sparse array in position order and assemble the snippets. Words are
// accumulated into the current chunk, and an ellipsis entry closes the
// chunk and emits a snippet.
//
// @param ndb used to translate a position into a page number.
// @param sparseDoc position -> word map built by the previous phases.
// @param searchTermPositions positions which hold a search term.
// @param vpbreaks page break positions. If empty, the page number
//     stays at 0.
// @param[out] vabs the resulting snippets. Cleared on entry.
void Query::Native::abstractCreateSnippetsVector(
    Rcl::Db::Native *ndb,
    map<unsigned int, string>& sparseDoc,
    unordered_set<unsigned int>& searchTermPositions,
    vector<int>& vpbreaks,
    vector<Snippet>& vabs)
{
    vabs.clear();

    string chunk;
    string term;
    int page = 0;
    bool prevcjk = false;

    for (auto sit = sparseDoc.begin(); sit != sparseDoc.end(); ++sit) {
        const unsigned int position = sit->first;
        const string& word = sit->second;
        LOGDEB2("Abtract:output "<< position <<" -> [" <<word <<"]\n");

        if (word == occupiedmarker) {
            LOGDEB("Abstract: qtrm position not filled ??\n");
            continue;
        }

        // Beginning a new chunk: compute its page and forget the term
        // recorded for the previous one.
        if (chunk.empty() && !vpbreaks.empty()) {
            page = ndb->getPageNumberForPosition(vpbreaks, position);
            if (page < 0) {
                page = 0;
            }
            term.clear();
        }

        // A space is inserted before the word except between two
        // consecutive CJK entries.
        Utf8Iter uit(word);
        const bool curcjk = TextSplit::isCJK(*uit);
        if (!(prevcjk && curcjk)) {
            chunk += " ";
        }
        prevcjk = curcjk;

        if (searchTermPositions.count(position) != 0) {
            term = word;
        }

        if (word == cstr_ellipsis) {
            vabs.push_back(Snippet(page, chunk).setTerm(term));
            chunk.clear();
        } else if (word != end_of_field_term && word != start_of_field_term) {
            // The field boundary markers are not copied to the output.
            chunk += word;
        }
    }

    // Flush what remains after the last ellipsis.
    if (!chunk.empty()) {
        vabs.push_back(Snippet(page, chunk).setTerm(term));
    }
}
// Creating the abstract from index position data: top level routine.
// Runs the three phases: insert the query terms in a sparse position
// map (best term groups first), fill the surrounding slots with the
// document words, then extract the snippets.
//
// @param ndb the native db object.
// @param docid the document.
// @param matchTerms the terms matched by the document (not used by
//     this routine).
// @param byQ the term groups, keyed by quality coefficient.
// @param totalweight sum of the coefficients, used to share the
//     occurrence budget between groups. Only used as a divisor when
//     there are several groups.
// @param ctxwords number of context words around a term.
// @param maxtotaloccs max number of term occurrences to use.
// @param[out] vabs the resulting snippets.
// @param chron used for timing traces.
// @return ABSRES_OK, possibly or'ed with the flags set by the
//     population phases.
int Query::Native::abstractFromIndex(
    Rcl::Db::Native *ndb,
    Xapian::docid docid,
    const vector<string>& matchTerms,
    const multimap<double, vector<string>> byQ,
    double totalweight,
    int ctxwords,
    unsigned int maxtotaloccs,
    vector<Snippet>& vabs,
    Chrono& chron
    )
{
    Xapian::Database& xrdb(ndb->xrdb);
    int ret = ABSRES_OK;

    // The document as a sparse position -> word 'array', partially
    // populated around the search term positions.
    map<unsigned int, string> sparseDoc;
    // The search term positions are also kept apart, so that each
    // snippet can be listed with its term.
    std::unordered_set<unsigned int> searchTermPositions;
    // Highest search term position. Used to stop walking the position
    // lists while populating the adjacent slots.
    unsigned int maxpos = 0;
    // Total number of occurrences for all terms. We stop when we have
    // too many.
    unsigned int totaloccs = 0;

    // First pass: walk the term groups, beginning with the better
    // ones, and insert each term at its positions, with empty strings
    // at the surrounding positions. The latter are markers showing
    // where data should be inserted during the next pass.
    const bool singlegroup = byQ.size() == 1;
    for (auto grp = byQ.rbegin(); grp != byQ.rend(); ++grp) {
        // We give more slots to the better term groups. A lone group
        // gets everything.
        const double coef = singlegroup ? 1.0 : grp->first / totalweight;
        const unsigned int grpquota =
            singlegroup ? maxtotaloccs : int(ceil(maxtotaloccs * coef));
        unsigned int grpoccs = 0;

        // For each term in user term expansion group
        const vector<string>& members = grp->second;
        for (auto tit = members.begin(); tit != members.end(); ++tit) {
            const string& qterm = *tit;
            // Enough for this group ?
            if (grpoccs >= grpquota) {
                break;
            }
            LOGABS("makeAbstract: [" << qterm << "] " << grpquota <<
                   " max grp occs (coef " << coef << ")\n");

            // The match term may span several words (more than one
            // position)
            const int qtrmwrdcnt =
                TextSplit::countWords(qterm, TextSplit::TXTS_NOSPANS);

            // Populate positions for this query term.
            // There may be query terms not in this doc. This raises an
            // exception when requesting the position list, we catch it ??
            // Not clear how this can happen because we are walking the
            // match list returned by Xapian. Maybe something with the
            // fields?
            try {
                abstractPopulateQTerm(xrdb, docid, qterm, qtrmwrdcnt,
                                      ctxwords, grpquota, maxtotaloccs,
                                      sparseDoc, searchTermPositions,
                                      maxpos, totaloccs, grpoccs, ret);
            } catch (...) {
                // Term does not occur. No problem.
            }

            if (totaloccs >= maxtotaloccs) {
                ret |= ABSRES_TRUNC;
                LOGABS("Db::makeAbstract: max1 occurrences cutoff\n");
                break;
            }
        }
    }

    // Extend the max position to cover the context after the last term
    maxpos += ctxwords + 1;
    LOGABS("makeAbstract:" << chron.millis() <<
           "mS:chosen number of positions " << totaloccs << "\n");

    // This can happen if there are term occurrences in the keywords
    // etc. but not elsewhere ?
    if (totaloccs == 0) {
        LOGDEB("makeAbstract: no occurrences\n");
        return ABSRES_OK;
    }

    // Second pass: fill the context slots
    abstractPopulateContextTerms(xrdb, docid, maxpos, sparseDoc, ret);
    LOGABS("makeAbstract:" << chron.millis() << "mS: all term poslist read\n");

    vector<int> vpbreaks;
    ndb->getPagePositions(docid, vpbreaks);
    LOGABS("makeAbstract:" << chron.millis() << "mS: extracting. Got " <<
           vpbreaks.size() << " pages\n");

    // Finally build the abstract by walking the map (in order of position)
    abstractCreateSnippetsVector(ndb, sparseDoc, searchTermPositions,
                                 vpbreaks, vabs);
    LOGABS("makeAbtract: done in " << chron.millis() << " mS\n");
    return ret;
}
// Build a document abstract by extracting text chunks around the
// query terms. This can either use the index position lists, or the
// stored document text (when o_index_storedoctext is set), with very
// different implementations.
//
// DatabaseModified and other general exceptions are caught and
// possibly retried by our caller.
//
// @param docid the document.
// @param[out] vabs the abstract is returned as a vector of snippets.
// @param imaxoccs max number of term occurrences to use. If not
//     positive, the value is computed from the db abstract length and
//     context length parameters.
// @param ictxwords number of context words around a term. -1 selects
//     the db context length parameter.
// @return ABSRES_ERROR if the match terms can't be retrieved, if there
//     are none, or if the total term weight is 0. Else the value
//     returned by the selected implementation.
int Query::Native::makeAbstract(Xapian::docid docid,
                                vector<Snippet>& vabs,
                                int imaxoccs, int ictxwords)
{
    Chrono chron;
    LOGABS("makeAbstract: docid " << docid << " imaxoccs " <<
           imaxoccs << " ictxwords " << ictxwords << "\n");

    // The (unprefixed) terms matched by this document. A failure to
    // retrieve them is handled explicitly, like an empty list.
    vector<string> matchedTerms;
    const bool gotterms = getMatchTerms(docid, matchedTerms);
    if (!gotterms || matchedTerms.empty()) {
        LOGDEB("makeAbstract:" << chron.millis() << "mS:Empty term list\n");
        return ABSRES_ERROR;
    }
    LOGDEB("Match terms: " << stringsToString(matchedTerms) << endl);

    // Retrieve the term frequencies for the query terms. This is
    // actually computed only once for a query, and for all terms in
    // the query (not only the matches for this doc)
    setDbWideQTermsFreqs();

    // Build a sorted by quality container for the match terms. We are
    // going to try and show text around the less common search terms.
    // Terms issued from an original one by stem expansion are
    // aggregated by the qualityTerms() routine (this is what we call
    // 'term groups' in the following: index terms expanded from the
    // same user term).
    multimap<double, vector<string>> byQ;
    double totalweight = qualityTerms(docid, matchedTerms, byQ);
    LOGABS("makeAbstract:" << chron.millis() << "mS: computed Qcoefs.\n");
    // This can't happen, but would crash us
    if (totalweight == 0.0) {
        LOGERR("makeAbstract:"<<chron.millis()<<"mS: totalweight == 0.0 !\n");
        return ABSRES_ERROR;
    }

    Rcl::Db::Native *ndb(m_q->m_db->m_ndb);

    // Total number of slots we populate. The 7 is taken as
    // average word size. It was a mistake to have the user max
    // abstract size parameter in characters, we basically only deal
    // with words. We used to limit the character size at the end, but
    // this damaged our careful selection of terms
    const unsigned int maxtotaloccs = imaxoccs > 0 ? imaxoccs :
        m_q->m_db->getAbsLen() /(7 * (m_q->m_db->getAbsCtxLen() + 1));
    int ctxwords = ictxwords == -1 ? m_q->m_db->getAbsCtxLen() : ictxwords;
    LOGABS("makeAbstract:" << chron.millis() << "mS: mxttloccs " <<
           maxtotaloccs << " ctxwords " << ctxwords << "\n");

    if (o_index_storedoctext) {
        return abstractFromText(ndb, docid, matchedTerms, byQ,
                                totalweight, ctxwords, maxtotaloccs, vabs,
                                chron);
    } else {
        return abstractFromIndex(ndb, docid, matchedTerms, byQ,
                                 totalweight, ctxwords, maxtotaloccs, vabs,
                                 chron);
    }
}
}