recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [e892ca] .. [79e106]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.119 2007-06-25 10:25:39 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
#endif
#ifndef MIN
// Smaller of A and B. Both arguments are fully parenthesized so that
// expressions containing low-precedence operators (e.g. a & b, or a
// conditional expression) expand as intended. Each argument may be
// evaluated twice: do not pass expressions with side effects.
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#endif

// This is the word position offset at which we index the body text
// (abstract, keywords, etc. are stored before this)
static const unsigned int baseTextPosition = 100000;

#undef MTIME_IN_VALUE
#ifdef MTIME_IN_VALUE
// Omega compatible values
#define enum value_slot {
    VALUE_LASTMOD = 0,  // 4 byte big endian value - seconds since 1970.
...
    Xapian::Query    query; // query descriptor: terms and subqueries
                // joined by operators (or/and etc...)
    Xapian::Enquire *enquire; // Open query descriptor.
    Xapian::MSet     mset;    // Partial result set

    // Term frequencies for current query. See makeAbstract, setQuery
    map<string, double>  m_termfreqs; 
    
    Native(Db *db) 
    : m_db(db),
      m_isopen(false), m_iswritable(false), enquire(0) 
    { }
...
    }
    }
    return out;
}

// LOGABS is the logging macro used by the abstract-building code
// (makeAbstract). Uncomment the DEBUGABSTRACT define below to make
// LOGABS expand to LOGDEB instead of LOGDEB2, and to enable the extra
// DEBUGABSTRACT-only dump code.
//#define DEBUGABSTRACT 
#ifdef DEBUGABSTRACT
#define LOGABS LOGDEB
#else
#define LOGABS LOGDEB2
#endif

// Build a document abstract by extracting text chunks around the query terms
// This uses the db termlists, not the original document.
string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
{
    Chrono chron;
    LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
         m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));

    list<string> terms = noPrefixList(iterms);
    if (terms.empty()) {
    return "";
    }

    // Retrieve db-wide frequencies for the query terms


    if (m_termfreqs.empty()) {
  double doccnt = db.get_doccount();
  if (doccnt == 0) doccnt = 1;
    for (list<string>::const_iterator qit = terms.begin(); 
         qit != terms.end(); qit++) {
        m_termfreqs[*qit] = db.get_termfreq(*qit) / doccnt;
        LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(), 
             m_termfreqs[*qit]));
    }
    LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
    }


    // Compute a term quality coefficient by retrieving the term
    // Within Document Frequencies and multiplying by overall term
    // frequency, then using log-based thresholds. We are going to try
    // and show text around the less common search terms.
    map<string, double> termQcoefs;
    double totalweight = 0;
    double doclen = db.get_doclength(docid);
    if (doclen == 0) doclen = 1;
    for (list<string>::const_iterator qit = terms.begin(); 
     qit != terms.end(); qit++) {
    Xapian::TermIterator term = db.termlist_begin(docid);
    term.skip_to(*qit);
    if (term != db.termlist_end(docid) && *term == *qit) {
      double q = (term.get_wdf() / doclen) * m_termfreqs[*qit];
      q = -log10(q);
      if (q < 3) {
      q = 0.05;
      } else if (q < 4) {
      q = 0.3;
      } else if (q < 5) {
      q = 0.7;
      } else if (q < 6) {
      q = 0.8;
      } else {
      q = 1;
      }
        termQcoefs[*qit] = q;
      totalweight += q;


    }
    }    
    LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));






    // Build a term list sorted by quality.

    multimap<double, string> byQ;
    for (list<string>::const_iterator qit = terms.begin(); 
     qit != terms.end(); qit++) {
    if (termQcoefs.find(*qit) != termQcoefs.end())
        byQ.insert(pair<double,string>(termQcoefs[*qit], *qit));
    }

#ifdef DEBUGABSTRACT
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
   qit != byQ.rend(); qit++) {
  LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
    }
#endif


    // For each of the query terms, ask xapian for its positions list
    // in the document. For each position entry, remember it in
    // qtermposs and insert it and its neighbours in the set of
    // 'interesting' positions

    // The terms 'array' that we partially populate with the document
    // terms, at their positions around the search terms positions:
    map<unsigned int, string> sparseDoc;

    // All the chosen query term positions. 


    vector<unsigned int> qtermposs; 

    // Limit the total number of slots we populate. The 7 is taken as
    // average word size. It was a mistake to have the user max
    // abstract size parameter in characters, we basically only deal
    // with words. We used to limit the character size at the end, but
    // this damaged our careful selection of terms
    const unsigned int maxtotaloccs = 
    m_db->m_synthAbsLen /(7 * (m_db->m_synthAbsWordCtxLen+1));
    LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
    // This can't happen, but would crash us
    if (totalweight == 0.0) {
  LOGERR(("makeAbstract: 0 totalweight!\n"));
  return "";
    }

    // Let's go populate
    for (multimap<double, string>::reverse_iterator qit = byQ.rbegin(); 
     qit != byQ.rend(); qit++) {








    string qterm = qit->second;
    unsigned int maxoccs;
    if (byQ.size() == 1) {
        maxoccs = maxtotaloccs;
    } else {
      // We give more slots to the better terms
      float q = qit->first / totalweight;
        maxoccs = int(ceil(maxtotaloccs * q));
        LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n", 
            qterm.c_str(), maxoccs, q));
    }
        
    Xapian::PositionIterator pos;
    // There may be query terms not in this doc. This raises an
...
    try {
        unsigned int occurrences = 0;
        for (pos = db.positionlist_begin(docid, qterm); 
         pos != db.positionlist_end(docid, qterm); pos++) {
        unsigned int ipos = *pos;
      if (ipos < baseTextPosition) // Not in text body
          continue;
      LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n",
          qterm.c_str(), ipos, occurrences, maxoccs));
        // Remember the term position
        qtermposs.push_back(ipos);
        // Add adjacent slots to the set to populate at next step
        unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
        unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
...
            if (ii == ipos)
            sparseDoc[ii] = qterm;
            else
            sparseDoc[ii] = emptys;
        }
      // Limit to allocated occurrences and total size

        if (++occurrences >= maxoccs || 
          qtermposs.size() >= maxtotaloccs)
            break;
        }
    } catch (...) {
        // Term does not occur. No problem.
    }
  if (qtermposs.size() >= maxtotaloccs)

        break;
    }

    LOGABS(("makeAbstract:%d:chosen number of positions %d\n", 
        chron.millis(), qtermposs.size()));

    // This can happen if there are term occurrences in the keywords
    // etc. but not elsewhere ?
    if (qtermposs.size() == 0) 
  return "";

    // Walk all document's terms position lists and populate slots
    // around the query terms. We arbitrarily truncate the list to
    // avoid taking forever. If we do cutoff, the abstract may be
    // inconsistent (missing words, potentially altering meaning),
    // which is bad...
    { 
    Xapian::TermIterator term;
    int cutoff = 500 * 1000;

    for (term = db.termlist_begin(docid);
...
            // Don't replace a term: the terms list is in
            // alphabetic order, and we may have several terms
            // at the same position, we want to keep only the
            // first one (ie: dockes and dockes@wanadoo.fr)
            if (vit->second.empty()) {
            LOGABS(("makeAbstract: populating: [%s] at %d\n", 
                (*term).c_str(), *pos));
            sparseDoc[*pos] = *term;
            }
        }
        }
...
        LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
    }
    }
#endif

    LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));

    // Add "..." at ends of chunks










    for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
     pos != qtermposs.end(); pos++) {





    unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;



















    // Possibly add a ... at the end of chunk if it's not
    // overlapping
  if (sparseDoc.find(sto) != sparseDoc.end() && 
      sparseDoc.find(sto+1) == sparseDoc.end())
        sparseDoc[sto+1] = "...";
    }

    // Finally build the abstract by walking the map (in order of position)
    string abstract;
    for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
     it != sparseDoc.end(); it++) {
    LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
    abstract += it->second + " ";
    }

    // This happens for docs with no terms (only filename) indexed? I'll fix 
    // one day (yeah)
    if (!abstract.compare("... "))
    abstract.clear();

    LOGDEB(("makeAbtract: done in %d mS\n", chron.millis()));
...
        splitData.setprefix(emptystring);
        splitData.basepos += splitData.curpos + 100;
    }
    }

    if (splitData.curpos < baseTextPosition)
  splitData.basepos = baseTextPosition;
    else
  splitData.basepos += splitData.curpos + 100;

    // Finally: split and index body text
    LOGDEB2(("Db::add: split body\n"));
    if (!dumb_string(doc.text, noacc)) {
    LOGERR(("Db::add: dumb_string failed\n"));
    return false;
    }
    splitter.text_to_words(noacc);



    ////// Special terms for other metadata. No positions for these.
    // Mime type
    newdocument.add_term("T" + doc.mimetype);

...
    names.push_back("XIMPOSSIBLE");
    }
    return true;
}

// Prepare query out of user search data
bool Db::setQuery(RefCntr<SearchData> sdata, int opts, 
          const string& stemlang)
{
    if (!m_ndb) {
    LOGERR(("Db::setQuery: no db!\n"));
...
    if (!sdata->toNativeQuery(*this, &xq, 
                  (opts & Db::QO_STEM) ? stemlang : "")) {
    m_reason += sdata->getReason();
    return false;
    }

    m_ndb->query = xq;
    delete m_ndb->enquire;
    m_ndb->enquire = new Xapian::Enquire(m_ndb->db);
    m_ndb->enquire->set_query(m_ndb->query);
    m_ndb->mset = Xapian::MSet();

	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.118 2007-06-22 06:14:04 dockes Exp $ (C) 2004 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.119 2007-06-25 10:25:39 dockes Exp $ (C) 2004 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
54	#endif	54	#endif
55	#ifndef MIN	55	#ifndef MIN
56	#define MIN(A,B) (A<B?A:B)	56	#define MIN(A,B) (A<B?A:B)
57	#endif	57	#endif
58		58
		59	// This is the word position offset at which we index the body text
		60	// (abstract, keywords, etc.. are stored before this)
		61	static const unsigned int baseTextPosition = 100000;
		62
59	#undef MTIME_IN_VALUE	63	#undef MTIME_IN_VALUE
60	#ifdef MTIME_IN_VALUE	64	#ifdef MTIME_IN_VALUE
61	// Omega compatible values	65	// Omega compatible values
62	#define enum value_slot {	66	#define enum value_slot {
63	VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.	67	VALUE_LASTMOD = 0, // 4 byte big endian value - seconds since 1970.
	...		...
101	Xapian::Query query; // query descriptor: terms and subqueries	105	Xapian::Query query; // query descriptor: terms and subqueries
102	// joined by operators (or/and etc...)	106	// joined by operators (or/and etc...)
103	Xapian::Enquire *enquire; // Open query descriptor.	107	Xapian::Enquire *enquire; // Open query descriptor.
104	Xapian::MSet mset; // Partial result set	108	Xapian::MSet mset; // Partial result set
105		109
106	// Term frequencies for current query. See makeAbstract, not used yet.	110	// Term frequencies for current query. See makeAbstract, setQuery
107	map<string, int> m_termfreqs;	111	map<string, double> m_termfreqs;
108		112
109	Native(Db *db)	113	Native(Db *db)
110	: m_db(db),	114	: m_db(db),
111	m_isopen(false), m_iswritable(false), enquire(0)	115	m_isopen(false), m_iswritable(false), enquire(0)
112	{ }	116	{ }
	...		...
230	}	234	}
231	}	235	}
232	return out;	236	return out;
233	}	237	}
234		238
		239	//#define DEBUGABSTRACT
		240	#ifdef DEBUGABSTRACT
		241	#define LOGABS LOGDEB
		242	#else
		243	#define LOGABS LOGDEB2
		244	#endif
		245
235	// Build a document abstract by extracting text chunks around the query terms	246	// Build a document abstract by extracting text chunks around the query terms
236	// This uses the db termlists, not the original document.	247	// This uses the db termlists, not the original document.
237	string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)	248	string Native::makeAbstract(Xapian::docid docid, const list<string>& iterms)
238	{	249	{
239	Chrono chron;	250	Chrono chron;
240	LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),	251	LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
241	m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));	252	m_db->m_synthAbsLen, m_db->m_synthAbsWordCtxLen));
242		253
243	list<string> terms = noPrefixList(iterms);	254	list<string> terms = noPrefixList(iterms);
244	if (terms.empty()) {	255	if (terms.empty()) {
245	return "";	256	return "";
246	}	257	}
247		258
248	// We may want to use the db-wide freqs to tune the abstracts one	259	// Retrieve db-wide frequencies for the query terms
249	// day but we currently don't
250	#if 0
251	if (m_termfreqs.empty()) {	260	if (m_termfreqs.empty()) {
		261	double doccnt = db.get_doccount();
		262	if (doccnt == 0) doccnt = 1;
252	for (list<string>::const_iterator qit = terms.begin();	263	for (list<string>::const_iterator qit = terms.begin();
253	qit != terms.end(); qit++) {	264	qit != terms.end(); qit++) {
254	m_termfreqs[qit] = db.get_termfreq(qit);	265	m_termfreqs[qit] = db.get_termfreq(qit) / doccnt;
255	LOGDEB(("makeAbstract: [%s] db freq %d\n", qit->c_str(),	266	LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
256	m_termfreqs[*qit]));	267	m_termfreqs[*qit]));
257	}	268	}
258	LOGDEB(("makeAbstract:%d: got termfreqs\n", chron.ms()));	269	LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
259	}	270	}
260	#endif
261		271
262	// Retrieve the term Within Document Frequencies. We are going to try	272	// Compute a term quality coefficient by retrieving the term
		273	// Within Document Frequencies and multiplying by overal term
		274	// frequency, then using log-based thresholds. We are going to try
263	// and show text around the less common search terms.	275	// and show text around the less common search terms.
264	map<string, int> termwdfs;	276	map<string, double> termQcoefs;
265	int totalqtermoccs = 0;	277	double totalweight = 0;
		278	double doclen = db.get_doclength(docid);
		279	if (doclen == 0) doclen = 1;
266	for (list<string>::const_iterator qit = terms.begin();	280	for (list<string>::const_iterator qit = terms.begin();
267	qit != terms.end(); qit++) {	281	qit != terms.end(); qit++) {
268	Xapian::TermIterator term = db.termlist_begin(docid);	282	Xapian::TermIterator term = db.termlist_begin(docid);
269	term.skip_to(*qit);	283	term.skip_to(*qit);
270	if (term != db.termlist_end(docid) && term == qit) {	284	if (term != db.termlist_end(docid) && term == qit) {
271	int f = term.get_wdf();	285	double q = (term.get_wdf() / doclen) * m_termfreqs[*qit];
		286	q = -log10(q);
		287	if (q < 3) {
		288	q = 0.05;
		289	} else if (q < 4) {
		290	q = 0.3;
		291	} else if (q < 5) {
		292	q = 0.7;
		293	} else if (q < 6) {
		294	q = 0.8;
		295	} else {
		296	q = 1;
		297	}
272	termwdfs[*qit] = f;	298	termQcoefs[*qit] = q;
273	totalqtermoccs += f;	299	totalweight += q;
274	LOGDEB2(("makeAbstract: [%s] wdf %d\n", qit->c_str(),
275	termwdfs[*qit]));
276	}	300	}
277	}	301	}
278	LOGDEB2(("makeAbstract:%d: got wdfs totalqtermoccs %d\n",	302	LOGABS(("makeAbstract:%d: computed Qcoefs.\n", chron.ms()));
279	chron.ms(), totalqtermoccs));
280	if (totalqtermoccs == 0) {
281	LOGERR(("makeAbstract: no term occurrences !\n"));
282	return "";
283	}
284		303
285	// Build a sorted by frequency term list: it seems reasonable to	304	// Build a sorted by quality term list.
286	// prefer sampling around the less frequent terms:
287	multimap<int, string> bywdf;	305	multimap<double, string> byQ;
288	for (list<string>::const_iterator qit = terms.begin();	306	for (list<string>::const_iterator qit = terms.begin();
289	qit != terms.end(); qit++) {	307	qit != terms.end(); qit++) {
290	if (termwdfs.find(*qit) != termwdfs.end())	308	if (termQcoefs.find(*qit) != termQcoefs.end())
291	bywdf.insert(pair<int,string>(termwdfs[qit], qit));	309	byQ.insert(pair<double,string>(termQcoefs[qit], qit));
292	}	310	}
293		311
		312	#ifdef DEBUGABSTRACT
		313	for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
		314	qit != byQ.rend(); qit++) {
		315	LOGDEB(("%.1e->[%s]\n", qit->first, qit->second.c_str()));
		316	}
		317	#endif
		318
		319
294	// For each of the query terms, query xapian for its positions	320	// For each of the query terms, ask xapian for its positions list
295	// list in the document. For each position entry, remember it in qtermposs	321	// in the document. For each position entry, remember it in
296	// and insert it and its neighbours in the set of 'interesting' positions	322	// qtermposs and insert it and its neighbours in the set of
		323	// 'interesting' positions
297		324
298	// The terms 'array' that we partially populate with the document	325	// The terms 'array' that we partially populate with the document
299	// terms, at their positions around the search terms positions:	326	// terms, at their positions around the search terms positions:
300	map<unsigned int, string> sparseDoc;	327	map<unsigned int, string> sparseDoc;
301		328
302	// All the query term positions. We remember this mainly because we are	329	// All the chosen query term positions.
303	// going to random-shuffle it for selecting the chunks that we actually
304	// print.
305	vector<unsigned int> qtermposs;	330	vector<unsigned int> qtermposs;
306		331
307	// Limit the total number of slots we populate.	332	// Limit the total number of slots we populate. The 7 is taken as
		333	// average word size. It was a mistake to have the user max
		334	// abstract size parameter in characters, we basically only deal
		335	// with words. We used to limit the character size at the end, but
		336	// this damaged our careful selection of terms
308	const unsigned int maxtotaloccs =	337	const unsigned int maxtotaloccs =
309	MAX(50, m_db->m_synthAbsLen /(4 * (m_db->m_synthAbsWordCtxLen+1)));	338	m_db->m_synthAbsLen /(7 * (m_db->m_synthAbsWordCtxLen+1));
310	LOGDEB2(("makeAbstract:%d: ttlqtrms %d mxttloccs %d\n",	339	LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
311	chron.ms(), totalqtermoccs, maxtotaloccs));	340	// This can't happen, but would crash us
312	#if 0	341	if (totalweight == 0.0) {
		342	LOGERR(("makeAbstract: 0 totalweight!\n"));
		343	return "";
		344	}
		345
		346	// Let's go populate
313	for (multimap<int, string>::iterator qit = bywdf.begin();	347	for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
314	qit != bywdf.end(); qit++) {	348	qit != byQ.rend(); qit++) {
315	LOGDEB(("%d->[%s]\n", qit->first, qit->second.c_str()));
316	}
317	#endif
318
319	// Find the text positions which we will have to fill with terms
320	unsigned int totaloccs = 0;
321	for (multimap<int, string>::iterator qit = bywdf.begin();
322	qit != bywdf.end(); qit++) {
323	string qterm = qit->second;	349	string qterm = qit->second;
324	unsigned int maxoccs;	350	unsigned int maxoccs;
325	if (bywdf.size() == 1) {	351	if (byQ.size() == 1) {
326	maxoccs = maxtotaloccs;	352	maxoccs = maxtotaloccs;
327	} else {	353	} else {
328	float q = (1 - float(termwdfs[qterm]) / float(totalqtermoccs)) /	354	// We give more slots to the better terms
329	(bywdf.size() - 1);	355	float q = qit->first / totalweight;
330	maxoccs = int(ceil(maxtotaloccs * q));	356	maxoccs = int(ceil(maxtotaloccs * q));
331	LOGDEB2(("makeAbstract: [%s] %d max occs (coef %.2f)\n",	357	LOGABS(("makeAbstract: [%s] %d max occs (coef %.2f)\n",
332	qterm.c_str(), maxoccs, q));	358	qterm.c_str(), maxoccs, q));
333	}	359	}
334		360
335	Xapian::PositionIterator pos;	361	Xapian::PositionIterator pos;
336	// There may be query terms not in this doc. This raises an	362	// There may be query terms not in this doc. This raises an
	...		...
339	try {	365	try {
340	unsigned int occurrences = 0;	366	unsigned int occurrences = 0;
341	for (pos = db.positionlist_begin(docid, qterm);	367	for (pos = db.positionlist_begin(docid, qterm);
342	pos != db.positionlist_end(docid, qterm); pos++) {	368	pos != db.positionlist_end(docid, qterm); pos++) {
343	unsigned int ipos = *pos;	369	unsigned int ipos = *pos;
344	LOGDEB2(("makeAbstract: [%s] at %d\n", qit->c_str(), ipos));	370	if (ipos < baseTextPosition) // Not in text body
		371	continue;
		372	LOGABS(("makeAbstract: [%s] at %d occurrences %d maxoccs %d\n",
		373	qterm.c_str(), ipos, occurrences, maxoccs));
345	// Remember the term position	374	// Remember the term position
346	qtermposs.push_back(ipos);	375	qtermposs.push_back(ipos);
347	// Add adjacent slots to the set to populate at next step	376	// Add adjacent slots to the set to populate at next step
348	unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);	377	unsigned int sta = MAX(0, ipos-m_db->m_synthAbsWordCtxLen);
349	unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;	378	unsigned int sto = ipos+m_db->m_synthAbsWordCtxLen;
	...		...
351	if (ii == ipos)	380	if (ii == ipos)
352	sparseDoc[ii] = qterm;	381	sparseDoc[ii] = qterm;
353	else	382	else
354	sparseDoc[ii] = emptys;	383	sparseDoc[ii] = emptys;
355	}	384	}
356	// Limit the number of occurences we keep for each	385	// Limit to allocated occurences and total size
357	// term. The abstract has a finite length anyway !
358	if (occurrences++ > maxoccs)	386	if (++occurrences >= maxoccs \|\|
		387	qtermposs.size() >= maxtotaloccs)
359	break;	388	break;
360	}	389	}
361	} catch (...) {	390	} catch (...) {
362	// Term does not occur. No problem.	391	// Term does not occur. No problem.
363	}	392	}
364	// Limit total size	393	if (qtermposs.size() >= maxtotaloccs)
365	if (totaloccs++ > maxtotaloccs)
366	break;	394	break;
367	}	395	}
368
369	LOGDEB2(("makeAbstract:%d:chosen number of positions %d\n",	396	LOGABS(("makeAbstract:%d:chosen number of positions %d\n",
370	chron.millis(), qtermposs.size()));	397	chron.millis(), qtermposs.size()));
371		398
372	// Walk the full document position list (for each term walk	399	// This can happen if there are term occurences in the keywords
373	// position list) and populate slots around the query terms. We	400	// etc. but not elsewhere ?
374	// arbitrarily truncate the list to avoid taking forever. If we do	401	if (qtermposs.size() == 0)
375	// cutoff, the abstract may be inconsistant, which is bad...	402	return "";
		403
		404	// Walk all document's terms position lists and populate slots
		405	// around the query terms. We arbitrarily truncate the list to
		406	// avoid taking forever. If we do cutoff, the abstract may be
		407	// inconsistant (missing words, potentially altering meaning),
		408	// which is bad...
376	{	409	{
377	Xapian::TermIterator term;	410	Xapian::TermIterator term;
378	int cutoff = 500 * 1000;	411	int cutoff = 500 * 1000;
379		412
380	for (term = db.termlist_begin(docid);	413	for (term = db.termlist_begin(docid);
	...		...
399	// Don't replace a term: the terms list is in	432	// Don't replace a term: the terms list is in
400	// alphabetic order, and we may have several terms	433	// alphabetic order, and we may have several terms
401	// at the same position, we want to keep only the	434	// at the same position, we want to keep only the
402	// first one (ie: dockes and dockes@wanadoo.fr)	435	// first one (ie: dockes and dockes@wanadoo.fr)
403	if (vit->second.empty()) {	436	if (vit->second.empty()) {
404	LOGDEB2(("makeAbstract: populating: [%s] at %d\n",	437	LOGABS(("makeAbstract: populating: [%s] at %d\n",
405	(term).c_str(), pos));	438	(term).c_str(), pos));
406	sparseDoc[pos] = term;	439	sparseDoc[pos] = term;
407	}	440	}
408	}	441	}
409	}	442	}
	...		...
426	LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));	459	LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
427	}	460	}
428	}	461	}
429	#endif	462	#endif
430		463
431	LOGDEB2(("makeAbstract:%d: randomizing and extracting\n", chron.millis()));	464	LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
432		465
433	// We randomize the selection of term positions, from which we	466	// Add "..." at ends of chunks
434	// shall pull, starting at the beginning, until the abstract is
435	// big enough. The abstract is finally built in correct position
436	// order, thanks to the position map.
437	random_shuffle(qtermposs.begin(), qtermposs.end());
438	map<unsigned int, string> mabs;
439	unsigned int abslen = 0;
440
441	// Extract data around the N first (in random order) query term
442	// positions, and store the terms in the map. Don't concatenate
443	// immediately into chunks because there might be overlaps
444	for (vector<unsigned int>::const_iterator pos = qtermposs.begin();	467	for (vector<unsigned int>::const_iterator pos = qtermposs.begin();
445	pos != qtermposs.end(); pos++) {	468	pos != qtermposs.end(); pos++) {
446
447	if (int(abslen) > m_db->m_synthAbsLen)
448	break;
449
450	unsigned int sta = MAX(0, *pos - m_db->m_synthAbsWordCtxLen);
451	unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;	469	unsigned int sto = *pos + m_db->m_synthAbsWordCtxLen;
452
453	LOGDEB2(("makeAbstract: %d<-%d->%d\n", sta, *pos, sto));
454
455	for (unsigned int ii = sta; ii <= sto; ii++) {
456
457	if (int(abslen) > m_db->m_synthAbsLen)
458	break;
459	map<unsigned int, string>::const_iterator vit =
460	sparseDoc.find(ii);
461	if (vit != sparseDoc.end() && !vit->second.empty()) {
462	LOGDEB2(("makeAbstract: position %d -> [%s]\n",
463	ii, vit->second.c_str()));
464	mabs[ii] = vit->second;
465	abslen += vit->second.length();
466	} else {
467	LOGDEB2(("makeAbstract: empty position at %d\n", ii));
468	}
469	}
470		470
471	// Possibly add a ... at the end of chunk if it's not	471	// Possibly add a ... at the end of chunk if it's not
472	// overlapping	472	// overlapping
473	if (mabs.find(sto+1) == mabs.end())	473	if (sparseDoc.find(sto) != sparseDoc.end() &&
		474	sparseDoc.find(sto+1) == sparseDoc.end())
474	mabs[sto+1] = "...";	475	sparseDoc[sto+1] = "...";
475	}	476	}
476		477
477	// Build the abstract by walking the map (in order of position)	478	// Finally build the abstract by walking the map (in order of position)
478	string abstract;	479	string abstract;
479	for (map<unsigned int, string>::const_iterator it = mabs.begin();	480	for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
480	it != mabs.end(); it++) {	481	it != sparseDoc.end(); it++) {
481	LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));	482	LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
482	abstract += it->second + " ";	483	abstract += it->second + " ";
483	}	484	}
484		485
485	// This happens for docs with no terms (only filename) indexed. I'll fix	486	// This happens for docs with no terms (only filename) indexed? I'll fix
486	// one day (yeah)	487	// one day (yeah)
487	if (!abstract.compare("... "))	488	if (!abstract.compare("... "))
488	abstract.clear();	489	abstract.clear();
489		490
490	LOGDEB(("makeAbtract: done in %d mS\n", chron.millis()));	491	LOGDEB(("makeAbtract: done in %d mS\n", chron.millis()));
	...		...
971	splitData.setprefix(emptystring);	972	splitData.setprefix(emptystring);
972	splitData.basepos += splitData.curpos + 100;	973	splitData.basepos += splitData.curpos + 100;
973	}	974	}
974	}	975	}
975		976
		977	if (splitData.curpos < baseTextPosition)
		978	splitData.basepos = baseTextPosition;
		979	else
		980	splitData.basepos += splitData.curpos + 100;
976		981
977	// Split and index body text	982	// Finally: split and index body text
978	LOGDEB2(("Db::add: split body\n"));	983	LOGDEB2(("Db::add: split body\n"));
979	if (!dumb_string(doc.text, noacc)) {	984	if (!dumb_string(doc.text, noacc)) {
980	LOGERR(("Db::add: dumb_string failed\n"));	985	LOGERR(("Db::add: dumb_string failed\n"));
981	return false;	986	return false;
982	}	987	}
983	splitter.text_to_words(noacc);	988	splitter.text_to_words(noacc);
984	splitData.basepos += splitData.curpos + 100;
985
986		989
987	////// Special terms for other metadata. No positions for these.	990	////// Special terms for other metadata. No positions for these.
988	// Mime type	991	// Mime type
989	newdocument.add_term("T" + doc.mimetype);	992	newdocument.add_term("T" + doc.mimetype);
990		993
	...		...
1423	names.push_back("XIMPOSSIBLE");	1426	names.push_back("XIMPOSSIBLE");
1424	}	1427	}
1425	return true;	1428	return true;
1426	}	1429	}
1427		1430
1428	// Prepare query out of "advanced search" data	1431	// Prepare query out of user search data
1429	bool Db::setQuery(RefCntr<SearchData> sdata, int opts,	1432	bool Db::setQuery(RefCntr<SearchData> sdata, int opts,
1430	const string& stemlang)	1433	const string& stemlang)
1431	{	1434	{
1432	if (!m_ndb) {	1435	if (!m_ndb) {
1433	LOGERR(("Db::setQuery: no db!\n"));	1436	LOGERR(("Db::setQuery: no db!\n"));
	...		...
1445	if (!sdata->toNativeQuery(*this, &xq,	1448	if (!sdata->toNativeQuery(*this, &xq,
1446	(opts & Db::QO_STEM) ? stemlang : "")) {	1449	(opts & Db::QO_STEM) ? stemlang : "")) {
1447	m_reason += sdata->getReason();	1450	m_reason += sdata->getReason();
1448	return false;	1451	return false;
1449	}	1452	}
1450
1451	m_ndb->query = xq;	1453	m_ndb->query = xq;
1452	delete m_ndb->enquire;	1454	delete m_ndb->enquire;
1453	m_ndb->enquire = new Xapian::Enquire(m_ndb->db);	1455	m_ndb->enquire = new Xapian::Enquire(m_ndb->db);
1454	m_ndb->enquire->set_query(m_ndb->query);	1456	m_ndb->enquire->set_query(m_ndb->query);
1455	m_ndb->mset = Xapian::MSet();	1457	m_ndb->mset = Xapian::MSet();