|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
|
... |
|
... |
71 |
#ifndef NO_NAMESPACES
|
71 |
#ifndef NO_NAMESPACES
|
72 |
namespace Rcl {
|
72 |
namespace Rcl {
|
73 |
#endif
|
73 |
#endif
|
74 |
|
74 |
|
75 |
const string pathelt_prefix = "XP";
|
75 |
const string pathelt_prefix = "XP";
|
|
|
76 |
static const string ellipsis("...");
|
76 |
|
77 |
|
77 |
string version_string(){
|
78 |
string version_string(){
|
78 |
return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
|
79 |
return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
|
79 |
string(Xapian::version_string());
|
80 |
string(Xapian::version_string());
|
80 |
}
|
81 |
}
|
|
... |
|
... |
243 |
// Build a document abstract by extracting text chunks around the query terms
|
244 |
// Build a document abstract by extracting text chunks around the query terms
|
244 |
// This uses the db termlists, not the original document.
|
245 |
// This uses the db termlists, not the original document.
|
245 |
//
|
246 |
//
|
246 |
// DatabaseModified and other general exceptions are caught and
|
247 |
// DatabaseModified and other general exceptions are caught and
|
247 |
// possibly retried by our caller
|
248 |
// possibly retried by our caller
|
248 |
string Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
249 |
vector<string> Db::Native::makeAbstract(Xapian::docid docid, Query *query)
|
249 |
{
|
250 |
{
|
250 |
Chrono chron;
|
251 |
Chrono chron;
|
251 |
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
252 |
LOGDEB2(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
252 |
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
|
253 |
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
|
253 |
|
254 |
|
|
... |
|
... |
257 |
list<string> iterms;
|
258 |
list<string> iterms;
|
258 |
query->getMatchTerms(docid, iterms);
|
259 |
query->getMatchTerms(docid, iterms);
|
259 |
noPrefixList(iterms, terms);
|
260 |
noPrefixList(iterms, terms);
|
260 |
if (terms.empty()) {
|
261 |
if (terms.empty()) {
|
261 |
LOGDEB(("makeAbstract::Empty term list\n"));
|
262 |
LOGDEB(("makeAbstract::Empty term list\n"));
|
262 |
return string();
|
263 |
return vector<string>();
|
263 |
}
|
264 |
}
|
264 |
}
|
265 |
}
|
265 |
// listList("Match terms: ", terms);
|
266 |
// listList("Match terms: ", terms);
|
266 |
|
267 |
|
267 |
// Retrieve db-wide frequencies for the query terms (we do this once per
|
268 |
// Retrieve db-wide frequencies for the query terms (we do this once per
|
|
... |
|
... |
351 |
m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
|
352 |
m_rcldb->m_synthAbsLen /(7 * (m_rcldb->m_synthAbsWordCtxLen+1));
|
352 |
LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
|
353 |
LOGABS(("makeAbstract:%d: mxttloccs %d\n", chron.ms(), maxtotaloccs));
|
353 |
// This can't happen, but would crash us
|
354 |
// This can't happen, but would crash us
|
354 |
if (totalweight == 0.0) {
|
355 |
if (totalweight == 0.0) {
|
355 |
LOGERR(("makeAbstract: 0 totalweight!\n"));
|
356 |
LOGERR(("makeAbstract: 0 totalweight!\n"));
|
356 |
return string();
|
357 |
return vector<string>();
|
357 |
}
|
358 |
}
|
358 |
|
359 |
|
359 |
// This is used to mark positions overlapped by a multi-word match term
|
360 |
// This is used to mark positions overlapped by a multi-word match term
|
360 |
const string occupiedmarker("?");
|
361 |
const string occupiedmarker("?");
|
361 |
const string ellipsis("...");
|
|
|
362 |
|
362 |
|
363 |
// Let's go populate
|
363 |
// Let's go populate
|
364 |
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
364 |
for (multimap<double, string>::reverse_iterator qit = byQ.rbegin();
|
365 |
qit != byQ.rend(); qit++) {
|
365 |
qit != byQ.rend(); qit++) {
|
366 |
string qterm = qit->second;
|
366 |
string qterm = qit->second;
|
|
... |
|
... |
437 |
chron.millis(), qtermposs.size()));
|
437 |
chron.millis(), qtermposs.size()));
|
438 |
|
438 |
|
439 |
// This can happen if there are term occurrences in the keywords
|
439 |
// This can happen if there are term occurrences in the keywords
|
440 |
// etc. but not elsewhere ?
|
440 |
// etc. but not elsewhere ?
|
441 |
if (qtermposs.size() == 0)
|
441 |
if (qtermposs.size() == 0)
|
442 |
return string();
|
442 |
return vector<string>();
|
443 |
|
443 |
|
444 |
// Walk all document's terms position lists and populate slots
|
444 |
// Walk all document's terms position lists and populate slots
|
445 |
// around the query terms. We arbitrarily truncate the list to
|
445 |
// around the query terms. We arbitrarily truncate the list to
|
446 |
// avoid taking forever. If we do cutoff, the abstract may be
|
446 |
// avoid taking forever. If we do cutoff, the abstract may be
|
447 |
// inconsistent (missing words, potentially altering meaning),
|
447 |
// inconsistent (missing words, potentially altering meaning),
|
|
... |
|
... |
502 |
#endif
|
502 |
#endif
|
503 |
|
503 |
|
504 |
LOGABS(("makeAbstract:%d: extracting\n", chron.millis()));
|
504 |
LOGABS(("makeAbstract:%d: extracting\n", chron.millis()));
|
505 |
|
505 |
|
506 |
// Finally build the abstract by walking the map (in order of position)
|
506 |
// Finally build the abstract by walking the map (in order of position)
|
507 |
string abstract;
|
507 |
vector<string> vabs;
|
508 |
abstract.reserve(sparseDoc.size() * 10);
|
508 |
string chunk;
|
509 |
bool incjk = false;
|
509 |
bool incjk = false;
|
510 |
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
510 |
for (map<unsigned int, string>::const_iterator it = sparseDoc.begin();
|
511 |
it != sparseDoc.end(); it++) {
|
511 |
it != sparseDoc.end(); it++) {
|
512 |
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
512 |
LOGDEB2(("Abtract:output %u -> [%s]\n", it->first,it->second.c_str()));
|
513 |
if (!occupiedmarker.compare(it->second))
|
513 |
if (!occupiedmarker.compare(it->second))
|
|
... |
|
... |
515 |
Utf8Iter uit(it->second);
|
515 |
Utf8Iter uit(it->second);
|
516 |
bool newcjk = false;
|
516 |
bool newcjk = false;
|
517 |
if (TextSplit::isCJK(*uit))
|
517 |
if (TextSplit::isCJK(*uit))
|
518 |
newcjk = true;
|
518 |
newcjk = true;
|
519 |
if (!incjk || (incjk && !newcjk))
|
519 |
if (!incjk || (incjk && !newcjk))
|
520 |
abstract += " ";
|
520 |
chunk += " ";
|
521 |
incjk = newcjk;
|
521 |
incjk = newcjk;
|
522 |
abstract += it->second;
|
522 |
if (it->second == ellipsis) {
|
|
|
523 |
vabs.push_back(chunk);
|
|
|
524 |
chunk.clear();
|
|
|
525 |
} else {
|
|
|
526 |
chunk += it->second;
|
|
|
527 |
}
|
523 |
}
|
528 |
}
|
524 |
|
529 |
if (!chunk.empty())
|
|
|
530 |
vabs.push_back(chunk);
|
525 |
// This happens for docs with no terms (only filename) indexed? I'll fix
|
531 |
// This happens for docs with no terms (only filename) indexed? I'll fix
|
526 |
// one day (yeah)
|
532 |
// one day (yeah)
|
527 |
if (!abstract.compare("... "))
|
533 |
if (vabs.size() == 1 && !vabs[0].compare("... "))
|
528 |
abstract.clear();
|
534 |
vabs.clear();
|
529 |
|
535 |
|
530 |
LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
|
536 |
LOGDEB2(("makeAbtract: done in %d mS\n", chron.millis()));
|
531 |
return abstract;
|
537 |
return vabs;
|
532 |
}
|
538 |
}
|
533 |
|
539 |
|
534 |
/* Rcl::Db methods ///////////////////////////////// */
|
540 |
/* Rcl::Db methods ///////////////////////////////// */
|
535 |
|
541 |
|
536 |
Db::Db(RclConfig *cfp)
|
542 |
Db::Db(RclConfig *cfp)
|
|
... |
|
... |
1740 |
return false;
|
1746 |
return false;
|
1741 |
}
|
1747 |
}
|
1742 |
return true;
|
1748 |
return true;
|
1743 |
}
|
1749 |
}
|
1744 |
|
1750 |
|
1745 |
|
|
|
1746 |
bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
|
1751 |
bool Db::makeDocAbstract(Doc &doc, Query *query, vector<string>& abstract)
|
1747 |
{
|
1752 |
{
|
1748 |
LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
|
1753 |
LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
|
1749 |
if (!m_ndb || !m_ndb->m_isopen) {
|
1754 |
if (!m_ndb || !m_ndb->m_isopen) {
|
1750 |
LOGERR(("Db::makeDocAbstract: no db\n"));
|
1755 |
LOGERR(("Db::makeDocAbstract: no db\n"));
|
1751 |
return false;
|
1756 |
return false;
|
1752 |
}
|
1757 |
}
|
1753 |
|
|
|
1754 |
XAPTRY(abstract = m_ndb->makeAbstract(doc.xdocid, query),
|
1758 |
XAPTRY(abstract = m_ndb->makeAbstract(doc.xdocid, query),
|
1755 |
m_ndb->xrdb, m_reason);
|
1759 |
m_ndb->xrdb, m_reason);
|
|
|
1760 |
return m_reason.empty() ? true : false;
|
|
|
1761 |
}
|
1756 |
|
1762 |
|
|
|
1763 |
bool Db::makeDocAbstract(Doc &doc, Query *query, string& abstract)
|
|
|
1764 |
{
|
|
|
1765 |
LOGDEB1(("Db::makeDocAbstract: exti %d\n", exti));
|
|
|
1766 |
if (!m_ndb || !m_ndb->m_isopen) {
|
|
|
1767 |
LOGERR(("Db::makeDocAbstract: no db\n"));
|
|
|
1768 |
return false;
|
|
|
1769 |
}
|
|
|
1770 |
vector<string> vab;
|
|
|
1771 |
XAPTRY(vab = m_ndb->makeAbstract(doc.xdocid, query),
|
|
|
1772 |
m_ndb->xrdb, m_reason);
|
|
|
1773 |
for (vector<string>::const_iterator it = vab.begin();
|
|
|
1774 |
it != vab.end(); it++) {
|
|
|
1775 |
abstract.append(*it);
|
|
|
1776 |
abstract.append(ellipsis);
|
|
|
1777 |
}
|
1757 |
return m_reason.empty() ? true : false;
|
1778 |
return m_reason.empty() ? true : false;
|
1758 |
}
|
1779 |
}
|
1759 |
|
1780 |
|
1760 |
// Retrieve document defined by Unique doc identifier. This is mainly used
|
1781 |
// Retrieve document defined by Unique doc identifier. This is mainly used
|
1761 |
// by the GUI history feature
|
1782 |
// by the GUI history feature
|