|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
|
... |
|
... |
171 |
}
|
171 |
}
|
172 |
return true;
|
172 |
return true;
|
173 |
}
|
173 |
}
|
174 |
|
174 |
|
175 |
// Remove prefixes (caps) from a list of terms.
|
175 |
// Remove prefixes (caps) from a list of terms.
|
176 |
static list<string> noPrefixList(const list<string>& in)
|
176 |
static void noPrefixList(const list<string>& in, list<string>& out)
|
177 |
{
|
177 |
{
|
178 |
list<string> out;
|
|
|
179 |
for (list<string>::const_iterator qit = in.begin();
|
178 |
for (list<string>::const_iterator qit = in.begin();
|
180 |
qit != in.end(); qit++) {
|
179 |
qit != in.end(); qit++) {
|
181 |
if ('A' <= qit->at(0) && qit->at(0) <= 'Z') {
|
180 |
if ('A' <= qit->at(0) && qit->at(0) <= 'Z') {
|
182 |
string term = *qit;
|
181 |
string term = *qit;
|
183 |
while (term.length() && 'A' <= term.at(0) && term.at(0) <= 'Z')
|
182 |
while (term.length() && 'A' <= term.at(0) && term.at(0) <= 'Z')
|
|
... |
|
... |
187 |
continue;
|
186 |
continue;
|
188 |
} else {
|
187 |
} else {
|
189 |
out.push_back(*qit);
|
188 |
out.push_back(*qit);
|
190 |
}
|
189 |
}
|
191 |
}
|
190 |
}
|
192 |
return out;
|
|
|
193 |
}
|
191 |
}
|
194 |
|
192 |
|
195 |
//#define DEBUGABSTRACT 1
|
193 |
//#define DEBUGABSTRACT 1
|
196 |
#ifdef DEBUGABSTRACT
|
194 |
#ifdef DEBUGABSTRACT
|
197 |
#define LOGABS LOGDEB
|
195 |
#define LOGABS LOGDEB
|
198 |
#else
|
196 |
#else
|
199 |
#define LOGABS LOGDEB2
|
197 |
#define LOGABS LOGDEB2
|
200 |
#endif
|
198 |
#endif
|
|
|
199 |
static void listList(const string& what, const list<string>&l)
|
|
|
200 |
{
|
|
|
201 |
string a;
|
|
|
202 |
for (list<string>::const_iterator it = l.begin(); it != l.end(); it++) {
|
|
|
203 |
a = a + *it + " ";
|
|
|
204 |
}
|
|
|
205 |
LOGDEB(("%s: %s\n", what.c_str(), a.c_str()));
|
|
|
206 |
}
|
201 |
|
207 |
|
202 |
// Build a document abstract by extracting text chunks around the query terms
|
208 |
// Build a document abstract by extracting text chunks around the query terms
|
203 |
// This uses the db termlists, not the original document.
|
209 |
// This uses the db termlists, not the original document.
|
204 |
//
|
210 |
//
|
205 |
// DatabaseModified and other general exceptions are caught and
|
211 |
// DatabaseModified and other general exceptions are caught and
|
|
... |
|
... |
208 |
{
|
214 |
{
|
209 |
Chrono chron;
|
215 |
Chrono chron;
|
210 |
LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
216 |
LOGDEB(("makeAbstract:%d: maxlen %d wWidth %d\n", chron.ms(),
|
211 |
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
|
217 |
m_rcldb->m_synthAbsLen, m_rcldb->m_synthAbsWordCtxLen));
|
212 |
|
218 |
|
213 |
list<string> iterms;
|
219 |
list<string> terms;
|
214 |
query->getQueryTerms(iterms);
|
|
|
215 |
|
220 |
|
216 |
list<string> terms = noPrefixList(iterms);
|
221 |
{
|
|
|
222 |
list<string> iterms;
|
|
|
223 |
query->getMatchTerms(docid, iterms);
|
|
|
224 |
noPrefixList(iterms, terms);
|
217 |
if (terms.empty()) {
|
225 |
if (terms.empty()) {
|
218 |
return string();
|
226 |
LOGDEB(("makeAbstract::Empty term list\n"));
|
|
|
227 |
return string();
|
219 |
}
|
228 |
}
|
|
|
229 |
}
|
|
|
230 |
// listList("Match terms: ", terms);
|
220 |
|
231 |
|
221 |
// Retrieve db-wide frequencies for the query terms
|
232 |
// Retrieve db-wide frequencies for the query terms (we do this once per
|
|
|
233 |
// query, using all the query terms, not only the document match terms)
|
222 |
if (query->m_nq->termfreqs.empty()) {
|
234 |
if (query->m_nq->termfreqs.empty()) {
|
|
|
235 |
list<string> iqterms, qterms;
|
|
|
236 |
query->getQueryTerms(iqterms);
|
|
|
237 |
noPrefixList(iqterms, qterms);
|
|
|
238 |
// listList("Query terms: ", qterms);
|
223 |
double doccnt = xrdb.get_doccount();
|
239 |
double doccnt = xrdb.get_doccount();
|
224 |
if (doccnt == 0) doccnt = 1;
|
240 |
if (doccnt == 0) doccnt = 1;
|
225 |
for (list<string>::const_iterator qit = terms.begin();
|
241 |
for (list<string>::const_iterator qit = qterms.begin();
|
226 |
qit != terms.end(); qit++) {
|
242 |
qit != qterms.end(); qit++) {
|
227 |
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
243 |
query->m_nq->termfreqs[*qit] = xrdb.get_termfreq(*qit) / doccnt;
|
228 |
LOGABS(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
|
244 |
LOGDEB(("makeAbstract: [%s] db freq %.1e\n", qit->c_str(),
|
229 |
query->m_nq->termfreqs[*qit]));
|
245 |
query->m_nq->termfreqs[*qit]));
|
230 |
}
|
246 |
}
|
231 |
LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
|
247 |
LOGABS(("makeAbstract:%d: got termfreqs\n", chron.ms()));
|
232 |
}
|
248 |
}
|
233 |
|
249 |
|
|
... |
|
... |
448 |
LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
464 |
LOGDEB(("makeAbstract:vec[%d]: [%s]\n", ipos, it->c_str()));
|
449 |
}
|
465 |
}
|
450 |
}
|
466 |
}
|
451 |
#endif
|
467 |
#endif
|
452 |
|
468 |
|
453 |
LOGDEB(("makeAbstract:%d: extracting\n", chron.millis()));
|
469 |
LOGABS(("makeAbstract:%d: extracting\n", chron.millis()));
|
454 |
|
470 |
|
455 |
// Finally build the abstract by walking the map (in order of position)
|
471 |
// Finally build the abstract by walking the map (in order of position)
|
456 |
string abstract;
|
472 |
string abstract;
|
457 |
abstract.reserve(sparseDoc.size() * 10);
|
473 |
abstract.reserve(sparseDoc.size() * 10);
|
458 |
bool incjk = false;
|
474 |
bool incjk = false;
|