...
#include "rclquery.h"
#include "rclquery_p.h"
#include "md5.h"
#include "rclversion.h"
#include "cancelcheck.h"
#include "ptmutex.h"

// Evaluate to the larger of A and B (B when they compare equal).
// Each argument is parenthesized so that operands containing
// lower-precedence operators (e.g. MAX(x & mask, y)) group the way the
// caller wrote them instead of being re-associated with '>' or '?:'.
// The selected argument is evaluated twice: do not pass expressions
// with side effects.
#ifndef MAX
#define MAX(A,B) ((A) > (B) ? (A) : (B))
#endif
// Evaluate to the smaller of A and B (B when they compare equal).
// Each argument is parenthesized so that operands containing
// lower-precedence operators (e.g. MIN(x & mask, y)) group the way the
// caller wrote them instead of being re-associated with '<' or '?:'.
// The selected argument is evaluated twice: do not pass expressions
// with side effects.
#ifndef MIN
#define MIN(A,B) ((A) < (B) ? (A) : (B))
#endif

// Recoll index format version is stored in user metadata. When this changes,
// we can't open the db and will have to reindex.
static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
static const string cstr_RCL_IDX_VERSION("1");

// This is the word position offset at which we index the body text
// (abstract, keywords, etc.. are stored before this)
static const unsigned int baseTextPosition = 100000;

...
const string end_of_field_term = "XXND";

// This is used as a marker inside the abstract frag lists, but
// normally doesn't remain in final output (which is built with a
// custom sep. by our caller).
static const string cstr_ellipsis("...");

// Build the version banner: "Recoll <recoll version> + Xapian <xapian version>"
string version_string()
{
    string banner("Recoll ");
    banner += rclversionstr;
    banner += " + Xapian ";
    banner += Xapian::version_string();
    return banner;
}

// Synthetic abstract marker (to discriminate from abstract actually
// found in document)
static const string cstr_syntAbs("?!#@");

// Only ONE field name inside the index data record differs from the
// Rcl::Doc ones: caption<->title, for a remnant of compatibility with
// omega
static const string cstr_keycap("caption");

// Static/Default table for field->prefix/weight translation. 
// This is logically const after initialization. Can't use a
// static object to init this as the static std::string objects may
// not be ready.
...
// "fields" configuration (cf: Db::fieldToTraits()), meaning that the
// entries can be overridden in the configuration, but not
// suppressed. 

static map<string, FieldTraits> fldToTraits;
static PTMutexInit o_fldToTraits_mutex;

static void initFldToTraits() 
{
    PTMutexLocker locker(o_fldToTraits_mutex);
    // As we perform non-locked testing of initialization, check again with
    // the lock held
    if (fldToTraits.size())
  return;

    // Can't remember why "abstract" is indexed without a prefix
    // (result: it's indexed twice actually). Maybe I'll dare change
    // this one day
    fldToTraits[Doc::keyabs] = FieldTraits();

    fldToTraits["ext"] = FieldTraits("XE");
    fldToTraits[Doc::keyfn] = FieldTraits("XSFN");

    fldToTraits[cstr_keycap] = FieldTraits("S");
    fldToTraits[Doc::keytt] = FieldTraits("S");
    fldToTraits["subject"] = FieldTraits("S");

    fldToTraits[Doc::keyau] = FieldTraits("A");
    fldToTraits["creator"] = FieldTraits("A");
...
    parms.get(Doc::keyurl, doc.url);
    parms.get(Doc::keytp, doc.mimetype);
    parms.get(Doc::keyfmt, doc.fmtime);
    parms.get(Doc::keydmt, doc.dmtime);
    parms.get(Doc::keyoc, doc.origcharset);
    parms.get(cstr_keycap, doc.meta[Doc::keytt]);
    parms.get(Doc::keykw, doc.meta[Doc::keykw]);
    parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
    // Possibly remove synthetic abstract indicator (if it's there, we
    // used to index the beginning of the text as abstract).
    doc.syntabs = false;
    if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
    doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
    doc.syntabs = true;
    }
    parms.get(Doc::keyipt, doc.ipath);
    parms.get(Doc::keyfs, doc.fbytes);
    parms.get(Doc::keyds, doc.dbytes);
...
            if (ii == (unsigned int)ipos) {
            sparseDoc[ii] = qterm;
            } else if (ii > (unsigned int)ipos && 
                   ii < (unsigned int)ipos + qtrmwrdcnt) {
            sparseDoc[ii] = occupiedmarker;
            } else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
            // For an empty slot, the test has a side
            // effect of inserting an empty string which
            // is what we want
            sparseDoc[ii] = emptys;
            }
...
        // Add ellipsis at the end. This may be replaced later by
        // an overlapping extract. Take care not to replace an
        // empty string here, we really want an empty slot,
        // use find()
        if (sparseDoc.find(sto+1) == sparseDoc.end()) {
            sparseDoc[sto+1] = cstr_ellipsis;
        }

        // Limit to allocated occurrences and total size
        if (++occurrences >= maxoccs || 
            qtermposs.size() >= maxtotaloccs)
...
    if (TextSplit::isCJK(*uit))
        newcjk = true;
    if (!incjk || (incjk && !newcjk))
        chunk += " ";
    incjk = newcjk;
    if (it->second == cstr_ellipsis) {
        vabs.push_back(chunk);
        chunk.clear();
    } else {
        chunk += it->second;
    }
...
            Xapian::DB_CREATE_OR_OVERWRITE;
        m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
                // If db is empty, write the data format version at once
                // to avoid stupid error messages:
                if (m_ndb->xwdb.get_doccount() == 0)
                    m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, 
                                             cstr_RCL_IDX_VERSION);
        m_ndb->m_iswritable = true;
        // We open a readonly object in all cases (possibly in
        // addition to the r/w one) because some operations
        // are faster when performed through a Database: no
        // forced flushes on allterms_begin(), ie, used in
...
        *error = DbOpenMainDb;

    // Check index format version. Must not try to check a just created or
    // truncated db
    if (mode != DbTrunc && m_ndb->xdb().get_doccount() > 0) {
        string version = m_ndb->xdb().get_metadata(cstr_RCL_IDX_VERSION_KEY);
        if (version.compare(cstr_RCL_IDX_VERSION)) {
        m_ndb->m_noversionwrite = true;
        LOGERR(("Rcl::Db::open: file index [%s], software [%s]\n",
            version.c_str(), cstr_RCL_IDX_VERSION.c_str()));
        throw Xapian::DatabaseError("Recoll index version mismatch",
                        "", "");
        }
    }
    m_mode = mode;
...
    string ermsg;
    try {
    bool w = m_ndb->m_iswritable;
    if (w) {
        if (!m_ndb->m_noversionwrite)
        m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
        LOGDEB(("Rcl::Db:close: xapian will close. May take some time\n"));
    }
    // Used to do a flush here. Can't see why it should be necessary.
    deleteZ(m_ndb);
    if (w)
...
    if (syntctxlen > 0)
    m_synthAbsWordCtxLen = syntctxlen;
}

// Number of bytes in one megabyte (2^20)
static const int MB = 1024 * 1024;
// Line feed, carriage return and form feed: the character set handed to
// neutchars() for the field values which are appended to the document
// data record (presumably so that a value can't break the
// one-entry-per-line record format -- confirm against neutchars())
static const string cstr_nc("\n\r\x0c");

// Append one "NM=VAL\n" entry to the record string R.
// NM must be a string object (not a char pointer) so that the first '+'
// is a string concatenation. The expansion is a bare brace block and the
// arguments are not parenthesized: pass simple expressions only.
#define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";}

// Add document in internal form to the database: index the terms in
// the title abstract and body and add special terms for file name,
...
    RECORD_APPEND(record, Doc::keyipt, doc.ipath);

    if (doc.meta[Doc::keytt].empty())
    doc.meta[Doc::keytt] = doc.utf8fn;
    doc.meta[Doc::keytt] = 
    neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
    if (!doc.meta[Doc::keytt].empty())
    RECORD_APPEND(record, cstr_keycap, doc.meta[Doc::keytt]);

    trimstring(doc.meta[Doc::keykw], " \t\r\n");
    doc.meta[Doc::keykw] = 
    neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
    if (!doc.meta[Doc::keykw].empty())
    RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);

    // If abstract is empty, we make up one with the beginning of the
    // document. This is then not indexed, but part of the doc data so
...
    // don't exist yet.
    trimstring(doc.meta[Doc::keyabs], " \t\r\n");
    if (doc.meta[Doc::keyabs].empty()) {
    syntabs = true;
    if (!doc.text.empty())
        doc.meta[Doc::keyabs] = cstr_syntAbs + 
        neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
    } else {
    doc.meta[Doc::keyabs] = 
        neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
              cstr_nc);
    }
    if (!doc.meta[Doc::keyabs].empty())
    RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);

    const set<string>& stored = m_config->getStoredFields();
    for (set<string>::const_iterator it = stored.begin();
     it != stored.end(); it++) {
    string nm = m_config->fieldCanon(*it);
    if (!doc.meta[*it].empty()) {
        string value = 
        neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
        RECORD_APPEND(record, nm, value);
    }
    }

    // If the file's md5 was computed, add value. This is optionally
...
}

// Characters that can begin a wildcard or regexp expression. We use skipto
// to begin the allterms search with terms that begin with the portion of
// the input string prior to these chars.
const string cstr_wildSpecChars = "*?[";
const string cstr_regSpecChars = "(.[{";

// Find all index terms that match a wildcard or regular expression
bool Db::termMatch(MatchType typ, const string &lang,
           const string &root, 
           TermMatchResult& res,
...
    string droot;
    if (!unacmaybefold(root, droot, "UTF-8", true)) {
    LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
    return false;
    }
    string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;

    string prefix;
    if (!field.empty()) {
    const FieldTraits *ftp = 0;
    if (!fieldToTraits(field, &ftp) || ftp->pfx.empty()) {
...
    XAPTRY(vab = m_ndb->makeAbstract(doc.xdocid, query),
           m_ndb->xrdb, m_reason);
    for (vector<string>::const_iterator it = vab.begin(); 
     it != vab.end(); it++) {
    abstract.append(*it);
    abstract.append(cstr_ellipsis);
    }
    return m_reason.empty() ? true : false;
}

// Retrieve document defined by Unique doc identifier. This is mainly used

	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
	...		...
49	#include "rclquery.h"	49	#include "rclquery.h"
50	#include "rclquery_p.h"	50	#include "rclquery_p.h"
51	#include "md5.h"	51	#include "md5.h"
52	#include "rclversion.h"	52	#include "rclversion.h"
53	#include "cancelcheck.h"	53	#include "cancelcheck.h"
		54	#include "ptmutex.h"
54		55
55	#ifndef MAX	56	#ifndef MAX
56	#define MAX(A,B) (A>B?A:B)	57	#define MAX(A,B) (A>B?A:B)
57	#endif	58	#endif
58	#ifndef MIN	59	#ifndef MIN
59	#define MIN(A,B) (A<B?A:B)	60	#define MIN(A,B) (A<B?A:B)
60	#endif	61	#endif
61		62
62	// Recoll index format version is stored in user metadata. When this change,	63	// Recoll index format version is stored in user metadata. When this change,
63	// we can't open the db and will have to reindex.	64	// we can't open the db and will have to reindex.
64	static const string RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");	65	static const string cstr_RCL_IDX_VERSION_KEY("RCL_IDX_VERSION_KEY");
65	static const string RCL_IDX_VERSION("1");	66	static const string cstr_RCL_IDX_VERSION("1");
66		67
67	// This is the word position offset at which we index the body text	68	// This is the word position offset at which we index the body text
68	// (abstract, keywords, etc.. are stored before this)	69	// (abstract, keywords, etc.. are stored before this)
69	static const unsigned int baseTextPosition = 100000;	70	static const unsigned int baseTextPosition = 100000;
70		71
	...		...
77	const string end_of_field_term = "XXND";	78	const string end_of_field_term = "XXND";
78		79
79	// This is used as a marker inside the abstract frag lists, but	80	// This is used as a marker inside the abstract frag lists, but
80	// normally doesn't remain in final output (which is built with a	81	// normally doesn't remain in final output (which is built with a
81	// custom sep. by our caller).	82	// custom sep. by our caller).
82	static const string ellipsis("...");	83	static const string cstr_ellipsis("...");
83		84
84	string version_string(){	85	string version_string(){
85	return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +	86	return string("Recoll ") + string(rclversionstr) + string(" + Xapian ") +
86	string(Xapian::version_string());	87	string(Xapian::version_string());
87	}	88	}
88		89
89	// Synthetic abstract marker (to discriminate from abstract actually	90	// Synthetic abstract marker (to discriminate from abstract actually
90	// found in document)	91	// found in document)
91	static const string rclSyntAbs("?!#@");	92	static const string cstr_syntAbs("?!#@");
92		93
93	// Only ONE field name inside the index data record differs from the	94	// Only ONE field name inside the index data record differs from the
94	// Rcl::Doc ones: caption<->title, for a remnant of compatibility with	95	// Rcl::Doc ones: caption<->title, for a remnant of compatibility with
95	// omega	96	// omega
96	static const string keycap("caption");	97	static const string cstr_keycap("caption");
97		98
98	// Static/Default table for field->prefix/weight translation.	99	// Static/Default table for field->prefix/weight translation.
99	// This is logically const after initialization. Can't use a	100	// This is logically const after initialization. Can't use a
100	// static object to init this as the static std::string objects may	101	// static object to init this as the static std::string objects may
101	// not be ready.	102	// not be ready.
	...		...
104	// "fields" configuration (cf: Db::fieldToTraits()), meaning that the	105	// "fields" configuration (cf: Db::fieldToTraits()), meaning that the
105	// entries can be overriden in the configuration, but not	106	// entries can be overriden in the configuration, but not
106	// suppressed.	107	// suppressed.
107		108
108	static map<string, FieldTraits> fldToTraits;	109	static map<string, FieldTraits> fldToTraits;
		110	static PTMutexInit o_fldToTraits_mutex;
		111
109	static void initFldToTraits()	112	static void initFldToTraits()
110	{	113	{
		114	PTMutexLocker locker(o_fldToTraits_mutex);
		115	// As we perform non-locked testing of initialization, check again with
		116	// the lock held
		117	if (fldToTraits.size())
		118	return;
		119
111	// Can't remember why "abstract" is indexed without a prefix	120	// Can't remember why "abstract" is indexed without a prefix
112	// (result: it's indexed twice actually). Maybe I'll dare change	121	// (result: it's indexed twice actually). Maybe I'll dare change
113	// this one day	122	// this one day
114	fldToTraits[Doc::keyabs] = FieldTraits();	123	fldToTraits[Doc::keyabs] = FieldTraits();
115		124
116	fldToTraits["ext"] = FieldTraits("XE");	125	fldToTraits["ext"] = FieldTraits("XE");
117	fldToTraits[Doc::keyfn] = FieldTraits("XSFN");	126	fldToTraits[Doc::keyfn] = FieldTraits("XSFN");
118		127
119	fldToTraits[keycap] = FieldTraits("S");	128	fldToTraits[cstr_keycap] = FieldTraits("S");
120	fldToTraits[Doc::keytt] = FieldTraits("S");	129	fldToTraits[Doc::keytt] = FieldTraits("S");
121	fldToTraits["subject"] = FieldTraits("S");	130	fldToTraits["subject"] = FieldTraits("S");
122		131
123	fldToTraits[Doc::keyau] = FieldTraits("A");	132	fldToTraits[Doc::keyau] = FieldTraits("A");
124	fldToTraits["creator"] = FieldTraits("A");	133	fldToTraits["creator"] = FieldTraits("A");
	...		...
187	parms.get(Doc::keyurl, doc.url);	196	parms.get(Doc::keyurl, doc.url);
188	parms.get(Doc::keytp, doc.mimetype);	197	parms.get(Doc::keytp, doc.mimetype);
189	parms.get(Doc::keyfmt, doc.fmtime);	198	parms.get(Doc::keyfmt, doc.fmtime);
190	parms.get(Doc::keydmt, doc.dmtime);	199	parms.get(Doc::keydmt, doc.dmtime);
191	parms.get(Doc::keyoc, doc.origcharset);	200	parms.get(Doc::keyoc, doc.origcharset);
192	parms.get(keycap, doc.meta[Doc::keytt]);	201	parms.get(cstr_keycap, doc.meta[Doc::keytt]);
193	parms.get(Doc::keykw, doc.meta[Doc::keykw]);	202	parms.get(Doc::keykw, doc.meta[Doc::keykw]);
194	parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);	203	parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
195	// Possibly remove synthetic abstract indicator (if it's there, we	204	// Possibly remove synthetic abstract indicator (if it's there, we
196	// used to index the beginning of the text as abstract).	205	// used to index the beginning of the text as abstract).
197	doc.syntabs = false;	206	doc.syntabs = false;
198	if (doc.meta[Doc::keyabs].find(rclSyntAbs) == 0) {	207	if (doc.meta[Doc::keyabs].find(cstr_syntAbs) == 0) {
199	doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(rclSyntAbs.length());	208	doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(cstr_syntAbs.length());
200	doc.syntabs = true;	209	doc.syntabs = true;
201	}	210	}
202	parms.get(Doc::keyipt, doc.ipath);	211	parms.get(Doc::keyipt, doc.ipath);
203	parms.get(Doc::keyfs, doc.fbytes);	212	parms.get(Doc::keyfs, doc.fbytes);
204	parms.get(Doc::keyds, doc.dbytes);	213	parms.get(Doc::keyds, doc.dbytes);
	...		...
415	if (ii == (unsigned int)ipos) {	424	if (ii == (unsigned int)ipos) {
416	sparseDoc[ii] = qterm;	425	sparseDoc[ii] = qterm;
417	} else if (ii > (unsigned int)ipos &&	426	} else if (ii > (unsigned int)ipos &&
418	ii < (unsigned int)ipos + qtrmwrdcnt) {	427	ii < (unsigned int)ipos + qtrmwrdcnt) {
419	sparseDoc[ii] = occupiedmarker;	428	sparseDoc[ii] = occupiedmarker;
420	} else if (!sparseDoc[ii].compare(ellipsis)) {	429	} else if (!sparseDoc[ii].compare(cstr_ellipsis)) {
421	// For an empty slot, the test has a side	430	// For an empty slot, the test has a side
422	// effect of inserting an empty string which	431	// effect of inserting an empty string which
423	// is what we want	432	// is what we want
424	sparseDoc[ii] = emptys;	433	sparseDoc[ii] = emptys;
425	}	434	}
	...		...
427	// Add ellipsis at the end. This may be replaced later by	436	// Add ellipsis at the end. This may be replaced later by
428	// an overlapping extract. Take care not to replace an	437	// an overlapping extract. Take care not to replace an
429	// empty string here, we really want an empty slot,	438	// empty string here, we really want an empty slot,
430	// use find()	439	// use find()
431	if (sparseDoc.find(sto+1) == sparseDoc.end()) {	440	if (sparseDoc.find(sto+1) == sparseDoc.end()) {
432	sparseDoc[sto+1] = ellipsis;	441	sparseDoc[sto+1] = cstr_ellipsis;
433	}	442	}
434		443
435	// Limit to allocated occurences and total size	444	// Limit to allocated occurences and total size
436	if (++occurrences >= maxoccs \|\|	445	if (++occurrences >= maxoccs \|\|
437	qtermposs.size() >= maxtotaloccs)	446	qtermposs.size() >= maxtotaloccs)
	...		...
529	if (TextSplit::isCJK(*uit))	538	if (TextSplit::isCJK(*uit))
530	newcjk = true;	539	newcjk = true;
531	if (!incjk \|\| (incjk && !newcjk))	540	if (!incjk \|\| (incjk && !newcjk))
532	chunk += " ";	541	chunk += " ";
533	incjk = newcjk;	542	incjk = newcjk;
534	if (it->second == ellipsis) {	543	if (it->second == cstr_ellipsis) {
535	vabs.push_back(chunk);	544	vabs.push_back(chunk);
536	chunk.clear();	545	chunk.clear();
537	} else {	546	} else {
538	chunk += it->second;	547	chunk += it->second;
539	}	548	}
	...		...
610	Xapian::DB_CREATE_OR_OVERWRITE;	619	Xapian::DB_CREATE_OR_OVERWRITE;
611	m_ndb->xwdb = Xapian::WritableDatabase(dir, action);	620	m_ndb->xwdb = Xapian::WritableDatabase(dir, action);
612	// If db is empty, write the data format version at once	621	// If db is empty, write the data format version at once
613	// to avoid stupid error messages:	622	// to avoid stupid error messages:
614	if (m_ndb->xwdb.get_doccount() == 0)	623	if (m_ndb->xwdb.get_doccount() == 0)
615	m_ndb->xwdb.set_metadata(RCL_IDX_VERSION_KEY,	624	m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY,
616	RCL_IDX_VERSION);	625	cstr_RCL_IDX_VERSION);
617	m_ndb->m_iswritable = true;	626	m_ndb->m_iswritable = true;
618	// We open a readonly object in all cases (possibly in	627	// We open a readonly object in all cases (possibly in
619	// addition to the r/w one) because some operations	628	// addition to the r/w one) because some operations
620	// are faster when performed through a Database: no	629	// are faster when performed through a Database: no
621	// forced flushes on allterms_begin(), ie, used in	630	// forced flushes on allterms_begin(), ie, used in
	...		...
648	*error = DbOpenMainDb;	657	*error = DbOpenMainDb;
649		658
650	// Check index format version. Must not try to check a just created or	659	// Check index format version. Must not try to check a just created or
651	// truncated db	660	// truncated db
652	if (mode != DbTrunc && m_ndb->xdb().get_doccount() > 0) {	661	if (mode != DbTrunc && m_ndb->xdb().get_doccount() > 0) {
653	string version = m_ndb->xdb().get_metadata(RCL_IDX_VERSION_KEY);	662	string version = m_ndb->xdb().get_metadata(cstr_RCL_IDX_VERSION_KEY);
654	if (version.compare(RCL_IDX_VERSION)) {	663	if (version.compare(cstr_RCL_IDX_VERSION)) {
655	m_ndb->m_noversionwrite = true;	664	m_ndb->m_noversionwrite = true;
656	LOGERR(("Rcl::Db::open: file index [%s], software [%s]\n",	665	LOGERR(("Rcl::Db::open: file index [%s], software [%s]\n",
657	version.c_str(), RCL_IDX_VERSION.c_str()));	666	version.c_str(), cstr_RCL_IDX_VERSION.c_str()));
658	throw Xapian::DatabaseError("Recoll index version mismatch",	667	throw Xapian::DatabaseError("Recoll index version mismatch",
659	"", "");	668	"", "");
660	}	669	}
661	}	670	}
662	m_mode = mode;	671	m_mode = mode;
	...		...
691	string ermsg;	700	string ermsg;
692	try {	701	try {
693	bool w = m_ndb->m_iswritable;	702	bool w = m_ndb->m_iswritable;
694	if (w) {	703	if (w) {
695	if (!m_ndb->m_noversionwrite)	704	if (!m_ndb->m_noversionwrite)
696	m_ndb->xwdb.set_metadata(RCL_IDX_VERSION_KEY, RCL_IDX_VERSION);	705	m_ndb->xwdb.set_metadata(cstr_RCL_IDX_VERSION_KEY, cstr_RCL_IDX_VERSION);
697	LOGDEB(("Rcl::Db:close: xapian will close. May take some time\n"));	706	LOGDEB(("Rcl::Db:close: xapian will close. May take some time\n"));
698	}	707	}
699	// Used to do a flush here. Cant see why it should be necessary.	708	// Used to do a flush here. Cant see why it should be necessary.
700	deleteZ(m_ndb);	709	deleteZ(m_ndb);
701	if (w)	710	if (w)
	...		...
950	if (syntctxlen > 0)	959	if (syntctxlen > 0)
951	m_synthAbsWordCtxLen = syntctxlen;	960	m_synthAbsWordCtxLen = syntctxlen;
952	}	961	}
953		962
954	static const int MB = 1024 * 1024;	963	static const int MB = 1024 * 1024;
955	static const string nc("\n\r\x0c");	964	static const string cstr_nc("\n\r\x0c");
956		965
957	#define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";}	966	#define RECORD_APPEND(R, NM, VAL) {R += NM + "=" + VAL + "\n";}
958		967
959	// Add document in internal form to the database: index the terms in	968	// Add document in internal form to the database: index the terms in
960	// the title abstract and body and add special terms for file name,	969	// the title abstract and body and add special terms for file name,
	...		...
1166	RECORD_APPEND(record, Doc::keyipt, doc.ipath);	1175	RECORD_APPEND(record, Doc::keyipt, doc.ipath);
1167		1176
1168	if (doc.meta[Doc::keytt].empty())	1177	if (doc.meta[Doc::keytt].empty())
1169	doc.meta[Doc::keytt] = doc.utf8fn;	1178	doc.meta[Doc::keytt] = doc.utf8fn;
1170	doc.meta[Doc::keytt] =	1179	doc.meta[Doc::keytt] =
1171	neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc);	1180	neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), cstr_nc);
1172	if (!doc.meta[Doc::keytt].empty())	1181	if (!doc.meta[Doc::keytt].empty())
1173	RECORD_APPEND(record, keycap, doc.meta[Doc::keytt]);	1182	RECORD_APPEND(record, cstr_keycap, doc.meta[Doc::keytt]);
1174		1183
1175	trimstring(doc.meta[Doc::keykw], " \t\r\n");	1184	trimstring(doc.meta[Doc::keykw], " \t\r\n");
1176	doc.meta[Doc::keykw] =	1185	doc.meta[Doc::keykw] =
1177	neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc);	1186	neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), cstr_nc);
1178	if (!doc.meta[Doc::keykw].empty())	1187	if (!doc.meta[Doc::keykw].empty())
1179	RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);	1188	RECORD_APPEND(record, Doc::keykw, doc.meta[Doc::keykw]);
1180		1189
1181	// If abstract is empty, we make up one with the beginning of the	1190	// If abstract is empty, we make up one with the beginning of the
1182	// document. This is then not indexed, but part of the doc data so	1191	// document. This is then not indexed, but part of the doc data so
	...		...
1187	// don't exist yet.	1196	// don't exist yet.
1188	trimstring(doc.meta[Doc::keyabs], " \t\r\n");	1197	trimstring(doc.meta[Doc::keyabs], " \t\r\n");
1189	if (doc.meta[Doc::keyabs].empty()) {	1198	if (doc.meta[Doc::keyabs].empty()) {
1190	syntabs = true;	1199	syntabs = true;
1191	if (!doc.text.empty())	1200	if (!doc.text.empty())
1192	doc.meta[Doc::keyabs] = rclSyntAbs +	1201	doc.meta[Doc::keyabs] = cstr_syntAbs +
1193	neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), nc);	1202	neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), cstr_nc);
1194	} else {	1203	} else {
1195	doc.meta[Doc::keyabs] =	1204	doc.meta[Doc::keyabs] =
1196	neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),	1205	neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
1197	nc);	1206	cstr_nc);
1198	}	1207	}
1199	if (!doc.meta[Doc::keyabs].empty())	1208	if (!doc.meta[Doc::keyabs].empty())
1200	RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);	1209	RECORD_APPEND(record, Doc::keyabs, doc.meta[Doc::keyabs]);
1201		1210
1202	const set<string>& stored = m_config->getStoredFields();	1211	const set<string>& stored = m_config->getStoredFields();
1203	for (set<string>::const_iterator it = stored.begin();	1212	for (set<string>::const_iterator it = stored.begin();
1204	it != stored.end(); it++) {	1213	it != stored.end(); it++) {
1205	string nm = m_config->fieldCanon(*it);	1214	string nm = m_config->fieldCanon(*it);
1206	if (!doc.meta[*it].empty()) {	1215	if (!doc.meta[*it].empty()) {
1207	string value =	1216	string value =
1208	neutchars(truncate_to_word(doc.meta[*it], 150), nc);	1217	neutchars(truncate_to_word(doc.meta[*it], 150), cstr_nc);
1209	RECORD_APPEND(record, nm, value);	1218	RECORD_APPEND(record, nm, value);
1210	}	1219	}
1211	}	1220	}
1212		1221
1213	// If the file's md5 was computed, add value. This is optionally	1222	// If the file's md5 was computed, add value. This is optionally
	...		...
1609	}	1618	}
1610		1619
1611	// Characters that can begin a wildcard or regexp expression. We use skipto	1620	// Characters that can begin a wildcard or regexp expression. We use skipto
1612	// to begin the allterms search with terms that begin with the portion of	1621	// to begin the allterms search with terms that begin with the portion of
1613	// the input string prior to these chars.	1622	// the input string prior to these chars.
1614	const string wildSpecChars = "*?[";	1623	const string cstr_wildSpecChars = "*?[";
1615	const string regSpecChars = "(.[{";	1624	const string cstr_regSpecChars = "(.[{";
1616		1625
1617	// Find all index terms that match a wildcard or regular expression	1626	// Find all index terms that match a wildcard or regular expression
1618	bool Db::termMatch(MatchType typ, const string &lang,	1627	bool Db::termMatch(MatchType typ, const string &lang,
1619	const string &root,	1628	const string &root,
1620	TermMatchResult& res,	1629	TermMatchResult& res,
	...		...
1637	string droot;	1646	string droot;
1638	if (!unacmaybefold(root, droot, "UTF-8", true)) {	1647	if (!unacmaybefold(root, droot, "UTF-8", true)) {
1639	LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));	1648	LOGERR(("Db::termMatch: unac failed for [%s]\n", root.c_str()));
1640	return false;	1649	return false;
1641	}	1650	}
1642	string nochars = typ == ET_WILD ? wildSpecChars : regSpecChars;	1651	string nochars = typ == ET_WILD ? cstr_wildSpecChars : cstr_regSpecChars;
1643		1652
1644	string prefix;	1653	string prefix;
1645	if (!field.empty()) {	1654	if (!field.empty()) {
1646	const FieldTraits *ftp = 0;	1655	const FieldTraits *ftp = 0;
1647	if (!fieldToTraits(field, &ftp) \|\| ftp->pfx.empty()) {	1656	if (!fieldToTraits(field, &ftp) \|\| ftp->pfx.empty()) {
	...		...
1850	XAPTRY(vab = m_ndb->makeAbstract(doc.xdocid, query),	1859	XAPTRY(vab = m_ndb->makeAbstract(doc.xdocid, query),
1851	m_ndb->xrdb, m_reason);	1860	m_ndb->xrdb, m_reason);
1852	for (vector<string>::const_iterator it = vab.begin();	1861	for (vector<string>::const_iterator it = vab.begin();
1853	it != vab.end(); it++) {	1862	it != vab.end(); it++) {
1854	abstract.append(*it);	1863	abstract.append(*it);
1855	abstract.append(ellipsis);	1864	abstract.append(cstr_ellipsis);
1856	}	1865	}
1857	return m_reason.empty() ? true : false;	1866	return m_reason.empty() ? true : false;
1858	}	1867	}
1859		1868
1860	// Retrieve document defined by Unique doc identifier. This is mainly used	1869	// Retrieve document defined by Unique doc identifier. This is mainly used