|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.142 2008-09-05 10:34:17 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.143 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
127 |
}
|
127 |
}
|
128 |
LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str()));
|
128 |
LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str()));
|
129 |
return false;
|
129 |
return false;
|
130 |
}
|
130 |
}
|
131 |
|
131 |
|
|
|
// Rows below exist only in the new revision of this diff.
// Names of the predefined fields of the document data record. They are used
// by dbDataToRclDoc() when reading the record back; keycap is also used as a
// field name in Db::fieldToPrefix() and when Db::add() writes the caption
// line of the record.
132 |
static const string keycap("caption");
|
|
|
133 |
static const string keymtp("mtype");
|
|
|
134 |
static const string keyfmt("fmtime");
|
|
|
135 |
static const string keydmt("dmtime");
|
|
|
136 |
static const string keyoc("origcharset");
|
|
|
137 |
static const string keyurl("url");
|
|
|
138 |
|
// NOTE: this span is a side-by-side diff rendering. Each statement appears
// first as the old revision (left line number) and then as the new revision
// (right line number). Rows carrying a single number exist in one revision
// only.
//
// Db::Native::dbDataToRclDoc: fill `doc` from the serialized data record
// `data` of the Xapian document `docid`.
//  - data:    record text made of name=value lines (see how Db::add builds
//             it); it is parsed here with ConfSimple.
//  - doc:     receives url, mimetype, fmtime, dmtime, origcharset, ipath,
//             fbytes, dbytes, sig and xdocid, plus entries in doc.meta.
//  - percent: relevance percentage; stored in doc.meta as percent/100
//             formatted with two decimals.
// Returns false when parms.ok() reports a failure on the record, else true.
132 |
// Turn data record from db into document fields
|
139 |
// Turn data record from db into document fields
|
133 |
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
140 |
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data,
|
134 |
Doc &doc, int percent)
|
141 |
Doc &doc, int percent)
|
135 |
{
|
142 |
{
|
136 |
LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str()));
|
143 |
LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str()));
|
137 |
ConfSimple parms(&data);
|
144 |
ConfSimple parms(&data);
|
138 |
if (!parms.ok())
|
145 |
if (!parms.ok())
|
139 |
return false;
|
146 |
return false;
|
// Predefined record fields. The new revision looks them up through the
// file-level key constants instead of constructing a temporary string for
// each name on every call.
140 |
parms.get(string("url"), doc.url);
|
147 |
parms.get(keyurl, doc.url);
|
141 |
parms.get(string("mtype"), doc.mimetype);
|
148 |
parms.get(keymtp, doc.mimetype);
|
142 |
parms.get(string("fmtime"), doc.fmtime);
|
149 |
parms.get(keyfmt, doc.fmtime);
|
143 |
parms.get(string("dmtime"), doc.dmtime);
|
150 |
parms.get(keydmt, doc.dmtime);
|
144 |
parms.get(string("origcharset"), doc.origcharset);
|
151 |
parms.get(keyoc, doc.origcharset);
|
// The record stores the title under the name caption; the old revision puts
// it in the title entry of doc.meta. Presumably Doc::keytt names that same
// entry - TODO confirm against the Doc class.
145 |
parms.get(string("caption"), doc.meta["title"]);
|
152 |
parms.get(keycap, doc.meta[Doc::keytt]);
|
146 |
parms.get(string("keywords"), doc.meta["keywords"]);
|
153 |
parms.get(Doc::keykw, doc.meta[Doc::keykw]);
|
147 |
parms.get(string("abstract"), doc.meta["abstract"]);
|
154 |
parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
|
// Old revision only: author was read explicitly. In the new revision it can
// only arrive through the generic loop over record names near the end of the
// function. NOTE(review): that relies on author being written to the record
// as a configured stored field by Db::add - confirm.
148 |
parms.get(string("author"), doc.meta["author"]);
|
|
|
149 |
// Possibly remove synthetic abstract indicator (if it's there, we
|
155 |
// Possibly remove synthetic abstract indicator (if it's there, we
|
150 |
// used to index the beginning of the text as abstract).
|
156 |
// used to index the beginning of the text as abstract).
|
151 |
doc.syntabs = false;
|
157 |
doc.syntabs = false;
|
152 |
if (doc.meta["abstract"].find(rclSyntAbs) == 0) {
|
158 |
if (doc.meta[Doc::keyabs].find(rclSyntAbs) == 0) {
|
153 |
doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length());
|
159 |
doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(rclSyntAbs.length());
|
154 |
doc.syntabs = true;
|
160 |
doc.syntabs = true;
|
155 |
}
|
161 |
}
|
// Relevance as percent/100 with two decimals. For a 32-bit int percent the
// formatted text needs at most 12 characters plus the terminator, so the
// 20-byte buffer is large enough.
156 |
char buf[20];
|
162 |
char buf[20];
|
157 |
sprintf(buf,"%.2f", float(percent) / 100.0);
|
163 |
sprintf(buf,"%.2f", float(percent) / 100.0);
|
158 |
doc.meta["relevancyrating"] = buf;
|
164 |
doc.meta[Doc::keyrr] = buf;
|
159 |
parms.get(string("ipath"), doc.ipath);
|
165 |
parms.get(string("ipath"), doc.ipath);
|
160 |
parms.get(string("fbytes"), doc.fbytes);
|
166 |
parms.get(string("fbytes"), doc.fbytes);
|
161 |
parms.get(string("dbytes"), doc.dbytes);
|
167 |
parms.get(string("dbytes"), doc.dbytes);
|
162 |
parms.get(string("sig"), doc.sig);
|
168 |
parms.get(string("sig"), doc.sig);
|
163 |
doc.xdocid = docid;
|
169 |
doc.xdocid = docid;
|
|
|
170 |
|
|
|
// New revision only: copy every record name that has no entry in doc.meta
// yet, so that additional stored fields reach the caller.
// NOTE(review): names such as url, mtype or caption were stored in Doc
// members or under a different meta key above, so they also appear to be
// copied into doc.meta by this loop - confirm this is intended.
// NOTE(review): presumably getNames(string()) lists the names of the
// unnamed top-level section of the record - confirm against ConfSimple.
171 |
// Other, not predefined meta fields:
|
|
|
172 |
list<string> keys = parms.getNames(string());
|
|
|
173 |
for (list<string>::const_iterator it = keys.begin();
|
|
|
174 |
it != keys.end(); it++) {
|
|
|
175 |
if (doc.meta.find(*it) == doc.meta.end())
|
|
|
176 |
parms.get(*it, doc.meta[*it]);
|
|
|
177 |
}
|
164 |
return true;
|
178 |
return true;
|
165 |
}
|
179 |
}
|
166 |
|
180 |
|
167 |
static list<string> noPrefixList(const list<string>& in)
|
181 |
static list<string> noPrefixList(const list<string>& in)
|
168 |
{
|
182 |
{
|
|
... |
|
... |
678 |
bool Db::fieldToPrefix(const string& fldname, string &pfx)
|
692 |
bool Db::fieldToPrefix(const string& fldname, string &pfx)
|
679 |
{
|
693 |
{
|
680 |
// This is the default table
|
694 |
// This is the default table
|
681 |
static map<string, string> fldToPrefs;
|
695 |
static map<string, string> fldToPrefs;
|
682 |
if (fldToPrefs.empty()) {
|
696 |
if (fldToPrefs.empty()) {
|
683 |
fldToPrefs["abstract"] = string();
|
697 |
fldToPrefs[Doc::keyabs] = string();
|
684 |
fldToPrefs["ext"] = "XE";
|
698 |
fldToPrefs["ext"] = "XE";
|
685 |
fldToPrefs["filename"] = "XSFN";
|
699 |
fldToPrefs["filename"] = "XSFN";
|
686 |
|
700 |
|
687 |
fldToPrefs["title"] = "S";
|
701 |
fldToPrefs["title"] = "S";
|
688 |
fldToPrefs["caption"] = "S";
|
702 |
fldToPrefs[keycap] = "S";
|
689 |
fldToPrefs["subject"] = "S";
|
703 |
fldToPrefs["subject"] = "S";
|
690 |
|
704 |
|
691 |
fldToPrefs["author"] = "A";
|
705 |
fldToPrefs[Doc::keyau] = "A";
|
692 |
fldToPrefs["creator"] = "A";
|
706 |
fldToPrefs["creator"] = "A";
|
693 |
fldToPrefs["from"] = "A";
|
707 |
fldToPrefs["from"] = "A";
|
694 |
|
708 |
|
695 |
fldToPrefs["keyword"] = "K";
|
709 |
fldToPrefs["keyword"] = "K";
|
696 |
fldToPrefs["tag"] = "K";
|
710 |
fldToPrefs["tag"] = "K";
|
697 |
fldToPrefs["keywords"] = "K";
|
711 |
fldToPrefs[Doc::keykw] = "K";
|
698 |
fldToPrefs["tags"] = "K";
|
712 |
fldToPrefs["tags"] = "K";
|
699 |
}
|
713 |
}
|
700 |
|
714 |
|
701 |
string fld(fldname);
|
715 |
string fld(fldname);
|
702 |
stringtolower(fld);
|
716 |
stringtolower(fld);
|
|
... |
|
... |
801 |
if (syntctxlen > 0)
|
815 |
if (syntctxlen > 0)
|
802 |
m_synthAbsWordCtxLen = syntctxlen;
|
816 |
m_synthAbsWordCtxLen = syntctxlen;
|
803 |
}
|
817 |
}
|
804 |
|
818 |
|
// Number of bytes in one megabyte (present in both revisions).
805 |
static const int MB = 1024 * 1024;
|
819 |
static const int MB = 1024 * 1024;
|
|
|
// New revision only: the characters handed to neutchars() when Db::add
// prepares field values for the data record: newline, carriage return and
// form feed (0x0c). The old revision passed only newline and carriage return.
// Presumably neutchars() neutralizes them so that each record field stays on
// one line - TODO confirm against its definition.
820 |
static const string nc("\n\r\x0c");
|
806 |
|
821 |
|
807 |
// Add document in internal form to the database: index the terms in
|
822 |
// Add document in internal form to the database: index the terms in
|
808 |
// the title, abstract and body, and add special terms for file name,
|
823 |
// the title, abstract and body, and add special terms for file name,
|
809 |
// date, mime type ... , create the document data record (more
|
824 |
// date, mime type ... , create the document data record (more
|
810 |
// metadata), and update database
|
825 |
// metadata), and update database
|
|
... |
|
... |
829 |
m_occtxtsz = m_curtxtsz;
|
844 |
m_occtxtsz = m_curtxtsz;
|
830 |
}
|
845 |
}
|
831 |
|
846 |
|
832 |
Doc doc = idoc;
|
847 |
Doc doc = idoc;
|
833 |
|
848 |
|
834 |
// The title, author, abstract and keywords fields are special, they
|
|
|
835 |
// get stored in the document data record.
|
|
|
836 |
// Truncate abstract, title and keywords to reasonable lengths. If
|
|
|
837 |
// abstract is currently empty, we make up one with the beginning
|
|
|
838 |
// of the document. This is then not indexed, but part of the doc
|
|
|
839 |
// data so that we can return it to a query without having to
|
|
|
840 |
// decode the original file.
|
|
|
841 |
bool syntabs = false;
|
|
|
842 |
// Note that the map accesses by operator[] create empty entries if they
|
|
|
843 |
// don't exist yet.
|
|
|
844 |
if (doc.meta["abstract"].empty()) {
|
|
|
845 |
syntabs = true;
|
|
|
846 |
doc.meta["abstract"] = rclSyntAbs +
|
|
|
847 |
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r");
|
|
|
848 |
} else {
|
|
|
849 |
doc.meta["abstract"] =
|
|
|
850 |
neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen),
|
|
|
851 |
"\n\r");
|
|
|
852 |
}
|
|
|
853 |
if (doc.meta["title"].empty())
|
|
|
854 |
doc.meta["title"] = doc.utf8fn;
|
|
|
855 |
doc.meta["title"] =
|
|
|
856 |
neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r");
|
|
|
857 |
doc.meta["author"] =
|
|
|
858 |
neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r");
|
|
|
859 |
doc.meta["keywords"] =
|
|
|
860 |
neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r");
|
|
|
861 |
|
|
|
862 |
|
|
|
863 |
Xapian::Document newdocument;
|
849 |
Xapian::Document newdocument;
|
864 |
mySplitterCB splitData(newdocument, m_stops);
|
850 |
mySplitterCB splitData(newdocument, m_stops);
|
865 |
TextSplit splitter(&splitData);
|
851 |
TextSplit splitter(&splitData);
|
866 |
string noacc;
|
852 |
string noacc;
|
867 |
|
853 |
|
|
... |
|
... |
880 |
// between fields to avoid false proximity matches.
|
866 |
// between fields to avoid false proximity matches.
|
881 |
map<string,string>::iterator meta_it;
|
867 |
map<string,string>::iterator meta_it;
|
882 |
string pfx;
|
868 |
string pfx;
|
883 |
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
|
869 |
for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
|
884 |
if (!meta_it->second.empty()) {
|
870 |
if (!meta_it->second.empty()) {
|
885 |
if (meta_it->first == "abstract" && syntabs)
|
|
|
886 |
continue;
|
|
|
887 |
if (!fieldToPrefix(meta_it->first, pfx)) {
|
871 |
if (!fieldToPrefix(meta_it->first, pfx)) {
|
888 |
LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
|
872 |
LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
|
889 |
meta_it->first.c_str()));
|
873 |
meta_it->first.c_str()));
|
890 |
continue;
|
874 |
continue;
|
891 |
}
|
875 |
}
|
892 |
LOGDEB1(("Db::add: field [%s] pfx [%s]: [%s]\n",
|
876 |
LOGDEB1(("Db::add: field [%s] pfx [%s]: [%s]\n",
|
893 |
meta_it->first.c_str(), pfx.c_str(),
|
877 |
meta_it->first.c_str(), pfx.c_str(),
|
894 |
meta_it->second.c_str()));
|
878 |
meta_it->second.c_str()));
|
|
... |
|
... |
906 |
if (splitData.curpos < baseTextPosition)
|
890 |
if (splitData.curpos < baseTextPosition)
|
907 |
splitData.basepos = baseTextPosition;
|
891 |
splitData.basepos = baseTextPosition;
|
908 |
else
|
892 |
else
|
909 |
splitData.basepos += splitData.curpos + 100;
|
893 |
splitData.basepos += splitData.curpos + 100;
|
910 |
|
894 |
|
911 |
// Finally: split and index body text
|
895 |
// Split and index body text
|
912 |
LOGDEB2(("Db::add: split body\n"));
|
896 |
LOGDEB2(("Db::add: split body\n"));
|
913 |
if (!dumb_string(doc.text, noacc)) {
|
897 |
if (!dumb_string(doc.text, noacc)) {
|
914 |
LOGERR(("Db::add: dumb_string failed\n"));
|
898 |
LOGERR(("Db::add: dumb_string failed\n"));
|
915 |
return false;
|
899 |
return false;
|
916 |
}
|
900 |
}
|
|
... |
|
... |
956 |
buf[6] = '\0';
|
940 |
buf[6] = '\0';
|
957 |
newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
|
941 |
newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
|
958 |
buf[4] = '\0';
|
942 |
buf[4] = '\0';
|
959 |
newdocument.add_term("Y" + string(buf)); // Year (YYYY)
|
943 |
newdocument.add_term("Y" + string(buf)); // Year (YYYY)
|
960 |
|
944 |
|
|
|
945 |
|
|
|
946 |
//////////////////////////////////////////////////////////////////
|
961 |
// Document data record. omindex has the following nl separated fields:
|
947 |
// Document data record. omindex has the following nl separated fields:
|
962 |
// - url
|
948 |
// - url
|
963 |
// - sample
|
949 |
// - sample
|
964 |
// - caption (title limited to 100 chars)
|
950 |
// - caption (title limited to 100 chars)
|
965 |
// - mime type
|
951 |
// - mime type
|
|
|
952 |
//
|
|
|
953 |
// The title, author, abstract and keywords fields are special,
|
|
|
954 |
// they always get stored in the document data
|
|
|
955 |
// record. Configurable other fields can be, too.
|
|
|
956 |
//
|
|
|
957 |
// We truncate stored fields abstract, title and keywords to
|
|
|
958 |
// reasonable lengths and suppress newlines (so that the data
|
|
|
959 |
// record can keep a simple syntax)
|
|
|
960 |
|
966 |
string record = "url=" + doc.url;
|
961 |
string record = "url=" + doc.url;
|
967 |
record += "\nmtype=" + doc.mimetype;
|
962 |
record += "\nmtype=" + doc.mimetype;
|
968 |
record += "\nfmtime=" + doc.fmtime;
|
963 |
record += "\nfmtime=" + doc.fmtime;
|
969 |
if (!doc.dmtime.empty()) {
|
964 |
if (!doc.dmtime.empty()) {
|
970 |
record += "\ndmtime=" + doc.dmtime;
|
965 |
record += "\ndmtime=" + doc.dmtime;
|
|
... |
|
... |
980 |
|
975 |
|
981 |
char sizebuf[30];
|
976 |
char sizebuf[30];
|
982 |
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
977 |
sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
|
983 |
record += string("\ndbytes=") + sizebuf;
|
978 |
record += string("\ndbytes=") + sizebuf;
|
984 |
|
979 |
|
985 |
if (!doc.ipath.empty()) {
|
980 |
if (!doc.ipath.empty())
|
986 |
record += "\nipath=" + doc.ipath;
|
981 |
record += "\nipath=" + doc.ipath;
|
|
|
982 |
|
|
|
983 |
if (doc.meta[Doc::keytt].empty())
|
|
|
984 |
doc.meta[Doc::keytt] = doc.utf8fn;
|
|
|
985 |
doc.meta[Doc::keytt] =
|
|
|
986 |
neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc);
|
|
|
987 |
if (!doc.meta[Doc::keytt].empty())
|
|
|
988 |
record += "\n" + keycap + "=" + doc.meta[Doc::keytt];
|
|
|
989 |
|
|
|
990 |
doc.meta[Doc::keykw] =
|
|
|
991 |
neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc);
|
|
|
992 |
if (!doc.meta[Doc::keykw].empty())
|
|
|
993 |
record += "\n" + Doc::keykw + "=" + doc.meta[Doc::keykw];
|
|
|
994 |
|
|
|
995 |
// If abstract is empty, we make up one with the beginning of the
|
|
|
996 |
// document. This is then not indexed, but part of the doc data so
|
|
|
997 |
// that we can return it to a query without having to decode the
|
|
|
998 |
// original file.
|
|
|
999 |
bool syntabs = false;
|
|
|
1000 |
// Note that the map accesses by operator[] create empty entries if they
|
|
|
1001 |
// don't exist yet.
|
|
|
1002 |
if (doc.meta[Doc::keyabs].empty()) {
|
|
|
1003 |
syntabs = true;
|
|
|
1004 |
if (!doc.text.empty())
|
|
|
1005 |
doc.meta[Doc::keyabs] = rclSyntAbs +
|
|
|
1006 |
neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), nc);
|
|
|
1007 |
} else {
|
|
|
1008 |
doc.meta[Doc::keyabs] =
|
|
|
1009 |
neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
|
|
|
1010 |
nc);
|
|
|
1011 |
}
|
|
|
1012 |
if (!doc.meta[Doc::keyabs].empty())
|
|
|
1013 |
record += "\n" + Doc::keyabs + "=" + doc.meta[Doc::keyabs];
|
|
|
1014 |
|
|
|
1015 |
RclConfig *config = RclConfig::getMainConfig();
|
|
|
1016 |
if (config) {
|
|
|
1017 |
const set<string>& stored = config->getStoredFields();
|
|
|
1018 |
for (set<string>::const_iterator it = stored.begin();
|
|
|
1019 |
it != stored.end(); it++) {
|
|
|
1020 |
if (!doc.meta[*it].empty()) {
|
|
|
1021 |
string value =
|
|
|
1022 |
neutchars(truncate_to_word(doc.meta[*it], 150), nc);
|
|
|
1023 |
record += "\n" + *it + "=" + value;
|
987 |
}
|
1024 |
}
|
988 |
if (!doc.meta["title"].empty())
|
1025 |
}
|
989 |
record += "\ncaption=" + doc.meta["title"];
|
|
|
990 |
if (!doc.meta["keywords"].empty())
|
|
|
991 |
record += "\nkeywords=" + doc.meta["keywords"];
|
|
|
992 |
if (!doc.meta["abstract"].empty())
|
|
|
993 |
record += "\nabstract=" + doc.meta["abstract"];
|
|
|
994 |
if (!doc.meta["author"].empty()) {
|
|
|
995 |
record += "\nauthor=" + doc.meta["author"];
|
|
|
996 |
}
|
1026 |
}
|
997 |
record += "\n";
|
1027 |
record += "\n";
|
998 |
LOGDEB1(("Newdocument data: %s\n", record.c_str()));
|
1028 |
LOGDEB(("Rcl::Db::add: new doc record:\n %s\n", record.c_str()));
|
999 |
newdocument.set_data(record);
|
1029 |
newdocument.set_data(record);
|
1000 |
|
1030 |
|
1001 |
const char *fnc = udi.c_str();
|
1031 |
const char *fnc = udi.c_str();
|
1002 |
string ermsg;
|
1032 |
string ermsg;
|
1003 |
|
1033 |
|