Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.142 2008-09-05 10:34:17 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.143 2008-09-08 16:49:10 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
127
    }
127
    }
128
    LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str()));
128
    LOGERR(("Rcl::Db::subDocs: %s\n", ermsg.c_str()));
129
    return false;
129
    return false;
130
}
130
}
131
131
132
// Names of entries in the document data record. They are used as lookup
// keys when the record is read back in dbDataToRclDoc(); keycap is also
// used when the record is written and in the field-to-prefix table.
static const string keycap("caption");
133
static const string keymtp("mtype");
134
static const string keyfmt("fmtime");
135
static const string keydmt("dmtime");
136
static const string keyoc("origcharset");
137
static const string keyurl("url");
138
132
// Turn data record from db into document fields
139
// Turn data record from db into document fields.
//
// The record text is parsed with ConfSimple and its entries are copied
// into doc: url, mime type, file and document times, original charset,
// title/keywords/abstract meta entries, ipath, sizes, signature.
// The relevancy rating meta entry is set from percent and doc.xdocid
// from docid.
// Returns false if ConfSimple reports the record as not ok, else true.
133
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, 
140
bool Db::Native::dbDataToRclDoc(Xapian::docid docid, std::string &data, 
134
                Doc &doc, int percent)
141
                Doc &doc, int percent)
135
{
142
{
136
    LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str()));
143
    LOGDEB1(("Db::dbDataToRclDoc: data: %s\n", data.c_str()));
137
    ConfSimple parms(&data);
144
    ConfSimple parms(&data);
138
    if (!parms.ok())
145
    if (!parms.ok())
139
    return false;
146
    return false;
140
    parms.get(string("url"), doc.url);
147
    parms.get(keyurl, doc.url);
141
    parms.get(string("mtype"), doc.mimetype);
148
    parms.get(keymtp, doc.mimetype);
142
    parms.get(string("fmtime"), doc.fmtime);
149
    parms.get(keyfmt, doc.fmtime);
143
    parms.get(string("dmtime"), doc.dmtime);
150
    parms.get(keydmt, doc.dmtime);
144
    parms.get(string("origcharset"), doc.origcharset);
151
    parms.get(keyoc, doc.origcharset);
145
    parms.get(string("caption"), doc.meta["title"]);
152
    // The record stores the title under the "caption" name; it goes to
    // the Doc::keytt meta entry (presumably "title" -- TODO confirm).
    parms.get(keycap, doc.meta[Doc::keytt]);
146
    parms.get(string("keywords"), doc.meta["keywords"]);
153
    parms.get(Doc::keykw, doc.meta[Doc::keykw]);
147
    parms.get(string("abstract"), doc.meta["abstract"]);
154
    parms.get(Doc::keyabs, doc.meta[Doc::keyabs]);
148
    parms.get(string("author"), doc.meta["author"]);
149
    // Possibly remove synthetic abstract indicator (if it's there, we
155
    // Possibly remove synthetic abstract indicator (if it's there, we
150
    // used to index the beginning of the text as abstract).
156
    // used to index the beginning of the text as abstract).
151
    doc.syntabs = false;
157
    doc.syntabs = false;
152
    if (doc.meta["abstract"].find(rclSyntAbs) == 0) {
158
    if (doc.meta[Doc::keyabs].find(rclSyntAbs) == 0) {
153
    doc.meta["abstract"] = doc.meta["abstract"].substr(rclSyntAbs.length());
159
    doc.meta[Doc::keyabs] = doc.meta[Doc::keyabs].substr(rclSyntAbs.length());
154
    doc.syntabs = true;
160
    doc.syntabs = true;
155
    }
161
    }
156
    char buf[20];
162
    char buf[20];
157
    sprintf(buf,"%.2f", float(percent) / 100.0);
163
    // Relevancy rating: percent / 100, formatted with two decimals.
    sprintf(buf,"%.2f", float(percent) / 100.0);
158
    doc.meta["relevancyrating"] = buf;
164
    doc.meta[Doc::keyrr] = buf;
159
    parms.get(string("ipath"), doc.ipath);
165
    parms.get(string("ipath"), doc.ipath);
160
    parms.get(string("fbytes"), doc.fbytes);
166
    parms.get(string("fbytes"), doc.fbytes);
161
    parms.get(string("dbytes"), doc.dbytes);
167
    parms.get(string("dbytes"), doc.dbytes);
162
    parms.get(string("sig"), doc.sig);
168
    parms.get(string("sig"), doc.sig);
163
    doc.xdocid = docid;
169
    doc.xdocid = docid;
170
171
    // Other, not predefined meta fields: every name found in the record
    // which has no entry yet in doc.meta is copied into it.
    // NOTE(review): presumably getNames(string()) lists all names in the
    // record; if so, names already read into dedicated Doc members above
    // (url, mtype, caption, ...) are also absent from doc.meta and get
    // copied here too -- confirm this is intended.
172
    list<string> keys = parms.getNames(string());
173
    for (list<string>::const_iterator it = keys.begin(); 
174
   it != keys.end(); it++) {
175
  if (doc.meta.find(*it) == doc.meta.end()) 
176
      parms.get(*it, doc.meta[*it]);
177
    }
164
    return true;
178
    return true;
165
}
179
}
166
180
167
static list<string> noPrefixList(const list<string>& in) 
181
static list<string> noPrefixList(const list<string>& in) 
168
{
182
{
...
...
678
bool Db::fieldToPrefix(const string& fldname, string &pfx)
692
bool Db::fieldToPrefix(const string& fldname, string &pfx)
679
{
693
{
680
    // This is the default table
694
    // This is the default table
681
    static map<string, string> fldToPrefs;
695
    static map<string, string> fldToPrefs;
682
    if (fldToPrefs.empty()) {
696
    if (fldToPrefs.empty()) {
683
    fldToPrefs["abstract"] = string();
697
    fldToPrefs[Doc::keyabs] = string();
684
    fldToPrefs["ext"] = "XE";
698
    fldToPrefs["ext"] = "XE";
685
    fldToPrefs["filename"] = "XSFN";
699
    fldToPrefs["filename"] = "XSFN";
686
700
687
    fldToPrefs["title"] = "S";
701
    fldToPrefs["title"] = "S";
688
    fldToPrefs["caption"] = "S";
702
    fldToPrefs[keycap] = "S";
689
    fldToPrefs["subject"] = "S";
703
    fldToPrefs["subject"] = "S";
690
704
691
    fldToPrefs["author"] = "A";
705
    fldToPrefs[Doc::keyau] = "A";
692
    fldToPrefs["creator"] = "A";
706
    fldToPrefs["creator"] = "A";
693
    fldToPrefs["from"] = "A";
707
    fldToPrefs["from"] = "A";
694
708
695
    fldToPrefs["keyword"] = "K";
709
    fldToPrefs["keyword"] = "K";
696
    fldToPrefs["tag"] = "K";
710
    fldToPrefs["tag"] = "K";
697
    fldToPrefs["keywords"] = "K";
711
    fldToPrefs[Doc::keykw] = "K";
698
    fldToPrefs["tags"] = "K";
712
    fldToPrefs["tags"] = "K";
699
    }
713
    }
700
714
701
    string fld(fldname);
715
    string fld(fldname);
702
    stringtolower(fld);
716
    stringtolower(fld);
...
...
801
    if (syntctxlen > 0)
815
    if (syntctxlen > 0)
802
    m_synthAbsWordCtxLen = syntctxlen;
816
    m_synthAbsWordCtxLen = syntctxlen;
803
}
817
}
804
818
805
static const int MB = 1024 * 1024;
819
static const int MB = 1024 * 1024;
820
// Character set handed to neutchars() for values stored in the document
// data record: newline, carriage return and form feed (\x0c).
// NOTE(review): neutchars() is defined elsewhere; presumably it replaces
// these characters so that each record entry stays on one line -- confirm.
static const string nc("\n\r\x0c");
806
821
807
// Add document in internal form to the database: index the terms in
822
// Add document in internal form to the database: index the terms in
808
// the title abstract and body and add special terms for file name,
823
// the title abstract and body and add special terms for file name,
809
// date, mime type ... , create the document data record (more
824
// date, mime type ... , create the document data record (more
810
// metadata), and update database
825
// metadata), and update database
...
...
829
    m_occtxtsz = m_curtxtsz;
844
    m_occtxtsz = m_curtxtsz;
830
    }
845
    }
831
846
832
    Doc doc = idoc;
847
    Doc doc = idoc;
833
848
834
    // The title, author, abstract and keywords fields are special, they
835
    // get stored in the document data record.
836
    // Truncate abstract, title and keywords to reasonable lengths. If
837
    // abstract is currently empty, we make up one with the beginning
838
    // of the document. This is then not indexed, but part of the doc
839
    // data so that we can return it to a query without having to
840
    // decode the original file.
841
    bool syntabs = false;
842
    // Note that the map accesses by operator[] create empty entries if they
843
    // don't exist yet.
844
    if (doc.meta["abstract"].empty()) {
845
  syntabs = true;
846
  doc.meta["abstract"] = rclSyntAbs + 
847
      neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), "\n\r");
848
    } else {
849
  doc.meta["abstract"] = 
850
      neutchars(truncate_to_word(doc.meta["abstract"], m_idxAbsTruncLen),
851
            "\n\r");
852
    }
853
    if (doc.meta["title"].empty())
854
  doc.meta["title"] = doc.utf8fn;
855
    doc.meta["title"] = 
856
  neutchars(truncate_to_word(doc.meta["title"], 150), "\n\r");
857
    doc.meta["author"] = 
858
  neutchars(truncate_to_word(doc.meta["author"], 150), "\n\r");
859
    doc.meta["keywords"] = 
860
  neutchars(truncate_to_word(doc.meta["keywords"], 300),"\n\r");
861
862
863
    Xapian::Document newdocument;
849
    Xapian::Document newdocument;
864
    mySplitterCB splitData(newdocument, m_stops);
850
    mySplitterCB splitData(newdocument, m_stops);
865
    TextSplit splitter(&splitData);
851
    TextSplit splitter(&splitData);
866
    string noacc;
852
    string noacc;
867
853
...
...
880
    // between fields to avoid false proximity matches.
866
    // between fields to avoid false proximity matches.
881
    map<string,string>::iterator meta_it;
867
    map<string,string>::iterator meta_it;
882
    string pfx;
868
    string pfx;
883
    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
869
    for (meta_it = doc.meta.begin(); meta_it != doc.meta.end(); meta_it++) {
884
    if (!meta_it->second.empty()) {
870
    if (!meta_it->second.empty()) {
885
      if (meta_it->first == "abstract" && syntabs)
886
      continue;
887
        if (!fieldToPrefix(meta_it->first, pfx)) {
871
        if (!fieldToPrefix(meta_it->first, pfx)) {
888
        LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
872
        LOGDEB(("Db::add: no prefix for field [%s], no indexing\n",
889
            meta_it->first.c_str()));
873
             meta_it->first.c_str()));
890
        continue;
874
        continue;
891
        }
875
        }
892
        LOGDEB1(("Db::add: field [%s] pfx [%s]: [%s]\n", 
876
        LOGDEB1(("Db::add: field [%s] pfx [%s]: [%s]\n", 
893
            meta_it->first.c_str(), pfx.c_str(), 
877
            meta_it->first.c_str(), pfx.c_str(), 
894
            meta_it->second.c_str()));
878
            meta_it->second.c_str()));
...
...
906
    if (splitData.curpos < baseTextPosition)
890
    if (splitData.curpos < baseTextPosition)
907
    splitData.basepos = baseTextPosition;
891
    splitData.basepos = baseTextPosition;
908
    else
892
    else
909
    splitData.basepos += splitData.curpos + 100;
893
    splitData.basepos += splitData.curpos + 100;
910
894
911
    // Finally: split and index body text
895
    // Split and index body text
912
    LOGDEB2(("Db::add: split body\n"));
896
    LOGDEB2(("Db::add: split body\n"));
913
    if (!dumb_string(doc.text, noacc)) {
897
    if (!dumb_string(doc.text, noacc)) {
914
    LOGERR(("Db::add: dumb_string failed\n"));
898
    LOGERR(("Db::add: dumb_string failed\n"));
915
    return false;
899
    return false;
916
    }
900
    }
...
...
956
    buf[6] = '\0';
940
    buf[6] = '\0';
957
    newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
941
    newdocument.add_term("M" + string(buf)); // Month (YYYYMM)
958
    buf[4] = '\0';
942
    buf[4] = '\0';
959
    newdocument.add_term("Y" + string(buf)); // Year (YYYY)
943
    newdocument.add_term("Y" + string(buf)); // Year (YYYY)
960
944
945
946
    //////////////////////////////////////////////////////////////////
961
    // Document data record. omindex has the following nl separated fields:
947
    // Document data record. omindex has the following nl separated fields:
962
    // - url
948
    // - url
963
    // - sample
949
    // - sample
964
    // - caption (title limited to 100 chars)
950
    // - caption (title limited to 100 chars)
965
    // - mime type 
951
    // - mime type 
952
    //
953
    // The title, author, abstract and keywords fields are special,
954
    // they always get stored in the document data
955
    // record. Configurable other fields can be, too.
956
    //
957
    // We truncate stored fields abstract, title and keywords to
958
    // reasonable lengths and suppress newlines (so that the data
959
    // record can keep a simple syntax)
960
966
    string record = "url=" + doc.url;
961
    string record = "url=" + doc.url;
967
    record += "\nmtype=" + doc.mimetype;
962
    record += "\nmtype=" + doc.mimetype;
968
    record += "\nfmtime=" + doc.fmtime;
963
    record += "\nfmtime=" + doc.fmtime;
969
    if (!doc.dmtime.empty()) {
964
    if (!doc.dmtime.empty()) {
970
    record += "\ndmtime=" + doc.dmtime;
965
    record += "\ndmtime=" + doc.dmtime;
...
...
980
975
981
    char sizebuf[30]; 
976
    char sizebuf[30]; 
982
    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
977
    sprintf(sizebuf, "%u", (unsigned int)doc.text.length());
983
    record += string("\ndbytes=") + sizebuf;
978
    record += string("\ndbytes=") + sizebuf;
984
979
985
    if (!doc.ipath.empty()) {
980
    if (!doc.ipath.empty())
986
    record += "\nipath=" + doc.ipath;
981
    record += "\nipath=" + doc.ipath;
982
983
    if (doc.meta[Doc::keytt].empty())
984
  doc.meta[Doc::keytt] = doc.utf8fn;
985
    doc.meta[Doc::keytt] = 
986
  neutchars(truncate_to_word(doc.meta[Doc::keytt], 150), nc);
987
    if (!doc.meta[Doc::keytt].empty())
988
  record += "\n" + keycap + "=" + doc.meta[Doc::keytt];
989
990
    doc.meta[Doc::keykw] = 
991
  neutchars(truncate_to_word(doc.meta[Doc::keykw], 300), nc);
992
    if (!doc.meta[Doc::keykw].empty())
993
  record += "\n" + Doc::keykw + "=" + doc.meta[Doc::keykw];
994
995
    // If abstract is empty, we make up one with the beginning of the
996
    // document. This is then not indexed, but part of the doc data so
997
    // that we can return it to a query without having to decode the
998
    // original file.
999
    bool syntabs = false;
1000
    // Note that the map accesses by operator[] create empty entries if they
1001
    // don't exist yet.
1002
    if (doc.meta[Doc::keyabs].empty()) {
1003
  syntabs = true;
1004
  if (!doc.text.empty())
1005
      doc.meta[Doc::keyabs] = rclSyntAbs + 
1006
      neutchars(truncate_to_word(doc.text, m_idxAbsTruncLen), nc);
1007
    } else {
1008
  doc.meta[Doc::keyabs] = 
1009
      neutchars(truncate_to_word(doc.meta[Doc::keyabs], m_idxAbsTruncLen),
1010
            nc);
1011
    }
1012
    if (!doc.meta[Doc::keyabs].empty())
1013
  record += "\n" + Doc::keyabs + "=" + doc.meta[Doc::keyabs];
1014
1015
    RclConfig *config = RclConfig::getMainConfig();
1016
    if (config) {
1017
  const set<string>& stored = config->getStoredFields();
1018
  for (set<string>::const_iterator it = stored.begin();
1019
       it != stored.end(); it++) {
1020
      if (!doc.meta[*it].empty()) {
1021
      string value = 
1022
          neutchars(truncate_to_word(doc.meta[*it], 150), nc);
1023
      record += "\n" + *it + "=" + value;
987
    }
1024
      }
988
    if (!doc.meta["title"].empty())
1025
  }
989
  record += "\ncaption=" + doc.meta["title"];
990
    if (!doc.meta["keywords"].empty())
991
  record += "\nkeywords=" + doc.meta["keywords"];
992
    if (!doc.meta["abstract"].empty())
993
  record += "\nabstract=" + doc.meta["abstract"];
994
    if (!doc.meta["author"].empty()) {
995
  record += "\nauthor=" + doc.meta["author"];
996
    }
1026
    }
997
    record += "\n";
1027
    record += "\n";
998
    LOGDEB1(("Newdocument data: %s\n", record.c_str()));
1028
    LOGDEB(("Rcl::Db::add: new doc record:\n %s\n", record.c_str()));
999
    newdocument.set_data(record);
1029
    newdocument.set_data(record);
1000
1030
1001
    const char *fnc = udi.c_str();
1031
    const char *fnc = udi.c_str();
1002
    string ermsg;
1032
    string ermsg;
1003
1033