Switch to unified view

a/src/rcldb/stemdb.cpp b/src/rcldb/stemdb.cpp
...
...
25
#include <map>
25
#include <map>
26
26
27
#include <xapian.h>
27
#include <xapian.h>
28
28
29
#include "stemdb.h"
29
#include "stemdb.h"
30
#include "wipedir.h"
31
#include "pathut.h"
30
#include "pathut.h"
32
#include "debuglog.h"
31
#include "debuglog.h"
33
#include "smallut.h"
32
#include "smallut.h"
34
#include "utf8iter.h"
33
#include "utf8iter.h"
35
#include "textsplit.h"
34
#include "textsplit.h"
35
#include "rcldb.h"
36
#include "rcldb_p.h"
37
#include "synfamily.h"
38
39
#include <iostream>
36
40
37
using namespace std;
41
using namespace std;
38
42
39
namespace Rcl {
43
namespace Rcl {
40
namespace StemDb {
44
namespace StemDb {
41
45
42
46
43
static const string cstr_stemdirstem = "stem_";
47
vector<string> getLangs(Xapian::Database& xdb)
44
45
/// Compute name of stem db for given base database and language
46
static string stemdbname(const string& dbdir, const string& lang)
47
{
48
{
48
    return path_cat(dbdir, cstr_stemdirstem + lang);
49
    XapSynFamily fam(xdb, synprefStem);
49
}
50
    vector<string> langs;
50
51
    (void)fam.getMembers(langs);
51
vector<string> getLangs(const string& dbdir)
52
{
53
    string pattern = cstr_stemdirstem + "*";
54
    vector<string> dirs = path_dirglob(dbdir, pattern);
55
    for (vector<string>::iterator it = dirs.begin(); it != dirs.end(); it++) {
56
  *it = path_basename(*it);
57
  *it = it->substr(cstr_stemdirstem.length(), string::npos);
58
    }
59
    return dirs;
60
}
61
62
bool deleteDb(const string& dbdir, const string& lang)
63
{
64
    string dir = stemdbname(dbdir, lang);
65
    if (wipedir(dir) == 0 && rmdir(dir.c_str()) == 0)
66
  return true;
67
    return false;
52
    return langs;
68
}
69
70
// Autoclean/delete directory 
71
class DirWiper {
72
 public:
73
    string dir;
74
    bool do_it;
75
    DirWiper(string d) : dir(d), do_it(true) {}
76
    ~DirWiper() {
77
  if (do_it) {
78
      wipedir(dir);
79
      rmdir(dir.c_str());
80
  }
53
}
81
    }
54
55
bool deleteDb(Xapian::WritableDatabase& xdb, const string& lang)
56
{
57
    XapWritableSynFamily fam(xdb, synprefStem);
58
    return fam.deleteMember(lang);
82
};
59
}
83
60
84
inline static bool
61
inline static bool
85
p_notlowerascii(unsigned int c)
62
p_notlowerascii(unsigned int c)
86
{
63
{
87
    if (c < 'a' || (c > 'z' && c < 128))
64
    if (c < 'a' || (c > 'z' && c < 128))
88
    return true;
65
    return true;
89
    return false;
66
    return false;
90
}
67
}
91
92
static bool addAssoc(Xapian::WritableDatabase &sdb, const string& stem,
93
                     const vector<string>& derivs)
94
{
95
    Xapian::Document newdocument;
96
    newdocument.add_term(stem);
97
    // The doc data is just parents=blank-separated-list
98
    string record = "parents=";
99
    for (vector<string>::const_iterator it = derivs.begin(); 
100
         it != derivs.end(); it++) {
101
        record += *it + " ";
102
    }
103
    record += "\n";
104
    LOGDEB2(("createStemDb: stmdoc data: [%s]\n", record.c_str()));
105
    newdocument.set_data(record);
106
    try {
107
        sdb.replace_document(stem, newdocument);
108
    } catch (...) {
109
        LOGERR(("Db::createstemdb(addAssoc): replace failed\n"));
110
        return false;
111
    }
112
    return true;
113
}
114
115
68
116
/**
69
/**
117
 * Create database of stem to parents associations for a given language.
70
 * Create database of stem to parents associations for a given language.
118
 * We walk the list of all terms, stem them, and create another Xapian db
71
 * We walk the list of all terms, stem them, and create another Xapian db
119
 * with documents indexed by a single term (the stem), and with the list of
72
 * with documents indexed by a single term (the stem), and with the list of
120
 * parent terms in the document data.
73
 * parent terms in the document data.
121
 */
74
 */
122
bool createDb(Xapian::Database& xdb, const string& dbdir, const string& lang)
75
bool createDb(Xapian::WritableDatabase& xdb, const string& lang)
123
{
76
{
124
    LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
77
    LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
125
    Chrono cron;
78
    Chrono cron;
126
79
127
    // First build the in-memory stem database:
80
    // First build the in-memory stem database:
...
...
134
    map<string, vector<string> > assocs;
87
    map<string, vector<string> > assocs;
135
    // Statistics
88
    // Statistics
136
    int nostem=0; // Dont even try: not-alphanum (incomplete for now)
89
    int nostem=0; // Dont even try: not-alphanum (incomplete for now)
137
    int stemconst=0; // Stem == term
90
    int stemconst=0; // Stem == term
138
    int stemmultiple = 0; // Count of stems with multiple derivatives
91
    int stemmultiple = 0; // Count of stems with multiple derivatives
92
    string ermsg;
139
    try {
93
    try {
140
        Xapian::Stem stemmer(lang);
94
        Xapian::Stem stemmer(lang);
141
        Xapian::TermIterator it;
95
        Xapian::TermIterator it;
142
        for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
96
        for (it = xdb.allterms_begin(); it != xdb.allterms_end(); it++) {
143
        // If the term has any non-lowercase 7bit char (that is,
97
        // If the term has any non-lowercase 7bit char (that is,
...
...
172
                ++stemconst;
126
                ++stemconst;
173
                continue;
127
                continue;
174
            }
128
            }
175
            assocs[stem].push_back(*it);
129
            assocs[stem].push_back(*it);
176
        }
130
        }
177
    } catch (const Xapian::Error &e) {
131
    } XCATCHERROR(ermsg);
132
    if (!ermsg.empty()) {
178
        LOGERR(("Db::createStemDb: build failed: %s\n", e.get_msg().c_str()));
133
        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
179
        return false;
134
        return false;
180
    } catch (...) {
181
        LOGERR(("Db::createStemDb: build failed: no stemmer for %s ? \n", 
182
                lang.c_str()));
183
        return false;
184
    }
135
    }
136
185
    LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n", 
137
    LOGDEB1(("StemDb::createDb(%s): in memory map built: %.2f S\n", 
186
             lang.c_str(), cron.secs()));
138
             lang.c_str(), cron.secs()));
187
139
188
    // Create xapian database for stem relations
140
    XapWritableSynFamily fam(xdb, synprefStem);
189
    string stemdbdir = stemdbname(dbdir, lang);
141
    fam.createMember(lang);
190
    // We want to get rid of the db dir in case of error. This gets disarmed
191
    // just before success return.
192
    DirWiper wiper(stemdbdir);
193
    string ermsg;
194
    Xapian::WritableDatabase sdb;
195
    try {
196
        sdb = Xapian::WritableDatabase(stemdbdir, 
197
                                       Xapian::DB_CREATE_OR_OVERWRITE);
198
    } catch (const Xapian::Error &e) {
199
        ermsg = e.get_msg();
200
    } catch (const string &s) {
201
        ermsg = s;
202
    } catch (const char *s) {
203
        ermsg = s;
204
    } catch (...) {
205
        ermsg = "Caught unknown exception";
206
    }
207
    if (!ermsg.empty()) {
208
        LOGERR(("Db::createstemdb: exception while opening [%s]: %s\n", 
209
                stemdbdir.c_str(), ermsg.c_str()));
210
        return false;
211
    }
212
142
213
    // Enter pseud-docs in db by walking the map.
214
    for (map<string, vector<string> >::const_iterator it = assocs.begin();
143
    for (map<string, vector<string> >::const_iterator it = assocs.begin();
215
         it != assocs.end(); it++) {
144
         it != assocs.end(); it++) {
216
    LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
145
    LOGDEB2(("createStemDb: stem [%s]\n", it->first.c_str()));
217
    // We need an entry even if there is only one derivative
146
    // We need an entry even if there is only one derivative
218
    // so that it is possible to search by entering the stem
147
    // so that it is possible to search by entering the stem
219
    // even if it doesnt exist as a term
148
    // even if it doesnt exist as a term
220
    if (it->second.size() > 1)
149
    if (it->second.size() > 1)
221
        ++stemmultiple;
150
        ++stemmultiple;
222
                    
223
    if (!addAssoc(sdb, it->first, it->second)) {
151
    if (!fam.addSynonyms(lang, it->first, it->second)) {
224
        return false;
152
        return false;
225
    }
153
    }
226
    }
154
    }
227
155
228
    LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n", 
156
    LOGDEB1(("StemDb::createDb(%s): done: %.2f S\n", 
229
             lang.c_str(), cron.secs()));
157
             lang.c_str(), cron.secs()));
230
    LOGDEB(("Stem map size: %d mult %d const %d no %d \n", 
158
    LOGDEB(("Stem map size: %d mult %d const %d no %d \n", 
231
        assocs.size(), stemmultiple, stemconst, nostem));
159
        assocs.size(), stemmultiple, stemconst, nostem));
232
    wiper.do_it = false;
160
    fam.listMap(lang);
233
    return true;
161
    return true;
234
}
162
}
235
163
236
static string stringlistdisp(const vector<string>& sl)
164
static string stringlistdisp(const vector<string>& sl)
237
{
165
{
...
...
245
173
246
/**
174
/**
247
 * Expand term to list of all terms which stem to the same term, for one
175
 * Expand term to list of all terms which stem to the same term, for one
248
 * expansion language
176
 * expansion language
249
 */
177
 */
250
static bool stemExpandOne(const std::string& dbdir, 
178
static bool stemExpandOne(Xapian::Database& xdb, 
251
              const std::string& lang,
179
              const std::string& lang,
252
              const std::string& term,
180
              const std::string& term,
253
              vector<string>& result)
181
              vector<string>& result)
254
{
182
{
255
    try {
183
    try {
256
    Xapian::Stem stemmer(lang);
184
    Xapian::Stem stemmer(lang);
257
    string stem = stemmer(term);
185
    string stem = stemmer(term);
258
    LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n", 
186
    LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n", 
259
                lang.c_str(), term.c_str(), stem.c_str()));
187
                lang.c_str(), term.c_str(), stem.c_str()));
260
188
261
  // Open stem database
189
  XapSynFamily fam(xdb, synprefStem);
262
  string stemdbdir = stemdbname(dbdir, lang);
190
  if (!fam.synExpand(lang, stem, result)) {
263
  Xapian::Database sdb(stemdbdir);
191
      // ?
264
  LOGDEB0(("stemExpand: %s lastdocid: %d\n", 
265
      stemdbdir.c_str(), sdb.get_lastdocid()));
266
267
  // Try to fetch the doc from the stem db
268
  if (!sdb.term_exists(stem)) {
269
      LOGDEB0(("Db::stemExpand: no term for %s\n", stem.c_str()));
270
  } else {
271
      Xapian::PostingIterator did = sdb.postlist_begin(stem);
272
      if (did == sdb.postlist_end(stem)) {
273
      LOGDEB0(("stemExpand: no term(1) for %s\n",stem.c_str()));
274
      } else {
275
      Xapian::Document doc = sdb.get_document(*did);
276
      string data = doc.get_data();
277
278
      // Build expansion list from database data No need for
279
      // a conftree, but we need to massage the data a
280
      // little
281
      string::size_type pos = data.find('=');
282
      string::size_type pos1 = data.rfind('\n');
283
      if (pos == string::npos || pos1 == string::npos || 
284
          pos1 <= pos+1) {
285
          LOGERR(("stemExpand: bad data in db: [%s]\n", 
286
              data.c_str()));
287
      } else {
288
          ++pos;
289
          stringToStrings(data.substr(pos, pos1-pos), result);
290
      }
291
      }
292
    }
192
    }
293
193
294
    // If the user term or stem are not in the list, add them
194
    // If the user term or stem are not in the list, add them
295
    if (find(result.begin(), result.end(), term) == result.end()) {
195
    if (find(result.begin(), result.end(), term) == result.end()) {
296
        result.push_back(term);
196
        result.push_back(term);
...
...
300
    }
200
    }
301
    LOGDEB0(("stemExpand:%s: %s ->  %s\n", lang.c_str(), stem.c_str(),
201
    LOGDEB0(("stemExpand:%s: %s ->  %s\n", lang.c_str(), stem.c_str(),
302
         stringlistdisp(result).c_str()));
202
         stringlistdisp(result).c_str()));
303
203
304
    } catch (...) {
204
    } catch (...) {
305
    LOGERR(("stemExpand: error accessing stem db. dbdir [%s] lang [%s]\n",
205
    LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
306
      dbdir.c_str(), lang.c_str()));
206
      lang.c_str()));
307
    result.push_back(term);
207
    result.push_back(term);
308
    return false;
208
    return false;
309
    }
209
    }
310
210
311
    return true;
211
    return true;
...
...
313
    
213
    
314
/**
214
/**
315
 * Expand term to list of all terms which stem to the same term, add the
215
 * Expand term to list of all terms which stem to the same term, add the
316
 * expansion sets for possibly multiple expansion languages
216
 * expansion sets for possibly multiple expansion languages
317
 */
217
 */
318
bool stemExpand(const std::string& dbdir, 
218
bool stemExpand(Xapian::Database& xdb,
319
        const std::string& langs,
219
        const std::string& langs,
320
        const std::string& term,
220
        const std::string& term,
321
        vector<string>& result)
221
        vector<string>& result)
322
{
222
{
323
324
    vector<string> llangs;
223
    vector<string> llangs;
325
    stringToStrings(langs, llangs);
224
    stringToStrings(langs, llangs);
326
    for (vector<string>::const_iterator it = llangs.begin();
225
    for (vector<string>::const_iterator it = llangs.begin();
327
     it != llangs.end(); it++) {
226
     it != llangs.end(); it++) {
328
    vector<string> oneexp;
227
    vector<string> oneexp;
329
    stemExpandOne(dbdir, *it, term, oneexp);
228
    stemExpandOne(xdb, *it, term, oneexp);
330
    result.insert(result.end(), oneexp.begin(), oneexp.end());
229
    result.insert(result.end(), oneexp.begin(), oneexp.end());
331
    }
230
    }
332
    sort(result.begin(), result.end());
231
    sort(result.begin(), result.end());
333
    unique(result.begin(), result.end());
232
    unique(result.begin(), result.end());
334
    return true;
233
    return true;