Switch to unified view

a/src/rcldb/stemdb.cpp b/src/rcldb/stemdb.cpp
...
...
17
17
18
/**
18
/**
19
 * Management of the auxiliary databases listing stems and their expansion 
19
 * Management of the auxiliary databases listing stems and their expansion 
20
 * terms
20
 * terms
21
 */
21
 */
22
23
#include "autoconfig.h"
24
22
#include <unistd.h>
25
#include <unistd.h>
23
26
24
#include <algorithm>
27
#include <algorithm>
25
#include <map>
28
#include <map>
29
#include <iostream>
30
using namespace std;
26
31
27
#include <xapian.h>
32
#include <xapian.h>
28
33
29
#include "stemdb.h"
34
#include "stemdb.h"
30
#include "pathut.h"
31
#include "debuglog.h"
35
#include "debuglog.h"
32
#include "smallut.h"
36
#include "smallut.h"
33
#include "utf8iter.h"
34
#include "textsplit.h"
35
#include "rcldb.h"
36
#include "rcldb_p.h"
37
#include "synfamily.h"
37
#include "synfamily.h"
38
#include "unacpp.h"
38
#include "unacpp.h"
39
39
#include "rclconfig.h"
40
#include <iostream>
41
42
using namespace std;
43
40
44
namespace Rcl {
41
namespace Rcl {
45
42
46
// Fast raw detection of non-natural-language words: look for ascii
47
// chars which are not lowercase letters. Not too sure what islower()
48
// would do with 8 bit values, so not using it here. If we want to be
49
// more complete we'd need to go full utf-8
50
/**
 * Predicate for the cheap "does not look like a natural-language word" test.
 *
 * @param c character value to examine
 * @return true when c is below 'a', or strictly between 'z' and 128, that is
 *         a 7-bit value which is not a lowercase ASCII letter; false for
 *         'a'..'z' and for every value >= 128.
 */
inline static bool p_notlowerascii(unsigned int c)
{
    // Values >= 128 are deliberately accepted: no UTF-8 decoding is
    // attempted here, only the 7-bit range is classified.
    return c < 'a' || (c > 'z' && c < 128);
}
56
57
/**
58
 * Create database of stem to parents associations for a given language.
59
 */
60
bool createExpansionDbs(Xapian::WritableDatabase& wdb, 
61
          const vector<string>& langs)
62
{
63
    LOGDEB(("StemDb::createExpansionDbs\n"));
64
    Chrono cron;
65
66
    vector<XapWritableSynFamily> stemdbs;
67
    for (unsigned int i = 0; i < langs.size(); i++) {
68
  stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
69
  stemdbs[i].deleteMember(langs[i]);
70
  stemdbs[i].createMember(langs[i]);
71
  stemdbs[i].setCurrentMemberName(langs[i]);
72
    }
73
74
    // We walk the list of all terms, and stem each. We skip terms which
75
    // don't look like natural language.
76
    // If the stem is not identical to the term, we add a synonym entry.
77
    // Statistics
78
    int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
79
    int stemconst = 0; // Stem == term
80
    int allsyns = 0; // Total number of entries created
81
82
    string ermsg;
83
    try {
84
  vector<Xapian::Stem> stemmers;
85
  for (unsigned int i = 0; i < langs.size(); i++) {
86
      stemmers.push_back(Xapian::Stem(langs[i]));
87
  }
88
89
        for (Xapian::TermIterator it = wdb.allterms_begin(); 
90
       it != wdb.allterms_end(); it++) {
91
      // If the term has any non-lowercase 7bit char (that is,
92
            // numbers, capitals and punctuation) dont stem.
93
            string::iterator sit = (*it).begin(), eit = sit + (*it).length();
94
            if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
95
                ++nostem;
96
                LOGDEB1(("stemskipped: [%s], because of 0x%x\n", 
97
                         (*it).c_str(), *sit));
98
                continue;
99
            }
100
101
      // Detect and skip CJK terms.
102
      // We're still sending all other multibyte utf-8 chars to
103
            // the stemmer, which is not too well defined for
104
            // xapian<1.0 (very obsolete now), but seems to work
105
            // anyway. There shouldn't be too many in any case because
106
            // accents are stripped at this point. 
107
      // The effect of stripping accents on stemming is not good, 
108
            // (e.g: in french partimes -> partim, parti^mes -> part)
109
      // but fixing the issue would be complicated.
110
      Utf8Iter utfit(*it);
111
      if (TextSplit::isCJK(*utfit)) {
112
      // LOGDEB(("stemskipped: Skipping CJK\n"));
113
      continue;
114
      }
115
116
      // Create stemming synonym for every lang
117
      for (unsigned int i = 0; i < langs.size(); i++) {
118
      string stem = stemmers[i](*it);
119
      if (stem == *it) {
120
          ++stemconst;
121
      } else {
122
          stemdbs[i].addSynonym(stem, *it);
123
          LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n", 
124
               (*it).c_str(), langs[i].c_str(), stem.c_str()));
125
          ++allsyns;
126
      }
127
      }
128
129
        }
130
    } XCATCHERROR(ermsg);
131
    if (!ermsg.empty()) {
132
        LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
133
        return false;
134
    }
135
136
    LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
137
    LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n", 
138
      nostem, stemconst, allsyns));
139
    return true;
140
}
141
142
/**
143
 * Expand term to list of all terms which stem to the same term, for one
144
 * expansion language
145
 */
146
bool StemDb::expandOne(const std::string& lang,
147
             const std::string& term,
148
             vector<string>& result)
149
{
150
    try {
151
  Xapian::Stem stemmer(lang);
152
  string stem = stemmer(term);
153
  LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n", 
154
                lang.c_str(), term.c_str(), stem.c_str()));
155
156
  if (!synExpand(lang, stem, result)) {
157
      // ?
158
  }
159
160
  // If the user term or stem are not in the list, add them
161
  if (find(result.begin(), result.end(), term) == result.end()) {
162
      result.push_back(term);
163
  }
164
  if (find(result.begin(), result.end(), stem) == result.end()) {
165
      result.push_back(stem);
166
  }
167
  LOGDEB0(("stemExpand:%s: %s ->  %s\n", lang.c_str(), stem.c_str(),
168
       stringsToString(result).c_str()));
169
170
    } catch (...) {
171
  LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
172
      lang.c_str()));
173
  result.push_back(term);
174
  return false;
175
    }
176
177
    return true;
178
}
179
    
180
/**
43
/**
181
 * Expand for one or several languages
44
 * Expand for one or several languages
182
 */
45
 */
183
bool StemDb::stemExpand(const std::string& langs,
46
bool StemDb::stemExpand(const std::string& langs, const std::string& term,
184
          const std::string& term,
185
            vector<string>& result)
47
            vector<string>& result)
186
{
48
{
187
    vector<string> llangs;
49
    vector<string> llangs;
188
    stringToStrings(langs, llangs);
50
    stringToStrings(langs, llangs);
51
189
    for (vector<string>::const_iterator it = llangs.begin();
52
    for (vector<string>::const_iterator it = llangs.begin();
190
     it != llangs.end(); it++) {
53
     it != llangs.end(); it++) {
191
  vector<string> oneexp;
54
  SynTermTransStem stemmer(*it);
192
  expandOne(*it, term, oneexp);
55
  XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
193
  result.insert(result.end(), oneexp.begin(), oneexp.end());
56
  (void)expander.synExpand(term, result);
194
    }
57
    }
58
59
#ifndef RCL_INDEX_STRIPCHARS
60
    // Expand the unaccented stem
61
    if (!o_index_stripchars) {
62
  for (vector<string>::const_iterator it = llangs.begin();
63
       it != llangs.end(); it++) {
64
      SynTermTransStem stemmer(*it);
65
      XapComputableSynFamMember expander(getdb(), synFamStemUnac, 
66
                         *it, &stemmer);
67
      string unac;
68
      unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
69
      (void)expander.synExpand(unac, result);
70
  }
71
    }
72
#endif 
73
74
    if (result.empty())
75
  result.push_back(term);
76
195
    sort(result.begin(), result.end());
77
    sort(result.begin(), result.end());
196
    unique(result.begin(), result.end());
78
    vector<string>::iterator uit = unique(result.begin(), result.end());
79
    result.resize(uit - result.begin());
80
    LOGDEB0(("stemExpand:%s: %s ->  %s\n", langs.c_str(), term.c_str(),
81
       stringsToString(result).c_str()));
197
    return true;
82
    return true;
198
}
83
}
199
84
200
85
201
}
86
}