|
a/src/rcldb/stemdb.cpp |
|
b/src/rcldb/stemdb.cpp |
|
... |
|
... |
17 |
|
17 |
|
18 |
/**
|
18 |
/**
|
19 |
* Management of the auxiliary databases listing stems and their expansion
|
19 |
* Management of the auxiliary databases listing stems and their expansion
|
20 |
* terms
|
20 |
* terms
|
21 |
*/
|
21 |
*/
|
|
|
22 |
|
|
|
23 |
#include "autoconfig.h"
|
|
|
24 |
|
22 |
#include <unistd.h>
|
25 |
#include <unistd.h>
|
23 |
|
26 |
|
24 |
#include <algorithm>
|
27 |
#include <algorithm>
|
25 |
#include <map>
|
28 |
#include <map>
|
|
|
29 |
#include <iostream>
|
|
|
30 |
using namespace std;
|
26 |
|
31 |
|
27 |
#include <xapian.h>
|
32 |
#include <xapian.h>
|
28 |
|
33 |
|
29 |
#include "stemdb.h"
|
34 |
#include "stemdb.h"
|
30 |
#include "pathut.h"
|
|
|
31 |
#include "debuglog.h"
|
35 |
#include "debuglog.h"
|
32 |
#include "smallut.h"
|
36 |
#include "smallut.h"
|
33 |
#include "utf8iter.h"
|
|
|
34 |
#include "textsplit.h"
|
|
|
35 |
#include "rcldb.h"
|
|
|
36 |
#include "rcldb_p.h"
|
|
|
37 |
#include "synfamily.h"
|
37 |
#include "synfamily.h"
|
38 |
#include "unacpp.h"
|
38 |
#include "unacpp.h"
|
39 |
|
39 |
#include "rclconfig.h"
|
40 |
#include <iostream>
|
|
|
41 |
|
|
|
42 |
using namespace std;
|
|
|
43 |
|
40 |
|
44 |
namespace Rcl {
|
41 |
namespace Rcl {
|
45 |
|
42 |
|
46 |
// Cheap screening for terms which are not natural-language words.
// Answers true for any 7-bit ASCII value which is not a lowercase
// letter (digits, capitals, punctuation, control characters), and false
// for 'a'..'z' and for every value of 128 and above. islower() is
// avoided on purpose as its behaviour with 8-bit values is uncertain; a
// more complete test would need full utf-8 decoding.
inline static bool p_notlowerascii(unsigned int c)
{
    const bool lowercaseletter = (c >= 'a' && c <= 'z');
    const bool sevenbit = (c < 128);
    return sevenbit && !lowercaseletter;
}
|
|
|
56 |
|
|
|
57 |
/**
|
|
|
58 |
* Create database of stem to parents associations for a given language.
|
|
|
59 |
*/
|
|
|
60 |
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|
|
61 |
const vector<string>& langs)
|
|
|
62 |
{
|
|
|
63 |
LOGDEB(("StemDb::createExpansionDbs\n"));
|
|
|
64 |
Chrono cron;
|
|
|
65 |
|
|
|
66 |
vector<XapWritableSynFamily> stemdbs;
|
|
|
67 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
|
68 |
stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
|
|
|
69 |
stemdbs[i].deleteMember(langs[i]);
|
|
|
70 |
stemdbs[i].createMember(langs[i]);
|
|
|
71 |
stemdbs[i].setCurrentMemberName(langs[i]);
|
|
|
72 |
}
|
|
|
73 |
|
|
|
74 |
// We walk the list of all terms, and stem each. We skip terms which
|
|
|
75 |
// don't look like natural language.
|
|
|
76 |
// If the stem is not identical to the term, we add a synonym entry.
|
|
|
77 |
// Statistics
|
|
|
78 |
int nostem = 0; // Dont even try: not-alphanum (incomplete for now)
|
|
|
79 |
int stemconst = 0; // Stem == term
|
|
|
80 |
int allsyns = 0; // Total number of entries created
|
|
|
81 |
|
|
|
82 |
string ermsg;
|
|
|
83 |
try {
|
|
|
84 |
vector<Xapian::Stem> stemmers;
|
|
|
85 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
|
86 |
stemmers.push_back(Xapian::Stem(langs[i]));
|
|
|
87 |
}
|
|
|
88 |
|
|
|
89 |
for (Xapian::TermIterator it = wdb.allterms_begin();
|
|
|
90 |
it != wdb.allterms_end(); it++) {
|
|
|
91 |
// If the term has any non-lowercase 7bit char (that is,
|
|
|
92 |
// numbers, capitals and punctuation) dont stem.
|
|
|
93 |
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
|
|
94 |
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
|
|
95 |
++nostem;
|
|
|
96 |
LOGDEB1(("stemskipped: [%s], because of 0x%x\n",
|
|
|
97 |
(*it).c_str(), *sit));
|
|
|
98 |
continue;
|
|
|
99 |
}
|
|
|
100 |
|
|
|
101 |
// Detect and skip CJK terms.
|
|
|
102 |
// We're still sending all other multibyte utf-8 chars to
|
|
|
103 |
// the stemmer, which is not too well defined for
|
|
|
104 |
// xapian<1.0 (very obsolete now), but seems to work
|
|
|
105 |
// anyway. There shouldn't be too many in any case because
|
|
|
106 |
// accents are stripped at this point.
|
|
|
107 |
// The effect of stripping accents on stemming is not good,
|
|
|
108 |
// (e.g: in french partimes -> partim, parti^mes -> part)
|
|
|
109 |
// but fixing the issue would be complicated.
|
|
|
110 |
Utf8Iter utfit(*it);
|
|
|
111 |
if (TextSplit::isCJK(*utfit)) {
|
|
|
112 |
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
|
|
113 |
continue;
|
|
|
114 |
}
|
|
|
115 |
|
|
|
116 |
// Create stemming synonym for every lang
|
|
|
117 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
|
118 |
string stem = stemmers[i](*it);
|
|
|
119 |
if (stem == *it) {
|
|
|
120 |
++stemconst;
|
|
|
121 |
} else {
|
|
|
122 |
stemdbs[i].addSynonym(stem, *it);
|
|
|
123 |
LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
|
|
|
124 |
(*it).c_str(), langs[i].c_str(), stem.c_str()));
|
|
|
125 |
++allsyns;
|
|
|
126 |
}
|
|
|
127 |
}
|
|
|
128 |
|
|
|
129 |
}
|
|
|
130 |
} XCATCHERROR(ermsg);
|
|
|
131 |
if (!ermsg.empty()) {
|
|
|
132 |
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
|
|
133 |
return false;
|
|
|
134 |
}
|
|
|
135 |
|
|
|
136 |
LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
|
|
|
137 |
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
|
|
138 |
nostem, stemconst, allsyns));
|
|
|
139 |
return true;
|
|
|
140 |
}
|
|
|
141 |
|
|
|
142 |
/**
|
|
|
143 |
* Expand term to list of all terms which stem to the same term, for one
|
|
|
144 |
* expansion language
|
|
|
145 |
*/
|
|
|
146 |
bool StemDb::expandOne(const std::string& lang,
|
|
|
147 |
const std::string& term,
|
|
|
148 |
vector<string>& result)
|
|
|
149 |
{
|
|
|
150 |
try {
|
|
|
151 |
Xapian::Stem stemmer(lang);
|
|
|
152 |
string stem = stemmer(term);
|
|
|
153 |
LOGDEB(("stemExpand:%s: [%s] stem-> [%s]\n",
|
|
|
154 |
lang.c_str(), term.c_str(), stem.c_str()));
|
|
|
155 |
|
|
|
156 |
if (!synExpand(lang, stem, result)) {
|
|
|
157 |
// ?
|
|
|
158 |
}
|
|
|
159 |
|
|
|
160 |
// If the user term or stem are not in the list, add them
|
|
|
161 |
if (find(result.begin(), result.end(), term) == result.end()) {
|
|
|
162 |
result.push_back(term);
|
|
|
163 |
}
|
|
|
164 |
if (find(result.begin(), result.end(), stem) == result.end()) {
|
|
|
165 |
result.push_back(stem);
|
|
|
166 |
}
|
|
|
167 |
LOGDEB0(("stemExpand:%s: %s -> %s\n", lang.c_str(), stem.c_str(),
|
|
|
168 |
stringsToString(result).c_str()));
|
|
|
169 |
|
|
|
170 |
} catch (...) {
|
|
|
171 |
LOGERR(("stemExpand: error accessing stem db. lang [%s]\n",
|
|
|
172 |
lang.c_str()));
|
|
|
173 |
result.push_back(term);
|
|
|
174 |
return false;
|
|
|
175 |
}
|
|
|
176 |
|
|
|
177 |
return true;
|
|
|
178 |
}
|
|
|
179 |
|
|
|
180 |
/**
|
43 |
/**
|
181 |
* Expand for one or several languages
|
44 |
* Expand for one or several languages
|
182 |
*/
|
45 |
*/
|
183 |
bool StemDb::stemExpand(const std::string& langs,
|
46 |
bool StemDb::stemExpand(const std::string& langs, const std::string& term,
|
184 |
const std::string& term,
|
|
|
185 |
vector<string>& result)
|
47 |
vector<string>& result)
|
186 |
{
|
48 |
{
|
187 |
vector<string> llangs;
|
49 |
vector<string> llangs;
|
188 |
stringToStrings(langs, llangs);
|
50 |
stringToStrings(langs, llangs);
|
|
|
51 |
|
189 |
for (vector<string>::const_iterator it = llangs.begin();
|
52 |
for (vector<string>::const_iterator it = llangs.begin();
|
190 |
it != llangs.end(); it++) {
|
53 |
it != llangs.end(); it++) {
|
191 |
vector<string> oneexp;
|
54 |
SynTermTransStem stemmer(*it);
|
192 |
expandOne(*it, term, oneexp);
|
55 |
XapComputableSynFamMember expander(getdb(), synFamStem, *it, &stemmer);
|
193 |
result.insert(result.end(), oneexp.begin(), oneexp.end());
|
56 |
(void)expander.synExpand(term, result);
|
194 |
}
|
57 |
}
|
|
|
58 |
|
|
|
59 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
60 |
// Expand the unaccented stem
|
|
|
61 |
if (!o_index_stripchars) {
|
|
|
62 |
for (vector<string>::const_iterator it = llangs.begin();
|
|
|
63 |
it != llangs.end(); it++) {
|
|
|
64 |
SynTermTransStem stemmer(*it);
|
|
|
65 |
XapComputableSynFamMember expander(getdb(), synFamStemUnac,
|
|
|
66 |
*it, &stemmer);
|
|
|
67 |
string unac;
|
|
|
68 |
unacmaybefold(term, unac, "UTF-8", UNACOP_UNAC);
|
|
|
69 |
(void)expander.synExpand(unac, result);
|
|
|
70 |
}
|
|
|
71 |
}
|
|
|
72 |
#endif
|
|
|
73 |
|
|
|
74 |
if (result.empty())
|
|
|
75 |
result.push_back(term);
|
|
|
76 |
|
195 |
sort(result.begin(), result.end());
|
77 |
sort(result.begin(), result.end());
|
196 |
unique(result.begin(), result.end());
|
78 |
vector<string>::iterator uit = unique(result.begin(), result.end());
|
|
|
79 |
result.resize(uit - result.begin());
|
|
|
80 |
LOGDEB0(("stemExpand:%s: %s -> %s\n", langs.c_str(), term.c_str(),
|
|
|
81 |
stringsToString(result).c_str()));
|
197 |
return true;
|
82 |
return true;
|
198 |
}
|
83 |
}
|
199 |
|
84 |
|
200 |
|
85 |
|
201 |
}
|
86 |
}
|