|
a/src/rcldb/expansiondbs.cpp |
|
b/src/rcldb/expansiondbs.cpp |
|
... |
|
... |
61 |
|
61 |
|
62 |
#ifndef RCL_INDEX_STRIPCHARS
|
62 |
#ifndef RCL_INDEX_STRIPCHARS
|
63 |
// Unaccented stem dbs
|
63 |
// Unaccented stem dbs
|
64 |
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
64 |
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
65 |
// We can reuse the same stemmer pointers, the objects are stateless.
|
65 |
// We can reuse the same stemmer pointers, the objects are stateless.
|
|
|
66 |
if (!o_index_stripchars) {
|
66 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
67 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
67 |
unacstemdbs.push_back(
|
68 |
unacstemdbs.push_back(
|
68 |
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
|
69 |
XapWritableComputableSynFamMember(wdb, synFamStemUnac, langs[i],
|
69 |
stemmers.back().getptr()));
|
70 |
stemmers.back().getptr()));
|
70 |
unacstemdbs.back().recreate();
|
71 |
unacstemdbs.back().recreate();
|
|
|
72 |
}
|
71 |
}
|
73 |
}
|
72 |
|
|
|
73 |
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
74 |
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
74 |
XapWritableComputableSynFamMember
|
75 |
XapWritableComputableSynFamMember
|
75 |
diacasedb(wdb, synFamDiac, "all", &transunac);
|
76 |
diacasedb(wdb, synFamDiCa, "all", &transunac);
|
|
|
77 |
if (!o_index_stripchars)
|
76 |
diacasedb.recreate();
|
78 |
diacasedb.recreate();
|
77 |
#endif
|
79 |
#endif
|
78 |
|
80 |
|
79 |
// Walk the list of all terms, and stem/unac each.
|
81 |
// Walk the list of all terms, and stem/unac each.
|
80 |
string ermsg;
|
82 |
string ermsg;
|
81 |
try {
|
83 |
try {
|
|
... |
|
... |
107 |
#ifndef RCL_INDEX_STRIPCHARS
|
109 |
#ifndef RCL_INDEX_STRIPCHARS
|
108 |
// If the index is raw, compute the case-folded term which
|
110 |
// If the index is raw, compute the case-folded term which
|
109 |
// is the input to the stem db, and add a synonym from the
|
111 |
// is the input to the stem db, and add a synonym from the
|
110 |
// stripped term to the cased and accented one, for accent
|
112 |
// stripped term to the cased and accented one, for accent
|
111 |
// and case expansion at query time
|
113 |
// and case expansion at query time
|
|
|
114 |
if (!o_index_stripchars) {
|
112 |
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
115 |
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
113 |
diacasedb.addSynonym(*it);
|
116 |
diacasedb.addSynonym(*it);
|
|
|
117 |
}
|
114 |
#endif
|
118 |
#endif
|
115 |
|
119 |
|
116 |
// Create stemming synonym for every language. The input is the
|
120 |
// Create stemming synonym for every language. The input is the
|
117 |
// lowercase accented term
|
121 |
// lowercase accented term
|
118 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
122 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
... |
|
... |
122 |
#ifndef RCL_INDEX_STRIPCHARS
|
126 |
#ifndef RCL_INDEX_STRIPCHARS
|
123 |
// For a raw index, also maybe create a stem expansion for
|
127 |
// For a raw index, also maybe create a stem expansion for
|
124 |
// the unaccented term. While this may be incorrect, it is
|
128 |
// the unaccented term. While this may be incorrect, it is
|
125 |
// also necessary for searching in a diacritic-insensitive
|
129 |
// also necessary for searching in a diacritic-insensitive
|
126 |
// way on a raw index
|
130 |
// way on a raw index
|
|
|
131 |
if (!o_index_stripchars) {
|
127 |
string unac;
|
132 |
string unac;
|
128 |
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
|
133 |
unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
|
129 |
if (unac != lower)
|
134 |
if (unac != lower) {
|
130 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
135 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
131 |
unacstemdbs[i].addSynonym(unac);
|
136 |
unacstemdbs[i].addSynonym(unac);
|
|
|
137 |
}
|
132 |
}
|
138 |
}
|
|
|
139 |
}
|
133 |
#endif
|
140 |
#endif
|
134 |
}
|
141 |
}
|
135 |
} XCATCHERROR(ermsg);
|
142 |
} XCATCHERROR(ermsg);
|
136 |
if (!ermsg.empty()) {
|
143 |
if (!ermsg.empty()) {
|
137 |
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
144 |
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|