|
a/src/rcldb/expansiondbs.cpp |
|
b/src/rcldb/expansiondbs.cpp |
|
... |
|
... |
46 |
// Erase and recreate all the expansion groups
|
46 |
// Erase and recreate all the expansion groups
|
47 |
|
47 |
|
48 |
// If langs is empty and we don't need casediac expansion, then no need to
|
48 |
// If langs is empty and we don't need casediac expansion, then no need to
|
49 |
// walk the big list
|
49 |
// walk the big list
|
50 |
if (langs.empty()) {
|
50 |
if (langs.empty()) {
|
51 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
52 |
if (o_index_stripchars)
|
51 |
if (o_index_stripchars)
|
53 |
#endif
|
|
|
54 |
return true;
|
52 |
return true;
|
55 |
}
|
53 |
}
|
56 |
|
54 |
|
57 |
// Stem dbs
|
55 |
// Stem dbs
|
58 |
vector<XapWritableComputableSynFamMember> stemdbs;
|
56 |
vector<XapWritableComputableSynFamMember> stemdbs;
|
|
... |
|
... |
66 |
XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],
|
64 |
XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],
|
67 |
stemmers.back().getptr()));
|
65 |
stemmers.back().getptr()));
|
68 |
stemdbs.back().recreate();
|
66 |
stemdbs.back().recreate();
|
69 |
}
|
67 |
}
|
70 |
|
68 |
|
71 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
72 |
// Unaccented stem dbs
|
69 |
// Unaccented stem dbs
|
73 |
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
70 |
vector<XapWritableComputableSynFamMember> unacstemdbs;
|
74 |
// We can reuse the same stemmer pointers, the objects are stateless.
|
71 |
// We can reuse the same stemmer pointers, the objects are stateless.
|
75 |
if (!o_index_stripchars) {
|
72 |
if (!o_index_stripchars) {
|
76 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
73 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
... |
|
... |
83 |
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
80 |
SynTermTransUnac transunac(UNACOP_UNACFOLD);
|
84 |
XapWritableComputableSynFamMember
|
81 |
XapWritableComputableSynFamMember
|
85 |
diacasedb(wdb, synFamDiCa, "all", &transunac);
|
82 |
diacasedb(wdb, synFamDiCa, "all", &transunac);
|
86 |
if (!o_index_stripchars)
|
83 |
if (!o_index_stripchars)
|
87 |
diacasedb.recreate();
|
84 |
diacasedb.recreate();
|
88 |
#endif
|
|
|
89 |
|
85 |
|
90 |
// Walk the list of all terms, and stem/unac each.
|
86 |
// Walk the list of all terms, and stem/unac each.
|
91 |
string ermsg;
|
87 |
string ermsg;
|
92 |
try {
|
88 |
try {
|
93 |
Xapian::TermIterator it = wdb.allterms_begin();
|
89 |
Xapian::TermIterator it = wdb.allterms_begin();
|
|
... |
|
... |
105 |
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
101 |
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
106 |
continue;
|
102 |
continue;
|
107 |
}
|
103 |
}
|
108 |
|
104 |
|
109 |
string lower = *it;
|
105 |
string lower = *it;
|
110 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
111 |
// If the index is raw, compute the case-folded term which
|
106 |
// If the index is raw, compute the case-folded term which
|
112 |
// is the input to the stem db, and add a synonym from the
|
107 |
// is the input to the stem db, and add a synonym from the
|
113 |
// stripped term to the cased and accented one, for accent
|
108 |
// stripped term to the cased and accented one, for accent
|
114 |
// and case expansion at query time
|
109 |
// and case expansion at query time
|
115 |
if (!o_index_stripchars) {
|
110 |
if (!o_index_stripchars) {
|
116 |
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
111 |
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
117 |
diacasedb.addSynonym(*it);
|
112 |
diacasedb.addSynonym(*it);
|
118 |
}
|
113 |
}
|
119 |
#endif
|
|
|
120 |
|
114 |
|
121 |
// Don't apply stemming to terms which don't look like
|
115 |
// Don't apply stemming to terms which don't look like
|
122 |
// natural language words.
|
116 |
// natural language words.
|
123 |
if (!Db::isSpellingCandidate(*it)) {
|
117 |
if (!Db::isSpellingCandidate(*it)) {
|
124 |
LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
|
118 |
LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
|
|
... |
|
... |
129 |
// lowercase accented term
|
123 |
// lowercase accented term
|
130 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
124 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
131 |
stemdbs[i].addSynonym(lower);
|
125 |
stemdbs[i].addSynonym(lower);
|
132 |
}
|
126 |
}
|
133 |
|
127 |
|
134 |
#ifndef RCL_INDEX_STRIPCHARS
|
|
|
135 |
// For a raw index, also maybe create a stem expansion for
|
128 |
// For a raw index, also maybe create a stem expansion for
|
136 |
// the unaccented term. While this may be incorrect, it is
|
129 |
// the unaccented term. While this may be incorrect, it is
|
137 |
// also necessary for searching in a diacritic-insensitive
|
130 |
// also necessary for searching in a diacritic-insensitive
|
138 |
// way on a raw index
|
131 |
// way on a raw index
|
139 |
if (!o_index_stripchars) {
|
132 |
if (!o_index_stripchars) {
|
|
... |
|
... |
143 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
136 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
144 |
unacstemdbs[i].addSynonym(unac);
|
137 |
unacstemdbs[i].addSynonym(unac);
|
145 |
}
|
138 |
}
|
146 |
}
|
139 |
}
|
147 |
}
|
140 |
}
|
148 |
#endif
|
|
|
149 |
}
|
141 |
}
|
150 |
} XCATCHERROR(ermsg);
|
142 |
} XCATCHERROR(ermsg);
|
151 |
if (!ermsg.empty()) {
|
143 |
if (!ermsg.empty()) {
|
152 |
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
144 |
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
153 |
return false;
|
145 |
return false;
|