|
a/src/rcldb/expansiondbs.cpp |
|
b/src/rcldb/expansiondbs.cpp |
|
... |
|
... |
82 |
string ermsg;
|
82 |
string ermsg;
|
83 |
try {
|
83 |
try {
|
84 |
for (Xapian::TermIterator it = wdb.allterms_begin();
|
84 |
for (Xapian::TermIterator it = wdb.allterms_begin();
|
85 |
it != wdb.allterms_end(); it++) {
|
85 |
it != wdb.allterms_end(); it++) {
|
86 |
|
86 |
|
87 |
// Skip terms which don't look like natural language words.
|
|
|
88 |
if (!Db::isSpellingCandidate(*it)) {
|
|
|
89 |
LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
|
|
|
90 |
continue;
|
|
|
91 |
}
|
|
|
92 |
|
|
|
93 |
// Detect and skip CJK terms.
|
87 |
// Detect and skip CJK terms.
|
94 |
// We're still sending all other multibyte utf-8 chars to
|
|
|
95 |
// the stemmer, which is not too well defined for
|
|
|
96 |
// xapian<1.0 (very obsolete now), but seems to work
|
|
|
97 |
// anyway. There shouldn't be too many in any case because
|
|
|
98 |
// accents are stripped at this point.
|
|
|
99 |
// The effect of stripping accents on stemming is not good,
|
|
|
100 |
// (e.g. in French: partimes -> partim, parti^mes -> part)
|
|
|
101 |
// but fixing the issue would be complicated.
|
|
|
102 |
Utf8Iter utfit(*it);
|
88 |
Utf8Iter utfit(*it);
|
103 |
if (TextSplit::isCJK(*utfit)) {
|
89 |
if (TextSplit::isCJK(*utfit)) {
|
104 |
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
90 |
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
105 |
continue;
|
91 |
continue;
|
106 |
}
|
92 |
}
|
|
... |
|
... |
114 |
if (!o_index_stripchars) {
|
100 |
if (!o_index_stripchars) {
|
115 |
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
101 |
unacmaybefold(*it, lower, "UTF-8", UNACOP_FOLD);
|
116 |
diacasedb.addSynonym(*it);
|
102 |
diacasedb.addSynonym(*it);
|
117 |
}
|
103 |
}
|
118 |
#endif
|
104 |
#endif
|
|
|
105 |
|
|
|
106 |
// Don't apply stemming to terms which don't look like
|
|
|
107 |
// natural language words.
|
|
|
108 |
if (!Db::isSpellingCandidate(*it)) {
|
|
|
109 |
LOGDEB1(("createExpansionDbs: skipped: [%s]\n", (*it).c_str()));
|
|
|
110 |
continue;
|
|
|
111 |
}
|
119 |
|
112 |
|
120 |
// Create stemming synonym for every language. The input is the
|
113 |
// Create stemming synonym for every language. The input is the
|
121 |
// lowercase accented term
|
114 |
// lowercase accented term
|
122 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
115 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
123 |
stemdbs[i].addSynonym(lower);
|
116 |
stemdbs[i].addSynonym(lower);
|