Parent: [0b0385] (diff)

Child: [04f344] (diff)

Download this file

expansiondbs.cpp    157 lines (136 with data), 5.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include "autoconfig.h"
#include <memory>
#include "log.h"
#include "utf8iter.h"
#include "smallut.h"
#include "chrono.h"
#include "textsplit.h"
#include "xmacros.h"
#include "rcldb.h"
#include "stemdb.h"
#include "expansiondbs.h"
using namespace std;
namespace Rcl {
/**
 * Create all expansion dbs used to transform user input terms to widen a query.
 * We use Xapian synonyms subsets to store the expansions.
 *
 * For every language in @p langs a stemming family member (synFamStem) is
 * recreated. When the index is raw (o_index_stripchars is false), an
 * unaccented-stem member (synFamStemUnac) per language and a single
 * diacritics/case member (synFamDiCa, "all") are recreated as well.
 * The function then walks the index terms, skipping prefixed, empty and
 * CJK terms, and adds the appropriate synonyms to each member.
 *
 * @param wdb   Writable Xapian database holding the terms and receiving
 *              the synonym data.
 * @param langs Stemming languages for which expansions are built.
 * @return true on success, or when there is nothing to do (no languages and
 *         a stripped index); false if an error message was captured by
 *         XCATCHERROR while building the expansions.
 */
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
                        const vector<string>& langs)
{
    LOGDEB("StemDb::createExpansionDbs: languages: " << (stringsToString(langs)) << "\n" );
    Chrono cron;

    // Erase and recreate all the expansion groups

    // If langs is empty and we don't need casediac expansion, then no need to
    // walk the big list
    if (langs.empty() && o_index_stripchars) {
        return true;
    }

    // Walk the list of all terms, and stem/unac each.
    string ermsg;
    try {
        // Stem dbs
        vector<XapWritableComputableSynFamMember> stemdbs;
        // Note: tried to make this to work with stack-allocated objects,
        // couldn't. Looks like a bug in copy constructors somewhere, can't
        // guess where. The stemmers are heap-allocated so that the raw
        // pointers handed to the family members stay valid when the vector
        // grows.
        vector<std::shared_ptr<SynTermTransStem> > stemmers;
        stemmers.reserve(langs.size());
        for (size_t i = 0; i < langs.size(); i++) {
            stemmers.push_back(std::make_shared<SynTermTransStem>(langs[i]));
            stemdbs.push_back(
                XapWritableComputableSynFamMember(wdb, synFamStem, langs[i],
                                                  stemmers.back().get()));
            stemdbs.back().recreate();
        }

        // Unaccented stem dbs
        vector<XapWritableComputableSynFamMember> unacstemdbs;
        // We can reuse the same stemmer pointers, the objects are stateless.
        // Each language must get its own stemmer (stemmers[i]): using the
        // last one for every member would stem all languages with the rules
        // of the final entry in langs.
        if (!o_index_stripchars) {
            for (size_t i = 0; i < langs.size(); i++) {
                unacstemdbs.push_back(
                    XapWritableComputableSynFamMember(wdb, synFamStemUnac,
                                                      langs[i],
                                                      stemmers[i].get()));
                unacstemdbs.back().recreate();
            }
        }

        SynTermTransUnac transunac(UNACOP_UNACFOLD);
        XapWritableComputableSynFamMember
            diacasedb(wdb, synFamDiCa, "all", &transunac);
        if (!o_index_stripchars)
            diacasedb.recreate();

        Xapian::TermIterator it = wdb.allterms_begin();
        // We'd want to skip to the first non-prefixed term, but this is a bit
        // complicated, so we just jump over most of the prefixed term and then
        // skip the rest one by one.
        it.skip_to(wrap_prefix("Z"));
        for ( ; it != wdb.allterms_end(); ++it) {
            // Materialize the term once per iteration. This avoids building
            // the string again for every use below, and keeps it alive for
            // the Utf8Iter (which presumably keeps a reference to its
            // argument -- TODO confirm against utf8iter.h).
            const string term = *it;

            if (has_prefix(term))
                continue;

            // Detect and skip CJK terms.
            Utf8Iter utfit(term);
            if (utfit.eof()) // Empty term?? Seems to happen.
                continue;
            if (TextSplit::isCJK(*utfit)) {
                // LOGDEB("stemskipped: Skipping CJK\n" );
                continue;
            }

            string lower = term;
            // If the index is raw, compute the case-folded term which
            // is the input to the stem db, and add a synonym from the
            // stripped term to the cased and accented one, for accent
            // and case expansion at query time
            if (!o_index_stripchars) {
                unacmaybefold(term, lower, "UTF-8", UNACOP_FOLD);
                diacasedb.addSynonym(term);
            }

            // Don't apply stemming to terms which don't look like
            // natural language words.
            if (!Db::isSpellingCandidate(term)) {
                LOGDEB1("createExpansionDbs: skipped: [" << (term) << "]\n" );
                continue;
            }

            // Create stemming synonym for every language. The input is the
            // lowercase accented term
            for (size_t i = 0; i < langs.size(); i++) {
                stemdbs[i].addSynonym(lower);
            }

            // For a raw index, also maybe create a stem expansion for
            // the unaccented term. While this may be incorrect, it is
            // also necessary for searching in a diacritic-unsensitive
            // way on a raw index
            if (!o_index_stripchars) {
                string unac;
                unacmaybefold(lower, unac, "UTF-8", UNACOP_UNAC);
                if (unac != lower) {
                    for (size_t i = 0; i < langs.size(); i++) {
                        unacstemdbs[i].addSynonym(unac);
                    }
                }
            }
        }
    } XCATCHERROR(ermsg);

    if (!ermsg.empty()) {
        LOGERR("Db::createStemDb: map build failed: " << (ermsg) << "\n" );
        return false;
    }

    LOGDEB("StemDb::createExpansionDbs: done: " << (cron.secs()) << " S\n" );
    return true;
}
}