Parent: [838ea6] (diff)

Download this file

synfamily.h    227 lines (190 with data), 7.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
/* Copyright (C) 2012 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#ifndef _SYNFAMILY_H_INCLUDED_
#define _SYNFAMILY_H_INCLUDED_
/**
* The Xapian synonyms mechanism can be used for many things beyond actual
* synonyms, anything that would turn a string into a group of equivalents.
* Unfortunately, it has only one keyspace.
* This class partitions the Xapian synonyms keyspace by using prefixes and
* can provide different applications each with a family of keyspaces.
* Two characters are reserved by the class and should not be used inside
* either family or member names: ':' and ';'
* A synonym key for family "stemdb", member "french", key "somestem"
* looks like:
* :stemdb:french:somestem -> somestem expansions
* A special entry is used to list all the members for a family, e.g.:
* :stemdb;members -> french, english ...
*/
#include <string>
#include <vector>
#include <xapian.h>
#include "debuglog.h"
#include "xmacros.h"
#include "strmatcher.h"
namespace Rcl {
/**
 * Read access to one "family" inside the Xapian synonyms keyspace.
 *
 * Every key built by this class begins with ":" followed by the family
 * name given at construction (see entryprefix() and memberskey()).
 */
class XapSynFamily {
public:
    /**
     * Construct from readable xapian database and family name (e.g. Stm).
     *
     * @param xdb        database handle; a copy is kept in the object.
     * @param familyname family name, stored as ":" + familyname.
     */
    XapSynFamily(Xapian::Database xdb, const std::string& familyname)
        : m_rdb(xdb), m_prefix1(std::string(":") + familyname)
    {
    }

    /**
     * The class declares virtual methods and is used as a public base
     * (XapWritableSynFamily), so the destructor is virtual to keep
     * destruction through a base pointer well defined.
     */
    virtual ~XapSynFamily() {}

    /** Retrieve all members of this family (e.g: french english german...) */
    virtual bool getMembers(std::vector<std::string>&);

    /** debug: list map for one member to stdout.
     *  NOTE(review): the parameter is named "fam" although the description
     *  talks about a member; confirm against the implementation. */
    virtual bool listMap(const std::string& fam);

    /** Expand term to list of synonyms for given member.
     *  @param membername family member to look into (e.g. a language name).
     *  @param term       input term.
     *  @param result     output: receives the expansions. */
    bool synExpand(const std::string& membername,
                   const std::string& term, std::vector<std::string>& result);

    /** The prefix shared by all synonym entries inside a family member:
     *  ":" + family + ":" + member + ":" */
    virtual std::string entryprefix(const std::string& member)
    {
        return m_prefix1 + ":" + member + ":";
    }

    /** The key for the "list of members" entry: ":" + family + ";members" */
    virtual std::string memberskey()
    {
        return m_prefix1 + ";" + "members";
    }

    /** Reference to the database handle held by this object. */
    Xapian::Database& getdb()
    {
        return m_rdb;
    }

protected:
    // Database handle copied from the constructor argument.
    Xapian::Database m_rdb;
    // ":" + family name, the start of every key built by this class.
    std::string m_prefix1;
};
/** Modify ops for a synonyms family
*
* A method to add a synonym entry inside a given member would make sense,
* but would not be used presently as all these ops go through
* ComputableSynFamMember objects
*/
class XapWritableSynFamily : public XapSynFamily {
public:
/** Construct with Xapian db open for r/w */
XapWritableSynFamily(Xapian::WritableDatabase db,
const std::string& familyname)
: XapSynFamily(db, familyname), m_wdb(db)
{
}
/** Delete all entries for one member (e.g. french), and remove from list
* of members */
virtual bool deleteMember(const std::string& membername);
/** Add to list of members. Idempotent, does not affect actual expansions */
virtual bool createMember(const std::string& membername);
Xapian::WritableDatabase getdb() {return m_wdb;}
protected:
Xapian::WritableDatabase m_wdb;
};
/** A functor which transforms a string.
 *
 * Abstract interface: concrete transformers derive from this class and
 * implement operator(). Other classes in this header hold transformers
 * through a SynTermTrans pointer.
 */
class SynTermTrans {
public:
    /** Virtual destructor so that a concrete transformer can be safely
     *  destroyed through a SynTermTrans pointer. */
    virtual ~SynTermTrans() {}

    /** Compute and return the transformed value of the input string. */
    virtual std::string operator()(const std::string&) = 0;

    /** Descriptive name for the transformation. Derived classes may
     *  override it; the default is a generic placeholder. */
    virtual std::string name() { return "SynTermTrans: unknown";}
};
/** A member (set of root-synonyms associations) of a SynFamily for
* which the root is computable from the input term.
* The objects use a functor member to compute the term root on input
* (e.g. compute the term sterm or casefold it
*/
class XapComputableSynFamMember {
public:
XapComputableSynFamMember(Xapian::Database xdb, std::string familyname,
std::string membername, SynTermTrans* trans)
: m_family(xdb, familyname), m_membername(membername),
m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
{
}
/** Expand a term to its list of synonyms. If filtertrans is set we
* keep only the results which transform to the same value as the input
* This is used for example for filtering the result of case+diac
* expansion when only either case or diac expansion is desired.
*/
bool synExpand(const std::string& term, std::vector<std::string>& result,
SynTermTrans *filtertrans = 0);
/** Same with also wildcard/regexp expansion of entry against the keys.
* The input matcher will be modified to fit our key format. */
bool synKeyExpand(StrMatcher* in, std::vector<std::string>& result,
SynTermTrans *filtertrans = 0);
private:
XapSynFamily m_family;
std::string m_membername;
SynTermTrans *m_trans;
std::string m_prefix;
};
/** Computable term root SynFamily member, modify ops.
 *
 * Wraps a XapWritableSynFamily together with a member name and a term
 * transformer. The key prefix for the member is computed once at
 * construction.
 */
class XapWritableComputableSynFamMember {
public:
    /**
     * @param xdb        database handle open for r/w, passed on to the
     *                   embedded XapWritableSynFamily.
     * @param familyname family name (see XapSynFamily).
     * @param membername name of the member inside the family.
     * @param trans      term transformer. Only the pointer is stored;
     *                   nothing in this header deletes it.
     */
    XapWritableComputableSynFamMember(
        Xapian::WritableDatabase xdb, std::string familyname,
        std::string membername, SynTermTrans* trans)
        : m_family(xdb, familyname), m_membername(membername),
          m_trans(trans), m_prefix(m_family.entryprefix(m_membername))
    {
    }

    /** Virtual destructor: addSynonym() is virtual, so the class can be
     *  derived from and destroyed through a base pointer. */
    virtual ~XapWritableComputableSynFamMember() {}

    /** Record term as a synonym of its transformed value.
     *
     * The entry key is the member prefix followed by the transformed term,
     * and the stored synonym is the original term. Nothing is written when
     * the transformation leaves the term unchanged.
     *
     * @return true if the entry was added or was not needed, false if no
     *         transformer was set or if an error message was produced
     *         while calling Xapian.
     */
    virtual bool addSynonym(const std::string& term)
    {
        LOGDEB2(("addSynonym:me %p term [%s] m_trans %p\n", this,
                 term.c_str(), m_trans));
        // The constructor does not validate the transformer pointer:
        // fail cleanly instead of dereferencing a null pointer below.
        if (m_trans == 0) {
            LOGERR(("XapWritableComputableSynFamMember::addSynonym: "
                    "no term transformer set\n"));
            return false;
        }
        std::string transformed = (*m_trans)(term);
        LOGDEB2(("addSynonym: transformed [%s]\n", transformed.c_str()));

        // A term identical to its own root needs no entry.
        if (transformed == term)
            return true;

        // XCATCHERROR comes from xmacros.h; presumably it catches the
        // exception and stores its message in ermsg -- confirm there.
        std::string ermsg;
        try {
            m_family.getdb().add_synonym(m_prefix + transformed, term);
        } XCATCHERROR(ermsg);
        if (!ermsg.empty()) {
            LOGERR(("XapWritableComputableSynFamMember::addSynonym: "
                    "xapian error %s\n", ermsg.c_str()));
            return false;
        }
        return true;
    }

    /** Call deleteMember() for this member on the family. The result of
     *  the call is not reported to the caller. */
    void clear()
    {
        m_family.deleteMember(m_membername);
    }

    /** clear(), then call createMember() for this member. The result of
     *  the calls is not reported to the caller. */
    void recreate()
    {
        clear();
        m_family.createMember(m_membername);
    }

private:
    // Declaration order matters: m_prefix is computed from m_family and
    // m_membername in the constructor initializer list.
    XapWritableSynFamily m_family;
    std::string m_membername;
    SynTermTrans *m_trans;
    // Result of m_family.entryprefix(m_membername), computed once.
    std::string m_prefix;
};
//
// Family name prefixes. They are all defined in this one place so that
// collisions between them can be avoided.
//

// "Stm": lowercase accented stem to expansion.
// Family member name: language
static const std::string synFamStem = "Stm";

// "StU": lowercase unaccented stem to expansion.
// Family member name: language
static const std::string synFamStemUnac = "StU";

// "DCa": lowercase unaccented term to case and accent variations.
// Only one member, named "all". This set is used for separate case/diac
// expansion by post-filtering the results of dual expansion.
static const std::string synFamDiCa = "DCa";
} // end namespace Rcl
#endif /* _SYNFAMILY_H_INCLUDED_ */