|
a/src/rcldb/stemdb.cpp |
|
b/src/rcldb/stemdb.cpp |
|
... |
|
... |
33 |
#include "utf8iter.h"
|
33 |
#include "utf8iter.h"
|
34 |
#include "textsplit.h"
|
34 |
#include "textsplit.h"
|
35 |
#include "rcldb.h"
|
35 |
#include "rcldb.h"
|
36 |
#include "rcldb_p.h"
|
36 |
#include "rcldb_p.h"
|
37 |
#include "synfamily.h"
|
37 |
#include "synfamily.h"
|
|
|
38 |
#include "unacpp.h"
|
38 |
|
39 |
|
39 |
#include <iostream>
|
40 |
#include <iostream>
|
40 |
|
41 |
|
41 |
using namespace std;
|
42 |
using namespace std;
|
42 |
|
43 |
|
|
... |
|
... |
54 |
}
|
55 |
}
|
55 |
|
56 |
|
56 |
/**
|
57 |
/**
|
57 |
* Create database of stem to parents associations for a given language.
|
58 |
* Create database of stem to parents associations for a given language.
|
58 |
*/
|
59 |
*/
|
59 |
bool WritableStemDb::createDb(const string& lang)
|
60 |
bool createExpansionDbs(Xapian::WritableDatabase& wdb,
|
|
|
61 |
const vector<string>& langs)
|
60 |
{
|
62 |
{
|
61 |
LOGDEB(("StemDb::createDb(%s)\n", lang.c_str()));
|
63 |
LOGDEB(("StemDb::createExpansionDbs\n"));
|
62 |
Chrono cron;
|
64 |
Chrono cron;
|
63 |
createMember(lang);
|
65 |
|
64 |
string prefix = entryprefix(lang);
|
66 |
vector<XapWritableSynFamily> stemdbs;
|
|
|
67 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
|
68 |
stemdbs.push_back(XapWritableSynFamily(wdb, synFamStem));
|
|
|
69 |
stemdbs[i].deleteMember(langs[i]);
|
|
|
70 |
stemdbs[i].createMember(langs[i]);
|
|
|
71 |
stemdbs[i].setCurrentMemberName(langs[i]);
|
|
|
72 |
}
|
65 |
|
73 |
|
66 |
// We walk the list of all terms, and stem each. We skip terms which
|
74 |
// We walk the list of all terms, and stem each. We skip terms which
|
67 |
// don't look like natural language.
|
75 |
// don't look like natural language.
|
68 |
// If the stem is not identical to the term, we add a synonym entry.
|
76 |
// If the stem is not identical to the term, we add a synonym entry.
|
69 |
// Statistics
|
77 |
// Statistics
|
|
... |
|
... |
71 |
int stemconst = 0; // Stem == term
|
79 |
int stemconst = 0; // Stem == term
|
72 |
int allsyns = 0; // Total number of entries created
|
80 |
int allsyns = 0; // Total number of entries created
|
73 |
|
81 |
|
74 |
string ermsg;
|
82 |
string ermsg;
|
75 |
try {
|
83 |
try {
|
76 |
Xapian::Stem stemmer(lang);
|
84 |
vector<Xapian::Stem> stemmers;
|
|
|
85 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
|
|
86 |
stemmers.push_back(Xapian::Stem(langs[i]));
|
|
|
87 |
}
|
77 |
|
88 |
|
78 |
for (Xapian::TermIterator it = m_wdb.allterms_begin();
|
89 |
for (Xapian::TermIterator it = wdb.allterms_begin();
|
79 |
it != m_wdb.allterms_end(); it++) {
|
90 |
it != wdb.allterms_end(); it++) {
|
80 |
// If the term has any non-lowercase 7bit char (that is,
|
91 |
// If the term has any non-lowercase 7bit char (that is,
|
81 |
// numbers, capitals and punctuation) dont stem.
|
92 |
// numbers, capitals and punctuation) dont stem.
|
82 |
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
93 |
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
83 |
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
94 |
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
84 |
++nostem;
|
95 |
++nostem;
|
|
... |
|
... |
100 |
if (TextSplit::isCJK(*utfit)) {
|
111 |
if (TextSplit::isCJK(*utfit)) {
|
101 |
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
112 |
// LOGDEB(("stemskipped: Skipping CJK\n"));
|
102 |
continue;
|
113 |
continue;
|
103 |
}
|
114 |
}
|
104 |
|
115 |
|
|
|
116 |
// Create stemming synonym for every lang
|
|
|
117 |
for (unsigned int i = 0; i < langs.size(); i++) {
|
105 |
string stem = stemmer(*it);
|
118 |
string stem = stemmers[i](*it);
|
106 |
LOGDEB2(("Db::createStemDb: word [%s], stem [%s]\n", (*it).c_str(),
|
119 |
if (stem == *it) {
|
107 |
stem.c_str()));
|
120 |
++stemconst;
|
108 |
if (stem == *it) {
|
121 |
} else {
|
109 |
++stemconst;
|
122 |
stemdbs[i].addSynonym(stem, *it);
|
110 |
continue;
|
123 |
LOGDEB0(("Db::createExpansiondbs: [%s] (%s) -> [%s]\n",
|
111 |
}
|
124 |
(*it).c_str(), langs[i].c_str(), stem.c_str()));
|
112 |
|
|
|
113 |
m_wdb.add_synonym(prefix + stem, *it);
|
|
|
114 |
++allsyns;
|
125 |
++allsyns;
|
|
|
126 |
}
|
|
|
127 |
}
|
|
|
128 |
|
115 |
}
|
129 |
}
|
116 |
} XCATCHERROR(ermsg);
|
130 |
} XCATCHERROR(ermsg);
|
117 |
if (!ermsg.empty()) {
|
131 |
if (!ermsg.empty()) {
|
118 |
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
132 |
LOGERR(("Db::createStemDb: map build failed: %s\n", ermsg.c_str()));
|
119 |
return false;
|
133 |
return false;
|
120 |
}
|
134 |
}
|
121 |
|
135 |
|
122 |
LOGDEB(("StemDb::createDb(%s): done: %.2f S\n", lang.c_str(), cron.secs()));
|
136 |
LOGDEB(("StemDb::createExpansionDbs: done: %.2f S\n", cron.secs()));
|
123 |
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
137 |
LOGDEB(("StemDb::createDb: nostem %d stemconst %d allsyns %d\n",
|
124 |
nostem, stemconst, allsyns));
|
138 |
nostem, stemconst, allsyns));
|
125 |
return true;
|
139 |
return true;
|
126 |
}
|
140 |
}
|
127 |
|
141 |
|