|
a/src/rcldb/stemdb.cpp |
|
b/src/rcldb/stemdb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.3 2006-09-19 14:30:39 dockes Exp $ (C) 2005 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: stemdb.cpp,v 1.4 2006-09-20 06:21:43 dockes Exp $ (C) 2005 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
|
4 |
|
5 |
/**
|
5 |
/**
|
6 |
* Management of the auxiliary databases listing stems and their expansion
|
6 |
* Management of the auxiliary databases listing stems and their expansion
|
7 |
* terms
|
7 |
* terms
|
|
... |
|
... |
65 |
rmdir(dir.c_str());
|
65 |
rmdir(dir.c_str());
|
66 |
}
|
66 |
}
|
67 |
}
|
67 |
}
|
68 |
};
|
68 |
};
|
69 |
|
69 |
|
70 |
// Deciding if we try to stem the term. If it has numerals or capitals
|
|
|
71 |
// we don't
|
|
|
72 |
inline static bool
|
70 |
inline static bool
|
73 |
p_notlowerascii(unsigned int c)
|
71 |
p_notlowerascii(unsigned int c)
|
74 |
{
|
72 |
{
|
75 |
if (c < 'a' || (c > 'z' && c < 128))
|
73 |
if (c < 'a' || (c > 'z' && c < 128))
|
76 |
return true;
|
74 |
return true;
|
|
... |
|
... |
103 |
try {
|
101 |
try {
|
104 |
Xapian::Stem stemmer(lang);
|
102 |
Xapian::Stem stemmer(lang);
|
105 |
Xapian::TermIterator it;
|
103 |
Xapian::TermIterator it;
|
106 |
for (it = xdb.allterms_begin();
|
104 |
for (it = xdb.allterms_begin();
|
107 |
it != xdb.allterms_end(); it++) {
|
105 |
it != xdb.allterms_end(); it++) {
|
108 |
// If it has any non-lowercase 7bit char, it can't be stemmed
|
106 |
// Deciding if we try to stem the term. If it has any
|
|
|
107 |
// non-lowercase 7bit char, don't. Note that
|
|
|
108 |
// as we are dealing with unaccented data, we are still
|
|
|
109 |
// processing most of western european languages (where
|
|
|
110 |
// most unaccented letters are ascii)
|
109 |
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
111 |
string::iterator sit = (*it).begin(), eit = sit + (*it).length();
|
110 |
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
112 |
if ((sit = find_if(sit, eit, p_notlowerascii)) != eit) {
|
111 |
++nostem;
|
113 |
++nostem;
|
112 |
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
114 |
// LOGDEB(("stemskipped: [%s], because of 0x%x\n",
|
113 |
// (*it).c_str(), *sit));
|
115 |
// (*it).c_str(), *sit));
|