|
a/src/rcldb/rcldb.cpp |
|
b/src/rcldb/rcldb.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
#include <stdio.h>
|
4 |
#include <stdio.h>
|
5 |
#include <sys/stat.h>
|
5 |
#include <sys/stat.h>
|
6 |
|
6 |
|
7 |
#include <iostream>
|
7 |
#include <iostream>
|
|
... |
|
... |
211 |
return true;
|
211 |
return true;
|
212 |
}
|
212 |
}
|
213 |
|
213 |
|
214 |
// Unaccent and lowercase data, replace \n\r with spaces
|
214 |
// Unaccent and lowercase data, replace \n\r with spaces
|
215 |
// Removing crlfs is so that we can use the text in the document data fields.
|
215 |
// Removing crlfs is so that we can use the text in the document data fields.
|
216 |
// Use unac for removing accents
|
216 |
// Use unac (with folding extension) for removing accents and casefolding
|
217 |
// Use our own lower-casing function (built from Unicode tables)
|
|
|
218 |
// Everything is converted to/from UTF-16BE at begin/end as this is the internal
|
|
|
219 |
// format used by the processing functions.
|
|
|
220 |
//
|
217 |
//
|
221 |
// A possible optimization would be to remove accented characters from
|
218 |
// Note that we always return true (but set out to "" on error). We don't
|
222 |
// the lowercasing function tables, as we execute unac first. It
|
219 |
// want to stop indexing because of a bad string.
|
223 |
// might even be possible, but probably non-trivial, to combine both
|
|
|
224 |
// conversions
|
|
|
225 |
bool Rcl::dumb_string(const string &in, string &out)
|
220 |
bool Rcl::dumb_string(const string &in, string &out)
|
226 |
{
|
221 |
{
|
227 |
out.erase();
|
222 |
out.erase();
|
228 |
if (in.empty())
|
223 |
if (in.empty())
|
229 |
return true;
|
224 |
return true;
|
230 |
|
225 |
|
231 |
string s1, s2;
|
226 |
string s1;
|
|
|
227 |
s1.reserve(in.length());
|
232 |
for (unsigned int i = 0; i < in.length(); i++) {
|
228 |
for (unsigned int i = 0; i < in.length(); i++) {
|
233 |
if (in[i] == '\n' || in[i] == '\r')
|
229 |
if (in[i] == '\n' || in[i] == '\r')
|
234 |
s1 += ' ';
|
230 |
s1 += ' ';
|
235 |
else
|
231 |
else
|
236 |
s1 += in[i];
|
232 |
s1 += in[i];
|
237 |
}
|
233 |
}
|
238 |
if (!transcode(s1, s2, "UTF-8","UTF-16BE")) {
|
234 |
if (!unacmaybefold(s1, out, "UTF-8", true)) {
|
239 |
LOGERR(("dumb_string: convert to utf-16be failed\n"));
|
|
|
240 |
return false;
|
|
|
241 |
}
|
|
|
242 |
|
|
|
243 |
if (!unac_cpp_utf16be(s2, s1)) {
|
|
|
244 |
LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
|
235 |
LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
|
|
|
236 |
out.erase();
|
245 |
return false;
|
237 |
return true;
|
246 |
}
|
|
|
247 |
if (!ucs2lower(s1, s2)) {
|
|
|
248 |
LOGERR(("dumb_string: ucs2lower failed\n"));
|
|
|
249 |
return false;
|
|
|
250 |
}
|
|
|
251 |
if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
|
|
|
252 |
LOGERR(("dumb_string: convert back to utf-8 failed\n"));
|
|
|
253 |
return false;
|
|
|
254 |
}
|
238 |
}
|
255 |
return true;
|
239 |
return true;
|
256 |
}
|
240 |
}
|
257 |
|
241 |
|
258 |
/* omindex direct */
|
242 |
/* omindex direct */
|