--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
#ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>
@@ -213,44 +213,28 @@
// Unaccent and lowercase data, replace \n\r with spaces
// Removing crlfs is so that we can use the text in the document data fields.
-// Use unac for removing accents
-// Use our own lower-casing function (built from Unicode tables)
-// Everything is converted to/from UTF-16BE at begin/end as this the internal
-// format used by the processing functions.
+// Use unac (with the folding extension) to remove accents and fold case
//
-// A possible optimization would be to remove accented characters from
-// the lowercasing function tables, as we execute unac first. It
-// might even be possible must probably non trivial to combine both
-// conversions
+// Note that we always return true (but set out to "" on error): we don't
+// want to stop indexing because of a bad string.
bool Rcl::dumb_string(const string &in, string &out)
{
out.erase();
if (in.empty())
return true;
- string s1, s2;
+ string s1;
+ s1.reserve(in.length());
for (unsigned int i = 0; i < in.length(); i++) {
if (in[i] == '\n' || in[i] == '\r')
s1 += ' ';
else
s1 += in[i];
}
- if (!transcode(s1, s2, "UTF-8","UTF-16BE")) {
- LOGERR(("dumb_string: convert to utf-16be failed\n"));
- return false;
- }
-
- if (!unac_cpp_utf16be(s2, s1)) {
- LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
- return false;
- }
- if (!ucs2lower(s1, s2)) {
- LOGERR(("dumb_string: ucs2lower failed\n"));
- return false;
- }
- if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
- LOGERR(("dumb_string: convert back to utf-8 failed\n"));
- return false;
+ if (!unacmaybefold(s1, out, "UTF-8", true)) {
+ LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
+ out.erase();
+ return true;
}
return true;
}