recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [00b954] .. [9a9ce9]

Switch to side-by-side view

--- a/src/rcldb/rcldb.cpp
+++ b/src/rcldb/rcldb.cpp
@@ -1,5 +1,5 @@
 #ifndef lint
-static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
+static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
 #endif
 #include <stdio.h>
 #include <sys/stat.h>
@@ -213,44 +213,28 @@
 
 // Unaccent and lowercase data, replace \n\r with spaces
 // Removing crlfs is so that we can use the text in the document data fields.
-// Use unac for removing accents
-// Use our own lower-casing function (built from Unicode tables)
-// Everything is converted to/from UTF-16BE at begin/end as this the internal
-// format used by the processing functions.
+// Use unac (with folding extension) for removing accents and casefolding
 //
-// A possible optimization would be to remove accented characters from
-// the lowercasing function tables, as we execute unac first.  It
-// might even be possible must probably non trivial to combine both
-// conversions
+// Note that we always return true (but set out to "" on error). We don't
+// want to stop indexing because of a bad string.
 bool Rcl::dumb_string(const string &in, string &out)
 {
     out.erase();
     if (in.empty())
 	return true;
 
-    string s1, s2;
+    string s1; // copy of in with every '\n' / '\r' replaced by a space
+    s1.reserve(in.length()); // the loop below appends exactly one char per input char
     for (unsigned int i = 0; i < in.length(); i++) {
 	if (in[i] == '\n' || in[i] == '\r')
 	    s1 += ' ';
 	else
 	    s1 += in[i];
     }
-    if (!transcode(s1, s2, "UTF-8","UTF-16BE")) {
-	LOGERR(("dumb_string: convert to utf-16be failed\n"));
-	return false;
-    }
-
-    if (!unac_cpp_utf16be(s2, s1)) {
-	LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
-	return false;
-    }
-    if (!ucs2lower(s1, s2)) {
-	LOGERR(("dumb_string: ucs2lower failed\n"));
-	return false;
-    }
-    if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
-	LOGERR(("dumb_string: convert back to utf-8 failed\n"));
-	return false;
+    if (!unacmaybefold(s1, out, "UTF-8", true)) { // NOTE(review): presumably unaccents and case-folds UTF-8 in one call, with true enabling folding -- confirm against the unac wrapper
+	LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
+	out.erase(); // hand back an empty string rather than whatever the failed call left in out
+	return true; // deliberate: failure is logged but not reported, so a bad string does not stop indexing
     }
     return true;
 }