Switch to unified view

a/src/rcldb/rcldb.cpp b/src/rcldb/rcldb.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
#include <stdio.h>
4
#include <stdio.h>
5
#include <sys/stat.h>
5
#include <sys/stat.h>
6
6
7
#include <iostream>
7
#include <iostream>
...
...
211
    return true;
211
    return true;
212
}
212
}
213
213
214
// Unaccent and lowercase data, replace \n\r with spaces
214
// Unaccent and lowercase data, replace \n\r with spaces
215
// Removing crlfs is so that we can use the text in the document data fields.
215
// Removing crlfs is so that we can use the text in the document data fields.
216
// Use unac for removing accents
216
// Use unac (with folding extension) for removing accents and casefolding
217
// Use our own lower-casing function (built from Unicode tables)
218
// Everything is converted to/from UTF-16BE at begin/end as this is the internal
219
// format used by the processing functions.
220
//
217
//
221
// A possible optimization would be to remove accented characters from
218
// Note that we always return true (but set out to "" on error). We don't
222
// the lowercasing function tables, as we execute unac first.  It
219
// want to stop indexation because of a bad string
223
// might even be possible, but probably non-trivial, to combine both
224
// conversions
225
bool Rcl::dumb_string(const string &in, string &out)
220
bool Rcl::dumb_string(const string &in, string &out)
226
{
221
{
227
    out.erase();
222
    out.erase();
228
    if (in.empty())
223
    if (in.empty())
229
    return true;
224
    return true;
230
225
231
    string s1, s2;
226
    string s1;
227
    s1.reserve(in.length());
232
    for (unsigned int i = 0; i < in.length(); i++) {
228
    for (unsigned int i = 0; i < in.length(); i++) {
233
    if (in[i] == '\n' || in[i] == '\r')
229
    if (in[i] == '\n' || in[i] == '\r')
234
        s1 += ' ';
230
        s1 += ' ';
235
    else
231
    else
236
        s1 += in[i];
232
        s1 += in[i];
237
    }
233
    }
238
    if (!transcode(s1, s2, "UTF-8","UTF-16BE")) {
234
    if (!unacmaybefold(s1, out, "UTF-8", true)) {
239
  LOGERR(("dumb_string: convert to utf-16be failed\n"));
240
  return false;
241
    }
242
243
    if (!unac_cpp_utf16be(s2, s1)) {
244
    LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));
235
    LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
236
  out.erase();
245
    return false;
237
    return true;
246
    }
247
    if (!ucs2lower(s1, s2)) {
248
  LOGERR(("dumb_string: ucs2lower failed\n"));
249
  return false;
250
    }
251
    if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
252
  LOGERR(("dumb_string: convert back to utf-8 failed\n"));
253
  return false;
254
    }
238
    }
255
    return true;
239
    return true;
256
}
240
}
257
241
258
/* omindex direct */
242
/* omindex direct */