recoll / Code / Diff of /src/rcldb/rcldb.cpp

Diff of /src/rcldb/rcldb.cpp [00b954] .. [9a9ce9]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
#include <stdio.h>
#include <sys/stat.h>

#include <iostream>
...
    return true;
}

// Unaccent and lowercase data, replace \n\r with spaces
// Removing crlfs is so that we can use the text in the document data fields.
// Use unac (with folding extension) for removing accents and casefolding



//
// Note that we always return true (but set out to "" on error). We don't
// want to stop indexation because of a bad string


bool Rcl::dumb_string(const string &in, string &out)
{
    out.erase();
    if (in.empty())
    return true;

    string s1;
    s1.reserve(in.length());
    for (unsigned int i = 0; i < in.length(); i++) {
    if (in[i] == '\n' || in[i] == '\r')
        s1 += ' ';
    else
        s1 += in[i];
    }
    if (!unacmaybefold(s1, out, "UTF-8", true)) {





    LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
  out.erase();
    return true;








    }
    return true;
}

/* omindex direct */

	a/src/rcldb/rcldb.cpp		b/src/rcldb/rcldb.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.45 2006-01-05 16:37:26 dockes Exp $ (C) 2004 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: rcldb.cpp,v 1.46 2006-01-06 13:18:17 dockes Exp $ (C) 2004 J.F.Dockes";
3	#endif	3	#endif
4	#include <stdio.h>	4	#include <stdio.h>
5	#include <sys/stat.h>	5	#include <sys/stat.h>
6		6
7	#include <iostream>	7	#include <iostream>
	...		...
211	return true;	211	return true;
212	}	212	}
213		213
214	// Unaccent and lowercase data, replace \n\r with spaces	214	// Unaccent and lowercase data, replace \n\r with spaces
215	// Removing crlfs is so that we can use the text in the document data fields.	215	// Removing crlfs is so that we can use the text in the document data fields.
216	// Use unac for removing accents	216	// Use unac (with folding extension) for removing accents and casefolding
217	// Use our own lower-casing function (built from Unicode tables)
218	// Everything is converted to/from UTF-16BE at begin/end as this the internal
219	// format used by the processing functions.
220	//	217	//
221	// A possible optimization would be to remove accented characters from	218	// Note that we always return true (but set out to "" on error). We don't
222	// the lowercasing function tables, as we execute unac first. It	219	// want to stop indexation because of a bad string
223	// might even be possible must probably non trivial to combine both
224	// conversions
225	bool Rcl::dumb_string(const string &in, string &out)	220	bool Rcl::dumb_string(const string &in, string &out)
226	{	221	{
227	out.erase();	222	out.erase();
228	if (in.empty())	223	if (in.empty())
229	return true;	224	return true;
230		225
231	string s1, s2;	226	string s1;
		227	s1.reserve(in.length());
232	for (unsigned int i = 0; i < in.length(); i++) {	228	for (unsigned int i = 0; i < in.length(); i++) {
233	if (in[i] == '\n' \|\| in[i] == '\r')	229	if (in[i] == '\n' \|\| in[i] == '\r')
234	s1 += ' ';	230	s1 += ' ';
235	else	231	else
236	s1 += in[i];	232	s1 += in[i];
237	}	233	}
238	if (!transcode(s1, s2, "UTF-8","UTF-16BE")) {	234	if (!unacmaybefold(s1, out, "UTF-8", true)) {
239	LOGERR(("dumb_string: convert to utf-16be failed\n"));
240	return false;
241	}
242
243	if (!unac_cpp_utf16be(s2, s1)) {
244	LOGERR(("dumb_string: unac_cpp failed for %s\n", in.c_str()));	235	LOGERR(("dumb_string: unac failed for %s\n", in.c_str()));
		236	out.erase();
245	return false;	237	return true;
246	}
247	if (!ucs2lower(s1, s2)) {
248	LOGERR(("dumb_string: ucs2lower failed\n"));
249	return false;
250	}
251	if (!transcode(s2, out, "UTF-16BE", "UTF-8")) {
252	LOGERR(("dumb_string: convert back to utf-8 failed\n"));
253	return false;
254	}	238	}
255	return true;	239	return true;
256	}	240	}
257		241
258	/* omindex direct */	242	/* omindex direct */