recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [8a0596] .. [bf3ac8]

Switch to unified view


...
// The array could be fully static, but we use a small function to fill it 
// once.
// The array is actually a remnant of the original version, which did not
// handle UTF-8. Only the lower 127 slots are now used, but keep it at 256
// because it makes some tests in the code simpler.
// Number of entries in the byte-indexed classification table below.
const unsigned int charclasses_size = 256;

// Character class codes. Numbering starts at 256, i.e. above any value
// that fits in a single byte; the remaining enumerators follow in sequence.
enum CharClass {
    LETTER = 256,
    SPACE,      // 257
    DIGIT,      // 258
    WILD,       // 259
    A_ULETTER,  // 260: upper-case ASCII letter
    A_LLETTER   // 261: lower-case ASCII letter
};

// Classification table, indexed by an unsigned char value.
static int charclasses[charclasses_size];

// Real UTF-8 characters are handled with sets holding all characters
// with interesting properties. This is far from full-blown management
// of Unicode properties, but seems to do the job well enough in most
// common cases
...
            // Camelcase handling. 
            // If we get uppercase ascii after lowercase ascii, emit word.
            // This emits "camel" when hitting the 'C' of camelCase
    case A_ULETTER:
        if (m_span.length() && 
                charclasses[(unsigned char)m_span[m_span.length() - 1]] == 
                A_LLETTER) {
                if (m_wordLen) {
                    if (!doemit(false, it.getBpos()))
                        return false;
                }
...
            // string of several upper-case letters:  an
            // acronym (readHTML) or a single letter article (ALittleHelp).
            // Emit the uppercase word before proceeding
        case A_LLETTER:
        if (m_span.length() && 
                charclasses[(unsigned char)m_span[m_span.length() - 1]] == 
                A_ULETTER && m_wordLen > 1) {
                // Multiple upper-case letters. Single letter word
                // or acronym which we want to emit now
                m_wordLen--;
                if (!doemit(false, it.getBpos()))
...
bool TextSplit::hasVisibleWhite(const string &in)
{
    setcharclasses();
    Utf8Iter it(in);
    for (; !it.eof(); it++) {
    unsigned int c = (unsigned char)*it;
    LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
    if (c == (unsigned int)-1) {
        LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
        return false;
    }

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
	...		...
48	// The array could be fully static, but we use a small function to fill it	48	// The array could be fully static, but we use a small function to fill it
49	// once.	49	// once.
50	// The array is actually a remnant of the original version which did no utf8.	50	// The array is actually a remnant of the original version which did no utf8.
51	// Only the lower 127 slots are now used, but keep it at 256	51	// Only the lower 127 slots are now used, but keep it at 256
52	// because it makes some tests in the code simpler.	52	// because it makes some tests in the code simpler.
		53	const unsigned int charclasses_size = 256;
53	enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,	54	enum CharClass {LETTER=256, SPACE=257, DIGIT=258, WILD=259,
54	A_ULETTER=260, A_LLETTER=261};	55	A_ULETTER=260, A_LLETTER=261};
55	static int charclasses[256];	56	static int charclasses[charclasses_size];
56		57
57	// Real UTF-8 characters are handled with sets holding all characters	58	// Real UTF-8 characters are handled with sets holding all characters
58	// with interesting properties. This is far from full-blown management	59	// with interesting properties. This is far from full-blown management
59	// of Unicode properties, but seems to do the job well enough in most	60	// of Unicode properties, but seems to do the job well enough in most
60	// common cases	61	// common cases
	...		...
452	// Camelcase handling.	453	// Camelcase handling.
453	// If we get uppercase ascii after lowercase ascii, emit word.	454	// If we get uppercase ascii after lowercase ascii, emit word.
454	// This emits "camel" when hitting the 'C' of camelCase	455	// This emits "camel" when hitting the 'C' of camelCase
455	case A_ULETTER:	456	case A_ULETTER:
456	if (m_span.length() &&	457	if (m_span.length() &&
457	charclasses[(unsigned int)m_span[m_span.length() - 1]] ==	458	charclasses[(unsigned char)m_span[m_span.length() - 1]] ==
458	A_LLETTER) {	459	A_LLETTER) {
459	if (m_wordLen) {	460	if (m_wordLen) {
460	if (!doemit(false, it.getBpos()))	461	if (!doemit(false, it.getBpos()))
461	return false;	462	return false;
462	}	463	}
	...		...
469	// string of several upper-case letters: an	470	// string of several upper-case letters: an
470	// acronym (readHTML) or a single letter article (ALittleHelp).	471	// acronym (readHTML) or a single letter article (ALittleHelp).
471	// Emit the uppercase word before proceeding	472	// Emit the uppercase word before proceeding
472	case A_LLETTER:	473	case A_LLETTER:
473	if (m_span.length() &&	474	if (m_span.length() &&
474	charclasses[(unsigned int)m_span[m_span.length() - 1]] ==	475	charclasses[(unsigned char)m_span[m_span.length() - 1]] ==
475	A_ULETTER && m_wordLen > 1) {	476	A_ULETTER && m_wordLen > 1) {
476	// Multiple upper-case letters. Single letter word	477	// Multiple upper-case letters. Single letter word
477	// or acronym which we want to emit now	478	// or acronym which we want to emit now
478	m_wordLen--;	479	m_wordLen--;
479	if (!doemit(false, it.getBpos()))	480	if (!doemit(false, it.getBpos()))
	...		...
609	bool TextSplit::hasVisibleWhite(const string &in)	610	bool TextSplit::hasVisibleWhite(const string &in)
610	{	611	{
611	setcharclasses();	612	setcharclasses();
612	Utf8Iter it(in);	613	Utf8Iter it(in);
613	for (; !it.eof(); it++) {	614	for (; !it.eof(); it++) {
614	unsigned int c = *it;	615	unsigned int c = (unsigned char)*it;
615	LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));	616	LOGDEB3(("TextSplit::hasVisibleWhite: testing 0x%04x\n", c));
616	if (c == (unsigned int)-1) {	617	if (c == (unsigned int)-1) {
617	LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));	618	LOGERR(("hasVisibleWhite: error while scanning UTF-8 string\n"));
618	return false;	619	return false;
619	}	620	}