recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [0ea2a9] .. [ba295f]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
             int btstart, int btend)
{
    LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));

    unsigned int l = w.length();
    if (l > 0 && l < (unsigned)m_maxWordLength) {
    // 1 byte word: we index single letters and digits, but
    // nothing else. Note that the length tested here is in bytes; we
    // might want to test for a single UTF-8 character instead.
    if (l == 1) {
        int c = (int)w[0];
        if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
        //cerr << "ERASING single letter term " << c << endl;
        return true;
        }
    }
    if (pos != m_prevpos || l != m_prevlen) {
        bool ret = m_cb->takeword(w, pos, btstart, btend);
      m_prevpos = pos;
        m_prevlen = w.length();

        return ret;
    }
    LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
    }
    return true;
...
         span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));

    // Emit span. When splitting for query, we only emit final spans
    bool spanemitted = false;
    if (spanerase && !(m_flags & TXTS_NOSPANS)) {
    // Maybe trim at end. These are chars that we would keep inside 
    // a span, but not at the end
    while (m_span.length() > 0) {
        switch (m_span[m_span.length()-1]) {
        case '.':
        case ',':
        case '@':
        case '\'':
        m_span.resize(m_span.length()-1);
        if (--bp < 0) 
            bp = 0;
        break;
        default:
        goto breakloop1;
        }
    }
    breakloop1:
    spanemitted = true;
    if (!emitterm(true, m_span, m_spanpos, bp - m_span.length(), bp))
        return false;
    }

    // Emit word if different from span and not 'no words' mode
    if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen && 
    (!spanemitted || m_wordLen != m_span.length())) {
    string s(m_span.substr(m_wordStart, m_wordLen));
    if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
        return false;
    }

    // Adjust state
    m_wordpos++;
    m_wordLen = 0;
    if (spanerase) {
    m_span.erase();
    m_spanpos = m_wordpos;
    m_wordStart = 0;
    } else {
    m_wordStart = m_span.length();
    }

    return true;
}

...
    LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb, 
        in.substr(0,50).c_str()));

    setcharclasses();

    m_span.erase();
    m_inNumber = false;
    m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;

    Utf8Iter it(in);

    for (; !it.eof(); it++) {
    unsigned int c = *it;
...
        return false;
    }
    int cc = whatcc(c);
    switch (cc) {
    case LETTER:
        m_wordLen += it.appendchartostring(m_span);
        break;

    case DIGIT:
        if (m_wordLen == 0)
        m_inNumber = true;
        m_wordLen += it.appendchartostring(m_span);
        break;

    case SPACE:
    SPACE:
        if (m_wordLen || m_span.length()) {
        if (!doemit(true, it.getBpos()))
            return false;
        m_inNumber = false;
        }
        break;
    case WILD:
        if (m_flags & TXTS_KEEPWILD)
        goto NORMALCHAR;
        else
        goto SPACE;
        break;
    case '-':
    case '+':
        if (m_wordLen == 0) {
        if (whatcc(it[it.getCpos()+1]) == DIGIT) {
            m_inNumber = true;
            m_wordLen += it.appendchartostring(m_span);
        } else {
            m_wordStart += it.appendchartostring(m_span);
        }
        } else {
        if (!doemit(false, it.getBpos()))
            return false;
        m_inNumber = false;
        m_wordStart += it.appendchartostring(m_span);
        }
        break;
    case '.':
    case ',':
        if (m_inNumber) {
        // Inside a number, keep '.' or ',' only if a digit follows;
        // else end the term here (e.g. 132.jpg)
        if (whatcc(it[it.getCpos()+1]) != DIGIT)
            goto SPACE;
        m_wordLen += it.appendchartostring(m_span);
        break;
        } else {
        // If . inside a word, keep it, else, this is whitespace. 
        // We also keep an initial '.' for catching .net, but this adds
        // quite a few spurious terms !
                // Another problem is that something like .x-errs 
        // will be split as .x-errs, x, errs but not x-errs
        // A final comma in a word will be removed by doemit
        if (cc == '.') {
            if (m_wordLen) {
            if (!doemit(false, it.getBpos()))
                return false;
            // span length could have been adjusted by trimming
            // inside doemit
            if (m_span.length())
                m_wordStart += it.appendchartostring(m_span);
            break;
            } else {
            m_wordStart += it.appendchartostring(m_span);
            break;
            }
        }
        }
        goto SPACE;
        break;
    case '@':
        if (m_wordLen) {
        if (!doemit(false, it.getBpos()))
            return false;
        m_inNumber = false;
        }
        m_wordStart += it.appendchartostring(m_span);
        break;
    case '\'':
        // If in a word, this is a potential span (e.g. o'brien): emit
        // the current word and keep the quote in the span. Otherwise
        // the quote is just skipped.
        if (m_wordLen) {
        if (!doemit(false, it.getBpos()))
            return false;
        m_inNumber = false;
        m_wordStart += it.appendchartostring(m_span);
        }
        break;
    case '#': 
        // Keep '#' only at the end of a word, i.e. when followed by
        // white space or a line break. Special case for terms like c#.
        if (m_wordLen > 0) {
        int w = whatcc(it[it.getCpos()+1]);
        if (w == SPACE || w == '\n' || w == '\r') {
            m_wordLen += it.appendchartostring(m_span);
            break;
        }
        }
        goto SPACE;
        break;
    case '\n':
    case '\r':
        if (m_span.length() && m_span[m_span.length() - 1] == '-') {
        // if '-' is the last char before end of line, just
        // ignore the line change. This is the right thing to
        // do almost always. We'd then need a way to check if
        // the - was added as part of the word hyphenation, or was 
        // there in the first place, but this would need a dictionary.
...
        }
        break;

    default:
    NORMALCHAR:
        m_wordLen += it.appendchartostring(m_span);
        break;
    }
    }
    if (m_wordLen || m_span.length()) {
    if (!doemit(true, it.getBpos()))
        return false;
    }
    return true;
}

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.29 2007-01-25 15:40:55 dockes Exp $ (C) 2004 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.30 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
101	int btstart, int btend)	101	int btstart, int btend)
102	{	102	{
103	LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));	103	LOGDEB3(("TextSplit::emitterm: [%s] pos %d\n", w.c_str(), pos));
104		104
105	unsigned int l = w.length();	105	unsigned int l = w.length();
106	if (l > 0 && l < (unsigned)maxWordLength) {	106	if (l > 0 && l < (unsigned)m_maxWordLength) {
107	// 1 char word: we index single letters and digits, but	107	// 1 char word: we index single letters and digits, but
108	// nothing else. We might want to turn this into a test for a single	108	// nothing else. We might want to turn this into a test for a single
109	// utf8 character instead.	109	// utf8 character instead.
110	if (l == 1) {	110	if (l == 1) {
111	int c = (int)w[0];	111	int c = (int)w[0];
112	if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {	112	if (charclasses[c] != LETTER && charclasses[c] != DIGIT) {
113	//cerr << "ERASING single letter term " << c << endl;	113	//cerr << "ERASING single letter term " << c << endl;
114	return true;	114	return true;
115	}	115	}
116	}	116	}
117	if (pos != prevpos \|\| l != prevlen) {	117	if (pos != m_prevpos \|\| l != m_prevlen) {
118	bool ret = cb->takeword(w, pos, btstart, btend);	118	bool ret = m_cb->takeword(w, pos, btstart, btend);
		119	m_prevpos = pos;
119	prevlen = w.length();	120	m_prevlen = w.length();
120	prevpos = pos;
121	return ret;	121	return ret;
122	}	122	}
123	LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));	123	LOGDEB2(("TextSplit::emitterm:dup: [%s] pos %d\n", w.c_str(), pos));
124	}	124	}
125	return true;	125	return true;
	...		...
144	span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));	144	span.c_str(), spanpos, wordStart, wordLen, spanerase, bp));
145		145
146	// Emit span. When splitting for query, we only emit final spans	146	// Emit span. When splitting for query, we only emit final spans
147	bool spanemitted = false;	147	bool spanemitted = false;
148	if (spanerase && !(m_flags & TXTS_NOSPANS)) {	148	if (spanerase && !(m_flags & TXTS_NOSPANS)) {
149	// Maybe trim at end These are chars that we would keep inside	149	// Maybe trim at end. These are chars that we would keep inside
150	// a span, but not at the end	150	// a span, but not at the end
151	while (span.length() > 0) {	151	while (m_span.length() > 0) {
152	switch (span[span.length()-1]) {	152	switch (m_span[m_span.length()-1]) {
153	case '.':	153	case '.':
154	case ',':	154	case ',':
155	case '@':	155	case '@':
156	case '\'':	156	case '\'':
157	span.resize(span.length()-1);	157	m_span.resize(m_span.length()-1);
158	if (--bp < 0)	158	if (--bp < 0)
159	bp=0;	159	bp = 0;
160	break;	160	break;
161	default:	161	default:
162	goto breakloop1;	162	goto breakloop1;
163	}	163	}
164	}	164	}
165	breakloop1:	165	breakloop1:
166	spanemitted = true;	166	spanemitted = true;
167	if (!emitterm(true, span, spanpos, bp-span.length(), bp))	167	if (!emitterm(true, m_span, m_spanpos, bp - m_span.length(), bp))
168	return false;	168	return false;
169	}	169	}
170		170
171	// Emit word if different from span and not 'no words' mode	171	// Emit word if different from span and not 'no words' mode
172	if (!(m_flags & TXTS_ONLYSPANS) && wordLen &&	172	if (!(m_flags & TXTS_ONLYSPANS) && m_wordLen &&
173	(!spanemitted \|\| wordLen != span.length())) {	173	(!spanemitted \|\| m_wordLen != m_span.length())) {
174	string s(span.substr(wordStart, wordLen));	174	string s(m_span.substr(m_wordStart, m_wordLen));
175	if (!emitterm(false, s, wordpos, bp-wordLen, bp))	175	if (!emitterm(false, s, m_wordpos, bp - m_wordLen, bp))
176	return false;	176	return false;
177	}	177	}
178		178
179	// Adjust state	179	// Adjust state
180	wordpos++;	180	m_wordpos++;
181	wordLen = 0;	181	m_wordLen = 0;
182	if (spanerase) {	182	if (spanerase) {
183	span.erase();	183	m_span.erase();
184	spanpos = wordpos;	184	m_spanpos = m_wordpos;
185	wordStart = 0;	185	m_wordStart = 0;
186	} else {	186	} else {
187	wordStart = span.length();	187	m_wordStart = m_span.length();
188	}	188	}
189		189
190	return true;	190	return true;
191	}	191	}
192		192
	...		...
213	LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb,	213	LOGDEB2(("TextSplit::text_to_words: cb %p in [%s]\n", cb,
214	in.substr(0,50).c_str()));	214	in.substr(0,50).c_str()));
215		215
216	setcharclasses();	216	setcharclasses();
217		217
218	span.erase();	218	m_span.erase();
219	number = false;	219	m_inNumber = false;
220	wordStart = wordLen = prevpos = prevlen = wordpos = spanpos = 0;	220	m_wordStart = m_wordLen = m_prevpos = m_prevlen = m_wordpos = m_spanpos = 0;
221		221
222	Utf8Iter it(in);	222	Utf8Iter it(in);
223		223
224	for (; !it.eof(); it++) {	224	for (; !it.eof(); it++) {
225	unsigned int c = *it;	225	unsigned int c = *it;
	...		...
229	return false;	229	return false;
230	}	230	}
231	int cc = whatcc(c);	231	int cc = whatcc(c);
232	switch (cc) {	232	switch (cc) {
233	case LETTER:	233	case LETTER:
234	wordLen += it.appendchartostring(span);	234	m_wordLen += it.appendchartostring(m_span);
235	break;	235	break;
236		236
237	case DIGIT:	237	case DIGIT:
238	if (wordLen == 0)	238	if (m_wordLen == 0)
239	number = true;	239	m_inNumber = true;
240	wordLen += it.appendchartostring(span);	240	m_wordLen += it.appendchartostring(m_span);
241	break;	241	break;
242		242
243	case SPACE:	243	case SPACE:
244	SPACE:	244	SPACE:
245	if (wordLen \|\| span.length()) {	245	if (m_wordLen \|\| m_span.length()) {
246	if (!doemit(true, it.getBpos()))	246	if (!doemit(true, it.getBpos()))
247	return false;	247	return false;
248	number = false;	248	m_inNumber = false;
249	}	249	}
250	break;	250	break;
251	case WILD:	251	case WILD:
252	if (m_flags & TXTS_KEEPWILD)	252	if (m_flags & TXTS_KEEPWILD)
253	goto NORMALCHAR;	253	goto NORMALCHAR;
254	else	254	else
255	goto SPACE;	255	goto SPACE;
256	break;	256	break;
257	case '-':	257	case '-':
258	case '+':	258	case '+':
259	if (wordLen == 0) {	259	if (m_wordLen == 0) {
260	if (whatcc(it[it.getCpos()+1]) == DIGIT) {	260	if (whatcc(it[it.getCpos()+1]) == DIGIT) {
261	number = true;	261	m_inNumber = true;
262	wordLen += it.appendchartostring(span);	262	m_wordLen += it.appendchartostring(m_span);
263	} else {	263	} else {
264	wordStart += it.appendchartostring(span);	264	m_wordStart += it.appendchartostring(m_span);
265	}	265	}
266	} else {	266	} else {
267	if (!doemit(false, it.getBpos()))	267	if (!doemit(false, it.getBpos()))
268	return false;	268	return false;
269	number = false;	269	m_inNumber = false;
270	wordStart += it.appendchartostring(span);	270	m_wordStart += it.appendchartostring(m_span);
271	}	271	}
272	break;	272	break;
273	case '.':	273	case '.':
274	case ',':	274	case ',':
275	if (number) {	275	if (m_inNumber) {
276	// 132.jpg ?	276	// 132.jpg ?
277	if (whatcc(it[it.getCpos()+1]) != DIGIT)	277	if (whatcc(it[it.getCpos()+1]) != DIGIT)
278	goto SPACE;	278	goto SPACE;
279	wordLen += it.appendchartostring(span);	279	m_wordLen += it.appendchartostring(m_span);
280	break;	280	break;
281	} else {	281	} else {
282	// If . inside a word, keep it, else, this is whitespace.	282	// If . inside a word, keep it, else, this is whitespace.
283	// We also keep an initial '.' for catching .net, but this adds	283	// We also keep an initial '.' for catching .net, but this adds
284	// quite a few spurious terms !	284	// quite a few spurious terms !
285	// Another problem is that something like .x-errs	285	// Another problem is that something like .x-errs
286	// will be split as .x-errs, x, errs but not x-errs	286	// will be split as .x-errs, x, errs but not x-errs
287	// A final comma in a word will be removed by doemit	287	// A final comma in a word will be removed by doemit
288	if (cc == '.') {	288	if (cc == '.') {
289	if (wordLen) {	289	if (m_wordLen) {
290	if (!doemit(false, it.getBpos()))	290	if (!doemit(false, it.getBpos()))
291	return false;	291	return false;
292	// span length could have been adjusted by trimming	292	// span length could have been adjusted by trimming
293	// inside doemit	293	// inside doemit
294	if (span.length())	294	if (m_span.length())
295	wordStart += it.appendchartostring(span);	295	m_wordStart += it.appendchartostring(m_span);
296	break;	296	break;
297	} else {	297	} else {
298	wordStart += it.appendchartostring(span);	298	m_wordStart += it.appendchartostring(m_span);
299	break;	299	break;
300	}	300	}
301	}	301	}
302	}	302	}
303	goto SPACE;	303	goto SPACE;
304	break;	304	break;
305	case '@':	305	case '@':
306	if (wordLen) {	306	if (m_wordLen) {
307	if (!doemit(false, it.getBpos()))	307	if (!doemit(false, it.getBpos()))
308	return false;	308	return false;
309	number = false;	309	m_inNumber = false;
310	}	310	}
311	wordStart += it.appendchartostring(span);	311	m_wordStart += it.appendchartostring(m_span);
312	break;	312	break;
313	case '\'':	313	case '\'':
314	// If in word, potential span: o'brien, else, this is more	314	// If in word, potential span: o'brien, else, this is more
315	// whitespace	315	// whitespace
316	if (wordLen) {	316	if (m_wordLen) {
317	if (!doemit(false, it.getBpos()))	317	if (!doemit(false, it.getBpos()))
318	return false;	318	return false;
319	number = false;	319	m_inNumber = false;
320	wordStart += it.appendchartostring(span);	320	m_wordStart += it.appendchartostring(m_span);
321	}	321	}
322	break;	322	break;
323	case '#':	323	case '#':
324	// Keep it only at end of word ... Special case for c# you see...	324	// Keep it only at end of word ... Special case for c# you see...
325	if (wordLen > 0) {	325	if (m_wordLen > 0) {
326	int w = whatcc(it[it.getCpos()+1]);	326	int w = whatcc(it[it.getCpos()+1]);
327	if (w == SPACE \|\| w == '\n' \|\| w == '\r') {	327	if (w == SPACE \|\| w == '\n' \|\| w == '\r') {
328	wordLen += it.appendchartostring(span);	328	m_wordLen += it.appendchartostring(m_span);
329	break;	329	break;
330	}	330	}
331	}	331	}
332	goto SPACE;	332	goto SPACE;
333	break;	333	break;
334	case '\n':	334	case '\n':
335	case '\r':	335	case '\r':
336	if (span.length() && span[span.length() - 1] == '-') {	336	if (m_span.length() && m_span[m_span.length() - 1] == '-') {
337	// if '-' is the last char before end of line, just	337	// if '-' is the last char before end of line, just
338	// ignore the line change. This is the right thing to	338	// ignore the line change. This is the right thing to
339	// do almost always. We'd then need a way to check if	339	// do almost always. We'd then need a way to check if
340	// the - was added as part of the word hyphenation, or was	340	// the - was added as part of the word hyphenation, or was
341	// there in the first place, but this would need a dictionary.	341	// there in the first place, but this would need a dictionary.
	...		...
347	}	347	}
348	break;	348	break;
349		349
350	default:	350	default:
351	NORMALCHAR:	351	NORMALCHAR:
352	wordLen += it.appendchartostring(span);	352	m_wordLen += it.appendchartostring(m_span);
353	break;	353	break;
354	}	354	}
355	}	355	}
356	if (wordLen \|\| span.length()) {	356	if (m_wordLen \|\| m_span.length()) {
357	if (!doemit(true, it.getBpos()))	357	if (!doemit(true, it.getBpos()))
358	return false;	358	return false;
359	}	359	}
360	return true;	360	return true;
361	}	361	}