recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [75d251] .. [90e378]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.35 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
     || ((p) >= 0xFE30 && (p) <= 0xFE4F)                                \
     || ((p) >= 0xFF00 && (p) <= 0xFFEF)                                \
     || ((p) >= 0x20000 && (p) <= 0x2A6DF)                              \
     || ((p) >= 0x2F800 && (p) <= 0x2FA1F))

// Class-wide switch: when true, text_to_words() hands characters matching
// UNICODE_IS_CJK() to the separate CJK processing instead of the regular
// word splitting.
bool          TextSplit::o_processCJK = true;
// Class-wide n-gram length used for CJK text: the CJK offset buffer is
// treated as full once it holds this many characters. Must stay below
// o_CJKMaxNgramLen (asserted where the offset buffer is declared).
unsigned int  TextSplit::o_CJKNgramLen = 2;

// Do some checking (the kind which is simpler to do here than in the
// main loop), then send term to our client.
inline bool TextSplit::emitterm(bool isspan, string &w, int pos, 
             int btstart, int btend)
...
 * handled specially so that special cases, e.g. c++ and dockes@okyz.com,
 * are handled properly,
 */
bool TextSplit::text_to_words(const string &in)
{
    LOGDEB1(("TextSplit::text_to_words: docjk %d (%d) %s%s%s [%s]\n", 
       o_processCJK, o_CJKNgramLen,
         m_flags & TXTS_NOSPANS ? " nospans" : "",
         m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
         m_flags & TXTS_KEEPWILD ? " keepwild" : "",
         in.substr(0,50).c_str()));

    setcharclasses();

    m_span.erase();
    m_inNumber = false;
...
    if (c == (unsigned int)-1) {
        LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
        return false;
    }

    if (o_processCJK && UNICODE_IS_CJK(c)) {
        // CJK character hit. 
        // Do like at EOF with the current non-cjk data.
        if (m_wordLen || m_span.length()) {
        if (!doemit(true, it.getBpos()))
            return false;
...
        return false;
    }
    return true;
}




// Using a Utf8Iter pointer just to avoid needing its definition in
// textsplit.h
//
// We output ngrams. For example, for char input a b c and ngramlen == 2,
// we generate: a ab b bc c as words
...
    LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
    Utf8Iter &it = *itp;

    // We use an offset buffer to remember the starts of the utf-8
    // characters which we still need to use.
    assert(o_CJKNgramLen < o_CJKMaxNgramLen);
    unsigned int boffs[o_CJKMaxNgramLen+1];


    // Current number of valid offsets;
    unsigned int nchars = 0;
    unsigned int c = 0;
    for (; !it.eof(); it++) {
...
    if (!UNICODE_IS_CJK(c)) {
        // Return to normal handler
        break;
    }

    if (nchars == o_CJKNgramLen) {
        // Offset buffer full, shift it. Might be more efficient
        // to have a circular one, but things are complicated
        // enough already...
        for (unsigned int i = 0; i < nchars-1; i++) {
        boffs[i] = boffs[i+1];
...
    boffs[nchars-1] = it.getBpos();

    // Output all new ngrams: they begin at each existing position
    // and end after the new character. onlyspans->only output
    // maximum words, nospans=> single chars
    if (!(m_flags & TXTS_ONLYSPANS) || nchars == o_CJKNgramLen) {
        unsigned int btend = it.getBpos() + it.getBlen();
        unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
        unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
        for (unsigned int i = loopbeg; i < loopend; i++) {
        if (!m_cb->takeword(it.buffer().substr(boffs[i], 
...
    m_wordpos++;
    }

    // If onlyspans is set, there may be things to flush in the buffer
    // first
    if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen)  {
    unsigned int btend = it.getBpos(); // Current char is out
    if (!m_cb->takeword(it.buffer().substr(boffs[0], 
                           btend-boffs[0]),
                m_wordpos - nchars,
                boffs[0], btend)) {

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.34 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.35 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
139	\|\| ((p) >= 0xFE30 && (p) <= 0xFE4F) \	139	\|\| ((p) >= 0xFE30 && (p) <= 0xFE4F) \
140	\|\| ((p) >= 0xFF00 && (p) <= 0xFFEF) \	140	\|\| ((p) >= 0xFF00 && (p) <= 0xFFEF) \
141	\|\| ((p) >= 0x20000 && (p) <= 0x2A6DF) \	141	\|\| ((p) >= 0x20000 && (p) <= 0x2A6DF) \
142	\|\| ((p) >= 0x2F800 && (p) <= 0x2FA1F))	142	\|\| ((p) >= 0x2F800 && (p) <= 0x2FA1F))
143		143
144	bool TextSplit::t_processCJK = true;	144	bool TextSplit::o_processCJK = true;
		145	unsigned int TextSplit::o_CJKNgramLen = 2;
145		146
146	// Do some checking (the kind which is simpler to do here than in the	147	// Do some checking (the kind which is simpler to do here than in the
147	// main loop), then send term to our client.	148	// main loop), then send term to our client.
148	inline bool TextSplit::emitterm(bool isspan, string &w, int pos,	149	inline bool TextSplit::emitterm(bool isspan, string &w, int pos,
149	int btstart, int btend)	150	int btstart, int btend)
	...		...
244	* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,	245	* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
245	* are handled properly,	246	* are handled properly,
246	*/	247	*/
247	bool TextSplit::text_to_words(const string &in)	248	bool TextSplit::text_to_words(const string &in)
248	{	249	{
249	LOGDEB1(("TextSplit::text_to_words: docjk %d %s%s%s [%s]\n",	250	LOGDEB1(("TextSplit::text_to_words: docjk %d (%d) %s%s%s [%s]\n",
250	t_processCJK,	251	o_processCJK, o_CJKNgramLen,
251	m_flags & TXTS_NOSPANS ? " nospans" : "",	252	m_flags & TXTS_NOSPANS ? " nospans" : "",
252	m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",	253	m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
253	m_flags & TXTS_KEEPWILD ? " keepwild" : "",	254	m_flags & TXTS_KEEPWILD ? " keepwild" : "",
254	in.substr(0,50).c_str()));	255	in.substr(0,50).c_str()));
255		256
256	setcharclasses();	257	setcharclasses();
257		258
258	m_span.erase();	259	m_span.erase();
259	m_inNumber = false;	260	m_inNumber = false;
	...		...
267	if (c == (unsigned int)-1) {	268	if (c == (unsigned int)-1) {
268	LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));	269	LOGERR(("Textsplit: error occured while scanning UTF-8 string\n"));
269	return false;	270	return false;
270	}	271	}
271		272
272	if (t_processCJK && UNICODE_IS_CJK(c)) {	273	if (o_processCJK && UNICODE_IS_CJK(c)) {
273	// CJK character hit.	274	// CJK character hit.
274	// Do like at EOF with the current non-cjk data.	275	// Do like at EOF with the current non-cjk data.
275	if (m_wordLen \|\| m_span.length()) {	276	if (m_wordLen \|\| m_span.length()) {
276	if (!doemit(true, it.getBpos()))	277	if (!doemit(true, it.getBpos()))
277	return false;	278	return false;
	...		...
419	return false;	420	return false;
420	}	421	}
421	return true;	422	return true;
422	}	423	}
423		424
424	const unsigned int ngramlen = 2;
425	#define MAXNGRAMLEN 5
426
427	// Using an utf8iter pointer just to avoid needing its definition in	425	// Using an utf8iter pointer just to avoid needing its definition in
428	// textsplit.h	426	// textsplit.h
429	//	427	//
430	// We output ngrams for exemple for char input a b c and ngramlen== 2,	428	// We output ngrams for exemple for char input a b c and ngramlen== 2,
431	// we generate: a ab b bc c as words	429	// we generate: a ab b bc c as words
	...		...
440	LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));	438	LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
441	Utf8Iter &it = *itp;	439	Utf8Iter &it = *itp;
442		440
443	// We use an offset buffer to remember the starts of the utf-8	441	// We use an offset buffer to remember the starts of the utf-8
444	// characters which we still need to use.	442	// characters which we still need to use.
445	// Fixed size array. ngramlen over 3 doesn't make sense.	443	assert(o_CJKNgramLen < o_CJKMaxNgramLen);
446	assert(ngramlen < MAXNGRAMLEN);	444	unsigned int boffs[o_CJKMaxNgramLen+1];
447	unsigned int boffs[MAXNGRAMLEN];
448		445
449	// Current number of valid offsets;	446	// Current number of valid offsets;
450	unsigned int nchars = 0;	447	unsigned int nchars = 0;
451	unsigned int c = 0;	448	unsigned int c = 0;
452	for (; !it.eof(); it++) {	449	for (; !it.eof(); it++) {
	...		...
454	if (!UNICODE_IS_CJK(c)) {	451	if (!UNICODE_IS_CJK(c)) {
455	// Return to normal handler	452	// Return to normal handler
456	break;	453	break;
457	}	454	}
458		455
459	if (nchars == ngramlen) {	456	if (nchars == o_CJKNgramLen) {
460	// Offset buffer full, shift it. Might be more efficient	457	// Offset buffer full, shift it. Might be more efficient
461	// to have a circular one, but things are complicated	458	// to have a circular one, but things are complicated
462	// enough already...	459	// enough already...
463	for (unsigned int i = 0; i < nchars-1; i++) {	460	for (unsigned int i = 0; i < nchars-1; i++) {
464	boffs[i] = boffs[i+1];	461	boffs[i] = boffs[i+1];
	...		...
471	boffs[nchars-1] = it.getBpos();	468	boffs[nchars-1] = it.getBpos();
472		469
473	// Output all new ngrams: they begin at each existing position	470	// Output all new ngrams: they begin at each existing position
474	// and end after the new character. onlyspans->only output	471	// and end after the new character. onlyspans->only output
475	// maximum words, nospans=> single chars	472	// maximum words, nospans=> single chars
476	if (!(m_flags & TXTS_ONLYSPANS) \|\| nchars == ngramlen) {	473	if (!(m_flags & TXTS_ONLYSPANS) \|\| nchars == o_CJKNgramLen) {
477	unsigned int btend = it.getBpos() + it.getBlen();	474	unsigned int btend = it.getBpos() + it.getBlen();
478	unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;	475	unsigned int loopbeg = (m_flags & TXTS_NOSPANS) ? nchars-1 : 0;
479	unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;	476	unsigned int loopend = (m_flags & TXTS_ONLYSPANS) ? 1 : nchars;
480	for (unsigned int i = loopbeg; i < loopend; i++) {	477	for (unsigned int i = loopbeg; i < loopend; i++) {
481	if (!m_cb->takeword(it.buffer().substr(boffs[i],	478	if (!m_cb->takeword(it.buffer().substr(boffs[i],
	...		...
495	m_wordpos++;	492	m_wordpos++;
496	}	493	}
497		494
498	// If onlyspans is set, there may be things to flush in the buffer	495	// If onlyspans is set, there may be things to flush in the buffer
499	// first	496	// first
500	if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != ngramlen) {	497	if ((m_flags & TXTS_ONLYSPANS) && nchars > 0 && nchars != o_CJKNgramLen) {
501	unsigned int btend = it.getBpos(); // Current char is out	498	unsigned int btend = it.getBpos(); // Current char is out
502	if (!m_cb->takeword(it.buffer().substr(boffs[0],	499	if (!m_cb->takeword(it.buffer().substr(boffs[0],
503	btend-boffs[0]),	500	btend-boffs[0]),
504	m_wordpos - nchars,	501	m_wordpos - nchars,
505	boffs[0], btend)) {	502	boffs[0], btend)) {