recoll / Code / Diff of /src/common/textsplit.cpp

Diff of /src/common/textsplit.cpp [069d71] .. [645018]

Switch to unified view


#ifndef lint
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.32 2007-09-20 12:22:26 dockes Exp $ (C) 2004 J.F.Dockes";
#endif
/*
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
...
 * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, 
 * are handled properly,
 */
bool TextSplit::text_to_words(const string &in)
{
    LOGDEB1(("TextSplit::text_to_words:%s%s%s%s [%s]\n", 
        m_flags & TXTS_NOSPANS ? " nospans" : "",
        m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
        m_flags & TXTS_KEEPWILD ? " keepwild" : "",
        m_flags & TXTS_NOCJK ? " nocjk" : "",
        in.substr(0,50).c_str()));
...
//
// The routine is sort of a mess and goes to show that we'd probably
// be better off converting the whole buffer to utf32 on entry...
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
{
    LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
    Utf8Iter &it = *itp;

    // We use an offset buffer to remember the starts of the utf-8
    // characters which we still need to use.
    // Fixed size array. ngramlen over 3 doesn't make sense.

	a/src/common/textsplit.cpp		b/src/common/textsplit.cpp
1	#ifndef lint	1	#ifndef lint
2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.31 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes";	2	static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.32 2007-09-20 12:22:26 dockes Exp $ (C) 2004 J.F.Dockes";
3	#endif	3	#endif
4	/*	4	/*
5	* This program is free software; you can redistribute it and/or modify	5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by	6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or	7	* the Free Software Foundation; either version 2 of the License, or
	...		...
241	* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,	241	* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
242	* are handled properly,	242	* are handled properly,
243	*/	243	*/
244	bool TextSplit::text_to_words(const string &in)	244	bool TextSplit::text_to_words(const string &in)
245	{	245	{
246	LOGDEB(("TextSplit::text_to_words:%s%s%s%s [%s]\n",	246	LOGDEB1(("TextSplit::text_to_words:%s%s%s%s [%s]\n",
247	m_flags & TXTS_NOSPANS ? " nospans" : "",	247	m_flags & TXTS_NOSPANS ? " nospans" : "",
248	m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",	248	m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
249	m_flags & TXTS_KEEPWILD ? " keepwild" : "",	249	m_flags & TXTS_KEEPWILD ? " keepwild" : "",
250	m_flags & TXTS_NOCJK ? " nocjk" : "",	250	m_flags & TXTS_NOCJK ? " nocjk" : "",
251	in.substr(0,50).c_str()));	251	in.substr(0,50).c_str()));
	...		...
432	//	432	//
433	// The routine is sort of a mess and goes to show that we'd probably	433	// The routine is sort of a mess and goes to show that we'd probably
434	// be better off converting the whole buffer to utf32 on entry...	434	// be better off converting the whole buffer to utf32 on entry...
435	bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)	435	bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
436	{	436	{
437	LOGDEB(("cjk_to_words: m_wordpos %d\n", m_wordpos));	437	LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
438	Utf8Iter &it = *itp;	438	Utf8Iter &it = *itp;
439		439
440	// We use an offset buffer to remember the starts of the utf-8	440	// We use an offset buffer to remember the starts of the utf-8
441	// characters which we still need to use.	441	// characters which we still need to use.
442	// Fixed size array. ngramlen over 3 doesn't make sense.	442	// Fixed size array. ngramlen over 3 doesn't make sense.