Switch to unified view

a/src/common/textsplit.cpp b/src/common/textsplit.cpp
1
#ifndef lint
1
#ifndef lint
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.31 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes";
2
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.32 2007-09-20 12:22:26 dockes Exp $ (C) 2004 J.F.Dockes";
3
#endif
3
#endif
4
/*
4
/*
5
 *   This program is free software; you can redistribute it and/or modify
5
 *   This program is free software; you can redistribute it and/or modify
6
 *   it under the terms of the GNU General Public License as published by
6
 *   it under the terms of the GNU General Public License as published by
7
 *   the Free Software Foundation; either version 2 of the License, or
7
 *   the Free Software Foundation; either version 2 of the License, or
...
...
241
 * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, 
241
 * handled specially so that special cases, ie, c++ and dockes@okyz.com etc, 
242
 * are handled properly,
242
 * are handled properly,
243
 */
243
 */
244
bool TextSplit::text_to_words(const string &in)
244
bool TextSplit::text_to_words(const string &in)
245
{
245
{
246
    LOGDEB(("TextSplit::text_to_words:%s%s%s%s [%s]\n", 
246
    LOGDEB1(("TextSplit::text_to_words:%s%s%s%s [%s]\n", 
247
        m_flags & TXTS_NOSPANS ? " nospans" : "",
247
        m_flags & TXTS_NOSPANS ? " nospans" : "",
248
        m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
248
        m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
249
        m_flags & TXTS_KEEPWILD ? " keepwild" : "",
249
        m_flags & TXTS_KEEPWILD ? " keepwild" : "",
250
        m_flags & TXTS_NOCJK ? " nocjk" : "",
250
        m_flags & TXTS_NOCJK ? " nocjk" : "",
251
        in.substr(0,50).c_str()));
251
        in.substr(0,50).c_str()));
...
...
432
//
432
//
433
// The routine is sort of a mess and goes to show that we'd probably
433
// The routine is sort of a mess and goes to show that we'd probably
434
// be better off converting the whole buffer to utf32 on entry...
434
// be better off converting the whole buffer to utf32 on entry...
435
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
435
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
436
{
436
{
437
    LOGDEB(("cjk_to_words: m_wordpos %d\n", m_wordpos));
437
    LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
438
    Utf8Iter &it = *itp;
438
    Utf8Iter &it = *itp;
439
439
440
    // We use an offset buffer to remember the starts of the utf-8
440
    // We use an offset buffer to remember the starts of the utf-8
441
    // characters which we still need to use.
441
    // characters which we still need to use.
442
    // Fixed size array. ngramlen over 3 doesn't make sense.
442
    // Fixed size array. ngramlen over 3 doesn't make sense.