|
a/src/common/textsplit.cpp |
|
b/src/common/textsplit.cpp |
1 |
#ifndef lint
|
1 |
#ifndef lint
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.31 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes";
|
2 |
static char rcsid[] = "@(#$Id: textsplit.cpp,v 1.32 2007-09-20 12:22:26 dockes Exp $ (C) 2004 J.F.Dockes";
|
3 |
#endif
|
3 |
#endif
|
4 |
/*
|
4 |
/*
|
5 |
* This program is free software; you can redistribute it and/or modify
|
5 |
* This program is free software; you can redistribute it and/or modify
|
6 |
* it under the terms of the GNU General Public License as published by
|
6 |
* it under the terms of the GNU General Public License as published by
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
7 |
* the Free Software Foundation; either version 2 of the License, or
|
|
... |
|
... |
241 |
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
|
241 |
* handled specially so that special cases, ie, c++ and dockes@okyz.com etc,
|
242 |
* are handled properly,
|
242 |
* are handled properly,
|
243 |
*/
|
243 |
*/
|
244 |
bool TextSplit::text_to_words(const string &in)
|
244 |
bool TextSplit::text_to_words(const string &in)
|
245 |
{
|
245 |
{
|
246 |
LOGDEB(("TextSplit::text_to_words:%s%s%s%s [%s]\n",
|
246 |
LOGDEB1(("TextSplit::text_to_words:%s%s%s%s [%s]\n",
|
247 |
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
247 |
m_flags & TXTS_NOSPANS ? " nospans" : "",
|
248 |
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
248 |
m_flags & TXTS_ONLYSPANS ? " onlyspans" : "",
|
249 |
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
249 |
m_flags & TXTS_KEEPWILD ? " keepwild" : "",
|
250 |
m_flags & TXTS_NOCJK ? " nocjk" : "",
|
250 |
m_flags & TXTS_NOCJK ? " nocjk" : "",
|
251 |
in.substr(0,50).c_str()));
|
251 |
in.substr(0,50).c_str()));
|
|
... |
|
... |
432 |
//
|
432 |
//
|
433 |
// The routine is sort of a mess and goes to show that we'd probably
|
433 |
// The routine is sort of a mess and goes to show that we'd probably
|
434 |
// be better off converting the whole buffer to utf32 on entry...
|
434 |
// be better off converting the whole buffer to utf32 on entry...
|
435 |
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
435 |
bool TextSplit::cjk_to_words(Utf8Iter *itp, unsigned int *cp)
|
436 |
{
|
436 |
{
|
437 |
LOGDEB(("cjk_to_words: m_wordpos %d\n", m_wordpos));
|
437 |
LOGDEB1(("cjk_to_words: m_wordpos %d\n", m_wordpos));
|
438 |
Utf8Iter &it = *itp;
|
438 |
Utf8Iter &it = *itp;
|
439 |
|
439 |
|
440 |
// We use an offset buffer to remember the starts of the utf-8
|
440 |
// We use an offset buffer to remember the starts of the utf-8
|
441 |
// characters which we still need to use.
|
441 |
// characters which we still need to use.
|
442 |
// Fixed size array. ngramlen over 3 doesn't make sense.
|
442 |
// Fixed size array. ngramlen over 3 doesn't make sense.
|