|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
19 |
/* @(#$Id: textsplit.h,v 1.19 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: textsplit.h,v 1.20 2007-10-04 12:21:52 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
#include <string>
|
21 |
#include <string>
|
22 |
#ifndef NO_NAMESPACES
|
22 |
#ifndef NO_NAMESPACES
|
23 |
using std::string;
|
23 |
using std::string;
|
24 |
#endif
|
24 |
#endif
|
|
... |
|
... |
36 |
) = 0;
|
36 |
) = 0;
|
37 |
};
|
37 |
};
|
38 |
|
38 |
|
39 |
class Utf8Iter;
|
39 |
class Utf8Iter;
|
40 |
|
40 |
|
|
|
41 |
|
41 |
/**
|
42 |
/**
|
42 |
* Split text into words.
|
43 |
* Split text into words.
|
43 |
* See comments at top of .cpp for more explanations.
|
44 |
* See comments at top of .cpp for more explanations.
|
44 |
* This uses a callback function. It could be done with an iterator instead,
|
45 |
* This uses a callback function. It could be done with an iterator instead,
|
45 |
* but it's much simpler this way...
|
46 |
* but it's much simpler this way...
|
46 |
*/
|
47 |
*/
|
47 |
class TextSplit {
|
48 |
class TextSplit {
|
48 |
public:
|
49 |
public:
|
49 |
// Should we activate special processing of Chinese characters ? This
|
50 |
// Should we activate special processing of Chinese characters ? This
|
50 |
// needs a little more cpu, so it can be turned off globally.
|
51 |
// needs a little more cpu, so it can be turned off globally.
|
51 |
static bool t_processCJK;
|
52 |
static bool o_processCJK;
|
52 |
// Switch the special CJK processing on or off by storing the requested
// state in the class-wide t_processCJK flag.
static void cjkProcessing(bool onoff)
{
    t_processCJK = onoff;
}
|
53 |
static unsigned int o_CJKNgramLen;
|
|
|
54 |
static const unsigned int o_CJKMaxNgramLen = 5;
|
|
|
55 |
// Set the class-wide CJK processing switch and the CJK n-gram length.
// onoff is stored in o_processCJK; ngramlen (default 2) is stored in
// o_CJKNgramLen, capped at o_CJKMaxNgramLen.
static void cjkProcessing(bool onoff, unsigned int ngramlen = 2)
|
|
|
56 |
{
|
|
|
57 |
o_processCJK = onoff;
|
|
|
58 |
// Values above the maximum are replaced by the maximum; there is no lower bound check.
o_CJKNgramLen = ngramlen <= o_CJKMaxNgramLen ?
|
|
|
59 |
ngramlen : o_CJKMaxNgramLen;
|
|
|
60 |
}
|
53 |
|
61 |
|
54 |
enum Flags {TXTS_NONE = 0,
|
62 |
enum Flags {TXTS_NONE = 0,
|
55 |
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
63 |
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
56 |
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
64 |
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
57 |
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
65 |
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|