|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
19 |
/* @(#$Id: textsplit.h,v 1.18 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: textsplit.h,v 1.19 2007-10-02 11:39:08 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
#include <string>
|
21 |
#include <string>
|
22 |
#ifndef NO_NAMESPACES
|
22 |
#ifndef NO_NAMESPACES
|
23 |
using std::string;
|
23 |
using std::string;
|
24 |
#endif
|
24 |
#endif
|
|
... |
|
... |
44 |
* This uses a callback function. It could be done with an iterator instead,
|
44 |
* This uses a callback function. It could be done with an iterator instead,
|
45 |
* but it's much simpler this way...
|
45 |
* but it's much simpler this way...
|
46 |
*/
|
46 |
*/
|
47 |
class TextSplit {
|
47 |
class TextSplit {
|
48 |
public:
|
48 |
public:
|
|
|
49 |
// Should we activate special processing of Chinese characters? This
|
|
|
50 |
// needs a little more cpu, so it can be turned off globally.
|
|
|
51 |
static bool t_processCJK;
|
|
|
52 |
static void cjkProcessing(bool onoff) {t_processCJK = onoff;}
|
|
|
53 |
|
49 |
enum Flags {TXTS_NONE = 0,
|
54 |
enum Flags {TXTS_NONE = 0,
|
50 |
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
55 |
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
51 |
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
56 |
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
52 |
TXTS_KEEPWILD = 4, // Handle wildcards as letters
|
57 |
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
53 |
TXTS_NOCJK = 8 // Disable CJK special processing
|
|
|
54 |
};
|
58 |
};
|
55 |
|
59 |
|
56 |
/**
|
60 |
/**
|
57 |
* Constructor: just store callback object
|
61 |
* Constructor: just store callback object
|
58 |
*/
|
62 |
*/
|
59 |
TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
|
63 |
TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
|
60 |
: m_flags(flags), m_cb(t), m_maxWordLength(40),
|
64 |
: m_flags(flags), m_cb(t), m_maxWordLength(40),
|
61 |
m_nocjk((m_flags & TXTS_NOCJK) != 0),
|
|
|
62 |
m_prevpos(-1)
|
65 |
m_prevpos(-1)
|
63 |
{
|
66 |
{
|
64 |
}
|
67 |
}
|
65 |
|
68 |
|
66 |
/**
|
69 |
/**
|
|
... |
|
... |
74 |
|
77 |
|
75 |
private:
|
78 |
private:
|
76 |
Flags m_flags;
|
79 |
Flags m_flags;
|
77 |
TextSplitCB *m_cb;
|
80 |
TextSplitCB *m_cb;
|
78 |
int m_maxWordLength;
|
81 |
int m_maxWordLength;
|
79 |
int m_nocjk;
|
|
|
80 |
|
82 |
|
81 |
// Current span. Might be jf.dockes@wanadoo.f
|
83 |
// Current span. Might be jf.dockes@wanadoo.f
|
82 |
string m_span;
|
84 |
string m_span;
|
83 |
|
85 |
|
84 |
// Current word: no punctuation at all in there. Byte offset
|
86 |
// Current word: no punctuation at all in there. Byte offset
|