|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
19 |
/* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: textsplit.h,v 1.17 2007-09-18 20:35:31 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
#include <string>
|
21 |
#include <string>
|
22 |
#ifndef NO_NAMESPACES
|
22 |
#ifndef NO_NAMESPACES
|
23 |
using std::string;
|
23 |
using std::string;
|
24 |
#endif
|
24 |
#endif
|
|
... |
|
... |
42 |
* This uses a callback function. It could be done with an iterator instead,
|
42 |
* This uses a callback function. It could be done with an iterator instead,
|
43 |
* but 'tis much simpler this way...
|
43 |
* but 'tis much simpler this way...
|
44 |
*/
|
44 |
*/
|
45 |
class TextSplit {
|
45 |
class TextSplit {
|
46 |
public:
|
46 |
public:
|
47 |
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2,
|
47 |
enum Flags {TXTS_NONE = 0,
|
48 |
TXTS_KEEPWILD = 4};
|
48 |
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
|
|
49 |
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
|
|
50 |
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
|
|
51 |
};
|
|
|
52 |
|
49 |
/**
|
53 |
/**
|
50 |
* Constructor: just store callback object
|
54 |
* Constructor: just store callback object
|
51 |
*/
|
55 |
*/
|
52 |
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
|
56 |
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
|
53 |
: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
|
57 |
: m_flags(flags), m_cb(t), m_maxWordLength(40), m_prevpos(-1) {}
|
|
|
58 |
|
54 |
/**
|
59 |
/**
|
55 |
* Split text, emit words and positions.
|
60 |
* Split text, emit words and positions.
|
56 |
*/
|
61 |
*/
|
57 |
bool text_to_words(const string &in);
|
62 |
bool text_to_words(const string &in);
|
58 |
|
63 |
|
59 |
// Utility functions: these do not need the user to set up a callback
|
64 |
// Utility functions: these do not need the user to set up a callback
|
60 |
// etc.
|
65 |
// etc.
|
61 |
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
66 |
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
62 |
|
67 |
|
63 |
private:
|
68 |
private:
|
64 |
Flags m_flags;
|
69 |
Flags m_flags;
|
65 |
TextSplitCB *cb;
|
70 |
TextSplitCB *m_cb;
|
66 |
int maxWordLength;
|
71 |
int m_maxWordLength;
|
67 |
|
72 |
|
68 |
string span; // Current span. Might be jf.dockes@wanadoo.f
|
73 |
// Current span. Might be jf.dockes@wanadoo.f
|
|
|
74 |
string m_span;
|
|
|
75 |
|
69 |
int wordStart; // Current word: no punctuation at all in there
|
76 |
// Current word: no punctuation at all in there
|
|
|
77 |
int m_wordStart;
|
70 |
unsigned int wordLen;
|
78 |
unsigned int m_wordLen;
|
71 |
bool number;
|
79 |
|
|
|
80 |
// Currently inside number
|
|
|
81 |
bool m_inNumber;
|
|
|
82 |
|
72 |
int wordpos; // Term position of current word
|
83 |
// Term position of current word and span
|
73 |
int spanpos; // Term position of current span
|
84 |
int m_wordpos;
|
|
|
85 |
int m_spanpos;
|
74 |
|
86 |
|
75 |
// It may happen that our cleanup would result in emitting the
|
87 |
// It may happen that our cleanup would result in emitting the
|
76 |
// same term twice. We try to avoid this
|
88 |
// same term twice. We try to avoid this
|
77 |
int prevpos;
|
89 |
int m_prevpos;
|
78 |
unsigned int prevlen;
|
90 |
unsigned int m_prevlen;
|
|
|
91 |
|
|
|
92 |
// This processes cjk text:
|
|
|
93 |
// bool cjk_to_words();
|
79 |
|
94 |
|
80 |
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
95 |
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
81 |
bool doemit(bool spanerase, int bp);
|
96 |
bool doemit(bool spanerase, int bp);
|
82 |
|
|
|
83 |
};
|
97 |
};
|
84 |
|
98 |
|
85 |
|
99 |
|
86 |
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
100 |
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|