|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
19 |
/* @(#$Id: textsplit.h,v 1.14 2006-11-20 11:17:53 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: textsplit.h,v 1.15 2006-12-08 07:11:17 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
#include <string>
|
21 |
#include <string>
|
22 |
#ifndef NO_NAMESPACES
|
22 |
#ifndef NO_NAMESPACES
|
23 |
using std::string;
|
23 |
using std::string;
|
24 |
#endif
|
24 |
#endif
|
25 |
|
25 |
|
26 |
/**
|
26 |
/**
|
27 |
* Function class whose takeword method is called for every detected word while splitting text.
|
27 |
* Function class whose takeword method is called for every detected word while splitting text.
|
28 |
*/
|
28 |
*/
|
29 |
class TextSplitCB {
|
29 |
class TextSplitCB {
|
30 |
public:
|
30 |
public:
|
31 |
virtual ~TextSplitCB() {}
|
31 |
virtual ~TextSplitCB() {}
|
32 |
virtual bool takeword(const std::string& term,
|
32 |
virtual bool takeword(const string& term,
|
33 |
int pos, // term pos
|
33 |
int pos, // term pos
|
34 |
int bts, // byte offset of first char in term
|
34 |
int bts, // byte offset of first char in term
|
35 |
int bte // byte offset of first char after term
|
35 |
int bte // byte offset of first char after term
|
36 |
) = 0;
|
36 |
) = 0;
|
37 |
};
|
37 |
};
|
|
... |
|
... |
41 |
* See comments at top of .cpp for more explanations.
|
41 |
* See comments at top of .cpp for more explanations.
|
42 |
* This uses a callback function. It could be done with an iterator instead,
|
42 |
* This uses a callback function. It could be done with an iterator instead,
|
43 |
* but it's much simpler this way...
|
43 |
* but it's much simpler this way...
|
44 |
*/
|
44 |
*/
|
45 |
class TextSplit {
|
45 |
class TextSplit {
|
46 |
public:
|
46 |
public:
|
47 |
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
|
47 |
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
|
48 |
/**
|
48 |
/**
|
49 |
* Constructor: just store callback object
|
49 |
* Constructor: just store callback object
|
50 |
*/
|
50 |
*/
|
51 |
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
|
51 |
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
|
52 |
: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
|
52 |
: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
|
53 |
/**
|
53 |
/**
|
54 |
* Split text, emit words and positions.
|
54 |
* Split text, emit words and positions.
|
55 |
*/
|
55 |
*/
|
56 |
bool text_to_words(const std::string &in);
|
56 |
bool text_to_words(const string &in);
|
57 |
|
57 |
|
|
|
58 |
// Utility functions: these do not need the user to set up a callback
|
|
|
59 |
// etc.
|
|
|
60 |
static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
|
|
|
61 |
|
58 |
private:
|
62 |
private:
|
59 |
Flags m_flags;
|
63 |
Flags m_flags;
|
60 |
TextSplitCB *cb;
|
64 |
TextSplitCB *cb;
|
61 |
int maxWordLength;
|
65 |
int maxWordLength;
|
62 |
|
66 |
|
63 |
string span; // Current span. Might be jf.dockes@wanadoo.f
|
67 |
string span; // Current span. Might be jf.dockes@wanadoo.f
|
|
... |
|
... |
70 |
// It may happen that our cleanup would result in emitting the
|
74 |
// It may happen that our cleanup would result in emitting the
|
71 |
// same term twice. We try to avoid this
|
75 |
// same term twice. We try to avoid this
|
72 |
int prevpos;
|
76 |
int prevpos;
|
73 |
unsigned int prevlen;
|
77 |
unsigned int prevlen;
|
74 |
|
78 |
|
75 |
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
|
79 |
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
76 |
bool doemit(bool spanerase, int bp);
|
80 |
bool doemit(bool spanerase, int bp);
|
|
|
81 |
|
77 |
};
|
82 |
};
|
78 |
|
83 |
|
|
|
84 |
|
79 |
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
85 |
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|