|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
1 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
1 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
2 |
#define _TEXTSPLIT_H_INCLUDED_
|
2 |
#define _TEXTSPLIT_H_INCLUDED_
|
3 |
/* @(#$Id: textsplit.h,v 1.3 2005-01-24 13:17:58 dockes Exp $ (C) 2004 J.F.Dockes */
|
3 |
/* @(#$Id: textsplit.h,v 1.4 2005-02-07 13:17:47 dockes Exp $ (C) 2004 J.F.Dockes */
|
4 |
|
4 |
|
5 |
#include <string>
|
5 |
#include <string>
|
|
|
6 |
|
|
|
7 |
// Function class whose takeword() method is called for every detected word.
// takeword() is pure virtual, so a concrete subclass must implement it.
// NOTE(review): the meaning of the bool result returned by takeword() is not
// stated in this header -- confirm against textsplit.cpp.
|
|
|
8 |
class TextSplitCB {
|
|
|
9 |
public:
|
|
|
10 |
virtual ~TextSplitCB() {}
|
|
|
11 |
virtual bool takeword(const std::string& term,
|
|
|
12 |
int pos, // term position
|
|
|
13 |
int bts, // byte offset of first char in term
|
|
|
14 |
int bte // byte offset of first char after term
|
|
|
15 |
) = 0;
|
|
|
16 |
};
|
6 |
|
17 |
|
7 |
/**
|
18 |
/**
|
8 |
* Split text into words.
|
19 |
* Split text into words.
|
9 |
* See comments at top of .cpp for more explanations.
|
20 |
* See comments at top of .cpp for more explanations.
|
10 |
* This uses a callback function. It could be done with an iterator instead,
|
21 |
* This uses a callback function. It could be done with an iterator instead,
|
11 |
* but it's much simpler this way...
|
22 |
* but it's much simpler this way...
|
12 |
*/
|
23 |
*/
|
13 |
class TextSplit {
|
24 |
class TextSplit {
|
14 |
public:
|
25 |
TextSplitCB *cb;
|
15 |
typedef bool (*TermSink)(void *cdata, const std::string & term, int pos);
|
|
|
16 |
private:
|
|
|
17 |
TermSink termsink;
|
|
|
18 |
void *cdata;
|
|
|
19 |
int maxWordLength;
|
26 |
int maxWordLength;
|
20 |
bool emitterm(std::string &term, int pos, bool doerase);
|
27 |
bool emitterm(std::string &term, int pos, bool doerase, int, int);
|
21 |
public:
|
28 |
public:
|
22 |
/**
|
29 |
/**
|
23 |
* Constructor: just store callback and client data
|
30 |
* Constructor: just store the callback object (maxWordLength is set to 40)
|
24 |
*/
|
31 |
*/
|
25 |
TextSplit(TermSink t, void *c) : termsink(t), cdata(c), maxWordLength(40)
|
32 |
TextSplit(TextSplitCB *t) : cb(t), maxWordLength(40) {}
|
26 |
{}
|
|
|
27 |
/**
|
33 |
/**
|
28 |
* Split text, emit words and positions.
|
34 |
* Split text, emit words and positions.
|
29 |
*/
|
35 |
*/
|
30 |
bool text_to_words(const std::string &in);
|
36 |
bool text_to_words(const std::string &in);
|
31 |
};
|
37 |
};
|