|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
19 |
/* @(#$Id: textsplit.h,v 1.11 2006-01-30 11:15:27 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: textsplit.h,v 1.12 2006-11-12 08:35:11 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
#include <string>
|
21 |
#include <string>
|
22 |
#ifndef NO_NAMESPACES
|
22 |
#ifndef NO_NAMESPACES
|
23 |
using std::string;
|
23 |
using std::string;
|
24 |
#endif
|
24 |
#endif
|
|
... |
|
... |
42 |
* This uses a callback function. It could be done with an iterator instead,
|
42 |
* This uses a callback function. It could be done with an iterator instead,
|
43 |
* but it's much simpler this way...
|
43 |
* but it's much simpler this way...
|
44 |
*/
|
44 |
*/
|
45 |
class TextSplit {
|
45 |
class TextSplit {
|
46 |
public:
|
46 |
public:
|
|
|
47 |
enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2};
|
47 |
/**
|
48 |
/**
|
48 |
* Constructor: just store callback object
|
49 |
* Constructor: just store callback object
|
49 |
*/
|
50 |
*/
|
50 |
TextSplit(TextSplitCB *t, bool forquery = false)
|
51 |
TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE)
|
51 |
: fq(forquery), cb(t), maxWordLength(40), prevpos(-1) {}
|
52 |
: m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
|
52 |
/**
|
53 |
/**
|
53 |
* Split text, emit words and positions.
|
54 |
* Split text, emit words and positions.
|
54 |
*/
|
55 |
*/
|
55 |
bool text_to_words(const std::string &in);
|
56 |
bool text_to_words(const std::string &in);
|
56 |
|
57 |
|
57 |
private:
|
58 |
private:
|
58 |
bool fq; // for query: Are we splitting for query or index ?
|
59 |
Flags m_flags;
|
59 |
TextSplitCB *cb;
|
60 |
TextSplitCB *cb;
|
60 |
int maxWordLength;
|
61 |
int maxWordLength;
|
61 |
|
62 |
|
62 |
string span; // Current span. Might be jf.dockes@wanadoo.f
|
63 |
string span; // Current span. Might be jf.dockes@wanadoo.f
|
63 |
string word; // Current word: no punctuation at all in there
|
64 |
string word; // Current word: no punctuation at all in there
|