Switch to unified view

a/src/common/textsplit.h b/src/common/textsplit.h
...
...
22
#include <list>
22
#include <list>
23
#ifndef NO_NAMESPACES
23
#ifndef NO_NAMESPACES
24
using std::string;
24
using std::string;
25
using std::list;
25
using std::list;
26
#endif
26
#endif
27
28
/**
29
 * Function class whose takeword method is called for every detected word while splitting text.
30
 */
31
class TextSplitCB {
32
public:
33
    virtual ~TextSplitCB() {}
34
    virtual bool takeword(const string& term, 
35
            int pos,  // term pos
36
            int bts,  // byte offset of first char in term
37
            int bte   // byte offset of first char after term
38
            ) = 0; 
39
};
40
27
41
class Utf8Iter;
28
class Utf8Iter;
42
29
43
30
44
/** 
31
/** 
...
...
65
        TXTS_ONLYSPANS = 1,  // Only return maximum spans (a@b.com) 
52
        TXTS_ONLYSPANS = 1,  // Only return maximum spans (a@b.com) 
66
        TXTS_NOSPANS = 2,  // Only return atomic words (a, b, com)
53
        TXTS_NOSPANS = 2,  // Only return atomic words (a, b, com)
67
        TXTS_KEEPWILD = 4 // Handle wildcards as letters
54
        TXTS_KEEPWILD = 4 // Handle wildcards as letters
68
    };
55
    };
69
56
70
    /**
57
    
71
     * Constructor: just store callback object
72
     */
73
    TextSplit(TextSplitCB *t, Flags flags = Flags(TXTS_NONE))
58
    TextSplit(Flags flags = Flags(TXTS_NONE))
74
    : m_flags(flags), m_cb(t), m_maxWordLength(40), 
59
    : m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
75
    m_prevpos(-1)
76
    {
60
    {
77
    }
61
    }
62
    virtual ~TextSplit() {}
78
63
79
    /** Split text, emit words and positions. */
64
    /** Split text, emit words and positions. */
80
    bool text_to_words(const string &in);
65
    bool text_to_words(const string &in);
81
66
82
    //Utility functions : these do not need the user to set up a callback 
67
    /** Process one output word: to be implemented by the actual user class */
83
    // etc.
68
    virtual bool takeword(const string& term, 
69
            int pos,  // term pos
70
            int bts,  // byte offset of first char in term
71
            int bte   // byte offset of first char after term
72
            ) = 0; 
73
74
75
    // Static utility functions:
84
76
85
    /** Count words in string, as the splitter would generate them */
77
    /** Count words in string, as the splitter would generate them */
86
    static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
78
    static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
87
79
88
    /** Check if this is visibly not a single block of text */
80
    /** Check if this is visibly not a single block of text */
...
...
100
    /** Is char CJK ? */
92
    /** Is char CJK ? */
101
    static bool isCJK(int c);
93
    static bool isCJK(int c);
102
94
103
private:
95
private:
104
    Flags         m_flags;
96
    Flags         m_flags;
105
    TextSplitCB  *m_cb;
106
    int           m_maxWordLength;
97
    int           m_maxWordLength;
107
98
108
    // Current span. Might be jf.dockes@wanadoo.f
99
    // Current span. Might be jf.dockes@wanadoo.f
109
    string        m_span; 
100
    string        m_span; 
110
101
...
...
130
121
131
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
122
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
132
    bool doemit(bool spanerase, int bp, bool spanemit=false);
123
    bool doemit(bool spanerase, int bp, bool spanemit=false);
133
};
124
};
134
125
135
136
#endif /* _TEXTSPLIT_H_INCLUDED_ */
126
#endif /* _TEXTSPLIT_H_INCLUDED_ */