Switch to unified view

a/src/common/textsplit.h b/src/common/textsplit.h
...
...
14
 *   Free Software Foundation, Inc.,
14
 *   Free Software Foundation, Inc.,
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16
 */
16
 */
17
#ifndef _TEXTSPLIT_H_INCLUDED_
17
#ifndef _TEXTSPLIT_H_INCLUDED_
18
#define _TEXTSPLIT_H_INCLUDED_
18
#define _TEXTSPLIT_H_INCLUDED_
19
/* @(#$Id: textsplit.h,v 1.16 2007-01-18 12:09:58 dockes Exp $  (C) 2004 J.F.Dockes */
19
/* @(#$Id: textsplit.h,v 1.17 2007-09-18 20:35:31 dockes Exp $  (C) 2004 J.F.Dockes */
20
20
21
#include <string>
21
#include <string>
22
#ifndef NO_NAMESPACES
22
#ifndef NO_NAMESPACES
23
using std::string;
23
using std::string;
24
#endif
24
#endif
...
...
42
 * This uses a callback function. It could be done with an iterator instead,
42
 * This uses a callback function. It could be done with an iterator instead,
43
 * but it's much simpler this way...
43
 * but it's much simpler this way...
44
 */
44
 */
45
class TextSplit {
45
class TextSplit {
46
public:
46
public:
47
    enum Flags {TXTS_NONE = 0, TXTS_ONLYSPANS = 1, TXTS_NOSPANS = 2, 
47
    enum Flags {TXTS_NONE = 0, 
48
      TXTS_KEEPWILD = 4};
48
      TXTS_ONLYSPANS = 1,  // Only return maximum spans (a@b.com) 
49
      TXTS_NOSPANS = 2,  // Only return atomic words (a, b, com)
50
      TXTS_KEEPWILD = 4 // Handle wildcards as letters
51
    };
52
49
    /**
53
    /**
50
     * Constructor: just store callback object
54
     * Constructor: just store callback object
51
     */
55
     */
52
    TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE) 
56
    TextSplit(TextSplitCB *t, Flags flags = TXTS_NONE) 
53
    : m_flags(flags), cb(t), maxWordLength(40), prevpos(-1) {}
57
    : m_flags(flags), m_cb(t), m_maxWordLength(40), m_prevpos(-1) {}
58
54
    /**
59
    /**
55
     * Split text, emit words and positions.
60
     * Split text, emit words and positions.
56
     */
61
     */
57
    bool text_to_words(const string &in);
62
    bool text_to_words(const string &in);
58
63
59
    // Utility functions: these do not need the user to set up a callback
64
    // Utility functions: these do not need the user to set up a callback
60
    // etc.
65
    // etc.
61
    static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
66
    static int countWords(const string &in, Flags flgs = TXTS_ONLYSPANS);
62
67
63
private:
68
private:
64
    Flags m_flags;
69
    Flags         m_flags;
65
    TextSplitCB *cb;
70
    TextSplitCB  *m_cb;
66
    int maxWordLength;
71
    int           m_maxWordLength;
67
72
68
    string span; // Current span. Might be jf.dockes@wanadoo.f
73
    // Current span. Might be jf.dockes@wanadoo.f
74
    string        m_span; 
75
69
    int wordStart; // Current word: no punctuation at all in there
76
    // Current word: no punctuation at all in there
77
    int           m_wordStart;
70
    unsigned int wordLen;
78
    unsigned int  m_wordLen;
71
    bool number;
79
80
    // Currently inside number
81
    bool          m_inNumber;
82
72
    int wordpos; // Term position of current word
83
    // Term position of current word and span
73
    int spanpos; // Term position of current span
84
    int           m_wordpos; 
85
    int           m_spanpos;
74
86
75
    // It may happen that our cleanup would result in emitting the
87
    // It may happen that our cleanup would result in emitting the
76
    // same term twice. We try to avoid this
88
    // same term twice. We try to avoid this
77
    int prevpos;
89
    int           m_prevpos;
78
    unsigned int prevlen;
90
    unsigned int  m_prevlen;
91
92
    // This processes cjk text:
93
    // bool cjk_to_words();
79
94
80
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
95
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
81
    bool doemit(bool spanerase, int bp);
96
    bool doemit(bool spanerase, int bp);
82
83
};
97
};
84
98
85
99
86
#endif /* _TEXTSPLIT_H_INCLUDED_ */
100
#endif /* _TEXTSPLIT_H_INCLUDED_ */