Switch to unified view

a/src/common/textsplit.h b/src/common/textsplit.h
...
...
22
#include <string>
22
#include <string>
23
#include <vector>
23
#include <vector>
24
24
25
using std::string;
25
using std::string;
26
using std::vector;
26
using std::vector;
27
using std::pair;
27
28
28
class Utf8Iter;
29
class Utf8Iter;
29
30
30
/** 
31
/** 
31
 * Split text into words. 
32
 * Split text into words. 
...
...
53
    static void noNumbers()
54
    static void noNumbers()
54
    {
55
    {
55
    o_noNumbers = true;
56
    o_noNumbers = true;
56
    }
57
    }
57
58
58
    enum Flags {TXTS_NONE = 0, 
59
    enum Flags {
59
      TXTS_ONLYSPANS = 1,  // Only return maximum spans (a@b.com) 
60
        // Default: will return spans and words (a_b, a, b)
60
      TXTS_NOSPANS = 2,  // Only return atomic words (a, b, com)
61
        TXTS_NONE = 0, 
61
      TXTS_KEEPWILD = 4 // Handle wildcards as letters
62
        // Only return maximum spans (a@b.com, not a, b, or com) 
63
        TXTS_ONLYSPANS = 1,  
64
        // Special: Only return atomic words (a, b, com).  This is not
65
        // used for indexing, but for position computation during
66
        // abstract generation,
67
        TXTS_NOSPANS = 2,  
68
        // Handle wildcards as letters. This is used with ONLYSPANS
69
        // for parsing a user query (never alone).
70
        TXTS_KEEPWILD = 4 
62
    };
71
    };
63
64
    
72
    
65
    TextSplit(Flags flags = Flags(TXTS_NONE))
73
    TextSplit(Flags flags = Flags(TXTS_NONE))
66
    : m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
74
    : m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
67
    {
75
    {
68
    }
76
    }
...
...
175
    int           m_maxWordLength;
183
    int           m_maxWordLength;
176
184
177
    // Current span. Might be jf.dockes@wanadoo.f
185
    // Current span. Might be jf.dockes@wanadoo.f
178
    string        m_span; 
186
    string        m_span; 
179
187
188
    vector <pair<unsigned int, unsigned int> > m_words_in_span;
189
180
    // Current word: no punctuation at all in there. Byte offset
190
    // Current word: no punctuation at all in there. Byte offset
181
    // relative to the current span and byte length
191
    // relative to the current span and byte length
182
    int           m_wordStart;
192
    int           m_wordStart;
183
    unsigned int  m_wordLen;
193
    unsigned int  m_wordLen;
184
194
...
...
205
215
206
    // This processes cjk text:
216
    // This processes cjk text:
207
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
217
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
208
218
209
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
219
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
210
    bool doemit(bool spanerase, int bp, bool spanemit=false);
220
    bool doemit(bool spanerase, int bp);
211
    void discardspan();
221
    void discardspan();
222
    bool span_is_acronym(std::string *acronym);
223
    bool words_from_span();
212
};
224
};
213
225
214
#endif /* _TEXTSPLIT_H_INCLUDED_ */
226
#endif /* _TEXTSPLIT_H_INCLUDED_ */