|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
22 |
#include <string>
|
22 |
#include <string>
|
23 |
#include <vector>
|
23 |
#include <vector>
|
24 |
|
24 |
|
25 |
using std::string;
|
25 |
using std::string;
|
26 |
using std::vector;
|
26 |
using std::vector;
|
|
|
27 |
using std::pair;
|
27 |
|
28 |
|
28 |
class Utf8Iter;
|
29 |
class Utf8Iter;
|
29 |
|
30 |
|
30 |
/**
|
31 |
/**
|
31 |
* Split text into words.
|
32 |
* Split text into words.
|
|
... |
|
... |
53 |
static void noNumbers()
|
54 |
static void noNumbers()
|
54 |
{
|
55 |
{
|
55 |
o_noNumbers = true;
|
56 |
o_noNumbers = true;
|
56 |
}
|
57 |
}
|
57 |
|
58 |
|
58 |
enum Flags {TXTS_NONE = 0,
|
59 |
enum Flags {
|
59 |
TXTS_ONLYSPANS = 1, // Only return maximum spans (a@b.com)
|
60 |
// Default: will return spans and words (a_b, a, b)
|
60 |
TXTS_NOSPANS = 2, // Only return atomic words (a, b, com)
|
61 |
TXTS_NONE = 0,
|
61 |
TXTS_KEEPWILD = 4 // Handle wildcards as letters
|
62 |
// Only return maximum spans (a@b.com, not a, b, or com)
|
|
|
63 |
TXTS_ONLYSPANS = 1,
|
|
|
64 |
// Special: Only return atomic words (a, b, com). This is not
|
|
|
65 |
// used for indexing, but for position computation during
|
|
|
66 |
// abstract generation,
|
|
|
67 |
TXTS_NOSPANS = 2,
|
|
|
68 |
// Handle wildcards as letters. This is used with ONLYSPANS
|
|
|
69 |
// for parsing a user query (never alone).
|
|
|
70 |
TXTS_KEEPWILD = 4
|
62 |
};
|
71 |
};
|
63 |
|
|
|
64 |
|
72 |
|
65 |
TextSplit(Flags flags = Flags(TXTS_NONE))
|
73 |
TextSplit(Flags flags = Flags(TXTS_NONE))
|
66 |
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
74 |
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
67 |
{
|
75 |
{
|
68 |
}
|
76 |
}
|
|
... |
|
... |
175 |
int m_maxWordLength;
|
183 |
int m_maxWordLength;
|
176 |
|
184 |
|
177 |
// Current span. Might be jf.dockes@wanadoo.f
|
185 |
// Current span. Might be jf.dockes@wanadoo.f
|
178 |
string m_span;
|
186 |
string m_span;
|
179 |
|
187 |
|
|
|
188 |
vector <pair<unsigned int, unsigned int> > m_words_in_span;
|
|
|
189 |
|
180 |
// Current word: no punctuation at all in there. Byte offset
|
190 |
// Current word: no punctuation at all in there. Byte offset
|
181 |
// relative to the current span and byte length
|
191 |
// relative to the current span and byte length
|
182 |
int m_wordStart;
|
192 |
int m_wordStart;
|
183 |
unsigned int m_wordLen;
|
193 |
unsigned int m_wordLen;
|
184 |
|
194 |
|
|
... |
|
... |
205 |
|
215 |
|
206 |
// This processes cjk text:
|
216 |
// This processes cjk text:
|
207 |
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
217 |
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
208 |
|
218 |
|
209 |
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
219 |
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
210 |
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
220 |
bool doemit(bool spanerase, int bp);
|
211 |
void discardspan();
|
221 |
void discardspan();
|
|
|
222 |
bool span_is_acronym(std::string *acronym);
|
|
|
223 |
bool words_from_span();
|
212 |
};
|
224 |
};
|
213 |
|
225 |
|
214 |
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
226 |
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|