|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
182 |
int m_maxWordLength;
|
182 |
int m_maxWordLength;
|
183 |
|
183 |
|
184 |
// Current span. Might be jf.dockes@wanadoo.f
|
184 |
// Current span. Might be jf.dockes@wanadoo.f
|
185 |
std::string m_span;
|
185 |
std::string m_span;
|
186 |
|
186 |
|
187 |
std::vector <std::pair<unsigned int, unsigned int> > m_words_in_span;
|
187 |
std::vector <std::pair<int, int> > m_words_in_span;
|
188 |
|
188 |
|
189 |
// Current word: no punctuation at all in there. Byte offset
|
189 |
// Current word: no punctuation at all in there. Byte offset
|
190 |
// relative to the current span and byte length
|
190 |
// relative to the current span and byte length
|
191 |
int m_wordStart;
|
191 |
int m_wordStart;
|
192 |
unsigned int m_wordLen;
|
192 |
unsigned int m_wordLen;
|
|
... |
|
... |
199 |
int m_spanpos;
|
199 |
int m_spanpos;
|
200 |
|
200 |
|
201 |
// It may happen that our cleanup would result in emitting the
|
201 |
// It may happen that our cleanup would result in emitting the
|
202 |
// same term twice. We try to avoid this
|
202 |
// same term twice. We try to avoid this
|
203 |
int m_prevpos;
|
203 |
int m_prevpos;
|
204 |
unsigned int m_prevlen;
|
204 |
int m_prevlen;
|
205 |
|
205 |
|
206 |
#ifdef TEXTSPLIT_STATS
|
206 |
#ifdef TEXTSPLIT_STATS
|
207 |
// Stats counters. These are processed in TextSplit rather than by a
|
207 |
// Stats counters. These are processed in TextSplit rather than by a
|
208 |
// TermProc so that we can take very long words (not emitted) into
|
208 |
// TermProc so that we can take very long words (not emitted) into
|
209 |
// account.
|
209 |
// account.
|
|
... |
|
... |
213 |
unsigned int m_wordChars;
|
213 |
unsigned int m_wordChars;
|
214 |
|
214 |
|
215 |
// This processes cjk text:
|
215 |
// This processes cjk text:
|
216 |
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
216 |
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
217 |
|
217 |
|
218 |
bool emitterm(bool isspan, std::string &term, int pos, int bs, int be);
|
218 |
bool emitterm(bool isspan, std::string &term, int pos, size_t bs,size_t be);
|
219 |
bool doemit(bool spanerase, int bp);
|
219 |
bool doemit(bool spanerase, size_t bp);
|
220 |
void discardspan();
|
220 |
void discardspan();
|
221 |
bool span_is_acronym(std::string *acronym);
|
221 |
bool span_is_acronym(std::string *acronym);
|
222 |
bool words_from_span(int bp);
|
222 |
bool words_from_span(size_t bp);
|
223 |
};
|
223 |
};
|
224 |
|
224 |
|
225 |
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|
225 |
#endif /* _TEXTSPLIT_H_INCLUDED_ */
|