|
a/src/common/textsplit.h |
|
b/src/common/textsplit.h |
|
... |
|
... |
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
17 |
#ifndef _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
18 |
#define _TEXTSPLIT_H_INCLUDED_
|
19 |
|
19 |
|
|
|
20 |
#include <math.h>
|
|
|
21 |
|
20 |
#include <string>
|
22 |
#include <string>
|
21 |
#include <vector>
|
23 |
#include <vector>
|
22 |
|
24 |
|
23 |
using std::string;
|
25 |
using std::string;
|
24 |
using std::vector;
|
26 |
using std::vector;
|
|
... |
|
... |
64 |
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
66 |
: m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
|
65 |
{
|
67 |
{
|
66 |
}
|
68 |
}
|
67 |
virtual ~TextSplit() {}
|
69 |
virtual ~TextSplit() {}
|
68 |
|
70 |
|
|
|
71 |
virtual void setMaxWordLength(int l)
|
|
|
72 |
{
|
|
|
73 |
m_maxWordLength = l;
|
|
|
74 |
}
|
69 |
/** Split text, emit words and positions. */
|
75 |
/** Split text, emit words and positions. */
|
70 |
virtual bool text_to_words(const string &in);
|
76 |
virtual bool text_to_words(const string &in);
|
71 |
|
77 |
|
72 |
/** Process one output word: to be implemented by the actual user class */
|
78 |
/** Process one output word: to be implemented by the actual user class */
|
73 |
virtual bool takeword(const string& term,
|
79 |
virtual bool takeword(const string& term,
|
|
... |
|
... |
101 |
static bool stringToStrings(const string &s, vector<string> &tokens);
|
107 |
static bool stringToStrings(const string &s, vector<string> &tokens);
|
102 |
|
108 |
|
103 |
/** Is char CJK ? */
|
109 |
/** Is char CJK ? */
|
104 |
static bool isCJK(int c);
|
110 |
static bool isCJK(int c);
|
105 |
|
111 |
|
|
|
112 |
/** Statistics about word length (average and dispersion) can
|
|
|
113 |
* detect bad data like undecoded base64 or other mis-identified
|
|
|
114 |
* pieces of data taken as text. In practice, this keeps some junk out
|
|
|
115 |
* of the index, but does not decrease the index size much, and is
|
|
|
116 |
* probably not worth the trouble in general. Code kept because it
|
|
|
117 |
* probably can be useful in special cases. Base64 data does have
|
|
|
118 |
* word separators in it (+/) and is characterised by high average
|
|
|
119 |
* word length (>10, often close to 20) and high word length
|
|
|
120 |
* dispersion (avg/sigma > 0.8). In my tests, most natural
|
|
|
121 |
* language text has average word lengths around 5-8 and avg/sigma
|
|
|
122 |
* < 0.7
|
|
|
123 |
*/
|
|
|
124 |
#ifdef TEXTSPLIT_STATS
|
|
|
125 |
/**
 * Accumulator for word length statistics: number of samples, average
 * length and a dispersion estimate.
 *
 * Note that the dispersion is accumulated incrementally: each sample's
 * deviation is measured against the running average at the time the
 * sample is recorded, not against the final average, so the resulting
 * sigma is an approximation of the standard deviation.
 */
class Stats {
public:
    /** Build an empty accumulator (all counters zeroed). */
    Stats()
    {
        reset();
    }

    /** Forget all recorded samples. */
    void reset()
    {
        count = 0;
        totlen = 0;
        sigma_acc = 0;
    }

    /**
     * Record one sample.
     * @param len length of the word to account for.
     */
    void newsamp(unsigned int len)
    {
        ++count;
        totlen += len;
        // Deviation is taken against the average *including* this sample.
        const double avglen = double(totlen) / double(count);
        const double delta = avglen - len;
        sigma_acc += delta * delta;
    }

    /** Snapshot of the computed statistics. */
    struct Values {
        int count;      ///< Number of samples recorded.
        double avglen;  ///< Average sample length.
        double sigma;   ///< Dispersion estimate (see class comment).
    };

    /**
     * Compute the current statistics.
     * @return sample count, average length and sigma. When no sample
     *   has been recorded, all values are zero (instead of the NaN which
     *   a 0/0 division would yield).
     */
    Values get() const
    {
        Values v;
        v.count = count;
        if (count <= 0) {
            // Nothing recorded: avoid dividing by zero.
            v.avglen = 0.0;
            v.sigma = 0.0;
            return v;
        }
        v.avglen = double(totlen) / double(count);
        v.sigma = sqrt(sigma_acc / count);
        return v;
    }

private:
    int count;         // Number of samples seen since last reset()
    int totlen;        // Sum of all sample lengths
    double sigma_acc;  // Sum of squared deviations from the running average
};
|
|
|
162 |
|
|
|
163 |
/** Return the values currently computed by the m_stats accumulator. */
Stats::Values getStats()
{
    Stats::Values snapshot = m_stats.get();
    return snapshot;
}
|
|
|
167 |
void resetStats()
|
|
|
168 |
{
|
|
|
169 |
m_stats.reset();
|
|
|
170 |
}
|
|
|
171 |
#endif // TEXTSPLIT_STATS
|
|
|
172 |
|
106 |
private:
|
173 |
private:
|
107 |
Flags m_flags;
|
174 |
Flags m_flags;
|
108 |
int m_maxWordLength;
|
175 |
int m_maxWordLength;
|
109 |
|
176 |
|
110 |
// Current span. Might be jf.dockes@wanadoo.f
|
177 |
// Current span. Might be jf.dockes@wanadoo.f
|
|
... |
|
... |
125 |
// It may happen that our cleanup would result in emitting the
|
192 |
// It may happen that our cleanup would result in emitting the
|
126 |
// same term twice. We try to avoid this
|
193 |
// same term twice. We try to avoid this
|
127 |
int m_prevpos;
|
194 |
int m_prevpos;
|
128 |
unsigned int m_prevlen;
|
195 |
unsigned int m_prevlen;
|
129 |
|
196 |
|
|
|
197 |
#ifdef TEXTSPLIT_STATS
|
|
|
198 |
// Stats counters. These are processed in TextSplit rather than by a
|
|
|
199 |
// TermProc so that we can take very long words (not emitted) into
|
|
|
200 |
// account.
|
|
|
201 |
Stats m_stats;
|
|
|
202 |
#endif
|
|
|
203 |
// Word length in characters. Declared but not updated if !TEXTSPLIT_STATS
|
|
|
204 |
unsigned int m_wordChars;
|
|
|
205 |
|
130 |
// This processes cjk text:
|
206 |
// This processes cjk text:
|
131 |
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
207 |
bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
|
132 |
|
208 |
|
133 |
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
209 |
bool emitterm(bool isspan, string &term, int pos, int bs, int be);
|
134 |
bool doemit(bool spanerase, int bp, bool spanemit=false);
|
210 |
bool doemit(bool spanerase, int bp, bool spanemit=false);
|