Switch to unified view

a/src/common/textsplit.h b/src/common/textsplit.h
...
...
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16
 */
16
 */
17
#ifndef _TEXTSPLIT_H_INCLUDED_
17
#ifndef _TEXTSPLIT_H_INCLUDED_
18
#define _TEXTSPLIT_H_INCLUDED_
18
#define _TEXTSPLIT_H_INCLUDED_
19
19
20
#include <math.h>
21
20
#include <string>
22
#include <string>
21
#include <vector>
23
#include <vector>
22
24
23
using std::string;
25
using std::string;
24
using std::vector;
26
using std::vector;
...
...
64
    : m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
66
    : m_flags(flags), m_maxWordLength(40), m_prevpos(-1)
65
    {
67
    {
66
    }
68
    }
67
    virtual ~TextSplit() {}
69
    virtual ~TextSplit() {}
68
70
71
    /** Set the maximum word length: stores l in m_maxWordLength, which the
     *  constructor initializes to 40. No validation is performed on l.
     *  NOTE(review): presumably words longer than this are not emitted by
     *  the splitter -- confirm against the implementation file. */
    virtual void setMaxWordLength(int l)
72
    {
73
  m_maxWordLength = l;
74
    }
69
    /** Split text, emit words and positions. */
75
    /** Split text, emit words and positions. */
70
    virtual bool text_to_words(const string &in);
76
    virtual bool text_to_words(const string &in);
71
77
72
    /** Process one output word: to be implemented by the actual user class */
78
    /** Process one output word: to be implemented by the actual user class */
73
    virtual bool takeword(const string& term, 
79
    virtual bool takeword(const string& term, 
...
...
101
    static bool stringToStrings(const string &s, vector<string> &tokens);
107
    static bool stringToStrings(const string &s, vector<string> &tokens);
102
108
103
    /** Is char CJK ? */
109
    /** Is char CJK ? */
104
    static bool isCJK(int c);
110
    static bool isCJK(int c);
105
111
112
    /** Statistics about word length (average and dispersion) can
113
     * detect bad data like undecoded base64 or other mis-identified
114
     * pieces of data taken as text. In practice, this keeps some junk out
115
     * of the index, but does not decrease the index size much, and is
116
     * probably not worth the trouble in general. Code kept because it
117
     * probably can be useful in special cases. Base64 data does have
118
     * word separators in it (+/) and is characterised by high average
119
     * word length (>10, often close to 20) and high word length
120
     * dispersion (avg/sigma > 0.8). In my tests, most natural
121
     * language text has average word lengths around 5-8 and avg/sigma
122
     * < 0.7
123
     */
124
#ifdef TEXTSPLIT_STATS
125
    /**
     * Accumulator for word-length statistics: number of samples, average
     * length and a dispersion estimate (sigma).
     */
    class Stats {
    public:
        /** Start with an empty accumulator. */
        Stats()
        {
            reset();
        }

        /** Forget every sample recorded so far. */
        void reset()
        {
            count = 0;
            totlen = 0;
            sigma_acc = 0;
        }

        /** Record one sample.
         *  @param len length of the word being accounted for. */
        void newsamp(unsigned int len)
        {
            ++count;
            totlen += len;
            // The squared deviation is taken against the running average
            // (which already includes this sample), not against the final
            // average, so sigma is an incremental estimate rather than
            // the exact standard deviation of the whole sample set.
            double avglen = double(totlen) / double(count);
            sigma_acc += (avglen - len) * (avglen - len);
        }

        /** Snapshot of the accumulated statistics. */
        struct Values {
            int count;      ///< Number of samples recorded.
            double avglen;  ///< Average sample length.
            double sigma;   ///< Dispersion estimate, see newsamp().
        };

        /** Compute the current statistics.
         *  @return sample count, average length and sigma. All three are 0
         *          when no sample has been recorded, instead of the NaN
         *          values that dividing by a zero count would produce. */
        Values get() const
        {
            Values v;
            v.count = count;
            if (count <= 0) {
                // Nothing recorded since construction or the last reset():
                // avoid 0/0 and report neutral values.
                v.avglen = 0;
                v.sigma = 0;
                return v;
            }
            v.avglen = double(totlen) / double(count);
            v.sigma = sqrt(sigma_acc / count);
            return v;
        }

    private:
        int count;         // Number of samples seen
        int totlen;        // Sum of all sample lengths
        double sigma_acc;  // Sum of squared deviations from the running average
    };
162
163
    /** Return the word-length statistics accumulated in m_stats (sample
     *  count, average length and sigma), as computed by Stats::get(). */
    Stats::Values getStats()
164
    {
165
  return m_stats.get();
166
    }
167
    /** Clear the accumulated word-length statistics by resetting m_stats. */
    void resetStats()
168
    {
169
  m_stats.reset();
170
    }
171
#endif // TEXTSPLIT_STATS
172
106
private:
173
private:
107
    Flags         m_flags;
174
    Flags         m_flags;
108
    int           m_maxWordLength;
175
    int           m_maxWordLength;
109
176
110
    // Current span. Might be jf.dockes@wanadoo.f
177
    // Current span. Might be jf.dockes@wanadoo.f
...
...
125
    // It may happen that our cleanup would result in emitting the
192
    // It may happen that our cleanup would result in emitting the
126
    // same term twice. We try to avoid this
193
    // same term twice. We try to avoid this
127
    int           m_prevpos;
194
    int           m_prevpos;
128
    unsigned int  m_prevlen;
195
    unsigned int  m_prevlen;
129
196
197
#ifdef TEXTSPLIT_STATS
198
    // Stats counters. These are processed in TextSplit rather than by a 
199
    // TermProc so that we can take very long words (not emitted) into
200
    // account.
201
    Stats         m_stats;
202
#endif
203
    // Word length in characters. Declared but not updated if !TEXTSPLIT_STATS
204
    unsigned int  m_wordChars;
205
130
    // This processes cjk text:
206
    // This processes cjk text:
131
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
207
    bool cjk_to_words(Utf8Iter *it, unsigned int *cp);
132
208
133
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
209
    bool emitterm(bool isspan, string &term, int pos, int bs, int be);
134
    bool doemit(bool spanerase, int bp, bool spanemit=false);
210
    bool doemit(bool spanerase, int bp, bool spanemit=false);