Switch to unified view

a/src/utils/utf8iter.h b/src/utils/utf8iter.h
...
...
14
 *   Free Software Foundation, Inc.,
14
 *   Free Software Foundation, Inc.,
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16
 */
16
 */
17
#ifndef _UTF8ITER_H_INCLUDED_
17
#ifndef _UTF8ITER_H_INCLUDED_
18
#define _UTF8ITER_H_INCLUDED_
18
#define _UTF8ITER_H_INCLUDED_
19
/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $  (C) 2004 J.F.Dockes */
19
/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $  (C) 2004 J.F.Dockes */
20
20
21
/** 
21
/** 
22
 * A small helper class to iterate over utf8 strings. This is not an
22
 * A small helper class to iterate over utf8 strings. This is not an
23
 * STL iterator and this is not well designed, just convenient for
23
 * STL iterator and does not do much error checking. It is designed purely
24
   some specific uses
24
 * for recoll usage, where the utf-8 string comes out of iconv in most cases
25
 * and is assumed legal. We just try to catch cases where there would be 
26
 * a risk of crash.
25
 */
27
 */
26
class Utf8Iter {
28
class Utf8Iter {
27
    unsigned int cl; // Char length at current position if known
28
    const string &s; // String we're working with
29
    string::size_type pos; // Current position in string
30
    unsigned int m_charpos; // Current character position
31
32
    // Get character byte length at specified position
33
    inline int get_cl(string::size_type p) const {
        // Classify the byte at offset p as the lead byte of a utf-8
        // sequence and return that sequence's length in bytes (1 to 6),
        // or -1 when the byte cannot start a sequence.
        const unsigned int lead = (unsigned char)s[p];
        if (lead < 128)
            return 1;       // 0xxxxxxx: single byte
        if (lead < 192)
            return -1;      // 10xxxxxx: continuation byte, not a lead
        if (lead < 224)
            return 2;
        if (lead < 240)
            return 3;
        if (lead < 248)
            return 4;
        if (lead < 252)
            return 5;
        if (lead < 254)
            return 6;
        return -1;          // 254 and 255 are not accepted as lead bytes
    }
50
    // Check position and cl against string length
51
    bool poslok(string::size_type p, int l) const {
        // Reject a missing offset or a non-positive length first, then
        // require the whole sequence to end inside the string.
        if (p == string::npos || l <= 0)
            return false;
        return p + l <= s.length();
    }
54
    // Update current char length in object state. Assumes pos is inside string
55
    inline int compute_cl() {
56
  cl = 0;
57
  cl = get_cl(pos);
58
  if (!poslok(pos, cl)) {
59
      pos = s.length();
60
      cl = 0;
61
      return -1;
62
  }
63
  return 0;
64
    }
65
    // Compute value at given position
66
    inline unsigned int getvalueat(string::size_type p, int l) const {
        // Decode the l-byte sequence starting at offset p into its integer
        // value. Lengths outside 1..6 yield (unsigned int)-1. The bytes
        // themselves are not validated.
        if (l < 1 || l > 6)
            return (unsigned int)-1;

        // Pull the sequence bytes out once, as ints, so the arithmetic
        // below is carried out in int exactly as written.
        int b[6] = {0, 0, 0, 0, 0, 0};
        for (int i = 0; i < l; i++)
            b[i] = (unsigned char)s[p + i];

        switch (l) {
        case 1:
            return b[0];
        case 2:
            return (b[0] - 192) * 64 + b[1] - 128;
        case 3:
            return (b[0] - 224) * 4096 + (b[1] - 128) * 64 + b[2] - 128;
        case 4:
            return (b[0] - 240) * 262144 + (b[1] - 128) * 4096 +
                (b[2] - 128) * 64 + b[3] - 128;
        case 5:
            return (b[0] - 248) * 16777216 + (b[1] - 128) * 262144 +
                (b[2] - 128) * 4096 + (b[3] - 128) * 64 + b[4] - 128;
        case 6:
            return (b[0] - 252) * 1073741824 + (b[1] - 128) * 16777216 +
                (b[2] - 128) * 262144 + (b[3] - 128) * 4096 +
                (b[4] - 128) * 64 + b[5] - 128;
        }
        return (unsigned int)-1;
    }
93
 public:
29
public:
94
    Utf8Iter(const string &in) 
30
    Utf8Iter(const string &in) 
95
  : cl(0), s(in), pos(0), m_charpos(0) 
31
  : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
96
  {
32
    {
97
      // Ensure state is ok if appendchartostring is called at once
98
        compute_cl();
33
    compute_cl();
99
  }
34
    }
100
35
101
    void rewind() {
36
    void rewind() 
102
  cl=0; pos=0; m_charpos=0;
103
    }
37
    {
104
    /** operator* returns the ucs4 value as a machine integer*/
38
  m_cl = 0; 
105
    unsigned int operator*() {
39
  m_pos = 0; 
106
  if (!cl && compute_cl() < 0)
40
  m_charpos = 0; 
107
      return (unsigned int)-1;
41
  m_error = false;
108
  unsigned int val = getvalueat(pos, cl);
42
  compute_cl();
109
  if (val == (unsigned int)-1) {
110
      pos = s.length();
111
      cl = 0;
112
  }
113
  return val;
114
    }
43
    }
44
115
    /** "Direct" access. Awfully inefficient as we skip from start or current
45
    /** "Direct" access. Awfully inefficient as we skip from start or current
116
     * position at best. This can only be useful for a lookahead from the
46
     * position at best. This can only be useful for a lookahead from the
117
     * current position */
47
     * current position */
118
    unsigned int operator[](unsigned int charpos) const {
48
    unsigned int operator[](unsigned int charpos) const 
49
    {
119
    string::size_type mypos = 0;
50
    string::size_type mypos = 0;
120
    unsigned int mycp = 0;;
51
    unsigned int mycp = 0;
121
    if (charpos >= m_charpos) {
52
    if (charpos >= m_charpos) {
122
        mypos = pos;
53
        mypos = m_pos;
123
        mycp = m_charpos;
54
        mycp = m_charpos;
124
    }
55
    }
56
  int l;
125
    while (mypos < s.length() && mycp != charpos) {
57
    while (mypos < m_s.length() && mycp != charpos) {
126
        mypos += get_cl(mypos);
58
        l = get_cl(mypos);
59
      if (l < 0)
60
      return (unsigned int)-1;
61
      mypos += l;
127
        ++mycp;
62
        ++mycp;
128
    }
63
    }
129
    if (mypos < s.length() && mycp == charpos) {
64
    if (mypos < m_s.length() && mycp == charpos) {
130
        int l = get_cl(mypos);
65
        l = get_cl(mypos);
131
        if (poslok(mypos, l))
66
        if (poslok(mypos, l))
132
        return getvalueat(mypos, get_cl(mypos));
67
        return getvalueat(mypos, get_cl(mypos));
133
    }
68
    }
134
    return (unsigned int)-1;
69
    return (unsigned int)-1;
135
    }
70
    }
136
71
137
    /** Set current position before next utf-8 character */
72
    /** Increment current position to next utf-8 char */
138
    string::size_type operator++(int) {
73
    string::size_type operator++(int) 
139
  if (!cl && compute_cl() < 0) {
74
    {
75
  // Note: m_cl may be zero at eof if user's test not right
76
  // this shouldn't crash the program until actual data access
77
#ifdef UTF8ITER_CHECK
78
  assert(m_cl != 0);
79
#endif
80
  if (m_cl == 0) 
140
        return pos = string::npos;
81
        return string::npos;
141
  }
82
142
    pos += cl;
83
    m_pos += m_cl;
143
    m_charpos++;
84
    m_charpos++;
144
  cl = 0;
85
  compute_cl();
145
    return pos;
86
    return m_pos;
87
    }
88
89
    /** operator* returns the ucs4 value as a machine integer*/
90
    unsigned int operator*() 
146
    }
91
    {
92
#ifdef UTF8ITER_CHECK
93
  assert(m_cl != 0);
94
#endif
95
  return getvalueat(m_pos, m_cl);
96
    }
97
98
    /** Append current utf-8 possibly multi-byte character to string param.
147
    /** This needs to be fast. No error checking. */
99
  This needs to be fast. No error checking. */
148
    void appendchartostring(string &out) {
100
    unsigned int appendchartostring(string &out) {
101
#ifdef UTF8ITER_CHECK
102
  assert(m_cl != 0);
103
#endif
149
    out.append(&s[pos], cl);
104
    out.append(&m_s[m_pos], m_cl);
105
  return m_cl;
150
    }
106
    }
107
108
    /** Return current character as string */
151
    operator string() {
109
    operator string() {
152
  if (!cl && compute_cl() < 0) {
110
#ifdef UTF8ITER_CHECK
153
      return std::string("");
111
  assert(m_cl != 0);
154
  }
112
#endif
155
    return s.substr(pos, cl);
113
    return m_s.substr(m_pos, m_cl);
156
    }
114
    }
115
157
    bool eof() {
116
    bool eof() {
158
  // Note: we always ensure that pos == s.length() when setting bad to 
159
  // true
160
    return pos == s.length();
117
    return m_pos == m_s.length();
161
    }
118
    }
119
162
    bool error() {
120
    bool error() {
163
  return compute_cl() < 0;
121
  return m_error;
164
    }
122
    }
123
165
    string::size_type getBpos() const {
124
    string::size_type getBpos() const {
166
    return pos;
125
    return m_pos;
167
    }
126
    }
127
168
    string::size_type getCpos() const {
128
    string::size_type getCpos() const {
169
    return m_charpos;
129
    return m_charpos;
170
    }
130
    }
131
132
private:
133
    // String we're working with
134
    const string&     m_s; 
135
    // Character length at current position. A value of zero indicates
136
    // unknown or error.
137
    unsigned int      m_cl; 
138
    // Current byte offset in string.
139
    string::size_type m_pos; 
140
    // Current character position
141
    unsigned int      m_charpos; 
142
    mutable bool      m_error;
143
144
    // Check position and cl against string length
145
    bool poslok(string::size_type p, int l) const {
#ifdef UTF8ITER_CHECK
        // Checking build: an unusable position/length pair aborts here.
        assert(p != string::npos && l > 0 && p + l <= m_s.length());
#endif
        // Reject a missing offset or a non-positive length first, then
        // require the whole sequence to end inside the string.
        if (p == string::npos || l <= 0)
            return false;
        return p + l <= m_s.length();
    }
151
152
    // Update current char length in object state, minimum checking for 
153
    // errors
154
    inline int compute_cl() 
155
    {
156
  m_cl = 0;
157
  if (m_pos == m_s.length())
158
      return -1;
159
  m_cl = get_cl(m_pos);
160
  if (!poslok(m_pos, m_cl)) {
161
      m_pos = m_s.length();
162
      m_cl = 0;
163
      m_error = true;
164
      return -1;
165
  }
166
  return 0;
167
    }
168
169
    // Get character byte length at specified position
170
    inline int get_cl(string::size_type p) const 
    {
        // Classify the byte at offset p as the lead byte of a utf-8
        // sequence and return that sequence's length in bytes (1 to 4),
        // or -1 when the byte cannot start such a sequence.
        const unsigned int z = (unsigned char)m_s[p];
        if (z < 128)
            return 1;           // 0xxxxxxx
        if (z >= 192) {
            if (z < 224)
                return 2;       // 110xxxxx
            if (z < 240)
                return 3;       // 1110xxxx
            if (z < 248)
                return 4;       // 11110xxx
        }
#ifdef UTF8ITER_CHECK
        // Checking build: reaching this point means none of the accepted
        // lead byte patterns matched, so this assertion fails.
        assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
               (z & 248) == 240);
#endif
        return -1;
    }
188
189
    // Compute value at given position. No error checking.
190
    inline unsigned int getvalueat(string::size_type p, int l) const
    {
        // Decode the l-byte sequence starting at offset p into its integer
        // value. Any length outside 1..4 sets m_error and yields
        // (unsigned int)-1. Byte contents are only verified in the
        // UTF8ITER_CHECK build.

        // Pull the sequence bytes out once, as ints, so the arithmetic
        // below is carried out in int.
        int b[4] = {0, 0, 0, 0};
        if (l >= 1 && l <= 4) {
            for (int i = 0; i < l; i++)
                b[i] = (unsigned char)m_s[p + i];
        }

        switch (l) {
        case 1:
#ifdef UTF8ITER_CHECK
            assert((unsigned char)m_s[p] < 128);
#endif
            return b[0];

        case 2:
#ifdef UTF8ITER_CHECK
            assert(
                   ((unsigned char)m_s[p] & 224) == 192
                   && ((unsigned char)m_s[p+1] & 192) ==  128
                   );
#endif
            return (b[0] - 192) * 64 + b[1] - 128;

        case 3:
#ifdef UTF8ITER_CHECK
            assert(
                   (((unsigned char)m_s[p]) & 240) == 224
                   && (((unsigned char)m_s[p+1]) & 192) ==  128
                   && (((unsigned char)m_s[p+2]) & 192) ==  128
                   );
#endif
            return (b[0] - 224) * 4096 + (b[1] - 128) * 64 + b[2] - 128;

        case 4:
#ifdef UTF8ITER_CHECK
            assert(
                   (((unsigned char)m_s[p]) & 248) == 240
                   && (((unsigned char)m_s[p+1]) & 192) ==  128
                   && (((unsigned char)m_s[p+2]) & 192) ==  128
                   && (((unsigned char)m_s[p+3]) & 192) ==  128
                   );
#endif
            return (b[0] - 240) * 262144 + (b[1] - 128) * 4096 +
                (b[2] - 128) * 64 + b[3] - 128;

        default:
#ifdef UTF8ITER_CHECK
            assert(l <= 4);
#endif
            // Unsupported length: record the error for error() to report.
            m_error = true;
            return (unsigned int)-1;
        }
    }
242
171
};
243
};
172
244
173
245
174
#endif /* _UTF8ITER_H_INCLUDED_ */
246
#endif /* _UTF8ITER_H_INCLUDED_ */