Switch to unified view

a/src/utils/utf8iter.h b/src/utils/utf8iter.h
...
...
30
 * a risk of crash.
30
 * a risk of crash.
31
 */
31
 */
32
class Utf8Iter {
32
class Utf8Iter {
33
public:
33
public:
34
    Utf8Iter(const std::string &in) 
34
    Utf8Iter(const std::string &in) 
35
    : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
35
    : m_s(in), m_cl(0), m_pos(0), m_charpos(0)
36
    {
36
    {
37
    update_cl();
37
    update_cl();
38
    }
38
    }
39
39
40
    const std::string& buffer() const {return m_s;}
40
    const std::string& buffer() const {return m_s;}
...
...
42
    void rewind() 
42
    void rewind() 
43
    {
43
    {
44
    m_cl = 0; 
44
    m_cl = 0; 
45
    m_pos = 0; 
45
    m_pos = 0; 
46
    m_charpos = 0; 
46
    m_charpos = 0; 
47
  m_error = false;
48
    update_cl();
47
    update_cl();
49
    }
48
    }
50
49
51
    /** "Direct" access. Awfully inefficient as we skip from start or current
50
    /** "Direct" access. Awfully inefficient as we skip from start or current
52
     * position at best. This can only be useful for a lookahead from the
51
     * position at best. This can only be useful for a lookahead from the
...
...
60
        mycp = m_charpos;
59
        mycp = m_charpos;
61
    }
60
    }
62
    int l;
61
    int l;
63
    while (mypos < m_s.length() && mycp != charpos) {
62
    while (mypos < m_s.length() && mycp != charpos) {
64
        l = get_cl(mypos);
63
        l = get_cl(mypos);
65
      if (l <= 0)
64
      if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
66
        return (unsigned int)-1;
65
        return (unsigned int)-1;
67
        mypos += l;
66
        mypos += l;
68
        ++mycp;
67
        ++mycp;
69
    }
68
    }
70
    if (mypos < m_s.length() && mycp == charpos) {
69
    if (mypos < m_s.length() && mycp == charpos) {
71
        l = get_cl(mypos);
70
        l = get_cl(mypos);
72
      if (poslok(mypos, l))
71
      if (poslok(mypos, l) && checkvalidat(mypos, l))
73
        return getvalueat(mypos, get_cl(mypos));
72
        return getvalueat(mypos, l);
74
    }
73
    }
75
    return (unsigned int)-1;
74
    return (unsigned int)-1;
76
    }
75
    }
77
76
78
    /** Increment current position to next utf-8 char */
77
    /** Increment current position to next utf-8 char */
...
...
81
    // Note: m_cl may be zero at eof if user's test not right
80
    // Note: m_cl may be zero at eof if user's test not right
82
    // this shouldn't crash the program until actual data access
81
    // this shouldn't crash the program until actual data access
83
#ifdef UTF8ITER_CHECK
82
#ifdef UTF8ITER_CHECK
84
    assert(m_cl != 0);
83
    assert(m_cl != 0);
85
#endif
84
#endif
86
    if (m_cl <= 0) 
85
    if (m_cl == 0)
87
        return std::string::npos;
86
        return std::string::npos;
88
87
89
    m_pos += m_cl;
88
    m_pos += m_cl;
90
    m_charpos++;
89
    m_charpos++;
91
    update_cl();
90
    update_cl();
...
...
94
93
95
    /** operator* returns the ucs4 value as a machine integer*/
94
    /** operator* returns the ucs4 value as a machine integer*/
96
    unsigned int operator*() 
95
    unsigned int operator*() 
97
    {
96
    {
98
#ifdef UTF8ITER_CHECK
97
#ifdef UTF8ITER_CHECK
99
    assert(m_cl != 0);
98
    assert(m_cl > 0);
100
#endif
99
#endif
101
  return getvalueat(m_pos, m_cl);
100
  return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
102
    }
101
    }
103
102
104
    /** Append current utf-8 possibly multi-byte character to string param.
103
    /** Append current utf-8 possibly multi-byte character to string param.
105
    This needs to be fast. No error checking. */
104
    This needs to be fast. No error checking. */
106
    unsigned int appendchartostring(std::string &out) const {
105
    unsigned int appendchartostring(std::string &out) const {
...
...
114
    /** Return current character as string */
113
    /** Return current character as string */
115
    operator std::string() {
114
    operator std::string() {
116
#ifdef UTF8ITER_CHECK
115
#ifdef UTF8ITER_CHECK
117
    assert(m_cl != 0);
116
    assert(m_cl != 0);
118
#endif
117
#endif
119
  return m_s.substr(m_pos, m_cl);
118
  return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
120
    }
119
    }
121
120
122
    bool eof() {
121
    bool eof() const {
123
    return m_pos == m_s.length();
122
    return m_pos == m_s.length();
124
    }
123
    }
125
124
126
    bool error() {
125
    bool error() const {
127
  return m_error;
126
  return m_cl == 0;
128
    }
127
    }
129
128
130
    /** Return current byte offset in input string */
129
    /** Return current byte offset in input string */
131
    std::string::size_type getBpos() const {
130
    std::string::size_type getBpos() const {
132
    return m_pos;
131
    return m_pos;
...
...
145
private:
144
private:
146
    // String we're working with
145
    // String we're working with
147
    const std::string&     m_s; 
146
    const std::string&     m_s; 
148
    // Character length at current position. A value of zero indicates
147
    // Character length at current position. A value of zero indicates
149
    // an error.
148
    // an error.
150
    unsigned int      m_cl; 
149
    unsigned int m_cl;
151
    // Current byte offset in string.
150
    // Current byte offset in string.
152
    std::string::size_type m_pos; 
151
    std::string::size_type m_pos; 
153
    // Current character position
152
    // Current character position
154
    unsigned int      m_charpos; 
153
    unsigned int      m_charpos; 
155
    // Am I ok ?
156
    mutable bool      m_error;
157
154
158
    // Check position and cl against string length
155
    // Check position and cl against string length
159
    bool poslok(std::string::size_type p, int l) const {
156
    bool poslok(std::string::size_type p, int l) const {
160
#ifdef UTF8ITER_CHECK
157
#ifdef UTF8ITER_CHECK
161
    assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
158
    assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
162
#endif
159
#endif
163
    return p != std::string::npos && l > 0 && p + l <= m_s.length();
160
    return p != std::string::npos && l > 0 && p + l <= m_s.length();
164
    }
161
    }
165
162
166
    // Update current char length in object state, minimum checking
163
    // Update current char length in object state, check
167
    // for errors
164
    // for errors
168
    inline void update_cl() 
165
    inline void update_cl() 
169
    {
166
    {
170
    m_cl = 0;
167
    m_cl = 0;
171
    if (m_pos >= m_s.length())
168
    if (m_pos >= m_s.length())
...
...
174
    if (!poslok(m_pos, m_cl)) {
171
    if (!poslok(m_pos, m_cl)) {
175
        // Used to set eof here for safety, but this is bad because it
172
        // Used to set eof here for safety, but this is bad because it
176
        // basically prevents the caller to discriminate error and eof.
173
        // basically prevents the caller to discriminate error and eof.
177
        //      m_pos = m_s.length();
174
        //      m_pos = m_s.length();
178
        m_cl = 0;
175
        m_cl = 0;
179
      m_error = true;
176
      return;
177
  }
178
  if (!checkvalidat(m_pos, m_cl)) {
179
      m_cl = 0;
180
  }
181
    }
182
183
    inline bool checkvalidat(std::string::size_type p, int l) const
184
    {
185
  switch (l) {
186
  case 1: 
187
      return (unsigned char)m_s[p] < 128;
188
  case 2: 
189
      return (((unsigned char)m_s[p]) & 224) == 192
190
      && (((unsigned char)m_s[p+1]) & 192) == 128;
191
  case 3: 
192
      return (((unsigned char)m_s[p]) & 240) == 224
193
         && (((unsigned char)m_s[p+1]) & 192) ==  128
194
         && (((unsigned char)m_s[p+2]) & 192) ==  128
195
         ;
196
  case 4: 
197
      return (((unsigned char)m_s[p]) & 248) == 240
198
         && (((unsigned char)m_s[p+1]) & 192) ==  128
199
         && (((unsigned char)m_s[p+2]) & 192) ==  128
200
         && (((unsigned char)m_s[p+3]) & 192) ==  128
201
      ;
202
  default:
203
      return false;
180
    }
204
    }
181
    }
205
    }
182
206
183
    // Get character byte length at specified position. Returns 0 for error.
207
    // Get character byte length at specified position. Returns 0 for error.
184
    inline int get_cl(std::string::size_type p) const 
208
    inline int get_cl(std::string::size_type p) const 
...
...
247
271
248
    default:
272
    default:
249
#ifdef UTF8ITER_CHECK
273
#ifdef UTF8ITER_CHECK
250
        assert(l <= 4);
274
        assert(l <= 4);
251
#endif
275
#endif
252
      m_error = true;
253
        return (unsigned int)-1;
276
        return (unsigned int)-1;
254
    }
277
    }
255
    }
278
    }
256
279
257
};
280
};