Switch to unified view

a/src/utils/utf8iter.h b/src/utils/utf8iter.h
...
...
30
 * a risk of crash.
30
 * a risk of crash.
31
 */
31
 */
32
class Utf8Iter {
32
class Utf8Iter {
33
public:
33
public:
34
    Utf8Iter(const std::string &in) 
34
    Utf8Iter(const std::string &in) 
35
    : m_s(in), m_cl(0), m_pos(0), m_charpos(0)
35
    : m_sp(&in), m_cl(0), m_pos(0), m_charpos(0)
36
    {
36
    {
37
    update_cl();
37
    update_cl();
38
    }
38
    }
39
39
40
    const std::string& buffer() const {return m_s;}
40
    const std::string& buffer() const {return (*m_sp);}
41
41
42
    void rewind() 
42
    void rewind() 
43
    {
43
    {
44
    m_cl = 0; 
44
    m_cl = 0; 
45
    m_pos = 0; 
45
    m_pos = 0; 
...
...
57
    if (charpos >= m_charpos) {
57
    if (charpos >= m_charpos) {
58
        mypos = m_pos;
58
        mypos = m_pos;
59
        mycp = m_charpos;
59
        mycp = m_charpos;
60
    }
60
    }
61
    int l;
61
    int l;
62
    while (mypos < m_s.length() && mycp != charpos) {
62
    while (mypos < m_sp->length() && mycp != charpos) {
63
        l = get_cl(mypos);
63
        l = get_cl(mypos);
64
        if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
64
        if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
65
        return (unsigned int)-1;
65
        return (unsigned int)-1;
66
        mypos += l;
66
        mypos += l;
67
        ++mycp;
67
        ++mycp;
68
    }
68
    }
69
    if (mypos < m_s.length() && mycp == charpos) {
69
    if (mypos < m_sp->length() && mycp == charpos) {
70
        l = get_cl(mypos);
70
        l = get_cl(mypos);
71
        if (poslok(mypos, l) && checkvalidat(mypos, l))
71
        if (poslok(mypos, l) && checkvalidat(mypos, l))
72
        return getvalueat(mypos, l);
72
        return getvalueat(mypos, l);
73
    }
73
    }
74
    return (unsigned int)-1;
74
    return (unsigned int)-1;
...
...
104
    This needs to be fast. No error checking. */
104
    This needs to be fast. No error checking. */
105
    unsigned int appendchartostring(std::string &out) const {
105
    unsigned int appendchartostring(std::string &out) const {
106
#ifdef UTF8ITER_CHECK
106
#ifdef UTF8ITER_CHECK
107
    assert(m_cl != 0);
107
    assert(m_cl != 0);
108
#endif
108
#endif
109
    out.append(&m_s[m_pos], m_cl);
109
    out.append(&(*m_sp)[m_pos], m_cl);
110
    return m_cl;
110
    return m_cl;
111
    }
111
    }
112
112
113
    /** Return current character as string */
113
    /** Return current character as string */
114
    operator std::string() {
114
    operator std::string() {
115
#ifdef UTF8ITER_CHECK
115
#ifdef UTF8ITER_CHECK
116
    assert(m_cl != 0);
116
    assert(m_cl != 0);
117
#endif
117
#endif
118
    return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
118
    return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string();
119
    }
119
    }
120
120
121
    bool eof() const {
121
    bool eof() const {
122
    return m_pos == m_s.length();
122
    return m_pos == m_sp->length();
123
    }
123
    }
124
124
125
    bool error() const {
125
    bool error() const {
126
    return m_cl == 0;
126
    return m_cl == 0;
127
    }
127
    }
...
...
141
    return m_charpos;
141
    return m_charpos;
142
    }
142
    }
143
143
144
private:
144
private:
145
    // String we're working with
145
    // String we're working with
146
    const std::string&     m_s; 
146
    const std::string*     m_sp; 
147
    // Character length at current position. A value of zero indicates
147
    // Character length at current position. A value of zero indicates
148
    // an error.
148
    // an error.
149
    unsigned int m_cl;
149
    unsigned int m_cl;
150
    // Current byte offset in string.
150
    // Current byte offset in string.
151
    std::string::size_type m_pos; 
151
    std::string::size_type m_pos; 
...
...
153
    unsigned int      m_charpos; 
153
    unsigned int      m_charpos; 
154
154
155
    // Check position and cl against string length
155
    // Check position and cl against string length
156
    bool poslok(std::string::size_type p, int l) const {
156
    bool poslok(std::string::size_type p, int l) const {
157
#ifdef UTF8ITER_CHECK
157
#ifdef UTF8ITER_CHECK
158
    assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
158
    assert(p != std::string::npos && l > 0 && p + l <= m_sp->length());
159
#endif
159
#endif
160
    return p != std::string::npos && l > 0 && p + l <= m_s.length();
160
    return p != std::string::npos && l > 0 && p + l <= m_sp->length();
161
    }
161
    }
162
162
163
    // Update current char length in object state, check
163
    // Update current char length in object state, check
164
    // for errors
164
    // for errors
165
    inline void update_cl() 
165
    inline void update_cl() 
166
    {
166
    {
167
    m_cl = 0;
167
    m_cl = 0;
168
    if (m_pos >= m_s.length())
168
    if (m_pos >= m_sp->length())
169
        return;
169
        return;
170
    m_cl = get_cl(m_pos);
170
    m_cl = get_cl(m_pos);
171
    if (!poslok(m_pos, m_cl)) {
171
    if (!poslok(m_pos, m_cl)) {
172
        // Used to set eof here for safety, but this is bad because it
172
        // Used to set eof here for safety, but this is bad because it
173
        // basically prevents the caller to discriminate error and eof.
173
        // basically prevents the caller to discriminate error and eof.
174
        //      m_pos = m_s.length();
174
        //      m_pos = m_sp->length();
175
        m_cl = 0;
175
        m_cl = 0;
176
        return;
176
        return;
177
    }
177
    }
178
    if (!checkvalidat(m_pos, m_cl)) {
178
    if (!checkvalidat(m_pos, m_cl)) {
179
        m_cl = 0;
179
        m_cl = 0;
...
...
182
182
183
    inline bool checkvalidat(std::string::size_type p, int l) const
183
    inline bool checkvalidat(std::string::size_type p, int l) const
184
    {
184
    {
185
    switch (l) {
185
    switch (l) {
186
    case 1: 
186
    case 1: 
187
        return (unsigned char)m_s[p] < 128;
187
        return (unsigned char)(*m_sp)[p] < 128;
188
    case 2: 
188
    case 2: 
189
        return (((unsigned char)m_s[p]) & 224) == 192
189
        return (((unsigned char)(*m_sp)[p]) & 224) == 192
190
        && (((unsigned char)m_s[p+1]) & 192) == 128;
190
        && (((unsigned char)(*m_sp)[p+1]) & 192) == 128;
191
    case 3: 
191
    case 3: 
192
        return (((unsigned char)m_s[p]) & 240) == 224
192
        return (((unsigned char)(*m_sp)[p]) & 240) == 224
193
           && (((unsigned char)m_s[p+1]) & 192) ==  128
193
           && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
194
           && (((unsigned char)m_s[p+2]) & 192) ==  128
194
           && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
195
           ;
195
           ;
196
    case 4: 
196
    case 4: 
197
        return (((unsigned char)m_s[p]) & 248) == 240
197
        return (((unsigned char)(*m_sp)[p]) & 248) == 240
198
           && (((unsigned char)m_s[p+1]) & 192) ==  128
198
           && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
199
           && (((unsigned char)m_s[p+2]) & 192) ==  128
199
           && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
200
           && (((unsigned char)m_s[p+3]) & 192) ==  128
200
           && (((unsigned char)(*m_sp)[p+3]) & 192) ==  128
201
        ;
201
        ;
202
    default:
202
    default:
203
        return false;
203
        return false;
204
    }
204
    }
205
    }
205
    }
206
206
207
    // Get character byte length at specified position. Returns 0 for error.
207
    // Get character byte length at specified position. Returns 0 for error.
208
    inline int get_cl(std::string::size_type p) const 
208
    inline int get_cl(std::string::size_type p) const 
209
    {
209
    {
210
    unsigned int z = (unsigned char)m_s[p];
210
    unsigned int z = (unsigned char)(*m_sp)[p];
211
    if (z <= 127) {
211
    if (z <= 127) {
212
        return 1;
212
        return 1;
213
    } else if ((z & 224) == 192) {
213
    } else if ((z & 224) == 192) {
214
        return 2;
214
        return 2;
215
    } else if ((z & 240) == 224) {
215
    } else if ((z & 240) == 224) {
...
...
228
    inline unsigned int getvalueat(std::string::size_type p, int l) const
228
    inline unsigned int getvalueat(std::string::size_type p, int l) const
229
    {
229
    {
230
    switch (l) {
230
    switch (l) {
231
    case 1: 
231
    case 1: 
232
#ifdef UTF8ITER_CHECK
232
#ifdef UTF8ITER_CHECK
233
        assert((unsigned char)m_s[p] < 128);
233
        assert((unsigned char)(*m_sp)[p] < 128);
234
#endif
234
#endif
235
        return (unsigned char)m_s[p];
235
        return (unsigned char)(*m_sp)[p];
236
    case 2: 
236
    case 2: 
237
#ifdef UTF8ITER_CHECK
237
#ifdef UTF8ITER_CHECK
238
        assert(
238
        assert(
239
           ((unsigned char)m_s[p] & 224) == 192
239
           ((unsigned char)(*m_sp)[p] & 224) == 192
240
           && ((unsigned char)m_s[p+1] & 192) ==  128
240
           && ((unsigned char)(*m_sp)[p+1] & 192) ==  128
241
           );
241
           );
242
#endif
242
#endif
243
        return ((unsigned char)m_s[p] - 192) * 64 + 
243
        return ((unsigned char)(*m_sp)[p] - 192) * 64 + 
244
        (unsigned char)m_s[p+1] - 128 ;
244
        (unsigned char)(*m_sp)[p+1] - 128 ;
245
    case 3: 
245
    case 3: 
246
#ifdef UTF8ITER_CHECK
246
#ifdef UTF8ITER_CHECK
247
        assert(
247
        assert(
248
           (((unsigned char)m_s[p]) & 240) == 224
248
           (((unsigned char)(*m_sp)[p]) & 240) == 224
249
           && (((unsigned char)m_s[p+1]) & 192) ==  128
249
           && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
250
           && (((unsigned char)m_s[p+2]) & 192) ==  128
250
           && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
251
           );
251
           );
252
#endif
252
#endif
253
253
254
        return ((unsigned char)m_s[p] - 224) * 4096 + 
254
        return ((unsigned char)(*m_sp)[p] - 224) * 4096 + 
255
        ((unsigned char)m_s[p+1] - 128) * 64 + 
255
        ((unsigned char)(*m_sp)[p+1] - 128) * 64 + 
256
        (unsigned char)m_s[p+2] - 128;
256
        (unsigned char)(*m_sp)[p+2] - 128;
257
    case 4: 
257
    case 4: 
258
#ifdef UTF8ITER_CHECK
258
#ifdef UTF8ITER_CHECK
259
        assert(
259
        assert(
260
           (((unsigned char)m_s[p]) & 248) == 240
260
           (((unsigned char)(*m_sp)[p]) & 248) == 240
261
           && (((unsigned char)m_s[p+1]) & 192) ==  128
261
           && (((unsigned char)(*m_sp)[p+1]) & 192) ==  128
262
           && (((unsigned char)m_s[p+2]) & 192) ==  128
262
           && (((unsigned char)(*m_sp)[p+2]) & 192) ==  128
263
           && (((unsigned char)m_s[p+3]) & 192) ==  128
263
           && (((unsigned char)(*m_sp)[p+3]) & 192) ==  128
264
           );
264
           );
265
#endif
265
#endif
266
266
267
        return ((unsigned char)m_s[p]-240)*262144 + 
267
        return ((unsigned char)(*m_sp)[p]-240)*262144 + 
268
        ((unsigned char)m_s[p+1]-128)*4096 + 
268
        ((unsigned char)(*m_sp)[p+1]-128)*4096 + 
269
        ((unsigned char)m_s[p+2]-128)*64 + 
269
        ((unsigned char)(*m_sp)[p+2]-128)*64 + 
270
        (unsigned char)m_s[p+3]-128;
270
        (unsigned char)(*m_sp)[p+3]-128;
271
271
272
    default:
272
    default:
273
#ifdef UTF8ITER_CHECK
273
#ifdef UTF8ITER_CHECK
274
        assert(l <= 4);
274
        assert(l <= 4);
275
#endif
275
#endif