Switch to unified view

a/src/utils/utf8iter.h b/src/utils/utf8iter.h
...
...
18
#define _UTF8ITER_H_INCLUDED_
18
#define _UTF8ITER_H_INCLUDED_
19
19
20
#ifdef UTF8ITER_CHECK
20
#ifdef UTF8ITER_CHECK
21
#include "assert.h"
21
#include "assert.h"
22
#endif
22
#endif
23
#include <string>
23
24
24
/** 
25
/** 
25
 * A small helper class to iterate over utf8 strings. This is not an
26
 * A small helper class to iterate over utf8 strings. This is not an
26
 * STL iterator and does not much error checking. It is designed purely
27
 * STL iterator and does not much error checking. It is designed purely
27
 * for recoll usage, where the utf-8 string comes out of iconv in most cases
28
 * for recoll usage, where the utf-8 string comes out of iconv in most cases
28
 * and is assumed legal. We just try to catch cases where there would be 
29
 * and is assumed legal. We just try to catch cases where there would be 
29
 * a risk of crash.
30
 * a risk of crash.
30
 */
31
 */
31
class Utf8Iter {
32
class Utf8Iter {
32
public:
33
public:
33
    Utf8Iter(const string &in) 
34
    Utf8Iter(const std::string &in) 
34
    : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
35
    : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
35
    {
36
    {
36
    update_cl();
37
    update_cl();
37
    }
38
    }
38
39
39
    const string& buffer() const {return m_s;}
40
    const std::string& buffer() const {return m_s;}
40
41
41
    void rewind() 
42
    void rewind() 
42
    {
43
    {
43
    m_cl = 0; 
44
    m_cl = 0; 
44
    m_pos = 0; 
45
    m_pos = 0; 
...
...
50
    /** "Direct" access. Awfully inefficient as we skip from start or current
51
    /** "Direct" access. Awfully inefficient as we skip from start or current
51
     * position at best. This can only be useful for a lookahead from the
52
     * position at best. This can only be useful for a lookahead from the
52
     * current position */
53
     * current position */
53
    unsigned int operator[](unsigned int charpos) const 
54
    unsigned int operator[](unsigned int charpos) const 
54
    {
55
    {
55
    string::size_type mypos = 0;
56
    std::string::size_type mypos = 0;
56
    unsigned int mycp = 0;
57
    unsigned int mycp = 0;
57
    if (charpos >= m_charpos) {
58
    if (charpos >= m_charpos) {
58
        mypos = m_pos;
59
        mypos = m_pos;
59
        mycp = m_charpos;
60
        mycp = m_charpos;
60
    }
61
    }
...
...
73
    }
74
    }
74
    return (unsigned int)-1;
75
    return (unsigned int)-1;
75
    }
76
    }
76
77
77
    /** Increment current position to next utf-8 char */
78
    /** Increment current position to next utf-8 char */
78
    string::size_type operator++(int) 
79
    std::string::size_type operator++(int) 
79
    {
80
    {
80
    // Note: m_cl may be zero at eof if user's test not right
81
    // Note: m_cl may be zero at eof if user's test not right
81
    // this shouldn't crash the program until actual data access
82
    // this shouldn't crash the program until actual data access
82
#ifdef UTF8ITER_CHECK
83
#ifdef UTF8ITER_CHECK
83
    assert(m_cl != 0);
84
    assert(m_cl != 0);
84
#endif
85
#endif
85
    if (m_cl <= 0) 
86
    if (m_cl <= 0) 
86
        return string::npos;
87
        return std::string::npos;
87
88
88
    m_pos += m_cl;
89
    m_pos += m_cl;
89
    m_charpos++;
90
    m_charpos++;
90
    update_cl();
91
    update_cl();
91
    return m_pos;
92
    return m_pos;
...
...
100
    return getvalueat(m_pos, m_cl);
101
    return getvalueat(m_pos, m_cl);
101
    }
102
    }
102
103
103
    /** Append current utf-8 possibly multi-byte character to string param.
104
    /** Append current utf-8 possibly multi-byte character to string param.
104
    This needs to be fast. No error checking. */
105
    This needs to be fast. No error checking. */
105
    unsigned int appendchartostring(string &out) {
106
    unsigned int appendchartostring(std::string &out) {
106
#ifdef UTF8ITER_CHECK
107
#ifdef UTF8ITER_CHECK
107
    assert(m_cl != 0);
108
    assert(m_cl != 0);
108
#endif
109
#endif
109
    out.append(&m_s[m_pos], m_cl);
110
    out.append(&m_s[m_pos], m_cl);
110
    return m_cl;
111
    return m_cl;
111
    }
112
    }
112
113
113
    /** Return current character as string */
114
    /** Return current character as string */
114
    operator string() {
115
    operator std::string() {
115
#ifdef UTF8ITER_CHECK
116
#ifdef UTF8ITER_CHECK
116
    assert(m_cl != 0);
117
    assert(m_cl != 0);
117
#endif
118
#endif
118
    return m_s.substr(m_pos, m_cl);
119
    return m_s.substr(m_pos, m_cl);
119
    }
120
    }
...
...
125
    bool error() {
126
    bool error() {
126
    return m_error;
127
    return m_error;
127
    }
128
    }
128
129
129
    /** Return current byte offset in input string */
130
    /** Return current byte offset in input string */
130
    string::size_type getBpos() const {
131
    std::string::size_type getBpos() const {
131
    return m_pos;
132
    return m_pos;
132
    }
133
    }
133
134
134
    /** Return current character length */
135
    /** Return current character length */
135
    string::size_type getBlen() const {
136
    std::string::size_type getBlen() const {
136
    return m_cl;
137
    return m_cl;
137
    }
138
    }
138
139
139
    /** Return current unicode character offset in input string */
140
    /** Return current unicode character offset in input string */
140
    string::size_type getCpos() const {
141
    std::string::size_type getCpos() const {
141
    return m_charpos;
142
    return m_charpos;
142
    }
143
    }
143
144
144
private:
145
private:
145
    // String we're working with
146
    // String we're working with
146
    const string&     m_s; 
147
    const std::string&     m_s; 
147
    // Character length at current position. A value of zero indicates
148
    // Character length at current position. A value of zero indicates
148
    // an error.
149
    // an error.
149
    unsigned int      m_cl; 
150
    unsigned int      m_cl; 
150
    // Current byte offset in string.
151
    // Current byte offset in string.
151
    string::size_type m_pos; 
152
    std::string::size_type m_pos; 
152
    // Current character position
153
    // Current character position
153
    unsigned int      m_charpos; 
154
    unsigned int      m_charpos; 
154
    // Am I ok ?
155
    // Am I ok ?
155
    mutable bool      m_error;
156
    mutable bool      m_error;
156
157
157
    // Check position and cl against string length
158
    // Check position and cl against string length
158
    bool poslok(string::size_type p, int l) const {
159
    bool poslok(std::string::size_type p, int l) const {
159
#ifdef UTF8ITER_CHECK
160
#ifdef UTF8ITER_CHECK
160
    assert(p != string::npos && l > 0 && p + l <= m_s.length());
161
    assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
161
#endif
162
#endif
162
    return p != string::npos && l > 0 && p + l <= m_s.length();
163
    return p != std::string::npos && l > 0 && p + l <= m_s.length();
163
    }
164
    }
164
165
165
    // Update current char length in object state, minimum checking
166
    // Update current char length in object state, minimum checking
166
    // for errors
167
    // for errors
167
    inline void update_cl() 
168
    inline void update_cl() 
...
...
178
        m_error = true;
179
        m_error = true;
179
    }
180
    }
180
    }
181
    }
181
182
182
    // Get character byte length at specified position. Returns 0 for error.
183
    // Get character byte length at specified position. Returns 0 for error.
183
    inline int get_cl(string::size_type p) const 
184
    inline int get_cl(std::string::size_type p) const 
184
    {
185
    {
185
    unsigned int z = (unsigned char)m_s[p];
186
    unsigned int z = (unsigned char)m_s[p];
186
    if (z <= 127) {
187
    if (z <= 127) {
187
        return 1;
188
        return 1;
188
    } else if ((z & 224) == 192) {
189
    } else if ((z & 224) == 192) {
...
...
198
#endif
199
#endif
199
    return 0;
200
    return 0;
200
    }
201
    }
201
202
202
    // Compute value at given position. No error checking.
203
    // Compute value at given position. No error checking.
203
    inline unsigned int getvalueat(string::size_type p, int l) const
204
    inline unsigned int getvalueat(std::string::size_type p, int l) const
204
    {
205
    {
205
    switch (l) {
206
    switch (l) {
206
    case 1: 
207
    case 1: 
207
#ifdef UTF8ITER_CHECK
208
#ifdef UTF8ITER_CHECK
208
        assert((unsigned char)m_s[p] < 128);
209
        assert((unsigned char)m_s[p] < 128);