Switch to unified view

a/src/utils/utf8iter.h b/src/utils/utf8iter.h
...
...
14
 *   Free Software Foundation, Inc.,
14
 *   Free Software Foundation, Inc.,
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
15
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
16
 */
16
 */
17
#ifndef _UTF8ITER_H_INCLUDED_
17
#ifndef _UTF8ITER_H_INCLUDED_
18
#define _UTF8ITER_H_INCLUDED_
18
#define _UTF8ITER_H_INCLUDED_
19
/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $  (C) 2004 J.F.Dockes */
19
/* @(#$Id: utf8iter.h,v 1.9 2007-09-20 08:45:05 dockes Exp $  (C) 2004 J.F.Dockes */
20
20
21
/** 
21
/** 
22
 * A small helper class to iterate over utf8 strings. This is not an
22
 * A small helper class to iterate over utf8 strings. This is not an
23
 * STL iterator and does not much error checking. It is designed purely
23
 * STL iterator and does not much error checking. It is designed purely
24
 * for recoll usage, where the utf-8 string comes out of iconv in most cases
24
 * for recoll usage, where the utf-8 string comes out of iconv in most cases
...
...
28
class Utf8Iter {
28
class Utf8Iter {
29
public:
29
public:
30
    Utf8Iter(const string &in) 
30
    Utf8Iter(const string &in) 
31
    : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
31
    : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
32
    {
32
    {
33
    compute_cl();
33
    update_cl();
34
    }
34
    }
35
36
    const string& buffer() const {return m_s;}
35
37
36
    void rewind() 
38
    void rewind() 
37
    {
39
    {
38
    m_cl = 0; 
40
    m_cl = 0; 
39
    m_pos = 0; 
41
    m_pos = 0; 
40
    m_charpos = 0; 
42
    m_charpos = 0; 
41
    m_error = false;
43
    m_error = false;
42
    compute_cl();
44
    update_cl();
43
    }
45
    }
44
46
45
    /** "Direct" access. Awfully inefficient as we skip from start or current
47
    /** "Direct" access. Awfully inefficient as we skip from start or current
46
     * position at best. This can only be useful for a lookahead from the
48
     * position at best. This can only be useful for a lookahead from the
47
     * current position */
49
     * current position */
...
...
54
        mycp = m_charpos;
56
        mycp = m_charpos;
55
    }
57
    }
56
    int l;
58
    int l;
57
    while (mypos < m_s.length() && mycp != charpos) {
59
    while (mypos < m_s.length() && mycp != charpos) {
58
        l = get_cl(mypos);
60
        l = get_cl(mypos);
59
        if (l < 0)
61
        if (l <= 0)
60
        return (unsigned int)-1;
62
        return (unsigned int)-1;
61
        mypos += l;
63
        mypos += l;
62
        ++mycp;
64
        ++mycp;
63
    }
65
    }
64
    if (mypos < m_s.length() && mycp == charpos) {
66
    if (mypos < m_s.length() && mycp == charpos) {
...
...
75
    // Note: m_cl may be zero at eof if user's test not right
77
    // Note: m_cl may be zero at eof if user's test not right
76
    // this shouldn't crash the program until actual data access
78
    // this shouldn't crash the program until actual data access
77
#ifdef UTF8ITER_CHECK
79
#ifdef UTF8ITER_CHECK
78
    assert(m_cl != 0);
80
    assert(m_cl != 0);
79
#endif
81
#endif
80
    if (m_cl == 0) 
82
    if (m_cl <= 0) 
81
        return string::npos;
83
        return string::npos;
82
84
83
    m_pos += m_cl;
85
    m_pos += m_cl;
84
    m_charpos++;
86
    m_charpos++;
85
    compute_cl();
87
    update_cl();
86
    return m_pos;
88
    return m_pos;
87
    }
89
    }
88
90
89
    /** operator* returns the ucs4 value as a machine integer*/
91
    /** operator* returns the ucs4 value as a machine integer*/
90
    unsigned int operator*() 
92
    unsigned int operator*() 
...
...
119
121
120
    bool error() {
122
    bool error() {
121
    return m_error;
123
    return m_error;
122
    }
124
    }
123
125
126
    /** Return current byte offset in input string */
124
    string::size_type getBpos() const {
127
    string::size_type getBpos() const {
125
    return m_pos;
128
    return m_pos;
126
    }
129
    }
127
130
131
    /** Return current character length */
132
    string::size_type getBlen() const {
133
  return m_cl;
134
    }
135
136
    /** Return current unicode character offset in input string */
128
    string::size_type getCpos() const {
137
    string::size_type getCpos() const {
129
    return m_charpos;
138
    return m_charpos;
130
    }
139
    }
131
140
132
private:
141
private:
133
    // String we're working with
142
    // String we're working with
134
    const string&     m_s; 
143
    const string&     m_s; 
135
    // Character length at current position. A value of zero indicates
144
    // Character length at current position. A value of zero indicates
136
    // unknown or error.
145
    // an error.
137
    unsigned int      m_cl; 
146
    unsigned int      m_cl; 
138
    // Current byte offset in string.
147
    // Current byte offset in string.
139
    string::size_type m_pos; 
148
    string::size_type m_pos; 
140
    // Current character position
149
    // Current character position
141
    unsigned int      m_charpos; 
150
    unsigned int      m_charpos; 
151
    // Am I ok ?
142
    mutable bool      m_error;
152
    mutable bool      m_error;
143
153
144
    // Check position and cl against string length
154
    // Check position and cl against string length
145
    bool poslok(string::size_type p, int l) const {
155
    bool poslok(string::size_type p, int l) const {
146
#ifdef UTF8ITER_CHECK
156
#ifdef UTF8ITER_CHECK
147
    assert(p != string::npos && l > 0 && p + l <= m_s.length());
157
    assert(p != string::npos && l > 0 && p + l <= m_s.length());
148
#endif
158
#endif
149
    return p != string::npos && l > 0 && p + l <= m_s.length();
159
    return p != string::npos && l > 0 && p + l <= m_s.length();
150
    }
160
    }
151
161
152
    // Update current char length in object state, minimum checking for 
162
    // Update current char length in object state, minimum checking
153
    // errors
163
    // for errors
154
    inline int compute_cl() 
164
    inline void update_cl() 
155
    {
165
    {
156
    m_cl = 0;
166
    m_cl = 0;
157
    if (m_pos == m_s.length())
167
    if (m_pos >= m_s.length())
158
        return -1;
168
        return;
159
    m_cl = get_cl(m_pos);
169
    m_cl = get_cl(m_pos);
160
    if (!poslok(m_pos, m_cl)) {
170
    if (!poslok(m_pos, m_cl)) {
171
      // Used to set eof here for safety, but this is bad because it
172
      // basically prevents the caller to discriminate error and eof.
161
        m_pos = m_s.length();
173
      //     m_pos = m_s.length();
162
        m_cl = 0;
174
        m_cl = 0;
163
        m_error = true;
175
        m_error = true;
164
      return -1;
165
    }
176
    }
166
  return 0;
167
    }
177
    }
168
178
169
    // Get character byte length at specified position
179
    // Get character byte length at specified position. Returns 0 for error.
170
    inline int get_cl(string::size_type p) const 
180
    inline int get_cl(string::size_type p) const 
171
    {
181
    {
172
    unsigned int z = (unsigned char)m_s[p];
182
    unsigned int z = (unsigned char)m_s[p];
173
    if (z <= 127) {
183
    if (z <= 127) {
174
        return 1;
184
        return 1;
...
...
181
    }
191
    }
182
#ifdef UTF8ITER_CHECK
192
#ifdef UTF8ITER_CHECK
183
    assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
193
    assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
184
           (z & 248) == 240);
194
           (z & 248) == 240);
185
#endif
195
#endif
186
    return -1;
196
    return 0;
187
    }
197
    }
188
198
189
    // Compute value at given position. No error checking.
199
    // Compute value at given position. No error checking.
190
    inline unsigned int getvalueat(string::size_type p, int l) const
200
    inline unsigned int getvalueat(string::size_type p, int l) const
191
    {
201
    {