|
a/src/utils/utf8iter.h |
|
b/src/utils/utf8iter.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _UTF8ITER_H_INCLUDED_
|
17 |
#ifndef _UTF8ITER_H_INCLUDED_
|
18 |
#define _UTF8ITER_H_INCLUDED_
|
18 |
#define _UTF8ITER_H_INCLUDED_
|
19 |
/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: utf8iter.h,v 1.9 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
/**
|
21 |
/**
|
22 |
* A small helper class to iterate over utf8 strings. This is not an
|
22 |
* A small helper class to iterate over utf8 strings. This is not an
|
23 |
* STL iterator and does not do much error checking. It is designed purely
|
23 |
* STL iterator and does not do much error checking. It is designed purely
|
24 |
* for recoll usage, where the utf-8 string comes out of iconv in most cases
|
24 |
* for recoll usage, where the utf-8 string comes out of iconv in most cases
|
|
... |
|
... |
28 |
class Utf8Iter {
|
28 |
class Utf8Iter {
|
29 |
public:
|
29 |
public:
|
30 |
Utf8Iter(const string &in)
|
30 |
Utf8Iter(const string &in)
|
31 |
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
|
31 |
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
|
32 |
{
|
32 |
{
|
33 |
compute_cl();
|
33 |
update_cl();
|
34 |
}
|
34 |
}
|
|
|
35 |
|
|
|
36 |
const string& buffer() const {return m_s;}
|
35 |
|
37 |
|
36 |
void rewind()
|
38 |
void rewind()
|
37 |
{
|
39 |
{
|
38 |
m_cl = 0;
|
40 |
m_cl = 0;
|
39 |
m_pos = 0;
|
41 |
m_pos = 0;
|
40 |
m_charpos = 0;
|
42 |
m_charpos = 0;
|
41 |
m_error = false;
|
43 |
m_error = false;
|
42 |
compute_cl();
|
44 |
update_cl();
|
43 |
}
|
45 |
}
|
44 |
|
46 |
|
45 |
/** "Direct" access. Awfully inefficient as we skip from start or current
|
47 |
/** "Direct" access. Awfully inefficient as we skip from start or current
|
46 |
* position at best. This can only be useful for a lookahead from the
|
48 |
* position at best. This can only be useful for a lookahead from the
|
47 |
* current position */
|
49 |
* current position */
|
|
... |
|
... |
54 |
mycp = m_charpos;
|
56 |
mycp = m_charpos;
|
55 |
}
|
57 |
}
|
56 |
int l;
|
58 |
int l;
|
57 |
while (mypos < m_s.length() && mycp != charpos) {
|
59 |
while (mypos < m_s.length() && mycp != charpos) {
|
58 |
l = get_cl(mypos);
|
60 |
l = get_cl(mypos);
|
59 |
if (l < 0)
|
61 |
if (l <= 0)
|
60 |
return (unsigned int)-1;
|
62 |
return (unsigned int)-1;
|
61 |
mypos += l;
|
63 |
mypos += l;
|
62 |
++mycp;
|
64 |
++mycp;
|
63 |
}
|
65 |
}
|
64 |
if (mypos < m_s.length() && mycp == charpos) {
|
66 |
if (mypos < m_s.length() && mycp == charpos) {
|
|
... |
|
... |
75 |
// Note: m_cl may be zero at eof if the user's test is not right
|
77 |
// Note: m_cl may be zero at eof if the user's test is not right
|
76 |
// this shouldn't crash the program until actual data access
|
78 |
// this shouldn't crash the program until actual data access
|
77 |
#ifdef UTF8ITER_CHECK
|
79 |
#ifdef UTF8ITER_CHECK
|
78 |
assert(m_cl != 0);
|
80 |
assert(m_cl != 0);
|
79 |
#endif
|
81 |
#endif
|
80 |
if (m_cl == 0)
|
82 |
if (m_cl <= 0)
|
81 |
return string::npos;
|
83 |
return string::npos;
|
82 |
|
84 |
|
83 |
m_pos += m_cl;
|
85 |
m_pos += m_cl;
|
84 |
m_charpos++;
|
86 |
m_charpos++;
|
85 |
compute_cl();
|
87 |
update_cl();
|
86 |
return m_pos;
|
88 |
return m_pos;
|
87 |
}
|
89 |
}
|
88 |
|
90 |
|
89 |
/** operator* returns the ucs4 value as a machine integer*/
|
91 |
/** operator* returns the ucs4 value as a machine integer*/
|
90 |
unsigned int operator*()
|
92 |
unsigned int operator*()
|
|
... |
|
... |
119 |
|
121 |
|
120 |
bool error() {
|
122 |
bool error() {
|
121 |
return m_error;
|
123 |
return m_error;
|
122 |
}
|
124 |
}
|
123 |
|
125 |
|
|
|
126 |
/** Return current byte offset in input string */
|
124 |
string::size_type getBpos() const {
|
127 |
string::size_type getBpos() const {
|
125 |
return m_pos;
|
128 |
return m_pos;
|
126 |
}
|
129 |
}
|
127 |
|
130 |
|
|
|
131 |
/** Return current character length */
|
|
|
132 |
string::size_type getBlen() const {
|
|
|
133 |
return m_cl;
|
|
|
134 |
}
|
|
|
135 |
|
|
|
136 |
/** Return current unicode character offset in input string */
|
128 |
string::size_type getCpos() const {
|
137 |
string::size_type getCpos() const {
|
129 |
return m_charpos;
|
138 |
return m_charpos;
|
130 |
}
|
139 |
}
|
131 |
|
140 |
|
132 |
private:
|
141 |
private:
|
133 |
// String we're working with
|
142 |
// String we're working with
|
134 |
const string& m_s;
|
143 |
const string& m_s;
|
135 |
// Character length at current position. A value of zero indicates
|
144 |
// Character length at current position. A value of zero indicates
|
136 |
// unknown or error.
|
145 |
// an error.
|
137 |
unsigned int m_cl;
|
146 |
unsigned int m_cl;
|
138 |
// Current byte offset in string.
|
147 |
// Current byte offset in string.
|
139 |
string::size_type m_pos;
|
148 |
string::size_type m_pos;
|
140 |
// Current character position
|
149 |
// Current character position
|
141 |
unsigned int m_charpos;
|
150 |
unsigned int m_charpos;
|
|
|
151 |
// Am I ok ?
|
142 |
mutable bool m_error;
|
152 |
mutable bool m_error;
|
143 |
|
153 |
|
144 |
// Check position and cl against string length
|
154 |
// Check position and cl against string length
|
145 |
bool poslok(string::size_type p, int l) const {
|
155 |
bool poslok(string::size_type p, int l) const {
|
146 |
#ifdef UTF8ITER_CHECK
|
156 |
#ifdef UTF8ITER_CHECK
|
147 |
assert(p != string::npos && l > 0 && p + l <= m_s.length());
|
157 |
assert(p != string::npos && l > 0 && p + l <= m_s.length());
|
148 |
#endif
|
158 |
#endif
|
149 |
return p != string::npos && l > 0 && p + l <= m_s.length();
|
159 |
return p != string::npos && l > 0 && p + l <= m_s.length();
|
150 |
}
|
160 |
}
|
151 |
|
161 |
|
152 |
// Update current char length in object state, minimum checking for
|
162 |
// Update current char length in object state, minimum checking
|
153 |
// errors
|
163 |
// for errors
|
154 |
inline int compute_cl()
|
164 |
inline void update_cl()
|
155 |
{
|
165 |
{
|
156 |
m_cl = 0;
|
166 |
m_cl = 0;
|
157 |
if (m_pos == m_s.length())
|
167 |
if (m_pos >= m_s.length())
|
158 |
return -1;
|
168 |
return;
|
159 |
m_cl = get_cl(m_pos);
|
169 |
m_cl = get_cl(m_pos);
|
160 |
if (!poslok(m_pos, m_cl)) {
|
170 |
if (!poslok(m_pos, m_cl)) {
|
|
|
171 |
// Used to set eof here for safety, but this is bad because it
|
|
|
172 |
// basically prevents the caller from discriminating between error and eof.
|
161 |
m_pos = m_s.length();
|
173 |
// m_pos = m_s.length();
|
162 |
m_cl = 0;
|
174 |
m_cl = 0;
|
163 |
m_error = true;
|
175 |
m_error = true;
|
164 |
return -1;
|
|
|
165 |
}
|
176 |
}
|
166 |
return 0;
|
|
|
167 |
}
|
177 |
}
|
168 |
|
178 |
|
169 |
// Get character byte length at specified position
|
179 |
// Get character byte length at specified position. Returns 0 for error.
|
170 |
inline int get_cl(string::size_type p) const
|
180 |
inline int get_cl(string::size_type p) const
|
171 |
{
|
181 |
{
|
172 |
unsigned int z = (unsigned char)m_s[p];
|
182 |
unsigned int z = (unsigned char)m_s[p];
|
173 |
if (z <= 127) {
|
183 |
if (z <= 127) {
|
174 |
return 1;
|
184 |
return 1;
|
|
... |
|
... |
181 |
}
|
191 |
}
|
182 |
#ifdef UTF8ITER_CHECK
|
192 |
#ifdef UTF8ITER_CHECK
|
183 |
assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
|
193 |
assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
|
184 |
(z & 248) == 240);
|
194 |
(z & 248) == 240);
|
185 |
#endif
|
195 |
#endif
|
186 |
return -1;
|
196 |
return 0;
|
187 |
}
|
197 |
}
|
188 |
|
198 |
|
189 |
// Compute value at given position. No error checking.
|
199 |
// Compute value at given position. No error checking.
|
190 |
inline unsigned int getvalueat(string::size_type p, int l) const
|
200 |
inline unsigned int getvalueat(string::size_type p, int l) const
|
191 |
{
|
201 |
{
|