|
a/src/utils/utf8iter.h |
|
b/src/utils/utf8iter.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _UTF8ITER_H_INCLUDED_
|
17 |
#ifndef _UTF8ITER_H_INCLUDED_
|
18 |
#define _UTF8ITER_H_INCLUDED_
|
18 |
#define _UTF8ITER_H_INCLUDED_
|
19 |
/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
/**
|
21 |
/**
|
22 |
* A small helper class to iterate over utf8 strings. This is not an
|
22 |
* A small helper class to iterate over utf8 strings. This is not an
|
23 |
* STL iterator and this is not well designed, just convenient for
|
23 |
* STL iterator and does not do much error checking. It is designed purely
|
24 |
some specific uses
|
24 |
* for recoll usage, where the utf-8 string comes out of iconv in most cases
|
|
|
25 |
* and is assumed legal. We just try to catch cases where there would be
|
|
|
26 |
* a risk of crash.
|
25 |
*/
|
27 |
*/
|
26 |
class Utf8Iter {
public:
    /**
     * Build an iterator positioned on the first character of @p in.
     *
     * Only a reference to @p in is stored (no copy is made): the string
     * has to stay alive for as long as the iterator is used.
     */
    Utf8Iter(const std::string &in)
        : m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
    {
        // Ensure state is ok if appendchartostring() is called at once
        compute_cl();
    }

    /** Go back to the first character and clear the error flag. */
    void rewind()
    {
        m_pos = 0;
        m_charpos = 0;
        m_error = false;
        // Also resets and recomputes m_cl for the first character
        compute_cl();
    }

    /** "Direct" access. Awfully inefficient as we skip from start or current
     * position at best. This can only be useful for a lookahead from the
     * current position.
     *
     * @param charpos character (not byte) index of the wanted character.
     * @return the ucs4 value, or (unsigned int)-1 if charpos is past the
     *   end, if an invalid lead byte is met while skipping, or if the
     *   target character does not fit inside the string.
     */
    unsigned int operator[](unsigned int charpos) const
    {
        std::string::size_type mypos = 0;
        unsigned int mycp = 0;
        // We can only skip forward: start from the current position when
        // the target is not behind it, else restart from the beginning.
        if (charpos >= m_charpos) {
            mypos = m_pos;
            mycp = m_charpos;
        }
        while (mypos < m_s.length() && mycp != charpos) {
            const int l = get_cl(mypos);
            if (l < 0)
                return static_cast<unsigned int>(-1);
            mypos += l;
            ++mycp;
        }
        if (mypos < m_s.length() && mycp == charpos) {
            const int l = get_cl(mypos);
            if (poslok(mypos, l))
                return getvalueat(mypos, l);
        }
        return static_cast<unsigned int>(-1);
    }

    /** Increment current position to next utf-8 char.
     *
     * @return the new byte offset, or string::npos if there was no current
     *   character to step over (end of string or error state).
     */
    std::string::size_type operator++(int)
    {
        // Note: m_cl may be zero at eof if user's test not right
        // this shouldn't crash the program until actual data access
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        if (m_cl == 0)
            return std::string::npos;

        m_pos += m_cl;
        m_charpos++;
        compute_cl();
        return m_pos;
    }

    /** operator* returns the ucs4 value as a machine integer.
     *
     * Calling this with no current character (eof or error state) returns
     * (unsigned int)-1 and sets the error flag.
     */
    unsigned int operator*() const
    {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        return getvalueat(m_pos, m_cl);
    }

    /** Append current utf-8 possibly multi-byte character to string param.
     * This needs to be fast. No error checking.
     *
     * @return the number of bytes appended (0 when there is no current
     *   character).
     */
    unsigned int appendchartostring(std::string &out)
    {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        out.append(&m_s[m_pos], m_cl);
        return m_cl;
    }

    /** Return current character as string (empty if no current character) */
    operator std::string() const
    {
#ifdef UTF8ITER_CHECK
        assert(m_cl != 0);
#endif
        return m_s.substr(m_pos, m_cl);
    }

    /** True when the current byte offset is the end of the string. Note
     * that an error also moves the position to the end of the string. */
    bool eof() const
    {
        return m_pos == m_s.length();
    }

    /** True once a bad or truncated sequence was met (until rewind()). */
    bool error() const
    {
        return m_error;
    }

    /** Current byte offset in the string */
    std::string::size_type getBpos() const
    {
        return m_pos;
    }

    /** Current character index (count of characters stepped over) */
    std::string::size_type getCpos() const
    {
        return m_charpos;
    }

private:
    // String we're working with
    const std::string& m_s;
    // Character length at current position. A value of zero indicates
    // unknown or error.
    unsigned int m_cl;
    // Current byte offset in string.
    std::string::size_type m_pos;
    // Current character position
    unsigned int m_charpos;
    // Sticky error flag. Mutable because the const decoding helper
    // getvalueat() records bad lengths in it.
    mutable bool m_error;

    // Byte at offset p, as an unsigned value
    unsigned int byteat(std::string::size_type p) const
    {
        return static_cast<unsigned char>(m_s[p]);
    }

    // Check position and cl against string length
    bool poslok(std::string::size_type p, int l) const
    {
#ifdef UTF8ITER_CHECK
        assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
#endif
        return p != std::string::npos && l > 0 && p + l <= m_s.length();
    }

    // Update current char length in object state, minimum checking for
    // errors. Returns 0 if there is a usable current character, -1 at end
    // of string or on error (in which case the position is moved to the
    // end of the string and the error flag is set).
    int compute_cl()
    {
        m_cl = 0;
        if (m_pos == m_s.length())
            return -1;
        // Keep the possibly negative length in a signed local: m_cl is
        // unsigned and only ever receives a validated value.
        const int l = get_cl(m_pos);
        if (!poslok(m_pos, l)) {
            m_pos = m_s.length();
            m_error = true;
            return -1;
        }
        m_cl = l;
        return 0;
    }

    // Get character byte length at specified position, from the lead
    // byte only. Returns -1 if the byte is not a valid 1 to 4 bytes
    // sequence lead.
    int get_cl(std::string::size_type p) const
    {
        const unsigned int z = byteat(p);
        if (z <= 127) {
            return 1;
        } else if ((z & 224) == 192) {
            return 2;
        } else if ((z & 240) == 224) {
            return 3;
        } else if ((z & 248) == 240) {
            return 4;
        }
#ifdef UTF8ITER_CHECK
        assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
               (z & 248) == 240);
#endif
        return -1;
    }

    // Compute value at given position. No error checking: continuation
    // bytes are not validated (only asserted under UTF8ITER_CHECK). A
    // length outside 1..4 sets the error flag and yields (unsigned int)-1.
    unsigned int getvalueat(std::string::size_type p, int l) const
    {
        switch (l) {
        case 1:
#ifdef UTF8ITER_CHECK
            assert(byteat(p) < 128);
#endif
            return byteat(p);
        case 2:
#ifdef UTF8ITER_CHECK
            assert(
                (byteat(p) & 224) == 192
                && (byteat(p + 1) & 192) == 128
                );
#endif
            return (byteat(p) - 192) * 64 +
                byteat(p + 1) - 128;
        case 3:
#ifdef UTF8ITER_CHECK
            assert(
                (byteat(p) & 240) == 224
                && (byteat(p + 1) & 192) == 128
                && (byteat(p + 2) & 192) == 128
                );
#endif
            return (byteat(p) - 224) * 4096 +
                (byteat(p + 1) - 128) * 64 +
                byteat(p + 2) - 128;
        case 4:
#ifdef UTF8ITER_CHECK
            assert(
                (byteat(p) & 248) == 240
                && (byteat(p + 1) & 192) == 128
                && (byteat(p + 2) & 192) == 128
                && (byteat(p + 3) & 192) == 128
                );
#endif
            return (byteat(p) - 240) * 262144 +
                (byteat(p + 1) - 128) * 4096 +
                (byteat(p + 2) - 128) * 64 +
                byteat(p + 3) - 128;

        default:
#ifdef UTF8ITER_CHECK
            assert(l <= 4);
#endif
            m_error = true;
            return static_cast<unsigned int>(-1);
        }
    }
};
|
172 |
|
244 |
|
173 |
|
245 |
|
174 |
#endif /* _UTF8ITER_H_INCLUDED_ */
|
246 |
#endif /* _UTF8ITER_H_INCLUDED_ */
|