|
a/src/utils/utf8iter.h |
|
b/src/utils/utf8iter.h |
|
... |
|
... |
30 |
* a risk of crash.
|
30 |
* a risk of crash.
|
31 |
*/
|
31 |
*/
|
32 |
class Utf8Iter {
|
32 |
class Utf8Iter {
|
33 |
public:
|
33 |
public:
|
34 |
Utf8Iter(const std::string &in)
|
34 |
Utf8Iter(const std::string &in)
|
35 |
: m_s(in), m_cl(0), m_pos(0), m_charpos(0)
|
35 |
: m_sp(&in), m_cl(0), m_pos(0), m_charpos(0)
|
36 |
{
|
36 |
{
|
37 |
update_cl();
|
37 |
update_cl();
|
38 |
}
|
38 |
}
|
39 |
|
39 |
|
40 |
const std::string& buffer() const {return m_s;}
|
40 |
const std::string& buffer() const {return (*m_sp);}
|
41 |
|
41 |
|
42 |
void rewind()
|
42 |
void rewind()
|
43 |
{
|
43 |
{
|
44 |
m_cl = 0;
|
44 |
m_cl = 0;
|
45 |
m_pos = 0;
|
45 |
m_pos = 0;
|
|
... |
|
... |
57 |
if (charpos >= m_charpos) {
|
57 |
if (charpos >= m_charpos) {
|
58 |
mypos = m_pos;
|
58 |
mypos = m_pos;
|
59 |
mycp = m_charpos;
|
59 |
mycp = m_charpos;
|
60 |
}
|
60 |
}
|
61 |
int l;
|
61 |
int l;
|
62 |
while (mypos < m_s.length() && mycp != charpos) {
|
62 |
while (mypos < m_sp->length() && mycp != charpos) {
|
63 |
l = get_cl(mypos);
|
63 |
l = get_cl(mypos);
|
64 |
if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
|
64 |
if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
|
65 |
return (unsigned int)-1;
|
65 |
return (unsigned int)-1;
|
66 |
mypos += l;
|
66 |
mypos += l;
|
67 |
++mycp;
|
67 |
++mycp;
|
68 |
}
|
68 |
}
|
69 |
if (mypos < m_s.length() && mycp == charpos) {
|
69 |
if (mypos < m_sp->length() && mycp == charpos) {
|
70 |
l = get_cl(mypos);
|
70 |
l = get_cl(mypos);
|
71 |
if (poslok(mypos, l) && checkvalidat(mypos, l))
|
71 |
if (poslok(mypos, l) && checkvalidat(mypos, l))
|
72 |
return getvalueat(mypos, l);
|
72 |
return getvalueat(mypos, l);
|
73 |
}
|
73 |
}
|
74 |
return (unsigned int)-1;
|
74 |
return (unsigned int)-1;
|
|
... |
|
... |
104 |
This needs to be fast. No error checking. */
|
104 |
This needs to be fast. No error checking. */
|
105 |
unsigned int appendchartostring(std::string &out) const {
|
105 |
unsigned int appendchartostring(std::string &out) const {
|
106 |
#ifdef UTF8ITER_CHECK
|
106 |
#ifdef UTF8ITER_CHECK
|
107 |
assert(m_cl != 0);
|
107 |
assert(m_cl != 0);
|
108 |
#endif
|
108 |
#endif
|
109 |
out.append(&m_s[m_pos], m_cl);
|
109 |
out.append(&(*m_sp)[m_pos], m_cl);
|
110 |
return m_cl;
|
110 |
return m_cl;
|
111 |
}
|
111 |
}
|
112 |
|
112 |
|
113 |
/** Return current character as string */
|
113 |
/** Return current character as string */
|
114 |
operator std::string() {
|
114 |
operator std::string() {
|
115 |
#ifdef UTF8ITER_CHECK
|
115 |
#ifdef UTF8ITER_CHECK
|
116 |
assert(m_cl != 0);
|
116 |
assert(m_cl != 0);
|
117 |
#endif
|
117 |
#endif
|
118 |
return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
|
118 |
return m_cl > 0 ? m_sp->substr(m_pos, m_cl) : std::string();
|
119 |
}
|
119 |
}
|
120 |
|
120 |
|
121 |
bool eof() const {
|
121 |
bool eof() const {
|
122 |
return m_pos == m_s.length();
|
122 |
return m_pos == m_sp->length();
|
123 |
}
|
123 |
}
|
124 |
|
124 |
|
125 |
bool error() const {
|
125 |
bool error() const {
|
126 |
return m_cl == 0;
|
126 |
return m_cl == 0;
|
127 |
}
|
127 |
}
|
|
... |
|
... |
141 |
return m_charpos;
|
141 |
return m_charpos;
|
142 |
}
|
142 |
}
|
143 |
|
143 |
|
144 |
private:
|
144 |
private:
|
145 |
// String we're working with
|
145 |
// String we're working with
|
146 |
const std::string& m_s;
|
146 |
const std::string* m_sp;
|
147 |
// Character length at current position. A value of zero indicates
|
147 |
// Character length at current position. A value of zero indicates
|
148 |
// an error.
|
148 |
// an error.
|
149 |
unsigned int m_cl;
|
149 |
unsigned int m_cl;
|
150 |
// Current byte offset in string.
|
150 |
// Current byte offset in string.
|
151 |
std::string::size_type m_pos;
|
151 |
std::string::size_type m_pos;
|
|
... |
|
... |
153 |
unsigned int m_charpos;
|
153 |
unsigned int m_charpos;
|
154 |
|
154 |
|
155 |
// Check position and cl against string length
|
155 |
// Check position and cl against string length
|
156 |
bool poslok(std::string::size_type p, int l) const {
|
156 |
bool poslok(std::string::size_type p, int l) const {
|
157 |
#ifdef UTF8ITER_CHECK
|
157 |
#ifdef UTF8ITER_CHECK
|
158 |
assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
|
158 |
assert(p != std::string::npos && l > 0 && p + l <= m_sp->length());
|
159 |
#endif
|
159 |
#endif
|
160 |
return p != std::string::npos && l > 0 && p + l <= m_s.length();
|
160 |
return p != std::string::npos && l > 0 && p + l <= m_sp->length();
|
161 |
}
|
161 |
}
|
162 |
|
162 |
|
163 |
// Update current char length in object state, check
|
163 |
// Update current char length in object state, check
|
164 |
// for errors
|
164 |
// for errors
|
165 |
inline void update_cl()
|
165 |
inline void update_cl()
|
166 |
{
|
166 |
{
|
167 |
m_cl = 0;
|
167 |
m_cl = 0;
|
168 |
if (m_pos >= m_s.length())
|
168 |
if (m_pos >= m_sp->length())
|
169 |
return;
|
169 |
return;
|
170 |
m_cl = get_cl(m_pos);
|
170 |
m_cl = get_cl(m_pos);
|
171 |
if (!poslok(m_pos, m_cl)) {
|
171 |
if (!poslok(m_pos, m_cl)) {
|
172 |
// Used to set eof here for safety, but this is bad because it
|
172 |
// Used to set eof here for safety, but this is bad because it
|
173 |
// basically prevents the caller to discriminate error and eof.
|
173 |
// basically prevents the caller to discriminate error and eof.
|
174 |
// m_pos = m_s.length();
|
174 |
// m_pos = m_sp->length();
|
175 |
m_cl = 0;
|
175 |
m_cl = 0;
|
176 |
return;
|
176 |
return;
|
177 |
}
|
177 |
}
|
178 |
if (!checkvalidat(m_pos, m_cl)) {
|
178 |
if (!checkvalidat(m_pos, m_cl)) {
|
179 |
m_cl = 0;
|
179 |
m_cl = 0;
|
|
... |
|
... |
182 |
|
182 |
|
183 |
inline bool checkvalidat(std::string::size_type p, int l) const
|
183 |
inline bool checkvalidat(std::string::size_type p, int l) const
|
184 |
{
|
184 |
{
|
185 |
switch (l) {
|
185 |
switch (l) {
|
186 |
case 1:
|
186 |
case 1:
|
187 |
return (unsigned char)m_s[p] < 128;
|
187 |
return (unsigned char)(*m_sp)[p] < 128;
|
188 |
case 2:
|
188 |
case 2:
|
189 |
return (((unsigned char)m_s[p]) & 224) == 192
|
189 |
return (((unsigned char)(*m_sp)[p]) & 224) == 192
|
190 |
&& (((unsigned char)m_s[p+1]) & 192) == 128;
|
190 |
&& (((unsigned char)(*m_sp)[p+1]) & 192) == 128;
|
191 |
case 3:
|
191 |
case 3:
|
192 |
return (((unsigned char)m_s[p]) & 240) == 224
|
192 |
return (((unsigned char)(*m_sp)[p]) & 240) == 224
|
193 |
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
193 |
&& (((unsigned char)(*m_sp)[p+1]) & 192) == 128
|
194 |
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
194 |
&& (((unsigned char)(*m_sp)[p+2]) & 192) == 128
|
195 |
;
|
195 |
;
|
196 |
case 4:
|
196 |
case 4:
|
197 |
return (((unsigned char)m_s[p]) & 248) == 240
|
197 |
return (((unsigned char)(*m_sp)[p]) & 248) == 240
|
198 |
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
198 |
&& (((unsigned char)(*m_sp)[p+1]) & 192) == 128
|
199 |
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
199 |
&& (((unsigned char)(*m_sp)[p+2]) & 192) == 128
|
200 |
&& (((unsigned char)m_s[p+3]) & 192) == 128
|
200 |
&& (((unsigned char)(*m_sp)[p+3]) & 192) == 128
|
201 |
;
|
201 |
;
|
202 |
default:
|
202 |
default:
|
203 |
return false;
|
203 |
return false;
|
204 |
}
|
204 |
}
|
205 |
}
|
205 |
}
|
206 |
|
206 |
|
207 |
// Get character byte length at specified position. Returns 0 for error.
|
207 |
// Get character byte length at specified position. Returns 0 for error.
|
208 |
inline int get_cl(std::string::size_type p) const
|
208 |
inline int get_cl(std::string::size_type p) const
|
209 |
{
|
209 |
{
|
210 |
unsigned int z = (unsigned char)m_s[p];
|
210 |
unsigned int z = (unsigned char)(*m_sp)[p];
|
211 |
if (z <= 127) {
|
211 |
if (z <= 127) {
|
212 |
return 1;
|
212 |
return 1;
|
213 |
} else if ((z & 224) == 192) {
|
213 |
} else if ((z & 224) == 192) {
|
214 |
return 2;
|
214 |
return 2;
|
215 |
} else if ((z & 240) == 224) {
|
215 |
} else if ((z & 240) == 224) {
|
|
... |
|
... |
228 |
inline unsigned int getvalueat(std::string::size_type p, int l) const
|
228 |
inline unsigned int getvalueat(std::string::size_type p, int l) const
|
229 |
{
|
229 |
{
|
230 |
switch (l) {
|
230 |
switch (l) {
|
231 |
case 1:
|
231 |
case 1:
|
232 |
#ifdef UTF8ITER_CHECK
|
232 |
#ifdef UTF8ITER_CHECK
|
233 |
assert((unsigned char)m_s[p] < 128);
|
233 |
assert((unsigned char)(*m_sp)[p] < 128);
|
234 |
#endif
|
234 |
#endif
|
235 |
return (unsigned char)m_s[p];
|
235 |
return (unsigned char)(*m_sp)[p];
|
236 |
case 2:
|
236 |
case 2:
|
237 |
#ifdef UTF8ITER_CHECK
|
237 |
#ifdef UTF8ITER_CHECK
|
238 |
assert(
|
238 |
assert(
|
239 |
((unsigned char)m_s[p] & 224) == 192
|
239 |
((unsigned char)(*m_sp)[p] & 224) == 192
|
240 |
&& ((unsigned char)m_s[p+1] & 192) == 128
|
240 |
&& ((unsigned char)(*m_sp)[p+1] & 192) == 128
|
241 |
);
|
241 |
);
|
242 |
#endif
|
242 |
#endif
|
243 |
return ((unsigned char)m_s[p] - 192) * 64 +
|
243 |
return ((unsigned char)(*m_sp)[p] - 192) * 64 +
|
244 |
(unsigned char)m_s[p+1] - 128 ;
|
244 |
(unsigned char)(*m_sp)[p+1] - 128 ;
|
245 |
case 3:
|
245 |
case 3:
|
246 |
#ifdef UTF8ITER_CHECK
|
246 |
#ifdef UTF8ITER_CHECK
|
247 |
assert(
|
247 |
assert(
|
248 |
(((unsigned char)m_s[p]) & 240) == 224
|
248 |
(((unsigned char)(*m_sp)[p]) & 240) == 224
|
249 |
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
249 |
&& (((unsigned char)(*m_sp)[p+1]) & 192) == 128
|
250 |
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
250 |
&& (((unsigned char)(*m_sp)[p+2]) & 192) == 128
|
251 |
);
|
251 |
);
|
252 |
#endif
|
252 |
#endif
|
253 |
|
253 |
|
254 |
return ((unsigned char)m_s[p] - 224) * 4096 +
|
254 |
return ((unsigned char)(*m_sp)[p] - 224) * 4096 +
|
255 |
((unsigned char)m_s[p+1] - 128) * 64 +
|
255 |
((unsigned char)(*m_sp)[p+1] - 128) * 64 +
|
256 |
(unsigned char)m_s[p+2] - 128;
|
256 |
(unsigned char)(*m_sp)[p+2] - 128;
|
257 |
case 4:
|
257 |
case 4:
|
258 |
#ifdef UTF8ITER_CHECK
|
258 |
#ifdef UTF8ITER_CHECK
|
259 |
assert(
|
259 |
assert(
|
260 |
(((unsigned char)m_s[p]) & 248) == 240
|
260 |
(((unsigned char)(*m_sp)[p]) & 248) == 240
|
261 |
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
261 |
&& (((unsigned char)(*m_sp)[p+1]) & 192) == 128
|
262 |
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
262 |
&& (((unsigned char)(*m_sp)[p+2]) & 192) == 128
|
263 |
&& (((unsigned char)m_s[p+3]) & 192) == 128
|
263 |
&& (((unsigned char)(*m_sp)[p+3]) & 192) == 128
|
264 |
);
|
264 |
);
|
265 |
#endif
|
265 |
#endif
|
266 |
|
266 |
|
267 |
return ((unsigned char)m_s[p]-240)*262144 +
|
267 |
return ((unsigned char)(*m_sp)[p]-240)*262144 +
|
268 |
((unsigned char)m_s[p+1]-128)*4096 +
|
268 |
((unsigned char)(*m_sp)[p+1]-128)*4096 +
|
269 |
((unsigned char)m_s[p+2]-128)*64 +
|
269 |
((unsigned char)(*m_sp)[p+2]-128)*64 +
|
270 |
(unsigned char)m_s[p+3]-128;
|
270 |
(unsigned char)(*m_sp)[p+3]-128;
|
271 |
|
271 |
|
272 |
default:
|
272 |
default:
|
273 |
#ifdef UTF8ITER_CHECK
|
273 |
#ifdef UTF8ITER_CHECK
|
274 |
assert(l <= 4);
|
274 |
assert(l <= 4);
|
275 |
#endif
|
275 |
#endif
|