|
a/src/utils/utf8iter.h |
|
b/src/utils/utf8iter.h |
|
... |
|
... |
30 |
* a risk of crash.
|
30 |
* a risk of crash.
|
31 |
*/
|
31 |
*/
|
32 |
class Utf8Iter {
|
32 |
class Utf8Iter {
|
33 |
public:
|
33 |
public:
|
34 |
Utf8Iter(const std::string &in)
|
34 |
Utf8Iter(const std::string &in)
|
35 |
: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
|
35 |
: m_s(in), m_cl(0), m_pos(0), m_charpos(0)
|
36 |
{
|
36 |
{
|
37 |
update_cl();
|
37 |
update_cl();
|
38 |
}
|
38 |
}
|
39 |
|
39 |
|
40 |
const std::string& buffer() const {return m_s;}
|
40 |
const std::string& buffer() const {return m_s;}
|
|
... |
|
... |
42 |
void rewind()
|
42 |
void rewind()
|
43 |
{
|
43 |
{
|
44 |
m_cl = 0;
|
44 |
m_cl = 0;
|
45 |
m_pos = 0;
|
45 |
m_pos = 0;
|
46 |
m_charpos = 0;
|
46 |
m_charpos = 0;
|
47 |
m_error = false;
|
|
|
48 |
update_cl();
|
47 |
update_cl();
|
49 |
}
|
48 |
}
|
50 |
|
49 |
|
51 |
/** "Direct" access. Awfully inefficient as we skip from start or current
|
50 |
/** "Direct" access. Awfully inefficient as we skip from start or current
|
52 |
* position at best. This can only be useful for a lookahead from the
|
51 |
* position at best. This can only be useful for a lookahead from the
|
|
... |
|
... |
60 |
mycp = m_charpos;
|
59 |
mycp = m_charpos;
|
61 |
}
|
60 |
}
|
62 |
int l;
|
61 |
int l;
|
63 |
while (mypos < m_s.length() && mycp != charpos) {
|
62 |
while (mypos < m_s.length() && mycp != charpos) {
|
64 |
l = get_cl(mypos);
|
63 |
l = get_cl(mypos);
|
65 |
if (l <= 0)
|
64 |
if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
|
66 |
return (unsigned int)-1;
|
65 |
return (unsigned int)-1;
|
67 |
mypos += l;
|
66 |
mypos += l;
|
68 |
++mycp;
|
67 |
++mycp;
|
69 |
}
|
68 |
}
|
70 |
if (mypos < m_s.length() && mycp == charpos) {
|
69 |
if (mypos < m_s.length() && mycp == charpos) {
|
71 |
l = get_cl(mypos);
|
70 |
l = get_cl(mypos);
|
72 |
if (poslok(mypos, l))
|
71 |
if (poslok(mypos, l) && checkvalidat(mypos, l))
|
73 |
return getvalueat(mypos, get_cl(mypos));
|
72 |
return getvalueat(mypos, l);
|
74 |
}
|
73 |
}
|
75 |
return (unsigned int)-1;
|
74 |
return (unsigned int)-1;
|
76 |
}
|
75 |
}
|
77 |
|
76 |
|
78 |
/** Increment current position to next utf-8 char */
|
77 |
/** Increment current position to next utf-8 char */
|
|
... |
|
... |
81 |
// Note: m_cl may be zero at eof if user's test not right
|
80 |
// Note: m_cl may be zero at eof if user's test not right
|
82 |
// this shouldn't crash the program until actual data access
|
81 |
// this shouldn't crash the program until actual data access
|
83 |
#ifdef UTF8ITER_CHECK
|
82 |
#ifdef UTF8ITER_CHECK
|
84 |
assert(m_cl != 0);
|
83 |
assert(m_cl != 0);
|
85 |
#endif
|
84 |
#endif
|
86 |
if (m_cl <= 0)
|
85 |
if (m_cl == 0)
|
87 |
return std::string::npos;
|
86 |
return std::string::npos;
|
88 |
|
87 |
|
89 |
m_pos += m_cl;
|
88 |
m_pos += m_cl;
|
90 |
m_charpos++;
|
89 |
m_charpos++;
|
91 |
update_cl();
|
90 |
update_cl();
|
|
... |
|
... |
94 |
|
93 |
|
95 |
/** operator* returns the ucs4 value as a machine integer*/
|
94 |
/** operator* returns the ucs4 value as a machine integer*/
|
96 |
unsigned int operator*()
|
95 |
unsigned int operator*()
|
97 |
{
|
96 |
{
|
98 |
#ifdef UTF8ITER_CHECK
|
97 |
#ifdef UTF8ITER_CHECK
|
99 |
assert(m_cl != 0);
|
98 |
assert(m_cl > 0);
|
100 |
#endif
|
99 |
#endif
|
101 |
return getvalueat(m_pos, m_cl);
|
100 |
return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
|
102 |
}
|
101 |
}
|
103 |
|
102 |
|
104 |
/** Append current utf-8 possibly multi-byte character to string param.
|
103 |
/** Append current utf-8 possibly multi-byte character to string param.
|
105 |
This needs to be fast. No error checking. */
|
104 |
This needs to be fast. No error checking. */
|
106 |
unsigned int appendchartostring(std::string &out) const {
|
105 |
unsigned int appendchartostring(std::string &out) const {
|
|
... |
|
... |
114 |
/** Return current character as string */
|
113 |
/** Return current character as string */
|
115 |
operator std::string() {
|
114 |
operator std::string() {
|
116 |
#ifdef UTF8ITER_CHECK
|
115 |
#ifdef UTF8ITER_CHECK
|
117 |
assert(m_cl != 0);
|
116 |
assert(m_cl != 0);
|
118 |
#endif
|
117 |
#endif
|
119 |
return m_s.substr(m_pos, m_cl);
|
118 |
return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
|
120 |
}
|
119 |
}
|
121 |
|
120 |
|
122 |
bool eof() {
|
121 |
bool eof() const {
|
123 |
return m_pos == m_s.length();
|
122 |
return m_pos == m_s.length();
|
124 |
}
|
123 |
}
|
125 |
|
124 |
|
126 |
bool error() {
|
125 |
bool error() const {
|
127 |
return m_error;
|
126 |
return m_cl == 0;
|
128 |
}
|
127 |
}
|
129 |
|
128 |
|
130 |
/** Return current byte offset in input string */
|
129 |
/** Return current byte offset in input string */
|
131 |
std::string::size_type getBpos() const {
|
130 |
std::string::size_type getBpos() const {
|
132 |
return m_pos;
|
131 |
return m_pos;
|
|
... |
|
... |
145 |
private:
|
144 |
private:
|
146 |
// String we're working with
|
145 |
// String we're working with
|
147 |
const std::string& m_s;
|
146 |
const std::string& m_s;
|
148 |
// Character length at current position. A value of zero indicates
|
147 |
// Character length at current position. A value of zero indicates
|
149 |
// an error.
|
148 |
// an error.
|
150 |
unsigned int m_cl;
|
149 |
unsigned int m_cl;
|
151 |
// Current byte offset in string.
|
150 |
// Current byte offset in string.
|
152 |
std::string::size_type m_pos;
|
151 |
std::string::size_type m_pos;
|
153 |
// Current character position
|
152 |
// Current character position
|
154 |
unsigned int m_charpos;
|
153 |
unsigned int m_charpos;
|
155 |
// Am I ok ?
|
|
|
156 |
mutable bool m_error;
|
|
|
157 |
|
154 |
|
158 |
// Check position and cl against string length
|
155 |
// Check position and cl against string length
|
159 |
bool poslok(std::string::size_type p, int l) const {
|
156 |
bool poslok(std::string::size_type p, int l) const {
|
160 |
#ifdef UTF8ITER_CHECK
|
157 |
#ifdef UTF8ITER_CHECK
|
161 |
assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
|
158 |
assert(p != std::string::npos && l > 0 && p + l <= m_s.length());
|
162 |
#endif
|
159 |
#endif
|
163 |
return p != std::string::npos && l > 0 && p + l <= m_s.length();
|
160 |
return p != std::string::npos && l > 0 && p + l <= m_s.length();
|
164 |
}
|
161 |
}
|
165 |
|
162 |
|
166 |
// Update current char length in object state, minimum checking
|
163 |
// Update current char length in object state, check
|
167 |
// for errors
|
164 |
// for errors
|
168 |
inline void update_cl()
|
165 |
inline void update_cl()
|
169 |
{
|
166 |
{
|
170 |
m_cl = 0;
|
167 |
m_cl = 0;
|
171 |
if (m_pos >= m_s.length())
|
168 |
if (m_pos >= m_s.length())
|
|
... |
|
... |
174 |
if (!poslok(m_pos, m_cl)) {
|
171 |
if (!poslok(m_pos, m_cl)) {
|
175 |
// Used to set eof here for safety, but this is bad because it
|
172 |
// Used to set eof here for safety, but this is bad because it
|
176 |
// basically prevents the caller to discriminate error and eof.
|
173 |
// basically prevents the caller to discriminate error and eof.
|
177 |
// m_pos = m_s.length();
|
174 |
// m_pos = m_s.length();
|
178 |
m_cl = 0;
|
175 |
m_cl = 0;
|
179 |
m_error = true;
|
176 |
return;
|
|
|
177 |
}
|
|
|
178 |
if (!checkvalidat(m_pos, m_cl)) {
|
|
|
179 |
m_cl = 0;
|
|
|
180 |
}
|
|
|
181 |
}
|
|
|
182 |
|
|
|
183 |
inline bool checkvalidat(std::string::size_type p, int l) const
|
|
|
184 |
{
|
|
|
185 |
switch (l) {
|
|
|
186 |
case 1:
|
|
|
187 |
return (unsigned char)m_s[p] < 128;
|
|
|
188 |
case 2:
|
|
|
189 |
return (((unsigned char)m_s[p]) & 224) == 192
|
|
|
190 |
&& (((unsigned char)m_s[p+1]) & 192) == 128;
|
|
|
191 |
case 3:
|
|
|
192 |
return (((unsigned char)m_s[p]) & 240) == 224
|
|
|
193 |
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
|
|
194 |
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
|
|
195 |
;
|
|
|
196 |
case 4:
|
|
|
197 |
return (((unsigned char)m_s[p]) & 248) == 240
|
|
|
198 |
&& (((unsigned char)m_s[p+1]) & 192) == 128
|
|
|
199 |
&& (((unsigned char)m_s[p+2]) & 192) == 128
|
|
|
200 |
&& (((unsigned char)m_s[p+3]) & 192) == 128
|
|
|
201 |
;
|
|
|
202 |
default:
|
|
|
203 |
return false;
|
180 |
}
|
204 |
}
|
181 |
}
|
205 |
}
|
182 |
|
206 |
|
183 |
// Get character byte length at specified position. Returns 0 for error.
|
207 |
// Get character byte length at specified position. Returns 0 for error.
|
184 |
inline int get_cl(std::string::size_type p) const
|
208 |
inline int get_cl(std::string::size_type p) const
|
|
... |
|
... |
247 |
|
271 |
|
248 |
default:
|
272 |
default:
|
249 |
#ifdef UTF8ITER_CHECK
|
273 |
#ifdef UTF8ITER_CHECK
|
250 |
assert(l <= 4);
|
274 |
assert(l <= 4);
|
251 |
#endif
|
275 |
#endif
|
252 |
m_error = true;
|
|
|
253 |
return (unsigned int)-1;
|
276 |
return (unsigned int)-1;
|
254 |
}
|
277 |
}
|
255 |
}
|
278 |
}
|
256 |
|
279 |
|
257 |
};
|
280 |
};
|