|
a/src/utils/utf8iter.h |
|
b/src/utils/utf8iter.h |
|
... |
|
... |
14 |
* Free Software Foundation, Inc.,
|
14 |
* Free Software Foundation, Inc.,
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
15 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
*/
|
16 |
*/
|
17 |
#ifndef _UTF8ITER_H_INCLUDED_
|
17 |
#ifndef _UTF8ITER_H_INCLUDED_
|
18 |
#define _UTF8ITER_H_INCLUDED_
|
18 |
#define _UTF8ITER_H_INCLUDED_
|
19 |
/* @(#$Id: utf8iter.h,v 1.6 2006-01-30 11:15:28 dockes Exp $ (C) 2004 J.F.Dockes */
|
19 |
/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $ (C) 2004 J.F.Dockes */
|
20 |
|
20 |
|
21 |
/**
|
21 |
/**
|
22 |
* A small helper class to iterate over utf8 strings. This is not an
|
22 |
* A small helper class to iterate over utf8 strings. This is not an
|
23 |
* STL iterator and this is not well designed, just convenient for
|
23 |
* STL iterator and this is not well designed, just convenient for
|
24 |
some specific uses
|
24 |
some specific uses
|
25 |
*/
|
25 |
*/
|
26 |
class Utf8Iter {
|
26 |
class Utf8Iter {
|
27 |
unsigned int cl; // Char length at current position if known
|
27 |
unsigned int cl; // Char length at current position if known
|
28 |
const string &s; // String we're working with
|
28 |
const string &s; // String we're working with
|
29 |
string::size_type pos; // Current position in string
|
29 |
string::size_type pos; // Current position in string
|
30 |
bool bad; // Status
|
|
|
31 |
unsigned int m_charpos; // Current character position
|
30 |
unsigned int m_charpos; // Current character position
|
32 |
|
31 |
|
33 |
// Get character byte length at specified position
|
32 |
// Get character byte length at specified position
|
34 |
inline int get_cl(string::size_type p) const {
|
33 |
inline int get_cl(string::size_type p) const {
|
35 |
unsigned int z = (unsigned char)s[p];
|
34 |
unsigned int z = (unsigned char)s[p];
|
|
... |
|
... |
53 |
return p != string::npos && l > 0 && p + l <= s.length();
|
52 |
return p != string::npos && l > 0 && p + l <= s.length();
|
54 |
}
|
53 |
}
|
55 |
// Update current char length in object state. Assumes pos is inside string
|
54 |
// Update current char length in object state. Assumes pos is inside string
|
56 |
inline int compute_cl() {
|
55 |
inline int compute_cl() {
|
57 |
cl = 0;
|
56 |
cl = 0;
|
58 |
if (bad)
|
|
|
59 |
return -1;
|
|
|
60 |
cl = get_cl(pos);
|
57 |
cl = get_cl(pos);
|
61 |
if (!poslok(pos, cl)) {
|
58 |
if (!poslok(pos, cl)) {
|
62 |
bad = true;
|
|
|
63 |
pos = s.length();
|
59 |
pos = s.length();
|
64 |
cl = 0;
|
60 |
cl = 0;
|
65 |
return -1;
|
61 |
return -1;
|
66 |
}
|
62 |
}
|
67 |
return 0;
|
63 |
return 0;
|
|
... |
|
... |
94 |
return (unsigned int)-1;
|
90 |
return (unsigned int)-1;
|
95 |
}
|
91 |
}
|
96 |
}
|
92 |
}
|
97 |
public:
|
93 |
public:
|
98 |
Utf8Iter(const string &in)
|
94 |
Utf8Iter(const string &in)
|
99 |
: cl(0), s(in), pos(0), bad(false), m_charpos(0) {}
|
95 |
: cl(0), s(in), pos(0), m_charpos(0)
|
|
|
96 |
{
|
|
|
97 |
// Ensure state is ok if appendchartostring is called at once
|
|
|
98 |
compute_cl();
|
|
|
99 |
}
|
100 |
|
100 |
|
101 |
void rewind() {
|
101 |
void rewind() {
|
102 |
cl=0; pos=0; bad=false; m_charpos=0;
|
102 |
cl=0; pos=0; m_charpos=0;
|
103 |
}
|
103 |
}
|
104 |
/** operator* returns the ucs4 value as a machine integer*/
|
104 |
/** operator* returns the ucs4 value as a machine integer*/
|
105 |
unsigned int operator*() {
|
105 |
unsigned int operator*() {
|
106 |
if (!cl && compute_cl() < 0)
|
106 |
if (!cl && compute_cl() < 0)
|
107 |
return (unsigned int)-1;
|
107 |
return (unsigned int)-1;
|
108 |
unsigned int val = getvalueat(pos, cl);
|
108 |
unsigned int val = getvalueat(pos, cl);
|
109 |
if (val == (unsigned int)-1) {
|
109 |
if (val == (unsigned int)-1) {
|
110 |
bad = true;
|
|
|
111 |
pos = s.length();
|
110 |
pos = s.length();
|
112 |
cl = 0;
|
111 |
cl = 0;
|
113 |
}
|
112 |
}
|
114 |
return val;
|
113 |
return val;
|
115 |
}
|
114 |
}
|
|
... |
|
... |
135 |
return (unsigned int)-1;
|
134 |
return (unsigned int)-1;
|
136 |
}
|
135 |
}
|
137 |
|
136 |
|
138 |
/** Set current position before next utf-8 character */
|
137 |
/** Set current position before next utf-8 character */
|
139 |
string::size_type operator++(int) {
|
138 |
string::size_type operator++(int) {
|
140 |
if (bad || (!cl && compute_cl() < 0)) {
|
139 |
if (!cl && compute_cl() < 0) {
|
141 |
return pos = string::npos;
|
140 |
return pos = string::npos;
|
142 |
}
|
141 |
}
|
143 |
pos += cl;
|
142 |
pos += cl;
|
144 |
m_charpos++;
|
143 |
m_charpos++;
|
145 |
cl = 0;
|
144 |
cl = 0;
|
146 |
return pos;
|
145 |
return pos;
|
147 |
}
|
146 |
}
|
148 |
|
147 |
/** This needs to be fast. No error checking. */
|
149 |
bool appendchartostring(string &out) {
|
148 |
void appendchartostring(string &out) {
|
150 |
if (bad || (!cl && compute_cl() < 0)) {
|
149 |
out.append(&s[pos], cl);
|
151 |
return false;
|
|
|
152 |
}
|
|
|
153 |
out += s.substr(pos, cl);
|
|
|
154 |
return true;
|
|
|
155 |
}
|
150 |
}
|
156 |
operator string() {
|
151 |
operator string() {
|
157 |
if (bad || (!cl && compute_cl() < 0)) {
|
152 |
if (!cl && compute_cl() < 0) {
|
158 |
return std::string("");
|
153 |
return std::string("");
|
159 |
}
|
154 |
}
|
160 |
return s.substr(pos, cl);
|
155 |
return s.substr(pos, cl);
|
161 |
}
|
156 |
}
|
162 |
bool eof() {
|
157 |
bool eof() {
|
163 |
// Note: we always ensure that pos == s.length() when setting bad to
|
158 |
// Note: we always ensure that pos == s.length() when setting bad to
|
164 |
// true
|
159 |
// true
|
165 |
return pos == s.length();
|
160 |
return pos == s.length();
|
166 |
}
|
161 |
}
|
167 |
bool error() {
|
162 |
bool error() {
|
168 |
return bad;
|
163 |
return compute_cl() < 0;
|
169 |
}
|
164 |
}
|
170 |
string::size_type getBpos() const {
|
165 |
string::size_type getBpos() const {
|
171 |
return pos;
|
166 |
return pos;
|
172 |
}
|
167 |
}
|
173 |
string::size_type getCpos() const {
|
168 |
string::size_type getCpos() const {
|