Parent: [ae6ce2] (diff)

Child: [3872f8] (diff)

Download this file

utf8iter.h    164 lines (156 with data), 4.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_
/* @(#$Id: utf8iter.h,v 1.5 2006-01-28 10:23:55 dockes Exp $ (C) 2004 J.F.Dockes */
/**
* A small helper class to iterate over utf8 strings. This is not an
* STL iterator and this is not well designed, just convenient for
some specific uses
*/
class Utf8Iter {
unsigned int cl; // Char length at current position if known
const string &s; // String we're working with
string::size_type pos; // Current position in string
bool bad; // Status
unsigned int m_charpos; // Current character posiiton
// Get character byte length at specified position
inline int get_cl(string::size_type p) const {
unsigned int z = (unsigned char)s[p];
if (z <= 127) {
return 1;
} else if (z>=192 && z <= 223) {
return 2;
} else if (z >= 224 && z <= 239) {
return 3;
} else if (z >= 240 && z <= 247) {
return 4;
} else if (z >= 248 && z <= 251) {
return 5;
} else if (z >= 252 && z <= 253) {
return 6;
}
return -1;
}
// Check position and cl against string length
bool poslok(string::size_type p, int l) const {
return p != string::npos && l > 0 && p + l <= s.length();
}
// Update current char length in object state. Assumes pos is inside string
inline int compute_cl() {
cl = 0;
if (bad)
return -1;
cl = get_cl(pos);
if (!poslok(pos, cl)) {
bad = true;
pos = s.length();
cl = 0;
return -1;
}
return 0;
}
// Compute value at given position
inline unsigned int getvalueat(string::size_type p, int l) const {
switch (l) {
case 1: return (unsigned char)s[p];
case 2: return ((unsigned char)s[p] - 192) * 64 +
(unsigned char)s[p+1] - 128 ;
case 3: return ((unsigned char)s[p]-224)*4096 +
((unsigned char)s[p+1]-128)*64 +
(unsigned char)s[p+2]-128;
case 4: return ((unsigned char)s[p]-240)*262144 +
((unsigned char)s[p+1]-128)*4096 +
((unsigned char)s[p+2]-128)*64 +
(unsigned char)s[p+3]-128;
case 5: return ((unsigned char)s[p]-248)*16777216 +
((unsigned char)s[p+1]-128)*262144 +
((unsigned char)s[p+2]-128)*4096 +
((unsigned char)s[p+3]-128)*64 +
(unsigned char)s[p+4]-128;
case 6: return ((unsigned char)s[p]-252)*1073741824 +
((unsigned char)s[p+1]-128)*16777216 +
((unsigned char)s[p+2]-128)*262144 +
((unsigned char)s[p+3]-128)*4096 +
((unsigned char)s[p+4]-128)*64 +
(unsigned char)s[p+5]-128;
default:
return (unsigned int)-1;
}
}
public:
Utf8Iter(const string &in)
: cl(0), s(in), pos(0), bad(false), m_charpos(0) {}
void rewind() {
cl=0; pos=0; bad=false; m_charpos=0;
}
/** operator* returns the ucs4 value as a machine integer*/
unsigned int operator*() {
if (!cl && compute_cl() < 0)
return (unsigned int)-1;
unsigned int val = getvalueat(pos, cl);
if (val == (unsigned int)-1) {
bad = true;
pos = s.length();
cl = 0;
}
return val;
}
/** "Direct" access. Awfully inefficient as we skip from start or current
* position at best. This can only be useful for a lookahead from the
* current position */
unsigned int operator[](unsigned int charpos) const {
string::size_type mypos = 0;
unsigned int mycp = 0;;
if (charpos >= m_charpos) {
mypos = pos;
mycp = m_charpos;
}
while (mypos < s.length() && mycp != charpos) {
mypos += get_cl(mypos);
++mycp;
}
if (mypos < s.length() && mycp == charpos) {
int l = get_cl(mypos);
if (poslok(mypos, l))
return getvalueat(mypos, get_cl(mypos));
}
return (unsigned int)-1;
}
/** Set current position before next utf-8 character */
string::size_type operator++(int) {
if (bad || (!cl && compute_cl() < 0)) {
return pos = string::npos;
}
pos += cl;
m_charpos++;
cl = 0;
return pos;
}
bool appendchartostring(string &out) {
if (bad || (!cl && compute_cl() < 0)) {
return false;
}
out += s.substr(pos, cl);
return true;
}
operator string() {
if (bad || (!cl && compute_cl() < 0)) {
return std::string("");
}
return s.substr(pos, cl);
}
bool eof() {
// Note: we always ensure that pos == s.length() when setting bad to
// true
return pos == s.length();
}
bool error() {
return bad;
}
string::size_type getBpos() const {
return pos;
}
string::size_type getCpos() const {
return m_charpos;
}
};
#endif /* _UTF8ITER_H_INCLUDED_ */