#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_
/* @(#$Id: utf8iter.h,v 1.3 2005-03-17 14:02:06 dockes Exp $ (C) 2004 J.F.Dockes */
/**
 * A small helper class to iterate over UTF-8 strings. This is not an
 * STL iterator, and it is not well designed: it is just convenient for
 * some specific uses.
 */
class Utf8Iter {
    // Usage notes:
    //  - The iterator keeps a reference to the input string, it does not
    //    copy it: the string must outlive the iterator and must not be
    //    modified while iterating.
    //  - Lead bytes announcing 1 to 6 byte sequences are accepted (the
    //    original UTF-8 definition). Continuation bytes are checked to be
    //    of the 10xxxxxx form. Overlong forms and surrogates are not
    //    rejected.
    //  - Any decoding problem, including an attempt to read the character
    //    at the end of the string, sets the error status. It stays set
    //    until rewind() is called.
    //  - <string> must have been included before this header.

    unsigned int cl;             // Byte length of char at pos, 0 if unknown
    const std::string &s;        // String we're working with (not owned)
    std::string::size_type pos;  // Current byte position in string
    bool bad;                    // Error status
    unsigned int m_charpos;      // Current character position

    // Value returned by the decoding methods when no character is available
    static unsigned int badvalue() {
        return static_cast<unsigned int>(-1);
    }

    // Byte at given offset as an unsigned value. p must be <= s.length()
    unsigned int byteat(std::string::size_type p) const {
        return static_cast<unsigned char>(s[p]);
    }

    // Get character byte length from the lead byte at specified position.
    // Returns -1 if the byte can't start a character (continuation byte,
    // 0xfe or 0xff).
    int get_cl(std::string::size_type p) const {
        const unsigned int z = byteat(p);
        if (z <= 127)
            return 1;
        if (z >= 192 && z <= 223)
            return 2;
        if (z >= 224 && z <= 239)
            return 3;
        if (z >= 240 && z <= 247)
            return 4;
        if (z >= 248 && z <= 251)
            return 5;
        if (z >= 252 && z <= 253)
            return 6;
        return -1;
    }

    // Check position and char length against string length
    bool poslok(std::string::size_type p, int l) const {
        return p != std::string::npos && l > 0 &&
            p + static_cast<std::string::size_type>(l) <= s.length();
    }

    // Check that the l-1 bytes following the lead byte at p are all
    // continuation bytes (10xxxxxx). Assumes poslok(p, l).
    bool checkvalidat(std::string::size_type p, int l) const {
        for (int i = 1; i < l; i++) {
            if ((byteat(p + i) & 0xC0) != 0x80)
                return false;
        }
        return true;
    }

    // Update current char length in object state. Returns 0 if there is
    // a complete, well-formed character at pos, else sets the error
    // status and returns -1.
    int compute_cl() {
        cl = 0;
        if (bad)
            return -1;
        if (pos < s.length()) {
            // Keep the length in a signed local: get_cl() may return -1,
            // which must not end up in the unsigned cl member.
            const int l = get_cl(pos);
            if (poslok(pos, l) && checkvalidat(pos, l)) {
                cl = static_cast<unsigned int>(l);
                return 0;
            }
        }
        bad = true;
        return -1;
    }

    // Compute code point value for the l bytes long character at p.
    // Assumes that the sequence was checked with poslok() and
    // checkvalidat(). Returns (unsigned int)-1 for an impossible length.
    unsigned int getvalueat(std::string::size_type p, int l) const {
        if (l < 1 || l > 6)
            return badvalue();
        // Payload bits in the lead byte: 7 for ascii, else 6, 5, 4... as
        // the sequence gets longer
        const unsigned int leadmask = (l == 1) ? 0x7Fu : (0xFFu >> (l + 1));
        unsigned int val = byteat(p) & leadmask;
        // Each continuation byte brings 6 more bits
        for (int i = 1; i < l; i++)
            val = (val << 6) | (byteat(p + i) & 0x3Fu);
        return val;
    }

public:
    /** Start iterating at the beginning of in. Only a reference to in is
     *  kept. */
    Utf8Iter(const std::string &in)
        : cl(0), s(in), pos(0), bad(false), m_charpos(0) {}

    /** Go back to the beginning of the string and clear the error status */
    void rewind() {
        cl = 0;
        pos = 0;
        bad = false;
        m_charpos = 0;
    }

    /** operator* returns the ucs4 value as a machine integer, or
     *  (unsigned int)-1 if there is no valid character at the current
     *  position, in which case the error status is set. */
    unsigned int operator*() {
        if (!cl && compute_cl() < 0)
            return badvalue();
        return getvalueat(pos, static_cast<int>(cl));
    }

    /** "Direct" access. Awfully inefficient as we skip from start or current
     * position at best. This can only be useful for a lookahead from the
     * current position.
     * Returns (unsigned int)-1 if charpos is beyond the end of the string,
     * or if an invalid sequence is found at or before charpos while
     * skipping. The iterator state is not modified. */
    unsigned int operator[](unsigned int charpos) const {
        std::string::size_type mypos = 0;
        unsigned int mycp = 0;
        // No need to restart from the beginning for a lookahead
        if (charpos >= m_charpos) {
            mypos = pos;
            mycp = m_charpos;
        }
        while (mypos < s.length()) {
            const int l = get_cl(mypos);
            // Never step over something which is not a character: an
            // unchecked -1 length would move us backwards.
            if (!poslok(mypos, l) || !checkvalidat(mypos, l))
                return badvalue();
            if (mycp == charpos)
                return getvalueat(mypos, l);
            mypos += static_cast<std::string::size_type>(l);
            ++mycp;
        }
        return badvalue();
    }

    /** Set current position before next utf-8 character. Returns the new
     *  byte position, or string::npos on error (the current position is
     *  then also set to npos). */
    std::string::size_type operator++(int) {
        if (bad || (!cl && compute_cl() < 0)) {
            return pos = std::string::npos;
        }
        pos += cl;
        m_charpos++;
        cl = 0;
        return pos;
    }

    /** Append the bytes of the current character to out. Returns false
     *  and leaves out alone if there is no valid current character. */
    bool appendchartostring(std::string &out) {
        if (bad || (!cl && compute_cl() < 0)) {
            return false;
        }
        out.append(s, pos, cl);
        return true;
    }

    /** Return the bytes of the current character as a string, empty if
     *  there is no valid current character. */
    operator std::string() {
        if (bad || (!cl && compute_cl() < 0)) {
            return std::string();
        }
        return s.substr(pos, cl);
    }

    /** True at end of string or if the error status is set */
    bool eof() const {
        return bad || pos == s.length();
    }

    /** True if the error status is set */
    bool error() const {
        return bad;
    }

    /** Current byte position (string::npos after a failed increment) */
    std::string::size_type getBpos() const {
        return pos;
    }

    /** Current character position */
    std::string::size_type getCpos() const {
        return m_charpos;
    }
};
#endif /* _UTF8ITER_H_INCLUDED_ */