--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@@ -1,6 +1,6 @@
#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_
-/* @(#$Id: utf8iter.h,v 1.1 2005-02-10 19:52:50 dockes Exp $ (C) 2004 J.F.Dockes */
+/* @(#$Id: utf8iter.h,v 1.2 2005-02-11 11:20:02 dockes Exp $ (C) 2004 J.F.Dockes */
/**
* A small helper class to iterate over utf8 strings. This is not an
@@ -8,58 +8,113 @@
some specific uses
*/
class Utf8Iter {
- unsigned int cl;
- const string &s;
- string::size_type pos;
- bool bad;
- int compute_cl() {
+ unsigned int cl; // Char length at current position if known
+ const string &s; // String we're working with
+ string::size_type pos; // Current position in string
+ bool bad; // Status
+ unsigned int m_charpos; // Current character posiiton
+
+ // Get character byte length at specified position
+ inline int get_cl(string::size_type p) const {
+ unsigned int z = (unsigned char)s[p];
+ if (z <= 127) {
+ return 1;
+ } else if (z>=192 && z <= 223) {
+ return 2;
+ } else if (z >= 224 && z <= 239) {
+ return 3;
+ } else if (z >= 240 && z <= 247) {
+ return 4;
+ } else if (z >= 248 && z <= 251) {
+ return 5;
+ } else if (z >= 252 && z <= 253) {
+ return 6;
+ }
+ return -1;
+ }
+ // Check position and cl against string length
+ bool poslok(string::size_type p, int l) const {
+ return p != string::npos && l > 0 && p + l <= s.length();
+ }
+ // Update current char length in object state. Assumes pos is inside string
+ inline int compute_cl() {
cl = 0;
if (bad)
return -1;
- unsigned int z = (unsigned char)s[pos];
- if (z <= 127) {
- cl = 1;
- } else if (z>=192 && z <= 223) {
- cl = 2;
- } else if (z >= 224 && z <= 239) {
- cl = 3;
- } else if (z >= 240 && z <= 247) {
- cl = 4;
- } else if (z >= 248 && z <= 251) {
- cl = 5;
- } else if (z >= 252 && z <= 253) {
- cl = 6;
- }
- if (!cl || s.length() - pos < cl) {
+ cl = get_cl(pos);
+ if (!poslok(pos, cl)) {
bad = true;
cl = 0;
return -1;
}
return 0;
}
+ // Compute value at given position
+ inline unsigned int getvalueat(string::size_type p, int l) const {
+ switch (l) {
+ case 1: return (unsigned char)s[p];
+ case 2: return ((unsigned char)s[p] - 192) * 64 +
+ (unsigned char)s[p+1] - 128 ;
+ case 3: return ((unsigned char)s[p]-224)*4096 +
+ ((unsigned char)s[p+1]-128)*64 +
+ (unsigned char)s[p+2]-128;
+ case 4: return ((unsigned char)s[p]-240)*262144 +
+ ((unsigned char)s[p+1]-128)*4096 +
+ ((unsigned char)s[p+2]-128)*64 +
+ (unsigned char)s[p+3]-128;
+ case 5: return ((unsigned char)s[p]-248)*16777216 +
+ ((unsigned char)s[p+1]-128)*262144 +
+ ((unsigned char)s[p+2]-128)*4096 +
+ ((unsigned char)s[p+3]-128)*64 +
+ (unsigned char)s[p+4]-128;
+ case 6: return ((unsigned char)s[p]-252)*1073741824 +
+ ((unsigned char)s[p+1]-128)*16777216 +
+ ((unsigned char)s[p+2]-128)*262144 +
+ ((unsigned char)s[p+3]-128)*4096 +
+ ((unsigned char)s[p+4]-128)*64 +
+ (unsigned char)s[p+5]-128;
+ default:
+ return (unsigned int)-1;
+ }
+ }
public:
- Utf8Iter(const string &in) : cl(0), s(in), pos(0), bad(false) {}
+ Utf8Iter(const string &in)
+ : cl(0), s(in), pos(0), bad(false), m_charpos(0) {}
+ void rewind() {
+ cl=0; pos=0; bad=false; m_charpos=0;
+ }
/** operator* returns the ucs4 value as a machine integer*/
unsigned int operator*() {
if (!cl && compute_cl() < 0)
return (unsigned int)-1;
- switch (cl) {
- case 1: return (unsigned char)s[pos];
- case 2: return ((unsigned char)s[pos] - 192) * 64 + (unsigned char)s[pos+1] - 128 ;
- case 3: return ((unsigned char)s[pos]-224)*4096 + ((unsigned char)s[pos+1]-128)*64 + (unsigned char)s[pos+2]-128;
- case 4: return ((unsigned char)s[pos]-240)*262144 + ((unsigned char)s[pos+1]-128)*4096 +
- ((unsigned char)s[pos+2]-128)*64 + (unsigned char)s[pos+3]-128;
- case 5: return ((unsigned char)s[pos]-248)*16777216 + ((unsigned char)s[pos+1]-128)*262144 +
- ((unsigned char)s[pos+2]-128)*4096 + ((unsigned char)s[pos+3]-128)*64 + (unsigned char)s[pos+4]-128;
- case 6: return ((unsigned char)s[pos]-252)*1073741824 + ((unsigned char)s[pos+1]-128)*16777216 +
- ((unsigned char)s[pos+2]-128)*262144 + ((unsigned char)s[pos+3]-128)*4096 +
- ((unsigned char)s[pos+4]-128)*64 + (unsigned char)s[pos+5]-128;
- default:
+ unsigned int val = getvalueat(pos, cl);
+ if (val == (unsigned int)-1) {
bad = true;
cl = 0;
- return (unsigned int)-1;
}
+ return val;
+ }
+ /** "Direct" access. Awfully inefficient as we skip from start or current
+ * position at best. This can only be useful for a lookahead from the
+ * current position */
+ unsigned int operator[](unsigned int charpos) const {
+ string::size_type mypos = 0;
+ unsigned int mycp = 0;;
+ if (charpos >= m_charpos) {
+ mypos = pos;
+ mycp = m_charpos;
+ }
+ while (mypos < s.length() && mycp != charpos) {
+ mypos += get_cl(mypos);
+ ++mycp;
+ }
+ if (mypos < s.length() && mycp == charpos) {
+ int l = get_cl(mypos);
+ if (poslok(mypos, l))
+ return getvalueat(mypos, get_cl(mypos));
+ }
+ return (unsigned int)-1;
}
string::size_type operator++(int) {
@@ -67,6 +122,7 @@
return string::npos;
}
pos += cl;
+ m_charpos++;
cl = 0;
return pos;
}
@@ -78,12 +134,24 @@
out += s.substr(pos, cl);
return true;
}
+ operator string() {
+ if (bad || (!cl && compute_cl() < 0)) {
+ return false;
+ }
+ return s.substr(pos, cl);
+ }
bool eof() {
return bad || pos == s.length();
}
bool error() {
return bad;
}
+ string::size_type getBpos() const {
+ return pos;
+ }
+ string::size_type getCpos() const {
+ return m_charpos;
+ }
};