Switch to side-by-side view

--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@@ -16,158 +16,230 @@
  */
 #ifndef _UTF8ITER_H_INCLUDED_
 #define _UTF8ITER_H_INCLUDED_
-/* @(#$Id: utf8iter.h,v 1.7 2006-11-17 12:31:34 dockes Exp $  (C) 2004 J.F.Dockes */
+/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $  (C) 2004 J.F.Dockes */
 
 /** 
  * A small helper class to iterate over utf8 strings. This is not an
- * STL iterator and this is not well designed, just convenient for
-   some specific uses
+ * STL iterator and does not do much error checking. It is designed purely
+ * for recoll usage, where the utf-8 string comes out of iconv in most cases
+ * and is assumed legal. We just try to catch cases where there would be 
+ * a risk of crash.
  */
 class Utf8Iter {
-    unsigned int cl; // Char length at current position if known
-    const string &s; // String we're working with
-    string::size_type pos; // Current position in string
-    unsigned int m_charpos; // Current character posiiton
-
-    // Get character byte length at specified position
-    inline int get_cl(string::size_type p) const {
-	unsigned int z = (unsigned char)s[p];
-	if (z <= 127) {
-	    return 1;
-	} else if (z>=192 && z <= 223) {
-	    return 2;
-	} else if (z >= 224 && z <= 239) {
-	    return 3;
-	} else if (z >= 240 && z <= 247) {
-	    return 4;
-	} else if (z >= 248 && z <= 251) {
-	    return 5;
-	} else if (z >= 252 && z <= 253) {
-	    return 6;
-	} 
-	return -1;
-    }
-    // Check position and cl against string length
-    bool poslok(string::size_type p, int l) const {
-	return p != string::npos && l > 0 && p + l <= s.length();
-    }
-    // Update current char length in object state. Assumes pos is inside string
-    inline int compute_cl() {
-	cl = 0;
-	cl = get_cl(pos);
-	if (!poslok(pos, cl)) {
-	    pos = s.length();
-	    cl = 0;
-	    return -1;
-	}
-	return 0;
-    }
-    // Compute value at given position
-    inline unsigned int getvalueat(string::size_type p, int l) const {
-	switch (l) {
-	case 1: return (unsigned char)s[p];
-	case 2: return ((unsigned char)s[p] - 192) * 64 + 
-		(unsigned char)s[p+1] - 128 ;
-	case 3: return ((unsigned char)s[p]-224)*4096 + 
-		((unsigned char)s[p+1]-128)*64 + 
-		(unsigned char)s[p+2]-128;
-	case 4: return ((unsigned char)s[p]-240)*262144 + 
-		((unsigned char)s[p+1]-128)*4096 + 
-		((unsigned char)s[p+2]-128)*64 + 
-		(unsigned char)s[p+3]-128;
-	case 5: return ((unsigned char)s[p]-248)*16777216 + 
-		((unsigned char)s[p+1]-128)*262144 + 
-		((unsigned char)s[p+2]-128)*4096 + 
-		((unsigned char)s[p+3]-128)*64 + 
-		(unsigned char)s[p+4]-128;
-	case 6: return  ((unsigned char)s[p]-252)*1073741824 + 
-		((unsigned char)s[p+1]-128)*16777216 + 
-		((unsigned char)s[p+2]-128)*262144 + 
-		((unsigned char)s[p+3]-128)*4096 + 
-		((unsigned char)s[p+4]-128)*64 + 
-		(unsigned char)s[p+5]-128;
-	default:
-	    return (unsigned int)-1;
-	}
-    }
- public:
+public:
     Utf8Iter(const string &in) 
-	: cl(0), s(in), pos(0), m_charpos(0) 
-	{
-	    // Ensure state is ok if appendchartostring is called at once
-	    compute_cl();
-	}
-
-    void rewind() {
-	cl=0; pos=0; m_charpos=0;
-    }
-    /** operator* returns the ucs4 value as a machine integer*/
-    unsigned int operator*() {
-	if (!cl && compute_cl() < 0)
-	    return (unsigned int)-1;
-	unsigned int val = getvalueat(pos, cl);
-	if (val == (unsigned int)-1) {
-	    pos = s.length();
-	    cl = 0;
-	}
-	return val;
-    }
+	: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
+    {
+	compute_cl();
+    }
+
+    void rewind() 
+    {
+	m_cl = 0; 
+	m_pos = 0; 
+	m_charpos = 0; 
+	m_error = false;
+	compute_cl();
+    }
+
     /** "Direct" access. Awfully inefficient as we skip from start or current
      * position at best. This can only be useful for a lookahead from the
      * current position */
-    unsigned int operator[](unsigned int charpos) const {
+    unsigned int operator[](unsigned int charpos) const 
+    {
 	string::size_type mypos = 0;
-	unsigned int mycp = 0;;
+	unsigned int mycp = 0;
 	if (charpos >= m_charpos) {
-	    mypos = pos;
+	    mypos = m_pos;
 	    mycp = m_charpos;
 	}
-	while (mypos < s.length() && mycp != charpos) {
-	    mypos += get_cl(mypos);
+	int l;
+	while (mypos < m_s.length() && mycp != charpos) {
+	    l = get_cl(mypos);
+	    if (l < 0)
+		return (unsigned int)-1;
+	    mypos += l;
 	    ++mycp;
 	}
-	if (mypos < s.length() && mycp == charpos) {
-	    int l = get_cl(mypos);
+	if (mypos < m_s.length() && mycp == charpos) {
+	    l = get_cl(mypos);
 	    if (poslok(mypos, l))
 		return getvalueat(mypos, get_cl(mypos));
 	}
 	return (unsigned int)-1;
     }
 
-    /** Set current position before next utf-8 character */
-    string::size_type operator++(int) {
-	if (!cl && compute_cl() < 0) {
-	    return pos = string::npos;
-	}
-	pos += cl;
+    /** Increment current position to next utf-8 char */
+    string::size_type operator++(int) 
+    {
+	// Note: m_cl may be zero at eof or after an error if the caller's loop
+	// test is wrong. Just return npos: no crash until actual data access.
+#ifdef UTF8ITER_CHECK
+	assert(m_cl != 0);
+#endif
+	if (m_cl == 0) 
+	    return string::npos;
+
+	m_pos += m_cl;
 	m_charpos++;
-	cl = 0;
-	return pos;
-    }
-    /** This needs to be fast. No error checking. */
-    void appendchartostring(string &out) {
-	out.append(&s[pos], cl);
-    }
+	compute_cl();
+	return m_pos;
+    }
+
+    /** operator* returns the ucs4 value as a machine integer*/
+    unsigned int operator*() 
+    {
+#ifdef UTF8ITER_CHECK
+	assert(m_cl != 0);
+#endif
+	return getvalueat(m_pos, m_cl);
+    }
+
+    /** Append the current (possibly multi-byte) utf-8 character to 'out' and
+	return its byte length. This needs to be fast: no error checking. */
+    unsigned int appendchartostring(string &out) {
+#ifdef UTF8ITER_CHECK
+	assert(m_cl != 0);
+#endif
+	out.append(&m_s[m_pos], m_cl);
+	return m_cl;
+    }
+
+    /** Return current character as string */
     operator string() {
-	if (!cl && compute_cl() < 0) {
-	    return std::string("");
-	}
-	return s.substr(pos, cl);
-    }
+#ifdef UTF8ITER_CHECK
+	assert(m_cl != 0);
+#endif
+	return m_s.substr(m_pos, m_cl);
+    }
+
     bool eof() {
-	// Note: we always ensure that pos == s.length() when setting bad to 
-	// true
-	return pos == s.length();
-    }
+	return m_pos == m_s.length();
+    }
+
     bool error() {
-	return compute_cl() < 0;
-    }
+	return m_error;
+    }
+
     string::size_type getBpos() const {
-	return pos;
-    }
+	return m_pos;
+    }
+
     string::size_type getCpos() const {
 	return m_charpos;
     }
+
+private:
+    // String we're working with
+    const string&     m_s; 
+    // Byte length of the character at the current position. A value of zero
+    // indicates end of string or error.
+    unsigned int      m_cl; 
+    // Current byte offset in string.
+    string::size_type m_pos; 
+    // Current character position
+    unsigned int      m_charpos; 
+    mutable bool      m_error;
+
+    // Check position and cl against string length
+    bool poslok(string::size_type p, int l) const {
+#ifdef UTF8ITER_CHECK
+	assert(p != string::npos && l > 0 && p + l <= m_s.length());
+#endif
+	return p != string::npos && l > 0 && p + l <= m_s.length();
+    }
+
+    // Update current char length in object state, minimum checking for 
+    // errors
+    inline int compute_cl() 
+    {
+	m_cl = 0;
+	if (m_pos == m_s.length())
+	    return -1;
+	m_cl = get_cl(m_pos);
+	if (!poslok(m_pos, m_cl)) {
+	    m_pos = m_s.length();
+	    m_cl = 0;
+	    m_error = true;
+	    return -1;
+	}
+	return 0;
+    }
+
+    // Get character byte length at specified position
+    inline int get_cl(string::size_type p) const 
+    {
+	unsigned int z = (unsigned char)m_s[p];
+	if (z <= 127) {
+	    return 1;
+	} else if ((z & 224) == 192) {
+	    return 2;
+	} else if ((z & 240) == 224) {
+	    return 3;
+	} else if ((z & 248) == 240) {
+	    return 4;
+	}
+#ifdef UTF8ITER_CHECK
+	assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
+	       (z & 248) == 240);
+#endif
+	return -1;
+    }
+
+    // Compute value at given position. No error checking.
+    inline unsigned int getvalueat(string::size_type p, int l) const
+    {
+	switch (l) {
+	case 1: 
+#ifdef UTF8ITER_CHECK
+	    assert((unsigned char)m_s[p] < 128);
+#endif
+	    return (unsigned char)m_s[p];
+	case 2: 
+#ifdef UTF8ITER_CHECK
+	    assert(
+		   ((unsigned char)m_s[p] & 224) == 192
+		   && ((unsigned char)m_s[p+1] & 192) ==  128
+		   );
+#endif
+	    return ((unsigned char)m_s[p] - 192) * 64 + 
+		(unsigned char)m_s[p+1] - 128 ;
+	case 3: 
+#ifdef UTF8ITER_CHECK
+	    assert(
+		   (((unsigned char)m_s[p]) & 240) == 224
+		   && (((unsigned char)m_s[p+1]) & 192) ==  128
+		   && (((unsigned char)m_s[p+2]) & 192) ==  128
+		   );
+#endif
+
+	    return ((unsigned char)m_s[p] - 224) * 4096 + 
+		((unsigned char)m_s[p+1] - 128) * 64 + 
+		(unsigned char)m_s[p+2] - 128;
+	case 4: 
+#ifdef UTF8ITER_CHECK
+	    assert(
+		   (((unsigned char)m_s[p]) & 248) == 240
+		   && (((unsigned char)m_s[p+1]) & 192) ==  128
+		   && (((unsigned char)m_s[p+2]) & 192) ==  128
+		   && (((unsigned char)m_s[p+3]) & 192) ==  128
+		   );
+#endif
+
+	    return ((unsigned char)m_s[p]-240)*262144 + 
+		((unsigned char)m_s[p+1]-128)*4096 + 
+		((unsigned char)m_s[p+2]-128)*64 + 
+		(unsigned char)m_s[p+3]-128;
+
+	default:
+#ifdef UTF8ITER_CHECK
+	    assert(l <= 4);
+#endif
+	    m_error = true;
+	    return (unsigned int)-1;
+	}
+    }
+
 };