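Utf8Iter: validate UTF-8 sequences and derive the error state from m_cl.
Removes the m_error flag (error() now reports m_cl == 0) and adds checkvalidat(),
which checks the lead byte and continuation bytes of 1- to 4-byte sequences.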

--- a/src/utils/utf8iter.h
+++ b/src/utils/utf8iter.h
@@ -32,7 +32,7 @@
 class Utf8Iter {
 public:
     Utf8Iter(const std::string &in) 
-	: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
+	: m_s(in), m_cl(0), m_pos(0), m_charpos(0)
     {
 	update_cl();
     }
@@ -44,7 +44,6 @@
 	m_cl = 0; 
 	m_pos = 0; 
 	m_charpos = 0; 
-	m_error = false;
 	update_cl();
     }
 
@@ -62,15 +61,15 @@
 	int l;
 	while (mypos < m_s.length() && mycp != charpos) {
 	    l = get_cl(mypos);
-	    if (l <= 0)
+	    if (l <= 0 || !poslok(mypos, l) || !checkvalidat(mypos, l))
 		return (unsigned int)-1;
 	    mypos += l;
 	    ++mycp;
 	}
 	if (mypos < m_s.length() && mycp == charpos) {
 	    l = get_cl(mypos);
-	    if (poslok(mypos, l))
-		return getvalueat(mypos, get_cl(mypos));
+	    if (poslok(mypos, l) && checkvalidat(mypos, l))
+		return getvalueat(mypos, l);
 	}
 	return (unsigned int)-1;
     }
@@ -83,7 +82,7 @@
 #ifdef UTF8ITER_CHECK
 	assert(m_cl != 0);
 #endif
-	if (m_cl <= 0) 
+	if (m_cl == 0)
 	    return std::string::npos;
 
 	m_pos += m_cl;
@@ -96,9 +95,9 @@
     unsigned int operator*() 
     {
 #ifdef UTF8ITER_CHECK
-	assert(m_cl != 0);
-#endif
-	return getvalueat(m_pos, m_cl);
+	assert(m_cl > 0);
+#endif
+	return m_cl == 0 ? (unsigned int)-1 : getvalueat(m_pos, m_cl);
     }
 
     /** Append current utf-8 possibly multi-byte character to string param.
@@ -116,15 +115,15 @@
 #ifdef UTF8ITER_CHECK
 	assert(m_cl != 0);
 #endif
-	return m_s.substr(m_pos, m_cl);
-    }
-
-    bool eof() {
+	return m_cl > 0 ? m_s.substr(m_pos, m_cl) : std::string();
+    }
+
+    bool eof() const {
 	return m_pos == m_s.length();
     }
 
-    bool error() {
-	return m_error;
+    bool error() const {
+	return m_cl == 0;
     }
 
     /** Return current byte offset in input string */
@@ -147,13 +146,11 @@
     const std::string&     m_s; 
     // Character length at current position. A value of zero indicates
     // an error.
-    unsigned int      m_cl; 
+    unsigned int m_cl;
     // Current byte offset in string.
     std::string::size_type m_pos; 
     // Current character position
     unsigned int      m_charpos; 
-    // Am I ok ?
-    mutable bool      m_error;
 
     // Check position and cl against string length
     bool poslok(std::string::size_type p, int l) const {
@@ -163,7 +160,7 @@
 	return p != std::string::npos && l > 0 && p + l <= m_s.length();
     }
 
-    // Update current char length in object state, minimum checking
+    // Update current char length in object state, check
     // for errors
     inline void update_cl() 
     {
@@ -176,7 +173,34 @@
 	    // basically prevents the caller to discriminate error and eof.
 	    //	    m_pos = m_s.length();
 	    m_cl = 0;
-	    m_error = true;
+	    return;
+	}
+	if (!checkvalidat(m_pos, m_cl)) {
+	    m_cl = 0;
+	}
+    }
+
+    inline bool checkvalidat(std::string::size_type p, int l) const
+    {
+	switch (l) {
+	case 1: 
+	    return (unsigned char)m_s[p] < 128;
+	case 2: 
+	    return (((unsigned char)m_s[p]) & 224) == 192
+		&& (((unsigned char)m_s[p+1]) & 192) == 128;
+	case 3: 
+	    return (((unsigned char)m_s[p]) & 240) == 224
+		   && (((unsigned char)m_s[p+1]) & 192) ==  128
+		   && (((unsigned char)m_s[p+2]) & 192) ==  128
+		   ;
+	case 4: 
+	    return (((unsigned char)m_s[p]) & 248) == 240
+		   && (((unsigned char)m_s[p+1]) & 192) ==  128
+		   && (((unsigned char)m_s[p+2]) & 192) ==  128
+		   && (((unsigned char)m_s[p+3]) & 192) ==  128
+		;
+	default:
+	    return false;
 	}
     }
 
@@ -249,7 +273,6 @@
 #ifdef UTF8ITER_CHECK
 	    assert(l <= 4);
 #endif
-	    m_error = true;
 	    return (unsigned int)-1;
 	}
     }