recoll / Code / Diff of /src/utils/utf8iter.h

Diff of /src/utils/utf8iter.h [4982e9] .. [069d71]

Switch to unified view


...
 *   Free Software Foundation, Inc.,
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
#ifndef _UTF8ITER_H_INCLUDED_
#define _UTF8ITER_H_INCLUDED_
/* @(#$Id: utf8iter.h,v 1.9 2007-09-20 08:45:05 dockes Exp $  (C) 2004 J.F.Dockes */

/** 
 * A small helper class to iterate over utf8 strings. This is not an
 * STL iterator and does not do much error checking. It is designed purely
 * for recoll usage, where the utf-8 string comes out of iconv in most cases
...
class Utf8Iter {
public:
    /**
     * Build an iterator positioned on the first character of the input.
     * Only a reference to the string is stored (no copy is made), so the
     * string has to stay alive for as long as the iterator is used.
     */
    Utf8Iter(const string &in)
        : m_s(in),
          m_cl(0),
          m_pos(0),
          m_charpos(0),
          m_error(false)
    {
        // Work out the byte length of the first character, or flag an error.
        update_cl();
    }

    /** Return the string being iterated over (the one given to the constructor) */
    const string& buffer() const {return m_s;}

    /**
     * Go back to the beginning of the string: the byte offset and the
     * character position are reset to 0 and the error flag is cleared.
     */
    void rewind()
    {
        m_error = false;
        m_charpos = 0;
        m_pos = 0;
        m_cl = 0;
        // Recompute the length of the character at the (reset) position.
        update_cl();
    }

    /** "Direct" access. Awfully inefficient as we skip from start or current
     * position at best. This can only be useful for a lookahead from the
     * current position */
...
        mycp = m_charpos;
    }
    int l;
    while (mypos < m_s.length() && mycp != charpos) {
        l = get_cl(mypos);
        if (l <= 0)
        return (unsigned int)-1;
        mypos += l;
        ++mycp;
    }
    if (mypos < m_s.length() && mycp == charpos) {
...
    // Note: m_cl may be zero at eof if user's test not right
    // this shouldn't crash the program until actual data access
#ifdef UTF8ITER_CHECK
    assert(m_cl != 0);
#endif
    if (m_cl <= 0) 
        return string::npos;

    m_pos += m_cl;
    m_charpos++;
    update_cl();
    return m_pos;
    }

    /** operator* returns the ucs4 value as a machine integer*/
    unsigned int operator*() 
...

    bool error() {
    return m_error;
    }

    /** Return the byte offset, within the input string, of the current
     *  character */
    string::size_type getBpos() const {
    return m_pos;
    }

    /** Return the length in bytes of the current character. The value is 0
     *  at end of string or after an error. */
    string::size_type getBlen() const {
        return static_cast<string::size_type>(m_cl);
    }

    /** Return the position of the current character, counted in unicode
     *  characters (not bytes) from the start of the input string */
    string::size_type getCpos() const {
        return static_cast<string::size_type>(m_charpos);
    }

private:
    // String we're working with
    const string&     m_s; 
    // Character length in bytes at current position. Zero means either
    // end of string or an error (m_error tells which).
    unsigned int      m_cl; 
    // Current byte offset in string.
    string::size_type m_pos; 
    // Current character position
    unsigned int      m_charpos; 
    // Am I ok ?
    mutable bool      m_error;

    // Sanity check: return true if a character of byte length l starting
    // at byte offset p lies entirely inside the string. When
    // UTF8ITER_CHECK is defined a failed check also trips an assert.
    bool poslok(string::size_type p, int l) const {
#ifdef UTF8ITER_CHECK
        assert(p != string::npos && l > 0 && p + l <= m_s.length());
#endif
        // Reject an invalid offset or a non-positive length first, so that
        // the addition below is only done with sane values.
        if (p == string::npos || l <= 0)
            return false;
        return p + l <= m_s.length();
    }

    // Refresh m_cl for the character starting at m_pos, with minimal
    // error checking. m_cl is left at 0 when m_pos is at or past the end
    // of the string (not an error), and also when the character is
    // invalid or truncated, in which case m_error is set as well.
    inline void update_cl()
    {
        m_cl = 0;
        if (m_pos < m_s.length()) {
            const int len = get_cl(m_pos);
            if (poslok(m_pos, len)) {
                m_cl = len;
            } else {
                // m_pos is deliberately left alone here. It used to be
                // moved to the end of the string for safety, but that made
                // it impossible for the caller to tell an error from eof.
                m_error = true;
            }
        }
    }

    // Get character byte length at specified position. Returns 0 for error.
    inline int get_cl(string::size_type p) const 
    {
    unsigned int z = (unsigned char)m_s[p];
    if (z <= 127) {
        return 1;
...
    }
#ifdef UTF8ITER_CHECK
    assert(z <= 127 || (z & 224) == 192 || (z & 240) == 224 ||
           (z & 248) == 240);
#endif
    return 0;
    }

    // Compute value at given position. No error checking.
    inline unsigned int getvalueat(string::size_type p, int l) const
    {

	a/src/utils/utf8iter.h		b/src/utils/utf8iter.h
	...		...
14	* Free Software Foundation, Inc.,	14	* Free Software Foundation, Inc.,
15	* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.	15	* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16	*/	16	*/
17	#ifndef _UTF8ITER_H_INCLUDED_	17	#ifndef _UTF8ITER_H_INCLUDED_
18	#define _UTF8ITER_H_INCLUDED_	18	#define _UTF8ITER_H_INCLUDED_
19	/* @(#$Id: utf8iter.h,v 1.8 2006-11-20 11:16:54 dockes Exp $ (C) 2004 J.F.Dockes */	19	/* @(#$Id: utf8iter.h,v 1.9 2007-09-20 08:45:05 dockes Exp $ (C) 2004 J.F.Dockes */
20		20
21	/**	21	/**
22	* A small helper class to iterate over utf8 strings. This is not an	22	* A small helper class to iterate over utf8 strings. This is not an
23	* STL iterator and does not much error checking. It is designed purely	23	* STL iterator and does not much error checking. It is designed purely
24	* for recoll usage, where the utf-8 string comes out of iconv in most cases	24	* for recoll usage, where the utf-8 string comes out of iconv in most cases
	...		...
28	class Utf8Iter {	28	class Utf8Iter {
29	public:	29	public:
30	Utf8Iter(const string &in)	30	Utf8Iter(const string &in)
31	: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)	31	: m_s(in), m_cl(0), m_pos(0), m_charpos(0), m_error(false)
32	{	32	{
33	compute_cl();	33	update_cl();
34	}	34	}
		35
		36	const string& buffer() const {return m_s;}
35		37
36	void rewind()	38	void rewind()
37	{	39	{
38	m_cl = 0;	40	m_cl = 0;
39	m_pos = 0;	41	m_pos = 0;
40	m_charpos = 0;	42	m_charpos = 0;
41	m_error = false;	43	m_error = false;
42	compute_cl();	44	update_cl();
43	}	45	}
44		46
45	/** "Direct" access. Awfully inefficient as we skip from start or current	47	/** "Direct" access. Awfully inefficient as we skip from start or current
46	* position at best. This can only be useful for a lookahead from the	48	* position at best. This can only be useful for a lookahead from the
47	* current position */	49	* current position */
	...		...
54	mycp = m_charpos;	56	mycp = m_charpos;
55	}	57	}
56	int l;	58	int l;
57	while (mypos < m_s.length() && mycp != charpos) {	59	while (mypos < m_s.length() && mycp != charpos) {
58	l = get_cl(mypos);	60	l = get_cl(mypos);
59	if (l < 0)	61	if (l <= 0)
60	return (unsigned int)-1;	62	return (unsigned int)-1;
61	mypos += l;	63	mypos += l;
62	++mycp;	64	++mycp;
63	}	65	}
64	if (mypos < m_s.length() && mycp == charpos) {	66	if (mypos < m_s.length() && mycp == charpos) {
	...		...
75	// Note: m_cl may be zero at eof if user's test not right	77	// Note: m_cl may be zero at eof if user's test not right
76	// this shouldn't crash the program until actual data access	78	// this shouldn't crash the program until actual data access
77	#ifdef UTF8ITER_CHECK	79	#ifdef UTF8ITER_CHECK
78	assert(m_cl != 0);	80	assert(m_cl != 0);
79	#endif	81	#endif
80	if (m_cl == 0)	82	if (m_cl <= 0)
81	return string::npos;	83	return string::npos;
82		84
83	m_pos += m_cl;	85	m_pos += m_cl;
84	m_charpos++;	86	m_charpos++;
85	compute_cl();	87	update_cl();
86	return m_pos;	88	return m_pos;
87	}	89	}
88		90
89	/** operator* returns the ucs4 value as a machine integer*/	91	/** operator* returns the ucs4 value as a machine integer*/
90	unsigned int operator*()	92	unsigned int operator*()
	...		...
119		121
120	bool error() {	122	bool error() {
121	return m_error;	123	return m_error;
122	}	124	}
123		125
		126	/** Return current byte offset in input string */
124	string::size_type getBpos() const {	127	string::size_type getBpos() const {
125	return m_pos;	128	return m_pos;
126	}	129	}
127		130
		131	/** Return current character length */
		132	string::size_type getBlen() const {
		133	return m_cl;
		134	}
		135
		136	/** Return current unicode character offset in input string */
128	string::size_type getCpos() const {	137	string::size_type getCpos() const {
129	return m_charpos;	138	return m_charpos;
130	}	139	}
131		140
132	private:	141	private:
133	// String we're working with	142	// String we're working with
134	const string& m_s;	143	const string& m_s;
135	// Character length at current position. A value of zero indicates	144	// Character length at current position. A value of zero indicates
136	// unknown or error.	145	// an error.
137	unsigned int m_cl;	146	unsigned int m_cl;
138	// Current byte offset in string.	147	// Current byte offset in string.
139	string::size_type m_pos;	148	string::size_type m_pos;
140	// Current character position	149	// Current character position
141	unsigned int m_charpos;	150	unsigned int m_charpos;
		151	// Am I ok ?
142	mutable bool m_error;	152	mutable bool m_error;
143		153
144	// Check position and cl against string length	154	// Check position and cl against string length
145	bool poslok(string::size_type p, int l) const {	155	bool poslok(string::size_type p, int l) const {
146	#ifdef UTF8ITER_CHECK	156	#ifdef UTF8ITER_CHECK
147	assert(p != string::npos && l > 0 && p + l <= m_s.length());	157	assert(p != string::npos && l > 0 && p + l <= m_s.length());
148	#endif	158	#endif
149	return p != string::npos && l > 0 && p + l <= m_s.length();	159	return p != string::npos && l > 0 && p + l <= m_s.length();
150	}	160	}
151		161
152	// Update current char length in object state, minimum checking for	162	// Update current char length in object state, minimum checking
153	// errors	163	// for errors
154	inline int compute_cl()	164	inline void update_cl()
155	{	165	{
156	m_cl = 0;	166	m_cl = 0;
157	if (m_pos == m_s.length())	167	if (m_pos >= m_s.length())
158	return -1;	168	return;
159	m_cl = get_cl(m_pos);	169	m_cl = get_cl(m_pos);
160	if (!poslok(m_pos, m_cl)) {	170	if (!poslok(m_pos, m_cl)) {
		171	// Used to set eof here for safety, but this is bad because it
		172	// basically prevents the caller to discriminate error and eof.
161	m_pos = m_s.length();	173	// m_pos = m_s.length();
162	m_cl = 0;	174	m_cl = 0;
163	m_error = true;	175	m_error = true;
164	return -1;
165	}	176	}
166	return 0;
167	}	177	}
168		178
169	// Get character byte length at specified position	179	// Get character byte length at specified position. Returns 0 for error.
170	inline int get_cl(string::size_type p) const	180	inline int get_cl(string::size_type p) const
171	{	181	{
172	unsigned int z = (unsigned char)m_s[p];	182	unsigned int z = (unsigned char)m_s[p];
173	if (z <= 127) {	183	if (z <= 127) {
174	return 1;	184	return 1;
	...		...
181	}	191	}
182	#ifdef UTF8ITER_CHECK	192	#ifdef UTF8ITER_CHECK
183	assert(z <= 127 \|\| (z & 224) == 192 \|\| (z & 240) == 224 \|\|	193	assert(z <= 127 \|\| (z & 224) == 192 \|\| (z & 240) == 224 \|\|
184	(z & 248) == 240);	194	(z & 248) == 240);
185	#endif	195	#endif
186	return -1;	196	return 0;
187	}	197	}
188		198
189	// Compute value at given position. No error checking.	199	// Compute value at given position. No error checking.
190	inline unsigned int getvalueat(string::size_type p, int l) const	200	inline unsigned int getvalueat(string::size_type p, int l) const
191	{	201	{