recoll / Code / Diff of /unac/unac.c

Diff of /unac/unac.c [913dff] .. [d9dc7c]

Switch to unified view


...
   caught writing another binary search  */
#include <vector>
#include <map>
#include <string>
#include <algorithm>
#include <tr1/unordered_map>
using std::string;
using std::tr1::unordered_map;

#include "smallut.h"

/* 
   Storage for the exception translations. These are characters which
   should not be translated according to what UnicodeData says, but
   instead according to some local rule. There will usually be very
   few of them, but they must be looked up for every translated
   character, so they are kept in a hash map keyed by the 16-bit
   character value.
 */
unordered_map<unsigned short, string> except_trans;

/*
 * Query the exception table for the 16-bit character c.
 *
 * On a hit, the stored translation is copied into trans and true is
 * returned. On a miss, false is returned and trans is left exactly as
 * the caller passed it in.
 */
static inline bool is_except_char(unsigned short c, string& trans)
{
    typedef unordered_map<unsigned short, string>::const_iterator ExceptIter;

    const ExceptIter hit = except_trans.find(c);
    const bool found = (hit != except_trans.end());
    if (found) {
        trans = hit->second;
    }
    return found;
}
#endif /* RECOLL_DATADIR */

/*
 * If configure.in has not defined this symbol, assume const. It
...
    // In conformance with current usage, but incorrectly, we do the following
    // things for the special chars depending on the operation requested:
    //   - unaccenting: do nothing (copy original char)
    //   - unac+fold: use table
    //   - fold: use the unicode data.
    string trans;
    if (what != UNAC_FOLD && except_trans.size() != 0 && 
  is_except_char(c, trans)) {
    if (what == UNAC_UNAC) {
        // Unaccent only. Do nothing
        p = 0;
        l = 0;
    } else {
        // Has to be UNAC_UNACFOLD: use table
        p = (unsigned short *)trans.c_str();
        l = trans.size() / 2;
    }



    } else {
#endif /* RECOLL_DATADIR */
    unac_uf_char_utf16_(c, p, l, what)
#ifdef RECOLL_DATADIR
    }
...
}

#ifdef RECOLL_DATADIR
void unac_set_except_translations(const char *spectrans)
{

    except_trans.clear();
    if (!spectrans || !spectrans[0])
    return;

    // The translation tables out of Unicode are in machine byte order (we
...
    if (littleendian)
        ch = (out[1] << 8) | (out[0] & 0xff);
    else
        ch = (out[0] << 8) | (out[1] & 0xff);





    except_trans[ch] = string((const char *)(out + 2), outsize-2);
    free(out);
    }


}
#endif /* RECOLL_DATADIR */

	a/unac/unac.c		b/unac/unac.c
	...		...
29	caught writing another binary search */	29	caught writing another binary search */
30	#include <vector>	30	#include <vector>
31	#include <map>	31	#include <map>
32	#include <string>	32	#include <string>
33	#include <algorithm>	33	#include <algorithm>
		34	#include <tr1/unordered_map>
34	using std::string;	35	using std::string;
35	using std::vector;	36	using std::tr1::unordered_map;
36	using std::map;
37	#include "smallut.h"	37	#include "smallut.h"
38		38
39	/*	39	/*
40	Storage for the exception translations. These are chars which	40	Storage for the exception translations. These are chars which
41	should not be translated according to what UnicodeData says, but	41	should not be translated according to what UnicodeData says, but
42	instead according to some local rule. There will usually be very	42	instead according to some local rule. There will usually be very
43	few of them, but they must be looked up for every translated char.	43	few of them, but they must be looked up for every translated char.
44
45	We use a sorted vector for fastest elimination by binary search and
46	a vector<string> to store the translations
47	*/	44	*/
48	static vector<unsigned short> except_chars;	45	unordered_map<unsigned short, string> except_trans;
49	static vector<string> except_trans;
50	static inline size_t is_except_char(unsigned short c)	46	static inline bool is_except_char(unsigned short c, string& trans)
51	{	47	{
52	vector<unsigned short>::iterator it =	48	unordered_map<unsigned short, string>::const_iterator it
53	std::lower_bound(except_chars.begin(), except_chars.end(), c);	49	= except_trans.find(c);
54	if (it == except_chars.end() \|\| *it != c) {	50	if (it == except_trans.end())
55	return (size_t(-1));	51	return false;
56	}	52	trans = it->second;
57	return std::distance(except_chars.begin(), it);	53	return true;
58	}	54	}
59	#endif /* RECOLL_DATADIR */	55	#endif /* RECOLL_DATADIR */
60		56
61	/*	57	/*
62	* If configure.in has not defined this symbol, assume const. It	58	* If configure.in has not defined this symbol, assume const. It
	...		...
12713	// In conformance with current usage, but incorrectly, we do the following	12709	// In conformance with current usage, but incorrectly, we do the following
12714	// things for the special chars depending on the operation requested:	12710	// things for the special chars depending on the operation requested:
12715	// - unaccenting: do nothing (copy original char)	12711	// - unaccenting: do nothing (copy original char)
12716	// - unac+fold: use table	12712	// - unac+fold: use table
12717	// - fold: use the unicode data.	12713	// - fold: use the unicode data.
12718	size_t idx;	12714	string trans;
12719	if (what != UNAC_FOLD && except_chars.size() != 0 &&	12715	if (what != UNAC_FOLD && except_trans.size() != 0 &&
12720	(idx=is_except_char(c)) != (size_t)-1) {	12716	is_except_char(c, trans)) {
12721	if (what == UNAC_UNAC) {	12717	if (what == UNAC_UNAC) {
12722	// Unaccent only. Do nothing	12718	// Unaccent only. Do nothing
12723	p = 0;	12719	p = 0;
12724	l = 0;	12720	l = 0;
12725	} else {	12721	} else {
12726	// Has to be UNAC_UNACFOLD: use table	12722	// Has to be UNAC_UNACFOLD: use table
12727	p = (unsigned short *)(except_trans[idx].c_str() + 2);	12723	p = (unsigned short *)trans.c_str();
12728	l = (except_trans[idx].size() - 2) / 2;	12724	l = trans.size() / 2;
12729	}	12725	}
12730	/* if (p) {unsigned char cp = (unsigned char )p;
12731	fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
12732	(unsigned int)cp[1]);}*/
12733	} else {	12726	} else {
12734	#endif /* RECOLL_DATADIR */	12727	#endif /* RECOLL_DATADIR */
12735	unac_uf_char_utf16_(c, p, l, what)	12728	unac_uf_char_utf16_(c, p, l, what)
12736	#ifdef RECOLL_DATADIR	12729	#ifdef RECOLL_DATADIR
12737	}	12730	}
	...		...
13074	}	13067	}
13075		13068
13076	#ifdef RECOLL_DATADIR	13069	#ifdef RECOLL_DATADIR
13077	void unac_set_except_translations(const char *spectrans)	13070	void unac_set_except_translations(const char *spectrans)
13078	{	13071	{
13079	except_chars.clear();
13080	except_trans.clear();	13072	except_trans.clear();
13081	if (!spectrans \|\| !spectrans[0])	13073	if (!spectrans \|\| !spectrans[0])
13082	return;	13074	return;
13083		13075
13084	// The translation tables out of Unicode are in machine byte order (we	13076	// The translation tables out of Unicode are in machine byte order (we
	...		...
13121	if (littleendian)	13113	if (littleendian)
13122	ch = (out[1] << 8) \| (out[0] & 0xff);	13114	ch = (out[1] << 8) \| (out[0] & 0xff);
13123	else	13115	else
13124	ch = (out[0] << 8) \| (out[1] & 0xff);	13116	ch = (out[0] << 8) \| (out[1] & 0xff);
13125		13117
13126	/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
13127	except_chars.push_back(ch);
13128	// We keep ch as the first 2 bytes in the translation so that
13129	// both vectors sort identically
13130	except_trans.push_back(string((const char *)out, outsize));	13118	except_trans[ch] = string((const char *)(out + 2), outsize-2);
13131	free(out);	13119	free(out);
13132	}	13120	}
13133	std::sort(except_chars.begin(), except_chars.end());
13134	std::sort(except_trans.begin(), except_trans.end());
13135	}	13121	}
13136	#endif /* RECOLL_DATADIR */	13122	#endif /* RECOLL_DATADIR */