Switch to unified view

a/unac/unac.c b/unac/unac.c
...
...
29
   caught writing another binary search  */
29
   caught writing another binary search  */
30
#include <vector>
30
#include <vector>
31
#include <map>
31
#include <map>
32
#include <string>
32
#include <string>
33
#include <algorithm>
33
#include <algorithm>
34
#include <tr1/unordered_map>
34
using std::string;
35
using std::string;
35
using std::vector;
36
using std::tr1::unordered_map;
36
using std::map;
37
#include "smallut.h"
37
#include "smallut.h"
38
38
39
/* 
39
/* 
40
   Storage for the exception translations. These are chars which
40
   Storage for the exception translations. These are chars which
41
   should not be translated according to what UnicodeData says, but
41
   should not be translated according to what UnicodeData says, but
42
   instead according to some local rule. There will usually be very
42
   instead according to some local rule. There will usually be very
43
   few of them, but they must be looked up for every translated char.
43
   few of them, but they must be looked up for every translated char.
44
   
45
   We use a sorted vector for fastest elimination by binary search and
46
   a vector<string> to store the translations
47
 */
44
 */
48
static vector<unsigned short> except_chars;
45
unordered_map<unsigned short, string> except_trans;
49
static vector<string> except_trans;
50
static inline size_t is_except_char(unsigned short c)
46
static inline bool is_except_char(unsigned short c, string& trans)
51
{
47
{
52
    vector<unsigned short>::iterator it = 
48
    unordered_map<unsigned short, string>::const_iterator it 
53
  std::lower_bound(except_chars.begin(), except_chars.end(), c);
49
  = except_trans.find(c);
54
    if (it == except_chars.end() || *it != c) {
50
    if (it == except_trans.end())
55
  return (size_t(-1));
51
  return false;
56
    }
52
    trans = it->second;
57
    return std::distance(except_chars.begin(), it);
53
    return true;
58
}
54
}
59
#endif /* RECOLL_DATADIR */
55
#endif /* RECOLL_DATADIR */
60
56
61
/*
57
/*
62
 * If configure.in has not defined this symbol, assume const. It
58
 * If configure.in has not defined this symbol, assume const. It
...
...
12713
    // In conformance with current usage, but incorrectly, we do the following
12709
    // In conformance with current usage, but incorrectly, we do the following
12714
    // things for the special chars depending on the operation requested:
12710
    // things for the special chars depending on the operation requested:
12715
    //   - unaccenting: do nothing (copy original char)
12711
    //   - unaccenting: do nothing (copy original char)
12716
    //   - unac+fold: use table
12712
    //   - unac+fold: use table
12717
    //   - fold: use the unicode data.
12713
    //   - fold: use the unicode data.
12718
    size_t idx;
12714
    string trans;
12719
    if (what != UNAC_FOLD && except_chars.size() != 0 && 
12715
    if (what != UNAC_FOLD && except_trans.size() != 0 && 
12720
  (idx=is_except_char(c)) != (size_t)-1) {
12716
  is_except_char(c, trans)) {
12721
    if (what == UNAC_UNAC) {
12717
    if (what == UNAC_UNAC) {
12722
        // Unaccent only. Do nothing
12718
        // Unaccent only. Do nothing
12723
        p = 0;
12719
        p = 0;
12724
        l = 0;
12720
        l = 0;
12725
    } else {
12721
    } else {
12726
        // Has to be UNAC_UNACFOLD: use table
12722
        // Has to be UNAC_UNACFOLD: use table
12727
        p = (unsigned short *)(except_trans[idx].c_str() + 2);
12723
        p = (unsigned short *)trans.c_str();
12728
        l = (except_trans[idx].size() - 2) / 2;
12724
        l = trans.size() / 2;
12729
    }
12725
    }
12730
  /* if (p) {unsigned char *cp = (unsigned char *)p;
12731
     fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0], 
12732
     (unsigned int)cp[1]);}*/
12733
    } else {
12726
    } else {
12734
#endif /* RECOLL_DATADIR */
12727
#endif /* RECOLL_DATADIR */
12735
    unac_uf_char_utf16_(c, p, l, what)
12728
    unac_uf_char_utf16_(c, p, l, what)
12736
#ifdef RECOLL_DATADIR
12729
#ifdef RECOLL_DATADIR
12737
    }
12730
    }
...
...
13074
}
13067
}
13075
13068
13076
#ifdef RECOLL_DATADIR
13069
#ifdef RECOLL_DATADIR
13077
void unac_set_except_translations(const char *spectrans)
13070
void unac_set_except_translations(const char *spectrans)
13078
{
13071
{
13079
    except_chars.clear();
13080
    except_trans.clear();
13072
    except_trans.clear();
13081
    if (!spectrans || !spectrans[0])
13073
    if (!spectrans || !spectrans[0])
13082
    return;
13074
    return;
13083
13075
13084
    // The translation tables out of Unicode are in machine byte order (we
13076
    // The translation tables out of Unicode are in machine byte order (we
...
...
13121
    if (littleendian)
13113
    if (littleendian)
13122
        ch = (out[1] << 8) | (out[0] & 0xff);
13114
        ch = (out[1] << 8) | (out[0] & 0xff);
13123
    else
13115
    else
13124
        ch = (out[0] << 8) | (out[1] & 0xff);
13116
        ch = (out[0] << 8) | (out[1] & 0xff);
13125
13117
13126
  /* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
13127
  except_chars.push_back(ch);
13128
  // We keep ch as the first 2 bytes in the translation so that 
13129
  // both vectors sort identically
13130
    except_trans.push_back(string((const char *)out, outsize));
13118
    except_trans[ch] = string((const char *)(out + 2), outsize-2);
13131
    free(out);
13119
    free(out);
13132
    }
13120
    }
13133
    std::sort(except_chars.begin(), except_chars.end());
13134
    std::sort(except_trans.begin(), except_trans.end());
13135
}
13121
}
13136
#endif /* RECOLL_DATADIR */
13122
#endif /* RECOLL_DATADIR */