Switch to unified view

a/unac/unac.c b/unac/unac.c
...
...
15
 * along with this program; if not, write to the Free Software
15
 * along with this program; if not, write to the Free Software
16
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
16
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17
 */
17
 */
18
18
19
#ifdef HAVE_CONFIG_H
19
#ifdef HAVE_CONFIG_H
20
#ifdef RECOLL_DATADIR
21
#include "autoconfig.h"
22
#else
20
#include "config.h"
23
#include "config.h"
24
#endif /* RECOLL */
21
#endif /* HAVE_CONFIG_H */
25
#endif /* HAVE_CONFIG_H */
26
27
#ifdef RECOLL_DATADIR
28
/* Yes, recoll unac is actually C++. Let's face modernity: I will not be
29
   caught writing another binary search  */
30
#include <vector>
31
#include <map>
32
#include <string>
33
#include <algorithm>
34
using std::string;
35
using std::vector;
36
using std::map;
37
#include "smallut.h"
38
39
/* 
40
   Storage for the exception translations. These are chars which
41
   should not be translated according to what UnicodeData says, but
42
   instead according to some local rule. There will usually be very
43
   few of them, but they must be looked up for every translated char.
44
   
45
   We use a sorted vector for fastest elimination by binary search and
46
   a vector<string> to store the translations
47
 */
48
static vector<unsigned short> except_chars;
49
static vector<string> except_trans;
50
static inline size_t is_except_char(unsigned short c)
51
{
52
    vector<unsigned short>::iterator it = 
53
  std::lower_bound(except_chars.begin(), except_chars.end(), c);
54
    if (it == except_chars.end() || *it != c) {
55
  return (size_t(-1));
56
    }
57
    return std::distance(except_chars.begin(), it);
58
}
59
#endif /* RECOLL_DATADIR */
22
60
23
/*
61
/*
24
 * If configure.in has not defined this symbol, assume const. It
62
 * If configure.in has not defined this symbol, assume const. It
25
 * does not harm much: a warning will be issued during compilation.
63
 * does not harm much: a warning will be issued during compilation.
26
 */
64
 */
27
#ifndef ICONV_CONST
65
#ifndef ICONV_CONST
66
#ifdef RCL_ICONV_INBUF_CONST
67
#define ICONV_CONST const
68
#else
28
#define ICONV_CONST
69
#define ICONV_CONST
70
#endif
29
#endif /* ICONV_CONST */
71
#endif /* ICONV_CONST */
30
72
31
#include <stdlib.h>
73
#include <stdlib.h>
32
#include <string.h>
74
#include <string.h>
33
#include <iconv.h>
75
#include <iconv.h>
...
...
12620
              char** outp, size_t* out_lengthp, int dofold)
12662
              char** outp, size_t* out_lengthp, int dofold)
12621
{
12663
{
12622
  char* out;
12664
  char* out;
12623
  int out_size;
12665
  int out_size;
12624
  int out_length;
12666
  int out_length;
12625
  int i;
12667
  unsigned int i;
12626
12668
12627
  out_size = in_length > 0 ? in_length : 1024;
12669
  out_size = in_length > 0 ? in_length : 1024;
12628
12670
12629
  out = *outp;
12671
  out = *outp;
12630
  out = realloc(out, out_size + 1);
12672
  out = (char*)realloc(out, out_size + 1);
12631
  if(out == 0) {
12673
  if(out == 0) {
12632
      if(debug_level >= UNAC_DEBUG_LOW)
12674
      if(debug_level >= UNAC_DEBUG_LOW)
12633
      DEBUG("realloc %d bytes failed\n", out_size+1);
12675
      DEBUG("realloc %d bytes failed\n", out_size+1);
12634
      /* *outp is still valid. Let the caller free it */
12676
      /* *outp is still valid. Let the caller free it */
12635
      return -1;
12677
      return -1;
...
...
12644
    int k;
12686
    int k;
12645
    c = (in[i] << 8) | (in[i + 1] & 0xff);
12687
    c = (in[i] << 8) | (in[i + 1] & 0xff);
12646
    /*
12688
    /*
12647
     * Lookup the tables for decomposition information
12689
     * Lookup the tables for decomposition information
12648
     */
12690
     */
12649
    if (dofold) {
12691
#ifdef RECOLL_DATADIR
12650
  unacfold_char_utf16(c, p, l);
12692
    size_t idx;
12693
    if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
12694
  p = (unsigned short *)(except_trans[idx].c_str() + 2);
12695
  l = (except_trans[idx].size() - 2) / 2;
12696
  /* unsigned char *cp = (unsigned char *)p;
12697
     fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0], 
12698
     (unsigned int)cp[1]);*/
12651
    } else {
12699
    } else {
12700
#endif /* RECOLL_DATADIR */
12701
  if (dofold) {
12702
      unacfold_char_utf16(c, p, l);
12703
  } else {
12652
    unac_char_utf16(c, p, l);
12704
        unac_char_utf16(c, p, l);
12705
  }
12706
#ifdef RECOLL_DATADIR
12653
    }
12707
    }
12708
#endif /* RECOLL_DATADIR */
12709
12654
    /*
12710
    /*
12655
     * Explain what's done in great detail
12711
     * Explain what's done in great detail
12656
     */
12712
     */
12657
    if(debug_level == UNAC_DEBUG_HIGH) {
12713
    if(debug_level == UNAC_DEBUG_HIGH) {
12658
      unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
12714
      unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
...
...
12676
     */
12732
     */
12677
    if(out_length + ((l + 1) * 2) > out_size) {
12733
    if(out_length + ((l + 1) * 2) > out_size) {
12678
      char *saved;
12734
      char *saved;
12679
      out_size += ((l + 1) * 2) + 1024;
12735
      out_size += ((l + 1) * 2) + 1024;
12680
      saved = out;
12736
      saved = out;
12681
      out = realloc(out, out_size);
12737
      out = (char *)realloc(out, out_size);
12682
      if(out == 0) {
12738
      if(out == 0) {
12683
    if(debug_level >= UNAC_DEBUG_LOW)
12739
    if(debug_level >= UNAC_DEBUG_LOW)
12684
      DEBUG("realloc %d bytes failed\n", out_size);
12740
      DEBUG("realloc %d bytes failed\n", out_size);
12685
        free(saved);
12741
        free(saved);
12686
    *outp = 0;
12742
    *outp = 0;
...
...
12796
  u8tou16 = from_utf8 && to_utf16;
12852
  u8tou16 = from_utf8 && to_utf16;
12797
12853
12798
  out_size = in_length > 0 ? in_length : 1024;
12854
  out_size = in_length > 0 ? in_length : 1024;
12799
12855
12800
  out = *outp;
12856
  out = *outp;
12801
  out = realloc(out, out_size + 1);
12857
  out = (char *)realloc(out, out_size + 1);
12802
  if(out == 0) {
12858
  if(out == 0) {
12803
      /* *outp still valid, no freeing */
12859
      /* *outp still valid, no freeing */
12804
      if(debug_level >= UNAC_DEBUG_LOW)
12860
      if(debug_level >= UNAC_DEBUG_LOW)
12805
      DEBUG("realloc %d bytes failed\n", out_size+1);
12861
      DEBUG("realloc %d bytes failed\n", out_size+1);
12806
      goto out;
12862
      goto out;
...
...
12882
      int length = out - out_base;
12938
      int length = out - out_base;
12883
      out_size *= 2;
12939
      out_size *= 2;
12884
      {
12940
      {
12885
          char *saved = out_base;
12941
          char *saved = out_base;
12886
          /* +1 for null */
12942
          /* +1 for null */
12887
          out_base = realloc(out_base, out_size + 1);
12943
          out_base = (char *)realloc(out_base, out_size + 1);
12888
          if (out_base == 0) {
12944
          if (out_base == 0) {
12889
          /* *outp potentially not valid any more. Free here,
12945
          /* *outp potentially not valid any more. Free here,
12890
           * and zero out */
12946
           * and zero out */
12891
          if(debug_level >= UNAC_DEBUG_LOW)
12947
          if(debug_level >= UNAC_DEBUG_LOW)
12892
              DEBUG("realloc %d bytes failed\n", out_size+1);
12948
              DEBUG("realloc %d bytes failed\n", out_size+1);
...
...
12927
   * When converting an empty string, skip everything but alloc the
12983
   * When converting an empty string, skip everything but alloc the
12928
   * buffer if NULL pointer.
12984
   * buffer if NULL pointer.
12929
   */
12985
   */
12930
  if (in_length <= 0) {
12986
  if (in_length <= 0) {
12931
      if(!*outp) {
12987
      if(!*outp) {
12932
      if ((*outp = malloc(32)) == 0)
12988
      if ((*outp = (char*)malloc(32)) == 0)
12933
          return -1;
12989
          return -1;
12934
      }
12990
      }
12935
      (*outp)[0] = '\0';
12991
      (*outp)[0] = '\0';
12936
      *out_lengthp = 0;
12992
      *out_lengthp = 0;
12937
  } else {
12993
  } else {
...
...
12973
const char* unac_version(void)
13029
const char* unac_version(void)
12974
{
13030
{
12975
  return UNAC_VERSION;
13031
  return UNAC_VERSION;
12976
}
13032
}
12977
13033
13034
#ifdef RECOLL_DATADIR
13035
void unac_set_except_translations(const char *spectrans)
13036
{
13037
    except_chars.clear();
13038
    except_trans.clear();
13039
    if (!spectrans || !spectrans[0])
13040
  return;
13041
13042
    // The translation tables out of Unicode are in machine byte order (we
13043
    // just let the compiler read the values). 
13044
    // For the translation part, we need to choose our encoding in accordance )
13045
    // (16BE or 16LE depending on processor)
13046
    // On the contrary, the source char is always to be compared to
13047
    // the input text, which is encoded in UTF-16BE ... What a mess.
13048
    static const char *machinecoding = 0;
13049
    bool littleendian = true;
13050
    if (machinecoding == 0) {
13051
  const char*  charshort = "\001\002";
13052
  short *ip = (short *)charshort;
13053
  if (*ip == 0x0102) {
13054
      littleendian = false;
13055
      machinecoding = "UTF-16BE";
13056
  } else {
13057
      littleendian = true;
13058
      machinecoding = "UTF-16LE";
13059
  }
13060
    }
13061
13062
    vector<string> vtrans;
13063
    stringToStrings(spectrans, vtrans);
13064
13065
    for (vector<string>::iterator it = vtrans.begin();
13066
   it != vtrans.end(); it++) {
13067
13068
  /* Convert the whole thing to utf-16be/le according to endianness */
13069
  char *out = 0;
13070
  size_t outsize;
13071
  if (convert("UTF-8", machinecoding,
13072
          it->c_str(), it->size(),
13073
          &out, &outsize) != 0 || outsize < 2)
13074
      continue;
13075
13076
  /* The source char must be utf-16be as this is what we convert the
13077
     input text to for internal processing */
13078
  unsigned short ch;
13079
  if (littleendian)
13080
      ch = (out[1] << 8) | (out[0] & 0xff);
13081
  else
13082
      ch = (out[0] << 8) | (out[1] & 0xff);
13083
13084
  /* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
13085
  except_chars.push_back(ch);
13086
  // We keep ch as the first 2 bytes in the translation so that 
13087
  // both vectors sort identically
13088
  except_trans.push_back(string((const char *)out, outsize));
13089
  free(out);
13090
    }
13091
    std::sort(except_chars.begin(), except_chars.end());
13092
    std::sort(except_trans.begin(), except_trans.end());
13093
}
13094
#endif /* RECOLL_DATADIR */