Switch to unified view

a/unac/unac.c b/unac/unac.c
...
...
10436
#else /* UNAC_DEBUG_AVAILABLE */
10436
#else /* UNAC_DEBUG_AVAILABLE */
10437
#define DEBUG 
10437
#define DEBUG 
10438
#define DEBUG_APPEND
10438
#define DEBUG_APPEND
10439
#endif /* UNAC_DEBUG_AVAILABLE */
10439
#endif /* UNAC_DEBUG_AVAILABLE */
10440
10440
10441
10442
/*
10443
 * If UTF-16BE exists, use it. If not, use UTF-16 and hope it is
10444
 * encoded in big endian. This fallback is an iconv-related
10445
 * compatibility hack introduced in some GNU/Linux distributions that
10446
 * did not know UTF-16BE.
10447
 */
10448
/* Returns the charset name to hand to iconv for big-endian UTF-16.
   The probe below runs on the first call only; later calls return the
   name cached in the function-local static. */
static const char* utf16be(void)
10449
{
10450
  iconv_t cd;
10451
  /* Cached answer; stays 0 until the first call has probed iconv. */
  static char* name = 0;
10452
10453
  if(name == 0) {
10454
    /* Probe only: try to open a UTF-16BE -> UTF-16BE converter to find out
       whether iconv_open accepts the name at all. */
    if((cd = iconv_open("UTF-16BE", "UTF-16BE")) == (iconv_t)-1) {
10455
      if(debug_level >= UNAC_DEBUG_LOW) DEBUG("could not find UTF-16BE (see iconv -l), using UTF-16. If UTF-16 happens to be encoded in little endian, be prepared for an horrible mess.");
10456
      /* Fallback name; its byte order is not verified here. */
      name = "UTF-16";
10457
    } else {
10458
      /* The descriptor was opened only for the probe, so release it. */
      iconv_close(cd);
10459
      name = "UTF-16BE";
10460
    }
10461
  }
10462
10463
  return name;
10464
}
10465
10466
int unacmaybefold_string_utf16(const char* in, size_t in_length,
10441
int unacmaybefold_string_utf16(const char* in, size_t in_length,
10467
              char** outp, size_t* out_lengthp, int dofold)
10442
              char** outp, size_t* out_lengthp, int dofold)
10468
{
10443
{
10469
  char* out;
10444
  char* out;
10470
  int out_size;
10445
  int out_size;
...
...
10584
10559
10585
static int convert(const char* from, const char* to,
10560
static int convert(const char* from, const char* to,
10586
           const char* in, size_t in_length,
10561
           const char* in, size_t in_length,
10587
           char** outp, size_t* out_lengthp);
10562
           char** outp, size_t* out_lengthp);
10588
10563
10564
/* Charset name that convert() compares against its <from> and <to>
   arguments to recognise the UTF-16 side of a conversion. */
static const char *utf16be = "UTF-16BE";
10565
/* Conversion descriptors for UTF-8 -> UTF-16BE and UTF-16BE -> UTF-8.
   (iconv_t)-1 means "not opened yet": convert() opens each one on first
   use, resets it with iconv(cd, 0, 0, 0, 0) on later uses, and does not
   close it when the conversion finishes. */
static iconv_t u8tou16_cd = (iconv_t)-1;
10566
static iconv_t u16tou8_cd = (iconv_t)-1;
10567
10589
/*
10568
/*
10590
 * Convert buffer <in> containing string encoded in charset <from> into
10569
 * Convert buffer <in> containing string encoded in charset <from> into
10591
 * a string in charset <to> and return it in buffer <outp>. The <outp>
10570
 * a string in charset <to> and return it in buffer <outp>. The <outp>
10592
 * points to a malloced string large enough to hold the conversion result.
10571
 * points to a malloced string large enough to hold the conversion result.
10593
 * It is the responsibility of the caller to free this array.
10572
 * It is the responsibility of the caller to free this array.
...
...
10600
  iconv_t cd;
10579
  iconv_t cd;
10601
  char* out;
10580
  char* out;
10602
  size_t out_remain;
10581
  size_t out_remain;
10603
  size_t out_size;
10582
  size_t out_size;
10604
  char* out_base;
10583
  char* out_base;
10605
  int from_utf16 = !strcmp(utf16be(), from);
10584
  int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
10606
  const char space[] = { 0x00, 0x20 };
10585
  const char space[] = { 0x00, 0x20 };
10586
10587
  if (!strcmp(utf16be, from)) {
10588
      from_utf8 = 0;
10589
      from_utf16 = 1;
10590
  } else if (!strcasecmp("UTF-8", from)) {
10591
      from_utf8 = 1;
10592
      from_utf16 = 0;
10593
  } else {
10594
      from_utf8 = from_utf16 = 0;
10595
  }
10596
  if (!strcmp(utf16be, to)) {
10597
      to_utf8 = 0;
10598
      to_utf16 = 1;
10599
  } else if (!strcasecmp("UTF-8", to)) {
10600
      to_utf8 = 1;
10601
      to_utf16 = 0;
10602
  } else {
10603
      to_utf8 = to_utf16 = 0;
10604
  }
10605
  u16tou8 = from_utf16 && to_utf8;
10606
  u8tou16 = from_utf8 && to_utf16;
10607
10607
10608
  out_size = in_length > 0 ? in_length : 1024;
10608
  out_size = in_length > 0 ? in_length : 1024;
10609
  if(*outp) {
10609
  if(*outp) {
10610
    out = *outp;
10610
    out = *outp;
10611
    /* +1 for null */
10611
    /* +1 for null */
...
...
10626
    }
10626
    }
10627
  }
10627
  }
10628
  out_remain = out_size;
10628
  out_remain = out_size;
10629
  out_base = out;
10629
  out_base = out;
10630
10630
10631
  if (u8tou16) {
10632
      if (u8tou16_cd == (iconv_t)-1) {
10633
    if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
10634
        return -1;
10635
    }
10636
      } else {
10637
    iconv(u8tou16_cd, 0, 0, 0, 0);
10638
      }
10639
      cd = u8tou16_cd;
10640
  } else if (u16tou8) {
10641
      if (u16tou8_cd == (iconv_t)-1) {
10642
    if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
10643
        return -1;
10644
    }
10645
      } else {
10646
    iconv(u16tou8_cd, 0, 0, 0, 0);
10647
      }
10648
      cd = u16tou8_cd;
10649
  } else {
10631
  if((cd = iconv_open(to, from)) == (iconv_t)-1) {
10650
      if((cd = iconv_open(to, from)) == (iconv_t)-1) {
10632
    return -1;
10651
    return -1;
10652
      }
10633
  }
10653
  }
10654
10634
  do {
10655
  do {
10635
    if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
10656
    if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
10636
      switch(errno) {
10657
      switch(errno) {
10637
      case EILSEQ:
10658
      case EILSEQ:
10638
    /*
10659
    /*
...
...
10701
    return -1;
10722
    return -1;
10702
    break;
10723
    break;
10703
      }
10724
      }
10704
    }
10725
    }
10705
  } while(in_length > 0);
10726
  } while(in_length > 0);
10727
10728
  if (!u8tou16 && !u16tou8)
10706
  iconv_close(cd);
10729
      iconv_close(cd);
10707
10730
10708
  *outp = out_base;
10731
  *outp = out_base;
10709
  *out_lengthp = out - out_base;
10732
  *out_lengthp = out - out_base;
10710
  (*outp)[*out_lengthp] = '\0';
10733
  (*outp)[*out_lengthp] = '\0';
10711
10734
...
...
10731
    char* utf16 = 0;
10754
    char* utf16 = 0;
10732
    size_t utf16_length = 0;
10755
    size_t utf16_length = 0;
10733
    char* utf16_unaccented = 0;
10756
    char* utf16_unaccented = 0;
10734
    size_t utf16_unaccented_length = 0;
10757
    size_t utf16_unaccented_length = 0;
10735
  
10758
  
10736
    if(convert(charset, utf16be(), in, in_length, &utf16, &utf16_length) < 0) {
10759
    if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
10737
      return -1;
10760
      return -1;
10738
    }
10761
    }
10739
10762
10740
    unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
10763
    unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
10741
    free(utf16);
10764
    free(utf16);
10742
10765
10743
    if(convert(utf16be(), charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
10766
    if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
10744
      return -1;
10767
      return -1;
10745
    }
10768
    }
10746
    free(utf16_unaccented);
10769
    free(utf16_unaccented);
10747
  }
10770
  }
10748
10771