|
a/unac/unac.c |
|
b/unac/unac.c |
|
... |
|
... |
10436 |
#else /* UNAC_DEBUG_AVAILABLE */
|
10436 |
#else /* UNAC_DEBUG_AVAILABLE */
|
10437 |
#define DEBUG
|
10437 |
#define DEBUG
|
10438 |
#define DEBUG_APPEND
|
10438 |
#define DEBUG_APPEND
|
10439 |
#endif /* UNAC_DEBUG_AVAILABLE */
|
10439 |
#endif /* UNAC_DEBUG_AVAILABLE */
|
10440 |
|
10440 |
|
10441 |
|
|
|
10442 |
/*
|
|
|
10443 |
* If UTF-16BE exists, use it. If not, use UTF-16 and hope it is
|
|
|
10444 |
* encoded in big endian. This fallback is a iconv related
|
|
|
10445 |
* compatibility hack introduced in some GNU/Linux distributions that
|
|
|
10446 |
* did not know UTF-16BE.
|
|
|
10447 |
*/
|
|
|
10448 |
static const char* utf16be(void)
|
|
|
10449 |
{
|
|
|
10450 |
iconv_t cd;
|
|
|
10451 |
static char* name = 0;
|
|
|
10452 |
|
|
|
10453 |
if(name == 0) {
|
|
|
10454 |
if((cd = iconv_open("UTF-16BE", "UTF-16BE")) == (iconv_t)-1) {
|
|
|
10455 |
if(debug_level >= UNAC_DEBUG_LOW) DEBUG("could not find UTF-16BE (see iconv -l), using UTF-16. If UTF-16 happens to be encoded in little endian, be prepared for an horrible mess.");
|
|
|
10456 |
name = "UTF-16";
|
|
|
10457 |
} else {
|
|
|
10458 |
iconv_close(cd);
|
|
|
10459 |
name = "UTF-16BE";
|
|
|
10460 |
}
|
|
|
10461 |
}
|
|
|
10462 |
|
|
|
10463 |
return name;
|
|
|
10464 |
}
|
|
|
10465 |
|
|
|
10466 |
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
10441 |
int unacmaybefold_string_utf16(const char* in, size_t in_length,
|
10467 |
char** outp, size_t* out_lengthp, int dofold)
|
10442 |
char** outp, size_t* out_lengthp, int dofold)
|
10468 |
{
|
10443 |
{
|
10469 |
char* out;
|
10444 |
char* out;
|
10470 |
int out_size;
|
10445 |
int out_size;
|
|
... |
|
... |
10584 |
|
10559 |
|
10585 |
static int convert(const char* from, const char* to,
|
10560 |
static int convert(const char* from, const char* to,
|
10586 |
const char* in, size_t in_length,
|
10561 |
const char* in, size_t in_length,
|
10587 |
char** outp, size_t* out_lengthp);
|
10562 |
char** outp, size_t* out_lengthp);
|
10588 |
|
10563 |
|
|
|
10564 |
/*
 * iconv charset name used for big endian UTF-16. It is passed to
 * convert() as the <from>/<to> charset, and convert() compares its
 * arguments against this exact string with strcmp (case sensitive)
 * to detect UTF-16 conversions.
 */
static const char *utf16be = "UTF-16BE";
|
|
|
10565 |
/*
 * Conversion descriptor reused by convert() for UTF-8 to UTF-16BE
 * conversions. (iconv_t)-1 means "not opened yet": convert() opens it
 * on first use, resets it with iconv(cd, 0, 0, 0, 0) on later calls,
 * and does not close it after a successful conversion.
 * NOTE(review): file-scope shared state; presumably not safe if
 * convert() is called concurrently -- confirm against callers.
 */
static iconv_t u8tou16_cd = (iconv_t)-1;
|
|
|
10566 |
/*
 * Conversion descriptor reused by convert() for UTF-16BE to UTF-8
 * conversions. (iconv_t)-1 means "not opened yet": convert() opens it
 * on first use, resets it with iconv(cd, 0, 0, 0, 0) on later calls,
 * and does not close it after a successful conversion.
 * NOTE(review): file-scope shared state; presumably not safe if
 * convert() is called concurrently -- confirm against callers.
 */
static iconv_t u16tou8_cd = (iconv_t)-1;
|
|
|
10567 |
|
10589 |
/*
|
10568 |
/*
|
10590 |
* Convert buffer <in> containing string encoded in charset <from> into
|
10569 |
* Convert buffer <in> containing string encoded in charset <from> into
|
10591 |
* a string in charset <to> and return it in buffer <outp>. The <outp>
|
10570 |
* a string in charset <to> and return it in buffer <outp>. The <outp>
|
10592 |
* points to a malloced string large enough to hold the conversion result.
|
10571 |
* points to a malloced string large enough to hold the conversion result.
|
10593 |
* It is the responsibility of the caller to free this array.
|
10572 |
* It is the responsibility of the caller to free this array.
|
|
... |
|
... |
10600 |
iconv_t cd;
|
10579 |
iconv_t cd;
|
10601 |
char* out;
|
10580 |
char* out;
|
10602 |
size_t out_remain;
|
10581 |
size_t out_remain;
|
10603 |
size_t out_size;
|
10582 |
size_t out_size;
|
10604 |
char* out_base;
|
10583 |
char* out_base;
|
10605 |
int from_utf16 = !strcmp(utf16be(), from);
|
10584 |
int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
|
10606 |
const char space[] = { 0x00, 0x20 };
|
10585 |
const char space[] = { 0x00, 0x20 };
|
|
|
10586 |
|
|
|
10587 |
if (!strcmp(utf16be, from)) {
|
|
|
10588 |
from_utf8 = 0;
|
|
|
10589 |
from_utf16 = 1;
|
|
|
10590 |
} else if (!strcasecmp("UTF-8", from)) {
|
|
|
10591 |
from_utf8 = 1;
|
|
|
10592 |
from_utf16 = 0;
|
|
|
10593 |
} else {
|
|
|
10594 |
from_utf8 = from_utf16 = 0;
|
|
|
10595 |
}
|
|
|
10596 |
if (!strcmp(utf16be, to)) {
|
|
|
10597 |
to_utf8 = 0;
|
|
|
10598 |
to_utf16 = 1;
|
|
|
10599 |
} else if (!strcasecmp("UTF-8", to)) {
|
|
|
10600 |
to_utf8 = 1;
|
|
|
10601 |
to_utf16 = 0;
|
|
|
10602 |
} else {
|
|
|
10603 |
to_utf8 = to_utf16 = 0;
|
|
|
10604 |
}
|
|
|
10605 |
u16tou8 = from_utf16 && to_utf8;
|
|
|
10606 |
u8tou16 = from_utf8 && to_utf16;
|
10607 |
|
10607 |
|
10608 |
out_size = in_length > 0 ? in_length : 1024;
|
10608 |
out_size = in_length > 0 ? in_length : 1024;
|
10609 |
if(*outp) {
|
10609 |
if(*outp) {
|
10610 |
out = *outp;
|
10610 |
out = *outp;
|
10611 |
/* +1 for null */
|
10611 |
/* +1 for null */
|
|
... |
|
... |
10626 |
}
|
10626 |
}
|
10627 |
}
|
10627 |
}
|
10628 |
out_remain = out_size;
|
10628 |
out_remain = out_size;
|
10629 |
out_base = out;
|
10629 |
out_base = out;
|
10630 |
|
10630 |
|
|
|
10631 |
if (u8tou16) {
|
|
|
10632 |
if (u8tou16_cd == (iconv_t)-1) {
|
|
|
10633 |
if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
|
|
10634 |
return -1;
|
|
|
10635 |
}
|
|
|
10636 |
} else {
|
|
|
10637 |
iconv(u8tou16_cd, 0, 0, 0, 0);
|
|
|
10638 |
}
|
|
|
10639 |
cd = u8tou16_cd;
|
|
|
10640 |
} else if (u16tou8) {
|
|
|
10641 |
if (u16tou8_cd == (iconv_t)-1) {
|
|
|
10642 |
if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
|
|
|
10643 |
return -1;
|
|
|
10644 |
}
|
|
|
10645 |
} else {
|
|
|
10646 |
iconv(u16tou8_cd, 0, 0, 0, 0);
|
|
|
10647 |
}
|
|
|
10648 |
cd = u16tou8_cd;
|
|
|
10649 |
} else {
|
10631 |
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
10650 |
if((cd = iconv_open(to, from)) == (iconv_t)-1) {
|
10632 |
return -1;
|
10651 |
return -1;
|
|
|
10652 |
}
|
10633 |
}
|
10653 |
}
|
|
|
10654 |
|
10634 |
do {
|
10655 |
do {
|
10635 |
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
10656 |
if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
|
10636 |
switch(errno) {
|
10657 |
switch(errno) {
|
10637 |
case EILSEQ:
|
10658 |
case EILSEQ:
|
10638 |
/*
|
10659 |
/*
|
|
... |
|
... |
10701 |
return -1;
|
10722 |
return -1;
|
10702 |
break;
|
10723 |
break;
|
10703 |
}
|
10724 |
}
|
10704 |
}
|
10725 |
}
|
10705 |
} while(in_length > 0);
|
10726 |
} while(in_length > 0);
|
|
|
10727 |
|
|
|
10728 |
if (!u8tou16 && !u16tou8)
|
10706 |
iconv_close(cd);
|
10729 |
iconv_close(cd);
|
10707 |
|
10730 |
|
10708 |
*outp = out_base;
|
10731 |
*outp = out_base;
|
10709 |
*out_lengthp = out - out_base;
|
10732 |
*out_lengthp = out - out_base;
|
10710 |
(*outp)[*out_lengthp] = '\0';
|
10733 |
(*outp)[*out_lengthp] = '\0';
|
10711 |
|
10734 |
|
|
... |
|
... |
10731 |
char* utf16 = 0;
|
10754 |
char* utf16 = 0;
|
10732 |
size_t utf16_length = 0;
|
10755 |
size_t utf16_length = 0;
|
10733 |
char* utf16_unaccented = 0;
|
10756 |
char* utf16_unaccented = 0;
|
10734 |
size_t utf16_unaccented_length = 0;
|
10757 |
size_t utf16_unaccented_length = 0;
|
10735 |
|
10758 |
|
10736 |
if(convert(charset, utf16be(), in, in_length, &utf16, &utf16_length) < 0) {
|
10759 |
if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
|
10737 |
return -1;
|
10760 |
return -1;
|
10738 |
}
|
10761 |
}
|
10739 |
|
10762 |
|
10740 |
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
|
10763 |
unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
|
10741 |
free(utf16);
|
10764 |
free(utf16);
|
10742 |
|
10765 |
|
10743 |
if(convert(utf16be(), charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
|
10766 |
if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
|
10744 |
return -1;
|
10767 |
return -1;
|
10745 |
}
|
10768 |
}
|
10746 |
free(utf16_unaccented);
|
10769 |
free(utf16_unaccented);
|
10747 |
}
|
10770 |
}
|
10748 |
|
10771 |
|