recoll / Code / Diff of /unac/unac.c

Diff of /unac/unac.c [623065] .. [773ab5]

Switch to unified view


...
#else /* UNAC_DEBUG_AVAILABLE */
#define DEBUG 
#define DEBUG_APPEND
#endif /* UNAC_DEBUG_AVAILABLE */


























int unacmaybefold_string_utf16(const char* in, size_t in_length,
              char** outp, size_t* out_lengthp, int dofold)
{
  char* out;
  int out_size;
...

/*
 * Forward declaration of convert(); its documentation comment follows
 * below.
 */
static int convert(const char* from, const char* to,
           const char* in, size_t in_length,
           char** outp, size_t* out_lengthp);

/*
 * Name of the UTF-16 big-endian charset. The conversion code below
 * compares its <from> and <to> charset names against this string with
 * strcmp() to detect the UTF-16BE side of a conversion.
 */
static const char *utf16be = "UTF-16BE";
/*
 * Cached iconv descriptors for the UTF-8 -> UTF-16BE and
 * UTF-16BE -> UTF-8 conversions. (iconv_t)-1 means "not opened yet":
 * the conversion code opens each descriptor on first use, resets it with
 * iconv(cd, 0, 0, 0, 0) on later uses, and skips iconv_close() for these
 * two descriptors so they can be reused across calls.
 * NOTE(review): this is shared file-scope state and no locking is visible
 * in this excerpt -- confirm callers are serialized.
 */
static iconv_t u8tou16_cd = (iconv_t)-1;
static iconv_t u16tou8_cd = (iconv_t)-1;

/*
 * Convert buffer <in> containing string encoded in charset <from> into
 * a string in charset <to> and return it in buffer <outp>. The <outp>
 * points to a malloced string large enough to hold the conversion result.
 * It is the responsibility of the caller to free this array.
...
  iconv_t cd;
  char* out;
  size_t out_remain;
  size_t out_size;
  char* out_base;
  int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
  const char space[] = { 0x00, 0x20 };

  if (!strcmp(utf16be, from)) {
      from_utf8 = 0;
      from_utf16 = 1;
  } else if (!strcasecmp("UTF-8", from)) {
      from_utf8 = 1;
      from_utf16 = 0;
  } else {
      from_utf8 = from_utf16 = 0;
  }
  if (!strcmp(utf16be, to)) {
      to_utf8 = 0;
      to_utf16 = 1;
  } else if (!strcasecmp("UTF-8", to)) {
      to_utf8 = 1;
      to_utf16 = 0;
  } else {
      to_utf8 = to_utf16 = 0;
  }
  u16tou8 = from_utf16 && to_utf8;
  u8tou16 = from_utf8 && to_utf16;

  out_size = in_length > 0 ? in_length : 1024;
  if(*outp) {
    out = *outp;
    /* +1 for null */
...
    }
  }
  out_remain = out_size;
  out_base = out;

  if (u8tou16) {
      if (u8tou16_cd == (iconv_t)-1) {
    if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
        return -1;
    }
      } else {
    iconv(u8tou16_cd, 0, 0, 0, 0);
      }
      cd = u8tou16_cd;
  } else if (u16tou8) {
      if (u16tou8_cd == (iconv_t)-1) {
    if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
        return -1;
    }
      } else {
    iconv(u16tou8_cd, 0, 0, 0, 0);
      }
      cd = u16tou8_cd;
  } else {
      if((cd = iconv_open(to, from)) == (iconv_t)-1) {
    return -1;
      }
  }

  do {
    if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
      switch(errno) {
      case EILSEQ:
    /*
...
    return -1;
    break;
      }
    }
  } while(in_length > 0);

  if (!u8tou16 && !u16tou8)
      iconv_close(cd);

  *outp = out_base;
  *out_lengthp = out - out_base;
  (*outp)[*out_lengthp] = '\0';

...
    char* utf16 = 0;
    size_t utf16_length = 0;
    char* utf16_unaccented = 0;
    size_t utf16_unaccented_length = 0;
  
    if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
      return -1;
    }

    unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
    free(utf16);

    if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
      return -1;
    }
    free(utf16_unaccented);
  }


	a/unac/unac.c		b/unac/unac.c
	...		...
10436	#else /* UNAC_DEBUG_AVAILABLE */	10436	#else /* UNAC_DEBUG_AVAILABLE */
10437	#define DEBUG	10437	#define DEBUG
10438	#define DEBUG_APPEND	10438	#define DEBUG_APPEND
10439	#endif /* UNAC_DEBUG_AVAILABLE */	10439	#endif /* UNAC_DEBUG_AVAILABLE */
10440		10440
10441
10442	/*
10443	* If UTF-16BE exists, use it. If not, use UTF-16 and hope it is
10444	* encoded in big endian. This fallback is a iconv related
10445	* compatibility hack introduced in some GNU/Linux distributions that
10446	* did not know UTF-16BE.
10447	*/
10448	static const char* utf16be(void)
10449	{
10450	iconv_t cd;
10451	static char* name = 0;
10452
10453	if(name == 0) {
10454	if((cd = iconv_open("UTF-16BE", "UTF-16BE")) == (iconv_t)-1) {
10455	if(debug_level >= UNAC_DEBUG_LOW) DEBUG("could not find UTF-16BE (see iconv -l), using UTF-16. If UTF-16 happens to be encoded in little endian, be prepared for an horrible mess.");
10456	name = "UTF-16";
10457	} else {
10458	iconv_close(cd);
10459	name = "UTF-16BE";
10460	}
10461	}
10462
10463	return name;
10464	}
10465
10466	int unacmaybefold_string_utf16(const char* in, size_t in_length,	10441	int unacmaybefold_string_utf16(const char* in, size_t in_length,
10467	char** outp, size_t* out_lengthp, int dofold)	10442	char** outp, size_t* out_lengthp, int dofold)
10468	{	10443	{
10469	char* out;	10444	char* out;
10470	int out_size;	10445	int out_size;
	...		...
10584		10559
10585	static int convert(const char* from, const char* to,	10560	static int convert(const char* from, const char* to,
10586	const char* in, size_t in_length,	10561	const char* in, size_t in_length,
10587	char** outp, size_t* out_lengthp);	10562	char** outp, size_t* out_lengthp);
10588		10563
		10564	static const char *utf16be = "UTF-16BE";
		10565	static iconv_t u8tou16_cd = (iconv_t)-1;
		10566	static iconv_t u16tou8_cd = (iconv_t)-1;
		10567
10589	/*	10568	/*
10590	* Convert buffer <in> containing string encoded in charset <from> into	10569	* Convert buffer <in> containing string encoded in charset <from> into
10591	* a string in charset <to> and return it in buffer <outp>. The <outp>	10570	* a string in charset <to> and return it in buffer <outp>. The <outp>
10592	* points to a malloced string large enough to hold the conversion result.	10571	* points to a malloced string large enough to hold the conversion result.
10593	* It is the responsibility of the caller to free this array.	10572	* It is the responsibility of the caller to free this array.
	...		...
10600	iconv_t cd;	10579	iconv_t cd;
10601	char* out;	10580	char* out;
10602	size_t out_remain;	10581	size_t out_remain;
10603	size_t out_size;	10582	size_t out_size;
10604	char* out_base;	10583	char* out_base;
10605	int from_utf16 = !strcmp(utf16be(), from);	10584	int from_utf16, from_utf8, to_utf16, to_utf8, u8tou16, u16tou8;
10606	const char space[] = { 0x00, 0x20 };	10585	const char space[] = { 0x00, 0x20 };
		10586
		10587	if (!strcmp(utf16be, from)) {
		10588	from_utf8 = 0;
		10589	from_utf16 = 1;
		10590	} else if (!strcasecmp("UTF-8", from)) {
		10591	from_utf8 = 1;
		10592	from_utf16 = 0;
		10593	} else {
		10594	from_utf8 = from_utf16 = 0;
		10595	}
		10596	if (!strcmp(utf16be, to)) {
		10597	to_utf8 = 0;
		10598	to_utf16 = 1;
		10599	} else if (!strcasecmp("UTF-8", to)) {
		10600	to_utf8 = 1;
		10601	to_utf16 = 0;
		10602	} else {
		10603	to_utf8 = to_utf16 = 0;
		10604	}
		10605	u16tou8 = from_utf16 && to_utf8;
		10606	u8tou16 = from_utf8 && to_utf16;
10607		10607
10608	out_size = in_length > 0 ? in_length : 1024;	10608	out_size = in_length > 0 ? in_length : 1024;
10609	if(*outp) {	10609	if(*outp) {
10610	out = *outp;	10610	out = *outp;
10611	/* +1 for null */	10611	/* +1 for null */
	...		...
10626	}	10626	}
10627	}	10627	}
10628	out_remain = out_size;	10628	out_remain = out_size;
10629	out_base = out;	10629	out_base = out;
10630		10630
		10631	if (u8tou16) {
		10632	if (u8tou16_cd == (iconv_t)-1) {
		10633	if((u8tou16_cd = iconv_open(to, from)) == (iconv_t)-1) {
		10634	return -1;
		10635	}
		10636	} else {
		10637	iconv(u8tou16_cd, 0, 0, 0, 0);
		10638	}
		10639	cd = u8tou16_cd;
		10640	} else if (u16tou8) {
		10641	if (u16tou8_cd == (iconv_t)-1) {
		10642	if((u16tou8_cd = iconv_open(to, from)) == (iconv_t)-1) {
		10643	return -1;
		10644	}
		10645	} else {
		10646	iconv(u16tou8_cd, 0, 0, 0, 0);
		10647	}
		10648	cd = u16tou8_cd;
		10649	} else {
10631	if((cd = iconv_open(to, from)) == (iconv_t)-1) {	10650	if((cd = iconv_open(to, from)) == (iconv_t)-1) {
10632	return -1;	10651	return -1;
		10652	}
10633	}	10653	}
		10654
10634	do {	10655	do {
10635	if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {	10656	if(iconv(cd, (ICONV_CONST char **) &in, &in_length, &out, &out_remain) == (size_t)-1) {
10636	switch(errno) {	10657	switch(errno) {
10637	case EILSEQ:	10658	case EILSEQ:
10638	/*	10659	/*
	...		...
10701	return -1;	10722	return -1;
10702	break;	10723	break;
10703	}	10724	}
10704	}	10725	}
10705	} while(in_length > 0);	10726	} while(in_length > 0);
		10727
		10728	if (!u8tou16 && !u16tou8)
10706	iconv_close(cd);	10729	iconv_close(cd);
10707		10730
10708	*outp = out_base;	10731	*outp = out_base;
10709	*out_lengthp = out - out_base;	10732	*out_lengthp = out - out_base;
10710	(*outp)[*out_lengthp] = '\0';	10733	(*outp)[*out_lengthp] = '\0';
10711		10734
	...		...
10731	char* utf16 = 0;	10754	char* utf16 = 0;
10732	size_t utf16_length = 0;	10755	size_t utf16_length = 0;
10733	char* utf16_unaccented = 0;	10756	char* utf16_unaccented = 0;
10734	size_t utf16_unaccented_length = 0;	10757	size_t utf16_unaccented_length = 0;
10735		10758
10736	if(convert(charset, utf16be(), in, in_length, &utf16, &utf16_length) < 0) {	10759	if(convert(charset, utf16be, in, in_length, &utf16, &utf16_length) < 0) {
10737	return -1;	10760	return -1;
10738	}	10761	}
10739		10762
10740	unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);	10763	unacmaybefold_string_utf16(utf16, utf16_length, &utf16_unaccented, &utf16_unaccented_length, dofold);
10741	free(utf16);	10764	free(utf16);
10742		10765
10743	if(convert(utf16be(), charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {	10766	if(convert(utf16be, charset, utf16_unaccented, utf16_unaccented_length, outp, out_lengthp) < 0) {
10744	return -1;	10767	return -1;
10745	}	10768	}
10746	free(utf16_unaccented);	10769	free(utf16_unaccented);
10747	}	10770	}
10748		10771