|
a/unac/unac.c |
|
b/unac/unac.c |
|
... |
|
... |
15 |
* along with this program; if not, write to the Free Software
|
15 |
* along with this program; if not, write to the Free Software
|
16 |
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
16 |
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
17 |
*/
|
17 |
*/
|
18 |
|
18 |
|
19 |
#ifdef HAVE_CONFIG_H
|
19 |
#ifdef HAVE_CONFIG_H
|
|
|
20 |
#ifdef RECOLL_DATADIR
|
|
|
21 |
#include "autoconfig.h"
|
|
|
22 |
#else
|
20 |
#include "config.h"
|
23 |
#include "config.h"
|
|
|
24 |
#endif /* RECOLL */
|
21 |
#endif /* HAVE_CONFIG_H */
|
25 |
#endif /* HAVE_CONFIG_H */
|
|
|
26 |
|
|
|
27 |
#ifdef RECOLL_DATADIR
|
|
|
28 |
/* Yes, recoll unac is actually C++; let's face modernity, I will not be
|
|
|
29 |
caught writing another binary search */
|
|
|
30 |
#include <vector>
|
|
|
31 |
#include <map>
|
|
|
32 |
#include <string>
|
|
|
33 |
#include <algorithm>
|
|
|
34 |
using std::string;
|
|
|
35 |
using std::vector;
|
|
|
36 |
using std::map;
|
|
|
37 |
#include "smallut.h"
|
|
|
38 |
|
|
|
39 |
/*
|
|
|
40 |
Storage for the exception translations. These are chars which
|
|
|
41 |
should not be translated according to what UnicodeData says, but
|
|
|
42 |
instead according to some local rule. There will usually be very
|
|
|
43 |
few of them, but they must be looked up for every translated char.
|
|
|
44 |
|
|
|
45 |
We use a sorted vector for fastest elimination by binary search and
|
|
|
46 |
a vector<string> to store the translations
|
|
|
47 |
*/
|
|
|
48 |
/* Exception source characters, kept sorted in ascending order so that
   they can be searched by bisection. */
static std::vector<unsigned short> except_chars;
/* Translations for the exception characters. Entry i is the
   translation for except_chars[i]. */
static std::vector<std::string> except_trans;

/*
 * Look up a UTF-16 code unit in the exception table.
 *
 * c: the code unit to search for.
 *
 * Returns the position of c inside except_chars, or (size_t)-1 when c
 * is not an exception character. The search is a binary search, so
 * except_chars must be sorted.
 */
static inline size_t is_except_char(unsigned short c)
{
    const std::vector<unsigned short>::iterator first = except_chars.begin();
    const std::vector<unsigned short>::iterator last = except_chars.end();

    /* First element which does not compare less than c */
    const std::vector<unsigned short>::iterator pos =
        std::lower_bound(first, last, c);

    if (pos != last && *pos == c) {
        return pos - first;
    }
    return (size_t(-1));
}
|
|
|
59 |
#endif /* RECOLL_DATADIR */
|
22 |
|
60 |
|
23 |
/*
|
61 |
/*
|
24 |
* If configure.in has not defined this symbol, assume const. It
|
62 |
* If configure.in has not defined this symbol, assume const. It
|
25 |
* does not harm much: a warning will be issued during compilation.
|
63 |
* does not harm much: a warning will be issued during compilation.
|
26 |
*/
|
64 |
*/
|
27 |
#ifndef ICONV_CONST
|
65 |
#ifndef ICONV_CONST
|
|
|
66 |
#ifdef RCL_ICONV_INBUF_CONST
|
|
|
67 |
#define ICONV_CONST const
|
|
|
68 |
#else
|
28 |
#define ICONV_CONST
|
69 |
#define ICONV_CONST
|
|
|
70 |
#endif
|
29 |
#endif /* ICONV_CONST */
|
71 |
#endif /* ICONV_CONST */
|
30 |
|
72 |
|
31 |
#include <stdlib.h>
|
73 |
#include <stdlib.h>
|
32 |
#include <string.h>
|
74 |
#include <string.h>
|
33 |
#include <iconv.h>
|
75 |
#include <iconv.h>
|
|
... |
|
... |
12620 |
char** outp, size_t* out_lengthp, int dofold)
|
12662 |
char** outp, size_t* out_lengthp, int dofold)
|
12621 |
{
|
12663 |
{
|
12622 |
char* out;
|
12664 |
char* out;
|
12623 |
int out_size;
|
12665 |
int out_size;
|
12624 |
int out_length;
|
12666 |
int out_length;
|
12625 |
int i;
|
12667 |
unsigned int i;
|
12626 |
|
12668 |
|
12627 |
out_size = in_length > 0 ? in_length : 1024;
|
12669 |
out_size = in_length > 0 ? in_length : 1024;
|
12628 |
|
12670 |
|
12629 |
out = *outp;
|
12671 |
out = *outp;
|
12630 |
out = realloc(out, out_size + 1);
|
12672 |
out = (char*)realloc(out, out_size + 1);
|
12631 |
if(out == 0) {
|
12673 |
if(out == 0) {
|
12632 |
if(debug_level >= UNAC_DEBUG_LOW)
|
12674 |
if(debug_level >= UNAC_DEBUG_LOW)
|
12633 |
DEBUG("realloc %d bytes failed\n", out_size+1);
|
12675 |
DEBUG("realloc %d bytes failed\n", out_size+1);
|
12634 |
/* *outp is still valid. Let the caller free it */
|
12676 |
/* *outp is still valid. Let the caller free it */
|
12635 |
return -1;
|
12677 |
return -1;
|
|
... |
|
... |
12644 |
int k;
|
12686 |
int k;
|
12645 |
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
12687 |
c = (in[i] << 8) | (in[i + 1] & 0xff);
|
12646 |
/*
|
12688 |
/*
|
12647 |
* Lookup the tables for decomposition information
|
12689 |
* Lookup the tables for decomposition information
|
12648 |
*/
|
12690 |
*/
|
12649 |
if (dofold) {
|
12691 |
#ifdef RECOLL_DATADIR
|
12650 |
unacfold_char_utf16(c, p, l);
|
12692 |
size_t idx;
|
|
|
12693 |
if (except_chars.size() != 0 && (idx=is_except_char(c)) != (size_t)-1) {
|
|
|
12694 |
p = (unsigned short *)(except_trans[idx].c_str() + 2);
|
|
|
12695 |
l = (except_trans[idx].size() - 2) / 2;
|
|
|
12696 |
/* unsigned char *cp = (unsigned char *)p;
|
|
|
12697 |
fprintf(stderr, "l %d cp[0] %x cp[1] %x\n", l, (unsigned int)cp[0],
|
|
|
12698 |
(unsigned int)cp[1]);*/
|
12651 |
} else {
|
12699 |
} else {
|
|
|
12700 |
#endif /* RECOLL_DATADIR */
|
|
|
12701 |
if (dofold) {
|
|
|
12702 |
unacfold_char_utf16(c, p, l);
|
|
|
12703 |
} else {
|
12652 |
unac_char_utf16(c, p, l);
|
12704 |
unac_char_utf16(c, p, l);
|
|
|
12705 |
}
|
|
|
12706 |
#ifdef RECOLL_DATADIR
|
12653 |
}
|
12707 |
}
|
|
|
12708 |
#endif /* RECOLL_DATADIR */
|
|
|
12709 |
|
12654 |
/*
|
12710 |
/*
|
12655 |
* Explain what's done in great detail
|
12711 |
* Explain what's done in great detail
|
12656 |
*/
|
12712 |
*/
|
12657 |
if(debug_level == UNAC_DEBUG_HIGH) {
|
12713 |
if(debug_level == UNAC_DEBUG_HIGH) {
|
12658 |
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
12714 |
unsigned short index = unac_indexes[(c) >> UNAC_BLOCK_SHIFT];
|
|
... |
|
... |
12676 |
*/
|
12732 |
*/
|
12677 |
if(out_length + ((l + 1) * 2) > out_size) {
|
12733 |
if(out_length + ((l + 1) * 2) > out_size) {
|
12678 |
char *saved;
|
12734 |
char *saved;
|
12679 |
out_size += ((l + 1) * 2) + 1024;
|
12735 |
out_size += ((l + 1) * 2) + 1024;
|
12680 |
saved = out;
|
12736 |
saved = out;
|
12681 |
out = realloc(out, out_size);
|
12737 |
out = (char *)realloc(out, out_size);
|
12682 |
if(out == 0) {
|
12738 |
if(out == 0) {
|
12683 |
if(debug_level >= UNAC_DEBUG_LOW)
|
12739 |
if(debug_level >= UNAC_DEBUG_LOW)
|
12684 |
DEBUG("realloc %d bytes failed\n", out_size);
|
12740 |
DEBUG("realloc %d bytes failed\n", out_size);
|
12685 |
free(saved);
|
12741 |
free(saved);
|
12686 |
*outp = 0;
|
12742 |
*outp = 0;
|
|
... |
|
... |
12796 |
u8tou16 = from_utf8 && to_utf16;
|
12852 |
u8tou16 = from_utf8 && to_utf16;
|
12797 |
|
12853 |
|
12798 |
out_size = in_length > 0 ? in_length : 1024;
|
12854 |
out_size = in_length > 0 ? in_length : 1024;
|
12799 |
|
12855 |
|
12800 |
out = *outp;
|
12856 |
out = *outp;
|
12801 |
out = realloc(out, out_size + 1);
|
12857 |
out = (char *)realloc(out, out_size + 1);
|
12802 |
if(out == 0) {
|
12858 |
if(out == 0) {
|
12803 |
/* *outp still valid, no freeing */
|
12859 |
/* *outp still valid, no freeing */
|
12804 |
if(debug_level >= UNAC_DEBUG_LOW)
|
12860 |
if(debug_level >= UNAC_DEBUG_LOW)
|
12805 |
DEBUG("realloc %d bytes failed\n", out_size+1);
|
12861 |
DEBUG("realloc %d bytes failed\n", out_size+1);
|
12806 |
goto out;
|
12862 |
goto out;
|
|
... |
|
... |
12882 |
int length = out - out_base;
|
12938 |
int length = out - out_base;
|
12883 |
out_size *= 2;
|
12939 |
out_size *= 2;
|
12884 |
{
|
12940 |
{
|
12885 |
char *saved = out_base;
|
12941 |
char *saved = out_base;
|
12886 |
/* +1 for null */
|
12942 |
/* +1 for null */
|
12887 |
out_base = realloc(out_base, out_size + 1);
|
12943 |
out_base = (char *)realloc(out_base, out_size + 1);
|
12888 |
if (out_base == 0) {
|
12944 |
if (out_base == 0) {
|
12889 |
/* *outp potentially not valid any more. Free here,
|
12945 |
/* *outp potentially not valid any more. Free here,
|
12890 |
* and zero out */
|
12946 |
* and zero out */
|
12891 |
if(debug_level >= UNAC_DEBUG_LOW)
|
12947 |
if(debug_level >= UNAC_DEBUG_LOW)
|
12892 |
DEBUG("realloc %d bytes failed\n", out_size+1);
|
12948 |
DEBUG("realloc %d bytes failed\n", out_size+1);
|
|
... |
|
... |
12927 |
* When converting an empty string, skip everything but alloc the
|
12983 |
* When converting an empty string, skip everything but alloc the
|
12928 |
* buffer if NULL pointer.
|
12984 |
* buffer if NULL pointer.
|
12929 |
*/
|
12985 |
*/
|
12930 |
if (in_length <= 0) {
|
12986 |
if (in_length <= 0) {
|
12931 |
if(!*outp) {
|
12987 |
if(!*outp) {
|
12932 |
if ((*outp = malloc(32)) == 0)
|
12988 |
if ((*outp = (char*)malloc(32)) == 0)
|
12933 |
return -1;
|
12989 |
return -1;
|
12934 |
}
|
12990 |
}
|
12935 |
(*outp)[0] = '\0';
|
12991 |
(*outp)[0] = '\0';
|
12936 |
*out_lengthp = 0;
|
12992 |
*out_lengthp = 0;
|
12937 |
} else {
|
12993 |
} else {
|
|
... |
|
... |
12973 |
const char* unac_version(void)
|
13029 |
const char* unac_version(void)
|
12974 |
{
|
13030 |
{
|
12975 |
return UNAC_VERSION;
|
13031 |
return UNAC_VERSION;
|
12976 |
}
|
13032 |
}
|
12977 |
|
13033 |
|
|
|
13034 |
#ifdef RECOLL_DATADIR
|
|
|
13035 |
void unac_set_except_translations(const char *spectrans)
|
|
|
13036 |
{
|
|
|
13037 |
except_chars.clear();
|
|
|
13038 |
except_trans.clear();
|
|
|
13039 |
if (!spectrans || !spectrans[0])
|
|
|
13040 |
return;
|
|
|
13041 |
|
|
|
13042 |
// The translation tables out of Unicode are in machine byte order (we
|
|
|
13043 |
// just let the compiler read the values).
|
|
|
13044 |
// For the translation part, we need to choose our encoding in accordance )
|
|
|
13045 |
// (16BE or 16LE depending on processor)
|
|
|
13046 |
// On the contrary, the source char is always to be compared to
|
|
|
13047 |
// the input text, which is encoded in UTF-16BE ... What a mess.
|
|
|
13048 |
static const char *machinecoding = 0;
|
|
|
13049 |
bool littleendian = true;
|
|
|
13050 |
if (machinecoding == 0) {
|
|
|
13051 |
const char* charshort = "\001\002";
|
|
|
13052 |
short *ip = (short *)charshort;
|
|
|
13053 |
if (*ip == 0x0102) {
|
|
|
13054 |
littleendian = false;
|
|
|
13055 |
machinecoding = "UTF-16BE";
|
|
|
13056 |
} else {
|
|
|
13057 |
littleendian = true;
|
|
|
13058 |
machinecoding = "UTF-16LE";
|
|
|
13059 |
}
|
|
|
13060 |
}
|
|
|
13061 |
|
|
|
13062 |
vector<string> vtrans;
|
|
|
13063 |
stringToStrings(spectrans, vtrans);
|
|
|
13064 |
|
|
|
13065 |
for (vector<string>::iterator it = vtrans.begin();
|
|
|
13066 |
it != vtrans.end(); it++) {
|
|
|
13067 |
|
|
|
13068 |
/* Convert the whole thing to utf-16be/le according to endianness */
|
|
|
13069 |
char *out = 0;
|
|
|
13070 |
size_t outsize;
|
|
|
13071 |
if (convert("UTF-8", machinecoding,
|
|
|
13072 |
it->c_str(), it->size(),
|
|
|
13073 |
&out, &outsize) != 0 || outsize < 2)
|
|
|
13074 |
continue;
|
|
|
13075 |
|
|
|
13076 |
/* The source char must be utf-16be as this is what we convert the
|
|
|
13077 |
input text to for internal processing */
|
|
|
13078 |
unsigned short ch;
|
|
|
13079 |
if (littleendian)
|
|
|
13080 |
ch = (out[1] << 8) | (out[0] & 0xff);
|
|
|
13081 |
else
|
|
|
13082 |
ch = (out[0] << 8) | (out[1] & 0xff);
|
|
|
13083 |
|
|
|
13084 |
/* fprintf(stderr, "outsize %d Ch is 0x%hx\n", int(outsize), ch);*/
|
|
|
13085 |
except_chars.push_back(ch);
|
|
|
13086 |
// We keep ch as the first 2 bytes in the translation so that
|
|
|
13087 |
// both vectors sort identically
|
|
|
13088 |
except_trans.push_back(string((const char *)out, outsize));
|
|
|
13089 |
free(out);
|
|
|
13090 |
}
|
|
|
13091 |
std::sort(except_chars.begin(), except_chars.end());
|
|
|
13092 |
std::sort(except_trans.begin(), except_trans.end());
|
|
|
13093 |
}
|
|
|
13094 |
#endif /* RECOLL_DATADIR */
|