|
a/unac/builder.in |
|
b/unac/builder.in |
|
... |
|
... |
70 |
# CJK ideographs for instance.
|
70 |
# CJK ideographs for instance.
|
71 |
#
|
71 |
#
|
72 |
if($character_name =~ /^<(.*), (First|Last)>/) {
|
72 |
if($character_name =~ /^<(.*), (First|Last)>/) {
|
73 |
$ranges{$1}{$2} = $code_value;
|
73 |
$ranges{$1}{$2} = $code_value;
|
74 |
}
|
74 |
}
|
|
|
75 |
|
|
|
76 |
# For Japanese kana characters, we don't want to strip accents as I'm
|
|
|
77 |
# told that they are essential and stripping them does not
|
|
|
78 |
# make sense. Wonder why Unicode does these decompositions
|
|
|
79 |
# then... Problem: the first solution used was to decompose
|
|
|
80 |
# the Japanese accented kana and not remove accents. But then
|
|
|
81 |
# the unaccented character would match the string with
|
|
|
82 |
# accent. So now we don't decompose at all, but this means
|
|
|
83 |
# that, if the original text was decomposed, things don't work
|
|
|
84 |
# as intended as we should actually recombine the
|
|
|
85 |
# letter+accents in this case for data to be unified.
|
75 |
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
|
86 |
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
|
|
|
87 |
# Not for Hiragana + Katakana
|
|
|
88 |
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
|
|
|
89 |
# and Halfwidth katakana
|
|
|
90 |
!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
|
76 |
$decomposition{$code_value} = $2;
|
91 |
$decomposition{$code_value} = $2;
|
|
|
92 |
}
|
77 |
}
|
93 |
}
|
78 |
if($general_category =~ /^M/) {
|
94 |
if($general_category =~ /^M/) {
|
79 |
$mark{$code_value} = 1;
|
95 |
$mark{$code_value} = 1;
|
80 |
}
|
96 |
}
|
81 |
$name{$code_value} = $character_name;
|
97 |
$name{$code_value} = $character_name;
|
82 |
}
|
98 |
}
|
83 |
close(FILE);
|
99 |
close(FILE);
|
84 |
|
100 |
|
85 |
#
|
|
|
86 |
# Generate compatibility decomposition and strip marks
|
101 |
# Generate compatibility decomposition and strip marks
|
87 |
# (marks == diacritics == accents)
|
102 |
# (marks == diacritics == accents)
|
88 |
#
|
103 |
#
|
89 |
# For kana japanese characters, we don't strip accents. Note: we just
|
|
|
90 |
# need to test for the main kana (hiragana + katakana 3040-30ff) block,
|
|
|
91 |
# characters such as halfwidth variations will be first decomposed into it
|
|
|
92 |
#
|
|
|
93 |
# We also forbid any excursion out of the basic plane. Sorry, Dave.
|
104 |
# We also forbid any excursion out of the basic plane.
|
94 |
my($from, $to);
|
105 |
my($from, $to);
|
95 |
while(($from, $to) = each(%decomposition)) {
|
106 |
while(($from, $to) = each(%decomposition)) {
|
96 |
my(@code_values) = split(' ', $to);
|
107 |
my(@code_values) = split(' ', $to);
|
97 |
my($code_value);
|
108 |
my($code_value);
|
98 |
my(@decomposition);
|
109 |
my(@decomposition);
|
|
|
110 |
|
99 |
while(@code_values) {
|
111 |
while(@code_values) {
|
100 |
my($code_value) = shift(@code_values);
|
112 |
my($code_value) = shift(@code_values);
|
101 |
if (hex $code_value > 0xffff) {
|
113 |
if (hex $code_value > 0xffff) {
|
102 |
undef @decomposition;
|
114 |
undef @decomposition;
|
103 |
last;
|
115 |
last;
|
104 |
}
|
116 |
}
|
105 |
if(exists($decomposition{$code_value})) {
|
117 |
if(exists($decomposition{$code_value})) {
|
106 |
push(@code_values, split(' ', $decomposition{$code_value}));
|
118 |
push(@code_values, split(' ', $decomposition{$code_value}));
|
107 |
} elsif (!exists($mark{$code_value}) ||
|
119 |
} elsif (!exists($mark{$code_value})) {
|
108 |
(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
|
|
|
109 |
push(@decomposition, $code_value);
|
120 |
push(@decomposition, $code_value);
|
110 |
}
|
121 |
}
|
111 |
}
|
122 |
}
|
112 |
if(@decomposition) {
|
123 |
if(@decomposition) {
|
113 |
$decomposition{$from} = "@decomposition";
|
124 |
$decomposition{$from} = "@decomposition";
|