|
a/unac/builder.in |
|
b/unac/builder.in |
|
... |
|
... |
71 |
#
|
71 |
#
|
72 |
if($character_name =~ /^<(.*), (First|Last)>/) {
|
72 |
if($character_name =~ /^<(.*), (First|Last)>/) {
|
73 |
$ranges{$1}{$2} = $code_value;
|
73 |
$ranges{$1}{$2} = $code_value;
|
74 |
}
|
74 |
}
|
75 |
|
75 |
|
|
|
76 |
# Test for exceptions to unaccenting. Note that this is
|
|
|
77 |
# mostly based on blocks when it should use the Unicode
|
|
|
78 |
# script property. In practice, for the script concerned,
|
|
|
79 |
# this does not look to be an issue currently
|
|
|
80 |
# (following comment made for japanese but also concerns
|
|
|
81 |
# other exceptions)
|
76 |
# For kana japanese characters, we don't want to strip accents as I'm
|
82 |
# For kana japanese characters, we don't want to strip
|
77 |
# told that they are essential and stripping them does not
|
83 |
# accents as I'm told that they are essential and
|
78 |
# make sense. Wonder why Unicode does these decompositions
|
84 |
# stripping them does not make sense.
|
79 |
# then... Problem: the first solution used was to decompose
|
85 |
# Problem: the first solution used was to decompose the
|
80 |
# the japanese accented kana and not remove accents. But then
|
86 |
# Japanese accented kana and not remove accents. But then
|
81 |
# the unaccented character would match the string with
|
87 |
# the unaccented character would match the string with
|
82 |
# accent. So now we don't decompose at all, but this means
|
88 |
# accent. So now we don't decompose at all, but this means
|
83 |
# that, if the original text was decomposed, things don't work
|
89 |
# that, if the original text was decomposed, things don't
|
84 |
# as intended as we should actually recombine the
|
90 |
# work as intended as we should actually recombine the
|
85 |
# letter+accents in this case for data to be unified.
|
91 |
# letter+accents in this case for data to be unified.
|
|
|
92 |
|
|
|
93 |
# Hiragana + Katakana
|
|
|
94 |
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)
|
|
|
95 |
# Halfwidth katakana
|
|
|
96 |
&& !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f)
|
|
|
97 |
# Hindi Devanagari
|
|
|
98 |
&& !(hex $code_value >= 0x0900 && hex $code_value <= 0x097f)
|
|
|
99 |
&& !(hex $code_value >= 0xa8e0 && hex $code_value <= 0xa8ff)
|
|
|
100 |
) {
|
|
|
101 |
# If a decomposition exists, record it
|
86 |
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
|
102 |
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
|
87 |
# Not for Hiragana + Katakana
|
|
|
88 |
if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
|
|
|
89 |
# and Halfwidth katakana
|
|
|
90 |
!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
|
|
|
91 |
$decomposition{$code_value} = $2;
|
103 |
$decomposition{$code_value} = $2;
|
92 |
}
|
104 |
}
|
93 |
}
|
|
|
94 |
if($general_category =~ /^M/) {
|
105 |
if($general_category =~ /^M/) {
|
95 |
$mark{$code_value} = 1;
|
106 |
$mark{$code_value} = 1;
|
96 |
# For mark characters, we generate a 0 entry in the
|
107 |
# For mark characters, we generate a 0 entry in the
|
97 |
# decomposition table. This signals to the c code that no
|
108 |
# decomposition table. This signals to the c code that no
|
98 |
# output should be generated. Slightly hacky but ok. The
|
109 |
# output should be generated. Slightly hacky but ok. The
|
99 |
# original code let mark characters go through (generating
|
110 |
# original code let mark characters go through (generating
|
100 |
# still accented output if the input was in decomposed
|
111 |
# still accented output if the input was in decomposed
|
101 |
# form). Decomposed text is rare, but, for example, macosx file
|
112 |
# form). Decomposed text is rare, but, for example, macosx file
|
102 |
# names have separate combining accent characters.
|
113 |
# names have separate combining accent characters.
|
103 |
$decomposition{$code_value} = "0000";
|
114 |
$decomposition{$code_value} = "0000";
|
|
|
115 |
}
|
104 |
}
|
116 |
}
|
105 |
$name{$code_value} = $character_name;
|
117 |
$name{$code_value} = $character_name;
|
106 |
}
|
118 |
}
|
107 |
close(FILE);
|
119 |
close(FILE);
|
108 |
|
120 |
|