--- a/unac/builder.in
+++ b/unac/builder.in
@@ -72,8 +72,24 @@
if($character_name =~ /^<(.*), (First|Last)>/) {
$ranges{$1}{$2} = $code_value;
}
+
+ # For Japanese kana characters we do not want to strip accents: I am
+ # told that they are essential and that stripping them does not
+ # make sense. (One may wonder why Unicode defines these
+ # decompositions at all.) The first solution tried was to decompose
+ # the accented kana but keep the accents. The problem was that
+ # the unaccented character would then match a string containing
+ # the accented one. So now kana are not decomposed at all. The
+ # drawback: if the original text was already decomposed, things do
+ # not work as intended, because we would have to recombine
+ # letter+accents in that case for the data to be unified.
if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
- $decomposition{$code_value} = $2;
+ # Record no decomposition for Hiragana and Katakana (U+3040..U+30FF)
+ if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
+ # nor for halfwidth Katakana (U+FF65..U+FF9F)
+ !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
+ $decomposition{$code_value} = $2;
+ }
}
if($general_category =~ /^M/) {
$mark{$code_value} = 1;
@@ -82,20 +98,16 @@
}
close(FILE);
- #
# Generate compatibility decomposition and strip marks
# (marks == diacritics == accents)
#
- # For kana japanese characters, we don't strip accents. Note: we just
- # need to test for the main kana (hiragana + katakana 3040-30ff) block,
- # characters such as halfwidth variations will be first decomposed into it
- #
- # We also forbid any excursion out of the basic plane. Sorry, Dave.
+ # We also forbid any excursion out of the Basic Multilingual Plane (above U+FFFF).
my($from, $to);
while(($from, $to) = each(%decomposition)) {
my(@code_values) = split(' ', $to);
my($code_value);
my(@decomposition);
+
while(@code_values) {
my($code_value) = shift(@code_values);
if (hex $code_value > 0xffff) {
@@ -104,8 +116,7 @@
}
if(exists($decomposition{$code_value})) {
push(@code_values, split(' ', $decomposition{$code_value}));
- } elsif (!exists($mark{$code_value}) ||
- (hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
+ } elsif (!exists($mark{$code_value})) {
push(@decomposition, $code_value);
}
}