Switch to unified view

a/unac/builder.in b/unac/builder.in
...
...
70
    # CJK ideographs for instance.
70
    # CJK ideographs for instance.
71
    #
71
    #
72
    if($character_name =~ /^<(.*), (First|Last)>/) {
72
    if($character_name =~ /^<(.*), (First|Last)>/) {
73
        $ranges{$1}{$2} = $code_value;
73
        $ranges{$1}{$2} = $code_value;
74
    }
74
    }
75
76
  # For Japanese kana characters, we don't want to strip accents as I'm
77
  # told that they are essential and stripping them does not
78
  # make sense. Wonder why Unicode does these decompositions
79
  # then...  Problem: the first solution used was to decompose
80
  # the accented Japanese kana without removing the accents. But then
81
  # the unaccented character would match the string with
82
  # accent. So now we don't decompose at all, but this means
83
  # that, if the original text was decomposed, things don't work
84
  # as intended: we should actually recombine the
85
  # letter+accents in this case for data to be unified.
75
    if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
86
    if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
87
      # Not for Hiragana + Katakana 
88
      if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
89
      # and Halfwidth katakana
90
      !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
76
        $decomposition{$code_value} = $2;
91
      $decomposition{$code_value} = $2;
92
      }
77
    }
93
    }
78
    if($general_category =~ /^M/) {
94
    if($general_category =~ /^M/) {
79
        $mark{$code_value} = 1;
95
        $mark{$code_value} = 1;
80
    }
96
    }
81
    $name{$code_value} = $character_name;
97
    $name{$code_value} = $character_name;
82
    }
98
    }
83
    close(FILE);
99
    close(FILE);
84
    
100
    
85
    #
86
    # Generate compatibility decomposition and strip marks
101
    # Generate compatibility decomposition and strip marks
87
    # (marks == diacritics == accents)
102
    # (marks == diacritics == accents)
88
    #
103
    #
89
    # For kana japanese characters, we don't strip accents. Note: we just
90
    # need to test for the main kana (hiragana + katakana 3040-30ff) block,
91
    # characters such as halfwidth variations will be first decomposed into it
92
    #
93
    # We also forbid any excursion out of the basic plane. Sorry, Dave.
104
    # We also forbid any excursion out of the Basic Multilingual Plane (above U+FFFF).
94
    my($from, $to);
105
    my($from, $to);
95
    while(($from, $to) = each(%decomposition)) {
106
    while(($from, $to) = each(%decomposition)) {
96
    my(@code_values) = split(' ', $to);
107
    my(@code_values) = split(' ', $to);
97
    my($code_value);
108
    my($code_value);
98
    my(@decomposition);
109
    my(@decomposition);
110
99
    while(@code_values) {
111
    while(@code_values) {
100
        my($code_value) = shift(@code_values);
112
        my($code_value) = shift(@code_values);
101
        if (hex $code_value > 0xffff) {
113
        if (hex $code_value > 0xffff) {
102
        undef @decomposition;
114
        undef @decomposition;
103
        last;
115
        last;
104
        }
116
        }
105
        if(exists($decomposition{$code_value})) {
117
        if(exists($decomposition{$code_value})) {
106
        push(@code_values, split(' ', $decomposition{$code_value}));
118
        push(@code_values, split(' ', $decomposition{$code_value}));
107
        } elsif (!exists($mark{$code_value}) || 
119
        } elsif (!exists($mark{$code_value})) {
108
           (hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
109
        push(@decomposition, $code_value);
120
        push(@decomposition, $code_value);
110
        }
121
        }
111
    }
122
    }
112
    if(@decomposition) {
123
    if(@decomposition) {
113
        $decomposition{$from} = "@decomposition";
124
        $decomposition{$from} = "@decomposition";