recoll / Code / Diff of /unac/builder.in

Diff of /unac/builder.in [36919a] .. [623065]

Switch to unified view


...
    # CJK ideographs for instance.
    #
    if($character_name =~ /^<(.*), (First|Last)>/) {
        $ranges{$1}{$2} = $code_value;
    }

  # For Japanese kana characters, we don't want to strip accents, as I'm
  # told that they are essential and stripping them does not
  # make sense. Wonder why Unicode defines these decompositions
  # then...  Problem: the first solution used was to decompose
  # the accented Japanese kana but not remove the accents. But then
  # the unaccented character would match the string with
  # the accent. So now we don't decompose at all, but this means
  # that, if the original text was already decomposed, things don't
  # work as intended: we should actually recombine the
  # letter+accents in this case for the data to be unified.
    if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
      # Not for Hiragana + Katakana 
      if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
      # and Halfwidth katakana
      !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
      $decomposition{$code_value} = $2;
      }
    }
    if($general_category =~ /^M/) {
        $mark{$code_value} = 1;
    }
    $name{$code_value} = $character_name;
    }
    close(FILE);
    

    # Generate compatibility decomposition and strip marks
    # (marks == diacritics == accents)
    #




    # We also forbid any excursion out of the basic plane. 
    my($from, $to);
    while(($from, $to) = each(%decomposition)) {
    my(@code_values) = split(' ', $to);
    my($code_value);
    my(@decomposition);

    while(@code_values) {
        my($code_value) = shift(@code_values);
        if (hex $code_value > 0xffff) {
        undef @decomposition;
        last;
        }
        if(exists($decomposition{$code_value})) {
        push(@code_values, split(' ', $decomposition{$code_value}));
        } elsif (!exists($mark{$code_value})) {

        push(@decomposition, $code_value);
        }
    }
    if(@decomposition) {
        $decomposition{$from} = "@decomposition";

	a/unac/builder.in		b/unac/builder.in
	...		...
70	# CJK ideographs for instance.	70	# CJK ideographs for instance.
71	#	71	#
72	if($character_name =~ /^<(.*), (First|Last)>/) {	72	if($character_name =~ /^<(.*), (First|Last)>/) {
73	$ranges{$1}{$2} = $code_value;	73	$ranges{$1}{$2} = $code_value;
74	}	74	}
		75
		76	# For kana japanese characters, we don't want to strip accents as I'm
		77	# told that they are essential and stripping them does not
		78	# make sense. Wonder why Unicode does these decompositions
		79	# then... Problem: the first solution used was to decompose
		80	# the japanese accented kana and not remove accents. But then
		81	# the unaccented character would match the string with
		82	# accent. So now we don't decompose at all, but this means
		83	# that, if the original text was decomposed, things don't work
		84	# as intended as we should actually recombine the
		85	# letter+accents in this case for data to be unified.
75	if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {	86	if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
		87	# Not for Hiragana + Katakana
		88	if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
		89	# and Halfwidth katakana
		90	!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
76	$decomposition{$code_value} = $2;	91	$decomposition{$code_value} = $2;
		92	}
77	}	93	}
78	if($general_category =~ /^M/) {	94	if($general_category =~ /^M/) {
79	$mark{$code_value} = 1;	95	$mark{$code_value} = 1;
80	}	96	}
81	$name{$code_value} = $character_name;	97	$name{$code_value} = $character_name;
82	}	98	}
83	close(FILE);	99	close(FILE);
84		100
85	#
86	# Generate compatibility decomposition and strip marks	101	# Generate compatibility decomposition and strip marks
87	# (marks == diacritics == accents)	102	# (marks == diacritics == accents)
88	#	103	#
89	# For kana japanese characters, we don't strip accents. Note: we just
90	# need to test for the main kana (hiragana + katakana 3040-30ff) block,
91	# characters such as halfwidth variations will be first decomposed into it
92	#
93	# We also forbid any excursion out of the basic plane. Sorry, Dave.	104	# We also forbid any excursion out of the basic plane.
94	my($from, $to);	105	my($from, $to);
95	while(($from, $to) = each(%decomposition)) {	106	while(($from, $to) = each(%decomposition)) {
96	my(@code_values) = split(' ', $to);	107	my(@code_values) = split(' ', $to);
97	my($code_value);	108	my($code_value);
98	my(@decomposition);	109	my(@decomposition);
		110
99	while(@code_values) {	111	while(@code_values) {
100	my($code_value) = shift(@code_values);	112	my($code_value) = shift(@code_values);
101	if (hex $code_value > 0xffff) {	113	if (hex $code_value > 0xffff) {
102	undef @decomposition;	114	undef @decomposition;
103	last;	115	last;
104	}	116	}
105	if(exists($decomposition{$code_value})) {	117	if(exists($decomposition{$code_value})) {
106	push(@code_values, split(' ', $decomposition{$code_value}));	118	push(@code_values, split(' ', $decomposition{$code_value}));
107	} elsif (!exists($mark{$code_value}) ||	119	} elsif (!exists($mark{$code_value})) {
108	(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)) {
109	push(@decomposition, $code_value);	120	push(@decomposition, $code_value);
110	}	121	}
111	}	122	}
112	if(@decomposition) {	123	if(@decomposition) {
113	$decomposition{$from} = "@decomposition";	124	$decomposition{$from} = "@decomposition";