recoll / Code / Diff of /unac/builder.in

Diff of /unac/builder.in [913dff] .. [2b8107]

Switch to unified view


...
    #
    # Names of the form "<Some Range, First>" / "<Some Range, Last>" mark
    # the two ends of a range: remember the code value seen for each end,
    # keyed by the range name.
    if (my ($range_label, $range_end) =
            $character_name =~ /^<(.*), (First|Last)>/) {
        $ranges{$range_label}{$range_end} = $code_value;
    }

    # Test for exceptions to unaccenting. Note that this test is
    # mostly based on Unicode blocks, whereas it should really use
    # the Unicode script property. In practice, for the scripts
    # concerned, this does not currently look to be an issue.
    # (The following comment was written for Japanese, but it also
    # applies to the other exceptions.)
    # For Japanese kana characters, we don't want to strip
    # accents, as I'm told that they are essential and that
    # stripping them does not make sense.
    # Problem: the first solution used was to decompose the
    # accented Japanese kana and not remove the accents. But then
    # the unaccented character would match the string with the
    # accent. So now we don't decompose at all, but this means
    # that, if the original text was decomposed, things don't
    # work as intended: we should actually recombine the
    # letter+accents in this case for the data to be unified.

  # Decide whether this character belongs to one of the ranges for which
  # accents must be kept. The numeric value of the code point is computed
  # once and compared against each range.
  my $numeric_code_point = hex $code_value;
  my $keep_accents =
         # Hiragana + Katakana
         ($numeric_code_point >= 0x3040 && $numeric_code_point <= 0x30ff)
         # Halfwidth katakana
      || ($numeric_code_point >= 0xff65 && $numeric_code_point <= 0xff9f)
         # Hindi Devanagari
      || ($numeric_code_point >= 0x0900 && $numeric_code_point <= 0x097f)
         # Devanagari Extended
      || ($numeric_code_point >= 0xa8e0 && $numeric_code_point <= 0xa8ff);

  unless ($keep_accents) {
      # Record the decomposition when there is one. The optional leading
      # "<tag>" is skipped and only the mapping that follows it is stored.
      if ($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
          $decomposition{$code_value} = $2;
      }

      # Mark characters (general category M*) are remembered in %mark and
      # get a "0000" entry in the decomposition table, replacing any
      # decomposition stored just above. That entry tells the C code to
      # produce no output for the character. It is slightly hacky but
      # works: the original code let mark characters go through, so text
      # that was already in decomposed form stayed accented. Decomposed
      # text is rare but does occur, for example in Mac OS X file names,
      # which use separate combining accent characters.
      if ($general_category =~ /^M/) {
          $mark{$code_value} = 1;
          $decomposition{$code_value} = "0000";
      }
  }
    $name{$code_value} = $character_name;
    }
    close(FILE);
    

	a/unac/builder.in		b/unac/builder.in
	...		...
71	#	71	#
72	if($character_name =~ /^<(.*), (First|Last)>/) {	72	if($character_name =~ /^<(.*), (First|Last)>/) {
73	$ranges{$1}{$2} = $code_value;	73	$ranges{$1}{$2} = $code_value;
74	}	74	}
75		75
		76	# Test for exceptions to unaccenting. Note that this is
		77	# mostly based on blocks when it should use the Unicode
		78	# script property. In practise, for the script concerned,
		79	# this does not look to be an issue currently
		80	# (following comment made for japanese but also concerns
		81	# other exceptions)
76	# For kana japanese characters, we don't want to strip accents as I'm	82	# For kana japanese characters, we don't want to strip
77	# told that they are essential and stripping them does not	83	# accents as I'm told that they are essential and
78	# make sense. Wonder why Unicode does these decompositions	84	# stripping them does not make sense.
79	# then... Problem: the first solution used was to decompose	85	# Problem: the first solution used was to decompose the
80	# the japanese accented kana and not remove accents. But then	86	# Japanese accented kana and not remove accents. But then
81	# the unaccented character would match the string with	87	# the unaccented character would match the string with
82	# accent. So now we don't decompose at all, but this means	88	# accent. So now we don't decompose at all, but this means
83	# that, if the original text was decomposed, things don't work	89	# that, if the original text was decomposed, things don't
84	# as intended as we should actually recombine the	90	# work as intended as we should actually recombine the
85	# letter+accents in this case for data to be unified.	91	# letter+accents in this case for data to be unified.
		92
		93	# Hiragana + Katakana
		94	if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff)
		95	# Halfwidth katakana
		96	&& !(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f)
		97	# Hindi Devanagari
		98	&& !(hex $code_value >= 0x0900 && hex $code_value <= 0x097f)
		99	&& !(hex $code_value >= 0xa8e0 && hex $code_value <= 0xa8ff)
		100	) {
		101	# If a decomposition exists, record it
86	if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {	102	if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
87	# Not for Hiragana + Katakana
88	if (!(hex $code_value >= 0x3040 && hex $code_value <= 0x30ff) &&
89	# and Halfwidth katakana
90	!(hex $code_value >= 0xff65 && hex $code_value <= 0xff9f) ) {
91	$decomposition{$code_value} = $2;	103	$decomposition{$code_value} = $2;
92	}	104	}
93	}
94	if($general_category =~ /^M/) {	105	if($general_category =~ /^M/) {
95	$mark{$code_value} = 1;	106	$mark{$code_value} = 1;
96	# For mark caracters, we generate a 0 entry in the	107	# For mark caracters, we generate a 0 entry in the
97	# decomposition table. This signals to the c code that no	108	# decomposition table. This signals to the c code that no
98	# output should be generated. Slightly hacky but ok. The	109	# output should be generated. Slightly hacky but ok. The
99	# original code left mark character go through (generating	110	# original code left mark character go through (generating
100	# still accented output if the input was in decomposed	111	# still accented output if the input was in decomposed
101	# form). Decomposed text is rare, but, for example, macosx file	112	# form). Decomposed text is rare, but, for example, macosx file
102	# names have separate combining accent characters.	113	# names have separate combining accent characters.
103	$decomposition{$code_value} = "0000";	114	$decomposition{$code_value} = "0000";
		115	}
104	}	116	}
105	$name{$code_value} = $character_name;	117	$name{$code_value} = $character_name;
106	}	118	}
107	close(FILE);	119	close(FILE);
108		120