recoll / Code / Diff of /unac/builder.in

Diff of /unac/builder.in [000000] .. [b396d2]

Switch to side-by-side view

--- a
+++ b/unac/builder.in
@@ -0,0 +1,405 @@
+#
+# Copyright (C) 2000, 2001, 2002 Loic Dachary <loic@senga.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+#
+# Generate a table mapping UTF-16 characters to their unaccented
+# equivalent. Some characters such as fi (one character) are expanded
+# into two letters : f and i. In Unicode jargon it means that the table
+# map each character to its compatibility decomposition in which marks
+# were stripped.
+#
+# The format of the $base file can be found at:
+# http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html
+#
+use strict;
+
+use Getopt::Long;
+
+sub main {
+    my($base) = "UnicodeData-@UNICODE_VERSION@.txt";
+    my($verbose);
+    my($source);
+    my($reference);
+
+    GetOptions("verbose+" => \$verbose,
+	       "database=s" => \$base,
+	       "source!" => \$source,
+               "reference!" => \$reference);
+    
+    my(%decomposition, %mark, %name);
+    my(%ranges);
+    open(FILE, "<$base") or die "cannot open $base for reading : $!";
+    while(<FILE>) {
+	next if(/^\s*#/);    # Skip comments
+	my($code_value,
+	   $character_name,
+	   $general_category,
+	   $canonical_combining_classes,
+	   $bidirectional_category,
+	   $character_decomposition_mapping,
+	   $decimal_digit_value,
+	   $digit_value,
+	   $numeric_value,
+	   $mirrored,
+	   $unicode_1_0_name,
+	   $_10646_comment_field,
+	   $uppercase_mapping,
+	   $lowercase_mapping,
+	   $titlecase_mapping) = split(/;/, $_);
+	#
+	# Generate ranges of values that are not explicitly listed.
+	# CJK ideographs for instance.
+	#
+	if($character_name =~ /^<(.*), (First|Last)>/) {
+	    $ranges{$1}{$2} = $code_value;
+	}
+	if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
+	    $decomposition{$code_value} = $2;
+	}
+	if($general_category =~ /^M/) {
+	    $mark{$code_value} = 1;
+	}
+	$name{$code_value} = $character_name;
+    }
+    close(FILE);
+    
+    #
+    # Generate compatibility decomposition and strip marks
+    # (marks == diacritics == accents)
+    #
+    my($from, $to);
+    while(($from, $to) = each(%decomposition)) {
+	my(@code_values) = split(' ', $to);
+	my($code_value);
+	my(@decomposition);
+	while(@code_values) {
+	    my($code_value) = shift(@code_values);
+	    if(exists($decomposition{$code_value})) {
+		push(@code_values, split(' ', $decomposition{$code_value}));
+	    } elsif(!exists($mark{$code_value})) {
+		push(@decomposition, $code_value);
+	    }
+	}
+	if(@decomposition) {
+	    $decomposition{$from} = "@decomposition";
+	} else {
+	    delete($decomposition{$from});
+	}
+    }
+
+    reference(\%decomposition, $verbose) if($reference);
+    source(\%decomposition, \%name, $verbose) if($source);
+}
+
+#
+# Generate machine readable file mapping all UTF-16 codes
+# to their unaccented replacement. This file can be compared
+# with the output of a program doing the same mapping using the
+# libunac library.
+#
+sub reference {
+    my($decomposition, $verbose) = @_;
+
+    my($code_value);
+    foreach $code_value (0 .. 0xFFFF) {
+	$code_value = uc(sprintf("%04x", $code_value));
+	print "$code_value";
+	if(exists($decomposition->{$code_value})) {
+	    print " => $decomposition->{$code_value}\n";
+	} else {
+	    print "\n";
+	}
+    }
+}
+
+#
+# Read input file into hash table and return it.
+#
+# The input is divided in chuncks according to special markers. For
+# instance:
+#
+# before
+# /* Generated by builder. Do not modify. Start a_tag */
+# bla bla
+# /* Generated by builder. Do not modify. End a_tag */
+# after
+# /* Generated by builder. Do not modify. Start b_tag */
+# more stuff
+# /* Generated by builder. Do not modify. End b_tag */
+# still something
+#
+# Will generate the following hash:
+#
+# {
+#   'list' => [ 1, a_tag, 2, b_tag, 3 ],
+#   '1' => "before\n",
+#   'a_tag' => undef,
+#   '2' => "after\n";
+#   'b_tag' => undef,
+#   '3' => "still something\n"
+# }
+#
+# The caller may then assign a string to the a_tag and b_tag entries
+# and then call the spit function to rebuild the file.
+#
+sub slurp {
+    my($file) = @_;
+    my(%content);
+    my($count) = 1;
+    my(@lines);
+    open(FILE, "<$file") or die "cannot open $file for reading : $!";
+    while(<FILE>) {
+	if(/Do not modify. Start\s+(\w+)/i) {
+	    push(@{$content{'list'}}, $count);
+	    $content{$count} = join("", @lines); 
+	    $count++;
+	    push(@{$content{'list'}}, $1);
+	    @lines = ();
+	}
+	next if(/Do not modify. Start/i .. /Do not modify. End/i);
+	push(@lines, $_);
+    }
+    if(@lines) {
+	push(@{$content{'list'}}, $count);
+	$content{$count} = join("", @lines); 
+    }
+    close(FILE);
+    return \%content;
+}
+
+#
+# Write the $file with the content of the $content hash table. 
+# See the slurp function for a description of the $content format.
+#
+sub spit {
+    my($file, $content) = @_;
+    open(FILE, ">$file") or die "cannot open $file for writing : $!";
+    my($tag);
+    foreach $tag (@{$content->{'list'}}) {
+	print(FILE "/* Generated by builder. Do not modify. Start $tag */\n") if($tag !~ /^\d+$/);
+	print FILE $content->{$tag};
+	print(FILE "/* Generated by builder. Do not modify. End $tag */\n") if($tag !~ /^\d+$/);
+    }
+    close(FILE);
+}
+
+#
+# Generate tables, defines and code in the unac.c and unac.h files.
+# The unac.c and unac.h files are substituted in place.
+#
+sub source {
+    my($decomposition, $name, $verbose) = @_;
+
+    my($csource) = slurp("unac.c");
+    my($hsource) = slurp("unac.h");
+    #
+    # Human readable table
+    #
+    my(@comment);
+    push(@comment, "/*\n");
+    my($from);
+    foreach $from (sort(keys(%$decomposition))) {
+	my($character_name) = $name->{$from};
+	$character_name = "??" if(!$character_name);
+	push(@comment, " * $from $character_name\n");
+	my($code_value);
+	foreach $code_value (split(' ', $decomposition->{$from})) {
+	    $character_name = $name->{$code_value} || "??";
+	    push(@comment, " * \t$code_value $character_name\n");
+	}
+    }
+    push(@comment, "*/\n");
+    my($comment) = join("", @comment);
+
+    #
+    # Select the best block size (the one that takes less space)
+    #
+    # result: $best_blocks (array of blocks that contain exactly
+    #                       $block_count replacements. Each block
+    #                       is a string containing replacements 
+    #                       separated by |)
+    #         $best_indexes (array mapping block number to a block
+    #                        in the $best_blocks array)
+    #         $best_block_shift (the size of the block)
+    # 
+    # Within a block, if the character has no replacement the 0xFFFF 
+    # placeholder is inserted.
+    #
+    my($best_blocks);
+    my($best_indexes);
+    my($best_block_shift);
+    my($best_total_size) = 10 * 1024 * 1024;
+    my($block_shift);
+    foreach $block_shift (2 .. 10) {
+	my($block_count) = 1 << $block_shift;
+	my(@blocks, @indexes);
+	my($duplicate) = 0;
+	my(@values);
+	my($code_value);
+	foreach $code_value (0 .. 0x10000) {
+	    if($code_value > 0 && $code_value % $block_count == 0) {
+		my($block) = join("|", @values);
+		my($existing_block);
+		my($index) = 0;
+		my($found);
+		foreach $existing_block (@blocks) {
+		    if($block eq $existing_block) {
+			push(@indexes, $index);
+			$found = 1;
+			$duplicate++;
+			last;
+		    }
+		    $index++;
+		}
+		if(!$found) {
+		    push(@indexes, $index);
+		    push(@blocks, $block);
+		}
+		@values = ();
+	    }
+	    $code_value = uc(sprintf("%04x", $code_value));
+	    if(exists($decomposition->{$code_value})) {
+		push(@values, $decomposition->{$code_value});
+	    } else {
+		push(@values, "FFFF");
+	    }
+	}
+	print STDERR scalar(@blocks) . " blocks of " . $block_count . " entries, factorized $duplicate blocks\n\t" if($verbose);
+	my($block_size) = 0;
+	my($block);
+	foreach $block (@blocks) {
+	    my(@tmp) = split(/[| ]/, $block);
+	    $block_size += scalar(@tmp) * 2;
+	}
+	#
+	# Pointer to the block array
+	#
+	$block_size += scalar(@blocks) * 4;
+	#
+	# Positions of the entries in the block
+	#
+	$block_size += $block_count * scalar(@blocks) * 2;
+	print STDERR "total block size = $block_size, " if($verbose);
+	my($index_size) = (1 << (16 - $block_shift)) * 2;
+	print STDERR "index size = " . $index_size . "\n\t" if($verbose);
+	my($total_size) = $block_size + $index_size;
+	print STDERR "total size = $total_size\n" if($verbose);
+
+	if($total_size < $best_total_size) {
+	    $best_total_size = $total_size;
+	    $best_blocks = \@blocks;
+	    $best_indexes = \@indexes;
+	    $best_block_shift = $block_shift;
+	}
+    }
+
+    my($block_count) = scalar(@$best_blocks);
+    my($block_size) = 1 << $best_block_shift;
+
+    #
+    # Constants that depend on the block size.
+    # result : $defines
+    #
+    my($defines) = <<EOF;
+#define UNAC_BLOCK_SHIFT $best_block_shift
+#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
+#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
+#define UNAC_BLOCK_COUNT $block_count
+#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
+EOF
+    #
+    # Mapping block number to index in data_table or position table.
+    # result : $index_out
+    #
+    my($count) = 0;
+    my($index);
+    my($index_out) = "unsigned short unac_indexes[UNAC_INDEXES_SIZE] = {\n";
+    foreach $index (@$best_indexes) {
+	$count++;
+	$index_out .= sprintf("%4s,", $index);
+	if($count % 15 == 0) {
+	    $index_out .= "\n";
+	}
+    }
+    $index_out =~ s/,\s*\Z/\n/s;
+    $index_out .= "};\n";
+
+    #
+    # Generate the position table (map character number in block to
+    # position in the data string), the data_table that maps a block
+    # to a unsigned short array that contains the character (aka the
+    # data array) and the data arrays themselves that is a pure concatenation
+    # of all the unsigned short in a block. 
+    # result : $position_out, $data_table_out, $data_out
+    #
+    my(@positions_out);
+    my($highest_position) = 0;
+    my(@data_table_out);
+    my($data_table_out) = "unsigned short* unac_data_table[UNAC_BLOCK_COUNT] = {\n";
+    my(@data_out);
+    my($block_number) = 0;
+    my($block);
+    foreach $block (@$best_blocks) {
+	my(@index);
+	my($position) = 0;
+	my($entry);
+	my(@data);
+	foreach $entry (split('\|', $block)) {
+	    push(@index, $position);
+	    my(@tmp) = split(' ', $entry);
+	    push(@data, @tmp);
+	    $position += scalar(@tmp);
+	}
+	push(@index, $position);
+	$highest_position = $position if($position > $highest_position);
+	push(@positions_out, "/* $block_number */ { " . join(", ", @index) . " }");
+	push(@data_table_out, "unac_data$block_number");
+	push(@data_out, "unsigned short unac_data$block_number" . "[] = { 0x" . join(", 0x", @data) . " };\n");
+	$block_number++;
+    }
+    my($position_type) = $highest_position >= 256 ? "short" : "char";
+    my($positions_out) = "unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][UNAC_BLOCK_SIZE + 1] = {\n";
+
+    $positions_out .= join(",\n", @positions_out);
+    $positions_out .= "\n};\n";
+    my($data_out) = join("", @data_out);
+    $data_table_out .= join(",\n", @data_table_out);
+    $data_table_out .= "\n};\n";
+
+    #
+    # Tables declarations
+    # result : $declarations
+    #
+    my($declarations);
+    $declarations = <<EOF;
+extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
+extern unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][UNAC_BLOCK_SIZE + 1];
+extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
+EOF
+    for($block_number = 0; $block_number < $block_count; $block_number++) {
+	$declarations .= "extern unsigned short unac_data$block_number" . "[];\n";
+    }
+
+    $csource->{'tables'} = "$comment\n$index_out\n$positions_out\n$data_out\n$data_table_out";
+    $hsource->{'defines'} = $defines;
+    $hsource->{'declarations'} = $declarations;
+
+    spit("unac.c", $csource);
+    spit("unac.h", $hsource);
+}
+
+main();