a b/unac/builder.in
1
#
2
# Copyright (C) 2000, 2001, 2002 Loic Dachary <loic@senga.org>
3
#
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation; either version 2 of the License, or
7
# (at your option) any later version.
8
#
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17
#
18
#
19
# Generate a table mapping UTF-16 characters to their unaccented
20
# equivalent. Some characters such as fi (one character) are expanded
21
# into two letters : f and i. In Unicode jargon it means that the table
22
# map each character to its compatibility decomposition in which marks
23
# were stripped.
24
#
25
# The format of the $base file can be found at:
26
# http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html
27
#
28
use strict;
29
30
use Getopt::Long;
31
32
sub main {
33
    my($base) = "UnicodeData-@UNICODE_VERSION@.txt";
34
    my($verbose);
35
    my($source);
36
    my($reference);
37
38
    GetOptions("verbose+" => \$verbose,
39
         "database=s" => \$base,
40
         "source!" => \$source,
41
               "reference!" => \$reference);
42
    
43
    my(%decomposition, %mark, %name);
44
    my(%ranges);
45
    open(FILE, "<$base") or die "cannot open $base for reading : $!";
46
    while(<FILE>) {
47
  next if(/^\s*#/);    # Skip comments
48
  my($code_value,
49
     $character_name,
50
     $general_category,
51
     $canonical_combining_classes,
52
     $bidirectional_category,
53
     $character_decomposition_mapping,
54
     $decimal_digit_value,
55
     $digit_value,
56
     $numeric_value,
57
     $mirrored,
58
     $unicode_1_0_name,
59
     $_10646_comment_field,
60
     $uppercase_mapping,
61
     $lowercase_mapping,
62
     $titlecase_mapping) = split(/;/, $_);
63
  #
64
  # Generate ranges of values that are not explicitly listed.
65
  # CJK ideographs for instance.
66
  #
67
  if($character_name =~ /^<(.*), (First|Last)>/) {
68
      $ranges{$1}{$2} = $code_value;
69
  }
70
  if($character_decomposition_mapping =~ /(<.*>)?\s*(.+)/) {
71
      $decomposition{$code_value} = $2;
72
  }
73
  if($general_category =~ /^M/) {
74
      $mark{$code_value} = 1;
75
  }
76
  $name{$code_value} = $character_name;
77
    }
78
    close(FILE);
79
    
80
    #
81
    # Generate compatibility decomposition and strip marks
82
    # (marks == diacritics == accents)
83
    #
84
    my($from, $to);
85
    while(($from, $to) = each(%decomposition)) {
86
  my(@code_values) = split(' ', $to);
87
  my($code_value);
88
  my(@decomposition);
89
  while(@code_values) {
90
      my($code_value) = shift(@code_values);
91
      if(exists($decomposition{$code_value})) {
92
      push(@code_values, split(' ', $decomposition{$code_value}));
93
      } elsif(!exists($mark{$code_value})) {
94
      push(@decomposition, $code_value);
95
      }
96
  }
97
  if(@decomposition) {
98
      $decomposition{$from} = "@decomposition";
99
  } else {
100
      delete($decomposition{$from});
101
  }
102
    }
103
104
    reference(\%decomposition, $verbose) if($reference);
105
    source(\%decomposition, \%name, $verbose) if($source);
106
}
107
108
#
109
# Generate machine readable file mapping all UTF-16 codes
110
# to their unaccented replacement. This file can be compared
111
# with the output of a program doing the same mapping using the
112
# libunac library.
113
#
114
sub reference {
115
    my($decomposition, $verbose) = @_;
116
117
    my($code_value);
118
    foreach $code_value (0 .. 0xFFFF) {
119
  $code_value = uc(sprintf("%04x", $code_value));
120
  print "$code_value";
121
  if(exists($decomposition->{$code_value})) {
122
      print " => $decomposition->{$code_value}\n";
123
  } else {
124
      print "\n";
125
  }
126
    }
127
}
128
129
#
130
# Read input file into hash table and return it.
131
#
132
# The input is divided in chuncks according to special markers. For
133
# instance:
134
#
135
# before
136
# /* Generated by builder. Do not modify. Start a_tag */
137
# bla bla
138
# /* Generated by builder. Do not modify. End a_tag */
139
# after
140
# /* Generated by builder. Do not modify. Start b_tag */
141
# more stuff
142
# /* Generated by builder. Do not modify. End b_tag */
143
# still something
144
#
145
# Will generate the following hash:
146
#
147
# {
148
#   'list' => [ 1, a_tag, 2, b_tag, 3 ],
149
#   '1' => "before\n",
150
#   'a_tag' => undef,
151
#   '2' => "after\n";
152
#   'b_tag' => undef,
153
#   '3' => "still something\n"
154
# }
155
#
156
# The caller may then assign a string to the a_tag and b_tag entries
157
# and then call the spit function to rebuild the file.
158
#
159
sub slurp {
160
    my($file) = @_;
161
    my(%content);
162
    my($count) = 1;
163
    my(@lines);
164
    open(FILE, "<$file") or die "cannot open $file for reading : $!";
165
    while(<FILE>) {
166
  if(/Do not modify. Start\s+(\w+)/i) {
167
      push(@{$content{'list'}}, $count);
168
      $content{$count} = join("", @lines); 
169
      $count++;
170
      push(@{$content{'list'}}, $1);
171
      @lines = ();
172
  }
173
  next if(/Do not modify. Start/i .. /Do not modify. End/i);
174
  push(@lines, $_);
175
    }
176
    if(@lines) {
177
  push(@{$content{'list'}}, $count);
178
  $content{$count} = join("", @lines); 
179
    }
180
    close(FILE);
181
    return \%content;
182
}
183
184
#
185
# Write the $file with the content of the $content hash table. 
186
# See the slurp function for a description of the $content format.
187
#
188
sub spit {
189
    my($file, $content) = @_;
190
    open(FILE, ">$file") or die "cannot open $file for writing : $!";
191
    my($tag);
192
    foreach $tag (@{$content->{'list'}}) {
193
  print(FILE "/* Generated by builder. Do not modify. Start $tag */\n") if($tag !~ /^\d+$/);
194
  print FILE $content->{$tag};
195
  print(FILE "/* Generated by builder. Do not modify. End $tag */\n") if($tag !~ /^\d+$/);
196
    }
197
    close(FILE);
198
}
199
200
#
201
# Generate tables, defines and code in the unac.c and unac.h files.
202
# The unac.c and unac.h files are substituted in place.
203
#
204
sub source {
205
    my($decomposition, $name, $verbose) = @_;
206
207
    my($csource) = slurp("unac.c");
208
    my($hsource) = slurp("unac.h");
209
    #
210
    # Human readable table
211
    #
212
    my(@comment);
213
    push(@comment, "/*\n");
214
    my($from);
215
    foreach $from (sort(keys(%$decomposition))) {
216
  my($character_name) = $name->{$from};
217
  $character_name = "??" if(!$character_name);
218
  push(@comment, " * $from $character_name\n");
219
  my($code_value);
220
  foreach $code_value (split(' ', $decomposition->{$from})) {
221
      $character_name = $name->{$code_value} || "??";
222
      push(@comment, " * \t$code_value $character_name\n");
223
  }
224
    }
225
    push(@comment, "*/\n");
226
    my($comment) = join("", @comment);
227
228
    #
229
    # Select the best block size (the one that takes less space)
230
    #
231
    # result: $best_blocks (array of blocks that contain exactly
232
    #                       $block_count replacements. Each block
233
    #                       is a string containing replacements 
234
    #                       separated by |)
235
    #         $best_indexes (array mapping block number to a block
236
    #                        in the $best_blocks array)
237
    #         $best_block_shift (the size of the block)
238
    # 
239
    # Within a block, if the character has no replacement the 0xFFFF 
240
    # placeholder is inserted.
241
    #
242
    my($best_blocks);
243
    my($best_indexes);
244
    my($best_block_shift);
245
    my($best_total_size) = 10 * 1024 * 1024;
246
    my($block_shift);
247
    foreach $block_shift (2 .. 10) {
248
  my($block_count) = 1 << $block_shift;
249
  my(@blocks, @indexes);
250
  my($duplicate) = 0;
251
  my(@values);
252
  my($code_value);
253
  foreach $code_value (0 .. 0x10000) {
254
      if($code_value > 0 && $code_value % $block_count == 0) {
255
      my($block) = join("|", @values);
256
      my($existing_block);
257
      my($index) = 0;
258
      my($found);
259
      foreach $existing_block (@blocks) {
260
          if($block eq $existing_block) {
261
          push(@indexes, $index);
262
          $found = 1;
263
          $duplicate++;
264
          last;
265
          }
266
          $index++;
267
      }
268
      if(!$found) {
269
          push(@indexes, $index);
270
          push(@blocks, $block);
271
      }
272
      @values = ();
273
      }
274
      $code_value = uc(sprintf("%04x", $code_value));
275
      if(exists($decomposition->{$code_value})) {
276
      push(@values, $decomposition->{$code_value});
277
      } else {
278
      push(@values, "FFFF");
279
      }
280
  }
281
  print STDERR scalar(@blocks) . " blocks of " . $block_count . " entries, factorized $duplicate blocks\n\t" if($verbose);
282
  my($block_size) = 0;
283
  my($block);
284
  foreach $block (@blocks) {
285
      my(@tmp) = split(/[| ]/, $block);
286
      $block_size += scalar(@tmp) * 2;
287
  }
288
  #
289
  # Pointer to the block array
290
  #
291
  $block_size += scalar(@blocks) * 4;
292
  #
293
  # Positions of the entries in the block
294
  #
295
  $block_size += $block_count * scalar(@blocks) * 2;
296
  print STDERR "total block size = $block_size, " if($verbose);
297
  my($index_size) = (1 << (16 - $block_shift)) * 2;
298
  print STDERR "index size = " . $index_size . "\n\t" if($verbose);
299
  my($total_size) = $block_size + $index_size;
300
  print STDERR "total size = $total_size\n" if($verbose);
301
302
  if($total_size < $best_total_size) {
303
      $best_total_size = $total_size;
304
      $best_blocks = \@blocks;
305
      $best_indexes = \@indexes;
306
      $best_block_shift = $block_shift;
307
  }
308
    }
309
310
    my($block_count) = scalar(@$best_blocks);
311
    my($block_size) = 1 << $best_block_shift;
312
313
    #
314
    # Constants that depend on the block size.
315
    # result : $defines
316
    #
317
    my($defines) = <<EOF;
318
#define UNAC_BLOCK_SHIFT $best_block_shift
319
#define UNAC_BLOCK_MASK ((1 << UNAC_BLOCK_SHIFT) - 1)
320
#define UNAC_BLOCK_SIZE (1 << UNAC_BLOCK_SHIFT)
321
#define UNAC_BLOCK_COUNT $block_count
322
#define UNAC_INDEXES_SIZE (0x10000 >> UNAC_BLOCK_SHIFT)
323
EOF
324
    #
325
    # Mapping block number to index in data_table or position table.
326
    # result : $index_out
327
    #
328
    my($count) = 0;
329
    my($index);
330
    my($index_out) = "unsigned short unac_indexes[UNAC_INDEXES_SIZE] = {\n";
331
    foreach $index (@$best_indexes) {
332
  $count++;
333
  $index_out .= sprintf("%4s,", $index);
334
  if($count % 15 == 0) {
335
      $index_out .= "\n";
336
  }
337
    }
338
    $index_out =~ s/,\s*\Z/\n/s;
339
    $index_out .= "};\n";
340
341
    #
342
    # Generate the position table (map character number in block to
343
    # position in the data string), the data_table that maps a block
344
    # to a unsigned short array that contains the character (aka the
345
    # data array) and the data arrays themselves that is a pure concatenation
346
    # of all the unsigned short in a block. 
347
    # result : $position_out, $data_table_out, $data_out
348
    #
349
    my(@positions_out);
350
    my($highest_position) = 0;
351
    my(@data_table_out);
352
    my($data_table_out) = "unsigned short* unac_data_table[UNAC_BLOCK_COUNT] = {\n";
353
    my(@data_out);
354
    my($block_number) = 0;
355
    my($block);
356
    foreach $block (@$best_blocks) {
357
  my(@index);
358
  my($position) = 0;
359
  my($entry);
360
  my(@data);
361
  foreach $entry (split('\|', $block)) {
362
      push(@index, $position);
363
      my(@tmp) = split(' ', $entry);
364
      push(@data, @tmp);
365
      $position += scalar(@tmp);
366
  }
367
  push(@index, $position);
368
  $highest_position = $position if($position > $highest_position);
369
  push(@positions_out, "/* $block_number */ { " . join(", ", @index) . " }");
370
  push(@data_table_out, "unac_data$block_number");
371
  push(@data_out, "unsigned short unac_data$block_number" . "[] = { 0x" . join(", 0x", @data) . " };\n");
372
  $block_number++;
373
    }
374
    my($position_type) = $highest_position >= 256 ? "short" : "char";
375
    my($positions_out) = "unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][UNAC_BLOCK_SIZE + 1] = {\n";
376
377
    $positions_out .= join(",\n", @positions_out);
378
    $positions_out .= "\n};\n";
379
    my($data_out) = join("", @data_out);
380
    $data_table_out .= join(",\n", @data_table_out);
381
    $data_table_out .= "\n};\n";
382
383
    #
384
    # Tables declarations
385
    # result : $declarations
386
    #
387
    my($declarations);
388
    $declarations = <<EOF;
389
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
390
extern unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][UNAC_BLOCK_SIZE + 1];
391
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
392
EOF
393
    for($block_number = 0; $block_number < $block_count; $block_number++) {
394
  $declarations .= "extern unsigned short unac_data$block_number" . "[];\n";
395
    }
396
397
    $csource->{'tables'} = "$comment\n$index_out\n$positions_out\n$data_out\n$data_table_out";
398
    $hsource->{'defines'} = $defines;
399
    $hsource->{'declarations'} = $declarations;
400
401
    spit("unac.c", $csource);
402
    spit("unac.h", $hsource);
403
}
404
405
main();