Switch to unified view

a/unac/builder.in b/unac/builder.in
...
...
29
29
30
use Getopt::Long;
30
use Getopt::Long;
31
31
32
sub main {
32
sub main {
33
    my($base) = "UnicodeData-@UNICODE_VERSION@.txt";
33
    my($base) = "UnicodeData-@UNICODE_VERSION@.txt";
34
    my($cfbase) = "CaseFolding-@UNICODE_VERSION@.txt";
34
    my($verbose);
35
    my($verbose);
35
    my($source);
36
    my($source);
36
    my($reference);
37
    my($reference);
37
38
38
    GetOptions("verbose+" => \$verbose,
39
    GetOptions("verbose+" => \$verbose,
39
           "database=s" => \$base,
40
           "database=s" => \$base,
40
           "source!" => \$source,
41
           "source!" => \$source,
41
               "reference!" => \$reference);
42
               "reference!" => \$reference);
42
    
43
43
    my(%decomposition, %mark, %name);
44
    my(%decomposition, %mark, %name);
44
    my(%ranges);
45
    my(%ranges);
45
    open(FILE, "<$base") or die "cannot open $base for reading : $!";
46
    open(FILE, "<$base") or die "cannot open $base for reading : $!";
46
    while(<FILE>) {
47
    while(<FILE>) {
47
    next if(/^\s*#/);    # Skip comments
48
    next if(/^\s*#/);    # Skip comments
...
...
99
    } else {
100
    } else {
100
        delete($decomposition{$from});
101
        delete($decomposition{$from});
101
    }
102
    }
102
    }
103
    }
103
104
105
    # Read in the casefolding file
106
    my(%casefold);
107
    open(FILE, "<$cfbase") or die "cannot open $cfbase for reading : $!";
108
    while(<FILE>) {
109
  next if(/^\s*#/);    # Skip comments
110
  my($code_value,
111
     $foldstatus,
112
     $folded) = split(/;/, $_);
113
  if ($foldstatus =~ /C|F/) {
114
      $casefold{$code_value} = $folded;
115
  }
116
    }
117
    close(FILE);
118
119
    #showcasefold(\%casefold);
104
    reference(\%decomposition, $verbose) if($reference);
120
    reference(\%decomposition, $verbose) if($reference);
105
    source(\%decomposition, \%name, $verbose) if($source);
121
    source(\%decomposition, \%name, \%casefold, $verbose) if($source);
122
}
123
124
sub showcasefold {
125
    my($casefold) = @_;
126
127
    my($code_value);
128
    foreach $code_value (0 .. 0xFFFF) {
129
  $code_value = uc(sprintf("%04x", $code_value));
130
  print "$code_value";
131
  if(exists($casefold->{$code_value})) {
132
      print " => $casefold->{$code_value}\n";
133
  } else {
134
      print "\n";
135
  }
136
    }
106
}
137
}
107
138
108
#
139
#
109
# Generate machine readable file mapping all UTF-16 codes
140
# Generate machine readable file mapping all UTF-16 codes
110
# to their unaccented replacement. This file can be compared
141
# to their unaccented replacement. This file can be compared
...
...
200
#
231
#
201
# Generate tables, defines and code in the unac.c and unac.h files.
232
# Generate tables, defines and code in the unac.c and unac.h files.
202
# The unac.c and unac.h files are substituted in place.
233
# The unac.c and unac.h files are substituted in place.
203
#
234
#
204
sub source {
235
sub source {
205
    my($decomposition, $name, $verbose) = @_;
236
    my($decomposition, $name, $casefold, $verbose) = @_;
206
237
207
    my($csource) = slurp("unac.c");
238
    my($csource) = slurp("unac.c");
208
    my($hsource) = slurp("unac.h");
239
    my($hsource) = slurp("unac.h");
209
    #
240
    #
210
    # Human readable table
241
    # Human readable table
...
...
275
        if(exists($decomposition->{$code_value})) {
306
        if(exists($decomposition->{$code_value})) {
276
        push(@values, $decomposition->{$code_value});
307
        push(@values, $decomposition->{$code_value});
277
        } else {
308
        } else {
278
        push(@values, "FFFF");
309
        push(@values, "FFFF");
279
        }
310
        }
311
      # We also push the case-folded version of the unaccented char
312
      # Note that by pushing the case-folded version of the original
313
      # char, we'd have the possibility of independent unaccenting and
314
      # case folding, but at some cost in performance.
315
      # We could also keep the three chunks, using a little more memory
316
      if(exists($decomposition->{$code_value})) {
317
        my($cv);
318
        my(@vl);
319
        foreach $cv (split(' ', $decomposition->{$code_value})) {
320
      if(exists($casefold->{$cv})) {
321
        push(@vl, $casefold->{$cv});
322
      } else {
323
        push(@vl, $cv);
324
      }
325
        }
326
        #print STDERR "Pushing " . join(" ", @vl) . " for " . 
327
        #$code_value . "\n";
328
        push(@values, join(" ", @vl));
329
      } else {
330
        if(exists($casefold->{$code_value})) {
331
      push(@values, $casefold->{$code_value});
332
        } else {
333
      push(@values, "FFFF");
334
        }
335
      }
280
    }
336
    }
281
    print STDERR scalar(@blocks) . " blocks of " . $block_count . " entries, factorized $duplicate blocks\n\t" if($verbose);
337
    print STDERR scalar(@blocks) . " blocks of " . $block_count . " entries, factorized $duplicate blocks\n\t" if($verbose);
282
    my($block_size) = 0;
338
    my($block_size) = 0;
283
    my($block);
339
    my($block);
284
    foreach $block (@blocks) {
340
    foreach $block (@blocks) {
...
...
370
    push(@data_table_out, "unac_data$block_number");
426
    push(@data_table_out, "unac_data$block_number");
371
    push(@data_out, "unsigned short unac_data$block_number" . "[] = { 0x" . join(", 0x", @data) . " };\n");
427
    push(@data_out, "unsigned short unac_data$block_number" . "[] = { 0x" . join(", 0x", @data) . " };\n");
372
    $block_number++;
428
    $block_number++;
373
    }
429
    }
374
    my($position_type) = $highest_position >= 256 ? "short" : "char";
430
    my($position_type) = $highest_position >= 256 ? "short" : "char";
375
    my($positions_out) = "unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][UNAC_BLOCK_SIZE + 1] = {\n";
431
    my($positions_out) = "unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][2*UNAC_BLOCK_SIZE + 1] = {\n";
376
432
377
    $positions_out .= join(",\n", @positions_out);
433
    $positions_out .= join(",\n", @positions_out);
378
    $positions_out .= "\n};\n";
434
    $positions_out .= "\n};\n";
379
    my($data_out) = join("", @data_out);
435
    my($data_out) = join("", @data_out);
380
    $data_table_out .= join(",\n", @data_table_out);
436
    $data_table_out .= join(",\n", @data_table_out);
...
...
385
    # result : $declarations
441
    # result : $declarations
386
    #
442
    #
387
    my($declarations);
443
    my($declarations);
388
    $declarations = <<EOF;
444
    $declarations = <<EOF;
389
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
445
extern unsigned short unac_indexes[UNAC_INDEXES_SIZE];
390
extern unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][UNAC_BLOCK_SIZE + 1];
446
extern unsigned $position_type unac_positions[UNAC_BLOCK_COUNT][2*UNAC_BLOCK_SIZE + 1];
391
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
447
extern unsigned short* unac_data_table[UNAC_BLOCK_COUNT];
392
EOF
448
EOF
393
    for($block_number = 0; $block_number < $block_count; $block_number++) {
449
    for($block_number = 0; $block_number < $block_count; $block_number++) {
394
    $declarations .= "extern unsigned short unac_data$block_number" . "[];\n";
450
    $declarations .= "extern unsigned short unac_data$block_number" . "[];\n";
395
    }
451
    }