libjava/scripts/unicode-to-chartables.pl

   1 #!/usr/bin/perl -w
   2 # unicode-to-chartables.pl -- generate Unicode database for java.lang.Character
   3 # Copyright (C) 1998, 2002, 2004, 2006  Free Software Foundation, Inc.
   4 #
   5 # This file is part of GNU Classpath.
   6 #
   7 # GNU Classpath is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2, or (at your option)
  10 # any later version.
  11 #
  12 # GNU Classpath is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GNU Classpath; see the file COPYING.  If not, write to the
  19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  20 # 02110-1301 USA.
  21 #
  22 # Linking this library statically or dynamically with other modules is
  23 # making a combined work based on this library.  Thus, the terms and
  24 # conditions of the GNU General Public License cover the whole
  25 # combination.
  26 #
  27 # As a special exception, the copyright holders of this library give you
  28 # permission to link this library with independent modules to produce an
  29 # executable, regardless of the license terms of these independent
  30 # modules, and to copy and distribute the resulting executable under
  31 # terms of your choice, provided that you also meet, for each linked
  32 # independent module, the terms and conditions of the license of that
  33 # module.  An independent module is a module which is not derived from
  34 # or based on this library.  If you modify this library, you may extend
  35 # this exception to your version of the library, but you are not
  36 # obligated to do so.  If you do not wish to do so, delete this
  37 # exception statement from your version.
  38
  39 # Code for reading UnicodeData-4.0.0.txt and SpecialCasing-4.0.0.txt to generate
  40 # the code for java-chartables.h. The relevant files can be found here:
  41 #
  42 #   http://www.unicode.org/Public/4.0-Update/UnicodeData-4.0.0.txt
  43 #   http://www.unicode.org/Public/4.0-Update/SpecialCasing-4.0.0.txt
  44 #
  45 # Inspired by code from Jochen Hoenicke.
  46 # author Eric Blake <ebb9@email.byu.edu>
  47 # Unicode 4.0.0 support by Anthony Balkissoon <abalkiss@redhat.com>
  48 #
  49 # Usage: ./unicode-to-chartables.pl <UnicodeData> <SpecialCasing> <tables>
  50 #   where <UnicodeData> is obtained from www.unicode.org (named
  51 #   UnicodeData-4.0.0.txt for Unicode version 4.0.0), <SpecialCasing>
  52 #   is obtained from www.unicode.org too (named SpecialCasing-4.0.0.txt for
  53 #   Unicode version 4.0.0), and <tables> is the final location for the header
  54 #   file java-chartables.h. As of JDK 1.5, use Unicode version 4.0.0
  55 #   for best results.
  56
  57
  58 ##
  59 ## Return the low 16 bits of the given value interpreted as a signed 16 bit
  60 ## number (65535 => -1), whatever the byte order of the machine running this.
  61 sub cShort($) {
  62     my ($char) = @_;
  63     return unpack "s", pack "S", $char & 0xffff;
  64 }
  65
  66 ##
  67 ## Convert the text UnicodeData file from www.unicode.org into a header file
  68 ## interface with arrays holding the compressed information.
  69 ##
  70 my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
  71                    SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
  72 my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
  73 # A general category or bidi class is encoded as its index in the lists above.
  74 my $NOBREAK_FLAG  = 32;   # set in the type of characters with a noBreak decomposition
  75 my $MIRRORED_FLAG = 64;   # set in the type of mirrored characters
  76
  77 my %special = ();         # code point => reference to its multi-char uppercase
  78
  79 # infoArray holds one array reference per Unicode plane (17 in total).  Each
  80 # array stores the packed character information of the characters in that
  81 # plane and is indexed by the offset of the character within the plane.  Most
  82 # of these arrays will currently be empty; the slots exist so that this script
  83 # can be easily modified to accommodate future versions of Unicode.  Note that
  84 # \((), (), ...) cannot be used here: empty lists flatten away to no refs.
  85 my @infoArray = map { [] } 0 .. 0x10;
  86
  87 # info is a reference to one of the arrays in infoArray, depending on which
  88 # plane we're currently parsing.
  89 my $info;
  90
  91 # largeNums is an array of numerical values that are too large to fit
  92 # into the 16 bit char where most numerical values are stored.
  93 # What is stored in the char then is a number N such that (-N - 3) is
  94 # the index into largeNums where the numerical value can be found.
  95 my @largeNums = ();
  96
  97 my $titlecase = "";   # packed (character, titlecase) pairs of 16 bit values
  98 my $count = 0;        # progress counter, a dot is printed every so often
  99 my $range = 0;        # first character of a "First>" range being parsed, or 0
 100
 101 die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <java-chartables.h>"
 102     unless @ARGV == 3;
 103 $| = 1;
 104 print "GNU Classpath Unicode Attribute Database Generator 2.1\n";
 105 print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
 106
 107
 108 ################################################################################
 109 ################################################################################
 110 # Stage 0: Parse the special casing file
 111 print "Parsing special casing file\n";
 112 open (SPECIAL, "<", $ARGV[1]) || die "Can't open special casing file: $!\n";
 113 while (<SPECIAL>) {
 114     next if /^\#/;
 115     my ($ch, undef, undef, $upper) = split / *; */;
 116     # Only the first field (the character) and the fourth (its uppercase) are used.
 117     # This grabs only the special casing for multi-char uppercase. Note that
 118     # there are no multi-char lowercase, and that Sun ignores multi-char
 119     # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
 120     # which must be hardcoded in java.lang.String:
 121     #  \u03a3 (Sun ignores this special case)
 122     #  \u0049 - lowercases to \u0131, but only in Turkish locale
 123     #  \u0069 - uppercases to \u0130, but only in Turkish locale
 124     next unless defined $upper and $upper =~ / /;
 125     $special{hex $ch} = [map {hex} split ' ', $upper];
 126 }
 127
 128 close SPECIAL;
 129
 130
 131 ################################################################################
 132 ################################################################################
 133 ## Stage 1: Parse the attribute file
 134 print "Parsing attributes file";
 135 open (UNICODE, "<", $ARGV[0]) || die "Can't open Unicode attribute file: $!\n";
 136 while (<UNICODE>) {
 137     print "." unless $count++ % 1000;
 138     chomp;
 139     s/\r//g;
 140     my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
 141         $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';', $_, -1;
 142     $ch = hex($ch);
 143     # (The limit of -1 keeps empty trailing fields, so the case fields are defined.)
 144     # plane tells us which Unicode code plane we're currently in and is an
 145     # index into infoArray.
 146     my $plane = int($ch / 0x10000);
 147     my $planeBase = $plane * 0x10000;
 148     $info = \@{$infoArray[$plane]};
 149
 150     my ($type, $numValue, $upperchar, $lowerchar, $direction);
 151
 152     $type = 0;
 153     while ($category !~ /^$TYPECODES[$type]$/) {
 154         if (++$type == @TYPECODES) {
 155             die "$ch: Unknown type: $category";
 156         }
 157     }
 158     $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
 159     $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
 160
 161     if ($numeric =~ /^[0-9]+$/) {
 162         $numValue = $numeric;
 163         # If numeric takes more than 16 bits to store we want to store that
 164         # number in a separate array and store a number N in numValue such
 165         # that (-N - 3) is the offset into the separate array containing the
 166         # large numerical value.
 167         if ($numValue >= 0x7fff) {
 168             $numValue = -3 - @largeNums;
 169             push @largeNums, $numeric;
 170         }
 171     } elsif ($numeric eq "") {
 172         # Special case sequences of 'a'-'z'
 173         if ($ch >= 0x0041 && $ch <= 0x005a) {
 174             $numValue = $ch - 0x0037;
 175         } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
 176             $numValue = $ch - 0x0057;
 177         } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
 178             $numValue = $ch - 0xff17;
 179         } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
 180             $numValue = $ch - 0xff37;
 181         } else {
 182             $numValue = -1;
 183         }
 184     } else {
 185         $numValue = -2;
 186     }
 187     # The case mappings are stored as the distance from the character itself.
 188     $upperchar = $upcase ? hex($upcase) - $ch : 0;
 189     $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
 190     if ($title ne $upcase) {
 191         my $titlechar = $title ? hex($title) : $ch;
 192         $titlecase .= pack("n2", $ch, $titlechar);
 193     }
 194
 195     $direction = 0;
 196     while ($bidir !~ /^$DIRCODES[$direction]$/) {
 197         if (++$direction == @DIRCODES) {
 198             $direction = -1;
 199             last;
 200         }
 201     }
 202     $direction <<= 2;   # the two lowest bits are left for the special casing data
 203     $direction += $#{$special{$ch}} if defined $special{$ch};
 204
 205     if ($range) {
 206         die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
 207         for ($range + 1 .. $ch - 1) {
 208             $info->[$_ - $planeBase] = pack("n5", $type, $numValue, $upperchar,
 209                              $lowerchar, $direction);
 210         }
 211         $range = 0;
 212     } elsif ($name =~ /First>$/) {
 213         $range = $ch;
 214     }
 215     # Store all this parsed information into the element in infoArray that info
 216     # points to.
 217     $info->[$ch - $planeBase] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
 218                       $direction);
 219 }
 220 close UNICODE;
 221
 222
 223 ################################################################################
 224 ################################################################################
 225 ## Stage 2: Compress the data structures
 226 print "\nCompressing data structures";
 227 $count = 0;
 228
 229 # data holds one string per plane.  Every character of the plane contributes
 230 # a 16 bit value made of its type and its offset into the attribute tables.
 231 my @data = ();
 232
 233 # charhashArray is an array of hashtables used so that we can reuse character
 234 # attributes when characters share the same attributes ... this makes our
 235 # attribute tables smaller.  charhash is a pointer into this array.
 236 my @charhashArray = ({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {});
 237 my $charhash;
 238
 239 # charinfoArray is an array of arrays, one per plane, for storing character
 240 # information.  charinfo is a pointer into this array.  (An explicit list of
 241 # array references is needed; \((), (), ...) would produce no elements.)
 242 my @charinfoArray = map { [] } 0 .. 0x10;
 243 my $charinfo;
 244
 245 # charlen is an array, one element per plane, that tells us how many unique
 246 # character attributes there are for that plane.
 247 my @charlen = ();
 248
 249 for my $plane (0 .. 0x10) {
 250     $info = \@{$infoArray[$plane]};
 251     my $planeBase = $plane * 0x10000;
 252     $charhash = \%{$charhashArray[$plane]};
 253     $charinfo = \@{$charinfoArray[$plane]};
 254
 255     for my $ch ($planeBase .. $planeBase + 0xffff) {
 256         my $index = $ch - $planeBase;
 257         print "." unless $count++ % 0x1000;
 258         $info->[$index] = pack("n5", 0, -1, 0, 0, -4) unless defined $info->[$index];
 259
 260         my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info->[$index]);
 261         if (! exists $charhash->{$info->[$index]}) {
 262             # If we entered this branch that means the character we're looking at
 263             # now has attributes that are unique from those that we've looked
 264             # at so far for this plane.  So we push its attributes into charinfo
 265             # and store in charhash the offset into charinfo where these
 266             # attributes can later be found.
 267             push @{$charinfo}, [ $numVal, $upper, $lower, $direction ];
 268             $charhash->{$info->[$index]} = @{$charinfo} - 1;
 269             # This offset becomes the upper 9 bits of the entry in the data
 270             # table, so there is only room for 512 entries per plane.
 271             die "Too many unique character attributes in plane $plane\n" if @{$charinfo} > 512;
 272         }
 273         $data[$plane] .= pack("n", ($charhash->{$info->[$index]} << 7) | $type);
 274     }
 275     $charlen[$plane] = scalar(@{$charinfoArray[$plane]});
 276 }
 277
 278 # the shift that results in the best compression of the table.  This is an array
 279 # because different shifts are better for the different tables for each plane.
 280 my @bestshift;
 281
 282 # the smallest size estimate seen so far; it starts out as an initial guess.
 283 my $bestest = 1000000;
 284 my @bestblkstr;     # per plane: the merged data string obtained with the best shift
 285 my @blksize = ();   # per plane: 1 << bestshift, the number of characters in a block
 286
 287 for my $plane (0 .. 0x10) {
 288     print "\n\nplane: $plane\n";
 289     print "Unique character entries: $charlen[$plane]\n";
 290     $bestest = 1000000;   # start over for every plane
 291     for my $i (3 .. 8) {   # try blocks of 8 up to 256 characters
 292         my $blksize = 1 << $i;
 293         my %blocks = ();
 294         my @blkarray = ();
 295         my ($j, $k);
 296         print "shift: $i";
 297         # Cut the plane into blocks and keep one copy of every distinct block.
 298         for ($j = 0; $j < 0x10000; $j += $blksize) {
 299             my $blkkey = substr $data[$plane], 2 * $j, 2 * $blksize;
 300             if (! exists $blocks{$blkkey}) {
 301                 push @blkarray, $blkkey;
 302                 $blocks{$blkkey} = $#blkarray;
 303             }
 304         }
 305         # blocklen counts 16 bit values (two bytes each), not bytes.
 306         my $blknum = @blkarray;
 307         my $blocklen = $blknum * $blksize;
 308         printf " before %5d", $blocklen;
 309
 310         # Now we try to pack the blkarray as tight as possible by finding matching
 311         # heads and tails, trying the longest overlap ($j values) first.
 312         for ($j = $blksize - 1; $j > 0; $j--) {
 313             my %tails = ();   # last $j values of a block => indices of such blocks
 314             for $k (0 .. $#blkarray) {
 315                 next unless defined $blkarray[$k];
 316                 my $len = length $blkarray[$k];
 317                 my $tail = substr $blkarray[$k], $len - $j * 2;
 318                 if (exists $tails{$tail}) {
 319                     push @{$tails{$tail}}, $k;
 320                 } else {
 321                     $tails{$tail} = [ $k ];
 322                 }
 323             }
 324
 325             # tails are calculated, now calculate the heads and merge.
 326           BLOCK:
 327             for $k (0 .. $#blkarray) {
 328                 next unless defined $blkarray[$k];
 329                 my $tomerge = $k;
 330                 while (1) {
 331                     my $head = substr($blkarray[$tomerge], 0, $j * 2);
 332                     my $entry = $tails{$head};
 333                     next BLOCK unless defined $entry;
 334                     # A block cannot be merged with itself: try another candidate.
 335                     my $other = shift @{$entry};
 336                     if ($other == $tomerge) {
 337                         if (@{$entry}) {
 338                             push @{$entry}, $other;
 339                             $other = shift @{$entry};
 340                         } else {
 341                             push @{$entry}, $other;
 342                             next BLOCK;
 343                         }
 344                     }
 345                     if (@{$entry} == 0) {
 346                         delete $tails{$head};
 347                     }
 348
 349                     # a match was found: $other followed by $tomerge, overlapping by $j values
 350                     my $merge = $blkarray[$other]
 351                         . substr($blkarray[$tomerge], $j * 2);
 352                     $blocklen -= $j;
 353                     $blknum--;
 354                     # The merged string is kept in the slot with the lower index.
 355                     if ($other < $tomerge) {
 356                         $blkarray[$tomerge] = undef;
 357                         $blkarray[$other] = $merge;
 358                         my $len = length $merge;
 359                         my $tail = substr $merge, $len - $j * 2;
 360                         $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
 361                                           @{$tails{$tail}} ];
 362                         next BLOCK;
 363                     }
 364                     $blkarray[$tomerge] = $merge;
 365                     $blkarray[$other] = undef;
 366                 }
 367             }
 368         }
 369         my $blockstr;   # concatenation of all blocks that are left
 370         for $k (0 .. $#blkarray) {
 371             $blockstr .= $blkarray[$k] if defined $blkarray[$k];
 372         }
 373         # Sanity check of the bookkeeping, then add two bytes per block table entry.
 374         die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
 375         my $estimate = 2 * $blocklen + (0x20000 >> $i);
 376
 377         printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
 378         if ($estimate < $bestest) {
 379             $bestest = $estimate;
 380             $bestshift[$plane] = $i;
 381             $bestblkstr[$plane] = $blockstr;
 382         }
 383     }
 384     $blksize[$plane] = 1 << $bestshift[$plane];
 385     print "best shift: ", $bestshift[$plane];
 386     print "     blksize: ", $blksize[$plane];
 387 }
 388 # blocksArray holds, for each plane, the adjusted offsets of its blocks in data.
 389 my @blocksArray = map { [] } 0 .. 0x10;
 390
 391 for my $plane (0 .. 0x10) {
 392     for (my $j = 0; $j < 0x10000; $j += $blksize[$plane]) {
 393         my $blkkey = substr $data[$plane], 2 * $j, 2 * $blksize[$plane];
 394         my $index = index $bestblkstr[$plane], $blkkey;
 395         while ($index & 1) {   # odd: not on a 16 bit boundary, or -1 (not found)
 396             die "not found: $j" if $index == -1;
 397             $index = index $bestblkstr[$plane], $blkkey, $index + 1;
 398         }
 399         push @{$blocksArray[$plane]}, ($index / 2 - $j) & 0xffff;   # modulo 2**16
 400     }
 401 }
 402
 403
 404 ################################################################################
 405 ################################################################################
 406 ## Stage 3: Generate the file (after making sure that no table got too large)
 407 foreach my $p (0 .. 0x10) {
 408     my $nblk  = scalar @{$blocksArray[$p]};
 409     my $ndata = length $bestblkstr[$p];
 410     die "UTF-8 limit of blocks may be exceeded for plane $p: $nblk\n" if $nblk > 0xffff / 3;
 411     die "UTF-8 limit of data may be exceeded for plane $p: $ndata\n" if $ndata > 0xffff / 3;
 412 }
 413
 414 {
 415     print "\nGenerating $ARGV[2].";
 416     my ($i, $j);
 417     # The three argument form of open takes the file name literally.
 418     open(OUTPUT, ">", $ARGV[2]) or die "Failed creating output file: $!\n";
 419     print OUTPUT <<EOF;
 420 /* java-chartables.h -- Character tables for java.lang.Character -*- c++ -*-
 421    Copyright (C) 2002, 2006 Free Software Foundation, Inc.
 422    *** This file is generated by scripts/unicode-to-chartables.pl ***
 423
 424 This file is part of GNU Classpath.
 425
 426 GNU Classpath is free software; you can redistribute it and/or modify
 427 it under the terms of the GNU General Public License as published by
 428 the Free Software Foundation; either version 2, or (at your option)
 429 any later version.
 430
 431 GNU Classpath is distributed in the hope that it will be useful, but
 432 WITHOUT ANY WARRANTY; without even the implied warranty of
 433 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 434 General Public License for more details.
 435
 436 You should have received a copy of the GNU General Public License
 437 along with GNU Classpath; see the file COPYING.  If not, write to the
 438 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 439 02110-1301 USA.
 440
 441 Linking this library statically or dynamically with other modules is
 442 making a combined work based on this library.  Thus, the terms and
 443 conditions of the GNU General Public License cover the whole
 444 combination.
 445
 446 As a special exception, the copyright holders of this library give you
 447 permission to link this library with independent modules to produce an
 448 executable, regardless of the license terms of these independent
 449 modules, and to copy and distribute the resulting executable under
 450 terms of your choice, provided that you also meet, for each linked
 451 independent module, the terms and conditions of the license of that
 452 module.  An independent module is a module which is not derived from
 453 or based on this library.  If you modify this library, you may extend
 454 this exception to your version of the library, but you are not
 455 obligated to do so.  If you do not wish to do so, delete this
 456 exception statement from your version. */
 457
 458 #ifndef __JAVA_CHARTABLES_H__
 459 #define __JAVA_CHARTABLES_H__
 460
 461 // These tables are automatically generated by scripts/unicode_to_chartables.pl.
 462 // The Unicode data comes from www.unicode.org; this header is based on
 463 // UnicodeData-4.0.0.txt. JDK 1.5 uses Unicode version 4.0.0.
 464 // DO NOT EDIT the tables.  Instead, fix the upstream scripts and run
 465 // them again.
 466
 467 // The data is stored in C style arrays of the appropriate CNI types, to
 468 // guarantee that the data is constant and non-relocatable.  The field
 469 // <code>blocks</code> stores the offset of a block of 2<sup>SHIFT</sup>
 470 // characters within <code>data</code>. The data field, in turn, stores
 471 // information about each character in the low order bits, and an offset
 472 // into the attribute tables <code>upper</code>, <code>lower</code>,
 473 // <code>numValue</code>, and <code>direction</code>.  Notice that the
 474 // attribute tables are much smaller than 0xffff entries; as many characters
 475 // in Unicode share common attributes.  Finally, there is a listing for
 476 // <code>title</code> exceptions (most characters just have the same title
 477 // case as upper case).
 478
 479 // This file should only be included by natCharacter.cc
 480
 481 /**
 482  * The array containing the numeric values that are too large to be stored as
 483  * chars in NUM_VALUE.  NUM_VALUE in this case will contain a negative integer
 484  * N such that LARGENUMS[-N - 3] contains the correct numeric value.
 485  */
 486 EOF
 487   print OUTPUT "static const jint largenums[] = {\n    ";
 488   for ($i = 0; $i < @largeNums; $i++) {
 489       print OUTPUT $largeNums[$i], ", ";
 490   }
 491   print OUTPUT "}";
 492   print OUTPUT <<EOF;
 493 ;
 494
 495 /**
 496  * The character shift amount to look up the block offset. In other words,
 497  * <code>(char) (blocks[p][off >> SHIFT[p]] + off)</code> is the index where
 498  * <code>ch</code> is described in <code>data</code>, where <code>off</code>
 499  * is ch & 0xffff and <code>p</code> is the plane the character belongs to.
 500  */
 501 EOF
 502   print OUTPUT "static const int shift[] = {\n    ";
 503   # One entry per plane, in plane order, every entry followed by ", ".
 504   print OUTPUT map { $_ . ", " } @bestshift;
 505   # The ";" that ends the declaration is part of the text printed next.
 506   print OUTPUT "}";
 507   print OUTPUT <<EOF;
 508 ;
 509
 510 /**
 511  * The mapping of character blocks to their location in <code>data</code>.
 512  * Each entry has been adjusted so that a modulo 16 sum with the desired
 513  * character gives the actual index into <code>data</code>.
 514  */
 515 EOF
 516   # Write a "blocksN" array for every plane that is in use.  The unassigned
 517   # planes are left out so that no space is wasted on tables nobody needs.
 518   # As of Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If you
 519   # are updating this script to work with a later version of Unicode you may
 520   # have to alter this list of planes.
 521   foreach my $usedPlane (0, 1, 2, 14) {
 522       my @offsets = @{$blocksArray[$usedPlane]};
 523
 524       print OUTPUT "static const jchar blocks", $usedPlane, "[] = {\n";
 525       # Ten values on each row, every value followed by ", ".
 526       while (my @row = splice(@offsets, 0, 10)) {
 527           print OUTPUT "    ";
 528           print OUTPUT map { $_ . ", " } @row;
 529           print OUTPUT "\n";
 530       }
 531       print OUTPUT "};\n\n";
 532   }
 533
 534   # blocks_length lists the number of entries of each of these arrays, in
 535   # plane order; -1 stands for a plane that has no array of its own.
 536   print OUTPUT "static const int blocks_length[] = {\n    ";
 537   foreach my $p (0 .. 0x10) {
 538       if ($p <= 2 || $p == 14) {
 539           print OUTPUT scalar(@{$blocksArray[$p]}), ", ";
 540       }
 541       else {
 542           print OUTPUT "-1, ";
 543       }
 544   }
 545   print OUTPUT "};\n";
 546   print OUTPUT <<EOF;
 547 static const jchar* blocks[] = {
 548     blocks0, blocks1, blocks2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 549     NULL, NULL, NULL, NULL, blocks14, NULL, NULL};
 550
 551 /**
 552  * Information about each character.  The low order 5 bits form the
 553  * character type, the next bit is a flag for non-breaking spaces, and the
 554  * next bit is a flag for mirrored directionality.  The high order 9 bits
 555  * form the offset into the attribute tables.  Note that this limits the
 556  * number of unique character attributes per plane to 512, which is not a
 557  * problem as of Unicode version 4.0.0, but may soon become one.
 558  */
 559 EOF
 560   # Write a "dataN" array for every plane that is in use.  The unassigned
 561   # planes are left out so that no space is wasted on tables nobody needs.
 562   # As of Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If you
 563   # are updating this script to work with a later version of Unicode you may
 564   # have to alter this list of planes.
 565   foreach my $usedPlane (0, 1, 2, 14) {
 566       # The string holds one big-endian 16 bit value per table entry.
 567       my @words = unpack "n*", $bestblkstr[$usedPlane];
 568       print OUTPUT "static const jchar data", $usedPlane, "[] = {\n";
 569
 570       # Ten values on each row, every value followed by ", ".
 571       while (my @row = splice(@words, 0, 10)) {
 572           print OUTPUT "    ";
 573           print OUTPUT map { $_ . ", " } @row;
 574           print OUTPUT "\n";
 575       }
 576       print OUTPUT "};\n\n";
 577   }
 578
 579   # data_length lists the number of entries of each of these arrays, in
 580   # plane order; -1 stands for a plane that has no array of its own.
 581   print OUTPUT "static const int data_length[] = {\n    ";
 582   foreach my $p (0 .. 0x10) {
 583       if ($p <= 2 || $p == 14) {
 584           print OUTPUT length($bestblkstr[$p]) / 2, ", ";
 585       }
 586       else {
 587           print OUTPUT "-1, ";
 588       }
 589   }
 590   print OUTPUT "};\n";
 591   print OUTPUT <<EOF;
 592 static const jchar* data[] = {
 593     data0, data1, data2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 594     NULL, NULL, NULL, NULL, data14, NULL, NULL};
 595
 596
 597 /**
 598  * This is the attribute table for computing the numeric value of a
 599  * character.  The value is -1 if Unicode does not define a value, -2
 600  * if the value is not a positive integer, otherwise it is the value.
 601  */
 602 EOF
 603   # Write a "numValueN" array for every plane that is in use.  The unassigned
 604   # planes are left out so that no space is wasted on tables nobody needs.
 605   # As of Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If you
 606   # are updating this script to work with a later version of Unicode you may
 607   # have to alter this list of planes.
 608   foreach my $usedPlane (0, 1, 2, 14) {
 609       # Element 0 of every attribute entry is the numeric value.
 610       my @entries = @{$charinfoArray[$usedPlane]};
 611       print OUTPUT "static const jshort numValue", $usedPlane, "[] = {\n";
 612
 613       # Thirteen values on each row, every value followed by ", ".
 614       while (my @row = splice(@entries, 0, 13)) {
 615           print OUTPUT "    ";
 616           print OUTPUT map { cShort($_->[0]) . ", " } @row;
 617           print OUTPUT "\n";
 618       }
 619       print OUTPUT "};\n\n";
 620   }
 621
 622   # numValue_length lists the number of entries of each of these arrays, in
 623   # plane order; -1 stands for a plane that has no array of its own.
 624   print OUTPUT "static const int numValue_length[] = {\n    ";
 625   foreach my $p (0 .. 0x10) {
 626       if ($p <= 2 || $p == 14) {
 627           print OUTPUT scalar(@{$charinfoArray[$p]}), ", ";
 628       }
 629       else {
 630           print OUTPUT "-1, ";
 631       }
 632   }
 633   print OUTPUT "};\n";
 634   print OUTPUT <<EOF;
 635 static const jshort* numValue[] = {
 636     numValue0, numValue1, numValue2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 637     NULL, NULL, NULL, NULL, numValue14, NULL, NULL};
 638
 639
 640
 641 /**
 642  * This is the attribute table for computing the uppercase representation
 643  * of a character.  The value is the difference between the character and
 644  * its uppercase version.
 645  */
 646 EOF
 647   # Write an "upperN" array for every plane that is in use.  The unassigned
 648   # planes are left out so that no space is wasted on tables nobody needs.
 649   # As of Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If you
 650   # are updating this script to work with a later version of Unicode you may
 651   # have to alter this list of planes.
 652   foreach my $usedPlane (0, 1, 2, 14) {
 653       # Element 1 of every attribute entry is the distance to the uppercase.
 654       my @entries = @{$charinfoArray[$usedPlane]};
 655       print OUTPUT "static const jshort upper", $usedPlane, "[] = {\n";
 656
 657       # Thirteen values on each row, every value followed by ", ".
 658       while (my @row = splice(@entries, 0, 13)) {
 659           print OUTPUT "    ";
 660           print OUTPUT map { cShort($_->[1]) . ", " } @row;
 661           print OUTPUT "\n";
 662       }
 663       print OUTPUT "};\n\n";
 664   }
 665
 666   # upper_length lists the number of entries of each of these arrays, in
 667   # plane order; -1 stands for a plane that has no array of its own.
 668   print OUTPUT "static const int upper_length[] = {\n    ";
 669   foreach my $p (0 .. 0x10) {
 670       if ($p <= 2 || $p == 14) {
 671           print OUTPUT scalar(@{$charinfoArray[$p]}), ", ";
 672       }
 673       else {
 674           print OUTPUT "-1, ";
 675       }
 676   }
 677   print OUTPUT "};\n";
 678   print OUTPUT <<EOF;
 679 static const jshort* upper[] = {
 680     upper0, upper1, upper2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 681     NULL, NULL, NULL, NULL, upper14, NULL, NULL};
 682
 683
 684 /**
 685  * This is the attribute table for computing the lowercase representation
 686  * of a character.  The value is the difference between the character and
 687  * its lowercase version.
 688  */
 689 EOF
 690   # Write a "lowerN" array for every plane that is in use.  The unassigned
 691   # planes are left out so that no space is wasted on tables nobody needs.
 692   # As of Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If you
 693   # are updating this script to work with a later version of Unicode you may
 694   # have to alter this list of planes.
 695   foreach my $usedPlane (0, 1, 2, 14) {
 696       # Element 2 of every attribute entry is the distance to the lowercase.
 697       my @entries = @{$charinfoArray[$usedPlane]};
 698       print OUTPUT "static const jshort lower", $usedPlane, "[] = {\n";
 699
 700       # Thirteen values on each row, every value followed by ", ".
 701       while (my @row = splice(@entries, 0, 13)) {
 702           print OUTPUT "    ";
 703           print OUTPUT map { cShort($_->[2]) . ", " } @row;
 704           print OUTPUT "\n";
 705       }
 706       print OUTPUT "};\n\n";
 707   }
 708
 709   # lower_length lists the number of entries of each of these arrays, in
 710   # plane order; -1 stands for a plane that has no array of its own.
 711   print OUTPUT "static const int lower_length[] = {\n    ";
 712   foreach my $p (0 .. 0x10) {
 713       if ($p <= 2 || $p == 14) {
 714           print OUTPUT scalar(@{$charinfoArray[$p]}), ", ";
 715       }
 716       else {
 717           print OUTPUT "-1, ";
 718       }
 719   }
 720   print OUTPUT "};\n";
 721   print OUTPUT <<EOF;
 722 static const jshort* lower[] = {
 723     lower0, lower1, lower2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 724     NULL, NULL, NULL, NULL, lower14, NULL, NULL};
 725
 726
 727 /**
 728  * This is the attribute table for computing the directionality class
 729  * of a character.  At present, the value is in the range 0 - 18 if the
 730  * character has a direction, otherwise it is -1.
 731  */
 732 EOF
 733   # Write a "directionN" array for every plane that is in use.  The unassigned
 734   # planes are left out so that no space is wasted on tables nobody needs.
 735   # As of Unicode version 4.0.0 only planes 0, 1, 2, and 14 are used.  If you
 736   # are updating this script to work with a later version of Unicode you may
 737   # have to alter this list of planes.
 738   foreach my $usedPlane (0, 1, 2, 14) {
 739       # Element 3 of every attribute entry holds the direction in all but its
 740       # two lowest bits.
 741       my @entries = @{$charinfoArray[$usedPlane]};
 742       print OUTPUT "static const jbyte direction", $usedPlane, "[] = {\n";
 743
 744       # Nineteen values on each row, every value followed by ", ".
 745       while (my @row = splice(@entries, 0, 19)) {
 746           print OUTPUT "    ";
 747           foreach my $entry (@row) {
 748               my $dir = $entry->[3] >> 2;
 749               $dir = -1 unless $dir >= 0 && $dir <= 18;
 750               print OUTPUT cShort($dir), ", ";
 751           }
 752           print OUTPUT "\n";
 753       }
 754       print OUTPUT "};\n\n";
 755   }
 756
 757   # direction_length: entries per array in plane order, -1 if there is no array.
 758   print OUTPUT "static const int direction_length[] = {\n    ";
 759   foreach my $p (0 .. 0x10) {
 760       if ($p <= 2 || $p == 14) {
 761           print OUTPUT scalar(@{$charinfoArray[$p]}), ", ";
 762       }
 763       else {
 764           print OUTPUT "-1, ";
 765       }
 766   }
 767   print OUTPUT "};\n";
 768   print OUTPUT <<EOF;
 769 static const jbyte* direction[] = {
 770     direction0, direction1, direction2, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
 771     NULL, NULL, NULL, NULL, direction14, NULL, NULL};
 772
 773
 774 /**
 775  * This is the listing of titlecase special cases (all other character
 776  * can use <code>upper</code> to determine their titlecase).  The listing
 777  * is a sequence of character pairs; converting the first character of the
 778  * pair to titlecase produces the second character.
 779  */
 780 static const jchar title[] = {
 781 EOF
 782   # titlecase is a string of big-endian 16 bit values; print ten on each row.
 783   $len = length($titlecase) / 2;
 784   for ($i = 0; $i < $len / 10; $i++) {
 785       print OUTPUT $i ? "\n    " : "    ";
 786       for $j (0 .. 9) {
 787           last if $len <= $i * 10 + $j;
 788           my $val = unpack "n", substr($titlecase, 2 * ($i * 10 + $j), 2);
 789           print OUTPUT $val, ", ";
 790       }
 791   }
 792
 793   print OUTPUT "\n  };";
 794   print OUTPUT "\n/** Length of title. */\nstatic const int title_length = ", $len;
 795   print OUTPUT <<EOF;
 796 ;
 797
 798 #endif /* __JAVA_CHARTABLES_H__ */
 799 EOF
 800   close OUTPUT or die "Failed writing output file: $!\n";   # reports buffered write errors
 801 }
 802 print "\nDone.\n";