libjava/scripts/unicode-muncher.pl

   1 #!/usr/bin/perl -w
   2 # unicode-muncher.pl -- generate Unicode database for java.lang.Character
   3 # Copyright (C) 1998, 2002, 2004  Free Software Foundation, Inc.
   4 #
   5 # This file is part of GNU Classpath.
   6 #
   7 # GNU Classpath is free software; you can redistribute it and/or modify
   8 # it under the terms of the GNU General Public License as published by
   9 # the Free Software Foundation; either version 2, or (at your option)
  10 # any later version.
  11 #
  12 # GNU Classpath is distributed in the hope that it will be useful, but
  13 # WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU General Public License
  18 # along with GNU Classpath; see the file COPYING.  If not, write to the
  19 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  20 # 02110-1301 USA.
  21 #
  22 # Linking this library statically or dynamically with other modules is
  23 # making a combined work based on this library.  Thus, the terms and
  24 # conditions of the GNU General Public License cover the whole
  25 # combination.
  26 #
  27 # As a special exception, the copyright holders of this library give you
  28 # permission to link this library with independent modules to produce an
  29 # executable, regardless of the license terms of these independent
  30 # modules, and to copy and distribute the resulting executable under
  31 # terms of your choice, provided that you also meet, for each linked
  32 # independent module, the terms and conditions of the license of that
  33 # module.  An independent module is a module which is not derived from
  34 # or based on this library.  If you modify this library, you may extend
  35 # this exception to your version of the library, but you are not
  36 # obligated to do so.  If you do not wish to do so, delete this
  37 # exception statement from your version.
  38
  39 # Code for reading UnicodeData-3.0.0.txt and SpecialCasing-2.txt to generate
  40 # the code for gnu.java.lang.CharData. The relevant files can be found here:
  41 #
  42 #   http://www.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.txt
  43 #   http://www.unicode.org/Public/3.0-Update/SpecialCasing-2.txt
  44 #
  45 # Inspired by code from Jochen Hoenicke.
  46 # author Eric Blake <ebb9@email.byu.edu>
  47 #
# Usage: ./unicode-muncher.pl <UnicodeData.txt> <SpecialCasing> <CharData.java>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), <SpecialCasing>
#   is also obtained from www.unicode.org (named SpecialCasing-2.txt for
#   Unicode version 3.0.0), and <CharData.java> is the final location for
#   the Java interface gnu.java.lang.CharData. As of JDK 1.4, use Unicode
#   version 3.0.0 for best results.
  55
  56 ##
  57 ## Convert a 16-bit integer to a Java source code String literal character
  58 ##
  59 sub javaChar($) {
  60     my ($char) = @_;
  61     die "Out of range: $char\n" if $char < -0x8000 or $char > 0xffff;
  62     $char += 0x10000 if $char < 0;
  63     # Special case characters that must be escaped, or are shorter as ASCII
  64     return sprintf("\\%03o", $char) if $char < 0x20;
  65     return "\\\"" if $char == 0x22;
  66     return "\\\\" if $char == 0x5c;
  67     return pack("C", $char) if $char < 0x7f;
  68     return sprintf("\\u%04x", $char);
  69 }
  70
  71 ##
  72 ## Convert the text UnicodeData file from www.unicode.org into a Java
  73 ## interface with string constants holding the compressed information.
  74 ##
  75 my @TYPECODES = qw(Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp Cc Cf
  76                    SKIPPED Co Cs Pd Ps Pe Pc Po Sm Sc Sk So Pi Pf);
  77 my @DIRCODES = qw(L R AL EN ES ET AN CS NSM BN B S WS ON LRE LRO RLE RLO PDF);
  78
  79 my $NOBREAK_FLAG  = 32;
  80 my $MIRRORED_FLAG = 64;
  81
  82 my %special = ();
  83 my @info = ();
  84 my $titlecase = "";
  85 my $count = 0;
  86 my $range = 0;
  87
  88 die "Usage: $0 <UnicodeData.txt> <SpecialCasing.txt> <CharData.java>"
  89     unless @ARGV == 3;
  90 $| = 1;
  91 print "GNU Classpath Unicode Attribute Database Generator 2.1\n";
  92 print "Copyright (C) 1998, 2002 Free Software Foundation, Inc.\n";
  93
  94 # Stage 0: Parse the special casing file
  95 print "Parsing special casing file\n";
  96 open (SPECIAL, "< $ARGV[1]") || die "Can't open special casing file: $!\n";
  97 while (<SPECIAL>) {
  98     next if /^\#/;
  99     my ($ch, undef, undef, $upper) = split / *; */;
 100
 101     # This grabs only the special casing for multi-char uppercase. Note that
 102     # there are no multi-char lowercase, and that Sun ignores multi-char
 103     # titlecase rules. This script omits 3 special cases in Unicode 3.0.0,
 104     # which must be hardcoded in java.lang.String:
 105     #  \u03a3 (Sun ignores this special case)
 106     #  \u0049 - lowercases to \u0131, but only in Turkish locale
 107     #  \u0069 - uppercases to \u0130, but only in Turkish locale
 108     next unless defined $upper and $upper =~ / /;
 109     $special{hex $ch} = [map {hex} split ' ', $upper];
 110 }
 111
 112 close SPECIAL;
 113
 114 # Stage 1: Parse the attribute file
 115 print "Parsing attributes file";
 116 open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
 117 while (<UNICODE>) {
 118     print "." unless $count++ % 1000;
 119     chomp;
 120     s/\r//g;
 121     my ($ch, $name, $category, undef, $bidir, $decomp, undef, undef, $numeric,
 122         $mirrored, undef, undef, $upcase, $lowcase, $title) = split ';';
 123     $ch = hex($ch);
 124     next if $ch > 0xffff; # Ignore surrogate pairs, since Java does
 125
 126     my ($type, $numValue, $upperchar, $lowerchar, $direction);
 127
 128     $type = 0;
 129     while ($category !~ /^$TYPECODES[$type]$/) {
 130         if (++$type == @TYPECODES) {
 131             die "$ch: Unknown type: $category";
 132         }
 133     }
 134     $type |= $NOBREAK_FLAG if ($decomp =~ /noBreak/);
 135     $type |= $MIRRORED_FLAG if ($mirrored =~ /Y/);
 136
 137     if ($numeric =~ /^[0-9]+$/) {
 138         $numValue = $numeric;
 139         die "numValue too big: $ch, $numValue\n" if $numValue >= 0x7fff;
 140     } elsif ($numeric eq "") {
 141         # Special case sequences of 'a'-'z'
 142         if ($ch >= 0x0041 && $ch <= 0x005a) {
 143             $numValue = $ch - 0x0037;
 144         } elsif ($ch >= 0x0061 && $ch <= 0x007a) {
 145             $numValue = $ch - 0x0057;
 146         } elsif ($ch >= 0xff21 && $ch <= 0xff3a) {
 147             $numValue = $ch - 0xff17;
 148         } elsif ($ch >= 0xff41 && $ch <= 0xff5a) {
 149             $numValue = $ch - 0xff37;
 150         } else {
 151             $numValue = -1;
 152         }
 153     } else {
 154         $numValue = -2;
 155     }
 156
 157     $upperchar = $upcase ? hex($upcase) - $ch : 0;
 158     $lowerchar = $lowcase ? hex($lowcase) - $ch : 0;
 159     if ($title ne $upcase) {
 160         my $titlechar = $title ? hex($title) : $ch;
 161         $titlecase .= pack("n2", $ch, $titlechar);
 162     }
 163
 164     $direction = 0;
 165     while ($bidir !~ /^$DIRCODES[$direction]$/) {
 166         if (++$direction == @DIRCODES) {
 167             $direction = -1;
 168             last;
 169         }
 170     }
 171     $direction <<= 2;
 172     $direction += $#{$special{$ch}} if defined $special{$ch};
 173
 174     if ($range) {
 175         die "Expecting end of range at $ch\n" unless $name =~ /Last>$/;
 176         for ($range + 1 .. $ch - 1) {
 177             $info[$_] = pack("n5", $type, $numValue, $upperchar,
 178                              $lowerchar, $direction);
 179         }
 180         $range = 0;
 181     } elsif ($name =~ /First>$/) {
 182         $range = $ch;
 183     }
 184     $info[$ch] = pack("n5", $type, $numValue, $upperchar, $lowerchar,
 185                       $direction);
 186 }
 187 close UNICODE;
 188
 189 # Stage 2: Compress the data structures
 190 printf "\nCompressing data structures";
 191 $count = 0;
 192 my $info = ();
 193 my %charhash = ();
 194 my @charinfo = ();
 195
 196 for my $ch (0 .. 0xffff) {
 197     print "." unless $count++ % 0x1000;
 198     $info[$ch] = pack("n5", 0, -1, 0, 0, -4) unless defined $info[$ch];
 199
 200     my ($type, $numVal, $upper, $lower, $direction) = unpack("n5", $info[$ch]);
 201     if (! exists $charhash{$info[$ch]}) {
 202         push @charinfo, [ $numVal, $upper, $lower, $direction ];
 203         $charhash{$info[$ch]} = $#charinfo;
 204     }
 205     $info .= pack("n", ($charhash{$info[$ch]} << 7) | $type);
 206 }
 207
 208 my $charlen = @charinfo;
 209 my $bestshift;
 210 my $bestest = 1000000;
 211 my $bestblkstr;
 212 die "Too many unique character entries: $charlen\n" if $charlen > 512;
 213 print "\nUnique character entries: $charlen\n";
 214
 215 for my $i (3 .. 8) {
 216     my $blksize = 1 << $i;
 217     my %blocks = ();
 218     my @blkarray = ();
 219     my ($j, $k);
 220     print "shift: $i";
 221
 222     for ($j = 0; $j < 0x10000; $j += $blksize) {
 223         my $blkkey = substr $info, 2 * $j, 2 * $blksize;
 224         if (! exists $blocks{$blkkey}) {
 225             push @blkarray, $blkkey;
 226             $blocks{$blkkey} = $#blkarray;
 227         }
 228     }
 229     my $blknum = @blkarray;
 230     my $blocklen = $blknum * $blksize;
 231     printf " before %5d", $blocklen;
 232
 233     # Now we try to pack the blkarray as tight as possible by finding matching
 234     # heads and tails.
 235     for ($j = $blksize - 1; $j > 0; $j--) {
 236         my %tails = ();
 237         for $k (0 .. $#blkarray) {
 238             next unless defined $blkarray[$k];
 239             my $len = length $blkarray[$k];
 240             my $tail = substr $blkarray[$k], $len - $j * 2;
 241             if (exists $tails{$tail}) {
 242                 push @{$tails{$tail}}, $k;
 243             } else {
 244                 $tails{$tail} = [ $k ];
 245             }
 246         }
 247
 248         # tails are calculated, now calculate the heads and merge.
 249       BLOCK:
 250         for $k (0 .. $#blkarray) {
 251             next unless defined $blkarray[$k];
 252             my $tomerge = $k;
 253             while (1) {
 254                 my $head = substr($blkarray[$tomerge], 0, $j * 2);
 255                 my $entry = $tails{$head};
 256                 next BLOCK unless defined $entry;
 257
 258                 my $other = shift @{$entry};
 259                 if ($other == $tomerge) {
 260                     if (@{$entry}) {
 261                         push @{$entry}, $other;
 262                         $other = shift @{$entry};
 263                     } else {
 264                         push @{$entry}, $other;
 265                         next BLOCK;
 266                     }
 267                 }
 268                 if (@{$entry} == 0) {
 269                     delete $tails{$head};
 270                 }
 271
 272                 # a match was found
 273                 my $merge = $blkarray[$other]
 274                     . substr($blkarray[$tomerge], $j * 2);
 275                 $blocklen -= $j;
 276                 $blknum--;
 277
 278                 if ($other < $tomerge) {
 279                     $blkarray[$tomerge] = undef;
 280                     $blkarray[$other] = $merge;
 281                     my $len = length $merge;
 282                     my $tail = substr $merge, $len - $j * 2;
 283                     $tails{$tail} = [ map { $_ == $tomerge ? $other : $_ }
 284                                       @{$tails{$tail}} ];
 285                     next BLOCK;
 286                 }
 287                 $blkarray[$tomerge] = $merge;
 288                 $blkarray[$other] = undef;
 289             }
 290         }
 291     }
 292     my $blockstr;
 293     for $k (0 .. $#blkarray) {
 294         $blockstr .= $blkarray[$k] if defined $blkarray[$k];
 295     }
 296
 297     die "Unexpected $blocklen" if length($blockstr) != 2 * $blocklen;
 298     my $estimate = 2 * $blocklen + (0x20000 >> $i);
 299
 300     printf " after merge %5d: %6d bytes\n", $blocklen, $estimate;
 301     if ($estimate < $bestest) {
 302         $bestest = $estimate;
 303         $bestshift = $i;
 304         $bestblkstr = $blockstr;
 305     }
 306 }
 307
 308 my @blocks;
 309 my $blksize = 1 << $bestshift;
 310 for (my $j = 0; $j < 0x10000; $j += $blksize) {
 311     my $blkkey = substr $info, 2 * $j, 2 * $blksize;
 312     my $index = index $bestblkstr, $blkkey;
 313     while ($index & 1) {
 314         die "not found: $j" if $index == -1;
 315         $index = index $bestblkstr, $blkkey, $index + 1;
 316     }
 317     push @blocks, ($index / 2 - $j) & 0xffff;
 318 }
 319
 320 # Phase 3: Generate the file
 321 die "UTF-8 limit of blocks may be exceeded: " . scalar(@blocks) . "\n"
 322     if @blocks > 0xffff / 3;
 323 die "UTF-8 limit of data may be exceeded: " . length($bestblkstr) . "\n"
 324     if length($bestblkstr) > 0xffff / 3;
 325 {
 326     print "Generating $ARGV[2] with shift of $bestshift";
 327     my ($i, $j);
 328
 329     open OUTPUT, "> $ARGV[2]" or die "Failed creating output file: $!\n";
 330     print OUTPUT <<EOF;
 331 /* gnu/java/lang/CharData -- Database for java.lang.Character Unicode info
 332    Copyright (C) 2002 Free Software Foundation, Inc.
 333    *** This file is generated by scripts/unicode-muncher.pl ***
 334
 335 This file is part of GNU Classpath.
 336
 337 GNU Classpath is free software; you can redistribute it and/or modify
 338 it under the terms of the GNU General Public License as published by
 339 the Free Software Foundation; either version 2, or (at your option)
 340 any later version.
 341
 342 GNU Classpath is distributed in the hope that it will be useful, but
 343 WITHOUT ANY WARRANTY; without even the implied warranty of
 344 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 345 General Public License for more details.
 346
 347 You should have received a copy of the GNU General Public License
 348 along with GNU Classpath; see the file COPYING.  If not, write to the
 349 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 350 02110-1301 USA.
 351
 352 Linking this library statically or dynamically with other modules is
 353 making a combined work based on this library.  Thus, the terms and
 354 conditions of the GNU General Public License cover the whole
 355 combination.
 356
 357 As a special exception, the copyright holders of this library give you
 358 permission to link this library with independent modules to produce an
 359 executable, regardless of the license terms of these independent
 360 modules, and to copy and distribute the resulting executable under
 361 terms of your choice, provided that you also meet, for each linked
 362 independent module, the terms and conditions of the license of that
 363 module.  An independent module is a module which is not derived from
 364 or based on this library.  If you modify this library, you may extend
 365 this exception to your version of the library, but you are not
 366 obligated to do so.  If you do not wish to do so, delete this
 367 exception statement from your version. */
 368
 369 package gnu.java.lang;
 370
 371 /**
 372  * This contains the info about the unicode characters, that
 373  * java.lang.Character needs.  It is generated automatically from
 374  * <code>$ARGV[0]</code> and
 375  * <code>$ARGV[1]</code>, by some
 376  * perl scripts. These Unicode definition files can be found on the
 377  * <a href="http://www.unicode.org">http://www.unicode.org</a> website.
 378  * JDK 1.4 uses Unicode version 3.0.0.
 379  *
 380  * The data is stored as string constants, but Character will convert these
 381  * Strings to their respective <code>char[]</code> components.  The field
 382  * <code>BLOCKS</code> stores the offset of a block of 2<sup>SHIFT</sup>
 383  * characters within <code>DATA</code>.  The DATA field, in turn, stores
 384  * information about each character in the low order bits, and an offset
 385  * into the attribute tables <code>UPPER</code>, <code>LOWER</code>,
 386  * <code>NUM_VALUE</code>, and <code>DIRECTION</code>.  Notice that the
 387  * attribute tables are much smaller than 0xffff entries; as many characters
 388  * in Unicode share common attributes.  The DIRECTION table also contains
 389  * a field for detecting characters with multi-character uppercase expansions.
 390  * Next, there is a listing for <code>TITLE</code> exceptions (most characters
 391  * just have the same title case as upper case).  Finally, there are two
 392  * tables for multi-character capitalization, <code>UPPER_SPECIAL</code>
 393  * which lists the characters which are special cased, and
 394  * <code>UPPER_EXPAND</code>, which lists their expansion.
 395  *
 396  * \@author scripts/unicode-muncher.pl (written by Jochen Hoenicke,
 397  *         Eric Blake)
 398  * \@see Character
 399  * \@see String
 400  */
 401 public interface CharData
 402 {
 403   /**
 404    * The Unicode definition file that was parsed to build this database.
 405    */
 406   String SOURCE = \"$ARGV[0]\";
 407
 408   /**
 409    * The character shift amount to look up the block offset. In other words,
 410    * <code>(char) (BLOCKS.value[ch >> SHIFT] + ch)</code> is the index where
 411    * <code>ch</code> is described in <code>DATA</code>.
 412    */
 413   int SHIFT = $bestshift;
 414
 415   /**
 416    * The mapping of character blocks to their location in <code>DATA</code>.
 417    * Each entry has been adjusted so that the 16-bit sum with the desired
 418    * character gives the actual index into <code>DATA</code>.
 419    */
 420   String BLOCKS
 421 EOF
 422
 423     for ($i = 0; $i < @blocks / 11; $i++) {
 424         print OUTPUT $i ? "\n    + \"" : "    = \"";
 425         for $j (0 .. 10) {
 426             last if @blocks <= $i * 11 + $j;
 427             my $val = $blocks[$i * 11 + $j];
 428             print OUTPUT javaChar($val);
 429         }
 430         print OUTPUT "\"";
 431     }
 432
 433     print OUTPUT <<EOF;
 434 ;
 435
 436   /**
 437    * Information about each character.  The low order 5 bits form the
 438    * character type, the next bit is a flag for non-breaking spaces, and the
 439    * next bit is a flag for mirrored directionality.  The high order 9 bits
 440    * form the offset into the attribute tables.  Note that this limits the
 441    * number of unique character attributes to 512, which is not a problem
 442    * as of Unicode version 3.2.0, but may soon become one.
 443    */
 444   String DATA
 445 EOF
 446
 447     my $len = length($bestblkstr) / 2;
 448     for ($i = 0; $i < $len / 11; $i++) {
 449         print OUTPUT $i ? "\n    + \"" : "    = \"";
 450         for $j (0 .. 10) {
 451             last if $len <= $i * 11 + $j;
 452             my $val = unpack "n", substr($bestblkstr, 2 * ($i * 11 + $j), 2);
 453             print OUTPUT javaChar($val);
 454         }
 455         print OUTPUT "\"";
 456     }
 457
 458     print OUTPUT <<EOF;
 459 ;
 460
 461   /**
 462    * This is the attribute table for computing the numeric value of a
 463    * character.  The value is -1 if Unicode does not define a value, -2
 464    * if the value is not a positive integer, otherwise it is the value.
 465    * Note that this is a signed value, but stored as an unsigned char
 466    * since this is a String literal.
 467    */
 468   String NUM_VALUE
 469 EOF
 470
 471     $len = @charinfo;
 472     for ($i = 0; $i < $len / 11; $i++) {
 473         print OUTPUT $i ? "\n    + \"" : "    = \"";
 474         for $j (0 .. 10) {
 475             last if $len <= $i * 11 + $j;
 476             my $val = $charinfo[$i * 11 + $j][0];
 477             print OUTPUT javaChar($val);
 478         }
 479         print OUTPUT "\"";
 480     }
 481
 482     print OUTPUT <<EOF;
 483 ;
 484
 485   /**
 486    * This is the attribute table for computing the single-character uppercase
 487    * representation of a character.  The value is the signed difference
 488    * between the character and its uppercase version.  Note that this is
 489    * stored as an unsigned char since this is a String literal.  When
 490    * capitalizing a String, you must first check if a multi-character uppercase
 491    * sequence exists before using this character.
 492    */
 493   String UPPER
 494 EOF
 495
 496     $len = @charinfo;
 497     for ($i = 0; $i < $len / 11; $i++) {
 498         print OUTPUT $i ? "\n    + \"" : "    = \"";
 499         for $j (0 .. 10) {
 500             last if $len <= $i * 11 + $j;
 501             my $val = $charinfo[$i * 11 + $j][1];
 502             print OUTPUT javaChar($val);
 503         }
 504         print OUTPUT "\"";
 505     }
 506
 507     print OUTPUT <<EOF;
 508 ;
 509
 510   /**
 511    * This is the attribute table for computing the lowercase representation
 512    * of a character.  The value is the signed difference between the
 513    * character and its lowercase version.  Note that this is stored as an
 514    * unsigned char since this is a String literal.
 515    */
 516   String LOWER
 517 EOF
 518
 519     $len = @charinfo;
 520     for ($i = 0; $i < $len / 13; $i++) {
 521         print OUTPUT $i ? "\n    + \"" : "    = \"";
 522         for $j (0 .. 12) {
 523             last if $len <= $i * 13 + $j;
 524             my $val = $charinfo[$i * 13 + $j][2];
 525             print OUTPUT javaChar($val);
 526         }
 527         print OUTPUT "\"";
 528     }
 529
 530     print OUTPUT <<EOF;
 531 ;
 532
 533   /**
 534    * This is the attribute table for computing the directionality class
 535    * of a character, as well as a marker of characters with a multi-character
 536    * capitalization.  The direction is taken by performing a signed shift
 537    * right by 2 (where a result of -1 means an unknown direction, such as
 538    * for undefined characters). The lower 2 bits form a count of the
 539    * additional characters that will be added to a String when performing
 540    * multi-character uppercase expansion. This count is also used, along with
 541    * the offset in UPPER_SPECIAL, to determine how much of UPPER_EXPAND to use
 542    * when performing the case conversion. Note that this information is stored
 543    * as an unsigned char since this is a String literal.
 544    */
 545   String DIRECTION
 546 EOF
 547
 548     $len = @charinfo;
 549     for ($i = 0; $i < $len / 17; $i++) {
 550         print OUTPUT $i ? "\n    + \"" : "    = \"";
 551         for $j (0 .. 16) {
 552             last if $len <= $i * 17 + $j;
 553             my $val = $charinfo[$i * 17 + $j][3];
 554             print OUTPUT javaChar($val);
 555         }
 556         print OUTPUT "\"";
 557     }
 558
 559     print OUTPUT <<EOF;
 560 ;
 561
 562   /**
 563    * This is the listing of titlecase special cases (all other characters
 564    * can use <code>UPPER</code> to determine their titlecase).  The listing
 565    * is a sorted sequence of character pairs; converting the first character
 566    * of the pair to titlecase produces the second character.
 567    */
 568   String TITLE
 569 EOF
 570
 571     $len = length($titlecase) / 2;
 572     for ($i = 0; $i < $len / 11; $i++) {
 573         print OUTPUT $i ? "\n    + \"" : "    = \"";
 574         for $j (0 .. 10) {
 575             last if $len <= $i * 11 + $j;
 576             my $val = unpack "n", substr($titlecase, 2 * ($i * 11 + $j), 2);
 577             print OUTPUT javaChar($val);
 578         }
 579         print OUTPUT "\"";
 580     }
 581
 582     print OUTPUT <<EOF;
 583 ;
 584
 585   /**
 586    * This is a listing of characters with multi-character uppercase sequences.
 587    * A character appears in this list exactly when it has a non-zero entry
 588    * in the low-order 2-bit field of DIRECTION.  The listing is a sorted
 589    * sequence of pairs (hence a binary search on the even elements is an
 590    * efficient way to lookup a character). The first element of a pair is the
 591    * character with the expansion, and the second is the index into
 592    * UPPER_EXPAND where the expansion begins. Use the 2-bit field of
 593    * DIRECTION to determine where the expansion ends.
 594    */
 595   String UPPER_SPECIAL
 596 EOF
 597
 598     my @list = sort {$a <=> $b} keys %special;
 599     my $expansion = "";
 600     my $offset = 0;
 601     $len = @list;
 602     for ($i = 0; $i < $len / 5; $i++) {
 603         print OUTPUT $i ? "\n    + \"" : "    = \"";
 604         for $j (0 .. 4) {
 605             last if $len <= $i * 5 + $j;
 606             my $ch = $list[$i * 5 + $j];
 607             print OUTPUT javaChar($ch);
 608             print OUTPUT javaChar($offset);
 609             $offset += @{$special{$ch}};
 610             $expansion .= pack "n*", @{$special{$ch}};
 611         }
 612         print OUTPUT "\"";
 613     }
 614
 615     print OUTPUT <<EOF;
 616 ;
 617
 618   /**
 619    * This is the listing of special case multi-character uppercase sequences.
 620    * Characters listed in UPPER_SPECIAL index into this table to find their
 621    * uppercase expansion. Remember that you must also perform special-casing
 622    * on two single-character sequences in the Turkish locale, which are not
 623    * covered here in CharData.
 624    */
 625   String UPPER_EXPAND
 626 EOF
 627
 628     $len = length($expansion) / 2;
 629     for ($i = 0; $i < $len / 11; $i++) {
 630         print OUTPUT $i ? "\n    + \"" : "    = \"";
 631         for $j (0 .. 10) {
 632             last if $len <= $i * 11 + $j;
 633             my $val = unpack "n", substr($expansion, 2 * ($i * 11 + $j), 2);
 634             print OUTPUT javaChar($val);
 635         }
 636         print OUTPUT "\"";
 637     }
 638
 639     print OUTPUT ";\n}\n";
 640     close OUTPUT;
 641 }
# Finish the progress line left open by the "Generating ..." message, which
# was printed without a trailing newline.
print "\nDone.\n";