libjava/scripts/unicode-decomp.pl

   1 #!/usr/bin/perl -w
   2 # unicode-decomp.pl - script to generate database for java.text.Collator
   3 # Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
   4 #
   5 # This file is part of libjava.
   6 #
   7 # This software is copyrighted work licensed under the terms of the
   8 # Libjava License.  Please consult the file "LIBJAVA_LICENSE" for
   9 # details.
  10
# Code for reading UnicodeData.txt and generating the character
# decomposition tables (java-chardecomp.h) used by java.text.Collator.
# For now, the relevant Unicode definition files are found in
# libjava/gnu/gcj/convert/.
  14 #
# Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
#   where <UnicodeData.txt> is obtained from www.unicode.org (named
#   UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
#   is the final location of include/java-chardecomp.h.
  19 #   As of JDK 1.4, use Unicode version 3.0.0 for best results.
  20 #
  21 # If this exits with nonzero status, then you must investigate the
  22 # cause of the problem.
  23 # Diagnostics and other information to stderr.
  24 # With -n, the files are not created, but all processing still occurs.
  25
  26 # These maps characters to their decompositions.
  27 my %canonical_decomposition = ();
  28 my %full_decomposition = ();
  29
  30 # Handle `-n' and open output files.
  31 if ($ARGV[0] && $ARGV[0] eq '-n')
  32 {
  33     shift @ARGV;
  34     $ARGV[1] = '/dev/null';
  35 }
  36 die "Usage: $0 <UnicodeData.txt> <java-chardecomp.h>" unless @ARGV == 2;
  37 open (UNICODE, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
  38
  39 # Process the Unicode file.
  40 $| = 1;
  41 my $count = 0;
  42 print STDERR "Parsing attributes file";
  43 while (<UNICODE>)
  44 {
  45     print STDERR "." unless $count++ % 1000;
  46     chomp;
  47     s/\r//g;
  48     my ($ch, undef, undef, undef, undef, $decomp) = split ';';
  49     $ch = hex($ch);
  50
  51     if ($decomp ne '')
  52     {
  53         my $is_full = 0;
  54         my @decomp = ();
  55         foreach (split (' ', $decomp))
  56         {
  57             if (/^\<.*\>$/)
  58             {
  59                 $is_full = 1;
  60                 next;
  61             }
  62             push (@decomp, hex ($_));
  63         }
  64         my $s = pack "n*", @decomp;
  65         if ($is_full)
  66         {
  67             $full_decomposition{$ch} = $s;
  68         }
  69         else
  70         {
  71             $canonical_decomposition{$ch} = $s;
  72         }
  73     }
  74 }
  75
  76 # Now generate decomposition tables.
  77 open DECOMP, "> $ARGV[1]" or die "Can't open output file: $!\n";
  78 print STDERR "\nGenerating tables\n";
  79 print DECOMP <<EOF;
  80 // java-chardecomp.h - Decomposition character tables -*- c++ -*-
  81
  82 #ifndef __JAVA_CHARDECOMP_H__
  83 #define __JAVA_CHARDECOMP_H__
  84
  85
  86 // These tables are automatically generated by the $0
  87 // script.  DO NOT EDIT the tables.  Instead, fix the script
  88 // and run it again.
  89
  90 // This file should only be included by natCollator.cc
  91
  92 struct decomp_entry
  93 {
  94   jchar key;
  95   const char *value;
  96 };
  97
  98 EOF
  99
 100 &write_decompositions;
 101
 102 print DECOMP "#endif /* __JAVA_CHARDECOMP_H__ */\n";
 103
 104 close(DECOMP);
 105 print STDERR "Done\n";
 106 exit;
 107
 108
 109 # Write a single decomposition table.
 110 sub write_single_decomposition($$%)
 111 {
 112     my ($name, $is_canon, %table) = @_;
 113     my $first_line = 1;
 114     print DECOMP "static const decomp_entry ${name}_decomposition[] =\n{\n";
 115
 116     for my $key (0 .. 0xffff)
 117     {
 118         next if ! defined $table{$key};
 119         print DECOMP ",\n" unless $first_line;
 120         $first_line = 0;
 121
 122         printf DECOMP "  { 0x%04x, \"", $key;
 123
 124         # We represent the expansion as a series of bytes, terminated
 125         # with a double nul.  This is ugly, but relatively
 126         # space-efficient.  Most expansions are short, but there are a
 127         # few that are very long (e.g. \uFDFA).  This means that if we
 128         # chose a fixed-space representation we would waste a lot of
 129         # space.
 130         my @expansion = unpack "n*", $table{$key};
 131         foreach my $char (@expansion)
 132         {
 133             printf DECOMP "\\x%02x\\x%02x", ($char / 256), ($char % 256);
 134         }
 135
 136         print DECOMP "\" }";
 137     }
 138
 139     print DECOMP "\n};\n\n";
 140 }
 141
 142 sub write_decompositions()
 143 {
 144     &write_single_decomposition ('canonical', 1, %canonical_decomposition);
 145     &write_single_decomposition ('full', 0, %full_decomposition);
 146 }