2 # unicode-decomp.pl - script to generate database for java.text.Collator
3 # Copyright (C) 1998, 1999, 2002 Free Software Foundation, Inc.
5 # This file is part of libjava.
7 # This software is copyrighted work licensed under the terms of the
8 # Libjava License. Please consult the file "LIBJAVA_LICENSE" for
11 # Code for reading UnicodeData.txt and generating the decomposition
12 # tables in java-chardecomp.h. For now, the relevant Unicode definition
13 # files are found in libjava/gnu/gcj/convert/.
15 # Usage: ./unicode-decomp.pl [-n] <UnicodeData.txt> <decomp.h>
16 # where <UnicodeData.txt> is obtained from www.unicode.org (named
17 # UnicodeData-3.0.0.txt for Unicode version 3.0.0), and <decomp.h>
18 # is the final location of include/java-chardecomp.h.
19 # As of JDK 1.4, use Unicode version 3.0.0 for best results.
21 # If this exits with nonzero status, then you must investigate the
22 # cause of the problem.
23 # Diagnostics and other information to stderr.
24 # With -n, the files are not created, but all processing still occurs.
26 # These map characters to their decompositions.  Each table is keyed by
# $ch (the first `;'-separated field of an input line) and holds the
# decomposition as a string built with pack "n*".
27 my %canonical_decomposition = ();
28 my %full_decomposition = ();
30 # Handle `-n', check the argument count, and open the input file.
# NOTE(review): this copy of the script appears to be missing lines in
# this area (the braces of the `if' body, and whatever removes `-n' from
# @ARGV, are not visible) -- confirm against the upstream file.
31 if ($ARGV[0] && $ARGV[0] eq '-n')
# With -n the output path is replaced by /dev/null, so no file is created.
34 $ARGV[1] = '/dev/null';
# Exactly two arguments (input file, output file) must remain here.
36 die "Usage: $0 <UnicodeData.txt> <java-chardecomp.h>" unless @ARGV == 2;
# The parentheses around the open arguments make `||' test the result of
# open itself rather than the file name string.
37 open (UNICODE
, "< $ARGV[0]") || die "Can't open Unicode attribute file: $!\n";
39 # Process the Unicode file.
# NOTE(review): the loop header that reads from UNICODE, the declarations
# of $count and @decomp, and all block braces are not visible in this
# copy -- confirm against the upstream file before changing anything here.
42 print STDERR
"Parsing attributes file";
# Progress indicator: a dot is printed whenever $count is a multiple of
# 1000 (including the very first time), and $count is incremented each time.
45 print STDERR
"." unless $count++ % 1000;
# Split $_ on `;' and keep only the first field ($ch) and the sixth
# field ($decomp); the four fields in between are discarded.
48 my ($ch, undef, undef, undef, undef, $decomp) = split ';';
# $decomp is treated as a whitespace-separated list of items.
55 foreach (split (' ', $decomp))
# Each item that reaches this point is parsed as a hexadecimal number.
62 push (@decomp, hex ($_));
# pack "n*" stores every value as a 16-bit big-endian unsigned integer
# (two bytes per value); write_single_decomposition reads it back with
# the same template.
64 my $s = pack "n*", @decomp;
# NOTE(review): both assignments below are visible, but the condition that
# chooses between the full and the canonical table is not -- presumably
# only one of them runs for a given character; confirm.
67 $full_decomposition{$ch} = $s;
71 $canonical_decomposition{$ch} = $s;
76 # Now generate decomposition tables.
# The output file named by $ARGV[1] is opened for writing.  `or' binds
# more loosely than the comma, so die runs only when open itself fails.
# NOTE(review): the C++-looking lines further down (from the
# "java-chardecomp.h" banner to the "natCollator.cc" remark) are text that
# is written to the output, not Perl comments; "$0" in them is
# interpolated.  The statement that introduces that text and its
# terminator are not visible in this copy -- confirm against upstream.
77 open DECOMP
, "> $ARGV[1]" or die "Can't open output file: $!\n";
78 print STDERR
"\nGenerating tables\n";
80 // java-chardecomp.h - Decomposition character tables -*- c++ -*-
82 #ifndef __JAVA_CHARDECOMP_H__
83 #define __JAVA_CHARDECOMP_H__
86 // These tables are automatically generated by the $0
87 // script. DO NOT EDIT the tables. Instead, fix the script
90 // This file should only be included by natCollator.cc
100 &write_decompositions
;
# The call above uses the `&name;' form without an argument list, so the
# callee sees the caller's current @_ instead of a fresh, empty one.
# Emit the #endif that closes the include guard written earlier.
102 print DECOMP
"#endif /* __JAVA_CHARDECOMP_H__ */\n";
# Final progress message; like the other diagnostics it goes to stderr.
105 print STDERR
"Done\n";
109 # Write a single decomposition table to the DECOMP filehandle as a C
# array called "<name>_decomposition" whose elements are decomp_entry
# initializers.
#
# Arguments:
#   $name     - prefix used to build the name of the generated array.
#   $is_canon - flag supplied by the callers (1 for the canonical table,
#               0 for the full table); it is not read anywhere in the
#               visible body.
#   %table    - the decomposition table, passed as a flattened key/value
#               list.  It is looked up with the integers 0 .. 0xffff, and
#               its values are unpacked with the template "n*".
#
# NOTE(review): this copy is missing lines (all braces, the declaration
# and clearing of $first_line, and the text that closes each entry) --
# confirm against the upstream file before changing anything here.
110 sub write_single_decomposition
($$%)
112 my ($name, $is_canon, %table) = @_;
# Opening of the C array definition.
114 print DECOMP
"static const decomp_entry ${name}_decomposition[] =\n{\n";
# Walk the keys in ascending numeric order so that entries are emitted
# sorted by key; keys without a defined value are skipped.
116 for my $key (0 .. 0xffff)
118 next if ! defined $table{$key};
# Separate entries with ",\n"; nothing is printed while $first_line is true.
119 print DECOMP
",\n" unless $first_line;
# Start of one entry: the key as four hex digits, then the opening quote
# of the C string that holds the expansion.
122 printf DECOMP
" { 0x%04x, \"", $key;
124 # We represent the expansion as a series of bytes, terminated
125 # with a double nul. This is ugly, but relatively
126 # space-efficient. Most expansions are short, but there are a
127 # few that are very long (e.g. \uFDFA). This means that if we
128 # chose a fixed-space representation we would waste a lot of space.
130 my @expansion = unpack "n*", $table{$key};
131 foreach my $char (@expansion)
# Each 16-bit value becomes two C hex escapes: high byte first (%x formats
# the integer part of $char / 256), then the low byte ($char % 256).
133 printf DECOMP
"\\x%02x\\x%02x", ($char / 256), ($char % 256);
# Close the array definition and leave a blank line after it.
139 print DECOMP
"\n};\n\n";
# Write both decomposition tables to DECOMP: the canonical table first,
# then the full table.  The sub is declared with an empty prototype and
# reads the two file-level hashes directly.
#
# Each call uses the `&name(...)' form, which ignores the callee's
# prototype; the hash is expanded into a key/value list that the callee
# collects back into its own %table.
# NOTE(review): the braces of the sub body are not visible in this copy.
142 sub write_decompositions
()
144 &write_single_decomposition
('canonical', 1, %canonical_decomposition);
145 &write_single_decomposition
('full', 0, %full_decomposition);