3 # Generate a subset of the UnicodeData.txt file, available from
4 # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
7 # Usage: gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
12 # Mark as needed all the characters mentioned in the subset files named on the command line
# Pass 1: read every file named on the command line and record the code
# points it mentions in %need_these, keyed by numeric code point value.
13 foreach $file (@ARGV) {
# Open the file read-only; a failure aborts with a bare die (no message).
14 open(F
, '<', $file) or die;
15 while (defined($line = <F
>)) {
16 $line =~ s/\s*(\#.*|)$//; # Remove comments and final blanks
# Split on runs of whitespace; only lines with exactly two fields are used
# (blank and comment-only lines are skipped by this check).
17 @f = split(/\s+/, $line);
18 next if (scalar @f != 2);
# The second field is parsed as a hexadecimal code point and counted.
# NOTE(review): the meaning of the first field is not visible here --
# confirm against the subset file format.
19 $need_these{hex $f[1]}++;
24 # Also mark as needed any case variants of those
25 # (Note: this doesn't necessarily provide the full transitive closure,
26 # but we shouldn't need it.)
# Pass 2: scan the UnicodeData.txt records on STDIN and, for each character
# already marked as needed, also mark its case mappings as needed.
27 while (defined($line = <STDIN
>)) {
# Records are semicolon-separated; field 0 is the code point.
28 @f = split(/;/, $line);
# Only handle records whose first field is a single hexadecimal code point.
29 if ($f[0] =~ /^([0-9a-f]+)$/i) {
# NOTE(review): the assignment to $r is not visible in this view; presumably
# it is the numeric value of the code point captured above -- confirm.
31 if ($need_these{$r}) {
# Fields 12, 13 and 14 are the simple uppercase, lowercase and titlecase
# mappings in the UnicodeData.txt format; each non-empty one is marked.
# NOTE(review): no chomp is visible before the split, so $f[14] (the last
# field) appears to keep the line's trailing newline and would then never
# equal '' -- confirm whether that is intended.
32 $need_these{hex $f[12]}++ if ($f[12] ne '');
33 $need_these{hex $f[13]}++ if ($f[13] ne '');
34 $need_these{hex $f[14]}++ if ($f[14] ne '');
39 # Finally, write out the subset
# Pass 3: read the UnicodeData.txt records again and print the needed ones.
# NOTE(review): the previous loop reads STDIN to end-of-file; any rewind of
# STDIN before this loop is not visible in this view -- confirm.
41 while (defined($line = <STDIN
>)) {
# $v is the code point field; $l is the remainder of the record, unsplit
# (limit 2 stops split after the first ';').
42 ($v, $l) = split(/;/, $line, 2);
# Branch 1: a hyphenated "first-last" range of hexadecimal code points.
43 if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
44 # This isn't actually the format... fix that if it ever matters
# (UnicodeData.txt itself marks ranges with paired "<..., First>" and
# "<..., Last>" records rather than a hyphenated code point field.)
# Branch 2: a single hexadecimal code point.
47 } elsif ($v =~ /^([0-9a-f]+)$/i) {
# NOTE(review): the assignments to $r1 and $r2 are not visible in this view;
# presumably they hold the bounds matched above -- confirm.
# Emit one record per needed code point from $r1 through $r2: the code point
# as at least four uppercase hex digits, then ';' and the rest of the record.
52 for ($r = $r1; $r <= $r2; $r++) {
53 printf "%04X;%s", $r, $l if ($need_these{$r});