Merge branch 'nocomapi'
[syslinux/sherbszt.git] / codepage / gensubset.pl
blob4dd7f2c1c1e2a6eb0e0b94fbf47358884f184ccc
#!/usr/bin/perl
#
# Generate a subset of the UnicodeData.txt file, available from
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage:
#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
# Each subset file is scanned for lines that, once a trailing "# comment"
# and trailing blanks are removed, consist of exactly two
# whitespace-separated fields.  The second field is read as a hexadecimal
# code point and marked as needed.  All other lines are ignored.
#
# UnicodeData.txt is read from standard input exactly once and held in
# memory, so the input may be a pipe as well as a regular file.
#

use strict;
use warnings;

# Run the script body only when executed directly; this keeps the helper
# subroutines loadable (e.g. via require) without touching STDIN/STDOUT.
main() unless caller;

# Entry point: collect the needed code points from the files named in
# @ARGV, extend that set with case variants, and print the matching
# UnicodeData.txt records to standard output.
sub main {
    my %need_these;

    # Mark as needed all the characters mentioned in the relevant files
    for my $file (@ARGV) {
        open(my $fh, '<', $file)
            or die "$0: cannot open $file: $!\n";
        mark_listed_characters($fh, \%need_these);
        close($fh);
    }

    # Two passes are made over the data; buffering it avoids having to
    # seek() on STDIN, which fails when the input is not seekable.
    my @ucd_lines = <STDIN>;

    mark_case_variants(\@ucd_lines, \%need_these);
    write_subset(\@ucd_lines, \%need_these, \*STDOUT);
    return;
}

# mark_listed_characters($fh, \%need)
#
# Read a subset file from $fh and increment $need->{CODEPOINT} for every
# line of the form "<anything> <hex code point>".  An optional "0x" prefix
# on the code point is accepted.  Lines with a different number of fields,
# or whose second field is not hexadecimal, are skipped.
sub mark_listed_characters {
    my ($fh, $need) = @_;

    while (defined(my $line = <$fh>)) {
        $line =~ s/\s*(\#.*|)$//;    # Remove comments and final blanks
        my @fields = split(/\s+/, $line);
        next if (scalar @fields != 2);
        next unless ($fields[1] =~ /^(?:0x)?[0-9a-f]+$/i);
        $need->{hex $fields[1]}++;
    }
    return;
}

# mark_case_variants(\@ucd_lines, \%need)
#
# For every UnicodeData.txt record whose code point is already needed,
# also mark the code points in fields 12, 13 and 14 (the simple
# uppercase, lowercase and titlecase mappings).
#
# (Note: this doesn't necessarily provide the full transitive closure,
# but we shouldn't need it.)
sub mark_case_variants {
    my ($ucd_lines, $need) = @_;

    for my $line (@$ucd_lines) {
        # Strip the line ending first: otherwise an empty final field is
        # seen as "\n", which is non-empty and converts to code point 0.
        (my $record = $line) =~ s/[\r\n]+\z//;

        # Limit -1 keeps trailing empty fields so the indices stay fixed.
        my @fields = split(/;/, $record, -1);
        next unless (defined $fields[0] && $fields[0] =~ /^[0-9a-f]+$/i);
        next unless ($need->{hex $fields[0]});

        for my $idx (12 .. 14) {
            my $mapping = $fields[$idx];
            next unless (defined $mapping && $mapping =~ /^[0-9a-f]+$/i);
            $need->{hex $mapping}++;
        }
    }
    return;
}

# write_subset(\@ucd_lines, \%need, $out_fh)
#
# Print to $out_fh every record whose code point is marked in %need.
# The code point is re-emitted as at least four upper-case hex digits;
# the remainder of the record, including its line ending, is copied
# through unchanged.  Lines without a ';' separator, or whose first
# field is not a code point, are dropped.
sub write_subset {
    my ($ucd_lines, $need, $out_fh) = @_;

    for my $line (@$ucd_lines) {
        my ($code, $rest) = split(/;/, $line, 2);
        next unless (defined $code && defined $rest);

        my ($first, $last);
        if ($code =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
            # This isn't actually the format... fix that if it ever matters
            ($first, $last) = (hex $1, hex $2);
        } elsif ($code =~ /^([0-9a-f]+)$/i) {
            $first = $last = hex $1;
        } else {
            next;
        }

        for my $cp ($first .. $last) {
            printf {$out_fh} "%04X;%s", $cp, $rest if ($need->{$cp});
        }
    }
    return;
}