tools/make_unicode

   1 #!/usr/bin/perl -w
   2 #
   3 # Generate code page .c files from ftp.unicode.org descriptions
   4 #
   5 # Copyright 2000 Alexandre Julliard
   6 #
   7 # This library is free software; you can redistribute it and/or
   8 # modify it under the terms of the GNU Lesser General Public
   9 # License as published by the Free Software Foundation; either
  10 # version 2.1 of the License, or (at your option) any later version.
  11 #
  12 # This library is distributed in the hope that it will be useful,
  13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 # Lesser General Public License for more details.
  16 #
  17 # You should have received a copy of the GNU Lesser General Public
  18 # License along with this library; if not, write to the Free Software
  19 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
  20 #
  21
  22 use strict;
  23
  24 # base URLs for www.unicode.org files
  25 my $UNIVERSION = "14.0.0";
  26 my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
  27 my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
  28 my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
  29 my $KSCDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
  30 my $REPORTS = "http://www.unicode.org/reports";
  31 my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
  32 my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
  33
  34 # Sort keys file
  35 my $SORTKEYS = "tr10/allkeys.txt";
  36
  37 # Default char for undefined mappings
  38 my $DEF_CHAR = ord '?';
  39
  40 # Last valid Unicode character
  41 my $MAX_CHAR = 0x10ffff;
  42
  43 my @allfiles =
  44 (
  45     "CodpageFiles/037.txt",
  46     "CodpageFiles/437.txt",
  47     "CodpageFiles/500.txt",
  48     "CodpageFiles/708.txt",
  49     "CodpageFiles/720.txt",
  50     "CodpageFiles/737.txt",
  51     "CodpageFiles/775.txt",
  52     "CodpageFiles/850.txt",
  53     "CodpageFiles/852.txt",
  54     "CodpageFiles/855.txt",
  55     "CodpageFiles/857.txt",
  56     "CodpageFiles/860.txt",
  57     "CodpageFiles/861.txt",
  58     "CodpageFiles/862.txt",
  59     "CodpageFiles/863.txt",
  60     "CodpageFiles/864.txt",
  61     "CodpageFiles/865.txt",
  62     "CodpageFiles/866.txt",
  63     "CodpageFiles/869.txt",
  64     "CodpageFiles/874.txt",
  65     "CodpageFiles/875.txt",
  66     "CodpageFiles/932.txt",
  67     "CodpageFiles/936.txt",
  68     "CodpageFiles/949.txt",
  69     "CodpageFiles/950.txt",
  70     "CodpageFiles/1026.txt",
  71     "CodpageFiles/1250.txt",
  72     "CodpageFiles/1251.txt",
  73     "CodpageFiles/1252.txt",
  74     "CodpageFiles/1253.txt",
  75     "CodpageFiles/1254.txt",
  76     "CodpageFiles/1255.txt",
  77     "CodpageFiles/1256.txt",
  78     "CodpageFiles/1257.txt",
  79     "CodpageFiles/1258.txt",
  80     "CodpageFiles/1361.txt",
  81     "CodpageFiles/10000.txt",
  82     "CodpageFiles/10001.txt",
  83     "CodpageFiles/10002.txt",
  84     "CodpageFiles/10003.txt",
  85     "CodpageFiles/10004.txt",
  86     "CodpageFiles/10005.txt",
  87     "CodpageFiles/10006.txt",
  88     "CodpageFiles/10007.txt",
  89     "CodpageFiles/10008.txt",
  90     "CodpageFiles/10010.txt",
  91     "CodpageFiles/10017.txt",
  92     "CodpageFiles/10021.txt",
  93     "CodpageFiles/10029.txt",
  94     "CodpageFiles/10079.txt",
  95     "CodpageFiles/10081.txt",
  96     "CodpageFiles/10082.txt",
  97     "CodpageFiles/20127.txt",
  98     "CodpageFiles/20866.txt",
  99     "CodpageFiles/21866.txt",
 100     "CodpageFiles/28591.txt",
 101     "CodpageFiles/28592.txt",
 102     "CodpageFiles/28593.txt",
 103     "CodpageFiles/28594.txt",
 104     "CodpageFiles/28595.txt",
 105     "CodpageFiles/28596.txt",
 106     "CodpageFiles/28597.txt",
 107     "CodpageFiles/28598.txt",
 108     "CodpageFiles/28599.txt",
 109     "CodpageFiles/28603.txt",
 110     "CodpageFiles/28605.txt",
 111 );
 112
 113
 114 my %ctype =
 115 (
 116      # CT_CTYPE1
 117     "upper"  => 0x0001,
 118     "lower"  => 0x0002,
 119     "digit"  => 0x0004,
 120     "space"  => 0x0008,
 121     "punct"  => 0x0010,
 122     "cntrl"  => 0x0020,
 123     "blank"  => 0x0040,
 124     "xdigit" => 0x0080,
 125     "alpha"  => 0x0100 | 0x80000000,
 126     "defin"  => 0x0200,
 127      # CT_CTYPE3 in high 16 bits
 128     "nonspacing"    => 0x00010000,
 129     "diacritic"     => 0x00020000,
 130     "vowelmark"     => 0x00040000,
 131     "symbol"        => 0x00080000,
 132     "katakana"      => 0x00100000,
 133     "hiragana"      => 0x00200000,
 134     "halfwidth"     => 0x00400000,
 135     "fullwidth"     => 0x00800000,
 136     "ideograph"     => 0x01000000,
 137     "kashida"       => 0x02000000,
 138     "lexical"       => 0x04000000,
 139     "highsurrogate" => 0x08000000,
 140     "lowsurrogate"  => 0x10000000,
 141 );
 142
 143 my %bracket_types =
 144 (
 145     "o" => 0x0000,
 146     "c" => 0x0001,
 147 );
 148
 149 my %indic_types =
 150 (
 151     "Other"    => 0x0000,
 152     "Bindu"    => 0x0001,
 153     "Visarga"  => 0x0002,
 154     "Avagraha" => 0x0003,
 155     "Nukta"    => 0x0004,
 156     "Virama"   => 0x0005,
 157     "Vowel_Independent"  => 0x0006,
 158     "Vowel_Dependent"  => 0x0007,
 159     "Vowel"  => 0x0008,
 160     "Consonant_Placeholder"  => 0x0009,
 161     "Consonant"  => 0x000a,
 162     "Consonant_Dead"  => 0x000b,
 163     "Consonant_Succeeding_Repha" => 0x000c,
 164     "Consonant_Subjoined"  => 0x000d,
 165     "Consonant_Medial"  => 0x000e,
 166     "Consonant_Final"  => 0x000f,
 167     "Consonant_Head_Letter"  => 0x0010,
 168     "Modifying_Letter"  => 0x0011,
 169     "Tone_Letter"  => 0x0012,
 170     "Tone_Mark"  => 0x0013,
 171     "Register_Shifter"  => 0x0014,
 172     "Consonant_Preceding_Repha" => 0x0015,
 173     "Pure_Killer" => 0x0016,
 174     "Invisible_Stacker" => 0x0017,
 175     "Gemination_Mark" => 0x0018,
 176     "Cantillation_Mark" => 0x0019,
 177     "Non_Joiner" => 0x001a,
 178     "Joiner" => 0x001b,
 179     "Number_Joiner" => 0x001c,
 180     "Number" => 0x001d,
 181     "Brahmi_Joining_Number" => 0x001e,
 182     "Consonant_With_Stacker" => 0x001f,
 183     "Consonant_Prefixed" => 0x0020,
 184     "Syllable_Modifier" => 0x0021,
 185     "Consonant_Killer" => 0x0022,
 186     "Consonant_Initial_Postfixed" => 0x0023,
 187 );
 188
 189 my %matra_types =
 190 (
 191     "Right"    => 0x01,
 192     "Left"  => 0x02,
 193     "Visual_Order_Left" => 0x03,
 194     "Left_And_Right"    => 0x04,
 195     "Top"   => 0x05,
 196     "Bottom"  => 0x06,
 197     "Top_And_Bottom"  => 0x07,
 198     "Top_And_Right"  => 0x08,
 199     "Top_And_Left"  => 0x09,
 200     "Top_And_Left_And_Right"  => 0x0a,
 201     "Bottom_And_Right"  => 0x0b,
 202     "Top_And_Bottom_And_Right"  => 0x0c,
 203     "Overstruck"  => 0x0d,
 204     "Invisible"  => 0x0e,
 205     "Bottom_And_Left"  => 0x0f,
 206     "Top_And_Bottom_And_Left"  => 0x10,
 207 );
 208
 209 my %break_types =
 210 (
 211     "BK"  => 0x0001,
 212     "CR"  => 0x0002,
 213     "LF"  => 0x0003,
 214     "CM"  => 0x0004,
 215     "SG"  => 0x0005,
 216     "GL"  => 0x0006,
 217     "CB"  => 0x0007,
 218     "SP"  => 0x0008,
 219     "ZW"  => 0x0009,
 220     "NL"  => 0x000a,
 221     "WJ"  => 0x000b,
 222     "JL"  => 0x000c,
 223     "JV"  => 0x000d,
 224     "JT"  => 0x000e,
 225     "H2"  => 0x000f,
 226     "H3"  => 0x0010,
 227     "XX"  => 0x0011,
 228     "OP"  => 0x0012,
 229     "CL"  => 0x0013,
 230     "CP"  => 0x0014,
 231     "QU"  => 0x0015,
 232     "NS"  => 0x0016,
 233     "EX"  => 0x0017,
 234     "SY"  => 0x0018,
 235     "IS"  => 0x0019,
 236     "PR"  => 0x001a,
 237     "PO"  => 0x001b,
 238     "NU"  => 0x001c,
 239     "AL"  => 0x001d,
 240     "ID"  => 0x001e,
 241     "IN"  => 0x001f,
 242     "HY"  => 0x0020,
 243     "BB"  => 0x0021,
 244     "BA"  => 0x0022,
 245     "SA"  => 0x0023,
 246     "AI"  => 0x0024,
 247     "B2"  => 0x0025,
 248     "HL"  => 0x0026,
 249     "CJ"  => 0x0027,
 250     "RI"  => 0x0028,
 251     "EB"  => 0x0029,
 252     "EM"  => 0x002a,
 253     "ZWJ" => 0x002b,
 254 );
 255
 256 my %vertical_types =
 257 (
 258     "R"  => 0x0000,
 259     "U"  => 0x0001,
 260     "Tr" => 0x0002,
 261     "Tu" => 0x0003,
 262 );
 263
 264 my %categories =
 265 (
 266     "Lu" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
 267     "Ll" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
 268     "Lt" => $ctype{"defin"}|$ctype{"alpha"}|$ctype{"upper"}|$ctype{"lower"},    # Letter, Titlecase
 269     "Mn" => $ctype{"defin"}|$ctype{"nonspacing"}, # Mark, Non-Spacing
 270     "Mc" => $ctype{"defin"},                    # Mark, Spacing Combining
 271     "Me" => $ctype{"defin"},                    # Mark, Enclosing
 272     "Nd" => $ctype{"defin"}|$ctype{"digit"},    # Number, Decimal Digit
 273     "Nl" => $ctype{"defin"}|$ctype{"alpha"},    # Number, Letter
 274     "No" => $ctype{"defin"},                    # Number, Other
 275     "Zs" => $ctype{"defin"}|$ctype{"space"},    # Separator, Space
 276     "Zl" => $ctype{"defin"}|$ctype{"space"},    # Separator, Line
 277     "Zp" => $ctype{"defin"}|$ctype{"space"},    # Separator, Paragraph
 278     "Cc" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Control
 279     "Cf" => $ctype{"defin"}|$ctype{"cntrl"},    # Other, Format
 280     "Cs" => $ctype{"defin"},                    # Other, Surrogate
 281     "Co" => $ctype{"defin"},                    # Other, Private Use
 282     "Cn" => $ctype{"defin"},                    # Other, Not Assigned
 283     "Lm" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Modifier
 284     "Lo" => $ctype{"defin"}|$ctype{"alpha"},    # Letter, Other
 285     "Pc" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Connector
 286     "Pd" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Dash
 287     "Ps" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Open
 288     "Pe" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Close
 289     "Pi" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Initial quote
 290     "Pf" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Final quote
 291     "Po" => $ctype{"defin"}|$ctype{"punct"},    # Punctuation, Other
 292     "Sm" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Math
 293     "Sc" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Currency
 294     "Sk" => $ctype{"defin"}|$ctype{"symbol"},   # Symbol, Modifier
 295     "So" => $ctype{"defin"}|$ctype{"symbol"}    # Symbol, Other
 296 );
 297
 298 # a few characters need additional categories that cannot be determined automatically
 299 my %special_categories =
 300 (
 301     "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
 302                   0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
 303     "space"  => [ 0x09..0x0d, 0x85 ],
 304     "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
 305     "cntrl"  => [ 0x070f, 0x200c, 0x200d,
 306                   0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
 307                   0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
 308                   0xfff9, 0xfffa, 0xfffb ],
 309     "punct"  => [ 0x24, 0x2b, 0x3c..0x3e, 0x5e, 0x60, 0x7c, 0x7e, 0xa2..0xbe,
 310                   0xd7, 0xf7 ],
 311     "digit"  => [ 0xb2, 0xb3, 0xb9 ],
 312     "lower"  => [ 0xaa, 0xba, 0x2071, 0x207f ],
 313     "nonspacing" => [ 0xc0..0xc5, 0xc7..0xcf, 0xd1..0xd6, 0xd8..0xdd, 0xe0..0xe5, 0xe7..0xef,
 314                       0xf1..0xf6, 0xf8..0xfd, 0xff, 0x6de, 0x1929..0x192b, 0x302e..0x302f ],
 315     "diacritic" => [ 0x5e, 0x60, 0xb7, 0xd8, 0xf8 ],
 316     "symbol" => [ 0x09..0x0d, 0x20..0x23, 0x25, 0x26, 0x28..0x2a, 0x2c, 0x2e..0x2f, 0x3a..0x40,
 317                   0x5b..0x60, 0x7b..0x7e, 0xa0..0xa9, 0xab..0xb1, 0xb4..0xb8, 0xbb, 0xbf,
 318                   0x02b9..0x02ba, 0x02c6..0x02cf ],
 319     "halfwidth" => [ 0x20..0x7e, 0xa2..0xa3, 0xa5..0xa6, 0xac, 0xaf, 0x20a9 ],
 320     "fullwidth" => [ 0x2018..0x2019, 0x201c..0x201d, 0x3000..0x3002, 0x300c..0x300d, 0x309b..0x309c,
 321                      0x30a1..0x30ab, 0x30ad, 0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9,
 322                      0x30bb, 0x30bd, 0x30bf, 0x30c1, 0x30c3, 0x30c4, 0x30c6, 0x30c8, 0x30ca..0x30cf,
 323                      0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de..0x30ed, 0x30ef, 0x30f2..0x30f3, 0x30fb,
 324                      0x3131..0x3164 ],
 325     "ideograph" => [ 0x3006..0x3007 ],
 326     "lexical" => [ 0x22, 0x24, 0x27, 0x2d, 0x2f, 0x3d, 0x40, 0x5c, 0x5e..0x60, 0x7e,
 327                    0xa8, 0xaa, 0xad, 0xaf, 0xb4, 0xb8, 0xba,
 328                    0x02b0..0x02b8, 0x02bc, 0x02c7, 0x02ca..0x02cb, 0x02cf, 0x02d8..0x02dd, 0x02e0..0x02e3,
 329                    0x037a, 0x0384..0x0385, 0x0387, 0x0559..0x055a, 0x0640, 0x1fbd..0x1fc1,
 330                    0x1fcd..0x1fcf, 0x1fdd..0x1fdf, 0x1fed..0x1fef, 0x1ffd..0x1ffe, 0x2010..0x2015,
 331                    0x2032..0x2034, 0x2038, 0x2043..0x2044, 0x207b..0x207c, 0x207f, 0x208b..0x208c,
 332                    0x2212, 0x2215..0x2216, 0x2500, 0x2504..0x2505, 0x2508..0x2509, 0x254c..0x254d,
 333                    0x3003, 0x301c, 0x3030..0x3035, 0x309b..0x309e, 0x30fd..0x30fe, 0xfe31..0xfe32,
 334                    0xfe58, 0xfe63, 0xfe66, 0xfe68..0xfe69, 0xfe6b, 0xff04, 0xff07, 0xff0d, 0xff0f,
 335                    0xff1d, 0xff20, 0xff3c, 0xff3e, 0xff40, 0xff5e ],
 336     "kashida" => [ 0x0640 ],
 337 );
 338
 339 my %directions =
 340 (
 341     "L"   => 1,    # Left-to-Right
 342     "R"   => 2,    # Right-to-Left
 343     "AL"  => 12,   # Right-to-Left Arabic
 344     "EN"  => 3,    # European Number
 345     "ES"  => 4,    # European Number Separator
 346     "ET"  => 5,    # European Number Terminator
 347     "AN"  => 6,    # Arabic Number
 348     "CS"  => 7,    # Common Number Separator
 349     "NSM" => 13,   # Non-Spacing Mark
 350     "BN"  => 14,   # Boundary Neutral
 351     "B"   => 8,    # Paragraph Separator
 352     "S"   => 9,    # Segment Separator
 353     "WS"  => 10,   # Whitespace
 354     "ON"  => 11,   # Other Neutrals
 355     "LRE" => 15,   # Left-to-Right Embedding
 356     "LRO" => 15,   # Left-to-Right Override
 357     "RLE" => 15,   # Right-to-Left Embedding
 358     "RLO" => 15,   # Right-to-Left Override
 359     "PDF" => 15,   # Pop Directional Format
 360     "LRI" => 15,   # Left-to-Right Isolate
 361     "RLI" => 15,   # Right-to-Left Isolate
 362     "FSI" => 15,   # First Strong Isolate
 363     "PDI" => 15    # Pop Directional Isolate
 364 );
 365
 366 my %c2_types =
 367 (
 368     "L"   => 1,    # C2_LEFTTORIGHT
 369     "R"   => 2,    # C2_RIGHTTOLEFT
 370     "AL"  => 2,    # C2_RIGHTTOLEFT
 371     "EN"  => 3,    # C2_EUROPENUMBER
 372     "ES"  => 4,    # C2_EUROPESEPARATOR
 373     "ET"  => 5,    # C2_EUROPETERMINATOR
 374     "AN"  => 6,    # C2_ARABICNUMBER
 375     "CS"  => 7,    # C2_COMMONSEPARATOR
 376     "NSM" => 11,   # C2_OTHERNEUTRAL
 377     "BN"  => 0,    # C2_NOTAPPLICABLE
 378     "B"   => 8,    # C2_BLOCKSEPARATOR
 379     "S"   => 9,    # C2_SEGMENTSEPARATOR
 380     "WS"  => 10,   # C2_WHITESPACE
 381     "ON"  => 11,   # C2_OTHERNEUTRAL
 382     "LRE" => 11,   # C2_OTHERNEUTRAL
 383     "LRO" => 11,   # C2_OTHERNEUTRAL
 384     "RLE" => 11,   # C2_OTHERNEUTRAL
 385     "RLO" => 11,   # C2_OTHERNEUTRAL
 386     "PDF" => 11,   # C2_OTHERNEUTRAL
 387     "LRI" => 11,   # C2_OTHERNEUTRAL
 388     "RLI" => 11,   # C2_OTHERNEUTRAL
 389     "FSI" => 11,   # C2_OTHERNEUTRAL
 390     "PDI" => 11    # C2_OTHERNEUTRAL
 391 );
 392
 393 my %bidi_types =
 394 (
 395     "ON"  => 0,    # Other Neutrals
 396     "L"   => 1,    # Left-to-Right
 397     "R"   => 2,    # Right-to-Left
 398     "AN"  => 3,    # Arabic Number
 399     "EN"  => 4,    # European Number
 400     "AL"  => 5,    # Right-to-Left Arabic
 401     "NSM" => 6,    # Non-Spacing Mark
 402     "CS"  => 7,    # Common Number Separator
 403     "ES"  => 8,    # European Number Separator
 404     "ET"  => 9,    # European Number Terminator
 405     "BN"  => 10,   # Boundary Neutral
 406     "S"   => 11,   # Segment Separator
 407     "WS"  => 12,   # Whitespace
 408     "B"   => 13,   # Paragraph Separator
 409     "RLO" => 14,   # Right-to-Left Override
 410     "RLE" => 15,   # Right-to-Left Embedding
 411     "LRO" => 16,   # Left-to-Right Override
 412     "LRE" => 17,   # Left-to-Right Embedding
 413     "PDF" => 18,   # Pop Directional Format
 414     "LRI" => 19,   # Left-to-Right Isolate
 415     "RLI" => 20,   # Right-to-Left Isolate
 416     "FSI" => 21,   # First Strong Isolate
 417     "PDI" => 22    # Pop Directional Isolate
 418 );
 419
 420 my %joining_types =
 421 (
 422    "U" => 0,           # Non_Joining
 423    "L" => 1,           # Left_Joining
 424    "R" => 2,           # Right_Joining
 425    "D" => 3,           # Dual_Joining
 426    "C" => 3,           # Join_Causing
 427    "ALAPH" => 4,       # Syriac ALAPH
 428    "DALATH RISH" => 5, # Syriac DALATH RISH group
 429    "T" => 6,           # Transparent
 430 );
 431
 432 my @cp2uni = ();
 433 my @glyph2uni = ();
 434 my @lead_bytes = ();
 435 my @uni2cp = ();
 436 my @tolower_table = ();
 437 my @toupper_table = ();
 438 my @digitmap_table = ();
 439 my @category_table = ();
 440 my @initial_joining_table = ();
 441 my @direction_table = ();
 442 my @decomp_table = ();
 443 my @combining_class_table = ();
 444 my @decomp_compat_table = ();
 445 my @comp_exclusions = ();
 446 my @idna_decomp_table = ();
 447 my @idna_disallowed = ();
 448 my %registry_keys;
 449 my $default_char;
 450 my $default_wchar;
 451
 452 my %joining_forms =
 453 (
 454    "isolated" => [],
 455    "final" => [],
 456    "initial" => [],
 457    "medial" => []
 458 );
 459
 460 sub to_utf16(@)
 461 {
 462     my @ret;
 463     foreach my $ch (@_)
 464     {
 465         if ($ch < 0x10000)
 466         {
 467             push @ret, $ch;
 468         }
 469         else
 470         {
 471             my $val = $ch - 0x10000;
 472             push @ret, 0xd800 | ($val >> 10), 0xdc00 | ($val & 0x3ff);
 473         }
 474     }
 475     return @ret;
 476 }
 477
 478 ################################################################
 479 # fetch a unicode.org file and open it
 480 sub open_data_file($$)
 481 {
 482     my ($base, $name) = @_;
 483     my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
 484     (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
 485     my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
 486     local *FILE;
 487
 488     if ($base =~ /.*\/([^\/]+)\.zip$/)
 489     {
 490         my $zip = "$1$suffix.zip";
 491         unless (-f "$cache/$zip")
 492         {
 493             system "mkdir", "-p", $cache;
 494             print "Fetching $base...\n";
 495             !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
 496         }
 497         open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
 498     }
 499     else
 500     {
 501         (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
 502         unless (-f $dest)
 503         {
 504             system "mkdir", "-p", $dir;
 505             print "Fetching $base/$name...\n";
 506             !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
 507         }
 508         open FILE, "<$dest" or die "cannot open $dest";
 509     }
 510     return *FILE;
 511 }
 512
 513 ################################################################
 514 # recursively get the decomposition for a character
 515 sub get_decomposition($$);
 516 sub get_decomposition($$)
 517 {
 518     my ($char, $table) = @_;
 519     my @ret;
 520
 521     return $char unless defined $table->[$char];
 522     foreach my $ch (@{$table->[$char]})
 523     {
 524         push @ret, get_decomposition( $ch, $table );
 525     }
 526     return @ret;
 527 }
 528
 529 ################################################################
 530 # get the composition that results in a given character
 531 sub get_composition($$)
 532 {
 533     my ($ch, $compat) = @_;
 534     return () unless defined $decomp_table[$ch];  # no decomposition
 535     my @ret = @{$decomp_table[$ch]};
 536     return () if @ret < 2;                        # singleton decomposition
 537     return () if $comp_exclusions[$ch];           # composition exclusion
 538     return () if $combining_class_table[$ch];     # non-starter
 539     return () if $combining_class_table[$ret[0]]; # first char is non-starter
 540     return () if $compat == 1 && !defined $decomp_table[$ret[0]] &&
 541         defined $decomp_compat_table[$ret[0]];    # first char has compat decomposition
 542     return () if $compat == 2 && !defined $decomp_table[$ret[0]] &&
 543         defined $idna_decomp_table[$ret[0]];      # first char has IDNA decomposition
 544     return () if $compat == 2 && defined $idna_decomp_table[$ret[0]] &&
 545         defined $idna_decomp_table[$idna_decomp_table[$ret[0]]->[0]];  # first char's decomposition has IDNA decomposition
 546     return () if $compat == 2 && defined $idna_decomp_table[$ret[1]];  # second char has IDNA decomposition
 547     return @ret;
 548 }
 549
 550 ################################################################
 551 # recursively build decompositions
 552 sub build_decompositions(@)
 553 {
 554     my @src = @_;
 555     my @dst;
 556
 557     for (my $i = 0; $i < @src; $i++)
 558     {
 559         next unless defined $src[$i];
 560         my @decomp = to_utf16( get_decomposition( $i, \@src ));
 561         $dst[$i] = \@decomp;
 562     }
 563     return @dst;
 564 }
 565
 566 ################################################################
 567 # compose Hangul sequences
 568 sub compose_hangul(@)
 569 {
 570     my $SBASE  = 0xac00;
 571     my $LBASE  = 0x1100;
 572     my $VBASE  = 0x1161;
 573     my $TBASE  = 0x11a7;
 574     my $LCOUNT = 19;
 575     my $VCOUNT = 21;
 576     my $TCOUNT = 28;
 577     my $NCOUNT = $VCOUNT * $TCOUNT;
 578     my $SCOUNT = $LCOUNT * $NCOUNT;
 579
 580     my @seq = @_;
 581     my @ret;
 582     my $i;
 583
 584     for ($i = 0; $i < @seq; $i++)
 585     {
 586         my $ch = $seq[$i];
 587         if ($ch >= $LBASE && $ch < $LBASE + $LCOUNT && $i < @seq - 1 &&
 588             $seq[$i+1] >= $VBASE && $seq[$i+1] < $VBASE + $VCOUNT)
 589         {
 590             $ch = $SBASE + (($seq[$i] - $LBASE) * $VCOUNT + ($seq[$i+1] - $VBASE)) * $TCOUNT;
 591             $i++;
 592         }
 593         if ($ch >= $SBASE && $ch < $SBASE + $SCOUNT && !(($ch - $SBASE) % $TCOUNT) && $i < @seq - 1 &&
 594             $seq[$i+1] > $TBASE && $seq[$i+1] < $TBASE + $TCOUNT)
 595         {
 596             $ch += $seq[$i+1] - $TBASE;
 597             $i++;
 598         }
 599         push @ret, $ch;
 600     }
 601     return @ret;
 602 }
 603
 604 ################################################################
 605 # remove linguistic-only mappings from the case table
 606 sub remove_linguistic_mappings($$)
 607 {
 608     my ($upper, $lower) = @_;
 609
 610     # remove case mappings that don't round-trip
 611
 612     for (my $i = 0; $i < @{$upper}; $i++)
 613     {
 614         next unless defined ${$upper}[$i];
 615         my $ch = ${$upper}[$i];
 616         ${$upper}[$i] = undef unless defined ${$lower}[$ch] && ${$lower}[$ch] == $i;
 617     }
 618     for (my $i = 0; $i < @{$lower}; $i++)
 619     {
 620         next unless defined ${$lower}[$i];
 621         my $ch = ${$lower}[$i];
 622         ${$lower}[$i] = undef unless defined ${$upper}[$ch] && ${$upper}[$ch] == $i;
 623     }
 624 }
 625
 626 ################################################################
# read in the Unicode database files
#
# Reads UnicodeData.txt, CompositionExclusions.txt and IdnaMappingTable.txt
# through open_data_file() and fills in the file-level tables: categories,
# directions, initial joining types, case mappings, digit map, combining
# classes, canonical/compatibility/IDNA decompositions, composition
# exclusions, joining forms and the IDNA disallowed list.
# Dies on an unknown general category or directionality.
sub load_data()
{
    # code point of the last "<..., First>" entry seen in UnicodeData.txt
    my $start;

    # now build mappings from the decomposition field of the Unicode database

    my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
    while (<$UNICODE_DATA>)
    {
        # Decode the fields ...
        my ($code, $name, $cat, $comb, $bidi,
            $decomp, $dec, $dig, $num, $mirror,
            $oldname, $comment, $upper, $lower, $title) = split /;/;
        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $category_table[$src] = $categories{$cat};
        # the direction table stores the class name, not its numeric code
        $direction_table[$src] = $bidi;
        if ($cat eq "Mn" || $cat eq "Me" || $cat eq "Cf")
        {
            $initial_joining_table[$src] = $joining_types{"T"};
        }
        else
        {
            $initial_joining_table[$src] = $joining_types{"U"};
        }

        if ($lower ne "")
        {
            $tolower_table[$src] = hex $lower;
        }
        if ($upper ne "")
        {
            $toupper_table[$src] = hex $upper;
        }
        if ($dec ne "")
        {
            $category_table[$src] |= $ctype{"digit"};
        }
        if ($dig ne "")
        {
            # stored as the character code of the digit, not its numeric value
            $digitmap_table[$src] = ord $dig;
        }
        $combining_class_table[$src] = ($cat ne "Co") ? $comb : 0x100; # Private Use

        # additional flags derived from the directionality and the character names
        $category_table[$src] |= $ctype{"nonspacing"}    if $bidi eq "NSM";
        # NOTE(review): the alternation is not grouped, so this matches names
        # that start with "COMBINING", or that contain "MODIFIER LETTER"
        # followed by a non-word character anywhere -- confirm this is intended
        $category_table[$src] |= $ctype{"diacritic"}     if $name =~ /^(COMBINING)|(MODIFIER LETTER)\W/;
        $category_table[$src] |= $ctype{"vowelmark"}     if $name =~ /\sVOWEL/ || $oldname =~ /\sVOWEL/;
        $category_table[$src] |= $ctype{"halfwidth"}     if $name =~ /^HALFWIDTH\s/;
        $category_table[$src] |= $ctype{"fullwidth"}     if $name =~ /^FULLWIDTH\s/;
        $category_table[$src] |= $ctype{"hiragana"}      if $name =~ /(HIRAGANA)|(\WKANA\W)/;
        $category_table[$src] |= $ctype{"katakana"}      if $name =~ /(KATAKANA)|(\WKANA\W)/;
        $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^<CJK Ideograph/;
        $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^CJK COMPATIBILITY IDEOGRAPH/;
        $category_table[$src] |= $ctype{"ideograph"}     if $name =~ /^HANGZHOU/;
        $category_table[$src] |= $ctype{"highsurrogate"} if $name =~ /High Surrogate/;
        $category_table[$src] |= $ctype{"lowsurrogate"}  if $name =~ /Low Surrogate/;

        # copy the category and direction for everything between First/Last pairs
        # (the values of the Last entry also replace those of the First entry)
        if ($name =~ /, First>/) { $start = $src; }
        if ($name =~ /, Last>/)
        {
            while ($start < $src)
            {
                $category_table[$start] = $category_table[$src];
                $direction_table[$start] = $direction_table[$src];
                $combining_class_table[$start] = $combining_class_table[$src];
                $start++;
            }
        }

        next if $decomp eq "";  # no decomposition, skip it

        # tagged decomposition "<tag> 1234 ...": store the sequence that
        # follows the tag as compatibility decomposition
        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)/)
        {
            my @seq = map { hex $_; } (split /\s+/, (split /\s+/, $decomp, 2)[1]);
            $decomp_compat_table[$src] = \@seq;
        }

        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
        {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            if ($1 eq "isolated" || $1 eq "final" || $1 eq "initial" || $1 eq "medial")
            {
                # maps the base character to its presentation form
                ${joining_forms{$1}}[hex $2] = $src;
            }
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
        {
            # decomposition "<compat> 0020 1234" -> combining accent
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)/)
        {
            # store decomposition
            # (untagged decompositions of more than two characters are not stored)
            if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
            {
                $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1, hex $2 ];
            }
            elsif ($decomp =~ /^([0-9a-fA-F]+)$/)
            {
                # Single char decomposition
                $decomp_table[$src] = $decomp_compat_table[$src] = [ hex $1 ];
            }
        }
    }
    close $UNICODE_DATA;

    # patch the category of some special characters

    # a character with a canonical decomposition also gets the flags of the
    # first character of that decomposition
    for (my $i = 0; $i < @decomp_table; $i++)
    {
        next unless defined $decomp_table[$i];
        $category_table[$i] |= $category_table[$decomp_table[$i]->[0]];
    }
    foreach my $cat (keys %special_categories)
    {
        my $flag = $ctype{$cat};
        foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
    }
    # a character with a two-character decomposition gets the diacritic flag
    # when the second character has it
    for (my $i = 0; $i < @decomp_compat_table; $i++)
    {
        next unless defined $decomp_compat_table[$i];
        next unless @{$decomp_compat_table[$i]} == 2;
        $category_table[$i] |= $category_table[$decomp_compat_table[$i]->[1]] & $ctype{"diacritic"};
    }

    # load the composition exclusions

    my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
    while (<$EXCL>)
    {
        s/\#.*//;  # remove comments
        if (/^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s*$/)
        {
            foreach my $i (hex $1 .. hex $2) { $comp_exclusions[$i] = 1; }
        }
        elsif (/^([0-9a-fA-F]+)\s*$/)
        {
            $comp_exclusions[hex $1] = 1;
        }
    }
    close $EXCL;

    # load the IDNA mappings

    # start from the compatibility decompositions, then apply the IDNA table
    @idna_decomp_table = @decomp_compat_table;
    my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
    while (<$IDNA>)
    {
        s/\#.*//;  # remove comments
        next if /^\s*$/;
        my ($char, $type, $mapping) = split /;/;
        # first and last character of the range described by this line
        my ($ch1, $ch2);
        if ($char =~ /([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)/)
        {
            $ch1 = hex $1;
            $ch2 = hex $2;
        }
        elsif ($char =~ /([0-9a-fA-F]+)/)
        {
            $ch1 = $ch2 = hex $1;
        }

        if ($type =~ /mapped/ || $type =~ /deviation/)
        {
            $mapping =~ s/^\s*(([0-9a-fA-F]+\s+)+)\s*$/$1/;
            my @seq = map { hex $_; } split /\s+/, $mapping;
            # an empty mapping is stored as [ 0 ]; all the characters of a
            # range share the same sequence
            foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = @seq ? \@seq : [ 0 ]; }
        }
        elsif ($type =~ /valid/)
        {
            # nothing to do: the compatibility decomposition, if any, is kept
        }
        elsif ($type =~ /ignored/)
        {
            foreach my $i ($ch1 .. $ch2) { $idna_decomp_table[$i] = [ 0 ]; }
        }
        elsif ($type =~ /disallowed/)
        {
            foreach my $i ($ch1 .. $ch2)
            {
                $idna_decomp_table[$i] = undef;
                $idna_disallowed[$i] = 1;
            }
        }
    }
    close $IDNA;
}
 817
 818
 819 ################################################################
 820 # add a new registry key
 821 sub add_registry_key($$)
 822 {
 823     my ($key, $defval) = @_;
 824     $registry_keys{$key} = [ $defval ] unless defined $registry_keys{$key};
 825 }
 826
 827 ################################################################
 828 # add a new registry value
 829 sub add_registry_value($$$)
 830 {
 831     my ($key, $name, $value) = @_;
 832     add_registry_key( $key, undef );
 833     push @{$registry_keys{$key}}, "'$name' = s '$value'";
 834 }
 835
 836 ################################################################
 837 # define a new lead byte
 838 sub add_lead_byte($)
 839 {
 840     my $ch = shift;
 841     return if defined $cp2uni[$ch];
 842     push @lead_bytes, $ch;
 843     $cp2uni[$ch] = 0;
 844 }
 845
 846 ################################################################
 847 # define a new char mapping
 848 sub add_mapping($$)
 849 {
 850     my ($cp, $uni) = @_;
 851     $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
 852     $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
 853     if ($cp > 0xff) { add_lead_byte( $cp >> 8 ); }
 854 }
 855
 856 ################################################################
 857 # get a mapping including glyph chars for MB_USEGLYPHCHARS
 858 sub get_glyphs_mapping(@)
 859 {
 860     my @table = @_;
 861
 862     for (my $i = 0; $i < @glyph2uni; $i++)
 863     {
 864         $table[$i] = $glyph2uni[$i] if defined $glyph2uni[$i];
 865     }
 866     return @table;
 867 }
 868
 869 ################################################################
 870 # build EUC-JP table from the JIS 0208/0212 files
 871 sub dump_eucjp_codepage()
 872 {
 873     @cp2uni = ();
 874     @glyph2uni = ();
 875     @lead_bytes = ();
 876     @uni2cp = ();
 877     $default_char = $DEF_CHAR;
 878     $default_wchar = 0x30fb;
 879
 880     # ASCII chars
 881     foreach my $i (0x00 .. 0x7f) { add_mapping( $i, $i ); }
 882
 883     # lead bytes
 884     foreach my $i (0x8e, 0xa1 .. 0xfe) { add_lead_byte($i); }
 885
 886     # JIS X 0201 right plane
 887     foreach my $i (0xa1 .. 0xdf) { add_mapping( 0x8e00 + $i, 0xfec0 + $i ); }
 888
 889     # undefined chars
 890     foreach my $i (0x80 .. 0x8d, 0x8f .. 0x9f) { $cp2uni[$i] = $i; }
 891     $cp2uni[0xa0] = 0xf8f0;
 892     $cp2uni[0xff] = 0xf8f3;
 893
 894     # Fix backslash conversion
 895     add_mapping( 0xa1c0, 0xff3c );
 896
 897     # Add private mappings for rows undefined in JIS 0208/0212
 898     my $private = 0xe000;
 899     foreach my $hi (0xf5 .. 0xfe)
 900     {
 901         foreach my $lo (0xa1 .. 0xfe)
 902         {
 903             add_mapping( ($hi << 8) + $lo, $private++ );
 904         }
 905     }
 906     foreach my $hi (0xf5 .. 0xfe)
 907     {
 908         foreach my $lo (0x21 .. 0x7e)
 909         {
 910             add_mapping( ($hi << 8) + $lo, $private++ );
 911         }
 912     }
 913
 914     my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
 915     while (<$INPUT>)
 916     {
 917         next if /^\#/;  # skip comments
 918         next if /^$/;  # skip empty lines
 919         next if /\x1a/;  # skip ^Z
 920         if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 921         {
 922             add_mapping( 0x8080 + hex $1, hex $2 );
 923             next;
 924         }
 925         die "Unrecognized line $_\n";
 926     }
 927     close $INPUT;
 928
 929     $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
 930     while (<$INPUT>)
 931     {
 932         next if /^\#/;  # skip comments
 933         next if /^$/;  # skip empty lines
 934         next if /\x1a/;  # skip ^Z
 935         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 936         {
 937             add_mapping( 0x8000 + hex $1, hex $2 );
 938             next;
 939         }
 940         die "Unrecognized line $_\n";
 941     }
 942     close $INPUT;
 943
 944     output_codepage_file( 20932 );
 945 }
 946
 947 ################################################################
 948 # build Korean Wansung table from the KSX1001 file
 949 sub dump_krwansung_codepage(@)
 950 {
 951     my @cp949 = @_;
 952     @cp2uni = ();
 953     @glyph2uni = ();
 954     @lead_bytes = ();
 955     @uni2cp = ();
 956     $default_char = 0x3f;
 957     $default_wchar = 0x003f;
 958
 959     # ASCII and undefined chars
 960     foreach my $i (0x00 .. 0x9f) { add_mapping( $i, $i ); }
 961     add_mapping( 0xa0, 0xf8e6 );
 962     add_mapping( 0xad, 0xf8e7 );
 963     add_mapping( 0xae, 0xf8e8 );
 964     add_mapping( 0xaf, 0xf8e9 );
 965     add_mapping( 0xfe, 0xf8ea );
 966     add_mapping( 0xff, 0xf8eb );
 967
 968     my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
 969     while (<$INPUT>)
 970     {
 971         next if /^\#/;  # skip comments
 972         next if /^$/;  # skip empty lines
 973         next if /\x1a/;  # skip ^Z
 974         if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
 975         {
 976             add_mapping( 0x8080 + hex $1, hex $2 );
 977             next;
 978         }
 979         die "Unrecognized line $_\n";
 980     }
 981     close $INPUT;
 982
 983     # get some extra mappings from cp 949
 984     my @defined_lb;
 985     map { $defined_lb[$_] = 1; } @lead_bytes;
 986     foreach my $i (0x0000 .. 0xffff)
 987     {
 988         next if ($i >= 0x1100 && $i <= 0x11ff);  # range not used in 20949
 989         next unless defined $cp949[$i];
 990         if ($cp949[$i] >= 0xff)
 991         {
 992             # only add chars for lead bytes that exist in 20949
 993             my $hi = $cp949[$i] >> 8;
 994             my $lo = $cp949[$i] & 0xff;
 995             next unless $defined_lb[$hi];
 996             next unless $lo >= 0xa1 && $lo <= 0xfe;
 997         }
 998         add_mapping( $cp949[$i], $i );
 999     }
1000
1001     output_codepage_file( 20949 );
1002 }
1003
1004 ################################################################
1005 # build the sort keys table
sub dump_sortkeys($)
{
    # Read the collation keys from $REPORTS/$SORTKEYS, keep the entries for
    # single characters up to U+FFFF, squeeze each four-level key into one
    # 32-bit value and emit the result through dump_two_level_mapping().
    #
    # Parameters:
    #   $filename - name of the generated file; the data is written to
    #               "$filename.new" and save_file($filename) is called last
    my $filename = shift;
    my @sortkeys = ();

    my $INPUT = open_data_file( $REPORTS, $SORTKEYS );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if /^\@version/;  # skip @version header
        if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # single character; the [*.] marker captured in $2 is not used
            my $uni = hex $1;
            next if $uni > 65535;
            $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
            next;
        }
        if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # multiple character sequence, ignored for now
            next;
        }
        die "$SORTKEYS: Unrecognized line $_\n";
    }
    close $INPUT;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
                       ${$a}[2] <=> ${$b}[2] or
                       ${$a}[3] <=> ${$b}[3] or
                       ${$a}[4] <=> ${$b}[4] or
                       $a cmp $b; } @sortkeys;

    # $n2 / $n3 number the distinct secondary / tertiary weights seen so far
    # under the current primary / secondary weight; @keys holds the weights
    # of the previous entry
    my ($n2, $n3) = (1, 1);
    my @keys = (-1, -1, -1, -1, -1 );
    my @flatkeys = ();

    for (my $i = 0; $i < @sortkeys; $i++)
    {
        next unless defined $sortkeys[$i];
        my @current = @{$sortkeys[$i]};
        if ($current[1] == $keys[1])
        {
            if ($current[2] == $keys[2])
            {
                if ($current[3] == $keys[3])
                {
                    # nothing
                }
                else
                {
                    $keys[3] = $current[3];
                    $n3++;
                    # the tertiary rank must fit in 4 bits
                    die "$SORTKEYS: too many tertiary weights to fit in 4 bits\n" if ($n3 >= 16);
                }
            }
            else
            {
                $keys[2] = $current[2];
                $keys[3] = $current[3];
                $n2++;
                $n3 = 1;
                # the secondary rank must fit in 8 bits
                die "$SORTKEYS: too many secondary weights to fit in 8 bits\n" if ($n2 >= 256);
            }
        }
        else
        {
            $keys[1] = $current[1];
            $keys[2] = $current[2];
            $keys[3] = $current[3];
            $n2 = 1;
            $n3 = 1;
        }

        # zero weights stay zero, the others are replaced by their rank
        if ($current[2]) { $current[2] = $n2; }
        if ($current[3]) { $current[3] = $n3; }
        if ($current[4]) { $current[4] = 1; }

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: the file name must not be interpreted as a printf format
    print "Building $filename\n";
    print OUTPUT "/* Unicode collation element table */\n";
    printf OUTPUT "/* generated from %s */\n", "$REPORTS/$SORTKEYS";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "collation_table", 0xffffffff, 32, @flatkeys );

    close OUTPUT;
    save_file($filename);
}
1103
1104
1105 ################################################################
1106 # dump an array of integers
sub dump_array($$@)
{
    # $bit_width selects the number of hex digits per value (one per 4 bits),
    # $default replaces undefined elements, the rest is the data to format
    my ($bit_width, $default, @array) = @_;
    my $format = sprintf "0x%%0%ux", $bit_width / 4;

    # at least one value is always emitted, even for an empty array
    my $last = $#array > 0 ? $#array : 0;

    my $ret = "    ";
    foreach my $idx (0 .. $last)
    {
        my $value = defined $array[$idx] ? $array[$idx] : $default;
        $ret .= sprintf($format, $value);
        next if $idx == $last;  # no separator after the final value

        # eight values per line
        $ret .= ($idx % 8 == 7) ? ",\n    " : ", ";
    }
    return $ret;
}
1121
1122
1123 ################################################################
1124 # dump an SBCS mapping table in binary format
sub dump_binary_sbcs_table($)
{
    my ($codepage) = @_;

    # the glyph table, when there is one, takes 256 extra words ahead of the
    # Unicode -> codepage data
    my $has_glyphs = scalar @glyph2uni;
    my $wc_offset = 256 + 3;
    $wc_offset += 256 if $has_glyphs;

    # header: 7 words followed by 12 zero bytes
    print OUTPUT pack "S<*", 13, $codepage, 1, $default_char, $default_wchar,
                             $cp2uni[$default_char], $uni2cp[$default_wchar];
    print OUTPUT pack "C12", (0) x 12;

    # codepage -> Unicode table, undefined entries written as 0
    my @mb_table = map { $_ || 0; } @cp2uni[0 .. 255];
    print OUTPUT pack "S<*", $wc_offset, @mb_table;

    # glyph table preceded by its size, or a single 0 when there is none
    my @glyph_section = $has_glyphs ? (256, get_glyphs_mapping(@cp2uni[0 .. 255])) : (0);
    print OUTPUT pack "S<*", @glyph_section;

    print OUTPUT pack "S<*", 0, 0;

    # Unicode -> codepage table, one byte per char, default char for holes
    my @wc_table = map { defined($_) ? $_ : $default_char; } @uni2cp[0 .. 65535];
    print OUTPUT pack "C*", @wc_table;
}
1149
1150
1151 ################################################################
1152 # dump a DBCS mapping table in binary format
sub dump_binary_dbcs_table($)
{
    # Write the current DBCS mapping (@cp2uni, @uni2cp, @lead_bytes) to the
    # OUTPUT handle in binary form.
    #
    # Parameters:
    #   $codepage - codepage number stored in the header
    #
    # Side effect: the @cp2uni entry of every lead byte is reset to 0.
    my $codepage = shift;
    my @lb_ranges = get_lb_ranges();
    my @header = ( 13, $codepage, 2, $default_char, $default_wchar, $cp2uni[$default_char], $uni2cp[$default_wchar] );

    # each lead byte gets its own 256-entry table; its offset is stored in
    # words, counted from 256 upwards in @lead_bytes order
    my @offsets = (0) x 256;
    my $pos = 0;
    foreach my $i (@lead_bytes)
    {
        $offsets[$i] = ($pos += 256);
        $cp2uni[$i] = 0;
    }

    my $wc_offset = 256 + 3 + 256 * (1 + scalar @lead_bytes);

    print OUTPUT pack "S<*", @header;
    # lead byte ranges padded with zeros up to 12 bytes; the parentheses make
    # "x" repeat a list of zeros rather than build the string "000000000000"
    print OUTPUT pack "C12", @lb_ranges, (0) x 12;
    print OUTPUT pack "S<*", $wc_offset, map { $_ || 0; } @cp2uni[0 .. 255];
    # @lb_ranges holds (first, last) pairs, hence the division by two
    print OUTPUT pack "S<*", 0, scalar(@lb_ranges) / 2, @offsets;

    # one trail byte table per lead byte, default wchar for undefined codes
    foreach my $i (@lead_bytes)
    {
        my $base = $i << 8;
        print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_wchar; } @cp2uni[$base .. $base + 255];
    }

    print OUTPUT pack "S<", 4;
    # Unicode -> codepage table, one word per char, default char for holes
    print OUTPUT pack "S<*", map { defined $_ ? $_ : $default_char; } @uni2cp[0 .. 65535];
}
1183
1184
1185 ################################################################
1186 # get the list of defined lead byte ranges
sub get_lb_ranges()
{
    # Returns a flat list of (first, last) pairs, one pair for each run of
    # consecutive lead bytes in 0x00 .. 0xff.
    my %is_lead = map { $_ => 1 } @lead_bytes;
    my @ranges = ();
    my $in_run = 0;

    foreach my $byte (0 .. 255)
    {
        if ($is_lead{$byte})
        {
            # first byte of a new run
            push @ranges, $byte unless $in_run;
            $in_run = 1;
        }
        elsif ($in_run)
        {
            # the run ended on the previous byte
            push @ranges, $byte - 1;
            $in_run = 0;
        }
    }

    # a run that reaches the last byte is closed at 0xff
    push @ranges, 0xff if $in_run;
    return @ranges;
}
1208
1209 ################################################################
1210 # dump the Indic Syllabic Category table
sub dump_indic($)
{
    my ($filename) = @_;
    my @indic_table;

    # first pass: the syllabic category is stored as the entry value,
    # only for chars (or whole ranges) below 0x10000
    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
    while (<$INPUT>)
    {
        next if /^\#/ || /^\s*$/ || /\x1a/;  # comments, empty lines, ^Z

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex $1, hex $1, $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex $1, hex $2, $3);
        }
        else
        {
            die "malformed line $_";
        }

        die "unknown indic $type" unless defined $indic_types{$type};
        next unless $first < 65536 and $last < 65536;
        foreach my $ch ($first .. $last)
        {
            $indic_table[$ch] = $indic_types{$type};
        }
    }
    close $INPUT;

    # second pass: the positional category is or'ed in, shifted left by 8
    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
    while (<$INPUT>)
    {
        next if /^\#/ || /^\s*$/ || /\x1a/;  # comments, empty lines, ^Z

        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex $1, hex $1, $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*#/)
        {
            ($first, $last, $type) = (hex $1, hex $2, $3);
        }
        else
        {
            die "malformed line $_";
        }

        die "unknown matra $type" unless defined $matra_types{$type};
        foreach my $ch ($first .. $last)
        {
            $indic_table[$ch] |= $matra_types{$type} << 8;
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n",
                 "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n",
                 "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "indic_syllabic_table", $indic_types{'Other'}, 16, @indic_table );

    close OUTPUT;
    save_file($filename);
}
1289
1290 ################################################################
1291 # dump the Line Break Properties table
sub dump_linebreak($)
{
    my ($filename) = @_;
    my @break_table;

    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
    while (<$INPUT>)
    {
        next if /^\#/ || /^\s*$/ || /\x1a/;  # comments, empty lines, ^Z

        # The three-character class patterns are tried before the
        # two-character ones, for a single char and for a range each.
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex $1, hex $1, $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex $1, hex $2, $3);
        }
        elsif (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex $1, hex $1, $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([0-9A-Z][0-9A-Z])+\s*/)
        {
            ($first, $last, $type) = (hex $1, hex $2, $3);
        }
        else
        {
            die "malformed line $_";
        }

        die "unknown breaktype $type" unless defined $break_types{$type};
        foreach my $ch ($first .. $last)
        {
            $break_table[$ch] = $break_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n",
                 "/* generated from $UNIDATA:LineBreak.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_linebreak_table", $break_types{'XX'}, 16, @break_table );

    close OUTPUT;
    save_file($filename);
}
1353
# Script name (spelled as in Scripts.txt) => numeric script id.
# The ids are consecutive, starting at 0 for Unknown.
my %scripts =
(
    Unknown                => 0,
    Common                 => 1,
    Inherited              => 2,
    Arabic                 => 3,
    Armenian               => 4,
    Avestan                => 5,
    Balinese               => 6,
    Bamum                  => 7,
    Batak                  => 8,
    Bengali                => 9,
    Bopomofo               => 10,
    Brahmi                 => 11,
    Braille                => 12,
    Buginese               => 13,
    Buhid                  => 14,
    Canadian_Aboriginal    => 15,
    Carian                 => 16,
    Cham                   => 17,
    Cherokee               => 18,
    Coptic                 => 19,
    Cuneiform              => 20,
    Cypriot                => 21,
    Cyrillic               => 22,
    Deseret                => 23,
    Devanagari             => 24,
    Egyptian_Hieroglyphs   => 25,
    Ethiopic               => 26,
    Georgian               => 27,
    Glagolitic             => 28,
    Gothic                 => 29,
    Greek                  => 30,
    Gujarati               => 31,
    Gurmukhi               => 32,
    Han                    => 33,
    Hangul                 => 34,
    Hanunoo                => 35,
    Hebrew                 => 36,
    Hiragana               => 37,
    Imperial_Aramaic       => 38,
    Inscriptional_Pahlavi  => 39,
    Inscriptional_Parthian => 40,
    Javanese               => 41,
    Kaithi                 => 42,
    Kannada                => 43,
    Katakana               => 44,
    Kayah_Li               => 45,
    Kharoshthi             => 46,
    Khmer                  => 47,
    Lao                    => 48,
    Latin                  => 49,
    Lepcha                 => 50,
    Limbu                  => 51,
    Linear_B               => 52,
    Lisu                   => 53,
    Lycian                 => 54,
    Lydian                 => 55,
    Malayalam              => 56,
    Mandaic                => 57,
    Meetei_Mayek           => 58,
    Mongolian              => 59,
    Myanmar                => 60,
    New_Tai_Lue            => 61,
    Nko                    => 62,
    Ogham                  => 63,
    Ol_Chiki               => 64,
    Old_Italic             => 65,
    Old_Persian            => 66,
    Old_South_Arabian      => 67,
    Old_Turkic             => 68,
    Oriya                  => 69,
    Osmanya                => 70,
    Phags_Pa               => 71,
    Phoenician             => 72,
    Rejang                 => 73,
    Runic                  => 74,
    Samaritan              => 75,
    Saurashtra             => 76,
    Shavian                => 77,
    Sinhala                => 78,
    Sundanese              => 79,
    Syloti_Nagri           => 80,
    Syriac                 => 81,
    Tagalog                => 82,
    Tagbanwa               => 83,
    Tai_Le                 => 84,
    Tai_Tham               => 85,
    Tai_Viet               => 86,
    Tamil                  => 87,
    Telugu                 => 88,
    Thaana                 => 89,
    Thai                   => 90,
    Tibetan                => 91,
    Tifinagh               => 92,
    Ugaritic               => 93,
    Vai                    => 94,
    Yi                     => 95,
    # Win8/Win8.1
    Chakma                 => 96,
    Meroitic_Cursive       => 97,
    Meroitic_Hieroglyphs   => 98,
    Miao                   => 99,
    Sharada                => 100,
    Sora_Sompeng           => 101,
    Takri                  => 102,
    # Win10
    Bassa_Vah              => 103,
    Caucasian_Albanian     => 104,
    Duployan               => 105,
    Elbasan                => 106,
    Grantha                => 107,
    Khojki                 => 108,
    Khudawadi              => 109,
    Linear_A               => 110,
    Mahajani               => 111,
    Manichaean             => 112,
    Mende_Kikakui          => 113,
    Modi                   => 114,
    Mro                    => 115,
    Nabataean              => 116,
    Old_North_Arabian      => 117,
    Old_Permic             => 118,
    Pahawh_Hmong           => 119,
    Palmyrene              => 120,
    Pau_Cin_Hau            => 121,
    Psalter_Pahlavi        => 122,
    Siddham                => 123,
    Tirhuta                => 124,
    Warang_Citi            => 125,
    # Win10 RS1
    Adlam                  => 126,
    Ahom                   => 127,
    Anatolian_Hieroglyphs  => 128,
    Bhaiksuki              => 129,
    Hatran                 => 130,
    Marchen                => 131,
    Multani                => 132,
    Newa                   => 133,
    Old_Hungarian          => 134,
    Osage                  => 135,
    SignWriting            => 136,
    Tangut                 => 137,
    # Win10 RS4
    Masaram_Gondi          => 138,
    Nushu                  => 139,
    Soyombo                => 140,
    Zanabazar_Square       => 141,
    # Win10 1903
    Dogra                  => 142,
    Gunjala_Gondi          => 143,
    Hanifi_Rohingya        => 144,
    Makasar                => 145,
    Medefaidrin            => 146,
    Old_Sogdian            => 147,
    Sogdian                => 148,
    # Win10 2004
    Elymaic                => 149,
    Nyiakeng_Puachue_Hmong => 150,
    Nandinagari            => 151,
    Wancho                 => 152,
    # Win11
    Chorasmian             => 153,
    Dives_Akuru            => 154,
    Khitan_Small_Script    => 155,
    Yezidi                 => 156,
);
1521
1522 ################################################################
1523 # dump Script IDs table
sub dump_scripts($)
{
    # Generate the script id enum and the char -> script id table from
    # Scripts.txt.
    #
    # Parameters:
    #   $filename - base name of the generated files: the enum goes to
    #               "$filename.h" and the table to "$filename.c" (each one is
    #               written as "<name>.new" and then passed to save_file())
    my $filename = shift;
    my @scripts_table;

    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
    {
        my $type = "";

        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # single char; scripts missing from %scripts are skipped
            $type = $2;
            if (defined $scripts{$type})
            {
                $scripts_table[hex $1] = $scripts{$type};
            }
            next;
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            # range of chars; scripts missing from %scripts are skipped
            $type = $3;
            if (defined $scripts{$type})
            {
                foreach my $i (hex $1 .. hex $2)
                {
                    $scripts_table[$i] = $scripts{$type};
                }
            }
            next;
        }
        # lines matching neither pattern are ignored
    }

    close $INPUT;

    # header file with the enum, one entry per script in id order
    my $header = "$filename.h";
    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
    foreach my $script (sort { $scripts{$a} <=> $scripts{$b} } keys %scripts)
    {
        print OUTPUT "    Script_$script = $scripts{$script},\n";
    }
    # the last id is derived from the number of entries in %scripts
    print OUTPUT "    Script_LastId = ", (scalar keys %scripts) - 1, "\n";
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($header);

    # C file with the lookup table
    $filename = "$filename.c";
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Script IDs */\n";
    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_scripts_table", 0, 16, @scripts_table );
    close OUTPUT;
    save_file($filename);
}
1597
1598 ################################################################
1599 # dump the BiDi mirroring table
sub dump_mirroring($)
{
    my ($filename) = @_;
    my @mirror_table = ();

    # every data line starts with two hex fields: the char and its mirror
    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
    while (<$INPUT>)
    {
        next if /^\#/ || /^$/ || /\x1a/;  # comments, empty lines, ^Z
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+)/;
        $mirror_table[hex $1] = hex $2;
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n",
                 "/* generated from $UNIDATA:BidiMirroring.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
    close OUTPUT;
    save_file($filename);
}
1630
1631 ################################################################
1632 # dump the Bidi Brackets
sub dump_bracket($)
{
    # Generate the bidi bracket table from BidiBrackets.txt.
    #
    # Each entry holds the distance to the paired bracket in its low byte and
    # the bracket type (from %bracket_types) shifted left by 8.
    #
    # Parameters:
    #   $filename - name of the generated file; the data is written to
    #               "$filename.new" and save_file($filename) is called last
    my $filename = shift;
    my @bracket_table;

    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([0-9a-fA-F]+);\s*([con])/)
        {
            my $type = $3;
            die "unknown bracket $type" unless defined $bracket_types{$type};
            # the distance has to fit in a signed byte
            die "characters too distant $1 and $2" if abs(hex($2) - hex($1)) >= 128;
            # Store the signed distance as a two's complement byte (-1 is
            # 0xff), so that reading the low byte back as a signed 8-bit
            # value gives the original distance.
            # NOTE(review): assumes the code using the table decodes the low
            # byte as a signed 8-bit value - confirm against the consumers.
            $bracket_table[hex $1] = (hex($2) - hex($1)) % 256;
            $bracket_table[hex $1] += $bracket_types{$type} << 8;
            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
    print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "bidi_bracket_table", 0, 16, @bracket_table );

    close OUTPUT;
    save_file($filename);
}
1669
1670 ################################################################
1671 # dump the Arabic shaping table
sub dump_shaping($)
{
    my ($filename) = @_;

    # start from the initial joining table and override it with the
    # joining type of every char listed in ArabicShaping.txt
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/ || /^\s*$/ || /\x1a/;  # comments, empty lines, ^Z
        die "malformed line $_" unless /^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+)/;
        $joining_table[hex $1] = $joining_types{$2};
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n",
                 "/* generated from $UNIDATA:ArabicShaping.txt */\n",
                 "/* DO NOT EDIT!! */\n\n",
                 "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "wine_shaping_table", 0, 16, @joining_table );

    # one row per char in 0x600 .. 0x6ff with its four forms; a form that is
    # not set falls back to the char itself
    print OUTPUT "\nconst unsigned short DECLSPEC_HIDDEN wine_shaping_forms[256][4] =\n{\n";
    foreach my $ch (0x600 .. 0x6ff)
    {
        my @forms = map { $joining_forms{$_}->[$ch] || $ch } qw(isolated final initial medial);
        printf OUTPUT "    { 0x%04x, 0x%04x, 0x%04x, 0x%04x },\n", @forms;
    }
    print OUTPUT "};\n";

    close OUTPUT;
    save_file($filename);
}
1716
1717 ################################################################
1718 # dump the Arabic shaping table
sub dump_arabic_shaping($)
{
    # Generate the Arabic shaping table from ArabicShaping.txt.
    #
    # Every listed char gets the value of its joining type, except for the
    # chars of the ALAPH and DALATH RISH joining groups, which get the value
    # registered for the group instead.
    #
    # Parameters:
    #   $filename - name of the generated file; the data is written to
    #               "$filename.new" and save_file($filename) is called last
    my $filename = shift;
    my @joining_table = @initial_joining_table;

    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        # the joining group may consist of several space-separated words
        # (e.g. "DALATH RISH"), so capture all of them
        if (/^\s*([0-9a-fA-F]+)\s*;.*;\s*([RLDCUT])\s*;\s*(\w+(?: \w+)*)/)
        {
            my $type = $2;
            my $group = $3;

            if ($group eq "ALAPH" || $group eq "DALATH RISH")
            {
                # NOTE(review): relies on %joining_types having an entry for
                # both group names - confirm against its definition.
                $joining_table[hex $1] = $joining_types{$group};
            }
            else
            {
                $joining_table[hex $1] = $joining_types{$type};
            }

            next;
        }
        die "malformed line $_";
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    dump_two_level_mapping( "arabic_shaping_table", 0, 16, @joining_table );

    close OUTPUT;
    save_file($filename);
}
1762
1763 ################################################################
1764 # dump the Vertical Orientation table
# Reads VerticalOrientation.txt and writes the orientation of each character
# to "$filename.new" as vertical_orientation_table, then passes $filename to
# save_file().
#   $filename  output file name
#   $unix      when true, a "#pragma makedep unix" line (wrapped in #if 0)
#              is emitted before the includes
# Dies on a malformed line or on an orientation missing from %vertical_types.
sub dump_vertical($$)
{
    my ($filename, $unix) = @_;
    my @vertical_table;

    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z

        # a line describes either a single code point or a first..last range
        my ($first, $last, $type);
        if (/^\s*([0-9a-fA-F]+)\s*;\s*([a-zA-Z_]+)\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($1), $2);
        }
        elsif (/^\s*([0-9a-fA-F]+)\.\.\s*([0-9a-fA-F]+)\s*;\s*([A-Za-z_]+)\s*/)
        {
            ($first, $last, $type) = (hex($1), hex($2), $3);
        }
        else
        {
            die "malformed line $_";
        }
        die "unknown vertical $type" unless defined $vertical_types{$type};

        # Only entries 0..65535 are read by dump_two_level_mapping(), so
        # clamp ranges the same way single code points are filtered instead
        # of filling (and later copying) over a million unused entries.
        $last = 0xffff if $last > 0xffff;
        foreach my $i ($first .. $last)
        {
            $vertical_table[$i] = $vertical_types{$type};
        }
    }
    close $INPUT;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n";
    print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    if ($unix)
    {
        print OUTPUT "#if 0\n";
        print OUTPUT "#pragma makedep unix\n";
        print OUTPUT "#endif\n\n";
    }
    print OUTPUT "#include \"windef.h\"\n\n";

    # characters that are not listed get the value of orientation 'R'
    dump_two_level_mapping( "vertical_orientation_table", $vertical_types{'R'}, 16, @vertical_table );

    close OUTPUT;
    save_file($filename);
}
1818
1819 ################################################################
1820 # dump the digit folding tables
# Writes @digitmap_table to "$filename.new" as the two-level table
# wine_digitmap, then passes $filename to save_file().
sub dump_digit_folding($)
{
    my $filename = shift;

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT join( "",
                       "/* Unicode digit folding mappings */\n",
                       "/* generated from $UNIDATA:UnicodeData.txt */\n",
                       "/* DO NOT EDIT!! */\n\n",
                       "#include \"windef.h\"\n\n" );

    dump_two_level_mapping( "wine_digitmap", 0, 16, @digitmap_table );

    close OUTPUT;
    save_file($filename);
}
1835
1836
1837 ################################################################
1838 # compress a mapping table by removing identical rows
# Splits @table into $rows rows of equal length and stores them in a shared
# data area where identical rows, and a row whose start matches the current
# end of the data, overlap.
#   $rows   number of rows
#   $def    value used for undefined table entries
#   @table  values to compress
# Returns the $rows row offsets (each is $rows plus the position of the row
# in the data area) followed by the data area itself.
sub compress_array($$@)
{
    my ($rows, $def, @table) = @_;
    my $len = @table / $rows;
    my @offsets;
    my $packed = "";

    foreach my $row (0 .. $rows - 1)
    {
        # rows are handled as strings so that index() can search for them
        my $rowtxt = pack "U*", map { $_ // $def } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $pos = index $packed, $rowtxt;
        if ($pos < 0)
        {
            # no full match: look for the longest tail of the data that is
            # also the start of the new row, and drop it so that appending
            # the row stores the shared part only once
            my $lead = substr( $rowtxt, 0, 1 );
            my $tail = length($packed) - 1;
            while ($tail > 0)
            {
                my $skip = index( substr( $packed, -$tail ), $lead );
                last if $skip == -1;
                $tail -= $skip;
                if (substr( $packed, -$tail ) eq substr( $rowtxt, 0, $tail ))
                {
                    substr( $packed, -$tail ) = "";
                    last;
                }
                $tail--;
            }
            $pos = length $packed;
            $packed .= $rowtxt;
        }
        $offsets[$row] = $rows + $pos;
    }
    return @offsets, unpack "U*", $packed;
}
1873
1874 ################################################################
1875 # dump a char -> 16-bit value mapping table using two-level tables
# Prints to OUTPUT a C array $name holding the first 65536 entries of the
# table as a two-level lookup: 256 level 1 offsets, the level 2 offsets and
# the values.
#   $name   name of the generated C array
#   $def    value used for undefined table entries
#   $size   16 for "unsigned short" elements, anything else "unsigned int"
#   rest    the table, indexed by character
sub dump_two_level_mapping($$@)
{
    my ($name, $def, $size) = splice( @_, 0, 3 );
    my $ctype = ($size == 16) ? "unsigned short" : "unsigned int";

    # 4096 rows of values, whose offsets are compressed again in 256 rows
    my @values = compress_array( 4096, $def, @_[0..65535] );
    my @index = compress_array( 256, 0, @values[0..4095] );

    # the level 2 offsets are relative to the start of the values: move
    # them past both offset tables, which are emitted in front
    my $shift = @index - 4096;
    $_ += $shift foreach @index[256..$#index];

    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%d] =\n{\n", $ctype, $name, @index + @values - 4096;
    printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @index[0..255] );
    printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @index[256..$#index] );
    printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @values[4096..$#values] );
}
1892
1893 ################################################################
1894 # dump a char -> value mapping table using three-level tables
# Prints to OUTPUT a C array $name holding entries 0..$MAX_CHAR of the table
# as a three-level lookup: level 1, 2 and 3 offsets followed by the values.
#   $name   name of the generated C array
#   $def    value used for undefined table entries
#   $size   16 for "unsigned short" elements, anything else "unsigned int"
#   rest    the table, indexed by character
sub dump_three_level_mapping($$@)
{
    my ($name, $def, $size) = splice( @_, 0, 3 );
    my $ctype = ($size == 16) ? "unsigned short" : "unsigned int";

    # number of rows at each level, every row holding 16 entries
    my $level3 = ($MAX_CHAR + 1) / 16;
    my $level2 = $level3 / 16;
    my $level1 = $level2 / 16;

    my @array3 = compress_array( $level3, $def, @_[0..$MAX_CHAR] );
    my @array2 = compress_array( $level2, 0, @array3[0..$level3-1] );
    my @array1 = compress_array( $level1, 0, @array2[0..$level2-1] );

    # make the offsets relative to the start of the single output array
    my $adjust2 = @array1 + @array2 - $level2 - $level3;
    my $adjust1 = @array1 - $level2;
    $_ += $adjust2 foreach @array2[$level2..$#array2];
    $_ += $adjust1 foreach @array1[$level1..$#array1];

    printf OUTPUT "const %s DECLSPEC_HIDDEN %s[%u] =\n{\n", $ctype, $name, @array1 + (@array2 - $level2) + (@array3 - $level3);
    printf OUTPUT "    /* level 1 offsets */\n%s,\n", dump_array( $size, 0, @array1[0..$level1-1] );
    printf OUTPUT "    /* level 2 offsets */\n%s,\n", dump_array( $size, 0, @array1[$level1..$#array1] );
    printf OUTPUT "    /* level 3 offsets */\n%s,\n", dump_array( $size, 0, @array2[$level2..$#array2] );
    printf OUTPUT "    /* values */\n%s\n};\n", dump_array( $size, 0, @array3[$level3..$#array3] );
}
1917
1918 ################################################################
1919 # dump a binary case mapping table in l_intl.nls format
# Builds the binary form of a case mapping table.  Each defined mapping is
# stored as the 16-bit difference between the mapped character and the
# character itself, and the first 0x10000 differences are compressed in two
# levels.  Returns the packed little-endian 16-bit words: a leading count,
# the offsets, then the values.
sub dump_binary_case_table(@)
{
    my @table = @_;
    my $max_char = 0x10000;
    my $level1 = $max_char / 16;
    my $level2 = $level1 / 16;

    # characters without a mapping stay undefined and compress to 0
    my @difftable;
    foreach my $ch (grep { defined $table[$_] } 0 .. $#table)
    {
        $difftable[$ch] = ($table[$ch] - $ch) & 0xffff;
    }

    my @values = compress_array( $level1, 0, @difftable[0..$max_char-1] );
    my @index = compress_array( $level2, 0, @values[0..$level1-1] );

    # second level offsets must skip the offset tables stored in front
    my $offset = @index - $level1;
    $_ += $offset foreach @index[$level2..$#index];

    return pack "S<*", 1 + $offset + @values, @index, @values[$level1..$#values];
}
1940
1941 ################################################################
1942 # dump case mappings for l_intl.nls
# Writes the upper and lower case mapping tables, with the linguistic
# mappings removed, to "$filename.new" in binary form: a 16-bit version
# word (1) followed by the upper case and the lower case table.  $filename
# is then passed to save_file().
sub dump_intl_nls($)
{
    my $filename = shift;

    # work on copies, remove_linguistic_mappings() is given references
    my @upper_table = @toupper_table;
    my @lower_table = @tolower_table;
    remove_linguistic_mappings( \@upper_table, \@lower_table );

    my $upper = dump_binary_case_table( @upper_table );
    my $lower = dump_binary_case_table( @lower_table );

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: the file name is data and must not be used as a format
    print "Building $filename\n";

    binmode OUTPUT;
    print OUTPUT pack "S<", 1;  # version
    print OUTPUT $upper;
    print OUTPUT $lower;
    close OUTPUT;
    save_file($filename);
}
1963
1964
1965 ################################################################
1966 # dump the bidi direction table
# Writes the BiDi direction of the first 65536 characters to "$filename.new"
# as the two-level table bidi_direction_table, then passes $filename to
# save_file().  Characters without an entry in @direction_table get the
# value of type "L".
sub dump_bidi_dir_table($)
{
    my $filename = shift;
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    # plain print: none of these strings is meant to be a printf format, and
    # the file name in particular must not be interpreted as one
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi direction table */\n";
    print OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

    my @table;

    # translate the direction names to their numeric values
    for (my $i = 0; $i < 65536; $i++)
    {
        $table[$i] = $bidi_types{$direction_table[$i]} if defined $direction_table[$i];
    }

    dump_two_level_mapping( "bidi_direction_table", $bidi_types{"L"}, 16, @table );

    close OUTPUT;
    save_file($filename);
}
1988
1989
# Rotates the 8-bit value $byte left by $count bits and returns the result.
sub rol($$)
{
    my $byte = shift;
    my $count = shift;
    my $shifted_out = $byte >> (8 - $count);  # bits that wrap around
    my $shifted_up = $byte << $count;
    return ($shifted_up | $shifted_out) & 0xff;
}
1995
1996 ################################################################
1997 # compress the character properties table
# Splits @table into $rows rows of equal length and replaces every row by an
# identifier, storing each distinct row only once.  Undefined entries count
# as 0x7f.  A row made of one repeated value rol($i,5), for $i in 0 and
# 0xfb..0xff, is given the identifier $i and is not stored; other rows are
# numbered from 1 in order of first appearance.
# Returns the $rows identifiers followed by the stored rows.
sub compress_char_props_table($@)
{
    my ($rows, @table) = @_;
    my $len = @table / $rows;
    my $last_id = 0;
    my @result = (0) x $rows;
    my %known;

    # register the predefined uniform rows
    foreach my $id (0, 0xfb .. 0xff)
    {
        my @uniform = (rol($id,5)) x $len;
        my $uniform_key = pack "L*", @uniform;
        $known{$uniform_key} = $id;
    }

    foreach my $row (0 .. $rows - 1)
    {
        my @cells = map { $_ // 0x7f } @table[($row * $len)..(($row + 1) * $len - 1)];
        my $key = pack "L*", @cells;
        unless (defined $known{$key})
        {
            # first occurrence: allocate an identifier and store the row
            $known{$key} = ++$last_id;
            push @result, @cells;
        }
        $result[$row] = $known{$key};
    }
    return @result;
}
2029
2030 ################################################################
2031 # dump a normalization table in binary format
# Writes the normalization data for one form to "$filename.new" in binary
# format, passes $filename to save_file() and registers the file under the
# "Normalization" registry key.
# The form (nfc, nfd, nfkc, nfkd or idna) is taken from the file name, which
# must look like ".../norm<form>.nls"; the function dies otherwise.
# The file is made of 16-bit little-endian words unless noted: a 16-word
# name, the header, the table offsets, the combining class values (bytes),
# the character properties (bytes), the decomposition hash index, hash data
# and character data, and the composition hash index and data.
sub dump_norm_table($)
{
    my $filename = shift;

    my %forms  = ( "nfc" => 1, "nfd" => 2, "nfkc" => 5, "nfkd" => 6, "idna" => 13 );
    my %decomp = ( "nfc" => \@decomp_table,
                   "nfd" => \@decomp_table,
                   "nfkc" => \@decomp_compat_table,
                   "nfkd" => \@decomp_compat_table ,
                   "idna" => \@idna_decomp_table );

    my $type = $filename;
    $type =~ s!.*/norm(\w+)\.nls!$1!;
    # without a known form every table below would be built from undef
    die "Unknown normalization form in $filename" unless defined $forms{$type};

    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    # the file holds packed binary data: make sure no I/O layer alters it
    binmode OUTPUT;

    # bit 0 of the form value selects composition, bit 2 the compatibility
    # decompositions; idna adds one more to $compat
    my $compose = $forms{$type} & 1;
    my $compat = !!($forms{$type} & 4) + ($type eq "idna");

    my @version = split /\./, $UNIVERSION;

    # combining classes

    my @classes;
    my @class_values;

    # collect the classes below 0x100 that are in use, then number them in
    # increasing order: $classes[class] is the index of class in @class_values
    foreach my $c (grep defined, @combining_class_table)
    {
        $classes[$c] = 1 if $c < 0x100;
    }
    for (my $i = 0; $i < @classes; $i++)
    {
        next unless defined $classes[$i];
        $classes[$i] = @class_values;
        push @class_values, $i;
    }
    # keep an even number of bytes so that the next table stays word-aligned
    push @class_values, 0 if (@class_values % 2);
    die "too many classes" if @class_values >= 0x40;

    # character properties

    my @char_props;
    my @decomposed;
    my @comp_hash_table;
    my $comp_hash_size = $compose ? 254 : 0;

    for (my $i = 0; $i <= $MAX_CHAR; $i++)
    {
        next unless defined $combining_class_table[$i];
        if (defined $decomp{$type}->[$i])
        {
            my @dec = get_decomposition( $i, $decomp{$type} );
            if ($compose && (my @comp = get_composition( $i, $compat )))
            {
                # composable character: record the pair in the composition
                # hash table, as UTF-16 followed by the composed character
                my $hash = ($comp[0] + 95 * $comp[1]) % $comp_hash_size;
                push @{$comp_hash_table[$hash]}, to_utf16( @comp, $i );

                # use the first non-zero combining class of the decomposition
                my $val = 0;
                foreach my $d (@dec)
                {
                    $val = $combining_class_table[$d];
                    last if $val;
                }
                $char_props[$i] = $classes[$val];
            }
            else
            {
                $char_props[$i] = 0xbf;
            }
            @dec = compose_hangul( @dec ) if $compose;
            @dec = to_utf16( @dec );
            # sequences of 7 or more words get a terminating 0, since the
            # stored length is capped at 7 below
            push @dec, 0 if @dec >= 7;
            $decomposed[$i] = \@dec;
        }
        else
        {
            if ($combining_class_table[$i] == 0x100)
            {
                $char_props[$i] = 0x7f;
            }
            elsif ($combining_class_table[$i])
            {
                $char_props[$i] = $classes[$combining_class_table[$i]] | 0x80;
            }
            elsif ($type eq "idna" && defined $idna_disallowed[$i])
            {
                $char_props[$i] = 0xff;
            }
            else
            {
                $char_props[$i] = 0;
            }
        }
    }

    if ($compose)
    {
        # flag both characters of every composition pair; the flags depend
        # on whether the second character has a non-zero combining class
        for (my $i = 0; $i <= $MAX_CHAR; $i++)
        {
            my @comp = get_composition( $i, $compat );
            next unless @comp;
            if ($combining_class_table[$comp[1]])
            {
                $char_props[$comp[0]] |= 0x40 unless $char_props[$comp[0]] & 0x80;
                $char_props[$comp[1]] |= 0x40;
            }
            else
            {
                $char_props[$comp[0]] = ($char_props[$comp[0]] & ~0x40) | 0x80;
                $char_props[$comp[1]] |= 0xc0;
            }
        }
    }

    # surrogates
    foreach my $i (0xd800..0xdbff) { $char_props[$i] = 0xdf; }
    foreach my $i (0xdc00..0xdfff) { $char_props[$i] = 0x9f; }

    # Hangul
    if ($type eq "nfc") { foreach my $i (0x1100..0x117f) { $char_props[$i] = 0xff; } }
    elsif ($compose) { foreach my $i (0x1100..0x11ff) { $char_props[$i] = 0xff; } }
    foreach my $i (0xac00..0xd7ff) { $char_props[$i] = 0xff; }

    # invalid chars
    if ($type eq "idna") { foreach my $i (0x00..0x1f, 0x7f) { $char_props[$i] = 0xff; } }
    foreach my $i (0xfdd0..0xfdef) { $char_props[$i] = 0xff; }
    # the last two code points of each of the 17 planes
    foreach my $i (0x00..0x10)
    {
        $char_props[($i << 16) | 0xfffe] = 0xff;
        $char_props[($i << 16) | 0xffff] = 0xff;
    }

    # decomposition hash table

    my @decomp_hash_table;
    my @decomp_hash_index;
    my @decomp_hash_data;
    my $decomp_hash_size = 944;

    # build string of character data, reusing substrings when possible
    # (longest sequences first, so that shorter ones can be found inside)
    my $decomp_char_data = "";
    foreach my $i (sort { @{$b} <=> @{$a} } grep defined, @decomposed)
    {
        my $str = pack "U*", @{$i};
        $decomp_char_data .= $str if index( $decomp_char_data, $str) == -1;
    }
    # hash every decomposable character; its entry packs the sequence
    # length (capped at 7) above bit 13 and the position in the data below
    for (my $i = 0; $i < @decomposed; $i++)
    {
        next unless defined $decomposed[$i];
        my $pos = index( $decomp_char_data, pack( "U*", @{$decomposed[$i]} ));
        die "sequence not found" if $pos == -1;
        my $len = @{$decomposed[$i]};
        $len = 7 if $len > 7;
        my $hash = $i % $decomp_hash_size;
        push @{$decomp_hash_table[$hash]}, [ $i, ($len << 13) | $pos ];
    }
    for (my $i = 0; $i < $decomp_hash_size; $i++)
    {
        $decomp_hash_index[$i] = @decomp_hash_data / 2;
        next unless defined $decomp_hash_table[$i];
        if (@{$decomp_hash_table[$i]} == 1)
        {
            # a bucket holding a single character with property 0xbf stores
            # its entry directly in the index instead of the data table
            my $entry = $decomp_hash_table[$i]->[0];
            if ($char_props[$entry->[0]] == 0xbf)
            {
                $decomp_hash_index[$i] = $entry->[1];
                next;
            }
        }
        foreach my $entry (@{$decomp_hash_table[$i]})
        {
            push @decomp_hash_data, $entry->[0] & 0xffff, $entry->[1];
        }
    }
    push @decomp_hash_data, 0, 0;

    # composition hash table

    my @comp_hash_index;
    my @comp_hash_data;
    if (@comp_hash_table)
    {
        # the index has one extra entry marking the end of the last bucket
        for (my $i = 0; $i < $comp_hash_size; $i++)
        {
            $comp_hash_index[$i] = @comp_hash_data;
            push @comp_hash_data, @{$comp_hash_table[$i]} if defined $comp_hash_table[$i];
        }
        $comp_hash_index[$comp_hash_size] = @comp_hash_data;
        push @comp_hash_data, 0, 0, 0;
    }

    # one row of character properties per 128 characters
    my $level1 = ($MAX_CHAR + 1) / 128;
    my @rows = compress_char_props_table( $level1, @char_props[0..$MAX_CHAR] );

    my @header = ( $version[0], $version[1], $version[2], 0, $forms{$type}, $compat ? 18 : 3,
                   0, $decomp_hash_size, $comp_hash_size, 0 );
    my @tables = (0) x 8;

    # table offsets are counted in 16-bit words from the start of the file
    # (16 words of name first); tables written as bytes count for half
    # their number of elements
    $tables[0] = 16 + @header + @tables;
    $tables[1] = $tables[0] + @class_values / 2;
    $tables[2] = $tables[1] + $level1 / 2;
    $tables[3] = $tables[2] + (@rows - $level1) / 2;
    $tables[4] = $tables[3] + @decomp_hash_index;
    $tables[5] = $tables[4] + @decomp_hash_data;
    $tables[6] = $tables[5] + length $decomp_char_data;
    $tables[7] = $tables[6] + @comp_hash_index;

    print OUTPUT pack "S<16", unpack "U*", "norm$type.nlp";
    print OUTPUT pack "S<*", @header;
    print OUTPUT pack "S<*", @tables;
    print OUTPUT pack "C*", @class_values;

    print OUTPUT pack "C*", @rows[0..$level1-1];
    print OUTPUT pack "C*", @rows[$level1..$#rows];
    print OUTPUT pack "S<*", @decomp_hash_index;
    print OUTPUT pack "S<*", @decomp_hash_data;
    print OUTPUT pack "S<*", unpack "U*", $decomp_char_data;
    print OUTPUT pack "S<*", @comp_hash_index;
    print OUTPUT pack "S<*", @comp_hash_data;

    close OUTPUT;
    save_file($filename);

    add_registry_value( "Normalization", sprintf( "%x", $forms{$type} ), "norm$type.nls" );
}
2258
2259
2260 ################################################################
2261 # output a codepage definition file from the global tables
# Writes the codepage currently held in the global tables to
# "nls/c_NNN.nls.new", as a DBCS table when lead bytes are defined and as an
# SBCS table otherwise, then passes the file name to save_file() and
# registers the file under the "Codepage" registry key.
sub output_codepage_file($)
{
    my ($codepage) = @_;

    my $basename = sprintf "c_%03d.nls", $codepage;
    my $output = "nls/$basename";
    open OUTPUT,">$output.new" or die "Cannot create $output";

    printf "Building %s\n", $output;
    if (@lead_bytes)
    {
        dump_binary_dbcs_table( $codepage );
    }
    else
    {
        dump_binary_sbcs_table( $codepage );
    }

    close OUTPUT;
    save_file($output);

    add_registry_value( "Codepage", sprintf( "%d", $codepage ), $basename );
}
2278
2279 ################################################################
2280 # output a codepage table from a Microsoft-style mapping file
# Parses the Microsoft-style mapping file $filename (looked up in
# $MSCODEPAGES) into the global codepage tables and writes the resulting
# codepage with output_codepage_file().  Codepage 949 is additionally
# passed to dump_krwansung_codepage().
# The file is a sequence of sections (MBTABLE, GLYPHTABLE, WCTABLE,
# DBCSRANGE, DBCSTABLE) made of "0x.... 0x...." pairs, up to an ENDCODEPAGE
# line.  Dies on an unrecognized line or when no CODEPAGE line was found.
sub dump_msdata_codepage($)
{
    my $filename = shift;

    my $state = "";
    my ($codepage, $width, $count);
    my ($lb_cur, $lb_end);

    # reset the global tables left over from the previous codepage
    @cp2uni = ();
    @glyph2uni = ();
    @lead_bytes = ();
    @uni2cp = ();
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";

    while (<$INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;

        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # both default characters are hex numbers: the class is a-fA-F (a
        # former "A-f" range also accepted G-Z and some punctuation)
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $width = $1;
            $default_char = hex $2;
            $default_wchar = hex $3;
            next;
        }
        if (/^(MBTABLE|GLYPHTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            # start of a section; $count is the number announced with it
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            # a pair of values, whose meaning depends on the current section;
            # in every table the first mapping seen for a value wins
            if ($state eq "MBTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                next;
            }
            if ($state eq "GLYPHTABLE")
            {
                my $cp = hex $1;
                my $uni = hex $2;
                $glyph2uni[$cp] = $uni unless defined($glyph2uni[$cp]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                my $uni = hex $1;
                my $cp = hex $2;
                $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                # a range of lead bytes; the DBCSTABLE sections that follow
                # describe these lead bytes one after the other
                my $start = hex $1;
                my $end = hex $2;
                for (my $i = $start; $i <= $end; $i++) { add_lead_byte( $i ); }
                $lb_cur = $start;
                $lb_end = $end;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                my $mb = hex $1;
                my $uni = hex $2;
                my $cp = ($lb_cur << 8) | $mb;
                $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
                # after the last entry of the table move to the next lead
                # byte, or expect a new range once the current one is done
                if (!--$count)
                {
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close $INPUT;

    # without a codepage number the output would be named after an undef
    die "$filename: no CODEPAGE line found\n" unless defined $codepage;

    output_codepage_file( $codepage );

    if ($codepage == 949) { dump_krwansung_codepage( @uni2cp ); }
}
2376
2377 ################################################################
2378 # align a string length
# Returns $str padded with NUL bytes up to the next multiple of $align
# characters; a string whose length is already a multiple is returned as is.
sub align_string($$)
{
    my ($align, $str) = @_;
    my $excess = length($str) % $align;
    return $str unless $excess;
    return $str . ("\0" x ($align - $excess));
}
2385
2386 ################################################################
2387 # pack a GUID string
# Converts a GUID string of the form xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
# (hex digits, either case) to its 16-byte binary form: a little-endian
# 32-bit value, two little-endian 16-bit values and eight bytes.
# Dies when the string does not contain a GUID.  The caller's $_ is left
# untouched.
sub pack_guid($)
{
    my $guid = shift;
    my $byte = '([0-9A-Fa-f]{2})';

    # a failed match must not be ignored: the numbered captures would then
    # still hold the values of some earlier, unrelated match
    my @fields = $guid =~ /([0-9A-Fa-f]{8})-([0-9A-Fa-f]{4})-([0-9A-Fa-f]{4})-$byte$byte-$byte$byte$byte$byte$byte$byte/
        or die "malformed GUID $guid";
    return pack "L<S<2C8", map { hex $_ } @fields;
}
2394
2395 ################################################################
2396 # comparison function for compression sort
# Sort comparison function for compression entries (array references in $a
# and $b): shorter entries come first, entries of the same length are
# ordered by their elements 4 to 12, compared numerically one after the
# other.
sub cmp_compression
{
    my $order = scalar @{$a} <=> scalar @{$b};
    foreach my $idx (4 .. 12)
    {
        last if $order;  # decided by the previous comparison
        $order = $a->[$idx] <=> $b->[$idx];
    }
    return $order;
}
2410
2411 ################################################################
2412 # build a binary sort keys table
2413 sub dump_sortkey_table($$)
2414 {
2415     my ($filename, $download) = @_;
2416
2417     my @keys;
2418     my ($part, $section, $subsection, $guid, $version, $ling_flag);
2419     my @multiple_weights;
2420     my @expansions;
2421     my @compressions;
2422     my %exceptions;
2423     my %guids;
2424     my %compr_flags;
2425     my %locales;
2426     my $default_guid = "00000001-57ee-1e5c-00b4-d0000bb1e11e";
2427     my $jamostr = "";
2428
2429     my $re_hex = '0x[0-9A-Fa-f]+';
2430     my $re_key = '(\d+\s+\d+\s+\d+\s+\d+)';
2431     $guids{$default_guid} = { };
2432
2433     my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );
2434
2435     my $KEYS = open_data_file( $MSDATA, $download );
2436
2437     printf "Building $filename\n";
2438
2439     while (<$KEYS>)
2440     {
2441         s/\s*;.*$//;
2442         next if /^\s*$/;  # skip empty lines
2443         if (/^\s*(SORTKEY|SORTTABLES)/)
2444         {
2445             $part = $1;
2446             next;
2447         }
2448         if (/^\s*(ENDSORTKEY|ENDSORTTABLES)/)
2449         {
2450             $part = $section = "";
2451             next;
2452         }
2453         if (/^\s*(DEFAULT|RELEASE|REVERSEDIACRITICS|DOUBLECOMPRESSION|INVERSECASING|MULTIPLEWEIGHTS|EXPANSION|COMPATIBILITY|COMPRESSION|EXCEPTION|JAMOSORT)\s+/)
2454         {
2455             $section = $1;
2456             $guid = undef;
2457             next;
2458         }
2459         next unless $part;
2460         if ("$part.$section" eq "SORTKEY.DEFAULT")
2461         {
2462             if (/^\s*($re_hex)\s+$re_key/)
2463             {
2464                 $keys[hex $1] = [ split(/\s+/,$2) ];
2465                 next;
2466             }
2467         }
2468         elsif ("$part.$section" eq "SORTTABLES.RELEASE")
2469         {
2470             if (/^\s*NLSVERSION\s+0x([0-9A-Fa-f]+)/)
2471             {
2472                 $version = hex $1;
2473                 next;
2474             }
2475             if (/^\s*DEFINEDVERSION\s+0x([0-9A-Fa-f]+)/)
2476             {
2477                 # ignore for now
2478                 next;
2479             }
2480         }
2481         elsif ("$part.$section" eq "SORTTABLES.REVERSEDIACRITICS" ||
2482                "$part.$section" eq "SORTTABLES.DOUBLECOMPRESSION" ||
2483                "$part.$section" eq "SORTTABLES.INVERSECASING")
2484         {
2485             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)/)
2486             {
2487                 $guid = lc $1;
2488                 $guids{$guid} = { } unless defined $guids{$guid};
2489                 $guids{$guid}->{flags} |= $flags{$section};
2490                 next;
2491             }
2492             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2493             {
2494                 $locales{$1} = $guid;
2495                 next;
2496             }
2497         }
2498         elsif ("$part.$section" eq "SORTTABLES.MULTIPLEWEIGHTS")
2499         {
2500             if (/^\s*(\d+)\s+(\d+)/)
2501             {
2502                 push @multiple_weights, $1, $2;
2503                 next;
2504             }
2505         }
2506         elsif ("$part.$section" eq "SORTTABLES.EXPANSION")
2507         {
2508             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2509             {
2510                 my $pos = scalar @expansions / 2;
2511                 $keys[hex $1] = [ 2, 0, $pos & 0xff, $pos >> 8 ] unless defined $keys[hex $1];
2512                 push @expansions, hex $2, hex $3;
2513                 next;
2514             }
2515         }
2516         elsif ("$part.$section" eq "SORTTABLES.COMPATIBILITY")
2517         {
2518             if (/^\s*0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)/)
2519             {
2520                 $keys[hex $1] = $keys[hex $2];
2521                 next;
2522             }
2523         }
2524         elsif ("$part.$section" eq "SORTTABLES.COMPRESSION")
2525         {
2526             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*([A-Z0-9_]+)?/)
2527             {
2528                 if ($subsection || !$guid)  # start a new one
2529                 {
2530                     $guid = lc $1;
2531                     $subsection = "";
2532                     $guids{$guid} = { } unless defined $guids{$guid};
2533                     $guids{$guid}->{flags} |= $flags{$2} if $2;
2534                     $guids{$guid}->{compr} = @compressions;
2535                     $exceptions{"$guid-"} = [ ] unless defined $exceptions{"$guid-"};
2536                     $compr_flags{$guid} = [ ] unless defined $compr_flags{$guid};
2537                     push @compressions, [ ];
2538                 }
2539                 else  # merge with current one
2540                 {
2541                     $guids{lc $1} = { } unless defined $guids{lc $1};
2542                     $guids{lc $1}->{flags} |= $flags{$2} if $2;
2543                     $guids{lc $1}->{compr} = $guids{$guid}->{compr};
2544                     $compr_flags{lc $1} = $compr_flags{$guid};
2545                 }
2546                 next;
2547             }
2548             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2549             {
2550                 $locales{$1} = $guid;
2551                 next;
2552             }
2553             if (/^\s*(TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT)/)
2554             {
2555                 $subsection = $1;
2556                 next;
2557             }
2558             if ($subsection && /^\s*(($re_hex\s+){2,8})$re_key/)
2559             {
2560                 my @comp = map { hex $_; } split(/\s+/,$1);
2561                 push @{$compressions[$#compressions]}, [ split(/\s+/,$3), @comp ];
2562                 # add compression flags
2563                 $compr_flags{$guid}->[$comp[0]] |= @comp >= 6 ? 0xc0 : @comp >= 4 ? 0x80 : 0x40;
2564                 next;
2565             }
2566         }
2567         elsif ("$part.$section" eq "SORTTABLES.EXCEPTION")
2568         {
2569             if (/^\s*SORTGUID\s+([-0-9A-Fa-f]+)\s+\d*\s*(LINGUISTIC_CASING)?/)
2570             {
2571                 $guid = lc $1;
2572                 $guids{$guid} = { } unless defined $guids{lc $1};
2573                 $ling_flag = ($2 ? "+" : "-");
2574                 $exceptions{"$guid$ling_flag"} = [ ] unless defined $exceptions{"$guid$ling_flag"};
2575                 next;
2576             }
2577             if (/^\s*LOCALENAME\s+([A-Za-z0-9-_]+)/)
2578             {
2579                 $locales{$1} = $guid;
2580                 next;
2581             }
2582             if (/^\s*($re_hex)\s+$re_key/)
2583             {
2584                 $exceptions{"$guid$ling_flag"}->[hex $1] = [ split(/\s+/,$2) ];
2585                 next;
2586             }
2587         }
2588         elsif ("$part.$section" eq "SORTTABLES.JAMOSORT")
2589         {
2590             if (/^\s*$re_hex\s+(($re_hex\s*){5})/)
2591             {
2592                 $jamostr .= pack "C8", map { hex $_; } split /\s+/, $1;
2593                 next;
2594             }
2595         }
2596         die "$download: $part.$section: unrecognized line $_\n";
2597     }
2598     close $KEYS;
2599
2600     # Sortkey table
2601
2602     my $table;
2603     for (my $i = 0; $i < 0x10000; $i++)
2604     {
2605         my @k = defined $keys[$i] ? @{$keys[$i]} : (0) x 4;
2606         $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2607     }
2608
2609     foreach my $id (sort keys %exceptions)
2610     {
2611         my $pos = length($table) / 4;
2612         my @exc = @{$exceptions{$id}};
2613         my @filled;
2614         my $key = (substr( $id, -1 ) eq "+" ? "ling_except" : "except");
2615         my $guid = substr( $id, 0, -1 );
2616         $guids{$guid}->{$key} = $pos;
2617         $pos += 0x100;
2618         my @flags = @{$compr_flags{$guid}} if defined $compr_flags{$guid};
2619         for (my $j = 0; $j < 0x10000; $j++)
2620         {
2621             next unless defined $exc[$j] || defined $flags[$j];
2622             $filled[$j >> 8] = 1;
2623             $j |= 0xff;
2624         }
2625         for (my $j = 0; $j < 0x100; $j++)
2626         {
2627             $table .= pack "L<", $filled[$j] ? $pos : $j * 0x100;
2628             $pos += 0x100 if $filled[$j];
2629         }
2630         for (my $j = 0; $j < 0x10000; $j++)
2631         {
2632             next unless $filled[$j >> 8];
2633             my @k = defined $exc[$j] ? @{$exc[$j]} : defined $keys[$j] ? @{$keys[$j]} : (0) x 4;
2634             $k[3] |= $flags[$j] || 0;
2635             $table .= pack "C4", $k[1], $k[0], $k[2], $k[3];
2636         }
2637     }
2638
2639     # Case mapping tables
2640
2641     # standard table
2642     my @casemaps;
2643     my @upper = @toupper_table;
2644     my @lower = @tolower_table;
2645     remove_linguistic_mappings( \@upper, \@lower );
2646     $casemaps[0] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2647
2648     # linguistic table
2649     $casemaps[1] = pack( "S<*", 1) . dump_binary_case_table( @toupper_table ) . dump_binary_case_table( @tolower_table );
2650
2651     # Turkish table
2652     @upper = @toupper_table;
2653     @lower = @tolower_table;
2654     $upper[ord 'i'] = 0x130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
2655     $lower[ord 'I'] = 0x131; # LATIN SMALL LETTER DOTLESS I
2656     $casemaps[2] = pack( "S<*", 1) . dump_binary_case_table( @upper ) . dump_binary_case_table( @lower );
2657     my $casemaps = align_string( 8, $casemaps[0] . $casemaps[1] . $casemaps[2] );
2658
2659     # Char type table
2660
2661     my @table;
2662     my $types = "";
2663     my %typestr;
2664     for (my $i = 0; $i < 0x10000; $i++)
2665     {
2666         my $str = pack "S<3",
2667             ($category_table[$i] || 0) & 0xffff,
2668             defined($direction_table[$i]) ? $c2_types{$direction_table[$i]} : 0,
2669             ($category_table[$i] || 0) >> 16;
2670
2671         if (!defined($typestr{$str}))
2672         {
2673             $typestr{$str} = length($types) / 6;
2674             $types .= $str;
2675         }
2676         $table[$i] = $typestr{$str};
2677     }
2678
2679     my @rows = compress_array( 4096, 0, @table[0..65535] );
2680     my @array = compress_array( 256, 0, @rows[0..4095] );
2681     for (my $i = 0; $i < 256; $i++) { $array[$i] *= 2; }  # we need byte offsets
2682     for (my $i = 256; $i < @array; $i++) { $array[$i] += 2 * @array - 4096; }
2683
2684     my $arraystr = pack("S<*", @array) . pack("C*", @rows[4096..$#rows]);
2685     my $chartypes = pack "S<2", 4 + length($types) + length($arraystr), 2 + length($types);
2686     $chartypes = align_string( 8, $chartypes . $types . $arraystr );
2687
2688     # Sort tables
2689
2690     # guids
2691     my $sorttables = pack "L<2", $version, scalar %guids;
2692     foreach my $id (sort keys %guids)
2693     {
2694         my %guid = %{$guids{$id}};
2695         my $flags = $guid{flags} || 0;
2696         my $map = length($casemaps[0]) + (defined $guid{ling_except} ? length($casemaps[1]) : 0);
2697         $sorttables .= pack_guid($id) . pack "L<5",
2698             $flags,
2699             defined($guid{compr}) ? $guid{compr} : 0xffffffff,
2700             $guid{except} || 0,
2701             $guid{ling_except} || 0,
2702             $map / 2;
2703     }
2704
2705     # expansions
2706     $sorttables .= pack "L<S<*", scalar @expansions / 2, @expansions;
2707
2708     # compressions
2709     $sorttables .= pack "L<", scalar @compressions;
2710     my $rowstr = "";
2711     foreach my $c (@compressions)
2712     {
2713         my $pos = length($rowstr) / 2;
2714         my $min = 0xffff;
2715         my $max = 0;
2716         my @lengths = (0) x 8;
2717         foreach my $r (sort cmp_compression @{$c})
2718         {
2719             my @row = @{$r};
2720             $lengths[scalar @row - 6]++;
2721             foreach my $val (@row[4..$#row])
2722             {
2723                 $min = $val if $min > $val;
2724                 $max = $val if $max < $val;
2725             }
2726             $rowstr .= align_string( 4, pack "S<*", @row[4..$#row] );
2727             $rowstr .= pack "C4", $row[1], $row[0], $row[2], $row[3];
2728         }
2729         $sorttables .= pack "L<S<10", $pos, $min, $max, @lengths;
2730     }
2731     $sorttables .= $rowstr;
2732
2733     # multiple weights
2734     $sorttables .= align_string( 4, pack "L<C*", scalar @multiple_weights / 2, @multiple_weights );
2735
2736     # jamo sort
2737     $sorttables .= pack("L<", length($jamostr) / 8) . $jamostr;
2738
2739     # Locales
2740
2741     add_registry_key( "Sorting\\Ids", "{$default_guid}" );
2742     foreach my $loc (sort keys %locales)
2743     {
2744         # skip specific locales that match more general ones
2745         my @parts = split /[-_]/, $loc;
2746         next if @parts > 1 && defined($locales{$parts[0]}) && $locales{$parts[0]} eq $locales{$loc};
2747         next if @parts > 2 && defined($locales{"$parts[0]-$parts[1]"}) && $locales{"$parts[0]-$parts[1]"} eq $locales{$loc};
2748         add_registry_value( "Sorting\\Ids", $loc, "\{$locales{$loc}\}" );
2749     }
2750
2751     # File header
2752
2753     my @header;
2754     $header[0] = 16;
2755     $header[1] = $header[0] + length $table;
2756     $header[2] = $header[1] + length $casemaps;
2757     $header[3] = $header[2] + length $chartypes;
2758
2759     open OUTPUT, ">$filename.new" or die "Cannot create $filename";
2760     print OUTPUT pack "L<*", @header;
2761     print OUTPUT $table, $casemaps, $chartypes, $sorttables;
2762     close OUTPUT;
2763     save_file($filename);
2764 }
2765
2766
################################################################
# build the script to create registry keys
#
# Arguments:
#   $filename - path of the registry script to generate; the text is
#               written to "$filename.new" and then handed to save_file()
#   %keys     - maps a backslash-separated key path to an array ref whose
#               first element is the key's default value (left out of the
#               output when false) and whose remaining elements are value
#               lines, each emitted as-is after "val "
#
# Every key is nested below HKLM\SYSTEM\CurrentControlSet\Control\Nls, and
# each component of that fixed prefix is marked NoRemove.
# Dies if the output file cannot be created or written.
sub dump_registry_script($%)
{
    my ($filename, %keys) = @_;
    my $indent = 1;  # current nesting depth; each level is 4 spaces wide

    printf "Building %s\n", $filename;
    open my $out, ">", "$filename.new" or die "Cannot create $filename.new: $!";
    print $out "HKLM\n{\n";
    foreach my $k (split /\\/, "SYSTEM\\CurrentControlSet\\Control\\Nls")
    {
        printf $out "%*sNoRemove %s\n%*s{\n", 4 * $indent, "", $k, 4 * $indent, "";
        $indent++;
    }
    foreach my $k (sort keys %keys)
    {
        my @subkeys = split /\\/, $k;
        my ($def, @vals) = @{$keys{$k}};
        for (my $i = 0; $i < @subkeys; $i++)
        {
            # only the innermost subkey carries the default value
            printf $out "%*s%s%s\n%*s{\n", 4 * $indent, "", $subkeys[$i],
                $i == $#subkeys && $def ? " = s '$def'" : "", 4 * $indent, "";
            $indent++;
        }
        # the value is passed as an argument rather than spliced into the
        # format, so a '%' inside it is printed literally
        foreach my $v (sort @vals) { printf $out "%*sval %s\n", 4 * $indent, "", $v; }
        for (my $i = 0; $i < @subkeys; $i++) { printf $out "%*s}\n", 4 * --$indent, ""; }
    }
    # close the fixed prefix and the outer HKLM block
    while ($indent) { printf $out "%*s}\n", 4 * --$indent, ""; }
    # buffered write errors only show up at close time
    close $out or die "Cannot write $filename.new: $!";
    save_file($filename);
}
2799
2800
################################################################
# save a file if modified
#
# Expects the freshly generated content in "$file.new".  When $file already
# exists as a regular file with identical content, the .new copy is deleted
# and $file is left untouched; otherwise the .new copy is renamed to $file.
# Dies if the rename fails, since the generated output would otherwise be
# silently lost.
sub save_file($)
{
    my $file = shift;

    # compare in-process instead of running "cmp" through a shell, so that
    # spaces or shell metacharacters in the path cannot break the command
    require File::Compare;
    if (-f $file && File::Compare::compare( $file, "$file.new" ) == 0)
    {
        unlink "$file.new" or warn "Cannot remove $file.new: $!\n";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!";
    }
}
2815
2816
################################################################
# main routine
#
# Runs load_data() and then every generator in a fixed order.  The registry
# script is produced last and receives %registry_keys; dump_sortkey_table
# calls add_registry_key/add_registry_value, which presumably fill that
# hash, so it has to run before the script is written.

# step up one level when started from the directory holding make_unicode,
# since all output paths below are relative
if (-f "./make_unicode") { chdir ".."; }

load_data();

dump_sortkeys( "dlls/kernelbase/collation.c" );
foreach my $out ("dlls/gdi32/uniscribe/direction.c", "dlls/dwrite/direction.c")
{
    dump_bidi_dir_table( $out );
}
dump_digit_folding( "dlls/kernelbase/digitmap.c" );
foreach my $out ("dlls/gdi32/uniscribe/mirror.c", "dlls/dwrite/mirror.c")
{
    dump_mirroring( $out );
}
foreach my $out ("dlls/gdi32/uniscribe/bracket.c", "dlls/dwrite/bracket.c")
{
    dump_bracket( $out );
}
dump_shaping( "dlls/gdi32/uniscribe/shaping.c" );
dump_arabic_shaping( "dlls/dwrite/shapers/arabic_table.c" );
foreach my $out ("dlls/gdi32/uniscribe/linebreak.c", "dlls/dwrite/linebreak.c")
{
    dump_linebreak( $out );
}
dump_scripts( "dlls/dwrite/scripts" );
dump_indic( "dlls/gdi32/uniscribe/indicsyllable.c" );
dump_vertical( "dlls/win32u/vertical.c", 1 );
dump_vertical( "dlls/wineps.drv/vertical.c", 0 );

dump_intl_nls( "nls/l_intl.nls" );
foreach my $out ("nls/normnfc.nls",
                 "nls/normnfd.nls",
                 "nls/normnfkc.nls",
                 "nls/normnfkd.nls",
                 "nls/normidna.nls")
{
    dump_norm_table( $out );
}
dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );

# one code page per entry of @allfiles, followed by the EUC-JP one
foreach my $codepage_file (@allfiles)
{
    dump_msdata_codepage( $codepage_file );
}
dump_eucjp_codepage();

dump_registry_script( "dlls/kernelbase/kernelbase.rgs", %registry_keys );

exit( 0 );
2850
2851 # Local Variables:
2852 # compile-command: "./make_unicode"
2853 # End: