#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys
import re   # used by merge_old_version() below

SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
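
# makeunicodetype() below ORs these bits into a single flags field per
# character; e.g. an uppercase letter (category "Lu") ends up with
# ALPHA_MASK | UPPER_MASK set.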

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties
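    # (identical property tuples are stored only once: "cache" maps a
    # tuple to its slot in "table", and "index" maps each code point to
    # that slot, so long runs of characters sharing the same properties
    # cost only index entries)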

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0
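
    # Each decomposition is flattened into decomp_data as one header
    # word followed by the decomposed code points: the header packs the
    # entry length into the high byte (len << 8) and an index into
    # decomp_prefix -- the "<compat>"-style tag, or "" for canonical
    # decompositions -- into the low byte.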

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l
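
    # NFC composition is looked up through a dense matrix: each
    # possible first character gets a row, each possible last character
    # a column, and comp_data[f*total_last+l] holds the composed code
    # point (0 if the pair does not compose).  The range lists built
    # above let the C code map a code point to its row/column number.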
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
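
    # A sketch of how the C side resolves a code point through this
    # two-level table (mirroring the generated get_change_* helpers
    # further down):
    #
    #     index = index1[ch >> SHIFT];
    #     index = index2[(index << SHIFT) + (ch & ((1 << SHIFT) - 1))];
    #     record = &_PyUnicode_Database_Records[index];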

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
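            # (each mapping is stored as a signed 16-bit delta from the
            # character itself, masked to 0xffff; e.g. for "a" (0x61)
            # the uppercase mapping 0x41 becomes 0x41 - 0x61 = -32,
            # stored as 0xffe0)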
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish
    # between words inside a sentence and words ending a sentence;
    # the latter include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)
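
    # Because bit 7 is set only on a word's final character, a short
    # word can be found embedded at the end of a longer word that is
    # already in the lexicon: "ER" encodes as "E" + chr(ord("R")+128),
    # which is exactly the tail of the encoding of "LETTER".  Sorting
    # the escaped words by falling length above maximizes such reuse.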

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
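
    # Word indexes >= short use the two-byte escape form above: if
    # short were 250, word index 300 would be stored as the bytes
    # (300 >> 8) + 250 = 251 and 300 & 255 = 44, decoded again as
    # (251 - 250) * 256 + 44 = 300.  The assert checks that every
    # phrasebook entry still fits in an unsigned char.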

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # since 0 encodes "no change", the old value had better not be 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment; ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception): pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, numeric_changes),
                        normalization_changes))

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
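        # (UnicodeData.txt abbreviates large blocks such as the CJK
        # ideographs to a single pair of "<..., First>" and
        # "<..., Last>" records; each code point in between inherits
        # the First record, with its own code point patched in)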
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
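#
# collisions are resolved by open addressing: a probe increment is
# derived from the hash, and on every further miss the increment is
# doubled in GF(2^n) -- shifted left and, on overflow, XOR-folded with
# the polynomial paired with the table size in SIZES -- so the probe
# sequence keeps changing instead of cycling early.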

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
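
    # For example, splitting t = [7]*96 + [9]*32 at shift 5 chunks t
    # into four 32-entry bins, only two of which are distinct, giving
    # t1 = [0, 0, 0, 1] and t2 = [7]*32 + [9]*32; the identity above
    # then holds, e.g. t[100] == t2[(t1[100 >> 5] << 5) + (100 & 31)]
    # == t2[36] == 9.  (The loop below tries every shift and keeps the
    # cheapest result, which need not be shift 5.)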

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)