# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#
import re
import sys

SCRIPT = sys.argv[0]
VERSION = "2.5"    # script version stamped into the generated headers

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
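
# For orientation: each parsed UnicodeData.txt record is a list of
# semicolon-separated fields, e.g. for U+00C5:
#
#   00C5;LATIN CAPITAL LETTER A WITH RING ABOVE;Lu;0;L;0041 030A;;;;N;...
#
# so record[1] is the name, record[2] the general category, record[3]
# the canonical combining class, record[4] the bidirectional class,
# record[5] the decomposition, and record[9] the mirrored flag; the
# UnicodeData loader below appends the east asian width as record[15].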
# note: should match definitions in Objects/unicodectype.c
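# character type flags; the values below mirror the C definitions in
# Objects/unicodectype.c, and are referenced by makeunicodetype() below
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80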
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
# --------------------------------------------------------------------
# unicode character properties
def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i
    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 256

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] + \
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i
    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"
    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp
    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c
    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"
    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)
    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]: 0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()
# --------------------------------------------------------------------
# unicode character type tables
def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
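            # (worked example: for 'a' (0x61), record[12] is "0041", so
            # upper = 0x41 - 0x61 = -32, stored as the unsigned 16-bit
            # value 0xffe0; adding it back modulo 0x10000 recovers 'A')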
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
404 cache
[item
] = i
= len(table
)
408 print len(table
), "unique character type entries"
    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()
# --------------------------------------------------------------------
# unicode name database
def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"
    # collect unique words from names (note that we distinguish between
    # words inside a sentence and words ending a sentence; the latter
    # include the trailing null byte)
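    # (e.g. "LATIN SMALL LETTER A" is stored as "LATIN SMALL LETTER A\0"
    # and splits into the words "LATIN", "SMALL", "LETTER" and "A\0";
    # the final "A\0" is a different lexicon entry than a mid-name "A")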
    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for word in w:
                l = words.get(word)
                if l:
                    l.append(None)
                else:
                    words[word] = [len(words)]

    print n, "words in text;", b, "bytes"
    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)
    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"
    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)
    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
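        # (e.g. "CAPITAL" becomes "CAPITA" plus chr(ord("L")+128); a
        # decoder scans bytes until it finds one with bit 7 set)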
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)
    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for word in w:
                i = words[word]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
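    # (decoding sketch: a phrasebook byte below `short` is a word index
    # on its own; a byte b >= short is an escape, and the full index is
    # ((b - short) << 8) plus the following byte.  this keeps every
    # entry within one unsigned char, which the assert above verifies)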
    # unicode name hash table
    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))
    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)
    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()
def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"
    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment; ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, numeric_changes),
                        normalization_changes))
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk
class UnicodeData:

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2
        self.changed = []

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h
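# (usage sketch: myhash("latin", 47) and myhash("LATIN", 47) return the
# same 24-bit value, since the input is upper-cased before hashing; this
# is what lets name lookups ignore case)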
# candidate (table size, polynomial) pairs for the static hash below
SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
class Hash:

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly
    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))
# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data
    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")
def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4
def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
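    # (illustration: for t = [5]*4096 + [6]*4096 every bin of size
    # 2**shift is either all-5s or all-6s, so t2 keeps just one copy of
    # each distinct bin and t1 maps bin numbers to them; a few hundred
    # ints replace the original 8192)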
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]

    return best
if __name__ == "__main__":
    maketables(1)