#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl   created (based on bits and pieces from unidb)
# 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl   added character type table
# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl   expand first/last ranges
# 2001-01-19 fl   added character name tables (2.1)
# 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd   use string methods
# 2002-10-18 mvl  update to Unicode 3.2
# 2002-10-22 mvl  generate NFC tables
# 2002-11-24 mvl  expand all ranges, sort names version-independently
# 2002-11-25 mvl  add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl  update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh (fredrik@pythonware.com)
#

import sys
import re   # used by merge_old_version() below

SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
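
# makeunicodetype() below ORs these bits into a single flags field per
# character; e.g. an uppercase letter (category "Lu") ends up with
# ALPHA_MASK | UPPER_MASK set.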

def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties
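    # (identical property tuples are stored only once: "cache" maps a
    # tuple to its slot in "table", and "index" maps each code point to
    # that slot, so long runs of characters sharing the same properties
    # cost only index entries)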

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0
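
    # Each decomposition is flattened into decomp_data as one header
    # word followed by the decomposed code points: the header packs the
    # entry length into the high byte (len << 8) and an index into
    # decomp_prefix -- the "<compat>"-style tag, or "" for canonical
    # decompositions -- into the low byte.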

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                         map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l
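
    # NFC composition is looked up through a dense matrix: each
    # possible first character gets a row, each possible last character
    # a column, and comp_data[f*total_last+l] holds the composed code
    # point (0 if the pair does not compose).  The range lists built
    # above let the C code map a code point to its row/column number.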
    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp,"  { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp,"  {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, "    \"%s\"," % name
    print >>fp, "    NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
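
    # A sketch of how the C side resolves a code point through this
    # two-level table (mirroring the generated get_change_* helpers
    # further down):
    #
    #     index = index1[ch >> SHIFT];
    #     index = index2[(index << SHIFT) + (ch & ((1 << SHIFT) - 1))];
    #     record = &_PyUnicode_Database_Records[index];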

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
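            # (each mapping is stored as a signed 16-bit delta from the
            # character itself, masked to 0xffff; e.g. for "a" (0x61)
            # the uppercase mapping 0x41 becomes 0x41 - 0x61 = -32,
            # stored as 0xffe0)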
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, "    {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split char type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish
    # between words inside a sentence and words ending a sentence;
    # the latter include the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)
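
    # Because bit 7 is set only on a word's final character, a short
    # word can be found embedded at the end of a longer word that is
    # already in the lexicon: "ER" encodes as "E" + chr(ord("R")+128),
    # which is exactly the tail of the encoding of "LETTER".  Sorting
    # the escaped words by falling length above maximizes such reuse.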

    lexicon = map(ord, lexicon)

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1
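
    # Word indexes >= short use the two-byte escape form above: if
    # short were 250, word index 300 would be stored as the bytes
    # (300 >> 8) + 250 = 251 and 300 & 255 = 44, decoded again as
    # (251 - 250) * 256 + 44 = 300.  The assert checks that every
    # phrasebook entry still fits in an unsigned char.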

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set.  if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()

def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # since 0 encodes "no change", the old value had better not be 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment; ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        class Difference(Exception): pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, numeric_changes),
                        normalization_changes))

# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

import sys

class UnicodeData:

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
            if not s:
                break
            s = s.strip().split(";")
            char = int(s[0], 16)
            table[char] = s

        # expand first-last ranges
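        # (UnicodeData.txt abbreviates large blocks such as the CJK
        # ideographs to a single pair of "<..., First>" and
        # "<..., Last>" records; each code point in between inherits
        # the First record, with its own code point patched in)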
        if expand:
            field = None
            for i in range(0, 0x110000):
                s = table[i]
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
                    f2 = field[:]
                    f2[0] = "%X" % i
                    table[i] = f2

        # public attributes
        self.filename = filename
        self.table = table
        self.chars = range(0x110000) # unicode 3.2

        file = open(exclusions)
        self.exclusions = {}
        for s in file:
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            char = int(s.split()[0],16)
            self.exclusions[char] = 1

        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            if not s:
                continue
            if s[0] == '#':
                continue
            s = s.split()[0].split(';')
            if '..' in s[0]:
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])

    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)

# hash table tools

# this is a straightforward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
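#
# collisions are resolved by open addressing: a probe increment is
# derived from the hash, and on every further miss the increment is
# doubled in GF(2^n) -- shifted left and, on overflow, XOR-folded with
# the polynomial paired with the table size in SIZES -- so the probe
# sequence keeps changing instead of cycling early.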

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000L
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h

SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]

class Hash:

    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError, "ran out of polynomials"

        print size, "slots in hash table"

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print n, "collisions"
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))

# stuff to deal with arrays of unsigned integers

class Array:

    def __init__(self, name, data):
        self.name = name
        self.data = data

    def dump(self, file, trace=0):
        # write data to file, as a C array
        size = getsize(self.data)
        if trace:
            print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
        file.write("static ")
        if size == 1:
            file.write("unsigned char")
        elif size == 2:
            file.write("unsigned short")
        else:
            file.write("unsigned int")
        file.write(" " + self.name + "[] = {\n")
        if self.data:
            s = "    "
            for item in self.data:
                i = str(item) + ", "
                if len(s) + len(i) > 78:
                    file.write(s + "\n")
                    s = "    " + i
                else:
                    s = s + i
            if s.strip():
                file.write(s + "\n")
        file.write("};\n\n")

def getsize(data):
    # return smallest possible integer size for the given array
    maxdata = max(data)
    if maxdata < 256:
        return 1
    elif maxdata < 65536:
        return 2
    else:
        return 4

def splitbins(t, trace=0):
    """t, trace=0 -> (t1, t2, shift).  Split a table to save space.

    t is a sequence of ints.  This function can be useful to save space if
    many of the ints are the same.  t1 and t2 are lists of ints, and shift
    is an int, chosen to minimize the combined size of t1 and t2 (in C
    code), and where for each i in range(len(t)),
        t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    where mask is a bitmask isolating the last "shift" bits.

    If optional arg trace is non-zero (default zero), progress info
    is printed to sys.stderr.  The higher the value, the more info
    you'll get.
    """
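
    # For example, splitting t = [7]*96 + [9]*32 at shift 5 chunks t
    # into four 32-entry bins, only two of which are distinct, giving
    # t1 = [0, 0, 0, 1] and t2 = [7]*32 + [9]*32; the identity above
    # then holds, e.g. t[100] == t2[(t1[100 >> 5] << 5) + (100 & 31)]
    # == t2[36] == 9.  (The loop below tries every shift and keeps the
    # cheapest result, which need not be shift 5.)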

    import sys
    if trace:
        def dump(t1, t2, shift, bytes):
            print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
                len(t1), len(t2), shift, bytes)
        print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
                            "bytes"
    n = len(t)-1    # last valid index
    maxshift = 0    # the most we can shift n and still have something left
    if n > 0:
        while n >> 1:
            n >>= 1
            maxshift += 1
    del n
    bytes = sys.maxint  # smallest total size so far
    t = tuple(t)    # so slices can be dict keys
    for shift in range(maxshift + 1):
        t1 = []
        t2 = []
        size = 2**shift
        bincache = {}
        for i in range(0, len(t), size):
            bin = t[i:i+size]
            index = bincache.get(bin)
            if index is None:
                index = len(t2)
                bincache[bin] = index
                t2.extend(bin)
            t1.append(index >> shift)
        # determine memory size
        b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
        if trace > 1:
            dump(t1, t2, shift, b)
        if b < bytes:
            best = t1, t2, shift
            bytes = b
    t1, t2, shift = best
    if trace:
        print >>sys.stderr, "Best:",
        dump(t1, t2, shift, bytes)
    if __debug__:
        # exhaustively verify that the decomposition is correct
        mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
        for i in xrange(len(t)):
            assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
    return best

if __name__ == "__main__":
    maketables(1)