remove Author, there are a lot
[docutils/kirr.git] / sandbox / davidg / unispace.py
blobd69400517b48d341fc6f4a3ce7833443727d177b
1 #! /usr/bin/env python
3 """
4 Analysis of the re.UNICODE flag on whitespace recognition.
5 """
7 # Running this program produces this output:
8 """
9 Regular expressions:
11 1. '\\s'
12 2. '\\s', re.UNICODE
13 3. u'(?![\xa0\u202f])\\s', re.UNICODE
15 === ========= ======= ========================= =======
16 Cat Codepoint Decimal Name/Description Regexps
17 === ========= ======= ========================= =======
18 Cc U+0009 9 (HT) TAB \t 1 2 3
19 Cc U+000a 10 (LF) LINE FEED \n 1 2 3
20 Cc U+000b 11 (VT) VERTICAL TAB \v 1 2 3
21 Cc U+000c 12 (FF) FORM FEED \f 1 2 3
22 Cc U+000d 13 (CR) CARRIAGE RETURN \r 1 2 3
23 Cc U+001c 28 (FS) FILE SEPARATOR 2 3
24 Cc U+001d 29 (GS) GROUP SEPARATOR 2 3
25 Cc U+001e 30 (RS) RECORD SEPARATOR 2 3
26 Cc U+001f 31 (US) UNIT SEPARATOR 2 3
27 Zs U+0020 32 SPACE 1 2 3
28 Cc U+0085 133 (NEL) NEXT LINE 2 3
29 Zs U+00a0 160 NO-BREAK SPACE 2
30 Zs U+1680 5760 OGHAM SPACE MARK 2 3
31 Zs U+2000 8192 EN QUAD 2 3
32 Zs U+2001 8193 EM QUAD 2 3
33 Zs U+2002 8194 EN SPACE 2 3
34 Zs U+2003 8195 EM SPACE 2 3
35 Zs U+2004 8196 THREE-PER-EM SPACE 2 3
36 Zs U+2005 8197 FOUR-PER-EM SPACE 2 3
37 Zs U+2006 8198 SIX-PER-EM SPACE 2 3
38 Zs U+2007 8199 FIGURE SPACE 2 3
39 Zs U+2008 8200 PUNCTUATION SPACE 2 3
40 Zs U+2009 8201 THIN SPACE 2 3
41 Zs U+200a 8202 HAIR SPACE 2 3
42 Zs U+200b 8203 ZERO WIDTH SPACE 2 3
43 Zl U+2028 8232 LINE SEPARATOR 2 3
44 Zp U+2029 8233 PARAGRAPH SEPARATOR 2 3
45 Zs U+202f 8239 NARROW NO-BREAK SPACE 2
46 Zs U+205f 8287 MEDIUM MATHEMATICAL SPACE 2 3
47 Zs U+3000 12288 IDEOGRAPHIC SPACE 2 3
48 === ========= ======= ========================= =======
49 """
51 # For Unicode category information, see
52 # http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
53 """
54 ======== ====================
55 Category Description
56 ======== ====================
57 Zs Separator, Space
58 Zl Separator, Line
59 Zp Separator, Paragraph
60 Cc Other, Control
61 Cf Other, Format
62 ======== ====================
63 """
65 import re
66 import unicodedata
# Descriptions for control characters, keyed by code point.  They are used
# as the fallback label when unicodedata.name() has no name for a character.
# Keys are written in hex so they line up with the "Codepoint" column.
charnames = {
    0x09: '(HT) TAB \\t',
    0x0a: '(LF) LINE FEED \\n',
    0x0b: '(VT) VERTICAL TAB \\v',
    0x0c: '(FF) FORM FEED \\f',
    0x0d: '(CR) CARRIAGE RETURN \\r',
    0x1c: '(FS) FILE SEPARATOR',
    0x1d: '(GS) GROUP SEPARATOR',
    0x1e: '(RS) RECORD SEPARATOR',
    0x1f: '(US) UNIT SEPARATOR',
    0x85: '(NEL) NEXT LINE',
}
# The regular expressions under comparison.  Their 1-based position in this
# list is the number shown in the "Regexps" column of the report.
pats = [
    # 1. Whitespace class without any flags.
    re.compile(r'\s'),
    # 2. Whitespace class with the re.UNICODE flag.
    re.compile(r'\s', re.UNICODE),
    # 3. As 2, but a negative lookahead rejects the two no-break spaces
    #    (U+00A0 and U+202F).
    re.compile(u'(?![\u00a0\u202f])\\s', re.UNICODE),
]
# Rule and column titles of the report table.
border = '=== ========= ======= ========================= ======='
header = 'Cat Codepoint Decimal Name/Description Regexps'

# Preamble: list every pattern, numbered from 1, and note which ones were
# compiled with re.UNICODE.  A parenthesised single-argument print produces
# the same output as the bare print statement.
print('Regular expressions:\n')
for i, pat in enumerate(pats):
    flag = ', re.UNICODE' if pat.flags & re.UNICODE else ''
    print('%s. %r%s' % (i + 1, pat.pattern, flag))
print('')

# Table head: rule, titles, rule.
print('\n'.join([border, header, border]))
# Walk the Basic Multilingual Plane (U+0000..U+FFFF) and print one table row
# for every separator character (category Z*) and for every other C*
# character that at least one of the patterns treats as whitespace.
# `chars` collects the characters that were reported.
chars = []
for u in range(0x10000):
    c = unichr(u)
    category = unicodedata.category(c)
    # Z: whitespace; C: controls.  Index the first letter of the category;
    # a slice such as category[:0] is always '' and '' in 'ZC' is always
    # true, which would let every code point through the filter.
    if category[0] not in 'ZC':
        continue
    respace = 0     # number of patterns that match this character
    parts = []      # one cell per pattern: its number, or a blank
    for i, pat in enumerate(pats):
        if pat.search(c):
            parts.append(str(i + 1))
            respace += 1
        else:
            # Keep the cell so the pattern numbers stay in their columns.
            parts.append(' ')
    if category.startswith('Z') or respace:
        # Characters without a Unicode name fall back to the charnames
        # table, and finally to the repr of the character.
        print('%s U+%04x %5s %-25s %s'
              % (category, u, u,
                 unicodedata.name(c, charnames.get(u, repr(c))),
                 ' '.join(parts)))
        chars.append(c)
print(border)