4 Analysis of the re.UNICODE flag on whitespace recognition.
7 # Running this program produces this output:
13 3. u'(?![\xa0\u202f])\\s', re.UNICODE
15 === ========= ======= ========================= =======
16 Cat Codepoint Decimal Name/Description Regexps
17 === ========= ======= ========================= =======
18 Cc U+0009 9 (HT) TAB \t 1 2 3
19 Cc U+000a 10 (LF) LINE FEED \n 1 2 3
20 Cc U+000b 11 (VT) VERTICAL TAB \v 1 2 3
21 Cc U+000c 12 (FF) FORM FEED \f 1 2 3
22 Cc U+000d 13 (CR) CARRIAGE RETURN \r 1 2 3
23 Cc U+001c 28 (FS) FILE SEPARATOR 2 3
24 Cc U+001d 29 (GS) GROUP SEPARATOR 2 3
25 Cc U+001e 30 (RS) RECORD SEPARATOR 2 3
26 Cc U+001f 31 (US) UNIT SEPARATOR 2 3
27 Zs U+0020 32 SPACE 1 2 3
28 Cc U+0085 133 (NEL) NEXT LINE 2 3
29 Zs U+00a0 160 NO-BREAK SPACE 2
30 Zs U+1680 5760 OGHAM SPACE MARK 2 3
31 Zs U+2000 8192 EN QUAD 2 3
32 Zs U+2001 8193 EM QUAD 2 3
33 Zs U+2002 8194 EN SPACE 2 3
34 Zs U+2003 8195 EM SPACE 2 3
35 Zs U+2004 8196 THREE-PER-EM SPACE 2 3
36 Zs U+2005 8197 FOUR-PER-EM SPACE 2 3
37 Zs U+2006 8198 SIX-PER-EM SPACE 2 3
38 Zs U+2007 8199 FIGURE SPACE 2 3
39 Zs U+2008 8200 PUNCTUATION SPACE 2 3
40 Zs U+2009 8201 THIN SPACE 2 3
41 Zs U+200a 8202 HAIR SPACE 2 3
42 Zs U+200b 8203 ZERO WIDTH SPACE 2 3
43 Zl U+2028 8232 LINE SEPARATOR 2 3
44 Zp U+2029 8233 PARAGRAPH SEPARATOR 2 3
45 Zs U+202f 8239 NARROW NO-BREAK SPACE 2
46 Zs U+205f 8287 MEDIUM MATHEMATICAL SPACE 2 3
47 Zs U+3000 12288 IDEOGRAPHIC SPACE 2 3
48 === ========= ======= ========================= =======
51 # For Unicode category information, see
52 # http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
======== ====================
Category Description
======== ====================
Cc       Other, Control
Zl       Separator, Line
Zp       Separator, Paragraph
Zs       Separator, Space
======== ====================
# Descriptive labels for control characters, keyed by code point.
# They are passed as the fallback to unicodedata.name() when the report
# rows are printed, for characters the Unicode database does not name.
charnames = {
    0x09: '(HT) TAB \\t',
    0x0a: '(LF) LINE FEED \\n',
    0x0b: '(VT) VERTICAL TAB \\v',
    0x0c: '(FF) FORM FEED \\f',
    0x0d: '(CR) CARRIAGE RETURN \\r',
    0x1c: '(FS) FILE SEPARATOR',
    0x1d: '(GS) GROUP SEPARATOR',
    0x1e: '(RS) RECORD SEPARATOR',
    0x1f: '(US) UNIT SEPARATOR',
    0x85: '(NEL) NEXT LINE',
}
# The regular expressions under comparison; the report numbers them 1..3
# in this order:
#   1. plain \s with no flags
#   2. \s compiled with re.UNICODE
#   3. \s with re.UNICODE, with a negative lookahead that rejects
#      U+00A0 (NO-BREAK SPACE) and U+202F (NARROW NO-BREAK SPACE)
pats = [
    re.compile(r'\s'),
    re.compile(r'\s', re.UNICODE),
    re.compile(u'(?![\u00a0\u202f])\\s', re.UNICODE),
]
83 border
= '=== ========= ======= ========================= ======='
84 header
= 'Cat Codepoint Decimal Name/Description Regexps'
86 print 'Regular expressions:\n'
87 for i
, pat
in enumerate(pats
):
88 if pat
.flags
& re
.UNICODE
:
92 print '%s. %r%s' % (i
+ 1, pat
.pattern
, flag
)
100 for u
in range(0x10000):
102 category
= unicodedata
.category(c
)
103 if category
[:0] in 'ZC': # Z: whitespace; C: controls
106 for i
, pat
in enumerate(pats
):
108 parts
.append(str(i
+ 1))
112 if category
.startswith('Z') or respace
:
113 print ('%s U+%04x %5s %-25s %s'
115 unicodedata
.name(c
, charnames
.get(u
, repr(c
))),