sandbox/davidg/unispace.py

   1 #! /usr/bin/env python
   2
   3 """
   4 Analysis of the re.UNICODE flag on whitespace recognition.
   5 """
   6
   7 # Running this program produces this output:
   8 """
   9 Regular expressions:
  10
  11 1. '\\s'
  12 2. '\\s', re.UNICODE
  13 3. u'(?![\xa0\u202f])\\s', re.UNICODE
  14
  15 ===  =========  =======  =========================  =======
  16 Cat  Codepoint  Decimal  Name/Description           Regexps
  17 ===  =========  =======  =========================  =======
  18 Cc    U+0009         9   (HT) TAB \t                1 2 3
  19 Cc    U+000a        10   (LF) LINE FEED \n          1 2 3
  20 Cc    U+000b        11   (VT) VERTICAL TAB \v       1 2 3
  21 Cc    U+000c        12   (FF) FORM FEED \f          1 2 3
  22 Cc    U+000d        13   (CR) CARRIAGE RETURN \r    1 2 3
  23 Cc    U+001c        28   (FS) FILE SEPARATOR          2 3
  24 Cc    U+001d        29   (GS) GROUP SEPARATOR         2 3
  25 Cc    U+001e        30   (RS) RECORD SEPARATOR        2 3
  26 Cc    U+001f        31   (US) UNIT SEPARATOR          2 3
  27 Zs    U+0020        32   SPACE                      1 2 3
  28 Cc    U+0085       133   (NEL) NEXT LINE              2 3
  29 Zs    U+00a0       160   NO-BREAK SPACE               2
  30 Zs    U+1680      5760   OGHAM SPACE MARK             2 3
  31 Zs    U+2000      8192   EN QUAD                      2 3
  32 Zs    U+2001      8193   EM QUAD                      2 3
  33 Zs    U+2002      8194   EN SPACE                     2 3
  34 Zs    U+2003      8195   EM SPACE                     2 3
  35 Zs    U+2004      8196   THREE-PER-EM SPACE           2 3
  36 Zs    U+2005      8197   FOUR-PER-EM SPACE            2 3
  37 Zs    U+2006      8198   SIX-PER-EM SPACE             2 3
  38 Zs    U+2007      8199   FIGURE SPACE                 2 3
  39 Zs    U+2008      8200   PUNCTUATION SPACE            2 3
  40 Zs    U+2009      8201   THIN SPACE                   2 3
  41 Zs    U+200a      8202   HAIR SPACE                   2 3
  42 Zs    U+200b      8203   ZERO WIDTH SPACE             2 3
  43 Zl    U+2028      8232   LINE SEPARATOR               2 3
  44 Zp    U+2029      8233   PARAGRAPH SEPARATOR          2 3
  45 Zs    U+202f      8239   NARROW NO-BREAK SPACE        2
  46 Zs    U+205f      8287   MEDIUM MATHEMATICAL SPACE    2 3
  47 Zs    U+3000     12288   IDEOGRAPHIC SPACE            2 3
  48 ===  =========  =======  =========================  =======
  49 """
  50
  51 # For Unicode category information, see
  52 # http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values
  53 """
  54 ========  ====================
  55 Category  Description
  56 ========  ====================
  57 Zs        Separator, Space
  58 Zl        Separator, Line
  59 Zp        Separator, Paragraph
  60 Cc        Other, Control
  61 Cf        Other, Format
  62 ========  ====================
  63 """
  64
  65 import re
  66 import unicodedata
  67
  68 charnames = {9: '(HT) TAB \\t',
  69              10: '(LF) LINE FEED \\n',
  70              11: '(VT) VERTICAL TAB \\v',
  71              12: '(FF) FORM FEED \\f',
  72              13: '(CR) CARRIAGE RETURN \\r',
  73              28: '(FS) FILE SEPARATOR',
  74              29: '(GS) GROUP SEPARATOR',
  75              30: '(RS) RECORD SEPARATOR',
  76              31: '(US) UNIT SEPARATOR',
  77              133: '(NEL) NEXT LINE'}
  78
  79 pats = [re.compile(r'\s'),
  80         re.compile(r'\s', re.UNICODE),
  81         re.compile(u'(?![\u00a0\u202f])\\s', re.UNICODE),]
  82
  83 border = '===  =========  =======  =========================  ======='
  84 header = 'Cat  Codepoint  Decimal  Name/Description           Regexps'
  85
  86 print 'Regular expressions:\n'
  87 for i, pat in enumerate(pats):
  88     if pat.flags & re.UNICODE:
  89         flag = ', re.UNICODE'
  90     else:
  91         flag = ''
  92     print '%s. %r%s' % (i + 1, pat.pattern, flag)
  93 print
  94
  95 print border
  96 print header
  97 print border
  98
  99 chars = []
 100 for u in range(0x10000):
 101     c = unichr(u)
 102     category = unicodedata.category(c)
 103     if category[:0] in 'ZC':            # Z: whitespace; C: controls
 104         respace = 0
 105         parts = []
 106         for i, pat in enumerate(pats):
 107             if pat.search(c):
 108                 parts.append(str(i + 1))
 109                 respace += 1
 110             else:
 111                 parts.append(' ')
 112         if category.startswith('Z') or respace:
 113             print ('%s    U+%04x     %5s   %-25s  %s'
 114                    % (category, u, u,
 115                       unicodedata.name(c, charnames.get(u, repr(c))),
 116                       ' '.join(parts)))
 117             chars.append(c)
 118 print border