1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
    the token type (see token.py)
    the token string (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
# NOTE(review): the source showed this string as an orphan expression with no
# assignment; restored the conventional __credits__ binding it belongs to.
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
# Public API: every public name from token.py plus this module's additions.
__all__ = [name for name in dir(token) if not name.startswith('_')] + [
    "COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
# Extend token.py's numbering with two token types that the C tokenizer
# discards but this module reports: COMMENT and NL (non-logical newline).
# NOTE(review): only the tok_name[COMMENT] line survived in the source; the
# companion definitions are restored here because COMMENT and NL are used
# throughout this module (and listed in __all__) but defined nowhere else.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
def group(*choices):
    """Join the alternatives into one parenthesized regex group."""
    return '(%s)' % '|'.join(choices)
def any(*choices):
    """Regex matching zero or more repetitions of any alternative.

    NOTE: deliberately shadows the builtin any(); kept for compatibility
    with the rest of this module.
    """
    alternatives = '|'.join(choices)
    return '(%s)*' % alternatives
def maybe(*choices):
    """Regex matching at most one occurrence of any alternative."""
    alternatives = '|'.join(choices)
    return '(%s)?' % alternatives
# Regular-expression fragments (plain strings) from which the tokenizer's
# master patterns are composed below.

Whitespace = r'[ \f\t]*'        # horizontal whitespace only, not newlines
Comment = r'#[^\r\n]*'          # a comment runs to the end of the line
# Ignorable text: whitespace, backslash-newline continuations, a comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'          # a Python 2 identifier

# Integer literals, each with the optional Python 2 long suffix l/L.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'     # a leading 0 means octal in Python 2
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating-point and imaginary literals.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Most specific alternatives first so the longest literal wins.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
# NOTE(review): the Operator call was truncated mid-argument-list and the
# Bracket definition was missing entirely (though Funny references it);
# both restored to the canonical Python 2 tokenizer patterns.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
# Any operator-like token: operators, brackets, or special punctuation.
Funny = group(Operator, Bracket, Special)
# A complete token: optional ignorable prefix followed by one real token.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string: either the string closes on this
# line, or it ends with a backslash-newline continuation.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Extra things a pseudo-token can start with: an explicit line
# continuation, a comment, or a triple-quote opener.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# The pattern generate_tokens() actually scans with; its group 1 spans the
# token text (see pseudomatch.span(1) below).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# Compile the master patterns once at import time.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map a string opener (quote with optional u/r prefixes, in every case
# combination) to the compiled pattern matching the rest of that string;
# bare prefix keys map to None (the prefix alone identifies no end pattern).
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
# Lookup tables of every legal string opener (all u/r prefix case
# combinations), used by generate_tokens() to classify string tokens.
# NOTE(review): the dict initializations, loop bodies, the second for
# header, and the tabsize constant were lost in this copy; restored to the
# canonical Python 2 tokenizer form.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8     # column width of one tab stop, used when measuring indents
class TokenError(Exception):
    """Raised when the input ends inside a multi-line string or statement."""
class StopTokenizing(Exception):
    """Raised by a tokeneater callback to stop tokenize() early."""
# Default tokeneater for tokenize(): prints one line per token in the form
# "srow,scol-erow,ecol:  TYPE  'text'".  Note the Python 2-only features:
# tuple unpacking in the parameter list and the print statement.  The
# `type` parameter deliberately shadows the builtin (legacy API).
def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    # NOTE(review): the docstring delimiters and the try/pass lines were
    # lost in this copy; restored.  A tokeneater may raise StopTokenizing
    # to abort tokenization early without error.
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple from generate_tokens(readline) to tokeneater."""
    for tok in generate_tokens(readline):
        tokeneater(*tok)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    # NOTE(review): the loop header, INDENT/DEDENT handling, and the return
    # were lost in this copy; restored to the canonical 2-tuple-consuming
    # implementation.  Also fixed the docstring typo "for tokin".
    startline = False
    indents = []
    toks = []
    toks_append = toks.append       # bind once; appended to in the loop
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '           # keep adjacent names/numbers separated

        if toknum == INDENT:
            indents.append(tokval)  # remember literal indent text
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True        # next real token starts a fresh line
        elif startline and indents:
            toks_append(indents[-1])    # re-emit current indentation
            startline = False
        toks_append(tokval)
    return ''.join(toks)
203 def generate_tokens(readline
):
205 The generate_tokens() generator requires one argment, readline, which
206 must be a callable object which provides the same interface as the
207 readline() method of built-in file objects. Each call to the function
208 should return one line of input as a string. Alternately, readline
209 can be a callable function terminating with StopIteration:
210 readline = open(myfile).next # Example of alternate readline
212 The generator produces 5-tuples with these members: the token type; the
213 token string; a 2-tuple (srow, scol) of ints specifying the row and
214 column where the token begins in the source; a 2-tuple (erow, ecol) of
215 ints specifying the row and column where the token ends in the source;
216 and the line on which the token was found. The line passed is the
217 logical line; continuation lines are included.
219 lnum
= parenlev
= continued
= 0
220 namechars
, numchars
= string
.ascii_letters
+ '_', '0123456789'
221 contstr
, needcont
= '', 0
225 while 1: # loop over lines in stream
228 except StopIteration:
231 pos
, max = 0, len(line
)
233 if contstr
: # continued string
235 raise TokenError
, ("EOF in multi-line string", strstart
)
236 endmatch
= endprog
.match(line
)
238 pos
= end
= endmatch
.end(0)
239 yield (STRING
, contstr
+ line
[:end
],
240 strstart
, (lnum
, end
), contline
+ line
)
241 contstr
, needcont
= '', 0
243 elif needcont
and line
[-2:] != '\\\n' and line
[-3:] != '\\\r\n':
244 yield (ERRORTOKEN
, contstr
+ line
,
245 strstart
, (lnum
, len(line
)), contline
)
250 contstr
= contstr
+ line
251 contline
= contline
+ line
254 elif parenlev
== 0 and not continued
: # new statement
257 while pos
< max: # measure leading whitespace
258 if line
[pos
] == ' ': column
= column
+ 1
259 elif line
[pos
] == '\t': column
= (column
/tabsize
+ 1)*tabsize
260 elif line
[pos
] == '\f': column
= 0
265 if line
[pos
] in '#\r\n': # skip comments or blank lines
266 yield ((NL
, COMMENT
)[line
[pos
] == '#'], line
[pos
:],
267 (lnum
, pos
), (lnum
, len(line
)), line
)
270 if column
> indents
[-1]: # count indents or dedents
271 indents
.append(column
)
272 yield (INDENT
, line
[:pos
], (lnum
, 0), (lnum
, pos
), line
)
273 while column
< indents
[-1]:
274 if column
not in indents
:
275 raise IndentationError(
276 "unindent does not match any outer indentation level")
277 indents
= indents
[:-1]
278 yield (DEDENT
, '', (lnum
, pos
), (lnum
, pos
), line
)
280 else: # continued statement
282 raise TokenError
, ("EOF in multi-line statement", (lnum
, 0))
286 pseudomatch
= pseudoprog
.match(line
, pos
)
287 if pseudomatch
: # scan for tokens
288 start
, end
= pseudomatch
.span(1)
289 spos
, epos
, pos
= (lnum
, start
), (lnum
, end
), end
290 token
, initial
= line
[start
:end
], line
[start
]
292 if initial
in numchars
or \
293 (initial
== '.' and token
!= '.'): # ordinary number
294 yield (NUMBER
, token
, spos
, epos
, line
)
295 elif initial
in '\r\n':
296 yield (parenlev
> 0 and NL
or NEWLINE
,
297 token
, spos
, epos
, line
)
299 yield (COMMENT
, token
, spos
, epos
, line
)
300 elif token
in triple_quoted
:
301 endprog
= endprogs
[token
]
302 endmatch
= endprog
.match(line
, pos
)
303 if endmatch
: # all on one line
304 pos
= endmatch
.end(0)
305 token
= line
[start
:pos
]
306 yield (STRING
, token
, spos
, (lnum
, pos
), line
)
308 strstart
= (lnum
, start
) # multiple lines
309 contstr
= line
[start
:]
312 elif initial
in single_quoted
or \
313 token
[:2] in single_quoted
or \
314 token
[:3] in single_quoted
:
315 if token
[-1] == '\n': # continued string
316 strstart
= (lnum
, start
)
317 endprog
= (endprogs
[initial
] or endprogs
[token
[1]] or
319 contstr
, needcont
= line
[start
:], 1
322 else: # ordinary string
323 yield (STRING
, token
, spos
, epos
, line
)
324 elif initial
in namechars
: # ordinary name
325 yield (NAME
, token
, spos
, epos
, line
)
326 elif initial
== '\\': # continued stmt
329 if initial
in '([{': parenlev
= parenlev
+ 1
330 elif initial
in ')]}': parenlev
= parenlev
- 1
331 yield (OP
, token
, spos
, epos
, line
)
333 yield (ERRORTOKEN
, line
[pos
],
334 (lnum
, pos
), (lnum
, pos
+1), line
)
337 for indent
in indents
[1:]: # pop remaining indent levels
338 yield (DEDENT
, '', (lnum
, 0), (lnum
, 0), '')
339 yield (ENDMARKER
, '', (lnum
, 0), (lnum
, 0), '')
if __name__ == '__main__':                     # testing
    # Tokenize the file named on the command line, or stdin when no
    # argument is given, printing each token via the default tokeneater.
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)