1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

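# Note (illustrative, not part of the original module): these helpers simply
# build alternation/repetition patterns as strings, e.g.
#     group('a', 'b')  ==  '(a|b)'
#     any(r'\d')       ==  r'(\d)*'
#     maybe('x')       ==  '(x)?'
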
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

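# Illustrative usage (not part of the original module): passing a custom
# tokeneater callback to the old-style tokenize() interface, assuming
# Python 2's StringIO module for an in-memory readline.
#
#     from StringIO import StringIO
#     def eater(type, token, start, end, line):
#         print tok_name.get(type, type), repr(token)
#     tokenize(StringIO("a = b + c\n").readline, eater)
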
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER):
                tokval += ' '
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

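# Illustrative usage (not part of the original module): round-tripping a small
# piece of source through generate_tokens() and untokenize(), assuming
# Python 2's StringIO module.  With full 5-tuples the output matches the
# input exactly, per the docstring above.
#
#     from StringIO import StringIO
#     source = "if x:\n    y = 1\n"
#     toks = list(generate_tokens(StringIO(source).readline))
#     assert untokenize(toks) == source
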
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                break
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

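# Illustrative usage (not part of the original module): tokenizing source held
# in memory, assuming Python 2's StringIO module.
#
#     from StringIO import StringIO
#     for toktype, tokstr, spos, epos, ln in generate_tokens(
#             StringIO("x = 1 + 2\n").readline):
#         print tok_name[toktype], repr(tokstr), spos, epos
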
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)