Lib/tokenize.py

   1 """Tokenization help for Python programs.
   2
   3 generate_tokens(readline) is a generator that breaks a stream of
   4 text into Python tokens.  It accepts a readline-like method which is called
   5 repeatedly to get the next line of input (or "" for EOF).  It generates
   6 5-tuples with these members:
   7
   8     the token type (see token.py)
   9     the token (a string)
  10     the starting (row, column) indices of the token (a 2-tuple of ints)
  11     the ending (row, column) indices of the token (a 2-tuple of ints)
  12     the original line (string)
  13
  14 It is designed to match the working of the Python tokenizer exactly, except
  15 that it produces COMMENT tokens for comments and gives type OP for all
  16 operators
  17
  18 Older entry points
  19     tokenize_loop(readline, tokeneater)
  20     tokenize(readline, tokeneater=printtoken)
  21 are the same, except instead of generating tokens, tokeneater is a callback
  22 function to which the 5 fields described above are passed as 5 arguments,
  23 each time a new token is found."""
  24
  25 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  26 __credits__ = \
  27     'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'
  28
  29 import string, re
  30 from token import *
  31
  32 import token
  33 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
  34            "generate_tokens", "NL", "untokenize"]
  35 del x
  36 del token
  37
  38 COMMENT = N_TOKENS
  39 tok_name[COMMENT] = 'COMMENT'
  40 NL = N_TOKENS + 1
  41 tok_name[NL] = 'NL'
  42 N_TOKENS += 2
  43
  44 def group(*choices): return '(' + '|'.join(choices) + ')'
  45 def any(*choices): return group(*choices) + '*'
  46 def maybe(*choices): return group(*choices) + '?'
  47
  48 Whitespace = r'[ \f\t]*'
  49 Comment = r'#[^\r\n]*'
  50 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
  51 Name = r'[a-zA-Z_]\w*'
  52
  53 Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
  54 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
  55 Binnumber = r'0[bB][01]+[lL]?'
  56 Decnumber = r'[1-9]\d*[lL]?'
  57 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
  58 Exponent = r'[eE][-+]?\d+'
  59 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
  60 Expfloat = r'\d+' + Exponent
  61 Floatnumber = group(Pointfloat, Expfloat)
  62 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
  63 Number = group(Imagnumber, Floatnumber, Intnumber)
  64
  65 # Tail end of ' string.
  66 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
  67 # Tail end of " string.
  68 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
  69 # Tail end of ''' string.
  70 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
  71 # Tail end of """ string.
  72 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
  73 Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
  74 # Single-line ' or " string.
  75 String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
  76                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
  77
  78 # Because of leftmost-then-longest match semantics, be sure to put the
  79 # longest operators first (e.g., if = came before ==, == would get
  80 # recognized as two instances of =).
  81 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
  82                  r"//=?",
  83                  r"[+\-*/%&|^=<>]=?",
  84                  r"~")
  85
  86 Bracket = '[][(){}]'
  87 Special = group(r'\r?\n', r'[:;.,`@]')
  88 Funny = group(Operator, Bracket, Special)
  89
  90 PlainToken = group(Number, Funny, String, Name)
  91 Token = Ignore + PlainToken
  92
  93 # First (or only) line of ' or " string.
  94 ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
  95                 group("'", r'\\\r?\n'),
  96                 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
  97                 group('"', r'\\\r?\n'))
  98 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
  99 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 100
 101 tokenprog, pseudoprog, single3prog, double3prog = map(
 102     re.compile, (Token, PseudoToken, Single3, Double3))
 103 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
 104             "'''": single3prog, '"""': double3prog,
 105             "r'''": single3prog, 'r"""': double3prog,
 106             "u'''": single3prog, 'u"""': double3prog,
 107             "ur'''": single3prog, 'ur"""': double3prog,
 108             "R'''": single3prog, 'R"""': double3prog,
 109             "U'''": single3prog, 'U"""': double3prog,
 110             "uR'''": single3prog, 'uR"""': double3prog,
 111             "Ur'''": single3prog, 'Ur"""': double3prog,
 112             "UR'''": single3prog, 'UR"""': double3prog,
 113             "b'''": single3prog, 'b"""': double3prog,
 114             "br'''": single3prog, 'br"""': double3prog,
 115             "B'''": single3prog, 'B"""': double3prog,
 116             "bR'''": single3prog, 'bR"""': double3prog,
 117             "Br'''": single3prog, 'Br"""': double3prog,
 118             "BR'''": single3prog, 'BR"""': double3prog,
 119             'r': None, 'R': None, 'u': None, 'U': None,
 120             'b': None, 'B': None}
 121
 122 triple_quoted = {}
 123 for t in ("'''", '"""',
 124           "r'''", 'r"""', "R'''", 'R"""',
 125           "u'''", 'u"""', "U'''", 'U"""',
 126           "ur'''", 'ur"""', "Ur'''", 'Ur"""',
 127           "uR'''", 'uR"""', "UR'''", 'UR"""',
 128           "b'''", 'b"""', "B'''", 'B"""',
 129           "br'''", 'br"""', "Br'''", 'Br"""',
 130           "bR'''", 'bR"""', "BR'''", 'BR"""'):
 131     triple_quoted[t] = t
 132 single_quoted = {}
 133 for t in ("'", '"',
 134           "r'", 'r"', "R'", 'R"',
 135           "u'", 'u"', "U'", 'U"',
 136           "ur'", 'ur"', "Ur'", 'Ur"',
 137           "uR'", 'uR"', "UR'", 'UR"',
 138           "b'", 'b"', "B'", 'B"',
 139           "br'", 'br"', "Br'", 'Br"',
 140           "bR'", 'bR"', "BR'", 'BR"' ):
 141     single_quoted[t] = t
 142
 143 tabsize = 8
 144
 145 class TokenError(Exception): pass
 146
 147 class StopTokenizing(Exception): pass
 148
 149 def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
 150     print "%d,%d-%d,%d:\t%s\t%s" % \
 151         (srow, scol, erow, ecol, tok_name[type], repr(token))
 152
 153 def tokenize(readline, tokeneater=printtoken):
 154     """
 155     The tokenize() function accepts two parameters: one representing the
 156     input stream, and one providing an output mechanism for tokenize().
 157
 158     The first parameter, readline, must be a callable object which provides
 159     the same interface as the readline() method of built-in file objects.
 160     Each call to the function should return one line of input as a string.
 161
 162     The second parameter, tokeneater, must also be a callable object. It is
 163     called once for each token, with five arguments, corresponding to the
 164     tuples generated by generate_tokens().
 165     """
 166     try:
 167         tokenize_loop(readline, tokeneater)
 168     except StopTokenizing:
 169         pass
 170
 171 # backwards compatible interface
 172 def tokenize_loop(readline, tokeneater):
 173     for token_info in generate_tokens(readline):
 174         tokeneater(*token_info)
 175
 176 class Untokenizer:
 177
 178     def __init__(self):
 179         self.tokens = []
 180         self.prev_row = 1
 181         self.prev_col = 0
 182
 183     def add_whitespace(self, start):
 184         row, col = start
 185         assert row <= self.prev_row
 186         col_offset = col - self.prev_col
 187         if col_offset:
 188             self.tokens.append(" " * col_offset)
 189
 190     def untokenize(self, iterable):
 191         for t in iterable:
 192             if len(t) == 2:
 193                 self.compat(t, iterable)
 194                 break
 195             tok_type, token, start, end, line = t
 196             self.add_whitespace(start)
 197             self.tokens.append(token)
 198             self.prev_row, self.prev_col = end
 199             if tok_type in (NEWLINE, NL):
 200                 self.prev_row += 1
 201                 self.prev_col = 0
 202         return "".join(self.tokens)
 203
 204     def compat(self, token, iterable):
 205         startline = False
 206         indents = []
 207         toks_append = self.tokens.append
 208         toknum, tokval = token
 209         if toknum in (NAME, NUMBER):
 210             tokval += ' '
 211         if toknum in (NEWLINE, NL):
 212             startline = True
 213         prevstring = False
 214         for tok in iterable:
 215             toknum, tokval = tok[:2]
 216
 217             if toknum in (NAME, NUMBER):
 218                 tokval += ' '
 219
 220             # Insert a space between two consecutive strings
 221             if toknum == STRING:
 222                 if prevstring:
 223                     tokval = ' ' + tokval
 224                 prevstring = True
 225             else:
 226                 prevstring = False
 227
 228             if toknum == INDENT:
 229                 indents.append(tokval)
 230                 continue
 231             elif toknum == DEDENT:
 232                 indents.pop()
 233                 continue
 234             elif toknum in (NEWLINE, NL):
 235                 startline = True
 236             elif startline and indents:
 237                 toks_append(indents[-1])
 238                 startline = False
 239             toks_append(tokval)
 240
 241 def untokenize(iterable):
 242     """Transform tokens back into Python source code.
 243
 244     Each element returned by the iterable must be a token sequence
 245     with at least two elements, a token number and token value.  If
 246     only two tokens are passed, the resulting output is poor.
 247
 248     Round-trip invariant for full input:
 249         Untokenized source will match input source exactly
 250
 251     Round-trip invariant for limited intput:
 252         # Output text will tokenize the back to the input
 253         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 254         newcode = untokenize(t1)
 255         readline = iter(newcode.splitlines(1)).next
 256         t2 = [tok[:2] for tok in generate_tokens(readline)]
 257         assert t1 == t2
 258     """
 259     ut = Untokenizer()
 260     return ut.untokenize(iterable)
 261
 262 def generate_tokens(readline):
 263     """
 264     The generate_tokens() generator requires one argment, readline, which
 265     must be a callable object which provides the same interface as the
 266     readline() method of built-in file objects. Each call to the function
 267     should return one line of input as a string.  Alternately, readline
 268     can be a callable function terminating with StopIteration:
 269         readline = open(myfile).next    # Example of alternate readline
 270
 271     The generator produces 5-tuples with these members: the token type; the
 272     token string; a 2-tuple (srow, scol) of ints specifying the row and
 273     column where the token begins in the source; a 2-tuple (erow, ecol) of
 274     ints specifying the row and column where the token ends in the source;
 275     and the line on which the token was found. The line passed is the
 276     logical line; continuation lines are included.
 277     """
 278     lnum = parenlev = continued = 0
 279     namechars, numchars = string.ascii_letters + '_', '0123456789'
 280     contstr, needcont = '', 0
 281     contline = None
 282     indents = [0]
 283
 284     while 1:                                   # loop over lines in stream
 285         try:
 286             line = readline()
 287         except StopIteration:
 288             line = ''
 289         lnum = lnum + 1
 290         pos, max = 0, len(line)
 291
 292         if contstr:                            # continued string
 293             if not line:
 294                 raise TokenError, ("EOF in multi-line string", strstart)
 295             endmatch = endprog.match(line)
 296             if endmatch:
 297                 pos = end = endmatch.end(0)
 298                 yield (STRING, contstr + line[:end],
 299                        strstart, (lnum, end), contline + line)
 300                 contstr, needcont = '', 0
 301                 contline = None
 302             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
 303                 yield (ERRORTOKEN, contstr + line,
 304                            strstart, (lnum, len(line)), contline)
 305                 contstr = ''
 306                 contline = None
 307                 continue
 308             else:
 309                 contstr = contstr + line
 310                 contline = contline + line
 311                 continue
 312
 313         elif parenlev == 0 and not continued:  # new statement
 314             if not line: break
 315             column = 0
 316             while pos < max:                   # measure leading whitespace
 317                 if line[pos] == ' ': column = column + 1
 318                 elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
 319                 elif line[pos] == '\f': column = 0
 320                 else: break
 321                 pos = pos + 1
 322             if pos == max: break
 323
 324             if line[pos] in '#\r\n':           # skip comments or blank lines
 325                 if line[pos] == '#':
 326                     comment_token = line[pos:].rstrip('\r\n')
 327                     nl_pos = pos + len(comment_token)
 328                     yield (COMMENT, comment_token,
 329                            (lnum, pos), (lnum, pos + len(comment_token)), line)
 330                     yield (NL, line[nl_pos:],
 331                            (lnum, nl_pos), (lnum, len(line)), line)
 332                 else:
 333                     yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
 334                            (lnum, pos), (lnum, len(line)), line)
 335                 continue
 336
 337             if column > indents[-1]:           # count indents or dedents
 338                 indents.append(column)
 339                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
 340             while column < indents[-1]:
 341                 if column not in indents:
 342                     raise IndentationError(
 343                         "unindent does not match any outer indentation level",
 344                         ("<tokenize>", lnum, pos, line))
 345                 indents = indents[:-1]
 346                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
 347
 348         else:                                  # continued statement
 349             if not line:
 350                 raise TokenError, ("EOF in multi-line statement", (lnum, 0))
 351             continued = 0
 352
 353         while pos < max:
 354             pseudomatch = pseudoprog.match(line, pos)
 355             if pseudomatch:                                # scan for tokens
 356                 start, end = pseudomatch.span(1)
 357                 spos, epos, pos = (lnum, start), (lnum, end), end
 358                 token, initial = line[start:end], line[start]
 359
 360                 if initial in numchars or \
 361                    (initial == '.' and token != '.'):      # ordinary number
 362                     yield (NUMBER, token, spos, epos, line)
 363                 elif initial in '\r\n':
 364                     yield (NL if parenlev > 0 else NEWLINE,
 365                            token, spos, epos, line)
 366                 elif initial == '#':
 367                     assert not token.endswith("\n")
 368                     yield (COMMENT, token, spos, epos, line)
 369                 elif token in triple_quoted:
 370                     endprog = endprogs[token]
 371                     endmatch = endprog.match(line, pos)
 372                     if endmatch:                           # all on one line
 373                         pos = endmatch.end(0)
 374                         token = line[start:pos]
 375                         yield (STRING, token, spos, (lnum, pos), line)
 376                     else:
 377                         strstart = (lnum, start)           # multiple lines
 378                         contstr = line[start:]
 379                         contline = line
 380                         break
 381                 elif initial in single_quoted or \
 382                     token[:2] in single_quoted or \
 383                     token[:3] in single_quoted:
 384                     if token[-1] == '\n':                  # continued string
 385                         strstart = (lnum, start)
 386                         endprog = (endprogs[initial] or endprogs[token[1]] or
 387                                    endprogs[token[2]])
 388                         contstr, needcont = line[start:], 1
 389                         contline = line
 390                         break
 391                     else:                                  # ordinary string
 392                         yield (STRING, token, spos, epos, line)
 393                 elif initial in namechars:                 # ordinary name
 394                     yield (NAME, token, spos, epos, line)
 395                 elif initial == '\\':                      # continued stmt
 396                     continued = 1
 397                 else:
 398                     if initial in '([{': parenlev = parenlev + 1
 399                     elif initial in ')]}': parenlev = parenlev - 1
 400                     yield (OP, token, spos, epos, line)
 401             else:
 402                 yield (ERRORTOKEN, line[pos],
 403                            (lnum, pos), (lnum, pos+1), line)
 404                 pos = pos + 1
 405
 406     for indent in indents[1:]:                 # pop remaining indent levels
 407         yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
 408     yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 409
 410 if __name__ == '__main__':                     # testing
 411     import sys
 412     if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
 413     else: tokenize(sys.stdin.readline)