Lib/tokenize.py

   1 """Tokenization help for Python programs.
   2
   3 generate_tokens(readline) is a generator that breaks a stream of
   4 text into Python tokens.  It accepts a readline-like method which is called
   5 repeatedly to get the next line of input (or "" for EOF).  It generates
   6 5-tuples with these members:
   7
   8     the token type (see token.py)
   9     the token (a string)
  10     the starting (row, column) indices of the token (a 2-tuple of ints)
  11     the ending (row, column) indices of the token (a 2-tuple of ints)
  12     the original line (string)
  13
  14 It is designed to match the working of the Python tokenizer exactly, except
  15 that it produces COMMENT tokens for comments and gives type OP for all
  16 operators
  17
  18 Older entry points
  19     tokenize_loop(readline, tokeneater)
  20     tokenize(readline, tokeneater=printtoken)
  21 are the same, except instead of generating tokens, tokeneater is a callback
  22 function to which the 5 fields described above are passed as 5 arguments,
  23 each time a new token is found."""
  24
# Module metadata.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

import string, re
from token import *

# Build the public API: every public name of the token module, plus the
# names this module adds itself.  The token module is imported by name only
# to enumerate its attributes, so it is deleted again afterwards, together
# with the comprehension variable x that the code expects to still be bound
# at module level.
import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

# Register two extra token types directly after the existing ones:
# COMMENT takes the value N_TOKENS and NL takes N_TOKENS + 1, and both get
# a printable name in tok_name.  N_TOKENS is then bumped by two so it
# accounts for the new types.  (N_TOKENS and tok_name are not defined in
# this file; presumably they arrive via the star-import of token above.)
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
  43
  44 def group(*choices): return '(' + '|'.join(choices) + ')'
  45 def any(*choices): return group(*choices) + '*'
  46 def maybe(*choices): return group(*choices) + '?'
  47
  48 Whitespace = r'[ \f\t]*'
  49 Comment = r'#[^\r\n]*'
  50 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
  51 Name = r'[a-zA-Z_]\w*'
  52
  53 Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
  54 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
  55 Binnumber = r'0[bB][01]+[lL]?'
  56 Decnumber = r'[1-9]\d*[lL]?'
  57 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
  58 Exponent = r'[eE][-+]?\d+'
  59 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
  60 Expfloat = r'\d+' + Exponent
  61 Floatnumber = group(Pointfloat, Expfloat)
  62 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
  63 Number = group(Imagnumber, Floatnumber, Intnumber)
  64
  65 # Tail end of ' string.
  66 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
  67 # Tail end of " string.
  68 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
  69 # Tail end of ''' string.
  70 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
  71 # Tail end of """ string.
  72 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
  73 Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
  74 # Single-line ' or " string.
  75 String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
  76                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
  77
  78 # Because of leftmost-then-longest match semantics, be sure to put the
  79 # longest operators first (e.g., if = came before ==, == would get
  80 # recognized as two instances of =).
  81 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
  82                  r"//=?",
  83                  r"[+\-*/%&|^=<>]=?",
  84                  r"~")
  85
  86 Bracket = '[][(){}]'
  87 Special = group(r'\r?\n', r'[:;.,`@]')
  88 Funny = group(Operator, Bracket, Special)
  89
  90 PlainToken = group(Number, Funny, String, Name)
  91 Token = Ignore + PlainToken
  92
  93 # First (or only) line of ' or " string.
  94 ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
  95                 group("'", r'\\\r?\n'),
  96                 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
  97                 group('"', r'\\\r?\n'))
  98 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
  99 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 100
 101 tokenprog, pseudoprog, single3prog, double3prog = map(
 102     re.compile, (Token, PseudoToken, Single3, Double3))
# endprogs maps the opening of a string to the compiled pattern that matches
# the remainder of that string: a bare quote character for single-quoted
# strings, and every prefix/quote spelling for triple-quoted ones.  The
# single prefix letters map to None so that the lookup chain in
# generate_tokens (endprogs[initial] or endprogs[token[1]] or ...) falls
# through the prefix characters until it reaches the quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

# triple_quoted is used for membership tests only (each key maps to itself):
# it holds every spelling of a triple-quoted string opener.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
# single_quoted is the same kind of lookup table for single-quoted string
# openers; generate_tokens tests the first one, two or three characters of
# a token against it.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

# Tab stop width used by generate_tokens when measuring leading whitespace.
tabsize = 8
 144
 145 class TokenError(Exception): pass
 146
 147 class StopTokenizing(Exception): pass
 148
 149 def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
 150     srow, scol = srow_scol
 151     erow, ecol = erow_ecol
 152     print "%d,%d-%d,%d:\t%s\t%s" % \
 153         (srow, scol, erow, ecol, tok_name[type], repr(token))
 154
 155 def tokenize(readline, tokeneater=printtoken):
 156     """
 157     The tokenize() function accepts two parameters: one representing the
 158     input stream, and one providing an output mechanism for tokenize().
 159
 160     The first parameter, readline, must be a callable object which provides
 161     the same interface as the readline() method of built-in file objects.
 162     Each call to the function should return one line of input as a string.
 163
 164     The second parameter, tokeneater, must also be a callable object. It is
 165     called once for each token, with five arguments, corresponding to the
 166     tuples generated by generate_tokens().
 167     """
 168     try:
 169         tokenize_loop(readline, tokeneater)
 170     except StopTokenizing:
 171         pass
 172
 173 # backwards compatible interface
 174 def tokenize_loop(readline, tokeneater):
 175     for token_info in generate_tokens(readline):
 176         tokeneater(*token_info)
 177
 178 class Untokenizer:
 179
 180     def __init__(self):
 181         self.tokens = []
 182         self.prev_row = 1
 183         self.prev_col = 0
 184
 185     def add_whitespace(self, start):
 186         row, col = start
 187         assert row <= self.prev_row
 188         col_offset = col - self.prev_col
 189         if col_offset:
 190             self.tokens.append(" " * col_offset)
 191
 192     def untokenize(self, iterable):
 193         for t in iterable:
 194             if len(t) == 2:
 195                 self.compat(t, iterable)
 196                 break
 197             tok_type, token, start, end, line = t
 198             self.add_whitespace(start)
 199             self.tokens.append(token)
 200             self.prev_row, self.prev_col = end
 201             if tok_type in (NEWLINE, NL):
 202                 self.prev_row += 1
 203                 self.prev_col = 0
 204         return "".join(self.tokens)
 205
 206     def compat(self, token, iterable):
 207         startline = False
 208         indents = []
 209         toks_append = self.tokens.append
 210         toknum, tokval = token
 211         if toknum in (NAME, NUMBER):
 212             tokval += ' '
 213         if toknum in (NEWLINE, NL):
 214             startline = True
 215         prevstring = False
 216         for tok in iterable:
 217             toknum, tokval = tok[:2]
 218
 219             if toknum in (NAME, NUMBER):
 220                 tokval += ' '
 221
 222             # Insert a space between two consecutive strings
 223             if toknum == STRING:
 224                 if prevstring:
 225                     tokval = ' ' + tokval
 226                 prevstring = True
 227             else:
 228                 prevstring = False
 229
 230             if toknum == INDENT:
 231                 indents.append(tokval)
 232                 continue
 233             elif toknum == DEDENT:
 234                 indents.pop()
 235                 continue
 236             elif toknum in (NEWLINE, NL):
 237                 startline = True
 238             elif startline and indents:
 239                 toks_append(indents[-1])
 240                 startline = False
 241             toks_append(tokval)
 242
 243 def untokenize(iterable):
 244     """Transform tokens back into Python source code.
 245
 246     Each element returned by the iterable must be a token sequence
 247     with at least two elements, a token number and token value.  If
 248     only two tokens are passed, the resulting output is poor.
 249
 250     Round-trip invariant for full input:
 251         Untokenized source will match input source exactly
 252
 253     Round-trip invariant for limited intput:
 254         # Output text will tokenize the back to the input
 255         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 256         newcode = untokenize(t1)
 257         readline = iter(newcode.splitlines(1)).next
 258         t2 = [tok[:2] for tok in generate_tokens(readline)]
 259         assert t1 == t2
 260     """
 261     ut = Untokenizer()
 262     return ut.untokenize(iterable)
 263
 264 def generate_tokens(readline):
 265     """
 266     The generate_tokens() generator requires one argment, readline, which
 267     must be a callable object which provides the same interface as the
 268     readline() method of built-in file objects. Each call to the function
 269     should return one line of input as a string.  Alternately, readline
 270     can be a callable function terminating with StopIteration:
 271         readline = open(myfile).next    # Example of alternate readline
 272
 273     The generator produces 5-tuples with these members: the token type; the
 274     token string; a 2-tuple (srow, scol) of ints specifying the row and
 275     column where the token begins in the source; a 2-tuple (erow, ecol) of
 276     ints specifying the row and column where the token ends in the source;
 277     and the line on which the token was found. The line passed is the
 278     logical line; continuation lines are included.
 279     """
 280     lnum = parenlev = continued = 0
 281     namechars, numchars = string.ascii_letters + '_', '0123456789'
 282     contstr, needcont = '', 0
 283     contline = None
 284     indents = [0]
 285
 286     while 1:                                   # loop over lines in stream
 287         try:
 288             line = readline()
 289         except StopIteration:
 290             line = ''
 291         lnum += 1
 292         pos, max = 0, len(line)
 293
 294         if contstr:                            # continued string
 295             if not line:
 296                 raise TokenError, ("EOF in multi-line string", strstart)
 297             endmatch = endprog.match(line)
 298             if endmatch:
 299                 pos = end = endmatch.end(0)
 300                 yield (STRING, contstr + line[:end],
 301                        strstart, (lnum, end), contline + line)
 302                 contstr, needcont = '', 0
 303                 contline = None
 304             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
 305                 yield (ERRORTOKEN, contstr + line,
 306                            strstart, (lnum, len(line)), contline)
 307                 contstr = ''
 308                 contline = None
 309                 continue
 310             else:
 311                 contstr = contstr + line
 312                 contline = contline + line
 313                 continue
 314
 315         elif parenlev == 0 and not continued:  # new statement
 316             if not line: break
 317             column = 0
 318             while pos < max:                   # measure leading whitespace
 319                 if line[pos] == ' ':
 320                     column += 1
 321                 elif line[pos] == '\t':
 322                     column = (column//tabsize + 1)*tabsize
 323                 elif line[pos] == '\f':
 324                     column = 0
 325                 else:
 326                     break
 327                 pos += 1
 328             if pos == max:
 329                 break
 330
 331             if line[pos] in '#\r\n':           # skip comments or blank lines
 332                 if line[pos] == '#':
 333                     comment_token = line[pos:].rstrip('\r\n')
 334                     nl_pos = pos + len(comment_token)
 335                     yield (COMMENT, comment_token,
 336                            (lnum, pos), (lnum, pos + len(comment_token)), line)
 337                     yield (NL, line[nl_pos:],
 338                            (lnum, nl_pos), (lnum, len(line)), line)
 339                 else:
 340                     yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
 341                            (lnum, pos), (lnum, len(line)), line)
 342                 continue
 343
 344             if column > indents[-1]:           # count indents or dedents
 345                 indents.append(column)
 346                 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
 347             while column < indents[-1]:
 348                 if column not in indents:
 349                     raise IndentationError(
 350                         "unindent does not match any outer indentation level",
 351                         ("<tokenize>", lnum, pos, line))
 352                 indents = indents[:-1]
 353                 yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
 354
 355         else:                                  # continued statement
 356             if not line:
 357                 raise TokenError, ("EOF in multi-line statement", (lnum, 0))
 358             continued = 0
 359
 360         while pos < max:
 361             pseudomatch = pseudoprog.match(line, pos)
 362             if pseudomatch:                                # scan for tokens
 363                 start, end = pseudomatch.span(1)
 364                 spos, epos, pos = (lnum, start), (lnum, end), end
 365                 token, initial = line[start:end], line[start]
 366
 367                 if initial in numchars or \
 368                    (initial == '.' and token != '.'):      # ordinary number
 369                     yield (NUMBER, token, spos, epos, line)
 370                 elif initial in '\r\n':
 371                     yield (NL if parenlev > 0 else NEWLINE,
 372                            token, spos, epos, line)
 373                 elif initial == '#':
 374                     assert not token.endswith("\n")
 375                     yield (COMMENT, token, spos, epos, line)
 376                 elif token in triple_quoted:
 377                     endprog = endprogs[token]
 378                     endmatch = endprog.match(line, pos)
 379                     if endmatch:                           # all on one line
 380                         pos = endmatch.end(0)
 381                         token = line[start:pos]
 382                         yield (STRING, token, spos, (lnum, pos), line)
 383                     else:
 384                         strstart = (lnum, start)           # multiple lines
 385                         contstr = line[start:]
 386                         contline = line
 387                         break
 388                 elif initial in single_quoted or \
 389                     token[:2] in single_quoted or \
 390                     token[:3] in single_quoted:
 391                     if token[-1] == '\n':                  # continued string
 392                         strstart = (lnum, start)
 393                         endprog = (endprogs[initial] or endprogs[token[1]] or
 394                                    endprogs[token[2]])
 395                         contstr, needcont = line[start:], 1
 396                         contline = line
 397                         break
 398                     else:                                  # ordinary string
 399                         yield (STRING, token, spos, epos, line)
 400                 elif initial in namechars:                 # ordinary name
 401                     yield (NAME, token, spos, epos, line)
 402                 elif initial == '\\':                      # continued stmt
 403                     continued = 1
 404                 else:
 405                     if initial in '([{':
 406                         parenlev += 1
 407                     elif initial in ')]}':
 408                         parenlev -= 1
 409                     yield (OP, token, spos, epos, line)
 410             else:
 411                 yield (ERRORTOKEN, line[pos],
 412                            (lnum, pos), (lnum, pos+1), line)
 413                 pos += 1
 414
 415     for indent in indents[1:]:                 # pop remaining indent levels
 416         yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
 417     yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 418
 419 if __name__ == '__main__':                     # testing
 420     import sys
 421     if len(sys.argv) > 1:
 422         tokenize(open(sys.argv[1]).readline)
 423     else:
 424         tokenize(sys.stdin.readline)