Lib/tokenize.py

   1 """Tokenization help for Python programs.
   2
   3 generate_tokens(readline) is a generator that breaks a stream of
   4 text into Python tokens.  It accepts a readline-like method which is called
   5 repeatedly to get the next line of input (or "" for EOF).  It generates
   6 5-tuples with these members:
   7
   8     the token type (see token.py)
   9     the token (a string)
  10     the starting (row, column) indices of the token (a 2-tuple of ints)
  11     the ending (row, column) indices of the token (a 2-tuple of ints)
  12     the original line (string)
  13
  14 It is designed to match the working of the Python tokenizer exactly, except
  15 that it produces COMMENT tokens for comments and gives type OP for all
  16 operators
  17
  18 Older entry points
  19     tokenize_loop(readline, tokeneater)
  20     tokenize(readline, tokeneater=printtoken)
  21 are the same, except instead of generating tokens, tokeneater is a callback
  22 function to which the 5 fields described above are passed as 5 arguments,
  23 each time a new token is found."""
  24
# Module metadata: the original author and later contributors.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'
  28
  29 import string, re
  30 from token import *
  31
  32 import token
# Public API: every public name of the token module (those names are pulled
# in by the star import above) plus the names defined by this module.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
# Remove the temporaries from the module namespace.  In Python 2 the loop
# variable of a list comprehension is still bound after the comprehension,
# which is why x can (and must) be deleted here.
del x
del token

# Register two extra token types after the ones imported from token.py:
# COMMENT, and NL (yielded by generate_tokens() for blank or comment-only
# lines and for line breaks inside brackets).  They take the next two free
# type numbers, so N_TOKENS is bumped by two afterwards.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
  43
  44 def group(*choices): return '(' + '|'.join(choices) + ')'
  45 def any(*choices): return group(*choices) + '*'
  46 def maybe(*choices): return group(*choices) + '?'
  47
# Building blocks shared by the token patterns below.
# A (possibly empty) run of spaces, form feeds and tabs.
Whitespace = r'[ \f\t]*'
# A comment: '#' up to, but not including, the line break.
Comment = r'#[^\r\n]*'
# Material that may precede a token: whitespace, any number of
# backslash-newline continuations, and an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# An identifier or keyword.
Name = r'[a-zA-Z_]\w*'
  52
  53 Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
  54 Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
  55 Binnumber = r'0[bB][01]+[lL]?'
  56 Decnumber = r'[1-9]\d*[lL]?'
  57 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
  58 Exponent = r'[eE][-+]?\d+'
  59 Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
  60 Expfloat = r'\d+' + Exponent
  61 Floatnumber = group(Pointfloat, Expfloat)
  62 Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
  63 Number = group(Imagnumber, Floatnumber, Intnumber)
  64
  65 # Tail end of ' string.
  66 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
  67 # Tail end of " string.
  68 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
  69 # Tail end of ''' string.
  70 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
  71 # Tail end of """ string.
  72 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
  73 Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
  74 # Single-line ' or " string.
  75 String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
  76                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
  77
  78 # Because of leftmost-then-longest match semantics, be sure to put the
  79 # longest operators first (e.g., if = came before ==, == would get
  80 # recognized as two instances of =).
  81 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
  82                  r"//=?",
  83                  r"[+\-*/%&|^=<>]=?",
  84                  r"~")
  85
  86 Bracket = '[][(){}]'
  87 Special = group(r'\r?\n', r'[:;.,`@]')
  88 Funny = group(Operator, Bracket, Special)
  89
  90 PlainToken = group(Number, Funny, String, Name)
  91 Token = Ignore + PlainToken
  92
  93 # First (or only) line of ' or " string.
  94 ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
  95                 group("'", r'\\\r?\n'),
  96                 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
  97                 group('"', r'\\\r?\n'))
  98 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
  99 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 100
 101 tokenprog, pseudoprog, single3prog, double3prog = map(
 102     re.compile, (Token, PseudoToken, Single3, Double3))
 103 endprogs = {"'": re.compile(Single), '"': re.compile(Double),
 104             "'''": single3prog, '"""': double3prog,
 105             "r'''": single3prog, 'r"""': double3prog,
 106             "u'''": single3prog, 'u"""': double3prog,
 107             "ur'''": single3prog, 'ur"""': double3prog,
 108             "R'''": single3prog, 'R"""': double3prog,
 109             "U'''": single3prog, 'U"""': double3prog,
 110             "uR'''": single3prog, 'uR"""': double3prog,
 111             "Ur'''": single3prog, 'Ur"""': double3prog,
 112             "UR'''": single3prog, 'UR"""': double3prog,
 113             "b'''": single3prog, 'b"""': double3prog,
 114             "br'''": single3prog, 'br"""': double3prog,
 115             "B'''": single3prog, 'B"""': double3prog,
 116             "bR'''": single3prog, 'bR"""': double3prog,
 117             "Br'''": single3prog, 'Br"""': double3prog,
 118             "BR'''": single3prog, 'BR"""': double3prog,
 119             'r': None, 'R': None, 'u': None, 'U': None,
 120             'b': None, 'B': None}
 121
# Lookup tables of every accepted spelling of a string opener (optional
# prefix plus quote).  generate_tokens() only tests membership; each key
# simply maps to itself.
# Openers of triple-quoted strings.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
# Openers of single-quoted (one quote character) strings.
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t
 142
# Tab stop width used by generate_tokens() when measuring indentation: a tab
# advances the column to the next multiple of tabsize.
tabsize = 8
 144
 145 class TokenError(Exception): pass
 146
 147 class StopTokenizing(Exception): pass
 148
 149 def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
 150     srow, scol = srow_scol
 151     erow, ecol = erow_ecol
 152     print "%d,%d-%d,%d:\t%s\t%s" % \
 153         (srow, scol, erow, ecol, tok_name[type], repr(token))
 154
 155 def tokenize(readline, tokeneater=printtoken):
 156     """
 157     The tokenize() function accepts two parameters: one representing the
 158     input stream, and one providing an output mechanism for tokenize().
 159
 160     The first parameter, readline, must be a callable object which provides
 161     the same interface as the readline() method of built-in file objects.
 162     Each call to the function should return one line of input as a string.
 163
 164     The second parameter, tokeneater, must also be a callable object. It is
 165     called once for each token, with five arguments, corresponding to the
 166     tuples generated by generate_tokens().
 167     """
 168     try:
 169         tokenize_loop(readline, tokeneater)
 170     except StopTokenizing:
 171         pass
 172
 173 # backwards compatible interface
 174 def tokenize_loop(readline, tokeneater):
 175     for token_info in generate_tokens(readline):
 176         tokeneater(*token_info)
 177
 178 class Untokenizer:
 179
 180     def __init__(self):
 181         self.tokens = []
 182         self.prev_row = 1
 183         self.prev_col = 0
 184
 185     def add_whitespace(self, start):
 186         row, col = start
 187         assert row <= self.prev_row
 188         col_offset = col - self.prev_col
 189         if col_offset:
 190             self.tokens.append(" " * col_offset)
 191
 192     def untokenize(self, iterable):
 193         for t in iterable:
 194             if len(t) == 2:
 195                 self.compat(t, iterable)
 196                 break
 197             tok_type, token, start, end, line = t
 198             self.add_whitespace(start)
 199             self.tokens.append(token)
 200             self.prev_row, self.prev_col = end
 201             if tok_type in (NEWLINE, NL):
 202                 self.prev_row += 1
 203                 self.prev_col = 0
 204         return "".join(self.tokens)
 205
 206     def compat(self, token, iterable):
 207         startline = False
 208         indents = []
 209         toks_append = self.tokens.append
 210         toknum, tokval = token
 211         if toknum in (NAME, NUMBER):
 212             tokval += ' '
 213         if toknum in (NEWLINE, NL):
 214             startline = True
 215         prevstring = False
 216         for tok in iterable:
 217             toknum, tokval = tok[:2]
 218
 219             if toknum in (NAME, NUMBER):
 220                 tokval += ' '
 221
 222             # Insert a space between two consecutive strings
 223             if toknum == STRING:
 224                 if prevstring:
 225                     tokval = ' ' + tokval
 226                 prevstring = True
 227             else:
 228                 prevstring = False
 229
 230             if toknum == INDENT:
 231                 indents.append(tokval)
 232                 continue
 233             elif toknum == DEDENT:
 234                 indents.pop()
 235                 continue
 236             elif toknum in (NEWLINE, NL):
 237                 startline = True
 238             elif startline and indents:
 239                 toks_append(indents[-1])
 240                 startline = False
 241             toks_append(tokval)
 242
 243 def untokenize(iterable):
 244     """Transform tokens back into Python source code.
 245
 246     Each element returned by the iterable must be a token sequence
 247     with at least two elements, a token number and token value.  If
 248     only two tokens are passed, the resulting output is poor.
 249
 250     Round-trip invariant for full input:
 251         Untokenized source will match input source exactly
 252
 253     Round-trip invariant for limited intput:
 254         # Output text will tokenize the back to the input
 255         t1 = [tok[:2] for tok in generate_tokens(f.readline)]
 256         newcode = untokenize(t1)
 257         readline = iter(newcode.splitlines(1)).next
 258         t2 = [tok[:2] for tok in generate_tokens(readline)]
 259         assert t1 == t2
 260     """
 261     ut = Untokenizer()
 262     return ut.untokenize(iterable)
 263
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement, and IndentationError if a line dedents to a
    column that matches no enclosing indentation level.
    """
    # lnum:      number of the line being scanned (1 for the first line)
    # parenlev:  nesting depth of open (, [ and { brackets
    # continued: set when the previous line ended with a backslash
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr:  text collected so far of a string that spans several lines
    # needcont: set when that string is single-quoted, so every further
    #           line of it must end with a backslash continuation
    contstr, needcont = '', 0
    # contline: the source lines spanned so far by the unfinished string
    contline = None
    # Stack of the indentation columns of the enclosing blocks.
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            # An exhausted iterator-style readline is treated like EOF.
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            # endprog was chosen when the string was opened; it matches up
            # to and including the closing quote(s).
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was neither closed nor continued
                # with a backslash: report what was collected as an error.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                # Still inside the string: accumulate and read another line.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            # Only whitespace and no line terminator: handled like end of
            # input (leaves the line loop).
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    # Comment-only line: the comment text and the line
                    # break that follows it are reported as two tokens.
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # line[pos] is '\r' or '\n' in this branch, so the
                    # index expression below always selects NL.
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            # One DEDENT per indentation level that this line closes.
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        # Scan the tokens of the rest of the line, starting at pos.
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 spans the token without its leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                # A lone '.' is an operator; '.' followed by digits is a
                # number.
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # A line break inside brackets does not end the
                    # statement, so it is reported as NL, not NEWLINE.
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # Only the opener was matched; look for the closing
                    # quotes on the rest of this line.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                # The opener is the quote itself or a one- or two-letter
                # prefix followed by the quote.
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so the
                        # or-chain skips them and ends at the entry for the
                        # quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # Backslash-newline: no token is produced; the next
                    # line is scanned as part of this statement.
                    continued = 1
                else:
                    # Everything else is an operator; brackets also adjust
                    # the nesting depth.
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # No pattern matches here: report a single character as an
                # error token and resume scanning after it.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 411
 412 if __name__ == '__main__':                     # testing
 413     import sys
 414     if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
 415     else: tokenize(sys.stdin.readline)