Lib/tokenize.py

   1 """Tokenization help for Python programs.
   2
   3 tokenize(readline) is a generator that breaks a stream of
   4 bytes into Python tokens. It decodes the bytes according to
   5 PEP-0263 for determining source file encoding.
   6
   7 It accepts a readline-like method which is called
   8 repeatedly to get the next line of input (or b"" for EOF).  It generates
   9 5-tuples with these members:
  10
  11     the token type (see token.py)
  12     the token (a string)
  13     the starting (row, column) indices of the token (a 2-tuple of ints)
  14     the ending (row, column) indices of the token (a 2-tuple of ints)
  15     the original line (string)
  16
  17 It is designed to match the working of the Python tokenizer exactly, except
  18 that it produces COMMENT tokens for comments and gives type OP for all
  19 operators. Aditionally, all token lists start with an ENCODING token
  20 which tells you which encoding was used to decode the bytes stream."""
  21
  22 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
  23 __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
  24                'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
  25                'Michael Foord')
  26 import re, string, sys
  27 from token import *
  28 from codecs import lookup, BOM_UTF8
  29 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
  30
  31 import token
  32 __all__ = [x for x in dir(token) if not x.startswith("_")]
  33 __all__.extend(["COMMENT", "tokenize", "detect_encoding", "NL", "untokenize",
  34                 "ENCODING", "TokenInfo"])
  35 del token
  36
  37 COMMENT = N_TOKENS
  38 tok_name[COMMENT] = 'COMMENT'
  39 NL = N_TOKENS + 1
  40 tok_name[NL] = 'NL'
  41 ENCODING = N_TOKENS + 2
  42 tok_name[ENCODING] = 'ENCODING'
  43 N_TOKENS += 3
  44
  45 class TokenInfo(tuple):
  46     'TokenInfo(type, string, start, end, line)'
  47
  48     __slots__ = ()
  49
  50     _fields = ('type', 'string', 'start', 'end', 'line')
  51
  52     def __new__(cls, type, string, start, end, line):
  53         return tuple.__new__(cls, (type, string, start, end, line))
  54
  55     @classmethod
  56     def _make(cls, iterable, new=tuple.__new__, len=len):
  57         'Make a new TokenInfo object from a sequence or iterable'
  58         result = new(cls, iterable)
  59         if len(result) != 5:
  60             raise TypeError('Expected 5 arguments, got %d' % len(result))
  61         return result
  62
  63     def __repr__(self):
  64         return 'TokenInfo(type=%r, string=%r, start=%r, end=%r, line=%r)' % self
  65
  66     def _asdict(self):
  67         'Return a new dict which maps field names to their values'
  68         return dict(zip(self._fields, self))
  69
  70     def _replace(self, **kwds):
  71         'Return a new TokenInfo object replacing specified fields with new values'
  72         result = self._make(map(kwds.pop, ('type', 'string', 'start', 'end', 'line'), self))
  73         if kwds:
  74             raise ValueError('Got unexpected field names: %r' % kwds.keys())
  75         return result
  76
  77     def __getnewargs__(self):
  78         return tuple(self)
  79
  80     type = property(lambda t: t[0])
  81     string = property(lambda t: t[1])
  82     start = property(lambda t: t[2])
  83     end = property(lambda t: t[3])
  84     line = property(lambda t: t[4])
  85
  86 def group(*choices): return '(' + '|'.join(choices) + ')'
  87 def any(*choices): return group(*choices) + '*'
  88 def maybe(*choices): return group(*choices) + '?'
  89
  90 # Note: we use unicode matching for names ("\w") but ascii matching for
  91 # number literals.
  92 Whitespace = r'[ \f\t]*'
  93 Comment = r'#[^\r\n]*'
  94 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
  95 Name = r'\w+'
  96
  97 Hexnumber = r'0[xX][0-9a-fA-F]+'
  98 Binnumber = r'0[bB][01]+'
  99 Octnumber = r'0[oO][0-7]+'
 100 Decnumber = r'(?:0+|[1-9][0-9]*)'
 101 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
 102 Exponent = r'[eE][-+]?[0-9]+'
 103 Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
 104 Expfloat = r'[0-9]+' + Exponent
 105 Floatnumber = group(Pointfloat, Expfloat)
 106 Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
 107 Number = group(Imagnumber, Floatnumber, Intnumber)
 108
 109 # Tail end of ' string.
 110 Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
 111 # Tail end of " string.
 112 Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
 113 # Tail end of ''' string.
 114 Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
 115 # Tail end of """ string.
 116 Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
 117 Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
 118 # Single-line ' or " string.
 119 String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
 120                r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
 121
 122 # Because of leftmost-then-longest match semantics, be sure to put the
 123 # longest operators first (e.g., if = came before ==, == would get
 124 # recognized as two instances of =).
 125 Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
 126                  r"//=?", r"->",
 127                  r"[+\-*/%&|^=<>]=?",
 128                  r"~")
 129
 130 Bracket = '[][(){}]'
 131 Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
 132 Funny = group(Operator, Bracket, Special)
 133
 134 PlainToken = group(Number, Funny, String, Name)
 135 Token = Ignore + PlainToken
 136
 137 # First (or only) line of ' or " string.
 138 ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
 139                 group("'", r'\\\r?\n'),
 140                 r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
 141                 group('"', r'\\\r?\n'))
 142 PseudoExtras = group(r'\\\r?\n', Comment, Triple)
 143 PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
 144
 145 def _compile(expr):
 146     return re.compile(expr, re.UNICODE)
 147
 148 tokenprog, pseudoprog, single3prog, double3prog = map(
 149     _compile, (Token, PseudoToken, Single3, Double3))
 150 endprogs = {"'": _compile(Single), '"': _compile(Double),
 151             "'''": single3prog, '"""': double3prog,
 152             "r'''": single3prog, 'r"""': double3prog,
 153             "b'''": single3prog, 'b"""': double3prog,
 154             "br'''": single3prog, 'br"""': double3prog,
 155             "R'''": single3prog, 'R"""': double3prog,
 156             "B'''": single3prog, 'B"""': double3prog,
 157             "bR'''": single3prog, 'bR"""': double3prog,
 158             "Br'''": single3prog, 'Br"""': double3prog,
 159             "BR'''": single3prog, 'BR"""': double3prog,
 160             'r': None, 'R': None, 'b': None, 'B': None}
 161
 162 triple_quoted = {}
 163 for t in ("'''", '"""',
 164           "r'''", 'r"""', "R'''", 'R"""',
 165           "b'''", 'b"""', "B'''", 'B"""',
 166           "br'''", 'br"""', "Br'''", 'Br"""',
 167           "bR'''", 'bR"""', "BR'''", 'BR"""'):
 168     triple_quoted[t] = t
 169 single_quoted = {}
 170 for t in ("'", '"',
 171           "r'", 'r"', "R'", 'R"',
 172           "b'", 'b"', "B'", 'B"',
 173           "br'", 'br"', "Br'", 'Br"',
 174           "bR'", 'bR"', "BR'", 'BR"' ):
 175     single_quoted[t] = t
 176
 177 del _compile
 178
 179 tabsize = 8
 180
 181 class TokenError(Exception): pass
 182
 183 class StopTokenizing(Exception): pass
 184
 185
 186 class Untokenizer:
 187
 188     def __init__(self):
 189         self.tokens = []
 190         self.prev_row = 1
 191         self.prev_col = 0
 192         self.encoding = None
 193
 194     def add_whitespace(self, start):
 195         row, col = start
 196         assert row <= self.prev_row
 197         col_offset = col - self.prev_col
 198         if col_offset:
 199             self.tokens.append(" " * col_offset)
 200
 201     def untokenize(self, iterable):
 202         for t in iterable:
 203             if len(t) == 2:
 204                 self.compat(t, iterable)
 205                 break
 206             tok_type, token, start, end, line = t
 207             if tok_type == ENCODING:
 208                 self.encoding = token
 209                 continue
 210             self.add_whitespace(start)
 211             self.tokens.append(token)
 212             self.prev_row, self.prev_col = end
 213             if tok_type in (NEWLINE, NL):
 214                 self.prev_row += 1
 215                 self.prev_col = 0
 216         return "".join(self.tokens)
 217
 218     def compat(self, token, iterable):
 219         startline = False
 220         indents = []
 221         toks_append = self.tokens.append
 222         toknum, tokval = token
 223
 224         if toknum in (NAME, NUMBER):
 225             tokval += ' '
 226         if toknum in (NEWLINE, NL):
 227             startline = True
 228         prevstring = False
 229         for tok in iterable:
 230             toknum, tokval = tok[:2]
 231             if toknum == ENCODING:
 232                 self.encoding = tokval
 233                 continue
 234
 235             if toknum in (NAME, NUMBER):
 236                 tokval += ' '
 237
 238             # Insert a space between two consecutive strings
 239             if toknum == STRING:
 240                 if prevstring:
 241                     tokval = ' ' + tokval
 242                 prevstring = True
 243             else:
 244                 prevstring = False
 245
 246             if toknum == INDENT:
 247                 indents.append(tokval)
 248                 continue
 249             elif toknum == DEDENT:
 250                 indents.pop()
 251                 continue
 252             elif toknum in (NEWLINE, NL):
 253                 startline = True
 254             elif startline and indents:
 255                 toks_append(indents[-1])
 256                 startline = False
 257             toks_append(tokval)
 258
 259
 260 def untokenize(iterable):
 261     """Transform tokens back into Python source code.
 262     It returns a bytes object, encoded using the ENCODING
 263     token, which is the first token sequence output by tokenize.
 264
 265     Each element returned by the iterable must be a token sequence
 266     with at least two elements, a token number and token value.  If
 267     only two tokens are passed, the resulting output is poor.
 268
 269     Round-trip invariant for full input:
 270         Untokenized source will match input source exactly
 271
 272     Round-trip invariant for limited intput:
 273         # Output bytes will tokenize the back to the input
 274         t1 = [tok[:2] for tok in tokenize(f.readline)]
 275         newcode = untokenize(t1)
 276         readline = BytesIO(newcode).readline
 277         t2 = [tok[:2] for tok in tokenize(readline)]
 278         assert t1 == t2
 279     """
 280     ut = Untokenizer()
 281     out = ut.untokenize(iterable)
 282     if ut.encoding is not None:
 283         out = out.encode(ut.encoding)
 284     return out
 285
 286
 287 def _get_normal_name(orig_enc):
 288     """Imitates get_normal_name in tokenizer.c."""
 289     # Only care about the first 12 characters.
 290     enc = orig_enc[:12].lower().replace("_", "-")
 291     if enc == "utf-8" or enc.startswith("utf-8-"):
 292         return "utf-8"
 293     if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
 294        enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
 295         return "iso-8859-1"
 296     return orig_enc
 297
 298 def detect_encoding(readline):
 299     """
 300     The detect_encoding() function is used to detect the encoding that should
 301     be used to decode a Python source file. It requires one argment, readline,
 302     in the same way as the tokenize() generator.
 303
 304     It will call readline a maximum of twice, and return the encoding used
 305     (as a string) and a list of any lines (left as bytes) it has read
 306     in.
 307
 308     It detects the encoding from the presence of a utf-8 bom or an encoding
 309     cookie as specified in pep-0263. If both a bom and a cookie are present,
 310     but disagree, a SyntaxError will be raised. If the encoding cookie is an
 311     invalid charset, raise a SyntaxError.
 312
 313     If no encoding is specified, then the default of 'utf-8' will be returned.
 314     """
 315     bom_found = False
 316     encoding = None
 317     def read_or_stop():
 318         try:
 319             return readline()
 320         except StopIteration:
 321             return b''
 322
 323     def find_cookie(line):
 324         try:
 325             line_string = line.decode('ascii')
 326         except UnicodeDecodeError:
 327             return None
 328
 329         matches = cookie_re.findall(line_string)
 330         if not matches:
 331             return None
 332         encoding = _get_normal_name(matches[0])
 333         try:
 334             codec = lookup(encoding)
 335         except LookupError:
 336             # This behaviour mimics the Python interpreter
 337             raise SyntaxError("unknown encoding: " + encoding)
 338
 339         if bom_found and codec.name != 'utf-8':
 340             # This behaviour mimics the Python interpreter
 341             raise SyntaxError('encoding problem: utf-8')
 342         return encoding
 343
 344     first = read_or_stop()
 345     if first.startswith(BOM_UTF8):
 346         bom_found = True
 347         first = first[3:]
 348     if not first:
 349         return 'utf-8', []
 350
 351     encoding = find_cookie(first)
 352     if encoding:
 353         return encoding, [first]
 354
 355     second = read_or_stop()
 356     if not second:
 357         return 'utf-8', [first]
 358
 359     encoding = find_cookie(second)
 360     if encoding:
 361         return encoding, [first, second]
 362
 363     return 'utf-8', [first, second]
 364
 365
 366 def tokenize(readline):
 367     """
 368     The tokenize() generator requires one argment, readline, which
 369     must be a callable object which provides the same interface as the
 370     readline() method of built-in file objects. Each call to the function
 371     should return one line of input as bytes.  Alternately, readline
 372     can be a callable function terminating with StopIteration:
 373         readline = open(myfile, 'rb').__next__  # Example of alternate readline
 374
 375     The generator produces 5-tuples with these members: the token type; the
 376     token string; a 2-tuple (srow, scol) of ints specifying the row and
 377     column where the token begins in the source; a 2-tuple (erow, ecol) of
 378     ints specifying the row and column where the token ends in the source;
 379     and the line on which the token was found. The line passed is the
 380     logical line; continuation lines are included.
 381
 382     The first token sequence will always be an ENCODING token
 383     which tells you which encoding was used to decode the bytes stream.
 384     """
 385     encoding, consumed = detect_encoding(readline)
 386     def readline_generator(consumed):
 387         for line in consumed:
 388             yield line
 389         while True:
 390             try:
 391                 yield readline()
 392             except StopIteration:
 393                 return
 394     chained = readline_generator(consumed)
 395     return _tokenize(chained.__next__, encoding)
 396
 397
 398 def _tokenize(readline, encoding):
 399     lnum = parenlev = continued = 0
 400     numchars = '0123456789'
 401     contstr, needcont = '', 0
 402     contline = None
 403     indents = [0]
 404
 405     if encoding is not None:
 406         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
 407     while True:             # loop over lines in stream
 408         try:
 409             line = readline()
 410         except StopIteration:
 411             line = b''
 412
 413         if encoding is not None:
 414             line = line.decode(encoding)
 415         lnum += 1
 416         pos, max = 0, len(line)
 417
 418         if contstr:                            # continued string
 419             if not line:
 420                 raise TokenError("EOF in multi-line string", strstart)
 421             endmatch = endprog.match(line)
 422             if endmatch:
 423                 pos = end = endmatch.end(0)
 424                 yield TokenInfo(STRING, contstr + line[:end],
 425                        strstart, (lnum, end), contline + line)
 426                 contstr, needcont = '', 0
 427                 contline = None
 428             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
 429                 yield TokenInfo(ERRORTOKEN, contstr + line,
 430                            strstart, (lnum, len(line)), contline)
 431                 contstr = ''
 432                 contline = None
 433                 continue
 434             else:
 435                 contstr = contstr + line
 436                 contline = contline + line
 437                 continue
 438
 439         elif parenlev == 0 and not continued:  # new statement
 440             if not line: break
 441             column = 0
 442             while pos < max:                   # measure leading whitespace
 443                 if line[pos] == ' ':
 444                     column += 1
 445                 elif line[pos] == '\t':
 446                     column = (column//tabsize + 1)*tabsize
 447                 elif line[pos] == '\f':
 448                     column = 0
 449                 else:
 450                     break
 451                 pos += 1
 452             if pos == max:
 453                 break
 454
 455             if line[pos] in '#\r\n':           # skip comments or blank lines
 456                 if line[pos] == '#':
 457                     comment_token = line[pos:].rstrip('\r\n')
 458                     nl_pos = pos + len(comment_token)
 459                     yield TokenInfo(COMMENT, comment_token,
 460                            (lnum, pos), (lnum, pos + len(comment_token)), line)
 461                     yield TokenInfo(NL, line[nl_pos:],
 462                            (lnum, nl_pos), (lnum, len(line)), line)
 463                 else:
 464                     yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
 465                            (lnum, pos), (lnum, len(line)), line)
 466                 continue
 467
 468             if column > indents[-1]:           # count indents or dedents
 469                 indents.append(column)
 470                 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
 471             while column < indents[-1]:
 472                 if column not in indents:
 473                     raise IndentationError(
 474                         "unindent does not match any outer indentation level",
 475                         ("<tokenize>", lnum, pos, line))
 476                 indents = indents[:-1]
 477                 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
 478
 479         else:                                  # continued statement
 480             if not line:
 481                 raise TokenError("EOF in multi-line statement", (lnum, 0))
 482             continued = 0
 483
 484         while pos < max:
 485             pseudomatch = pseudoprog.match(line, pos)
 486             if pseudomatch:                                # scan for tokens
 487                 start, end = pseudomatch.span(1)
 488                 spos, epos, pos = (lnum, start), (lnum, end), end
 489                 token, initial = line[start:end], line[start]
 490
 491                 if (initial in numchars or                  # ordinary number
 492                     (initial == '.' and token != '.' and token != '...')):
 493                     yield TokenInfo(NUMBER, token, spos, epos, line)
 494                 elif initial in '\r\n':
 495                     yield TokenInfo(NL if parenlev > 0 else NEWLINE,
 496                            token, spos, epos, line)
 497                 elif initial == '#':
 498                     assert not token.endswith("\n")
 499                     yield TokenInfo(COMMENT, token, spos, epos, line)
 500                 elif token in triple_quoted:
 501                     endprog = endprogs[token]
 502                     endmatch = endprog.match(line, pos)
 503                     if endmatch:                           # all on one line
 504                         pos = endmatch.end(0)
 505                         token = line[start:pos]
 506                         yield TokenInfo(STRING, token, spos, (lnum, pos), line)
 507                     else:
 508                         strstart = (lnum, start)           # multiple lines
 509                         contstr = line[start:]
 510                         contline = line
 511                         break
 512                 elif initial in single_quoted or \
 513                     token[:2] in single_quoted or \
 514                     token[:3] in single_quoted:
 515                     if token[-1] == '\n':                  # continued string
 516                         strstart = (lnum, start)
 517                         endprog = (endprogs[initial] or endprogs[token[1]] or
 518                                    endprogs[token[2]])
 519                         contstr, needcont = line[start:], 1
 520                         contline = line
 521                         break
 522                     else:                                  # ordinary string
 523                         yield TokenInfo(STRING, token, spos, epos, line)
 524                 elif initial.isidentifier():               # ordinary name
 525                     yield TokenInfo(NAME, token, spos, epos, line)
 526                 elif initial == '\\':                      # continued stmt
 527                     continued = 1
 528                 else:
 529                     if initial in '([{':
 530                         parenlev += 1
 531                     elif initial in ')]}':
 532                         parenlev -= 1
 533                     yield TokenInfo(OP, token, spos, epos, line)
 534             else:
 535                 yield TokenInfo(ERRORTOKEN, line[pos],
 536                            (lnum, pos), (lnum, pos+1), line)
 537                 pos += 1
 538
 539     for indent in indents[1:]:                 # pop remaining indent levels
 540         yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
 541     yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
 542
 543
 544 # An undocumented, backwards compatible, API for all the places in the standard
 545 # library that expect to be able to use tokenize with strings
 546 def generate_tokens(readline):
 547     return _tokenize(readline, None)