Tweak the comments and formatting.
[python.git] / Lib / tokenize.py
blob 0f68b4034bae2952a8135eadaa9006cddcf53df1
1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
16 operators
18 Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
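# Illustrative usage sketch (an editorial addition, not part of the original
# module): both entry points take a readline-style callable, so an in-memory
# string can be tokenized by wrapping it in StringIO:
#
#     from StringIO import StringIO
#     for toktype, tokstring, start, end, line in \
#             generate_tokens(StringIO("x = 1  # note\n").readline):
#         print tok_name[toktype], repr(tokstring), start, end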
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
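# Editorial note (not in the original source): these helpers only assemble
# alternation-based regex fragments, e.g.
#
#     group('a', 'b')  ->  '(a|b)'
#     any('a', 'b')    ->  '(a|b)*'
#     maybe('a', 'b')  ->  '(a|b)?'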
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
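# Editorial examples of literals matched by the classes above: Intnumber
# covers '0x1f', '0o17', '0b101' and '42L'; Floatnumber covers '3.14',
# '.5e-2' and '10e3'; Imagnumber covers '3j' and '3.14e-2J'.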
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}
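# Editorial note: endprogs maps a string's opening quote (with any prefix) to
# the compiled "tail end" pattern that finds its closing quote; for example
# endprogs["r'''"] is single3prog, which matches up to and including the next
# unescaped '''.  The None entries are bare prefix letters, used by
# generate_tokens() to fall through to the quote character that follows them.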
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8
class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
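# Illustrative usage sketch (editorial addition): tokenize() pushes each token
# into a callback instead of yielding it; the print_names callback and the
# StringIO source below are examples only, not part of this module.
#
#     from StringIO import StringIO
#     def print_names(toktype, tokstring, start, end, line):
#         if toktype == NAME:
#             print tokstring
#     tokenize(StringIO("spam = eggs + 1\n").readline, print_names)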
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
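# Round-trip sketch (editorial addition, mirroring the "limited input"
# invariant in the docstring above): only (type, string) pairs are kept, so
# the regenerated source is equivalent but not byte-identical.
#
#     from StringIO import StringIO
#     source = "def f(a, b):\n    return a + b\n"
#     pairs = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
#     newcode = untokenize(pairs)
#     readline = iter(newcode.splitlines(1)).next
#     assert pairs == [tok[:2] for tok in generate_tokens(readline)]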
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
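# Illustrative usage sketch (editorial addition): because comments are
# reported as COMMENT tokens rather than skipped, extracting every comment
# from a source string is a one-line filter; the StringIO source here is an
# example only.
#
#     from StringIO import StringIO
#     src = "x = 1  # set x\n# trailing note\n"
#     comments = [tokstring for toktype, tokstring, _, _, _ in
#                 generate_tokens(StringIO(src).readline)
#                 if toktype == COMMENT]
#     # comments == ['# set x', '# trailing note']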
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)