Reindent code
[pytest.git] / Lib / tokenize.py
bloba30791c2cdd7b24d33d206c5ebfb25946e629f73
1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
16 operators
18 Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
25 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
26 __credits__ = \
27 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
29 import string, re
30 from token import *
32 import token
# Public API: every public name of the ``token`` module plus the names this
# module adds.  filter() is used rather than a list comprehension so that no
# loop variable is left behind in the module namespace (which the original
# had to clean up with an explicit ``del``).
__all__ = list(filter(lambda name: name[0] != '_', dir(token))) + [
    "COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del token
# Register two extra token types, COMMENT and NL, on the next two free
# numbers after the types imported from token.py, give them printable names
# in tok_name, and bump N_TOKENS to account for them.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
def group(*choices):
    """Return a regex that matches any one of *choices*, wrapped in parentheses."""
    return '(' + '|'.join(choices) + ')'


def any(*choices):
    """Return a regex matching zero or more repetitions of group(*choices).

    NOTE: the name shadows the ``any`` builtin inside this module; it is kept
    because the pattern definitions in this file call it by this name.
    """
    return group(*choices) + '*'


def maybe(*choices):
    """Return a regex matching group(*choices) zero or one time."""
    return group(*choices) + '?'
# Regular-expression source strings for the individual token classes.  They
# are combined with group()/any()/maybe() into the Token and PseudoToken
# patterns at the end of this section.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# Compile the patterns once at import time.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

# Maps the opening of a string literal to the compiled regex that matches
# the remainder of that string up to and including its closing quote(s).
# The bare prefix letters map to None so that generate_tokens() can chain
# ``endprogs[initial] or endprogs[token[1]] or endprogs[token[2]]`` and fall
# through the prefix characters to the actual quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
# Lookup tables of every accepted string opener (optional u/U and r/R
# prefixes followed by the quote).  generate_tokens() only tests membership;
# each key simply maps to itself.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"' ):
    single_quoted[t] = t

# Width of a tab stop used by generate_tokens() when measuring indentation.
tabsize = 8
class TokenError(Exception):
    """Raised by generate_tokens() when the input ends inside a multi-line
    string or a multi-line statement.

    The arguments are the message and a (row, column) position.
    """
class StopTokenizing(Exception):
    """Caught and silently discarded by tokenize().

    A tokeneater callback can raise it to end tokenization early.
    """
def printtoken(type, token, start, end, line): # for testing
    """Write one token to standard output; the default tokeneater of tokenize().

    Arguments are the five fields produced by generate_tokens():
    the token type, the token string, the (row, column) start position,
    the (row, column) end position, and the source line (unused here).

    The output line has the form ``srow,scol-erow,ecol:<TAB>TYPE<TAB>repr(token)``.
    """
    # Local import: this module only imports sys inside its __main__ section.
    import sys
    # The positions are unpacked here instead of in the signature because
    # tuple parameters are not valid syntax on newer Python versions.
    srow, scol = start
    erow, ecol = end
    sys.stdout.write("%d,%d-%d,%d:\t%s\t%s\n" %
                     (srow, scol, erow, ecol, tok_name[type], repr(token)))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().

    If StopTokenizing is raised while tokens are being processed, it is
    caught here and tokenize() returns normally.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater once per token produced by generate_tokens(readline).

    Each 5-tuple is unpacked into five positional arguments.  Unlike
    tokenize(), exceptions (including StopTokenizing) are not caught here.
    """
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.

    Only the token type and value are used, so the spacing of the result
    generally differs from the original source: a space is added after
    every NAME and NUMBER, and between two consecutive STRING tokens.

    Round-trip invariant:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """

    startline = False       # True when the next token starts a new line
    prevstring = False      # True when the previous token was a STRING
    indents = []            # stack of the currently active indent strings
    toks = []
    toks_append = toks.append
    for tok in iterable:
        toknum, tokval = tok[:2]

        if toknum in (NAME, NUMBER):
            tokval += ' '

        # Keep two adjacent string literals apart; without the space,
        # e.g. "" followed by "" would fuse into the opener of a
        # triple-quoted string and break the round trip.
        if toknum == STRING:
            if prevstring:
                tokval = ' ' + tokval
            prevstring = True
        else:
            prevstring = False

        if toknum == INDENT:
            indents.append(tokval)
            continue
        elif toknum == DEDENT:
            indents.pop()
            continue
        elif toknum in (NEWLINE, COMMENT, NL):
            startline = True
        elif startline and indents:
            # First real token of a line: re-emit the current indentation.
            toks_append(indents[-1])
            startline = False
        toks_append(tokval)
    return ''.join(toks)
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a line dedents to a
    column that matches no enclosing indentation level.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr accumulates a string literal that spans several lines;
    # needcont is set when each further line must end in a backslash
    # (single-quoted strings continued with a backslash-newline).
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, linelen = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                # Clear needcont as well: a later multi-line triple-quoted
                # string sets contstr without touching needcont, and a stale
                # flag would make its continuation lines look like errors.
                contstr, needcont = '', 0
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < linelen:               # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # advance to the next tab stop (floor division on ints)
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == linelen: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                # the boolean indexes the pair: False -> NL, True -> COMMENT
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < linelen:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # inside brackets a line break does not end the statement
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # prefix letters map to None in endprogs, so this
                        # falls through to the entry for the quote character
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # no pattern matched here: report one character and move on
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    # Print the tokens of the file named on the command line, or of
    # standard input when no file name is given.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        try:
            tokenize(source.readline)
        finally:
            # close the file even if tokenizing raises
            source.close()
    else:
        tokenize(sys.stdin.readline)