1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
16 operators
18 Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
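
# For illustration, the helpers above build regex fragments like these:
#   group('0', '1')  -> '(0|1)'      alternation
#   any('0', '1')    -> '(0|1)*'     zero or more repetitions
#   maybe('0', '1')  -> '(0|1)?'     optional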

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
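
# Sample literals each sub-pattern accepts (illustrative, Python 2 syntax):
#   Intnumber:   0xFF   0777L   42
#   Floatnumber: 3.14   .5e-3   1e10
#   Imagnumber:  3j     1.5J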

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
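
# A minimal sketch of a custom tokeneater callback (the name collect_names and
# the file 'example.py' are hypothetical; nothing here is used by the module):
#
#     names = []
#     def collect_names(type, token, start, end, line):
#         if type == NAME:
#             names.append(token)
#     tokenize(open('example.py').readline, collect_names)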

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
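
# A minimal round-trip sketch of the full-tuple invariant (StringIO and the
# source text are illustrative assumptions, not part of the module):
#
#     from StringIO import StringIO
#     source = "if x:\n    y = 1\n"
#     tokens = generate_tokens(StringIO(source).readline)
#     assert untokenize(tokens) == source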

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
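
# A short sketch of iterating generate_tokens() over an in-memory string
# (StringIO is assumed only for the example; any readline-compatible callable
# works):
#
#     from StringIO import StringIO
#     for tok_type, tok_string, start, end, line in \
#             generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[tok_type], repr(tok_string), start, end
#
# which reports NAME 'x', OP '=', NUMBER '1', NEWLINE '\n' and ENDMARKER ''
# together with their (row, col) spans.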

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)