"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token string
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

The older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
# Module metadata.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

# Public API: every attribute of `token` whose name does not start with an
# underscore, plus the names this module adds itself.
# NOTE(review): `token` is bound outside the lines shown here -- presumably
# the standard-library token module; confirm against the import section.
# The comprehension variable is left as `x` on purpose: in Python 2 it leaks
# into module scope, so other module-level code may refer to it by name.
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]

# Give the COMMENT token type a printable name (printtoken() looks token
# names up in tok_name).  `tok_name` and `COMMENT` are defined outside the
# lines shown here.
tok_name[COMMENT] = 'COMMENT'
def group(*choices):
    """Return regex source matching any one of *choices*.

    The alternatives are joined with '|' and wrapped in one capturing group.
    Alternatives are tried left to right, so argument order matters.
    """
    return '(' + '|'.join(choices) + ')'


def any(*choices):
    """Return regex source matching zero or more repetitions of *choices*.

    NOTE: the name shadows the builtin any() inside this module; it is kept
    unchanged so existing references keep working.
    """
    return group(*choices) + '*'


def maybe(*choices):
    """Return regex source matching at most one occurrence of *choices*."""
    return group(*choices) + '?'


# Horizontal whitespace, a comment running to the end of the line, and
# everything that may precede a token (whitespace, backslash-newline
# continuations, an optional comment).
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals; every form may carry a trailing l/L long suffix.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
# Both octal spellings live inside one group so that the optional long
# suffix applies to each of them.  With the suffix attached to only the
# second alternative, '0o17L' was cut short to '0o17'.
Octnumber = r'(0[oO][0-7]+|0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Imaginary and float forms come first so that a plain integer match does
# not consume only the leading digits of a longer literal.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string.  The prefix accepts b/B as well as u/U,
# matching the b-prefixed keys listed in the endprogs table.
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
78 # Because of leftmost-then-longest match semantics, be sure to put the
79 # longest operators first (e.g., if = came before ==, == would get
80 # recognized as two instances of =).
81 Operator
= group(r
"\*\*=?", r
">>=?", r
"<<=?", r
"<>", r
"!=",
# A newline (optionally preceded by a carriage return) or one punctuation
# character.
Special = group(r'\r?\n', r'[:;.,`@]')
# Any operator, bracket or special character.  The alternatives are tried in
# the order given.  NOTE(review): `Bracket` is defined outside the lines
# shown here.
Funny = group(Operator, Bracket, Special)

# One complete token, and the same token preceded by ignorable text
# (whitespace, line continuations, an optional comment).
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string: the body is followed either by the
# closing quote or by a backslash-newline that continues the string.
# The prefix accepts b/B as well as u/U, matching the b-prefixed entries in
# the endprogs table; without it a byte-string prefix was split off and
# matched as a separate name.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things that are not complete tokens on their own: a line continuation, a
# comment, or the opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Pattern used when scanning a line.  Everything after the leading
# whitespace sits in one outer group, so group 1 spans exactly the token
# text.  Alternatives are tried in the order given.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# Compiled forms of the token patterns and of the triple-quoted string tails.
tokenprog = re.compile(Token)
pseudoprog = re.compile(PseudoToken)
single3prog = re.compile(Single3)
double3prog = re.compile(Double3)

# Maps a string opener to the compiled pattern that matches the rest of that
# string.  Plain quotes get their own patterns; every prefixed or unprefixed
# triple quote shares the two triple-quote patterns.
endprogs = {"'": re.compile(Single), '"': re.compile(Double)}
for _prefix in ('', 'r', 'R', 'u', 'U', 'ur', 'uR', 'Ur', 'UR',
                'b', 'B', 'br', 'bR', 'Br', 'BR'):
    endprogs[_prefix + "'''"] = single3prog
    endprogs[_prefix + '"""'] = double3prog
# Bare prefix letters map to None so that a lookup chained with `or` falls
# through to the entry for the quote character that follows the prefix.
for _letter in 'rRuUbB':
    endprogs[_letter] = None
del _prefix, _letter
123 for t
in ("'''", '"""',
124 "r'''", 'r"""', "R'''", 'R"""',
125 "u'''", 'u"""', "U'''", 'U"""',
126 "ur'''", 'ur"""', "Ur'''", 'Ur"""',
127 "uR'''", 'uR"""', "UR'''", 'UR"""',
128 "b'''", 'b"""', "B'''", 'B"""',
129 "br'''", 'br"""', "Br'''", 'Br"""',
130 "bR'''", 'bR"""', "BR'''", 'BR"""'):
134 "r'", 'r"', "R'", 'R"',
135 "u'", 'u"', "U'", 'U"',
136 "ur'", 'ur"', "Ur'", 'Ur"',
137 "uR'", 'uR"', "UR'", 'UR"',
138 "b'", 'b"', "B'", 'B"',
139 "br'", 'br"', "Br'", 'Br"',
140 "bR'", 'bR"', "BR'", 'BR"' ):
class TokenError(Exception):
    """Raised when input ends inside a multi-line string or statement.

    generate_tokens() raises it with a message and a (row, column) position.
    """
class StopTokenizing(Exception):
    """Exception that tokenize() catches around its token loop."""
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    """Print one token as 'srow,scol-erow,ecol:<TAB>NAME<TAB>repr(token)'.

    type      -- token type; its printable name is looked up in tok_name
    token     -- the token string (printed through repr())
    srow_scol -- (row, column) pair where the token starts
    erow_ecol -- (row, column) pair where the token ends
    line      -- the source line; accepted but not printed
    """
    (start_row, start_col), (end_row, end_col) = srow_scol, erow_ecol
    fields = (start_row, start_col, end_row, end_col,
              tok_name[type], repr(token))
    # A single parenthesised expression prints the same text whether print
    # is a statement or a function.
    print("%d,%d-%d,%d:\t%s\t%s" % fields)
155 def tokenize(readline
, tokeneater
=printtoken
):
157 The tokenize() function accepts two parameters: one representing the
158 input stream, and one providing an output mechanism for tokenize().
160 The first parameter, readline, must be a callable object which provides
161 the same interface as the readline() method of built-in file objects.
162 Each call to the function should return one line of input as a string.
164 The second parameter, tokeneater, must also be a callable object. It is
165 called once for each token, with five arguments, corresponding to the
166 tuples generated by generate_tokens().
169 tokenize_loop(readline
, tokeneater
)
170 except StopTokenizing
:
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call *tokeneater* once for every token generated from *readline*.

    Each tuple yielded by generate_tokens(readline) is unpacked into
    positional arguments for the callback.  Exceptions raised by the
    generator or by the callback are not caught here.
    """
    for fields in generate_tokens(readline):
        tokeneater(*fields)
185 def add_whitespace(self
, start
):
187 assert row
<= self
.prev_row
188 col_offset
= col
- self
.prev_col
190 self
.tokens
.append(" " * col_offset
)
192 def untokenize(self
, iterable
):
195 self
.compat(t
, iterable
)
197 tok_type
, token
, start
, end
, line
= t
198 self
.add_whitespace(start
)
199 self
.tokens
.append(token
)
200 self
.prev_row
, self
.prev_col
= end
201 if tok_type
in (NEWLINE
, NL
):
204 return "".join(self
.tokens
)
206 def compat(self
, token
, iterable
):
209 toks_append
= self
.tokens
.append
210 toknum
, tokval
= token
211 if toknum
in (NAME
, NUMBER
):
213 if toknum
in (NEWLINE
, NL
):
217 toknum
, tokval
= tok
[:2]
219 if toknum
in (NAME
, NUMBER
):
222 # Insert a space between two consecutive strings
225 tokval
= ' ' + tokval
231 indents
.append(tokval
)
233 elif toknum
== DEDENT
:
236 elif toknum
in (NEWLINE
, NL
):
238 elif startline
and indents
:
239 toks_append(indents
[-1])
243 def untokenize(iterable
):
244 """Transform tokens back into Python source code.
246 Each element returned by the iterable must be a token sequence
247 with at least two elements, a token number and token value. If
248 only two tokens are passed, the resulting output is poor.
250 Round-trip invariant for full input:
251 Untokenized source will match input source exactly
    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
255 t1 = [tok[:2] for tok in generate_tokens(f.readline)]
256 newcode = untokenize(t1)
257 readline = iter(newcode.splitlines(1)).next
258 t2 = [tok[:2] for tok in generate_tokens(readline)]
262 return ut
.untokenize(iterable
)
264 def generate_tokens(readline
):
    The generate_tokens() generator requires one argument, readline, which
267 must be a callable object which provides the same interface as the
268 readline() method of built-in file objects. Each call to the function
269 should return one line of input as a string. Alternately, readline
270 can be a callable function terminating with StopIteration:
271 readline = open(myfile).next # Example of alternate readline
273 The generator produces 5-tuples with these members: the token type; the
274 token string; a 2-tuple (srow, scol) of ints specifying the row and
275 column where the token begins in the source; a 2-tuple (erow, ecol) of
276 ints specifying the row and column where the token ends in the source;
277 and the line on which the token was found. The line passed is the
278 logical line; continuation lines are included.
280 lnum
= parenlev
= continued
= 0
281 namechars
, numchars
= string
.ascii_letters
+ '_', '0123456789'
282 contstr
, needcont
= '', 0
286 while 1: # loop over lines in stream
289 except StopIteration:
292 pos
, max = 0, len(line
)
294 if contstr
: # continued string
296 raise TokenError
, ("EOF in multi-line string", strstart
)
297 endmatch
= endprog
.match(line
)
299 pos
= end
= endmatch
.end(0)
300 yield (STRING
, contstr
+ line
[:end
],
301 strstart
, (lnum
, end
), contline
+ line
)
302 contstr
, needcont
= '', 0
304 elif needcont
and line
[-2:] != '\\\n' and line
[-3:] != '\\\r\n':
305 yield (ERRORTOKEN
, contstr
+ line
,
306 strstart
, (lnum
, len(line
)), contline
)
311 contstr
= contstr
+ line
312 contline
= contline
+ line
315 elif parenlev
== 0 and not continued
: # new statement
318 while pos
< max: # measure leading whitespace
321 elif line
[pos
] == '\t':
322 column
= (column
//tabsize
+ 1)*tabsize
323 elif line
[pos
] == '\f':
331 if line
[pos
] in '#\r\n': # skip comments or blank lines
333 comment_token
= line
[pos
:].rstrip('\r\n')
334 nl_pos
= pos
+ len(comment_token
)
335 yield (COMMENT
, comment_token
,
336 (lnum
, pos
), (lnum
, pos
+ len(comment_token
)), line
)
337 yield (NL
, line
[nl_pos
:],
338 (lnum
, nl_pos
), (lnum
, len(line
)), line
)
340 yield ((NL
, COMMENT
)[line
[pos
] == '#'], line
[pos
:],
341 (lnum
, pos
), (lnum
, len(line
)), line
)
344 if column
> indents
[-1]: # count indents or dedents
345 indents
.append(column
)
346 yield (INDENT
, line
[:pos
], (lnum
, 0), (lnum
, pos
), line
)
347 while column
< indents
[-1]:
348 if column
not in indents
:
349 raise IndentationError(
350 "unindent does not match any outer indentation level",
351 ("<tokenize>", lnum
, pos
, line
))
352 indents
= indents
[:-1]
353 yield (DEDENT
, '', (lnum
, pos
), (lnum
, pos
), line
)
355 else: # continued statement
357 raise TokenError
, ("EOF in multi-line statement", (lnum
, 0))
361 pseudomatch
= pseudoprog
.match(line
, pos
)
362 if pseudomatch
: # scan for tokens
363 start
, end
= pseudomatch
.span(1)
364 spos
, epos
, pos
= (lnum
, start
), (lnum
, end
), end
365 token
, initial
= line
[start
:end
], line
[start
]
367 if initial
in numchars
or \
368 (initial
== '.' and token
!= '.'): # ordinary number
369 yield (NUMBER
, token
, spos
, epos
, line
)
370 elif initial
in '\r\n':
371 yield (NL
if parenlev
> 0 else NEWLINE
,
372 token
, spos
, epos
, line
)
374 assert not token
.endswith("\n")
375 yield (COMMENT
, token
, spos
, epos
, line
)
376 elif token
in triple_quoted
:
377 endprog
= endprogs
[token
]
378 endmatch
= endprog
.match(line
, pos
)
379 if endmatch
: # all on one line
380 pos
= endmatch
.end(0)
381 token
= line
[start
:pos
]
382 yield (STRING
, token
, spos
, (lnum
, pos
), line
)
384 strstart
= (lnum
, start
) # multiple lines
385 contstr
= line
[start
:]
388 elif initial
in single_quoted
or \
389 token
[:2] in single_quoted
or \
390 token
[:3] in single_quoted
:
391 if token
[-1] == '\n': # continued string
392 strstart
= (lnum
, start
)
393 endprog
= (endprogs
[initial
] or endprogs
[token
[1]] or
395 contstr
, needcont
= line
[start
:], 1
398 else: # ordinary string
399 yield (STRING
, token
, spos
, epos
, line
)
400 elif initial
in namechars
: # ordinary name
401 yield (NAME
, token
, spos
, epos
, line
)
402 elif initial
== '\\': # continued stmt
407 elif initial
in ')]}':
409 yield (OP
, token
, spos
, epos
, line
)
411 yield (ERRORTOKEN
, line
[pos
],
412 (lnum
, pos
), (lnum
, pos
+1), line
)
415 for indent
in indents
[1:]: # pop remaining indent levels
416 yield (DEDENT
, '', (lnum
, 0), (lnum
, 0), '')
417 yield (ENDMARKER
, '', (lnum
, 0), (lnum
, 0), '')
419 if __name__
== '__main__': # testing
421 if len(sys
.argv
) > 1:
422 tokenize(open(sys
.argv
[1]).readline
)
424 tokenize(sys
.stdin
.readline
)