1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
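
# A minimal usage sketch (illustrative only, not executed on import).  Any
# object with a file-like readline() method works; StringIO is used here
# purely as an example input source.
#
#     from StringIO import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[tok[0]], repr(tok[1]), tok[2], tok[3]
#     # NAME 'x' (1, 0) (1, 1)
#     # OP '=' (1, 2) (1, 3)
#     # NUMBER '1' (1, 4) (1, 5)
#     # NEWLINE '\n' (1, 5) (1, 6)
#     # ENDMARKER '' (2, 0) (2, 0)
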
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).  A short illustration follows the
# Funny definition below.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)
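
# Illustration of the ordering requirement above (interactive sketch, not
# executed at import time): with the longest alternative first, '**=' is
# matched as one operator; with the order reversed, re stops after '*'.
#
#     >>> import re
#     >>> re.match(group(r"\*\*=?", r"\*=?"), "**=").group(0)
#     '**='
#     >>> re.match(group(r"\*=?", r"\*\*=?"), "**=").group(0)
#     '*'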

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
134 "r'", 'r"', "R'", 'R"',
135 "u'", 'u"', "U'", 'U"',
136 "ur'", 'ur"', "Ur'", 'Ur"',
137 "uR'", 'uR"', "UR'", 'UR"',
138 "b'", 'b"', "B'", 'B"',
139 "br'", 'br"', "Br'", 'Br"',
140 "bR'", 'bR"', "BR'", 'BR"' ):

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
          (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().  A short usage sketch follows this
    function.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
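
# A brief usage sketch (illustrative only, not executed on import).  Any
# callable accepting the five token fields can serve as tokeneater; the
# name `collect` below is a hypothetical example.
#
#     from StringIO import StringIO
#     found = []
#     def collect(type, token, start, end, line):
#         found.append((tok_name[type], token))
#     tokenize(StringIO("if x:\n    y = 2\n").readline, collect)
#     # found now holds pairs such as ('NAME', 'if'), ('NAME', 'x'),
#     # ('OP', ':'), ('NEWLINE', '\n'), ('INDENT', '    '), ...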

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER):
                tokval += ' '
            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
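
# A hedged round-trip sketch mirroring the docstring invariant above
# (illustrative only; the source string is arbitrary):
#
#     from StringIO import StringIO
#     source = "x = 1\nif x:\n    x = 2\n"
#     t1 = [tok[:2] for tok in generate_tokens(StringIO(source).readline)]
#     newcode = untokenize(t1)
#     readline = iter(newcode.splitlines(1)).next
#     assert t1 == [tok[:2] for tok in generate_tokens(readline)]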

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.  A commented example
    follows this function.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                break
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
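
# Unlike the built-in tokenizer, generate_tokens() reports comments and
# non-logical newlines explicitly (illustrative sketch; not executed on
# import):
#
#     from StringIO import StringIO
#     for tok in generate_tokens(StringIO("# note\nx = 1\n").readline):
#         print tok_name[tok[0]], repr(tok[1])
#     # COMMENT '# note'
#     # NL '\n'
#     # NAME 'x'
#     # OP '='
#     # NUMBER '1'
#     # NEWLINE '\n'
#     # ENDMARKER ''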

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)