1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro, Raymond Hettinger'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL", "untokenize"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

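# Note (illustrative, not part of the original module): these helpers simply
# build alternation/repetition patterns as strings, e.g.
#     group('a', 'b')  ==  '(a|b)'
#     any(r'\d')       ==  r'(\d)*'
#     maybe('x')       ==  '(x)?'
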
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

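# Illustrative usage (not part of the original module): passing a custom
# tokeneater callback to the old-style tokenize() interface, assuming
# Python 2's StringIO module for an in-memory readline.
#
#     from StringIO import StringIO
#     def eater(type, token, start, end, line):
#         print tok_name.get(type, type), repr(token)
#     tokenize(StringIO("a = b + c\n").readline, eater)
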
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]
            if toknum in (NAME, NUMBER):
                tokval += ' '
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

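# Illustrative usage (not part of the original module): round-tripping a small
# piece of source through generate_tokens() and untokenize(), assuming
# Python 2's StringIO module.  With full 5-tuples the output matches the
# input exactly, per the docstring above.
#
#     from StringIO import StringIO
#     source = "if x:\n    y = 1\n"
#     toks = list(generate_tokens(StringIO(source).readline))
#     assert untokenize(toks) == source
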
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                break
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

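# Illustrative usage (not part of the original module): tokenizing source held
# in memory, assuming Python 2's StringIO module.
#
#     from StringIO import StringIO
#     for toktype, tokstr, spos, epos, ln in generate_tokens(
#             StringIO("x = 1 + 2\n").readline):
#         print tok_name[toktype], repr(tokstr), spos, epos
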
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)