from io import StringIO
parser.read = reader.read
parser.REMOVE_Cf = dict()
for [first, last] in Cf_RANGES:
    for c in range(first, last + 1):
        parser.REMOVE_Cf[c] = None
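# A minimal standalone sketch of what the table above does: str.translate()
# deletes every code point mapped to None, so Cf (format) characters never
# reach the tokenizer.  (Illustrative ranges only, not the real Cf_RANGES.)
_demo_remove_cf = {c: None for c in range(0x200C, 0x2010)}  # ZWNJ..RLM, all Cf
assert 'a\u200Eb'.translate(_demo_remove_cf) == 'ab'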
#~ parser.lookahead = list()
# (The enclosing generator's def is elided from this excerpt.)
while True:  # Get one or more statements or function declarations until EOF
    # TODO: check lookahead buffer first
    token = parser.read_token()
    if token == ('punctuator', '\n'):
        continue  # body inferred from context: skip bare line terminators
    if token == ('reserved word', 'function'):
        yield parser.parse_func_def()
        continue  # inferred: a function declaration is not also a statement
    yield parser.parse_statement(token)  # push token back to lookahead buffer?
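# Note that line terminators surface as ('punctuator', '\n') tokens rather
# than being discarded, presumably so automatic semicolon insertion can see
# them (handle_line_terminator and handle_multiline_comment below both
# synthesize this token).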
Cf_RANGES = (  # From UnicodeData-2.1.9.txt
    # ... (the code point ranges are elided from this excerpt)
)
def read_token(self):  # header inferred; the excerpt resumes mid-method
    match = Parser.TOKENS.search(self.buffer, self.pos)
    ...  # (elided: refill handling when no match is found)
    self.pos = match.end()
    if self.pos == len(self.buffer) and not self.eof:
        self.buffer = self.buffer[match.start():]
        ...  # (elided)
    handle = getattr(self, f'handle_{match.lastgroup}')
    return handle(match.group())
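# The getattr dispatch above relies on every alternative in the scanner
# regex being a named group: match.lastgroup names the group that matched,
# which selects the handle_* method.  A self-contained sketch of the idiom
# (hypothetical groups, not the real token grammar):
import re

class _DispatchDemo:
    TOKENS = re.compile(r'(?P<number>\d)|(?P<word>\w)')

    def read_token(self, text):
        match = self.TOKENS.search(text)
        return getattr(self, f'handle_{match.lastgroup}')(match.group())

    def handle_number(self, token):
        return ('number', token)

    def handle_word(self, token):
        return ('word', token)

assert _DispatchDemo().read_token('7') == ('number', '7')
assert _DispatchDemo().read_token('x') == ('word', 'x')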
def parse_statement(self, token):
    if token == ('reserved word', 'var'):
        ...  # (elided: presumably vars = list() and a declaration loop)
        token = self.read_token()
        [type, id] = token  # unpacking inferred from the assert and append below
        assert type == 'identifier'
        ...
        token = self.read_token()
        assert token == ('punctuator', '=')
        ...
        token = self.read_token()
        # String literals arrive as bare str tokens (see handle_string below):
        assert isinstance(token, str)
        vars.append((id, token))
        ...
        token = self.read_token()
        assert token == ('punctuator', ';')
        ...
    if token[0] == 'identifier':
        token2 = self.read_token()
        if token2[0] == 'punctuator' \
                and self.BIN_TER_OPS.fullmatch(token2[1]):
            rhs = self.read_token()
            assert rhs[0] == 'identifier'
            expr = (token2[1], token, rhs)
            ...
            token = self.read_token()
            ...
        raise NotImplementedError(token2)
    raise NotImplementedError(token)
BIN_TER_OPS = re.compile(
    r'&& | \|\| | (<< | >>>? | [=!]= | [-<>+*%&^/])=? | [.,?=]',
    re.VERBOSE)  # flags inferred: the spaces in the pattern require VERBOSE
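# Illustrative checks of the operator pattern (not in the original):
assert BIN_TER_OPS.fullmatch('>>>=')
assert BIN_TER_OPS.fullmatch('&&') and BIN_TER_OPS.fullmatch('?')
assert not BIN_TER_OPS.fullmatch('=>')  # not a JavaScript 1.x operator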
98 # "$", "_", plus L and Nl categories from UnicodeData-3.0.1.txt
99 IDENTIFIER_START
= r
'$_' \
100 'A-Za-z\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8' \
101 '\u02BB-\u02C1\u02D0\u02D1\u02E0-\u02E4\u02EE\u037A\u0386' \
102 '\u0388-\u0481\u048C-\u0559\u0561-\u0587\u05D0-\u05F2\u0621-\u064A' \
103 '\u0671-\u06D3\u06D5\u06E5-\u06E6\u06FA-\u06FC\u0710\u0712-\u072C' \
104 '\u0780-\u07A5\u0905-\u0939\u093D\u0950\u0958-\u0961\u0985-\u09B9' \
105 '\u09DC-\u09E1\u09F0-\u09F1\u0A05-\u0A39\u0A59-\u0A5E\u0A72-\u0A74' \
106 '\u0A85-\u0AB9\u0ABD\u0AD0-\u0AE0\u0B05-\u0B39\u0B3D\u0B5C-\u0B61' \
107 '\u0B85-\u0BB9\u0C05-\u0C39\u0C60-\u0C61\u0C85-\u0CB9\u0CDE-\u0CE1' \
108 '\u0D05-\u0D39\u0D60-\u0D61\u0D85-\u0DC6\u0E01-\u0E30\u0E32-\u0E33' \
109 '\u0E40-\u0E46\u0E81-\u0EB0\u0EB2-\u0EB3\u0EBD-\u0EC6\u0EDC-\u0F00' \
110 '\u0F40-\u0F6A\u0F88-\u0F8B\u1000-\u102A\u1050-\u1055\u10A0-\u10F6' \
111 '\u1100-\u135A\u13A0-\u166C\u166F-\u1676\u1681-\u169A\u16A0-\u16EA' \
112 '\u1780-\u17B3\u1820-\u18A8\u1E00-\u1FBC\u1FBE\u1FC2-\u1FCC' \
113 '\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC\u207F\u2102\u2107' \
114 '\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D' \
115 '\u212F-\u2131\u2133-\u2139\u2160-\u2183\u3005-\u3007\u3021-\u3029' \
116 '\u3031-\u3035\u3038-\u303A\u3041-\u3094\u309D-\u30FA\u30FC-\u318E' \
117 '\u31A0-\u31B7\u3400-\uA48C\uAC00-\uD7A3\uF900-\uFB1D\uFB1F-\uFB28' \
118 '\uFB2A-\uFD3D\uFD50-\uFDFB\uFE70-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A' \
IDENTIFIER_START_CHAR = re.compile(fr'[{IDENTIFIER_START}]|\\')
122 # "$" plus L, Nl, Mn, Mc, Nd and Pc (includes "_") categories
123 IDENTIFIER_PART
= r
'$\w' \
124 '\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8' \
125 '\u02BB-\u02C1\u02D0-\u02D1\u02E0-\u02E4\u02EE-\u0362\u037A\u0386' \
126 '\u0388-\u0481\u0483-\u0486\u048C-\u0559\u0561-\u0587\u0591-\u05BD' \
127 '\u05BF\u05C1-\u05C2\u05C4-\u05F2\u0621-\u0669\u0670-\u06D3' \
128 '\u06D5-\u06DC\u06DF-\u06E8\u06EA-\u06FC\u0710-\u0963\u0966-\u096F' \
129 '\u0981-\u09F1\u0A02-\u0B6F\u0B82-\u0BEF\u0C01-\u0DF3\u0E01-\u0E3A' \
130 '\u0E40-\u0E4E\u0E50-\u0E59\u0E81-\u0F00\u0F18-\u0F19\u0F20-\u0F29' \
131 '\u0F35\u0F37\u0F39\u0F3E-\u0F84\u0F86-\u0FBC\u0FC6\u1000-\u1049' \
132 '\u1050-\u10F6\u1100-\u135A\u1369-\u1371\u13A0-\u166C\u166F-\u1676' \
133 '\u1681-\u169A\u16A0-\u16EA\u1780-\u17D3\u17E0-\u17E9\u1810-\u1FBC' \
134 '\u1FBE\u1FC2-\u1FCC\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC' \
135 '\u203F-\u2040\u207F\u20D0-\u20DC\u20E1\u2102\u2107\u210A-\u2113' \
136 '\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2131' \
137 '\u2133-\u2139\u2160-\u2183\u3005-\u3007\u3021-\u302F\u3031-\u3035' \
138 '\u3038-\u303A\u3041-\u309A\u309D-\u318E\u31A0-\u31B7\u3400-\uA48C' \
139 '\uAC00-\uD7A3\uF900-\uFB28\uFB2A-\uFD3D\uFD50-\uFE23\uFE33-\uFE34' \
140 '\uFE4D-\uFE4F\uFE70-\uFEFC\uFF10-\uFF19\uFF21-\uFF3A\uFF3F' \
141 '\uFF41-\uFF5A\uFF65-\uFFDC'
LINE_TERMINATORS = '\n\r\u2028\u2029'
TOKENS = re.compile(  # assignment inferred; the opening line is elided
    fr'(?P<line_terminator> [{LINE_TERMINATORS}])'
    r'| (?P<singleline_comment> //)'
    r'| (?P<multiline_comment> /\*)'
    fr'| (?P<identifier> {IDENTIFIER_START_CHAR.pattern})'
    r'| (?P<number> [\d.])'
    r'| (?P<punctuator>'  # group opening inferred from the ")" below
    r'\+\+ | -- | && | \|\|'
    r'| (<< | >>>? | [=!]= | [-<>+*%&^/])=?'
    r'| [][{}().;,!~?:=])'
    '| (?P<string> ["\'])'
    # Anything else other than whitespace:
    '| (?P<illegal_codepoint> [^\t\v\f \xA0\u2000-\u200B\u3000])',
    re.DOTALL | re.VERBOSE | re.ASCII)
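# Note the scanner deliberately matches only the first character or two of
# most tokens (e.g. [\d.] for a number, ["'] for a string, // for a
# comment); the handle_* methods below consume the remainder incrementally,
# so a token can span buffer refills.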
def handle_line_terminator(self, token):
    return ('punctuator', '\n')
LINE_TERMINATOR_PATTERN = re.compile(fr'[{LINE_TERMINATORS}]')
def handle_singleline_comment(self, start):
    self.read_matches(fr'[^{self.LINE_TERMINATORS}]', 1)
def handle_multiline_comment(self, start):
    line_terminator = False
    ...  # (elided: loop searching for the closing */ across buffer refills)
    end = self.buffer.find('*/', self.pos)
    ...
    line_terminator |= bool(  # operator reconstructed; this line was garbled
        LINE_TERMINATOR_PATTERN.search(self.buffer, self.pos))
    ...
    if self.eof:  # guard inferred; the raise was otherwise unconditional
        raise EOFError('Unterminated multi-line comment')
    # Keep the final character, presumably so a trailing '*' can pair with
    # a '/' from the next read:
    self.buffer = self.buffer[max(len(self.buffer) - 1, self.pos):]
    ...
    # A /* ... */ comment that spans a line break counts as a line
    # terminator (cf. ECMA-262 section 7.4), hence the synthesized token:
    if line_terminator or LINE_TERMINATOR_PATTERN.search(
            self.buffer, self.pos, end):
        return ('punctuator', '\n')
def handle_identifier(self, token):
    token += self.read_matches(self.IDENTIFIER_PART, 6, re.ASCII)
    if token in self.RESERVED_WORDS:
        return ('reserved word', token)
    token = self.UNICODE_ESCAPES.sub(self.decode_unicode, token)
    assert self.IDENTIFIER_NAMES.fullmatch(token)
    assert token not in self.RESERVED_WORDS
    return ('identifier', token)
RESERVED_WORDS = frozenset((  # container inferred; the opening is elided
    'break', 'else', 'new', 'var', 'case', 'finally', 'return',
    'void', 'catch', 'for', 'switch', 'while', 'continue',
    'function', 'this', 'with', 'default', 'if', 'throw', 'delete',
    'in', 'try', 'do', 'instanceof', 'typeof',
    # Future-reserved words:
    'abstract', 'enum', 'int', 'short', 'boolean', 'export',
    'interface', 'static', 'byte', 'extends', 'long', 'super',
    'char', 'final', 'native', 'synchronized', 'class', 'float',
    'package', 'throws', 'const', 'goto', 'private', 'transient',
    'debugger', 'implements', 'protected', 'volatile', 'double',
    # ... (a couple of entries are elided here)
    'null', 'true', 'false',
))
UNICODE_ESCAPES = re.compile(r'\\u(.{4})', re.ASCII)

@staticmethod  # decorator inferred: used as a re.sub() callback above
def decode_unicode(match):
    return chr(int(match.group(1), 16))
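# Standalone check of the \uXXXX decoding above (illustrative only):
_esc = re.compile(r'\\u(.{4})', re.ASCII)
assert _esc.sub(lambda m: chr(int(m.group(1), 16)), r'caf\u00E9') == 'café'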
IDENTIFIER_PART = fr'[{IDENTIFIER_PART}]|\\u[\da-fA-F]{{4}}'
IDENTIFIER_NAMES = re.compile(fr'''
    ([{IDENTIFIER_START}] | \\u[\da-fA-F]{{4}})
    # (the repeated identifier-part alternatives are elided here)
    ''', re.VERBOSE | re.ASCII)
def handle_number(self, token):
    number = None  # inferred: tested below, only set in the hex branch
    ...  # (elided: branch on the leading character, '.', '0' or a digit)
    token += self.read_matches(r'\d', 1, re.ASCII)
    ...
    return ('punctuator', '.')  # a lone '.' is punctuation, not a number
    ...
    if self.read_match(*self.X):
        number = self.read_matches(r'[\da-fA-F]', 1, re.ASCII)
        ...
        number = int(number, 16)
        if number >= 2**1024 - 2**(1024 - 54):
            ...  # (elided: at this bound an IEEE double rounds to infinity)
    else:  # Nonzero digit
        token += self.read_matches(r'\d', 1, re.ASCII)
    if number is None and self.read_match(*self.POINT):
        token += '.' + self.read_matches(r'\d', 1, re.ASCII)
    ...
    prefix = self.read_match(*self.EXPONENT_PREFIX)
    ...
    exp = self.read_matches(r'\d', 1, re.ASCII)
    ...
    token += prefix.group() + exp
    ...
    assert not self.read_match(self.IDENTIFIER_START_CHAR, 1)
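# The 2**1024 - 2**(1024 - 54) bound above is where IEEE-754 doubles stop
# being finite: values at or beyond it round to infinity.  A standalone
# check (Python raises OverflowError rather than produce an infinite float
# from an int):
import math
assert math.isfinite(float(2**1024 - 2**970 - 1))
try:
    float(2**1024 - 2**970)
except OverflowError:
    pass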
# (pattern, maxlen) pairs for read_match():
X = (re.compile(r'[xX]'), 1)
POINT = (re.compile(r'\.'), 1)
EXPONENT_PREFIX = (re.compile(r'[eE][-+]?'), 2)
def handle_punctuator(self, token):
    if token.startswith('/'):
        self.pos -= len(token) - 1  # push back all but the leading slash
        pattern = self.read_matches(
            fr'[^{self.LINE_TERMINATORS}{self.BACKSLASH}/]'
            fr'|\\[^{self.LINE_TERMINATORS}]', 2)
        slash = self.read_match(*self.SLASH)
        ...  # (elided: presumably validates the closing slash)
        flags = self.read_matches(self.IDENTIFIER_PART, 6, re.ASCII)
        return ('regex', pattern, flags)

    return ('punctuator', token)

SLASH = (re.compile(r'/'), 1)
def handle_string(self, quote):
    s = self.read_matches(
        fr'[^{quote}{self.BACKSLASH}{self.LINE_TERMINATORS}]'
        fr'|\\[^1-9xu{self.LINE_TERMINATORS}]'
        # (one alternative, presumably for \xXX escapes, is elided here)
        r'|\\u[\da-fA-F]{4}', 6, re.ASCII)
    end = self.read_match(re.compile(quote), 1)
    ...  # (elided: presumably asserts the closing quote was found)
    assert not self.INVALID_NUL.search(s)
    return self.ESCAPE_SEQUENCES.sub(self.decode_escape, s)
BACKSLASH = '\\{:03o}'.format(ord('\\'))  # r'\134': safe inside char classes
INVALID_NUL = re.compile(r'\\0\d', re.ASCII)  # '\0' followed by a digit is rejected
ESCAPE_SEQUENCES = re.compile(r'\\(x.{2}|u.{4}|.)', re.ASCII)
@staticmethod  # decorator inferred, as for decode_unicode above
def decode_escape(match):
    match = match.group(1)
    if match.startswith(('x', 'u')):
        c = chr(int(match[1:], 16))
    else:  # branch and dict opening inferred from the entries below
        escapes = {
            'n': '\n', 'b': '\b', 'f': '\f', 'r': '\r', 't': '\t',
            'v': '\v', '0': '\x00'}
        c = escapes.get(match, match)
    return c  # return inferred; re.sub() needs the replacement string
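# Standalone sketch of the escape decoding (hypothetical helper name):
_seq = re.compile(r'\\(x.{2}|u.{4}|.)', re.ASCII)
def _decode(m):
    m = m.group(1)
    if m.startswith(('x', 'u')):
        return chr(int(m[1:], 16))
    return {'n': '\n', 'b': '\b', 'f': '\f', 'r': '\r', 't': '\t',
            'v': '\v', '0': '\x00'}.get(m, m)
assert _seq.sub(_decode, r'a\n\x41\u0042') == 'a\nAB'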
def parse_punctuation(self, match):
    ...  # (elided)
    [c, end] = parse_block(response)
def read_matches(self, pattern, itemlen, flags=0):
    pattern = re.compile(r'(?:{})*'.format(pattern), flags)
    result = StringIO()  # inferred from result.write()/getvalue() below
    while True:  # loop inferred from the exit condition below
        match = pattern.match(self.buffer, self.pos)
        result.write(match.group())
        self.pos = match.end()
        if len(self.buffer) - self.pos >= itemlen or self.eof:
            break  # inferred
        self.buffer = self.buffer[self.pos:]
        ...  # (elided: reset self.pos and refill the buffer)
    return result.getvalue()
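# itemlen is the longest unit one iteration must be able to see whole before
# giving up: 1 for single characters, 2 for backslash escape pairs, 6 for
# \uXXXX escapes (hence the 6 passed by handle_identifier and handle_string).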
def read_match(self, pattern, maxlen):
    while self.pos + maxlen > len(self.buffer) and not self.eof:
        self.buffer = self.buffer[self.pos:]
        ...  # (elided: reset self.pos and refill the buffer)
    match = pattern.match(self.buffer, self.pos)
    if match:  # guard inferred; callers receive the match object or None
        self.pos = match.end()
    return match  # inferred from callers such as prefix.group() above
# Buffer refill (the enclosing method header is elided from this excerpt):
size = self.BUFSIZE - len(self.buffer)
new = self.read(size)
self.eof = len(new) < size  # a short read means the source is exhausted
self.buffer += new.translate(self.REMOVE_Cf)  # strip Cf chars on the way in