Quick binary expression handling for "test_token_buffering"
[vadmium-streams.git] / javascript.py
blob eb9e56a3be53abf24b6a9de640e1ff45c5a345e7
from io import StringIO
import re
import math

def parse(reader):
    parser = Parser()
    parser.read = reader.read
    
    parser.REMOVE_Cf = dict()
    for [first, last] in Cf_RANGES:
        for c in range(first, last + 1):
            parser.REMOVE_Cf[c] = None
    
    parser.buffer = ''
    parser.pos = 0
    parser.eof = False
    #~ parser.lookahead = list()
    
    empty = True
    while True:  # Get one or more statements or function declarations until EOF
        # TODO: check lookahead buffer first
        token = parser.read_token()
        if token is None:
            break
        if token == ('punctuator', '\n'):
            continue
        if token == ('reserved word', 'function'):
            yield parser.parse_func_def()
        else:
            yield parser.parse_statement(token)  # push token back to lookahead buffer?
        empty = False
    assert not empty

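# A minimal usage sketch (assumed, not part of the original file): any
# object with a read(size) method works as the reader, e.g. io.StringIO.
#
#     >>> list(parse(StringIO('var x = "1";')))
#     [('var', [('x', '1')])]
#     >>> list(parse(StringIO('a + b')))
#     [('+', ('identifier', 'a'), ('identifier', 'b'))]
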
Cf_RANGES = (  # From UnicodeData-2.1.9.txt
    (0x200C, 0x200F),
    (0x202A, 0x202E),
    (0x206A, 0x206F),
    (0xFEFF, 0xFEFF),
)

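# Sketch of the Cf stripping that Parser.fill() performs (illustration
# only): str.translate() deletes any code point that is mapped to None.
#
#     >>> 'a\u200Cb'.translate({0x200C: None})
#     'ab'
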
class Parser:
    def read_token(self):
        while True:
            match = Parser.TOKENS.search(self.buffer, self.pos)
            if not match:
                if self.eof:
                    return None
                self.buffer = str()
                self.fill()
                continue
            self.pos = match.end()
            if self.pos == len(self.buffer) and not self.eof:
                # The match may have been cut short at the end of the
                # buffer; keep it and retry with more input
                self.buffer = self.buffer[match.start():]
                self.fill()
                continue
            handle = getattr(self, f'handle_{match.lastgroup}')
            token = handle(match.group())
            if token is not None:
                return token
            # Comment handlers return None when they produce no token;
            # keep scanning rather than reporting a spurious EOF
    
    def parse_statement(self, token):
        if token == ('reserved word', 'var'):
            vars = list()
            
            token = self.read_token()
            [type, id] = token
            assert type == 'identifier'
            
            token = self.read_token()
            assert token == ('punctuator', '=')
            
            token = self.read_token()
            assert isinstance(token, str)
            vars.append((id, token))
            
            token = self.read_token()
            assert token == ('punctuator', ';')
            return ('var', vars)
        if token[0] == 'identifier':
            token2 = self.read_token()
            if token2[0] == 'punctuator' \
                    and self.BIN_TER_OPS.fullmatch(token2[1]):
                rhs = self.read_token()
                assert rhs[0] == 'identifier'
                expr = (token2[1], token, rhs)
                
                token = self.read_token()
                assert token is None
                return expr
            raise NotImplementedError(token2)
        raise NotImplementedError(token)
    
    BIN_TER_OPS = re.compile(
        r'&& | \|\| | (<< | >>>? | [=!]= | [-<>+*%&^/])=? | [.,?=]',
        re.VERBOSE)
    
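    # Illustrative matches (assumptions, easy to verify interactively):
    #
    #     >>> bool(Parser.BIN_TER_OPS.fullmatch('>>>='))
    #     True
    #     >>> bool(Parser.BIN_TER_OPS.fullmatch('=>'))
    #     False
    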
    BUFSIZE = 0x10000
    
98 # "$", "_", plus L and Nl categories from UnicodeData-3.0.1.txt
99 IDENTIFIER_START = r'$_' \
100 'A-Za-z\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8' \
101 '\u02BB-\u02C1\u02D0\u02D1\u02E0-\u02E4\u02EE\u037A\u0386' \
102 '\u0388-\u0481\u048C-\u0559\u0561-\u0587\u05D0-\u05F2\u0621-\u064A' \
103 '\u0671-\u06D3\u06D5\u06E5-\u06E6\u06FA-\u06FC\u0710\u0712-\u072C' \
104 '\u0780-\u07A5\u0905-\u0939\u093D\u0950\u0958-\u0961\u0985-\u09B9' \
105 '\u09DC-\u09E1\u09F0-\u09F1\u0A05-\u0A39\u0A59-\u0A5E\u0A72-\u0A74' \
106 '\u0A85-\u0AB9\u0ABD\u0AD0-\u0AE0\u0B05-\u0B39\u0B3D\u0B5C-\u0B61' \
107 '\u0B85-\u0BB9\u0C05-\u0C39\u0C60-\u0C61\u0C85-\u0CB9\u0CDE-\u0CE1' \
108 '\u0D05-\u0D39\u0D60-\u0D61\u0D85-\u0DC6\u0E01-\u0E30\u0E32-\u0E33' \
109 '\u0E40-\u0E46\u0E81-\u0EB0\u0EB2-\u0EB3\u0EBD-\u0EC6\u0EDC-\u0F00' \
110 '\u0F40-\u0F6A\u0F88-\u0F8B\u1000-\u102A\u1050-\u1055\u10A0-\u10F6' \
111 '\u1100-\u135A\u13A0-\u166C\u166F-\u1676\u1681-\u169A\u16A0-\u16EA' \
112 '\u1780-\u17B3\u1820-\u18A8\u1E00-\u1FBC\u1FBE\u1FC2-\u1FCC' \
113 '\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC\u207F\u2102\u2107' \
114 '\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D' \
115 '\u212F-\u2131\u2133-\u2139\u2160-\u2183\u3005-\u3007\u3021-\u3029' \
116 '\u3031-\u3035\u3038-\u303A\u3041-\u3094\u309D-\u30FA\u30FC-\u318E' \
117 '\u31A0-\u31B7\u3400-\uA48C\uAC00-\uD7A3\uF900-\uFB1D\uFB1F-\uFB28' \
118 '\uFB2A-\uFD3D\uFD50-\uFDFB\uFE70-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A' \
119 '\uFF66-\uFFDC'
120 IDENTIFIER_START_CHAR = re.compile(fr'[{IDENTIFIER_START}]|\\')
122 # "$" plus L, Nl, Mn, Mc, Nd and Pc (includes "_") categories
123 IDENTIFIER_PART = r'$\w' \
124 '\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8' \
125 '\u02BB-\u02C1\u02D0-\u02D1\u02E0-\u02E4\u02EE-\u0362\u037A\u0386' \
126 '\u0388-\u0481\u0483-\u0486\u048C-\u0559\u0561-\u0587\u0591-\u05BD' \
127 '\u05BF\u05C1-\u05C2\u05C4-\u05F2\u0621-\u0669\u0670-\u06D3' \
128 '\u06D5-\u06DC\u06DF-\u06E8\u06EA-\u06FC\u0710-\u0963\u0966-\u096F' \
129 '\u0981-\u09F1\u0A02-\u0B6F\u0B82-\u0BEF\u0C01-\u0DF3\u0E01-\u0E3A' \
130 '\u0E40-\u0E4E\u0E50-\u0E59\u0E81-\u0F00\u0F18-\u0F19\u0F20-\u0F29' \
131 '\u0F35\u0F37\u0F39\u0F3E-\u0F84\u0F86-\u0FBC\u0FC6\u1000-\u1049' \
132 '\u1050-\u10F6\u1100-\u135A\u1369-\u1371\u13A0-\u166C\u166F-\u1676' \
133 '\u1681-\u169A\u16A0-\u16EA\u1780-\u17D3\u17E0-\u17E9\u1810-\u1FBC' \
134 '\u1FBE\u1FC2-\u1FCC\u1FD0-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FFC' \
135 '\u203F-\u2040\u207F\u20D0-\u20DC\u20E1\u2102\u2107\u210A-\u2113' \
136 '\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2131' \
137 '\u2133-\u2139\u2160-\u2183\u3005-\u3007\u3021-\u302F\u3031-\u3035' \
138 '\u3038-\u303A\u3041-\u309A\u309D-\u318E\u31A0-\u31B7\u3400-\uA48C' \
139 '\uAC00-\uD7A3\uF900-\uFB28\uFB2A-\uFD3D\uFD50-\uFE23\uFE33-\uFE34' \
140 '\uFE4D-\uFE4F\uFE70-\uFEFC\uFF10-\uFF19\uFF21-\uFF3A\uFF3F' \
141 '\uFF41-\uFF5A\uFF65-\uFFDC'
    LINE_TERMINATORS = '\n\r\u2028\u2029'
    
    TOKENS = re.compile(
        fr'(?P<line_terminator> [{LINE_TERMINATORS}])'
        r'| (?P<singleline_comment> //)'
        r'| (?P<multiline_comment> /\*)'
        fr'| (?P<identifier> {IDENTIFIER_START_CHAR.pattern})'
        r'| (?P<number> [\d.])'
        r'| (?P<punctuator>'
        r'\+\+ | -- | && | \|\|'
        r'| (<< | >>>? | [=!]= | [-<>+*%&^/])=?'
        r'| [][{}().;,!~?:=])'
        '| (?P<string> ["\'])'
        # Anything else other than whitespace:
        '| (?P<illegal_codepoint> [^\t\v\f \xA0\u2000-\u200B\u3000])',
        re.DOTALL | re.VERBOSE | re.ASCII)
    
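    # Dispatch sketch: read_token() picks its handler from the name of
    # the group that matched (illustration, assumed behaviour):
    #
    #     >>> m = Parser.TOKENS.search('  42', 0)
    #     >>> (m.lastgroup, m.group())
    #     ('number', '4')
    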
    def handle_line_terminator(self, token):
        return ('punctuator', '\n')
    
    LINE_TERMINATOR_PATTERN = re.compile(fr'[{LINE_TERMINATORS}]')
    
    def handle_singleline_comment(self, start):
        # Consume to end of line; returning None means "no token here"
        self.read_matches(fr'[^{self.LINE_TERMINATORS}]', 1)
    
    def handle_multiline_comment(self, start):
        line_terminator = False
        while True:
            end = self.buffer.find('*/', self.pos)
            if end >= 0:
                break
            line_terminator |= bool(
                self.LINE_TERMINATOR_PATTERN.search(self.buffer, self.pos))
            if self.eof:
                raise EOFError('Unterminated multi-line comment')
            # Keep the final character in case "*/" straddles the boundary
            self.buffer = self.buffer[max(len(self.buffer) - 1, self.pos):]
            self.fill()
        if self.LINE_TERMINATOR_PATTERN.search(self.buffer, self.pos, end):
            line_terminator = True
        self.pos = end + 2
        if line_terminator:
            # A comment spanning a line break counts as a line terminator
            return ('punctuator', '\n')
    
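    # A comment without a line terminator vanishes entirely
    # (illustration, assuming the None-skipping in read_token()):
    #
    #     >>> list(parse(StringIO('a /* inline */ + b')))
    #     [('+', ('identifier', 'a'), ('identifier', 'b'))]
    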
    def handle_identifier(self, token):
        token += self.read_matches(self.IDENTIFIER_PART, 6, re.ASCII)
        if token in self.RESERVED_WORDS:
            return ('reserved word', token)
        token = self.UNICODE_ESCAPES.sub(self.decode_unicode, token)
        assert self.IDENTIFIER_NAMES.fullmatch(token)
        assert token not in self.RESERVED_WORDS
        return ('identifier', token)
    
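    # Unicode escapes are decoded only after the reserved-word check
    # (illustration, assumed behaviour):
    #
    #     >>> list(parse(StringIO(r'\u0061 + b')))
    #     [('+', ('identifier', 'a'), ('identifier', 'b'))]
    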
    RESERVED_WORDS = {
        # Keywords:
        'break', 'else', 'new', 'var', 'case', 'finally', 'return',
        'void', 'catch', 'for', 'switch', 'while', 'continue',
        'function', 'this', 'with', 'default', 'if', 'throw', 'delete',
        'in', 'try', 'do', 'instanceof', 'typeof',
        # Future-reserved words:
        'abstract', 'enum', 'int', 'short', 'boolean', 'export',
        'interface', 'static', 'byte', 'extends', 'long', 'super',
        'char', 'final', 'native', 'synchronized', 'class', 'float',
        'package', 'throws', 'const', 'goto', 'private', 'transient',
        'debugger', 'implements', 'protected', 'volatile', 'double',
        'import', 'public',
        # Literals:
        'null', 'true', 'false',
    }
    
    UNICODE_ESCAPES = re.compile(r'\\u(.{4})', re.ASCII)
    
    @staticmethod
    def decode_unicode(match):
        return chr(int(match.group(1), 16))
    
    IDENTIFIER_PART = fr'[{IDENTIFIER_PART}]|\\u[\da-fA-F]{{4}}'
    IDENTIFIER_NAMES = re.compile(fr'''
        ([{IDENTIFIER_START}] | \\u[\da-fA-F]{{4}})
        ({IDENTIFIER_PART})*
    ''', re.VERBOSE | re.ASCII)
    
    def handle_number(self, token):
        number = None
        if token == '.':
            token += self.read_matches(r'\d', 1, re.ASCII)
            if token == '.':
                return ('punctuator', '.')
        else:
            if token == '0':
                if self.read_match(*self.X):
                    number = self.read_matches(r'[\da-fA-F]', 1, re.ASCII)
                    assert number > ''
                    number = int(number, 16)
                    # Hex literals at or beyond the IEEE 754 rounding
                    # boundary overflow to infinity
                    if number >= 2**1024 - 2**(1024 - 54):
                        number = math.inf
            else:  # Nonzero digit
                token += self.read_matches(r'\d', 1, re.ASCII)
        if number is None and self.read_match(*self.POINT):
            token += '.' + self.read_matches(r'\d', 1, re.ASCII)
        if number is None:
            prefix = self.read_match(*self.EXPONENT_PREFIX)
            if prefix:
                exp = self.read_matches(r'\d', 1, re.ASCII)
                assert exp > ''
                token += prefix.group() + exp
            number = token
        assert not self.read_match(self.IDENTIFIER_START_CHAR, 1)
        return float(number)
    
    # (pattern, maximum match length) pairs for read_match()
    X = (re.compile(r'[xX]'), 1)
    POINT = (re.compile(r'\.'), 1)
    EXPONENT_PREFIX = (re.compile(r'[eE][-+]?'), 2)
    
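    # The 2**1024 - 2**(1024 - 54) bound used in handle_number() is the
    # IEEE 754 double rounding boundary; anything below it still
    # converts to a finite float (illustration):
    #
    #     >>> float(2**1024 - 2**970 - 1)
    #     1.7976931348623157e+308
    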
    def handle_punctuator(self, token):
        if token.startswith('/'):
            # Without grammar context, a leading "/" is always lexed as
            # a regular expression literal, never as division
            self.pos -= len(token) - 1
            pattern = self.read_matches(
                fr'[^{self.LINE_TERMINATORS}{self.BACKSLASH}/]'
                fr'|\\[^{self.LINE_TERMINATORS}]', 2)
            slash = self.read_match(*self.SLASH)
            assert slash
            flags = self.read_matches(self.IDENTIFIER_PART, 6, re.ASCII)
            return ('regex', pattern, flags)
        else:
            return ('punctuator', token)
    
    SLASH = (re.compile(r'/'), 1)
    
    def handle_string(self, quote):
        s = self.read_matches(
            fr'[^{quote}{self.BACKSLASH}{self.LINE_TERMINATORS}]'
            fr'|\\[^1-9xu{self.LINE_TERMINATORS}]'
            r'|\\x[\da-fA-F]{2}'
            r'|\\u[\da-fA-F]{4}', 6, re.ASCII)
        end = self.read_match(re.compile(quote), 1)
        assert end
        assert not self.INVALID_NUL.search(s)
        return self.ESCAPE_SEQUENCES.sub(self.decode_escape, s)
    
    BACKSLASH = '\\{:03o}'.format(ord('\\'))
    INVALID_NUL = re.compile(r'\\0\d', re.ASCII)
    ESCAPE_SEQUENCES = re.compile(r'\\(x.{2}|u.{4}|.)', re.ASCII)
    
    @staticmethod
    def decode_escape(match):
        match = match.group(1)
        if match.startswith(('x', 'u')):
            c = chr(int(match[1:], 16))
        else:
            escapes = {
                'n': '\n', 'b': '\b', 'f': '\f', 'r': '\r', 't': '\t',
                'v': '\v', '0': '\x00'}
            c = escapes.get(match, match)
        return c
    
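    # Escape handling sketch (assumed behaviour):
    #
    #     >>> list(parse(StringIO(r'var s = "a\x41\u0042\n";')))
    #     [('var', [('s', 'aAB\n')])]
    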
    def parse_punctuation(self, match):
        # Unfinished: "c", "tokens", "parse_block" and "response" below
        # are undefined; kept as a sketch of "{ ... }" block handling
        #~ if c == '}':
        #~     return (tokens, '}')
        #~ if c == '{':
        #~     [c, end] = parse_block(response)
        #~     assert end == '}'
        raise NotImplementedError(match)
    
    def read_matches(self, pattern, itemlen, flags=0):
        # Repeatedly match "pattern", refilling the buffer whenever
        # fewer than "itemlen" characters remain, so that a match is
        # never truncated at the buffer boundary
        pattern = re.compile(r'(?:{})*'.format(pattern), flags)
        result = StringIO()
        while True:
            match = pattern.match(self.buffer, self.pos)
            result.write(match.group())
            self.pos = match.end()
            if len(self.buffer) - self.pos >= itemlen or self.eof:
                break
            self.buffer = self.buffer[self.pos:]
            self.fill()
        return result.getvalue()
    
    def read_match(self, pattern, maxlen):
        # Ensure at least "maxlen" characters are buffered (or EOF has
        # been reached) before attempting a single match
        while self.pos + maxlen > len(self.buffer) and not self.eof:
            self.buffer = self.buffer[self.pos:]
            self.fill()
        match = pattern.match(self.buffer, self.pos)
        if match:
            self.pos = match.end()
        return match
    
    def fill(self):
        size = self.BUFSIZE - len(self.buffer)
        new = self.read(size)
        # A short read is taken to mean end of input
        self.eof = len(new) < size
        self.buffer += new.translate(self.REMOVE_Cf)
        self.pos = 0
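
def _lex(text):
    # Hypothetical debugging helper, not in the original file: set up a
    # Parser the same way parse() does and yield raw tokens.
    #
    #     >>> list(_lex('/ab/g'))
    #     [('regex', 'ab', 'g')]
    #     >>> list(_lex('0x10 .5 1e3'))
    #     [16.0, 0.5, 1000.0]
    parser = Parser()
    parser.read = StringIO(text).read
    parser.REMOVE_Cf = {c: None
        for [first, last] in Cf_RANGES for c in range(first, last + 1)}
    parser.buffer = ''
    parser.pos = 0
    parser.eof = False
    while True:
        token = parser.read_token()
        if token is None:
            return
        yield token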