1 """Parse a grammar written in ECMArkup."""
3 from __future__ import annotations
4 # mypy: no-implicit-optional
6 import os
7 import collections
8 from typing import Dict, Iterable, Optional, Tuple
10 from jsparagus import parse_pgen, gen, grammar, extension, types
11 from jsparagus.lexer import LexicalGrammar
12 from jsparagus.ordered import OrderedSet, OrderedFrozenSet


ESGrammarLexer = LexicalGrammar(
    # the operators and keywords:
    "[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; "
    "but empty here lookahead no not of one or returns through Some None impl for let",

    NL="\n",

    # any number of colons together
    EQ=r':+',

    # terminals of the ES grammar, quoted with backticks
    T=r'`[^` \n]+`|```',

    # also terminals, denoting control characters
    CHR=r'<[A-Z ]+>|U\+[0-9A-F]{4}',

    # nonterminals/types that will be followed by parameters
    NTCALL=r'[A-Za-z]\w*(?=[\[<])',

    # nonterminals (also, boolean parameters and type names)
    NT=r'[A-Za-z]\w*',

    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives names to a few productions
    PRODID=r'#[A-Za-z]\w*',

    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets
    WPROSE=r'\[>[^]]*\]',

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # Rust-style line comments, used in grammar extensions
    RUSTCOMMENT=r'//.*\n',
)
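
# For illustration only -- a hedged sketch of how the token classes above
# combine, not from the original source. A spec line such as
#
#     IdentifierReference[Yield, Await] :
#
# tokenizes roughly as NTCALL("IdentifierReference"), "[", NT("Yield"), ",",
# NT("Await"), "]", EQ(":"), NL. The name matches NTCALL rather than NT
# because it is immediately followed by "[".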


ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(os.path.dirname(__file__), "esgrammar.pgen")))


SIGIL_FALSE = '~'
SIGIL_TRUE = '+'

# Abbreviations for single-character terminals, used in the lexical grammar.
ECMASCRIPT_CODE_POINTS = {
    # From <https://tc39.es/ecma262/#table-31>
    '<ZWNJ>': grammar.Literal('\u200c'),
    '<ZWJ>': grammar.Literal('\u200d'),
    '<ZWNBSP>': grammar.Literal('\ufeff'),

    # From <https://tc39.es/ecma262/#table-32>
    '<TAB>': grammar.Literal('\t'),
    '<VT>': grammar.Literal('\u000b'),
    '<FF>': grammar.Literal('\u000c'),
    '<SP>': grammar.Literal(' '),
    '<NBSP>': grammar.Literal('\u00a0'),
    # <ZWNBSP> already defined above
    '<USP>': grammar.UnicodeCategory('Zs'),

    # From <https://tc39.es/ecma262/#table-33>
    '<LF>': grammar.Literal('\u000a'),
    '<CR>': grammar.Literal('\u000d'),
    '<LS>': grammar.Literal('\u2028'),
    '<PS>': grammar.Literal('\u2029'),
}
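
# For illustration (hedged): most entries above map an abbreviation to one
# code point, e.g. ECMASCRIPT_CODE_POINTS['<TAB>'] is grammar.Literal('\t');
# '<USP>' instead stands for the whole Space_Separator (Zs) category.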


class ESGrammarBuilder:
    def __init__(self, terminal_names):
        # Names of terminals that are written as nonterminals in the grammar.
        # For example, "BooleanLiteral" is a terminal name when parsing the
        # syntactic grammar.
        if terminal_names is None:
            terminal_names = frozenset()
        self.terminal_names = frozenset(terminal_names)
        self.reset()

    def reset(self):
        self.lexer = None
        # This is how full parsing and lazy parsing are implemented, using
        # different traits.

        # This field contains the Rust trait used for calling the method.
        # When a CallMethod is generated, it is assumed to be a function of
        # this trait. The trait is used by the Rust backend to generate
        # multiple backends implementing different sets of traits.
        # Having the trait on the function call is useful as a way to filter
        # function calls at code-generation time.

        # This field is updated by `rust_param_impl`, which is used in
        # grammar extensions, and visited before producing any CallMethod.
        self.method_trait = "AstBuilder"

    def rust_edsl(self, impl, grammar):
        return extension.GrammarExtension(impl, grammar, self.lexer.filename)

    def rust_param_impl(self, trait, for_type, param):
        self.method_trait = trait
        return extension.ImplFor(param, trait, for_type)

    def rust_impl(self, trait, impl_type):
        return self.rust_param_impl(trait, impl_type, [])

    def rust_nt_def(self, lhs, rhs_line):
        # Right now, focus only on the syntactic grammar, and assume that all
        # rules patch an existing grammar production by adding code.
        return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))

    def rust_rhs_line(self, symbols):
        return self.rhs_line(None, symbols, None, None)

    def rust_expr(self, expr):
        assert isinstance(expr, grammar.CallMethod)
        return expr

    def empty(self):
        return []

    def single(self, x):
        return [x]

    def append(self, x, y):
        return x + [y]

    def concat(self, x, y):
        return x + y

    def blank_line(self):
        return []

    def nt_def_to_list(self, nt_def):
        return [nt_def]

    def to_production(self, lhs, i, rhs, is_sole_production):
        """Wrap a list of grammar symbols `rhs` in a Production object."""
        body, reducer, condition = rhs
        if reducer is None:
            reducer = self.default_reducer(lhs, i, body, is_sole_production)
        return grammar.Production(body, reducer, condition=condition)

    def default_reducer(self, lhs, i, body, is_sole_production):
        assert isinstance(lhs, grammar.Nt)
        nt_name = lhs.name

        nargs = sum(1 for e in body if grammar.is_concrete_element(e))
        if is_sole_production:
            method_name = nt_name
        else:
            method_name = '{} {}'.format(nt_name, i)
        return self.expr_call(method_name, tuple(range(nargs)), None)
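
    # For illustration (a hedged, hypothetical example): a nonterminal
    # ReturnStatement with several productions, the first having body
    # ['return', ';'], would get the autogenerated reducer
    # expr_call('ReturnStatement 0', (0, 1), None): both body elements are
    # concrete, while a lookahead restriction would not add an argument.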

    def needs_asi(self, lhs, p):
        """True if p is a production in which ASI can happen."""
        # The purpose of the fake ForLexicalDeclaration production is to have
        # a copy of LexicalDeclaration that does not trigger ASI.

        # Two productions have body == [";"] -- one for EmptyStatement and one
        # for ClassMember. Neither should trigger ASI.

        # The only other semicolons that should not trigger ASI are the ones
        # in `for` statement productions, which happen to be exactly those
        # semicolons that are not at the end of a production.
        return (not (isinstance(lhs, grammar.Nt)
                     and lhs.name == 'ForLexicalDeclaration')
                and len(p.body) > 1
                and p.body[-1] == ';')

    def apply_asi(self, p, reducer_was_autogenerated):
        """Return two rules based on p, so that ASI can be applied."""
        assert isinstance(p.reducer, grammar.CallMethod)

        if reducer_was_autogenerated:
            # Don't pass the semicolon to the method.
            reducer = self.expr_call(p.reducer.method,
                                     p.reducer.args[:-1],
                                     None)
        else:
            reducer = p.reducer

        # Except for do-while loops, check at runtime that ASI occurs only at
        # the end of a line.
        if (len(p.body) == 7
                and p.body[0] == 'do'
                and p.body[2] == 'while'
                and p.body[3] == '('
                and p.body[5] == ')'
                and p.body[6] == ';'):
            code = "do_while_asi"
        else:
            code = "asi"

        return [
            # The preferred production, with the semicolon in.
            p.copy_with(body=p.body[:],
                        reducer=reducer),
            # The fallback production, performing ASI.
            p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],
                        reducer=reducer),
        ]
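
    # For illustration (a hedged sketch): for a production like
    #     ExpressionStatement : Expression `;`
    # this yields two productions -- the preferred one still ending in ';'
    # and a fallback whose ';' is replaced by grammar.ErrorSymbol("asi"),
    # letting the parser recover when a semicolon is omitted at a line break.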

    def expand_lexical_rhs(self, rhs):
        body, reducer, condition = rhs
        out = []
        for e in body:
            if isinstance(e, str):
                # The terminal symbols of the lexical grammar are characters,
                # so add each character of this string as a separate element.
                out += [grammar.Literal(ch) for ch in e]
            else:
                out.append(e)
        return [out, reducer, condition]
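
    # For illustration (hedged): a lexical right-hand side containing the
    # string '!==' expands to three one-character terminals,
    # [grammar.Literal('!'), grammar.Literal('='), grammar.Literal('=')].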

    def nt_def(self, nt_type, lhs, eq, rhs_list):
        has_sole_production = (len(rhs_list) == 1)
        production_list = []
        for i, rhs in enumerate(rhs_list):
            if eq == ':':
                # Syntactic grammar. A hack is needed for ASI.
                reducer_was_autogenerated = rhs[1] is None
                p = self.to_production(lhs, i, rhs, has_sole_production)
                if self.needs_asi(lhs, p):
                    production_list += self.apply_asi(p, reducer_was_autogenerated)
                else:
                    production_list.append(p)
            elif eq == '::':
                # Lexical grammar. A hack is needed to split multicharacter
                # terminals like `!==` into sequences of character terminals.
                rhs = self.expand_lexical_rhs(rhs)
                p = self.to_production(lhs, i, rhs, has_sole_production)
                production_list.append(p)
        return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))
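
    # For illustration (hedged): nt_def returns a triple such as
    # ('StatementList', ':', grammar.NtDef(args, production_list, None));
    # the ':' / '::' tag records whether the definition belongs to the
    # syntactic or the lexical grammar, which finish_grammar uses below.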

    def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
        return self.nt_def(nt_type, nt_lhs, eq, [([t], None, None) for t in terminals])

    def nt_lhs_no_params(self, name):
        return grammar.Nt(name, ())

    def nt_lhs_with_params(self, name, params):
        return grammar.Nt(name, tuple(params))

    def simple_type(self, name):
        return types.Type(name)

    def lifetime_type(self, name):
        return types.Lifetime(name)

    def parameterized_type(self, name, args):
        return types.Type(name, tuple(args))

    def t_list_line(self, terminals):
        return terminals

    def terminal(self, t):
        assert t[0] == "`"
        assert t[-1] == "`"
        return t[1:-1]

    def terminal_chr(self, chr):
        raise ValueError("FAILED: %r" % chr)

    def rhs_line(self, ifdef, rhs, reducer, _prodid):
        return (rhs, reducer, ifdef)

    def rhs_line_prose(self, prose):
        return ([prose], None, None)

    def empty_rhs(self):
        return []

    def expr_match_ref(self, token):
        assert token.startswith('$')
        return int(token[1:])

    def expr_call(self, method, args, fallible):
        # NOTE: Currently, "AstBuilder" functions are made fallible using the
        # fallible_methods list taken from some Rust code which extracts this
        # information to produce a JSON file.
        if self.method_trait == "AstBuilder":
            fallible = None
        return grammar.CallMethod(method, args or (), types.Type(self.method_trait),
                                  fallible is not None)
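
    # For illustration (hedged): with the default method_trait "AstBuilder",
    # expr_call('Statement', (0,), None) yields
    # grammar.CallMethod('Statement', (0,), types.Type('AstBuilder'), False);
    # fallibility for AstBuilder methods is patched in separately, as the
    # NOTE above explains.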

    def expr_some(self, expr):
        return grammar.Some(expr)

    def expr_none(self):
        return None

    def ifdef(self, value, nt):
        return nt, value

    def optional(self, nt):
        return grammar.Optional(nt)

    def but_not(self, nt, exclusion):
        _, exclusion = exclusion
        return grammar.Exclude(nt, [exclusion])
        # return ('-', nt, exclusion)

    def but_not_one_of(self, nt, exclusion_list):
        exclusion_list = [exclusion for _, exclusion in exclusion_list]
        return grammar.Exclude(nt, exclusion_list)
        # return ('-', nt, exclusion_list)

    def no_line_terminator_here(self, lt):
        if lt not in ('LineTerminator', '|LineTerminator|'):
            raise ValueError("unrecognized directive " + repr("[no " + lt + " here]"))
        return grammar.NoLineTerminatorHere

    def nonterminal(self, name):
        if name in self.terminal_names:
            return name
        return grammar.Nt(name, ())

    def nonterminal_apply(self, name, args):
        if name in self.terminal_names:
            raise ValueError("parameters applied to terminal {!r}".format(name))
        if len(set(k for k, expr in args)) != len(args):
            raise ValueError("parameter passed multiple times")
        return grammar.Nt(name, tuple(args))

    def arg_expr(self, sigil, argname):
        if sigil == '?':
            return (argname, grammar.Var(argname))
        else:
            return (argname, sigil)
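
    # For illustration (hedged): in an application like
    # Statement[?Yield, +Await], '?Yield' becomes
    # ('Yield', grammar.Var('Yield')) -- forwarding the current value --
    # while '+Await' becomes ('Await', True) and '~In' would become
    # ('In', False).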

    def sigil_false(self):
        return False

    def sigil_true(self):
        return True

    def exclusion_terminal(self, t):
        return ("t", t)

    def exclusion_nonterminal(self, nt):
        return ("nt", nt)

    def exclusion_chr_range(self, c1, c2):
        return ("range", c1, c2)

    def la_eq(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), True)

    def la_ne(self, t):
        return grammar.LookaheadRule(OrderedFrozenSet([t]), False)

    def la_not_in_nonterminal(self, nt):
        return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)

    def la_not_in_set(self, lookahead_exclusions):
        if all(len(excl) == 1 for excl in lookahead_exclusions):
            return grammar.LookaheadRule(
                OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
                False)
        raise ValueError("unsupported: lookahead > 1 token, {!r}"
                         .format(lookahead_exclusions))
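
    # For illustration (hedged; the surface syntax shown is an assumption):
    # a one-token restriction like
    #     [lookahead <! {`{`, `function`}]
    # would reach la_not_in_set as [['{'], ['function']] and produce
    # grammar.LookaheadRule(OrderedFrozenSet(['{', 'function']), False).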

    def chr(self, t):
        assert t[0] == "<" or t[0] == 'U'
        if t[0] == "<":
            assert t[-1] == ">"
            if t not in ECMASCRIPT_CODE_POINTS:
                raise ValueError("unrecognized character abbreviation {!r}".format(t))
            return ECMASCRIPT_CODE_POINTS[t]
        else:
            assert t[1] == "+"
            return grammar.Literal(chr(int(t[2:], base=16)))
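
    # For illustration (hedged): chr('<TAB>') returns grammar.Literal('\t')
    # via the table above, and chr('U+0041') returns grammar.Literal('A')
    # by decoding the four hex digits.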


def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
                   single_grammar=True, extensions=[]):
    nt_grammars = {}
    for nt_name, eq, _ in nt_defs:
        if nt_name in nt_grammars:
            raise ValueError(
                "duplicate definitions for nonterminal {!r}"
                .format(nt_name))
        nt_grammars[nt_name] = eq

    # Figure out which grammar we were trying to get (":" for syntactic,
    # "::" for lexical) based on the goal symbols.
    goals = list(goals)
    if len(goals) == 0:
        raise ValueError("no goal nonterminals specified")
    if single_grammar:
        selected_grammars = set(nt_grammars[goal] for goal in goals)
        assert len(selected_grammars) != 0
        if len(selected_grammars) > 1:
            raise ValueError(
                "all goal nonterminals must be part of the same grammar; "
                "got {!r} (matching these grammars: {!r})"
                .format(set(goals), set(selected_grammars)))
        [selected_grammar] = selected_grammars

    terminal_set = set()

    def hack_production(p):
        for i, e in enumerate(p.body):
            if isinstance(e, str) and e[:1] == "`":
                if len(e) < 3 or e[-1:] != "`":
                    raise ValueError(
                        "Unrecognized grammar symbol: {!r} (in {!r})"
                        .format(e, p))
                p.body[i] = token = e[1:-1]
                terminal_set.add(token)
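
    # For illustration (hedged): hack_production rewrites a body such as
    # ["`typeof`", nt] in place to ["typeof", nt], recording "typeof" in
    # terminal_set so it can be checked against nonterminal names below.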

    nonterminals = {}
    for nt_name, eq, rhs_list_or_lambda in nt_defs:
        if single_grammar and eq != selected_grammar:
            continue

        if isinstance(rhs_list_or_lambda, grammar.NtDef):
            nonterminals[nt_name] = rhs_list_or_lambda
        else:
            rhs_list = rhs_list_or_lambda
            for p in rhs_list:
                if not isinstance(p, grammar.Production):
                    raise ValueError(
                        "invalid grammar: ifdef in non-function-call context")
                hack_production(p)
            if nt_name in nonterminals:
                raise ValueError(
                    "unsupported: multiple definitions for nt " + nt_name)
            nonterminals[nt_name] = rhs_list

    for t in terminal_set:
        if t in nonterminals:
            raise ValueError(
                "grammar contains both a terminal `{}` and nonterminal {}"
                .format(t, t))

    # Add execution modes used to generate the various functions needed for
    # the syntax-parsing and full-parsing execution modes.
    exec_modes = collections.defaultdict(OrderedSet)
    noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType))
    token_parser = types.Type("ParserTrait", (
        types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),))))
    ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))

    # Full parsing takes tokens as input and builds an AST.
    exec_modes["full_actions"].extend([token_parser, ast_builder])

    # Syntax parsing takes tokens as input but skips building the AST.
    # TODO: The syntax parser is commented out for now, as we need something
    # to produce when we cannot call the AstBuilder to produce the values.

    # No-op parsing is used for the simulator, which is so far used for
    # querying whether we can end the incremental input, and for looking up
    # whether a state can accept some kinds of tokens.
    exec_modes["noop_actions"].add(noop_parser)

    # Extensions use an equivalent of Rust types to define which kinds of
    # parsers they apply to; this map converts those type names to the
    # various execution modes.
    full_parser = types.Type("FullParser")
    syntax_parser = types.Type("SyntaxParser")
    noop_parser = types.Type("NoopParser")
    type_to_modes = {
        noop_parser: ["noop_actions", "full_actions"],
        syntax_parser: ["full_actions"],
        full_parser: ["full_actions"],
    }

    result = grammar.Grammar(
        nonterminals,
        goal_nts=goals,
        variable_terminals=variable_terminals,
        synthetic_terminals=synthetic_terminals,
        exec_modes=exec_modes,
        type_to_modes=type_to_modes)
    result.patch(extensions)
    return result


def parse_esgrammar(
        text: str,
        *,
        filename: Optional[str] = None,
        extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
        goals: Optional[Iterable[str]] = None,
        terminal_names: Iterable[str] = (),
        synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
        single_grammar: bool = True
) -> grammar.Grammar:
    if not text.endswith("\n\n"):
        # Horrible hack: add a blank line at the end of the document so that
        # the esgrammar grammar can use newlines as delimiters. :-P
        text += "\n"

    terminal_names = frozenset(terminal_names)
    if synthetic_terminals is None:
        synthetic_terminals = {}

    builder = ESGrammarBuilder(terminal_names)
    parser = ESGrammarParser(builder=builder, goal="grammar")
    lexer = ESGrammarLexer(parser, filename=filename)
    lexer.write(text)
    nt_defs = lexer.close()
    grammar_extensions = []
    for ext_filename, start_lineno, content in extensions:
        builder.reset()
        parser = ESGrammarParser(builder=builder, goal="rust_edsl")
        lexer = ESGrammarLexer(parser, filename=ext_filename)
        builder.lexer = lexer
        lexer.start_lineno = start_lineno
        lexer.write(content)
        result = lexer.close()
        grammar_extensions.append(result)

    if goals is None:
        # Default to the first nonterminal in the input.
        goals = [nt_defs[0][0]]

    return finish_grammar(
        nt_defs,
        goals=goals,
        variable_terminals=terminal_names - frozenset(synthetic_terminals),
        synthetic_terminals=synthetic_terminals,
        single_grammar=single_grammar,
        extensions=grammar_extensions)
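

# A minimal usage sketch (hypothetical input, not executed; the exact surface
# syntax accepted is defined by esgrammar.pgen, so treat this as an
# assumption rather than a guaranteed-valid grammar):
#
#     src = ("Goal :\n"
#            "  `a` Goal\n"
#            "  `b`\n"
#            "\n")
#     g = parse_esgrammar(src, goals=["Goal"])
#     # g is a grammar.Grammar whose goal nonterminal is "Goal" and whose
#     # terminals include "a" and "b".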