1 """Parse a grammar written in ECMArkup."""
3 from __future__
import annotations
4 # mypy: no-implicit-optional
8 from typing
import Dict
, Iterable
, Optional
, Tuple
10 from jsparagus
import parse_pgen
, gen
, grammar
, extension
, types
11 from jsparagus
.lexer
import LexicalGrammar
12 from jsparagus
.ordered
import OrderedSet
, OrderedFrozenSet
# Lexical grammar for the ESGrammar notation itself: a string of operator
# and keyword tokens, plus named regex-defined token kinds.
# NOTE(review): this excerpt appears truncated -- several token definitions
# (e.g. after the "colons" and "backticks" comments) and the closing
# parenthesis of the LexicalGrammar(...) call are not visible here; confirm
# against the full file.
ESGrammarLexer = LexicalGrammar(
    # the operators and keywords:
    "[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; "
    "but empty here lookahead no not of one or returns through Some None impl for let",

    # any number of colons together

    # terminals of the ES grammar, quoted with backticks

    # also terminals, denoting control characters
    # NOTE(review): the `A-f` range also matches '[', '\\', ']', '^', '_'
    # and backquote -- presumably the hex-digit set `A-Fa-f` was intended;
    # confirm before changing.
    CHR=r'<[A-Z ]+>|U\+[0-9A-f]{4}',

    # nonterminals/types that will be followed by parameters
    NTCALL=r'[A-Za-z]\w*(?=[\[<])',

    # nonterminals (also, boolean parameters and type names)

    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives a few productions names
    PRODID=r'#[A-Za-z]\w*',

    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # the spec also gives a few productions names
    RUSTCOMMENT=r'//.*\n',
# Parser for the ESGrammar notation, built by compiling the pgen grammar
# description ("esgrammar.pgen") that ships in the same directory as this
# module.
ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(
            os.path.dirname(__file__),
            "esgrammar.pgen",
        )
    )
)
# Abbreviations for single-character terminals, used in the lexical grammar.
# Maps the spec's angle-bracket abbreviations to concrete grammar elements.
ECMASCRIPT_CODE_POINTS = {
    # From <https://tc39.es/ecma262/#table-31>
    '<ZWNJ>': grammar.Literal('\u200c'),
    '<ZWJ>': grammar.Literal('\u200d'),
    '<ZWNBSP>': grammar.Literal('\ufeff'),

    # From <https://tc39.es/ecma262/#table-32>
    '<TAB>': grammar.Literal('\t'),
    '<VT>': grammar.Literal('\u000b'),
    '<FF>': grammar.Literal('\u000c'),
    '<SP>': grammar.Literal(' '),
    '<NBSP>': grammar.Literal('\u00a0'),
    # <ZWNBSP> already defined above
    '<USP>': grammar.UnicodeCategory('Zs'),

    # From <https://tc39.es/ecma262/#table-33>
    '<LF>': grammar.Literal('\u000a'),
    '<CR>': grammar.Literal('\u000d'),
    '<LS>': grammar.Literal('\u2028'),
    # Bug fix: <PS> is U+2029 PARAGRAPH SEPARATOR, not U+2028 (which is
    # <LS>, LINE SEPARATOR). The previous value duplicated <LS>.
    '<PS>': grammar.Literal('\u2029'),
}
class ESGrammarBuilder:
    """Builds grammar objects from parsed ESGrammar input.

    NOTE(review): presumably an instance is handed to ESGrammarParser as
    its `builder`, each method being a semantic action of the generated
    parser (inferred from `ESGrammarParser(builder=builder, ...)` usage
    later in this file) -- confirm. Several method bodies in this excerpt
    appear to be elided; they are marked with NOTE(review) below.
    """

    def __init__(self, terminal_names):
        # Names of terminals that are written as nonterminals in the grammar.
        # For example, "BooleanLiteral" is a terminal name when parsing the
        if terminal_names is None:
            terminal_names = frozenset()
        # Frozen so membership tests are cheap and the set can't be mutated.
        self.terminal_names = frozenset(terminal_names)

        # This is how full-parsing and lazy-parsing are implemented, using

        # This field contains the Rust's trait used for calling the method.
        # When a CallMethod is generated, it is assumed to be a function of
        # this trait. The trait is used by the Rust backend to generate
        # multiple backends which are implementing different set of traits.
        # Having the trait on the function call is useful as a way to filter
        # functions calls at code-generation time.
        # This field is updated by the `rust_param_impl`, which is used in
        # grammar extensions, and visited before producing any CallMethod.
        self.method_trait = "AstBuilder"

    def rust_edsl(self, impl, grammar):
        # Wrap a Rust-EDSL extension with the filename of the lexer that is
        # currently feeding this builder.
        return extension.GrammarExtension(impl, grammar, self.lexer.filename)

    def rust_param_impl(self, trait, for_type, param):
        # Record the trait so that later CallMethods are attributed to it.
        self.method_trait = trait
        return extension.ImplFor(param, trait, for_type)

    def rust_impl(self, trait, impl_type):
        # An `impl Trait for Type` with no generic parameters.
        return self.rust_param_impl(trait, impl_type, [])

    def rust_nt_def(self, lhs, rhs_line):
        # Right now, only focus on the syntactic grammar, and assume that all
        # rules are patching existing grammar production by adding code.
        return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))

    def rust_rhs_line(self, symbols):
        # A Rust-EDSL right-hand side has no ifdef, reducer, or prodid.
        return self.rhs_line(None, symbols, None, None)

    def rust_expr(self, expr):
        # NOTE(review): only this assertion is visible in the excerpt; a
        # `return expr` (or similar) appears to be elided.
        assert isinstance(expr, grammar.CallMethod)

    def append(self, x, y):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def concat(self, x, y):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def blank_line(self):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def nt_def_to_list(self, nt_def):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def to_production(self, lhs, i, rhs, is_sole_production):
        """Wrap a list of grammar symbols `rhs` in a Production object."""
        body, reducer, condition = rhs
        # NOTE(review): a guard such as `if reducer is None:` appears to be
        # elided here; as written this would override an explicit reducer.
        reducer = self.default_reducer(lhs, i, body, is_sole_production)
        return grammar.Production(body, reducer, condition=condition)

    def default_reducer(self, lhs, i, body, is_sole_production):
        # Default reducer: a method call named after the nonterminal, with
        # one argument per concrete element of `body`.
        assert isinstance(lhs, grammar.Nt)
        # NOTE(review): an assignment such as `nt_name = lhs.name` appears
        # to be elided before this point.
        nargs = sum(1 for e in body if grammar.is_concrete_element(e))
        if is_sole_production:
            method_name = nt_name
            # NOTE(review): an `else:` appears to be elided before the next
            # line (sole production uses the bare name; others append `i`).
            method_name = '{} {}'.format(nt_name, i)
        return self.expr_call(method_name, tuple(range(nargs)), None)

    def needs_asi(self, lhs, p):
        """True if p is a production in which ASI can happen."""
        # The purpose of the fake ForLexicalDeclaration production is to have a
        # copy of LexicalDeclaration that does not trigger ASI.

        # Two productions have body == [";"] -- one for EmptyStatement and one
        # for ClassMember. Neither should trigger ASI.

        # The only other semicolons that should not trigger ASI are the ones in
        # `for` statement productions, which happen to be exactly those
        # semicolons that are not at the end of a production.
        return (not (isinstance(lhs, grammar.Nt)
                     and lhs.name == 'ForLexicalDeclaration')
                and p.body[-1] == ';')

    def apply_asi(self, p, reducer_was_autogenerated):
        """Return two rules based on p, so that ASI can be applied."""
        # NOTE(review): large parts of this method are elided from this
        # excerpt; the visible fragments are reproduced as-is below.
        assert isinstance(p.reducer, grammar.CallMethod)

        if reducer_was_autogenerated:
            # Don't pass the semicolon to the method.
            reducer = self.expr_call(p.reducer.method,

        # Except for do-while loops, check at runtime that ASI occurs only at
                and p.body[0] == 'do'
                and p.body[2] == 'while'
                and p.body[6] == ';'):
            code = "do_while_asi"

            # The preferred production, with the semicolon in.
            p.copy_with(body=p.body[:],
            # The fallback production, performing ASI.
            p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],

    def expand_lexical_rhs(self, rhs):
        # Expand multi-character terminals of the lexical grammar into
        # sequences of single-character Literal elements.
        body, reducer, condition = rhs
        # NOTE(review): the initialization of `out` and the loop binding `e`
        # (presumably `for e in body:`) are elided from this excerpt.
        if isinstance(e, str):
            # The terminal symbols of the lexical grammar are characters, so
            # add each character of this string as a separate element.
            out += [grammar.Literal(ch) for ch in e]
        return [out, reducer, condition]

    def nt_def(self, nt_type, lhs, eq, rhs_list):
        # Build one nonterminal definition tuple: (name, eq, NtDef).
        # NOTE(review): the initialization of `production_list` and the
        # branch selecting syntactic (':') vs. lexical ('::') handling
        # appear to be elided from this excerpt.
        has_sole_production = (len(rhs_list) == 1)
        for i, rhs in enumerate(rhs_list):
            # Syntactic grammar. A hack is needed for ASI.
            reducer_was_autogenerated = rhs[1] is None
            p = self.to_production(lhs, i, rhs, has_sole_production)
            if self.needs_asi(lhs, p):
                production_list += self.apply_asi(p, reducer_was_autogenerated)
            production_list.append(p)

            # Lexical grammar. A hack is needed to replace multicharacter
            # terminals like `!==` into sequences of character terminals.
            rhs = self.expand_lexical_rhs(rhs)
            p = self.to_production(lhs, i, rhs, has_sole_production)
            production_list.append(p)
        return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))

    def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
        # "one of" shorthand: each terminal becomes its own one-symbol
        # right-hand side with no reducer and no condition.
        return self.nt_def(nt_type, nt_lhs, eq,
                           [([t], None, None) for t in terminals])

    def nt_lhs_no_params(self, name):
        # Left-hand side without parameters.
        return grammar.Nt(name, ())

    def nt_lhs_with_params(self, name, params):
        # Left-hand side with a parameter list (e.g. Expression[In, Yield]).
        return grammar.Nt(name, tuple(params))

    def simple_type(self, name):
        # A plain named type.
        return types.Type(name)

    def lifetime_type(self, name):
        # A Rust lifetime (e.g. 'alloc).
        return types.Lifetime(name)

    def parameterized_type(self, name, args):
        # A generic type with arguments.
        return types.Type(name, tuple(args))

    def t_list_line(self, terminals):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def terminal(self, t):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def terminal_chr(self, chr):
        # Reject raw control-character terminals in this position.
        raise ValueError("FAILED: %r" % chr)

    def rhs_line(self, ifdef, rhs, reducer, _prodid):
        # Normalize one right-hand-side line to the (body, reducer, ifdef)
        # triple consumed by to_production / nt_def.
        return (rhs, reducer, ifdef)

    def rhs_line_prose(self, prose):
        # A prose-only right-hand side: a single prose symbol, no reducer,
        # no condition.
        return ([prose], None, None)

    def expr_match_ref(self, token):
        # "$n" refers to the n-th matched element.
        assert token.startswith('$')
        return int(token[1:])

    def expr_call(self, method, args, fallible):
        # NOTE: Currently "AstBuilder" functions are made fallible using the
        # fallible_methods taken from some Rust code which extract this
        # information to produce a JSON file.
        if self.method_trait == "AstBuilder":
            # NOTE(review): the body of this branch is elided from this
            # excerpt.
        return grammar.CallMethod(method, args or (), types.Type(self.method_trait),
                                  fallible is not None)

    def expr_some(self, expr):
        # Wrap in Some(...) for the Rust Option type.
        return grammar.Some(expr)

    def ifdef(self, value, nt):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def optional(self, nt):
        # `nt?` -- an optional occurrence of a symbol.
        return grammar.Optional(nt)

    def but_not(self, nt, exclusion):
        # `nt but not X` -- exclude a single alternative.
        _, exclusion = exclusion
        return grammar.Exclude(nt, [exclusion])
        # return ('-', nt, exclusion)

    def but_not_one_of(self, nt, exclusion_list):
        # `nt but not one of X Y Z` -- exclude several alternatives.
        exclusion_list = [exclusion for _, exclusion in exclusion_list]
        return grammar.Exclude(nt, exclusion_list)
        # return ('-', nt, exclusion_list)

    def no_line_terminator_here(self, lt):
        # Only the LineTerminator nonterminal (bare or bar-wrapped) is
        # meaningful inside a "[no ... here]" directive.
        if lt not in ('LineTerminator', '|LineTerminator|'):
            raise ValueError("unrecognized directive " + repr("[no " + lt + " here]"))
        return grammar.NoLineTerminatorHere

    def nonterminal(self, name):
        # Names declared up-front as terminals get special treatment.
        if name in self.terminal_names:
            # NOTE(review): branch body elided from this excerpt
            # (presumably returns the name itself as a terminal) -- confirm.
        return grammar.Nt(name, ())

    def nonterminal_apply(self, name, args):
        # Nonterminal with applied parameters, e.g. Expression[+In].
        if name in self.terminal_names:
            raise ValueError("parameters applied to terminal {!r}".format(name))
        # Each parameter may be passed at most once.
        if len(set(k for k, expr in args)) != len(args):
            raise ValueError("parameter passed multiple times")
        return grammar.Nt(name, tuple(args))

    def arg_expr(self, sigil, argname):
        # NOTE(review): the condition selecting between these two returns
        # (presumably a test on `sigil`, e.g. '?') is elided from this
        # excerpt.
            return (argname, grammar.Var(argname))
        return (argname, sigil)

    def sigil_false(self):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def sigil_true(self):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def exclusion_terminal(self, t):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def exclusion_nonterminal(self, nt):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def exclusion_chr_range(self, c1, c2):
        # A character-range exclusion, kept as a tagged tuple.
        return ("range", c1, c2)

    # NOTE(review): the `def` headers for the next two lookahead actions
    # (positive and negative single-token lookahead, presumably `==` and
    # `!=` forms) are elided from this excerpt; only their return
    # statements are visible.
        return grammar.LookaheadRule(OrderedFrozenSet([t]), True)

        return grammar.LookaheadRule(OrderedFrozenSet([t]), False)

    def la_not_in_nonterminal(self, nt):
        # `[lookahead not in Nt]` -- negative lookahead on a nonterminal.
        return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)

    def la_not_in_set(self, lookahead_exclusions):
        # Negative lookahead over a set; only single-token exclusions are
        # supported.
        if all(len(excl) == 1 for excl in lookahead_exclusions):
            # NOTE(review): the second argument of this LookaheadRule call
            # (presumably False) is elided from this excerpt.
            return grammar.LookaheadRule(
                OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
        raise ValueError("unsupported: lookahead > 1 token, {!r}"
                         .format(lookahead_exclusions))

    # NOTE(review): the `def` header of the character-abbreviation action
    # (handling CHR tokens like "<TAB>" and "U+0009") and some of its
    # lines are elided from this excerpt; the visible fragments follow.
        assert t[0] == "<" or t[0] == 'U'

        # Angle-bracket abbreviations must be in the lookup table.
        if t not in ECMASCRIPT_CODE_POINTS:
            raise ValueError("unrecognized character abbreviation {!r}".format(t))
        return ECMASCRIPT_CODE_POINTS[t]

        # "U+XXXX" form: decode the hex code point.
        return grammar.Literal(chr(int(t[2:], base=16)))
def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
                   single_grammar=True, extensions=[]):
    """Assemble parsed nonterminal definitions into a grammar.Grammar.

    NOTE(review): many lines of this function appear to be elided from
    this excerpt; visible fragments are reproduced as-is and annotated.
    NOTE(review): `extensions=[]` is a mutable default argument -- safe
    only if the list is never mutated; consider a tuple default.
    """
    # Map each nonterminal name to the eq marker it was defined with.
    # NOTE(review): the initialization of `nt_grammars` is elided.
    for nt_name, eq, _ in nt_defs:
        if nt_name in nt_grammars:
            # NOTE(review): the raise/format wrapping this message is elided.
            "duplicate definitions for nonterminal {!r}"
        nt_grammars[nt_name] = eq

    # Figure out which grammar we were trying to get (":" for syntactic,
    # "::" for lexical) based on the goal symbols.
        raise ValueError("no goal nonterminals specified")
    selected_grammars = set(nt_grammars[goal] for goal in goals)
    assert len(selected_grammars) != 0
    if len(selected_grammars) > 1:
        # NOTE(review): the raise/format wrapping these messages is elided.
        "all goal nonterminals must be part of the same grammar; "
        "got {!r} (matching these grammars: {!r})"
        .format(set(goals), set(selected_grammars)))
    [selected_grammar] = selected_grammars

    def hack_production(p):
        # Convert backtick-quoted symbols (`foo`) in a production body into
        # plain terminal strings, recording each in `terminal_set`.
        for i, e in enumerate(p.body):
            if isinstance(e, str) and e[:1] == "`":
                if len(e) < 3 or e[-1:] != "`":
                    # NOTE(review): the raise wrapping this message is elided.
                    "Unrecognized grammar symbol: {!r} (in {!r})"
                p[i] = token = e[1:-1]
                terminal_set.add(token)

    for nt_name, eq, rhs_list_or_lambda in nt_defs:
        # Skip definitions belonging to the other grammar when only one
        # grammar is requested.
        if single_grammar and eq != selected_grammar:
            # NOTE(review): branch body (presumably `continue`) is elided.
        if isinstance(rhs_list_or_lambda, grammar.NtDef):
            nonterminals[nt_name] = rhs_list_or_lambda
            # NOTE(review): an `else:` and surrounding lines appear elided.
            rhs_list = rhs_list_or_lambda
            if not isinstance(p, grammar.Production):
                # NOTE(review): the raise wrapping this message is elided.
                "invalid grammar: ifdef in non-function-call context")
            if nt_name in nonterminals:
                # NOTE(review): the raise wrapping this message is elided.
                "unsupported: multiple definitions for nt " + nt_name)
            nonterminals[nt_name] = rhs_list

    for t in terminal_set:
        if t in nonterminals:
            # NOTE(review): the raise/format wrapping this message is elided.
            "grammar contains both a terminal `{}` and nonterminal {}"

    # Add execution modes to generate the various functions needed to handle
    # syntax parsing and full parsing execution modes.
    exec_modes = collections.defaultdict(OrderedSet)
    noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType))
    token_parser = types.Type("ParserTrait", (
        types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),))))
    ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))

    # Full parsing takes token as input and build an AST.
    exec_modes["full_actions"].extend([token_parser, ast_builder])

    # Syntax parsing takes token as input but skip building the AST.
    # TODO: The syntax parser is commented out for now, as we need something to
    # be produced when we cannot call the AstBuilder for producing the values.

    # No-op parsing is used for the simulator, which is so far used for
    # querying whether we can end the incremental input and lookup if a state
    # can accept some kind of tokens.
    exec_modes["noop_actions"].add(noop_parser)

    # Extensions are using an equivalent of Rust types to define the kind of
    # parsers to be used, this map is used to convert these type names to the
    # various execution modes.
    full_parser = types.Type("FullParser")
    syntax_parser = types.Type("SyntaxParser")
    noop_parser = types.Type("NoopParser")
    # NOTE(review): the `type_to_modes = {` opener is elided from this
    # excerpt.
        noop_parser: ["noop_actions", "full_actions"],
        syntax_parser: ["full_actions"],
        full_parser: ["full_actions"],

    result = grammar.Grammar(
        # NOTE(review): the leading positional arguments of this call
        # (presumably nonterminals, goals, ...) are elided.
        variable_terminals=variable_terminals,
        synthetic_terminals=synthetic_terminals,
        exec_modes=exec_modes,
        type_to_modes=type_to_modes)
    # Apply grammar extensions in place before returning.
    result.patch(extensions)
    # NOTE(review): a final `return result` appears to be elided from this
    # excerpt.
# NOTE(review): the `def parse_esgrammar(text, *,` header of this function
# appears to be elided from this excerpt; the visible signature tail and
# body fragments follow.
        filename: Optional[str] = None,
        extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
        goals: Optional[Iterable[str]] = None,
        terminal_names: Iterable[str] = (),
        synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
        single_grammar: bool = True
) -> grammar.Grammar:
    if not text.endswith("\n\n"):
        # Horrible hack: add a blank line at the end of the document so that
        # the esgrammar grammar can use newlines as delimiters. :-P
        # NOTE(review): the statement appending the newline to `text` is
        # elided from this excerpt.
    terminal_names = frozenset(terminal_names)
    if synthetic_terminals is None:
        synthetic_terminals = {}

    # Parse the main grammar text with the "grammar" goal.
    builder = ESGrammarBuilder(terminal_names)
    parser = ESGrammarParser(builder=builder, goal="grammar")
    lexer = ESGrammarLexer(parser, filename=filename)
    # NOTE(review): the call feeding `text` into the lexer (presumably
    # `lexer.write(text)`) is elided from this excerpt.
    nt_defs = lexer.close()

    # Parse each Rust-EDSL extension with the same builder, using the
    # "rust_edsl" goal.
    grammar_extensions = []
    for ext_filename, start_lineno, content in extensions:
        parser = ESGrammarParser(builder=builder, goal="rust_edsl")
        lexer = ESGrammarLexer(parser, filename=ext_filename)
        # The builder needs the lexer to report the extension's filename.
        builder.lexer = lexer
        lexer.start_lineno = start_lineno
        # NOTE(review): the call feeding `content` into the lexer is elided
        # from this excerpt.
        result = lexer.close()
        grammar_extensions.append(result)

    # Default to the first nonterminal in the input.
    # NOTE(review): the guard (presumably `if goals is None:`) is elided
    # from this excerpt.
    goals = [nt_defs[0][0]]

    return finish_grammar(
        # NOTE(review): the leading positional/keyword arguments of this
        # call (presumably nt_defs, goals=goals) are elided.
        variable_terminals=terminal_names - frozenset(synthetic_terminals),
        synthetic_terminals=synthetic_terminals,
        single_grammar=single_grammar,
        extensions=grammar_extensions)