1 """Parse a grammar written in ECMArkup."""
3 from __future__
import annotations
4 # mypy: no-implicit-optional
8 from typing
import Dict
, Iterable
, Optional
, Tuple
10 from jsparagus
import parse_pgen
, gen
, grammar
, extension
, types
11 from jsparagus
.lexer
import LexicalGrammar
12 from jsparagus
.ordered
import OrderedSet
, OrderedFrozenSet
# Lexical grammar for the ESGrammar notation itself: a string of operator
# and keyword tokens, plus named regex-defined token kinds.
# NOTE(review): this excerpt appears truncated -- several token definitions
# (e.g. after the "colons" and "backticks" comments) and the closing
# parenthesis of the LexicalGrammar(...) call are not visible here; confirm
# against the full file.
ESGrammarLexer = LexicalGrammar(
    # the operators and keywords:
    "[ ] { } , ~ + ? <! = == != => ( ) @ < > ' ; "
    "but empty here lookahead no not of one or returns through Some None impl for let",

    # any number of colons together

    # terminals of the ES grammar, quoted with backticks

    # also terminals, denoting control characters
    # NOTE(review): the `A-f` range also matches '[', '\\', ']', '^', '_'
    # and backquote -- presumably the hex-digit set `A-Fa-f` was intended;
    # confirm before changing.
    CHR=r'<[A-Z ]+>|U\+[0-9A-f]{4}',

    # nonterminals/types that will be followed by parameters
    NTCALL=r'[A-Za-z]\w*(?=[\[<])',

    # nonterminals (also, boolean parameters and type names)

    # nonterminals wrapped in vertical bars for no apparent reason
    NTALT=r'\|[A-Z]\w+\|',

    # the spec also gives a few productions names
    PRODID=r'#[A-Za-z]\w*',

    # prose not wrapped in square brackets
    # To avoid conflict with the `>` token, this is recognized only after a space.
    PROSE=r'(?<= )>[^\n]*',

    # prose wrapped in square brackets

    # expression denoting a matched terminal or nonterminal
    MATCH_REF=r'\$(?:0|[1-9][0-9]*)',

    # the spec also gives a few productions names
    RUSTCOMMENT=r'//.*\n',
# Parser for the ESGrammar notation, built by compiling the pgen grammar
# description ("esgrammar.pgen") that ships in the same directory as this
# module.
ESGrammarParser = gen.compile(
    parse_pgen.load_grammar(
        os.path.join(
            os.path.dirname(__file__),
            "esgrammar.pgen",
        )
    )
)
# Abbreviations for single-character terminals, used in the lexical grammar.
# Maps the spec's angle-bracket abbreviations to concrete grammar elements.
ECMASCRIPT_CODE_POINTS = {
    # From <https://tc39.es/ecma262/#table-31>
    '<ZWNJ>': grammar.Literal('\u200c'),
    '<ZWJ>': grammar.Literal('\u200d'),
    '<ZWNBSP>': grammar.Literal('\ufeff'),

    # From <https://tc39.es/ecma262/#table-32>
    '<TAB>': grammar.Literal('\t'),
    '<VT>': grammar.Literal('\u000b'),
    '<FF>': grammar.Literal('\u000c'),
    '<SP>': grammar.Literal(' '),
    '<NBSP>': grammar.Literal('\u00a0'),
    # <ZWNBSP> already defined above
    '<USP>': grammar.UnicodeCategory('Zs'),

    # From <https://tc39.es/ecma262/#table-33>
    '<LF>': grammar.Literal('\u000a'),
    '<CR>': grammar.Literal('\u000d'),
    '<LS>': grammar.Literal('\u2028'),
    # Bug fix: <PS> is U+2029 PARAGRAPH SEPARATOR, not U+2028 (which is
    # <LS>, LINE SEPARATOR). The previous value duplicated <LS>.
    '<PS>': grammar.Literal('\u2029'),
}
class ESGrammarBuilder:
    """Builds grammar objects from parsed ESGrammar input.

    NOTE(review): presumably an instance is handed to ESGrammarParser as
    its `builder`, each method being a semantic action of the generated
    parser (inferred from `ESGrammarParser(builder=builder, ...)` usage
    later in this file) -- confirm. Several method bodies in this excerpt
    appear to be elided; they are marked with NOTE(review) below.
    """

    def __init__(self, terminal_names):
        # Names of terminals that are written as nonterminals in the grammar.
        # For example, "BooleanLiteral" is a terminal name when parsing the
        if terminal_names is None:
            terminal_names = frozenset()
        # Frozen so membership tests are cheap and the set can't be mutated.
        self.terminal_names = frozenset(terminal_names)

        # This is how full-parsing and lazy-parsing are implemented, using

        # This field contains the Rust's trait used for calling the method.
        # When a CallMethod is generated, it is assumed to be a function of
        # this trait. The trait is used by the Rust backend to generate
        # multiple backends which are implementing different set of traits.
        # Having the trait on the function call is useful as a way to filter
        # functions calls at code-generation time.
        # This field is updated by the `rust_param_impl`, which is used in
        # grammar extensions, and visited before producing any CallMethod.
        self.method_trait = "AstBuilder"

    def rust_edsl(self, impl, grammar):
        # Wrap a Rust-EDSL extension with the filename of the lexer that is
        # currently feeding this builder.
        return extension.GrammarExtension(impl, grammar, self.lexer.filename)

    def rust_param_impl(self, trait, for_type, param):
        # Record the trait so that later CallMethods are attributed to it.
        self.method_trait = trait
        return extension.ImplFor(param, trait, for_type)

    def rust_impl(self, trait, impl_type):
        # An `impl Trait for Type` with no generic parameters.
        return self.rust_param_impl(trait, impl_type, [])

    def rust_nt_def(self, lhs, rhs_line):
        # Right now, only focus on the syntactic grammar, and assume that all
        # rules are patching existing grammar production by adding code.
        return extension.ExtPatch(self.nt_def(None, lhs, ':', [rhs_line]))

    def rust_rhs_line(self, symbols):
        # A Rust-EDSL right-hand side has no ifdef, reducer, or prodid.
        return self.rhs_line(None, symbols, None, None)

    def rust_expr(self, expr):
        # NOTE(review): only this assertion is visible in the excerpt; a
        # `return expr` (or similar) appears to be elided.
        assert isinstance(expr, grammar.CallMethod)

    def append(self, x, y):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def concat(self, x, y):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def blank_line(self):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def nt_def_to_list(self, nt_def):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def to_production(self, lhs, i, rhs, is_sole_production):
        """Wrap a list of grammar symbols `rhs` in a Production object."""
        body, reducer, condition = rhs
        # NOTE(review): a guard such as `if reducer is None:` appears to be
        # elided here; as written this would override an explicit reducer.
        reducer = self.default_reducer(lhs, i, body, is_sole_production)
        return grammar.Production(body, reducer, condition=condition)

    def default_reducer(self, lhs, i, body, is_sole_production):
        # Default reducer: a method call named after the nonterminal, with
        # one argument per concrete element of `body`.
        assert isinstance(lhs, grammar.Nt)
        # NOTE(review): an assignment such as `nt_name = lhs.name` appears
        # to be elided before this point.
        nargs = sum(1 for e in body if grammar.is_concrete_element(e))
        if is_sole_production:
            method_name = nt_name
            # NOTE(review): an `else:` appears to be elided before the next
            # line (sole production uses the bare name; others append `i`).
            method_name = '{} {}'.format(nt_name, i)
        return self.expr_call(method_name, tuple(range(nargs)), None)

    def needs_asi(self, lhs, p):
        """True if p is a production in which ASI can happen."""
        # The purpose of the fake ForLexicalDeclaration production is to have a
        # copy of LexicalDeclaration that does not trigger ASI.

        # Two productions have body == [";"] -- one for EmptyStatement and one
        # for ClassMember. Neither should trigger ASI.

        # The only other semicolons that should not trigger ASI are the ones in
        # `for` statement productions, which happen to be exactly those
        # semicolons that are not at the end of a production.
        return (not (isinstance(lhs, grammar.Nt)
                     and lhs.name == 'ForLexicalDeclaration')
                and p.body[-1] == ';')

    def apply_asi(self, p, reducer_was_autogenerated):
        """Return two rules based on p, so that ASI can be applied."""
        # NOTE(review): large parts of this method are elided from this
        # excerpt; the visible fragments are reproduced as-is below.
        assert isinstance(p.reducer, grammar.CallMethod)

        if reducer_was_autogenerated:
            # Don't pass the semicolon to the method.
            reducer = self.expr_call(p.reducer.method,

        # Except for do-while loops, check at runtime that ASI occurs only at
                and p.body[0] == 'do'
                and p.body[2] == 'while'
                and p.body[6] == ';'):
            code = "do_while_asi"

            # The preferred production, with the semicolon in.
            p.copy_with(body=p.body[:],
            # The fallback production, performing ASI.
            p.copy_with(body=p.body[:-1] + [grammar.ErrorSymbol(code)],

    def expand_lexical_rhs(self, rhs):
        # Expand multi-character terminals of the lexical grammar into
        # sequences of single-character Literal elements.
        body, reducer, condition = rhs
        # NOTE(review): the initialization of `out` and the loop binding `e`
        # (presumably `for e in body:`) are elided from this excerpt.
        if isinstance(e, str):
            # The terminal symbols of the lexical grammar are characters, so
            # add each character of this string as a separate element.
            out += [grammar.Literal(ch) for ch in e]
        return [out, reducer, condition]

    def nt_def(self, nt_type, lhs, eq, rhs_list):
        # Build one nonterminal definition tuple: (name, eq, NtDef).
        # NOTE(review): the initialization of `production_list` and the
        # branch selecting syntactic (':') vs. lexical ('::') handling
        # appear to be elided from this excerpt.
        has_sole_production = (len(rhs_list) == 1)
        for i, rhs in enumerate(rhs_list):
            # Syntactic grammar. A hack is needed for ASI.
            reducer_was_autogenerated = rhs[1] is None
            p = self.to_production(lhs, i, rhs, has_sole_production)
            if self.needs_asi(lhs, p):
                production_list += self.apply_asi(p, reducer_was_autogenerated)
            production_list.append(p)

            # Lexical grammar. A hack is needed to replace multicharacter
            # terminals like `!==` into sequences of character terminals.
            rhs = self.expand_lexical_rhs(rhs)
            p = self.to_production(lhs, i, rhs, has_sole_production)
            production_list.append(p)
        return (lhs.name, eq, grammar.NtDef(lhs.args, production_list, nt_type))

    def nt_def_one_of(self, nt_type, nt_lhs, eq, terminals):
        # "one of" shorthand: each terminal becomes its own one-symbol
        # right-hand side with no reducer and no condition.
        return self.nt_def(nt_type, nt_lhs, eq,
                           [([t], None, None) for t in terminals])

    def nt_lhs_no_params(self, name):
        # Left-hand side without parameters.
        return grammar.Nt(name, ())

    def nt_lhs_with_params(self, name, params):
        # Left-hand side with a parameter list (e.g. Expression[In, Yield]).
        return grammar.Nt(name, tuple(params))

    def simple_type(self, name):
        # A plain named type.
        return types.Type(name)

    def lifetime_type(self, name):
        # A Rust lifetime (e.g. 'alloc).
        return types.Lifetime(name)

    def parameterized_type(self, name, args):
        # A generic type with arguments.
        return types.Type(name, tuple(args))

    def t_list_line(self, terminals):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def terminal(self, t):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def terminal_chr(self, chr):
        # Reject raw control-character terminals in this position.
        raise ValueError("FAILED: %r" % chr)

    def rhs_line(self, ifdef, rhs, reducer, _prodid):
        # Normalize one right-hand-side line to the (body, reducer, ifdef)
        # triple consumed by to_production / nt_def.
        return (rhs, reducer, ifdef)

    def rhs_line_prose(self, prose):
        # A prose-only right-hand side: a single prose symbol, no reducer,
        # no condition.
        return ([prose], None, None)

    def expr_match_ref(self, token):
        # "$n" refers to the n-th matched element.
        assert token.startswith('$')
        return int(token[1:])

    def expr_call(self, method, args, fallible):
        # NOTE: Currently "AstBuilder" functions are made fallible using the
        # fallible_methods taken from some Rust code which extract this
        # information to produce a JSON file.
        if self.method_trait == "AstBuilder":
            # NOTE(review): the body of this branch is elided from this
            # excerpt.
        return grammar.CallMethod(method, args or (), types.Type(self.method_trait),
                                  fallible is not None)

    def expr_some(self, expr):
        # Wrap in Some(...) for the Rust Option type.
        return grammar.Some(expr)

    def ifdef(self, value, nt):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def optional(self, nt):
        # `nt?` -- an optional occurrence of a symbol.
        return grammar.Optional(nt)

    def but_not(self, nt, exclusion):
        # `nt but not X` -- exclude a single alternative.
        _, exclusion = exclusion
        return grammar.Exclude(nt, [exclusion])
        # return ('-', nt, exclusion)

    def but_not_one_of(self, nt, exclusion_list):
        # `nt but not one of X Y Z` -- exclude several alternatives.
        exclusion_list = [exclusion for _, exclusion in exclusion_list]
        return grammar.Exclude(nt, exclusion_list)
        # return ('-', nt, exclusion_list)

    def no_line_terminator_here(self, lt):
        # Only the LineTerminator nonterminal (bare or bar-wrapped) is
        # meaningful inside a "[no ... here]" directive.
        if lt not in ('LineTerminator', '|LineTerminator|'):
            raise ValueError("unrecognized directive " + repr("[no " + lt + " here]"))
        return grammar.NoLineTerminatorHere

    def nonterminal(self, name):
        # Names declared up-front as terminals get special treatment.
        if name in self.terminal_names:
            # NOTE(review): branch body elided from this excerpt
            # (presumably returns the name itself as a terminal) -- confirm.
        return grammar.Nt(name, ())

    def nonterminal_apply(self, name, args):
        # Nonterminal with applied parameters, e.g. Expression[+In].
        if name in self.terminal_names:
            raise ValueError("parameters applied to terminal {!r}".format(name))
        # Each parameter may be passed at most once.
        if len(set(k for k, expr in args)) != len(args):
            raise ValueError("parameter passed multiple times")
        return grammar.Nt(name, tuple(args))

    def arg_expr(self, sigil, argname):
        # NOTE(review): the condition selecting between these two returns
        # (presumably a test on `sigil`, e.g. '?') is elided from this
        # excerpt.
            return (argname, grammar.Var(argname))
        return (argname, sigil)

    def sigil_false(self):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def sigil_true(self):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def exclusion_terminal(self, t):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def exclusion_nonterminal(self, nt):
        """NOTE(review): body elided from this excerpt -- confirm."""

    def exclusion_chr_range(self, c1, c2):
        # A character-range exclusion, kept as a tagged tuple.
        return ("range", c1, c2)

    # NOTE(review): the `def` headers for the next two lookahead actions
    # (positive and negative single-token lookahead, presumably `==` and
    # `!=` forms) are elided from this excerpt; only their return
    # statements are visible.
        return grammar.LookaheadRule(OrderedFrozenSet([t]), True)

        return grammar.LookaheadRule(OrderedFrozenSet([t]), False)

    def la_not_in_nonterminal(self, nt):
        # `[lookahead not in Nt]` -- negative lookahead on a nonterminal.
        return grammar.LookaheadRule(OrderedFrozenSet([nt]), False)

    def la_not_in_set(self, lookahead_exclusions):
        # Negative lookahead over a set; only single-token exclusions are
        # supported.
        if all(len(excl) == 1 for excl in lookahead_exclusions):
            # NOTE(review): the second argument of this LookaheadRule call
            # (presumably False) is elided from this excerpt.
            return grammar.LookaheadRule(
                OrderedFrozenSet(excl[0] for excl in lookahead_exclusions),
        raise ValueError("unsupported: lookahead > 1 token, {!r}"
                         .format(lookahead_exclusions))

    # NOTE(review): the `def` header of the character-abbreviation action
    # (handling CHR tokens like "<TAB>" and "U+0009") and some of its
    # lines are elided from this excerpt; the visible fragments follow.
        assert t[0] == "<" or t[0] == 'U'

        # Angle-bracket abbreviations must be in the lookup table.
        if t not in ECMASCRIPT_CODE_POINTS:
            raise ValueError("unrecognized character abbreviation {!r}".format(t))
        return ECMASCRIPT_CODE_POINTS[t]

        # "U+XXXX" form: decode the hex code point.
        return grammar.Literal(chr(int(t[2:], base=16)))
def finish_grammar(nt_defs, goals, variable_terminals, synthetic_terminals,
                   single_grammar=True, extensions=[]):
    """Assemble parsed nonterminal definitions into a grammar.Grammar.

    NOTE(review): many lines of this function appear to be elided from
    this excerpt; visible fragments are reproduced as-is and annotated.
    NOTE(review): `extensions=[]` is a mutable default argument -- safe
    only if the list is never mutated; consider a tuple default.
    """
    # Map each nonterminal name to the eq marker it was defined with.
    # NOTE(review): the initialization of `nt_grammars` is elided.
    for nt_name, eq, _ in nt_defs:
        if nt_name in nt_grammars:
            # NOTE(review): the raise/format wrapping this message is elided.
            "duplicate definitions for nonterminal {!r}"
        nt_grammars[nt_name] = eq

    # Figure out which grammar we were trying to get (":" for syntactic,
    # "::" for lexical) based on the goal symbols.
        raise ValueError("no goal nonterminals specified")
    selected_grammars = set(nt_grammars[goal] for goal in goals)
    assert len(selected_grammars) != 0
    if len(selected_grammars) > 1:
        # NOTE(review): the raise/format wrapping these messages is elided.
        "all goal nonterminals must be part of the same grammar; "
        "got {!r} (matching these grammars: {!r})"
        .format(set(goals), set(selected_grammars)))
    [selected_grammar] = selected_grammars

    def hack_production(p):
        # Convert backtick-quoted symbols (`foo`) in a production body into
        # plain terminal strings, recording each in `terminal_set`.
        for i, e in enumerate(p.body):
            if isinstance(e, str) and e[:1] == "`":
                if len(e) < 3 or e[-1:] != "`":
                    # NOTE(review): the raise wrapping this message is elided.
                    "Unrecognized grammar symbol: {!r} (in {!r})"
                p[i] = token = e[1:-1]
                terminal_set.add(token)

    for nt_name, eq, rhs_list_or_lambda in nt_defs:
        # Skip definitions belonging to the other grammar when only one
        # grammar is requested.
        if single_grammar and eq != selected_grammar:
            # NOTE(review): branch body (presumably `continue`) is elided.
        if isinstance(rhs_list_or_lambda, grammar.NtDef):
            nonterminals[nt_name] = rhs_list_or_lambda
            # NOTE(review): an `else:` and surrounding lines appear elided.
            rhs_list = rhs_list_or_lambda
            if not isinstance(p, grammar.Production):
                # NOTE(review): the raise wrapping this message is elided.
                "invalid grammar: ifdef in non-function-call context")
            if nt_name in nonterminals:
                # NOTE(review): the raise wrapping this message is elided.
                "unsupported: multiple definitions for nt " + nt_name)
            nonterminals[nt_name] = rhs_list

    for t in terminal_set:
        if t in nonterminals:
            # NOTE(review): the raise/format wrapping this message is elided.
            "grammar contains both a terminal `{}` and nonterminal {}"

    # Add execution modes to generate the various functions needed to handle
    # syntax parsing and full parsing execution modes.
    exec_modes = collections.defaultdict(OrderedSet)
    noop_parser = types.Type("ParserTrait", (types.Lifetime("alloc"), types.UnitType))
    token_parser = types.Type("ParserTrait", (
        types.Lifetime("alloc"), types.Type("StackValue", (types.Lifetime("alloc"),))))
    ast_builder = types.Type("AstBuilderDelegate", (types.Lifetime("alloc"),))

    # Full parsing takes token as input and build an AST.
    exec_modes["full_actions"].extend([token_parser, ast_builder])

    # Syntax parsing takes token as input but skip building the AST.
    # TODO: The syntax parser is commented out for now, as we need something to
    # be produced when we cannot call the AstBuilder for producing the values.

    # No-op parsing is used for the simulator, which is so far used for
    # querying whether we can end the incremental input and lookup if a state
    # can accept some kind of tokens.
    exec_modes["noop_actions"].add(noop_parser)

    # Extensions are using an equivalent of Rust types to define the kind of
    # parsers to be used, this map is used to convert these type names to the
    # various execution modes.
    full_parser = types.Type("FullParser")
    syntax_parser = types.Type("SyntaxParser")
    noop_parser = types.Type("NoopParser")
    # NOTE(review): the `type_to_modes = {` opener is elided from this
    # excerpt.
        noop_parser: ["noop_actions", "full_actions"],
        syntax_parser: ["full_actions"],
        full_parser: ["full_actions"],

    result = grammar.Grammar(
        # NOTE(review): the leading positional arguments of this call
        # (presumably nonterminals, goals, ...) are elided.
        variable_terminals=variable_terminals,
        synthetic_terminals=synthetic_terminals,
        exec_modes=exec_modes,
        type_to_modes=type_to_modes)
    # Apply grammar extensions in place before returning.
    result.patch(extensions)
    # NOTE(review): a final `return result` appears to be elided from this
    # excerpt.
# NOTE(review): the `def parse_esgrammar(text, *,` header of this function
# appears to be elided from this excerpt; the visible signature tail and
# body fragments follow.
        filename: Optional[str] = None,
        extensions: Iterable[Tuple[os.PathLike, int, str]] = (),
        goals: Optional[Iterable[str]] = None,
        terminal_names: Iterable[str] = (),
        synthetic_terminals: Optional[Dict[str, OrderedSet[str]]] = None,
        single_grammar: bool = True
) -> grammar.Grammar:
    if not text.endswith("\n\n"):
        # Horrible hack: add a blank line at the end of the document so that
        # the esgrammar grammar can use newlines as delimiters. :-P
        # NOTE(review): the statement appending the newline to `text` is
        # elided from this excerpt.
    terminal_names = frozenset(terminal_names)
    if synthetic_terminals is None:
        synthetic_terminals = {}

    # Parse the main grammar text with the "grammar" goal.
    builder = ESGrammarBuilder(terminal_names)
    parser = ESGrammarParser(builder=builder, goal="grammar")
    lexer = ESGrammarLexer(parser, filename=filename)
    # NOTE(review): the call feeding `text` into the lexer (presumably
    # `lexer.write(text)`) is elided from this excerpt.
    nt_defs = lexer.close()

    # Parse each Rust-EDSL extension with the same builder, using the
    # "rust_edsl" goal.
    grammar_extensions = []
    for ext_filename, start_lineno, content in extensions:
        parser = ESGrammarParser(builder=builder, goal="rust_edsl")
        lexer = ESGrammarLexer(parser, filename=ext_filename)
        # The builder needs the lexer to report the extension's filename.
        builder.lexer = lexer
        lexer.start_lineno = start_lineno
        # NOTE(review): the call feeding `content` into the lexer is elided
        # from this excerpt.
        result = lexer.close()
        grammar_extensions.append(result)

    # Default to the first nonterminal in the input.
    # NOTE(review): the guard (presumably `if goals is None:`) is elided
    # from this excerpt.
    goals = [nt_defs[0][0]]

    return finish_grammar(
        # NOTE(review): the leading positional/keyword arguments of this
        # call (presumably nt_defs, goals=goals) are elided.
        variable_terminals=terminal_names - frozenset(synthetic_terminals),
        synthetic_terminals=synthetic_terminals,
        single_grammar=single_grammar,
        extensions=grammar_extensions)