1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
7 A parser for our grammar language that builds RTNs representing the
10 Copyright (c) 2007-2008 Joshua Haberman. See LICENSE for details.
12 --------------------------------------------------------------------]]--
18 require
"bootstrap/regex_parser"
20 -- CharStream: a cheesy sort-of lexer-like object for the RTN parser
22 function CharStream
:new(string)
23 local obj
= newobject(self
)
29 function CharStream
:ignore(what
)
30 local old_ignore
= self
.ignored
37 function CharStream
:skip_ignored()
38 if self
.ignored
== "whitespace" then
39 local first
, last
= self
.string:find("^[\r\n\t ]+", self
.offset
)
40 if last
then self
.offset
= last
+1 end
44 function CharStream
:lookahead(amount
)
46 return self
.string:sub(self
.offset
, self
.offset
+amount
-1)
49 function CharStream
:consume(str
)
51 local actual_str
= self
.string:sub(self
.offset
, self
.offset
+str
:len()-1)
52 if actual_str
~= str
then
53 error(string.format("Error parsing grammar; expected %s, got %s", str
, actual_str
))
55 self
.offset
= self
.offset
+ str
:len()
57 -- print("Consumed " .. str)
61 function CharStream
:consume_pattern(pattern
)
63 local first
, last
= self
.string:find("^" .. pattern
, self
.offset
)
67 return self
.string:sub(first
, last
)
69 error(string.format("Error parsing grammar: expected to match pattern %s, but string is %s", pattern
, self
.string:sub(self
.offset
, -1)))
73 function CharStream
:match(pattern
)
75 local first
, last
= self
.string:find("^" .. pattern
, self
.offset
)
83 function CharStream
:eof()
84 return self
.offset
> self
.string:len()
89 -- Parse the grammar file given in +chars+ and return a Grammar object.
90 function parse_grammar(chars
)
91 chars
:ignore("whitespace")
92 local grammar
= Grammar
:new()
93 local attributes
= {ignore
={}, slot_counts
={}, regex_text
={}, grammar
=grammar
}
94 while not chars
:eof() do
95 if chars
:match(" *@start") then
96 chars
:consume_pattern(" *@start")
97 grammar
.start
= parse_nonterm(chars
).name
;
100 local before_offset
= chars
.offset
101 stmt
= parse_statement(chars
, attributes
)
104 elseif stmt
.nonterm
then
105 stmt
.derivations
.final
.final
= "Final"
106 local rule_text
= chars
.string:sub(before_offset
, chars
.offset
-1)
107 grammar
:add_nonterm(stmt
.nonterm
.name
, stmt
.derivations
, stmt
.slot_count
, rule_text
)
108 elseif stmt
.term
then
109 grammar
:add_terminal(stmt
.term
, stmt
.regex
)
114 grammar
.attributes
= attributes
116 -- start symbol defaults to the first symbol
117 if not grammar
.start
then
118 grammar
.start
= grammar
.rtns
:get_key_at_offset(1)
124 function parse_statement(chars
, attributes
)
125 local old_ignore
= chars
:ignore("whitespace")
128 local ident
= parse_nonterm(chars
)
130 if chars
:match("->") then
131 attributes
.nonterm
= ident
134 attributes
.slotnum
= 1
135 ret
.derivations
= parse_derivations(chars
, attributes
)
136 ret
.slot_count
= attributes
.slotnum
- 1
138 ret
.term
= ident
.name
140 ret
.regex
= parse_regex(chars
)
144 chars
:ignore(old_ignore
)
148 function parse_derivations(chars
, attributes
)
149 local old_ignore
= chars
:ignore("whitespace")
150 local derivations
= {}
153 if chars
:match(" e ") then
154 table.insert(derivations
, fa
.RTN
:new
{symbol
=fa
.e
, properties
={slotnum
=attributes
.slotnum
}})
155 attributes
.slotnum
= attributes
.slotnum
+ 1
157 table.insert(derivations
, parse_derivation(chars
, attributes
))
159 until chars
:lookahead(1) ~= "|" or not chars
:consume("|")
161 chars
:ignore(old_ignore
)
162 return nfa_construct
.alt(derivations
)
165 function parse_derivation(chars
, attributes
)
166 local old_ignore
= chars
:ignore("whitespace")
167 local ret
= parse_term(chars
, attributes
)
168 while chars
:lookahead(1) ~= "|" and chars
:lookahead(1) ~= ";" and chars
:lookahead(1) ~= ")" do
169 ret
= nfa_construct
.concat(ret
, parse_term(chars
, attributes
))
171 chars
:ignore(old_ignore
)
175 function parse_term(chars
, attributes
)
176 local old_ignore
= chars
:ignore("whitespace")
179 if chars
:match(" *%.[%w_]+ *=") then
180 name
= parse_name(chars
)
185 if chars
:lookahead(1) == "/" then
186 name
= name
or attributes
.nonterm
.name
187 intfa
, text
= parse_regex(chars
)
188 attributes
.grammar
:add_terminal(name
, intfa
, text
)
189 ret
= fa
.RTN
:new
{symbol
=name
, properties
={name
=name
, slotnum
=attributes
.slotnum
}}
190 attributes
.slotnum
= attributes
.slotnum
+ 1
191 elseif chars
:lookahead(1) == "'" or chars
:lookahead(1) == '"' then
192 local string = parse_string(chars
)
193 attributes
.grammar
:add_terminal(string, string)
194 name
= name
or string
195 ret
= fa
.RTN
:new
{symbol
=string, properties
={name
=name
, slotnum
=attributes
.slotnum
}}
196 attributes
.slotnum
= attributes
.slotnum
+ 1
197 elseif chars
:lookahead(1) == "(" then
198 if name
then error("You cannot name a group") end
200 ret
= parse_derivations(chars
, attributes
)
202 elseif chars
:match("e[^%w]") then
204 ret
= fa
.RTN
:new
{symbol
=fa
.e
, properties
={name
="e", slotnum
=attributes
.slotnum
}}
205 attributes
.slotnum
= attributes
.slotnum
+ 1
208 local nonterm
= parse_nonterm(chars
)
209 name
= name
or nonterm
.name
210 if attributes
.grammar
.terminals
[nonterm
.name
] then
211 ret
= fa
.RTN
:new
{symbol
=nonterm
.name
, properties
={name
=nonterm
.name
, slotnum
=attributes
.slotnum
}}
213 ret
= fa
.RTN
:new
{symbol
=nonterm
, properties
={name
=name
, slotnum
=attributes
.slotnum
}}
215 attributes
.slotnum
= attributes
.slotnum
+ 1
218 local one_ahead
= chars
:lookahead(1)
219 if one_ahead
== "?" or one_ahead
== "*" or one_ahead
== "+" then
220 local modifier
, sep
= parse_modifier(chars
, attributes
)
221 -- foo +(bar) == foo (bar foo)*
222 -- foo *(bar) == (foo (bar foo)*)?
224 if modifier
== "?" then error("Question mark with separator makes no sense") end
225 ret
= nfa_construct
.concat(ret
:dup(), nfa_construct
.kleene(nfa_construct
.concat(sep
, ret
:dup())))
226 if modifier
== "*" then
227 ret
= nfa_construct
.alt2(fa
.RTN
:new
{symbol
=fa
.e
}, ret
)
230 if modifier
== "?" then
231 ret
= nfa_construct
.alt2(fa
.RTN
:new
{symbol
=fa
.e
}, ret
)
232 elseif modifier
== "*" then
233 ret
= nfa_construct
.kleene(ret
)
234 elseif modifier
== "+" then
235 ret
= nfa_construct
.rep(ret
)
240 chars
:ignore(old_ignore
)
244 function parse_name(chars
)
245 local old_ignore
= chars
:ignore()
247 local ret
= chars
:consume_pattern("[%w_]+")
248 chars
:ignore(old_ignore
)
252 function parse_modifier(chars
, attributes
)
253 local old_ignore
= chars
:ignore()
255 modifier
= chars
:consume_pattern("[?*+]")
256 if chars
:lookahead(1) == "(" then
259 if chars
:lookahead(1) == "'" or chars
:lookahead(1) == '"' then
260 sep_string
= parse_string(chars
)
262 sep_string
= chars
:consume_pattern("[^)]*")
264 str
= fa
.RTN
:new
{symbol
=sep_string
, properties
={slotnum
=attributes
.slotnum
, name
=sep_string
}}
265 attributes
.grammar
:add_terminal(sep_string
, sep_string
)
266 attributes
.slotnum
= attributes
.slotnum
+ 1
269 chars
:ignore(old_ignore
)
273 function parse_nonterm(chars
)
274 local old_ignore
= chars
:ignore()
275 local ret
= fa
.NonTerm
:new(chars
:consume_pattern("[%w_]+"))
276 chars
:ignore(old_ignore
)
280 function parse_string(chars
)
281 local old_ignore
= chars
:ignore()
283 if chars
:lookahead(1) == "'" then
285 while chars
:lookahead(1) ~= "'" do
286 if chars
:lookahead(1) == "\\" then
288 str
= str
.. chars
:consume_pattern(".") -- TODO: other backslash sequences
290 str
= str
.. chars
:consume_pattern(".")
296 while chars
:lookahead(1) ~= '"' do
297 if chars
:lookahead(1) == "\\" then
299 str
= str
.. chars
:consume_pattern(".") -- TODO: other backslash sequences
301 str
= str
.. chars
:consume_pattern(".")
306 chars
:ignore(old_ignore
)
310 function parse_regex(chars
)
311 local old_ignore
= chars
:ignore()
313 local regex_text
= ""
314 while chars
:lookahead(1) ~= "/" do
315 if chars
:lookahead(1) == "\\" then
316 regex_text
= regex_text
.. chars
:consume_pattern("..")
318 regex_text
= regex_text
.. chars
:consume_pattern(".")
321 local regex
= regex_parser
.parse_regex(regex_parser
.TokenStream
:new(regex_text
))
323 chars
:ignore(old_ignore
)
324 return regex
, regex_text