Lookahead improvements: EOF handling and better ambiguity detection.
[gazelle.git] / compiler / bootstrap / rtn.lua
blobcb48977f2a3ad48ae4a82d1273a0f92d1300355f
1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
5 rtn.lua
7 A parser for our grammar language that builds RTNs representing the
8 input grammar.
10 Copyright (c) 2007-2008 Joshua Haberman. See LICENSE for details.
12 --------------------------------------------------------------------]]--
14 require "misc"
15 require "fa"
16 require "grammar"
18 require "bootstrap/regex_parser"
20 -- CharStream: a cheesy sort-of lexer-like object for the RTN parser
21 CharStream = {}
22 function CharStream:new(string)
23 local obj = newobject(self)
24 obj.string = string
25 obj.offset = 1
26 return obj
27 end
29 function CharStream:ignore(what)
30 local old_ignore = self.ignored
31 self:skip_ignored()
32 self.ignored = what
33 self:skip_ignored()
34 return old_ignore
35 end
37 function CharStream:skip_ignored()
38 if self.ignored == "whitespace" then
39 local first, last = self.string:find("^[\r\n\t ]+", self.offset)
40 if last then self.offset = last+1 end
41 end
42 end
44 function CharStream:lookahead(amount)
45 self:skip_ignored()
46 return self.string:sub(self.offset, self.offset+amount-1)
47 end
49 function CharStream:consume(str)
50 self:skip_ignored()
51 local actual_str = self.string:sub(self.offset, self.offset+str:len()-1)
52 if actual_str ~= str then
53 error(string.format("Error parsing grammar; expected %s, got %s", str, actual_str))
54 end
55 self.offset = self.offset + str:len()
56 self:skip_ignored()
57 -- print("Consumed " .. str)
58 return true
59 end
61 function CharStream:consume_pattern(pattern)
62 self:skip_ignored()
63 local first, last = self.string:find("^" .. pattern, self.offset)
64 if last then
65 self.offset = last+1
66 self:skip_ignored()
67 return self.string:sub(first, last)
68 else
69 error(string.format("Error parsing grammar: expected to match pattern %s, but string is %s", pattern, self.string:sub(self.offset, -1)))
70 end
71 end
73 function CharStream:match(pattern)
74 self:skip_ignored()
75 local first, last = self.string:find("^" .. pattern, self.offset)
76 if last then
77 return true
78 else
79 return false
80 end
81 end
83 function CharStream:eof()
84 return self.offset > self.string:len()
85 end
87 -- class TokenStream
89 -- Parse the grammar file given in +chars+ and return a Grammar object.
90 function parse_grammar(chars)
91 chars:ignore("whitespace")
92 local grammar = Grammar:new()
93 local attributes = {ignore={}, slot_counts={}, regex_text={}, grammar=grammar}
94 while not chars:eof() do
95 if chars:match(" *@start") then
96 chars:consume_pattern(" *@start")
97 grammar.start = parse_nonterm(chars).name;
98 chars:consume(";")
99 else
100 local before_offset = chars.offset
101 stmt = parse_statement(chars, attributes)
102 if not stmt then
103 break
104 elseif stmt.nonterm then
105 stmt.derivations.final.final = "Final"
106 local rule_text = chars.string:sub(before_offset, chars.offset-1)
107 grammar:add_nonterm(stmt.nonterm.name, stmt.derivations, stmt.slot_count, rule_text)
108 elseif stmt.term then
109 grammar:add_terminal(stmt.term, stmt.regex)
114 grammar.attributes = attributes
116 -- start symbol defaults to the first symbol
117 if not grammar.start then
118 grammar.start = grammar.rtns:get_key_at_offset(1)
121 return grammar
124 function parse_statement(chars, attributes)
125 local old_ignore = chars:ignore("whitespace")
126 local ret = {}
128 local ident = parse_nonterm(chars)
130 if chars:match("->") then
131 attributes.nonterm = ident
132 ret.nonterm = ident
133 chars:consume("->")
134 attributes.slotnum = 1
135 ret.derivations = parse_derivations(chars, attributes)
136 ret.slot_count = attributes.slotnum - 1
137 else
138 ret.term = ident.name
139 chars:consume(":")
140 ret.regex = parse_regex(chars)
143 chars:consume(";")
144 chars:ignore(old_ignore)
145 return ret
148 function parse_derivations(chars, attributes)
149 local old_ignore = chars:ignore("whitespace")
150 local derivations = {}
152 repeat
153 if chars:match(" e ") then
154 table.insert(derivations, fa.RTN:new{symbol=fa.e, properties={slotnum=attributes.slotnum}})
155 attributes.slotnum = attributes.slotnum + 1
156 else
157 table.insert(derivations, parse_derivation(chars, attributes))
159 until chars:lookahead(1) ~= "|" or not chars:consume("|")
161 chars:ignore(old_ignore)
162 return nfa_construct.alt(derivations)
165 function parse_derivation(chars, attributes)
166 local old_ignore = chars:ignore("whitespace")
167 local ret = parse_term(chars, attributes)
168 while chars:lookahead(1) ~= "|" and chars:lookahead(1) ~= ";" and chars:lookahead(1) ~= ")" do
169 ret = nfa_construct.concat(ret, parse_term(chars, attributes))
171 chars:ignore(old_ignore)
172 return ret
175 function parse_term(chars, attributes)
176 local old_ignore = chars:ignore("whitespace")
177 local name
178 local ret
179 if chars:match(" *%.[%w_]+ *=") then
180 name = parse_name(chars)
181 chars:consume("=")
184 local symbol
185 if chars:lookahead(1) == "/" then
186 name = name or attributes.nonterm.name
187 intfa, text = parse_regex(chars)
188 attributes.grammar:add_terminal(name, intfa, text)
189 ret = fa.RTN:new{symbol=name, properties={name=name, slotnum=attributes.slotnum}}
190 attributes.slotnum = attributes.slotnum + 1
191 elseif chars:lookahead(1) == "'" or chars:lookahead(1) == '"' then
192 local string = parse_string(chars)
193 attributes.grammar:add_terminal(string, string)
194 name = name or string
195 ret = fa.RTN:new{symbol=string, properties={name=name, slotnum=attributes.slotnum}}
196 attributes.slotnum = attributes.slotnum + 1
197 elseif chars:lookahead(1) == "(" then
198 if name then error("You cannot name a group") end
199 chars:consume("(")
200 ret = parse_derivations(chars, attributes)
201 chars:consume(")")
202 elseif chars:match("e[^%w]") then
203 chars:consume("e")
204 ret = fa.RTN:new{symbol=fa.e, properties={name="e", slotnum=attributes.slotnum}}
205 attributes.slotnum = attributes.slotnum + 1
206 return ret
207 else
208 local nonterm = parse_nonterm(chars)
209 name = name or nonterm.name
210 if attributes.grammar.terminals[nonterm.name] then
211 ret = fa.RTN:new{symbol=nonterm.name, properties={name=nonterm.name, slotnum=attributes.slotnum}}
212 else
213 ret = fa.RTN:new{symbol=nonterm, properties={name=name, slotnum=attributes.slotnum}}
215 attributes.slotnum = attributes.slotnum + 1
218 local one_ahead = chars:lookahead(1)
219 if one_ahead == "?" or one_ahead == "*" or one_ahead == "+" then
220 local modifier, sep = parse_modifier(chars, attributes)
221 -- foo +(bar) == foo (bar foo)*
222 -- foo *(bar) == (foo (bar foo)*)?
223 if sep then
224 if modifier == "?" then error("Question mark with separator makes no sense") end
225 ret = nfa_construct.concat(ret:dup(), nfa_construct.kleene(nfa_construct.concat(sep, ret:dup())))
226 if modifier == "*" then
227 ret = nfa_construct.alt2(fa.RTN:new{symbol=fa.e}, ret)
229 else
230 if modifier == "?" then
231 ret = nfa_construct.alt2(fa.RTN:new{symbol=fa.e}, ret)
232 elseif modifier == "*" then
233 ret = nfa_construct.kleene(ret)
234 elseif modifier == "+" then
235 ret = nfa_construct.rep(ret)
240 chars:ignore(old_ignore)
241 return ret
244 function parse_name(chars)
245 local old_ignore = chars:ignore()
246 chars:consume(".")
247 local ret = chars:consume_pattern("[%w_]+")
248 chars:ignore(old_ignore)
249 return ret
252 function parse_modifier(chars, attributes)
253 local old_ignore = chars:ignore()
254 local modifier, str
255 modifier = chars:consume_pattern("[?*+]")
256 if chars:lookahead(1) == "(" then
257 chars:consume("(")
258 local sep_string
259 if chars:lookahead(1) == "'" or chars:lookahead(1) == '"' then
260 sep_string = parse_string(chars)
261 else
262 sep_string = chars:consume_pattern("[^)]*")
264 str = fa.RTN:new{symbol=sep_string, properties={slotnum=attributes.slotnum, name=sep_string}}
265 attributes.grammar:add_terminal(sep_string, sep_string)
266 attributes.slotnum = attributes.slotnum + 1
267 chars:consume(")")
269 chars:ignore(old_ignore)
270 return modifier, str
273 function parse_nonterm(chars)
274 local old_ignore = chars:ignore()
275 local ret = fa.NonTerm:new(chars:consume_pattern("[%w_]+"))
276 chars:ignore(old_ignore)
277 return ret
280 function parse_string(chars)
281 local old_ignore = chars:ignore()
282 local str = ""
283 if chars:lookahead(1) == "'" then
284 chars:consume("'")
285 while chars:lookahead(1) ~= "'" do
286 if chars:lookahead(1) == "\\" then
287 chars:consume("\\")
288 str = str .. chars:consume_pattern(".") -- TODO: other backslash sequences
289 else
290 str = str .. chars:consume_pattern(".")
293 chars:consume("'")
294 else
295 chars:consume('"')
296 while chars:lookahead(1) ~= '"' do
297 if chars:lookahead(1) == "\\" then
298 chars:consume("\\")
299 str = str .. chars:consume_pattern(".") -- TODO: other backslash sequences
300 else
301 str = str .. chars:consume_pattern(".")
304 chars:consume('"')
306 chars:ignore(old_ignore)
307 return str
310 function parse_regex(chars)
311 local old_ignore = chars:ignore()
312 chars:consume("/")
313 local regex_text = ""
314 while chars:lookahead(1) ~= "/" do
315 if chars:lookahead(1) == "\\" then
316 regex_text = regex_text .. chars:consume_pattern("..")
317 else
318 regex_text = regex_text .. chars:consume_pattern(".")
321 local regex = regex_parser.parse_regex(regex_parser.TokenStream:new(regex_text))
322 chars:consume("/")
323 chars:ignore(old_ignore)
324 return regex, regex_text
327 -- vim:et:sts=2:sw=2