Significant new feature: explicit ambiguity resolution.
[gazelle.git] / compiler / fa.lua
blob9866e88c97f352a358498ed3a109ad391ae103d4
1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
5 fa.lua
7 Data structure for representing finite automata. Both NFAs and DFAs
8 can be represented using this class.
10 The base class (FA/FAState) has three child classes:
12 - IntFA/IntFAState represents a nonrecursive FA that transitions
13 on IntSets or Epsilon. These represent the machines that recognize
14 regular expressions.
16 - RTN/RTNState represents a recursive transition network: the graph
17 that is built to recognize context-free grammars. These transition
18 on strings, regexes, epsilon, or on another RTN.
20 - GLA/GLAState represents a lookahead automaton that represents
21 LL lookahead information.
23 Either child class can be deterministic or nondeterministic. The only
24 difference is whether there are epsilons / redundant transtions.
26 This all needs to be refactored quite a bit, now that I have the insight
27 of understanding all the different ways NFAs and DFAs are used throughout
28 Gazelle.
30 Copyright (c) 2007-2008 Joshua Haberman. See LICENSE for details.
32 --------------------------------------------------------------------]]--
34 require "misc"
36 module("fa", package.seeall)
38 -- Class for representing special-case edge values that have only one
39 -- instance for the whole program.
40 SingletonEdgeValue = {name="SingletonEdgeValue"}
41 function SingletonEdgeValue:new(name)
42 self.singletons = self.singletons or {}
43 -- we index each singleton by name, creating new ones lazily.
44 self.singletons[name] = self.singletons[name] or newobject(self)
45 return self.singletons[name]
46 end
48 -- This is a special edge value that represents a transition that can be
49 -- taken without consuming any input.
50 e = SingletonEdgeValue:new("Epsilon")
52 -- This is a special edge value that represents a GLA transition that can
53 -- be taken when EOF is seen.
54 eof = SingletonEdgeValue:new("EOF")
57 --[[--------------------------------------------------------------------
59 class State -- base class of FAState and RTNState
61 --------------------------------------------------------------------]]--
63 FAState = {name="FAState"}
64 function FAState:new()
65 local obj = newobject(self)
66 obj._transitions = {}
67 return obj
68 end
70 function FAState:child_states()
71 local children = Set:new()
72 for edge_value, target_state in self:transitions() do
73 children:add(target_state)
74 end
75 return children
76 end
78 function FAState:add_transition(edge_value, target_state, edge_properties)
79 for e_edge_value, e_target_state, e_edge_properties in self:transitions() do
80 if edge_value == e_edge_value and target_state == e_target_state
81 and e_edge_properties == edge_properties then
82 return
83 end
84 end
85 table.insert(self._transitions, {edge_value, target_state, edge_properties})
86 end
88 function FAState:num_transitions()
89 return #self._transitions
90 end
92 function FAState:transitions()
93 local i = 0
94 return function ()
95 i = i + 1
96 if self._transitions[i] then
97 return unpack(self._transitions[i])
98 else
99 return nil
104 function FAState:clear_transitions()
105 self._transitions = {}
108 function FAState:transitions_for(val, prop)
109 local targets
110 if prop == "ANY" then
111 targets = {}
112 else
113 targets = Set:new()
116 for edge_val, target_state, properties in self:transitions() do
117 if edge_val == val and ((prop == "ANY") or (prop == "ALL") or (prop == properties)) then
118 if prop == "ANY" then
119 table.insert(targets, {target_state, properties})
120 else
121 targets:add(target_state)
125 return targets
128 function FAState:dest_state_for(val)
129 local states = self:transitions_for(val, "ANY")
130 if #states > 1 then
131 error(">1 transition found")
132 elseif #states == 0 then
133 return nil
134 else
135 local dest_state
136 for dest_state_properties in each(states) do
137 dest_state, properties = unpack(dest_state_properties)
139 return dest_state
144 --[[--------------------------------------------------------------------
146 class FA -- base class of IntFA and RTN
148 --------------------------------------------------------------------]]--
150 FA = {name="FA"}
151 function FA:new(init)
152 local obj = newobject(self)
153 init = init or {}
155 if obj.new_state then
156 obj.start = init.start or obj:new_state()
157 obj.final = init.final or obj:new_state() -- for all but Thompson NFA fragments we ignore this
158 if init.symbol then
159 obj.start:add_transition(init.symbol, obj.final, init.properties)
160 elseif init.string then
161 local int_set = IntSet:new()
162 local char = init.string:sub(1, 1):byte()
163 int_set:add(Range:new(char, char))
164 local fa = IntFA:new{symbol=int_set}
165 for i=2,#init.string do
166 int_set = IntSet:new()
167 char = init.string:sub(i, i):byte()
168 int_set:add(Range:new(char, char))
169 fa = nfa_construct.concat(fa, IntFA:new{symbol=int_set})
171 return fa
175 obj.properties = {}
177 return obj
180 function FA:states()
181 return depth_first_traversal(self.start, function (s) return s:child_states() end)
184 function FA:dup()
185 local new_graph = self:new_graph()
186 local new_states = {}
188 -- duplicate states
189 for state in each(self:states()) do
190 new_states[state] = new_states[state] or self:new_state()
191 if self.start == state then new_graph.start = new_states[state] end
192 if self.final == state then new_graph.final = new_states[state] end
195 -- duplicate transitions
196 for state in each(self:states()) do
197 for edge_val, target_state, properties in state:transitions() do
198 new_states[state]:add_transition(edge_val, new_states[target_state], properties)
202 return new_graph
205 --[[--------------------------------------------------------------------
207 class IntFA/IntFAState: Classes for representing machines that recognize
208 regular expressions.
210 --------------------------------------------------------------------]]--
212 IntFA = FA:new()
213 IntFA.name = "IntFA"
214 function IntFA:new_graph(init)
215 return IntFA:new(init)
218 function IntFA:new_state()
219 return IntFAState:new()
222 function IntFA:get_outgoing_edge_values(states)
223 local symbol_sets = Set:new()
224 local properties_set = Set:new()
225 for state in each(states) do
226 for symbol_set, target_state, properties in state:transitions() do
227 if properties ~= nil then
228 properties_set:add(properties)
231 if type(symbol_set) == "table" and symbol_set.class == IntSet then
232 symbol_sets:add(symbol_set)
236 symbol_sets = equivalence_classes(symbol_sets)
238 -- for now, just cross symbol sets with properties. a bit wasteful,
239 -- but we'll worry about that later.
240 local values = {}
241 for set in each(symbol_sets) do
242 for properties in each(properties_set) do
243 table.insert(values, {set, properties})
245 table.insert(values, {set, nil})
248 return values
251 function IntFA:to_dot()
252 str = ""
253 states = self:states():to_array()
254 --table.sort(states, function (a, b) return a.statenum < b.statenum end)
255 for i,state in ipairs(states) do
256 local label = ""
257 local peripheries = 1
258 if state == self.start then label = "Begin" end
259 if state == self.final or state.final then
260 if label ~= "" then
261 label = label .. "NEWLINE" .. state.final
262 else
263 label = state.final
265 peripheries = 2
267 label = label:gsub("[\"\\]", "\\%1")
268 label = label:gsub("NEWLINE", "\\n")
269 str = str .. string.format(' "%s" [label="%s", peripheries=%d];\n', tostring(state), label, peripheries)
270 for char, tostate, attributes in state:transitions() do
271 local print_char
272 if char == fa.e then
273 print_char = "ep"
274 -- elseif char == "(" then
275 -- print_char = "start capture"
276 -- elseif char == ")" then
277 -- print_char = "end capture"
278 elseif type(char) == "string" then
279 print_char = char
280 elseif type(char) == 'table' and char.class == IntSet then
281 if char:isunbounded() then char = char:invert() end
282 print_char = char:toasciistring()
283 else
284 print(serialize(char, 3, true))
285 print_char = string.char(char)
287 print_char = print_char:gsub("[\"\\]", "\\%1")
288 print_char = print_char:gsub("NEWLINE", "\\n")
289 str = str .. string.format(' "%s" -> "%s" [label="%s"];\n', tostring(state), tostring(tostate), print_char)
292 return str
296 IntFAState = FAState:new()
297 IntFAState.name = "IntFAState"
299 function IntFAState:add_transition(edge_value, target_state, edge_properties)
300 -- as a special case, IntSet edge_values can be combined if two edge_values
301 -- have the same target_state and neither has any edge_properties.
302 if edge_value.class == IntSet and edge_properties == nil then
303 for existing_edge_value, existing_target_state, existing_edge_properties in self:transitions() do
304 if existing_edge_value.class == IntSet and target_state == existing_target_state and existing_edge_properties == nil then
305 existing_edge_value:add_intset(edge_value)
306 return
310 FAState.add_transition(self, edge_value, target_state, edge_properties)
313 function IntFAState:transitions_for(val, prop)
314 local targets = Set:new()
315 if type(val) == "table" and val.class == IntSet then
316 val = val:sampleint()
319 for edge_val, target_state, properties in self:transitions() do
320 if edge_val == val or (val ~= fa.e and edge_val.class == IntSet and edge_val:contains(val)) then
321 if (prop == "ANY") or (prop == properties) then
322 targets:add(target_state)
326 return targets
329 --[[--------------------------------------------------------------------
331 class GLA/GLAState: Classes for representing machines that represent
332 lookahead information.
334 --------------------------------------------------------------------]]--
336 GLA = FA:new()
337 GLA.name = "GLA"
338 function GLA:new_graph(init)
339 return GLA:new(init)
342 function GLA:new_state()
343 return GLAState:new()
346 function GLA:get_outgoing_edge_values(states)
347 local values = {}
348 for state in each(states) do
349 for edge_val, target_state, properties in state:transitions() do
350 if edge_val ~= fa.e then
351 table.insert(values, {edge_val, properties})
355 return values
358 function GLA:to_dot(indent, suffix)
359 local str = ""
360 suffix = suffix or ""
361 indent = indent or ""
362 for state in each(self:states()) do
363 peripheries = 1
364 extra_label = ""
365 if state.final then
366 peripheries = 2
367 if state.final[1] == 0 then -- the special value that means "return"
368 extra_label = "Return"
369 else
370 for edge_val, dest_state, properties in self.rtn_state:transitions() do
371 if edge_val == state.final[1] and dest_state == state.final[2] then
372 extra_label = tostring(properties.slotnum)
377 if self.start == state then extra_label = "Start" end
378 str = str .. string.format('%s"%s" [label="%s" peripheries=%d]\n',
379 indent, tostring(state) .. suffix, escape(extra_label), peripheries)
380 for edge_val, target_state in state:transitions() do
381 if edge_val == fa.eof then
382 edge_val = "EOF"
384 str = str .. string.format('%s"%s" -> "%s" [label="%s"]\n',
385 indent, tostring(state) .. suffix, tostring(target_state) .. suffix,
386 escape(edge_val))
389 return str
393 GLAState = FAState:new()
394 GLAState.name = "GLAState"
395 function GLAState:new(paths)
396 local obj = FAState:new()
397 obj.rtn_paths = paths
399 if paths then
400 for path in each(paths) do
401 if obj.lookahead_k and obj.lookahead_k ~= path.lookahead_k then
402 error("Internal error: expected all paths for the GLA state to have the same length")
404 obj.lookahead_k = path.lookahead_k
406 else
407 obj.lookahead_k = 0
409 return obj
413 --[[--------------------------------------------------------------------
415 class RTN/RTNState: Classes for representing machines that represent
416 context-free grammars.
418 --------------------------------------------------------------------]]--
420 RTN = FA:new()
421 RTN.name = "RTN"
422 function RTN:new_graph(init)
423 return RTN:new(init)
426 function RTN:new_state()
427 return RTNState:new()
430 function RTN:get_outgoing_edge_values(states)
431 local values = {}
432 for state in each(states) do
433 for edge_val, target_state, properties in state:transitions() do
434 if edge_val ~= fa.e then
435 table.insert(values, {edge_val, properties})
439 return values
442 function escape(str)
443 return str:gsub("[\"\\]", "\\%1")
446 function RTN:to_dot(indent, suffix, intfas, glas)
447 suffix = suffix or ""
448 str = indent .. "rankdir=LR;\n"
449 -- str = str .. indent .. string.format('label="%s"\n', self.name)
450 for state in each(self:states()) do
451 peripheries = 1
452 extra_label = ""
453 color = ""
454 if state.gla then
455 if state.gla.longest_path == 1 then
456 color = " fillcolor=\"cornflowerblue\""
457 elseif state.gla.longest_path == 2 then
458 color = " fillcolor=\"gold\""
459 elseif state.gla.longest_path > 2 then
460 color = " fillcolor=\"firebrick\""
462 color = color .. " style=\"filled\""
464 if state.final then peripheries = 2 end
465 if self.start == state then extra_label = "Start" end
466 if intfas and state.intfa then
467 if extra_label ~= "" then
468 extra_label = extra_label .. "\\n"
470 extra_label = extra_label .. "I: " .. tostring(intfas:offset_of(state.intfa))
472 if glas and state.gla then
473 if extra_label ~= "" then
474 extra_label = extra_label .. "\\n"
476 extra_label = extra_label .. "G: " .. tostring(glas:offset_of(state.gla))
478 str = str .. string.format('%s"%s" [label="%s" peripheries=%d%s]\n',
479 indent, tostring(state) .. suffix, extra_label,
480 peripheries, color)
481 for edge_val, target_state in state:transitions() do
482 if fa.is_nonterm(edge_val) then
483 str = str .. string.format('%s"%s" -> "%s" [label="<%s>"]\n',
484 indent, tostring(state) .. suffix, tostring(target_state) .. suffix,
485 escape(edge_val.name))
486 else
487 --if attributes.regex_text[edge_val] then
488 -- edge_val = "/" .. attributes.regex_text[edge_val] .. "/"
489 --end
490 if edge_val == fa.eof then
491 edge_val = "EOF"
493 str = str .. string.format('%s"%s" -> "%s" [label="%s"]\n',
494 indent, tostring(state) .. suffix, tostring(target_state) .. suffix,
495 escape(edge_val))
499 return str
503 RTNState = FAState:new()
504 RTNState.name = "RTNState"
506 -- A trivial state is one where you can tell just by looking
507 -- at the state's transitions and its final status alone what
508 -- transition you should take for a given terminal.
509 function RTNState:needs_gla()
510 if self.final then
511 -- a final state needs a GLA if it has any outgoing transitions
512 if self:num_transitions() > 0 then
513 return true
514 else
515 return false
517 else
518 -- a nonfinal state needs a GLA if it has more than one
519 -- outgoing transition and either:
520 -- - at least one of those transitions is a nonterminal
521 -- - there are two or more terminal transitions for the same state
522 -- TODO: what about states with exactly 1 outgoing nonterminal
523 -- transition? We don't technically need a GLA's help to
524 -- figure out the right transition.
525 if self:num_transitions() == 1 then
526 return false
527 else
528 local edge_vals = Set:new()
529 for edge_val in self:transitions() do
530 if fa.is_nonterm(edge_val) then
531 return true
532 else
533 if edge_vals:contains(edge_val) then
534 return true
536 edge_vals:add(edge_val)
539 return false
544 -- In most cases, a state needs an intfa if it doesn't have a GLA,
545 -- but there are a few exceptions. Final states with no transitions
546 -- don't need an intfa. Neither do states that have only one
547 -- nonterminal transition.
548 function RTNState:needs_intfa()
549 if self:needs_gla() then
550 return false
551 else
552 if self.final then
553 return false
554 elseif self:num_transitions() == 1 and fa.is_nonterm(self._transitions[1][1]) then
555 return false
556 else
557 return true
563 NonTerm = {name="NonTerm"}
564 function NonTerm:new(name)
565 -- keep a cache of nonterm objects, so that we always return the same object
566 -- for a given name. This lets us compare nonterms for equality.
567 if not self.cache then
568 self.cache = {}
571 if not self.cache[name] then
572 obj = newobject(self)
573 obj.name = name
574 self.cache[name] = obj
577 return self.cache[name]
580 function is_nonterm(thing)
581 return (type(thing) == "table" and thing.class == NonTerm)
584 -- vim:et:sts=2:sw=2