Bugfix to error message when IntFAs conflict.
[gazelle.git] / compiler / fa_algorithms.lua
blobedbc6cdb9df2609f1b8406ef7abb74bd06282f39
1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
5 fa_algorithms.lua
7 Algorithms that operate on finite automata (NFAs and/or DFAs). For
8 the most part these can work on any of the FA types, since they do
9 not interpret the meaning of the edges. It is nice to keep these
10 algorithms separate from the FA data structure in fa.lua.
12 Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
14 --------------------------------------------------------------------]]--
17 --[[--------------------------------------------------------------------
19 NFA to DFA conversion.
21 --------------------------------------------------------------------]]--
23 function epsilon_closure(state)
24 return depth_first_traversal(state, function(s) return s:transitions_for(fa.e, "ALL") end)
25 end
27 -- we as input an array of NFAs, one for each token we want to match simultaneously,
28 -- and strings describing each token. So:
29 -- { {nfa1, "token string 1"},
30 -- {nfa2, "token string 2", etc} }
31 function nfas_to_dfa(nfa_string_pairs, ambiguous_ok)
32 ambiguous_ok = ambiguous_ok or false
33 -- First we need to mark all final states and capture groups with the token string
34 local nfas = {}
36 for i, nfa_string_pair in ipairs(nfa_string_pairs) do
37 local nfa, token_string = unpack(nfa_string_pair)
38 table.insert(nfas, nfa)
40 -- Mark the nfa fragment's final state as the final state for this *token*
41 nfa.final.final = token_string
42 end
44 -- Now combine all the nfas with alternation
45 local final_nfa = nfa_construct.alt(nfas)
46 return nfa_to_dfa(final_nfa, ambiguous_ok)
47 end
49 function new_dfa_state(nfa, nfa_states, ambiguous_ok)
50 local dfa_state = nfa:new_state()
52 -- If this is a final state for one or more of the nfas, make it an
53 -- (appropriately labeled) final state for the dfa
54 for nfa_state in nfa_states:each() do
55 if nfa_state.final then
56 if dfa_state.final and dfa_state.final ~= nfa_state.final then
57 if ambiguous_ok then
58 if type(dfa_state.final) ~= "table" then dfa_state.final = Set:new({dfa_state.final}) end
59 dfa_state.final:add(nfa_state.final)
60 else
61 error("Ambiguous finality not supported yet!! (" .. tostring(dfa_state.final) .. " and " .. tostring(nfa_state.final .. ")"))
62 end
63 else
64 dfa_state.final = nfa_state.final
65 end
66 end
67 end
69 return dfa_state
70 end
72 function nfa_to_dfa(nfa, ambiguous_ok)
73 -- The sets of NFA states we need to process for outgoing transitions
74 local first_nfa_states = epsilon_closure(nfa.start)
75 local queue = Queue:new(first_nfa_states)
77 local dfa = nfa:new_graph{start = new_dfa_state(nfa, first_nfa_states, ambiguous_ok)}
78 -- The DFA states we create from sets of NFA states
79 local dfa_states = {[first_nfa_states:hash_key()] = dfa.start}
81 while not queue:isempty() do
82 local nfa_states = queue:dequeue()
83 local dfa_state = dfa_states[nfa_states:hash_key()]
85 -- Generate a list of symbols that transition out of this set of NFA states.
86 -- We prefer this to iterating over the entire symbol space because it's
87 -- vastly more efficient in the case of a large symbol space (eg. Unicode)
88 local symbol_tuples = nfa:get_outgoing_edge_values(nfa_states)
90 -- For each output symbol, generate the list of destination NFA states that
91 -- recognizing this symbol could put you in (including epsilon transitions).
92 for symbol_tuple in each(symbol_tuples) do
93 local symbol, properties = unpack(symbol_tuple)
94 local dest_nfa_states = Set:new()
95 for nfa_state in nfa_states:each() do
96 -- equivalence classes dictate that this character represents what will
97 -- happen to ALL characters in the set
98 local target_states = nfa_state:transitions_for(symbol, properties)
100 if target_states then
101 for target_state in each(target_states) do
102 dest_nfa_states:add(target_state)
103 dest_nfa_states:add_collection(epsilon_closure(target_state):to_array())
108 -- this is necessary because (at the moment) get_outgoing_edge_values will generate
109 -- combinations of symbol/properties that don't *actually* transtion anywhere
110 -- TODO: get rid of that shortcoming
111 if not dest_nfa_states:isempty() then
113 -- create a DFA state for this set of NFA states, if one does not
114 -- already exist.
115 local dest_dfa_state = dfa_states[dest_nfa_states:hash_key()]
116 if dest_dfa_state == nil then
117 dest_dfa_state = new_dfa_state(nfa, dest_nfa_states, ambiguous_ok)
118 dfa_states[dest_nfa_states:hash_key()] = dest_dfa_state
119 queue:enqueue(dest_nfa_states)
122 -- create a transition from the current DFA state into the new one
123 dfa_state:add_transition(symbol, dest_dfa_state, properties)
128 return dfa
133 --[[--------------------------------------------------------------------
135 DFA minimization.
137 hopcroft_minimize(dfa): transform a DFA into an equivalent DFA with
138 the minimal number of states. Uses Hopcroft's algorithm, which is
139 O(n lg n) in the number of states, as explained by both Hopcroft and
140 Gries (see BIBLIOGRAPHY for details).
142 --------------------------------------------------------------------]]--
144 function hopcroft_minimize(dfa)
145 -- First create the alphabet and an inverse transition table.
146 local alphabet = dfa:get_outgoing_edge_values(dfa:states())
147 local inverse_transitions = {}
149 for state in each(dfa:states()) do
150 for symbol, dest_state, properties in state:transitions() do
151 inverse_transitions[dest_state] = inverse_transitions[dest_state] or dfa:new_state()
152 inverse_transitions[dest_state]:add_transition(symbol, state, properties)
156 -- Create initial blocks, grouped by finality.
157 local initial_blocks = {}
158 for state in each(dfa:states()) do
159 local finality = state.final or "NONE"
160 initial_blocks[finality] = initial_blocks[finality] or {}
161 table.insert(initial_blocks[finality], state)
164 local blocks = Set:new()
165 local work_queue = Queue:new()
166 local work_queue_set = Set:new()
167 for finality, states in pairs(initial_blocks) do
168 local block = Set:new(states)
169 blocks:add(block)
170 for state in each(states) do
171 state.block = block
173 for symbol_tuple in each(alphabet) do
174 local symbol, properties = unpack(symbol_tuple)
175 work_queue:enqueue({block, symbol, properties})
176 work_queue_set:add(tostring(block) .. tostring(symbol) .. tostring(properties))
180 local num_iterations = 0
181 while (not work_queue:isempty()) do
182 num_iterations = num_iterations + 1
183 local block, symbol, properties = unpack(work_queue:dequeue())
184 work_queue_set:remove(tostring(block) .. tostring(symbol) .. tostring(properties))
186 -- determine what blocks need to be split
187 local states_to_split = Set:new()
188 for state in each(block) do
189 if inverse_transitions[state] then
190 states_to_split:add_collection(inverse_transitions[state]:transitions_for(symbol, properties))
194 -- split blocks
195 local new_twins = {}
196 for state_to_split in each(states_to_split) do
197 for state in each(state_to_split.block) do
198 local dest_state = state:transitions_for(symbol, properties):to_array()[1]
199 if not (dest_state and dest_state.block == block) then
200 if not new_twins[state.block] then
201 local new_twin = Set:new()
202 blocks:add(new_twin)
203 new_twins[state.block] = new_twin
205 new_twins[state.block]:add(state)
210 -- fix work queue according to splits
211 for old_block, new_twin in pairs(new_twins) do
212 for state in each(new_twin) do
213 state.block:remove(state)
214 state.block = new_twin
217 local smaller_block
218 if old_block:count() < new_twin:count() then
219 smaller_block = old_block
220 else
221 smaller_block = new_twin
224 for alphabet_symbol_tuple in each(alphabet) do
225 local alphabet_symbol, alphabet_properties = unpack(alphabet_symbol_tuple)
226 if work_queue_set:contains(tostring(old_block) .. tostring(alphabet_symbol) .. tostring(alphabet_properties)) then
227 work_queue:enqueue({new_twin, alphabet_symbol, alphabet_properties})
228 work_queue_set:add(tostring(new_twin) .. tostring(alphabet_symbol) .. tostring(alphabet_properties))
229 else
230 work_queue:enqueue({smaller_block, alphabet_symbol, alphabet_properties})
231 work_queue_set:add(tostring(smaller_block) .. tostring(alphabet_symbol) .. tostring(alphabet_properties))
237 -- the blocks are the new states
238 local states = {}
239 for block in blocks:each() do
240 states[block] = dfa:new_state()
241 for state in each(block) do
242 if state.final then
243 states[block].final = state.final
248 local minimal_dfa = dfa:new_graph()
249 minimal_dfa.start = states[dfa.start.block]
250 for block in blocks:each() do
251 for state in each(block) do
252 for symbol, dest_state, properties in state:transitions() do
253 states[block]:add_transition(symbol, states[dest_state.block], properties)
258 -- print("Num states: " .. tostring(dfa:states():count()) ..
259 -- ", alphabet size: " .. tostring(#alphabet) ..
260 -- ", num iterations: " .. tostring(num_iterations))
261 return minimal_dfa
265 --[[--------------------------------------------------------------------
267 FA comparison
269 fa_isequal(fa1, fa2): Returns true if the given finite automata are
270 equal, false otherwise.
272 --------------------------------------------------------------------]]--
274 function fa_isequal(fa1, fa2)
275 local equivalent_states={[fa1.start]=fa2.start}
276 local queue = Queue:new(fa1.start)
277 local fa2_seen_states = Set:new()
278 fa2_seen_states:add(fa2.start)
280 while not queue:isempty() do
281 local s1 = queue:dequeue()
282 local s2 = equivalent_states[s1]
284 if s1:num_transitions() ~= s2:num_transitions() then
285 return false
288 if (s1.final or s2.final) then
289 if not (s1.final and s2.final) then
290 return false
293 if type(s1.final) ~= type(s2.final) then
294 return false
297 if type(s1.final) == "table" then
298 if not table_shallow_eql(s1.final, s2.final) then
299 return false
301 elseif s1.final ~= s2.final then
302 return false
306 for edge_val, dest_state in s1:transitions() do
307 local s2_dest_state = s2:dest_state_for(edge_val)
308 if not s2_dest_state then
309 return false
310 elseif equivalent_states[dest_state] then
311 if equivalent_states[dest_state] ~= s2_dest_state then
312 return false
314 elseif fa2_seen_states:contains(s2_dest_state) then
315 -- we have seen this state before, but not as equivalent to
316 -- the dest_state
317 return false
318 else
319 equivalent_states[dest_state] = s2_dest_state
320 fa2_seen_states:add(s2_dest_state)
321 queue:enqueue(dest_state)
326 return true
330 --[[--------------------------------------------------------------------
332 FA longest path
334 fa_longest_path(fa): Returns an integer representing how long the
335 longest path from the start state to a final state can be. Returns
336 math.huge if the graph has cycles.
338 --------------------------------------------------------------------]]--
340 function fa_longest_path(fa)
341 local longest = 0
342 local current_depth = 0
343 local seen = Set:new()
344 function dfs_helper(state)
345 seen:add(state)
346 if state.final and current_depth > longest then
347 longest = current_depth
350 for edge_val, dest_state in state:transitions() do
351 if seen:contains(dest_state) then
352 longest = math.huge
353 else
354 current_depth = current_depth + 1
355 dfs_helper(dest_state)
356 current_depth = current_depth - 1
359 seen:remove(state)
362 dfs_helper(fa.start)
364 return longest