Lookahead improvements: EOF handling and better ambiguity detection.
[gazelle.git] / compiler / intfa_combine.lua
blobdab1f9381f7e73584710f284fa5106c4c8d25087
1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
5 intfa_combine.lua
7 Once the lookahead has been calculated, we know what terminal(s)
8 each RTN/GLA state is expecting to see. We use this information to
9 build IntFAs that recognize any possible valid token that could
10 occur at this point in the input. We combine and reuse DFAs as much
11 as possible -- only when two terminals conflict is it necessary to
12 use different DFAs.
14 Copyright (c) 2007 Joshua Haberman. See LICENSE for details.
16 --------------------------------------------------------------------]]--
18 -- Determine what terminals (if any) conflict with each other.
19 -- In this context, "conflict" means that a string of characters can
20 -- be interpreted as one or more terminals.
21 function analyze_conflicts(terminals)
22 -- We detect conflicts by combining all the NFAs into a single DFA.
23 -- We then observe what states are final to more than one terminal.
24 local conflicts = {}
25 local nfas = {}
26 for name, terminal in pairs(terminals) do
27 if type(terminal) == "string" then
28 terminal = fa.IntFA:new{string=terminal}
29 end
30 table.insert(nfas, {terminal, name})
31 end
33 local uber_dfa = nfas_to_dfa(nfas, true)
34 for state in each(uber_dfa:states()) do
35 if type(state.final) == "table" then -- more than one terminal ended in this state
36 for term1 in each(state.final) do
37 for term2 in each(state.final) do
38 if term1 ~= term2 then
39 conflicts[term1] = conflicts[term1] or Set:new()
40 conflicts[term1]:add(term2)
41 end
42 end
43 end
44 end
45 end
47 return conflicts
48 end
50 function has_conflicts(conflicts, term_set1, term_set2)
51 for term1 in each(term_set1) do
52 if conflicts[term1] then
53 for conflict in each(conflicts[term1]) do
54 if term_set2:contains(conflict) then
55 return true, term1, conflict
56 end
57 end
58 end
59 end
60 end
62 function create_or_reuse_termset_for(terminals, conflicts, termsets, nonterm)
63 if has_conflicts(conflicts, terminals, terminals) then
64 local has_conflict, c1, c2 = has_conflicts(conflicts, terminals, terminals)
65 error(string.format("Can't build DFA inside %s, because terminals %s and %s conflict",
66 nonterm, c1, c2))
67 end
69 local found_termset = false
70 for i, termset in ipairs(termsets) do
71 -- will this termset do? it will if none of our terminals conflict with any of the
72 -- existing terminals in this set.
73 -- (we can probably compute this faster by pre-computing equivalence classes)
74 if not has_conflicts(conflicts, termset, terminals) then
75 found_termset = i
76 break
77 end
78 end
80 if found_termset == false then
81 local new_termset = Set:new()
82 table.insert(termsets, new_termset)
83 found_termset = #termsets
84 end
86 -- add all the terminals for this phase of lookahead to the termset we found
87 for term in each(terminals) do
88 termsets[found_termset]:add(term)
89 end
91 return found_termset
92 end
94 function intfa_combine(all_terminals, state_term_pairs)
95 local conflicts = analyze_conflicts(all_terminals)
97 -- For each state in the grammar, create (or reuse) a DFA to run
98 -- when we hit that state.
99 local termsets = {}
100 local intfa_nums = {}
101 for state_term_pair in each(state_term_pairs) do
102 local state, terms = unpack(state_term_pair)
103 intfa_nums[state] = create_or_reuse_termset_for(terms, conflicts, termsets)
106 local dfas = OrderedSet:new()
107 for termset in each(termsets) do
108 local nfas = {}
109 for term in each(termset) do
110 local target = all_terminals[term]
111 if type(target) == "string" then
112 target = fa.IntFA:new{string=target}
114 table.insert(nfas, {target, term})
116 local dfa = hopcroft_minimize(nfas_to_dfa(nfas))
117 dfa.termset = termset
118 dfas:add(dfa)
121 for state, intfa_num in pairs(intfa_nums) do
122 state.intfa = dfas:element_at(intfa_num)
125 dfas:sort(function (a, b) return b.termset:count() < a.termset:count() end)
127 return dfas
130 -- vim:et:sts=2:sw=2