make compiler emit intfa states in order.
[philodendron.git] / compiler / compile.lua
blobcfeac0863cf1b38317cf71d8169891bb48f5db46
2 require "bootstrap/rtn"
3 require "bc"
4 require "bc_constants"
6 --print(serialize(attributes.ignore))
8 function child_edges(edge, stack, grammar, decisions, terminals)
9 if type(edge) == "table" and edge.class == fa.NonTerm then
10 local child_edges = {}
11 for edge_val in grammar[edge.name].start:transitions() do
12 table.insert(child_edges, edge_val)
13 end
14 return child_edges
15 else
16 local str_or_regex
17 if type(edge) == "table" then
18 str_or_regex = edge.properties.string
19 else
20 str_or_regex = edge
21 end
23 table.insert(terminals, str_or_regex)
25 local decision_stack = stack:to_array()
26 if #decision_stack > 1 then
27 decisions[str_or_regex] = stack:to_array()
28 end
29 end
30 end
32 -- require "sketches/regex_debug"
33 -- require "sketches/pp"
35 TerminalTransition = {name="TerminalTransition", order=1}
36 NontermTransition = {name="NontermTransition", order=2}
37 Decision = {name="Decision", order=3}
39 function load_grammar(file)
40 -- First read grammar file
42 local grm = io.open(file, "r")
43 local grm_str = grm:read("*a")
45 local grammar, attributes = parse_grammar(CharStream:new(grm_str))
47 -- First, determine what terminals (if any) conflict with each other.
48 -- In this context, "conflict" means that a string of characters can
49 -- be interpreted as one or more terminals.
50 local conflicts = {}
52 local nfas = {}
53 for name, terminal in pairs(attributes.terminals) do
54 if type(terminal) == "string" then
55 terminal = fa.IntFA:new{string=terminal}
56 end
57 table.insert(nfas, {terminal, name})
58 end
59 local uber_dfa = nfas_to_dfa(nfas, true)
60 for state in each(uber_dfa:states()) do
61 if type(state.final) == "table" then -- more than one terminal ended in this state
62 for term1 in each(state.final) do
63 for term2 in each(state.final) do
64 if term1 ~= term2 then
65 conflicts[term1] = conflicts[term1] or Set:new()
66 conflicts[term1]:add(term2)
67 end
68 end
69 end
70 end
71 end
72 end
74 -- For each state in the grammar, create (or reuse) a DFA to run
75 -- when we hit that state.
76 local dfas = {}
78 function has_conflicts(conflicts, dfa, terminals)
79 for term in each(terminals) do
80 if conflicts[term] then
81 for conflict in each(conflicts[term]) do
82 if dfa:contains(conflict) then
83 return true
84 end
85 end
86 end
87 end
88 end
90 for nonterm, rtn in pairs(grammar) do
91 -- print(nonterm)
92 -- print(rtn)
93 for state in each(rtn:states()) do
94 local terminals = {}
95 function my_child_edges(edge, stack)
96 return child_edges(edge, stack, grammar, state.decisions, terminals)
97 end
99 state.decisions = {}
100 if state:num_transitions() > 0 then
101 for edge_val, target_state in state:transitions() do
102 depth_first_traversal(edge_val, my_child_edges)
105 -- print("Inside " .. nonterm .. ", state=" .. tostring(state) .. "...")
106 -- print(serialize(decisions))
108 -- We now have a list of terminals we want to find when we are in this RTN
109 -- state. Now get a DFA that will match all of them, either by creating
110 -- a new DFA or by finding an existing one that will work (without conflicting
111 -- with any of our terminals).
112 local found_dfa = false
113 for i, dfa in ipairs(dfas) do
114 -- will this dfa do? it will if none of our terminals conflict with any of the
115 -- existing terminals in this dfa.
116 -- (we can probably compute this faster by pre-computing equivalence classes)
117 if not has_conflicts(conflicts, dfa, terminals) then
118 found_dfa = i
119 break
123 if found_dfa == false then
124 new_dfa = Set:new()
125 table.insert(dfas, new_dfa)
126 found_dfa = #dfas
129 -- add all the terminals for this state to the dfa we found
130 for term in each(terminals) do
131 dfas[found_dfa]:add(term)
134 state.dfa = found_dfa
139 local real_dfas = {}
140 for dfa in each(dfas) do
141 local nfas = {}
142 for term in each(dfa) do
143 local target = attributes.terminals[term]
144 if type(target) == "string" then
145 target = fa.IntFA:new{string=target}
147 table.insert(nfas, {target, term})
149 local real_dfa = hopcroft_minimize(nfas_to_dfa(nfas))
150 table.insert(real_dfas, real_dfa)
153 return grammar, attributes, real_dfas
156 function write_grammar(infilename, outfilename)
157 local grammar, attributes, intfas = load_grammar(infilename)
159 -- write Bitcode header
160 bc_file = bc.File:new(outfilename, "GH")
162 -- Enter a BLOCKINFO record to define abbreviations for all our records.
163 -- See FILEFORMAT for a description of what all the record types mean.
164 bc_file:enter_subblock(bc.BLOCKINFO)
166 -- IntFA abbreviations
167 bc_file:write_unabbreviated_record(bc.SETBID, BC_INTFA)
169 bc_intfa_final_state = bc_file:define_abbreviation(4,
170 bc.LiteralOp:new(BC_INTFA_FINAL_STATE),
171 bc.VBROp:new(5),
172 bc.VBROp:new(5))
174 bc_intfa_state = bc_file:define_abbreviation(5,
175 bc.LiteralOp:new(BC_INTFA_STATE),
176 bc.VBROp:new(5))
178 bc_intfa_transition = bc_file:define_abbreviation(6,
179 bc.LiteralOp:new(BC_INTFA_TRANSITION),
180 bc.VBROp:new(8),
181 bc.VBROp:new(6))
183 bc_intfa_transition_range = bc_file:define_abbreviation(7,
184 bc.LiteralOp:new(BC_INTFA_TRANSITION_RANGE),
185 bc.VBROp:new(8),
186 bc.VBROp:new(8),
187 bc.VBROp:new(6))
189 -- Strings abbreviations
190 bc_file:write_unabbreviated_record(bc.SETBID, BC_STRINGS)
192 bc_string = bc_file:define_abbreviation(4,
193 bc.LiteralOp:new(BC_STRING),
194 bc.ArrayOp:new(bc.FixedOp:new(7)))
196 -- RTN abbreviations
197 bc_file:write_unabbreviated_record(bc.SETBID, BC_RTN)
199 bc_rtn_info = bc_file:define_abbreviation(4,
200 bc.LiteralOp:new(BC_RTN_INFO),
201 bc.VBROp:new(6),
202 bc.VBROp:new(4))
204 bc_rtn_state = bc_file:define_abbreviation(5,
205 bc.LiteralOp:new(BC_RTN_STATE),
206 bc.VBROp:new(4),
207 bc.VBROp:new(4),
208 bc.FixedOp:new(1))
210 bc_rtn_transition_terminal = bc_file:define_abbreviation(6,
211 bc.LiteralOp:new(BC_RTN_TRANSITION_TERMINAL),
212 bc.VBROp:new(6),
213 bc.VBROp:new(5),
214 bc.VBROp:new(5),
215 bc.VBROp:new(4))
217 bc_rtn_transition_nonterm = bc_file:define_abbreviation(7,
218 bc.LiteralOp:new(BC_RTN_TRANSITION_NONTERM),
219 bc.VBROp:new(6),
220 bc.VBROp:new(5),
221 bc.VBROp:new(5),
222 bc.VBROp:new(4))
224 bc_rtn_ignore = bc_file:define_abbreviation(8,
225 bc.LiteralOp:new(BC_RTN_IGNORE),
226 bc.VBROp:new(6))
228 bc_rtn_decision = bc_file:define_abbreviation(9,
229 bc.LiteralOp:new(BC_RTN_DECISION),
230 bc.VBROp:new(6),
231 bc.ArrayOp:new(bc.VBROp:new(4)))
233 bc_file:end_subblock(bc.BLOCKINFO)
235 print(string.format("Writing grammar to disk..."))
237 local strings = {}
238 local string_offsets = {}
240 -- gather a list of all the strings from intfas
241 for intfa in each(intfas) do
242 for state in each(intfa:states()) do
243 if state.final and not string_offsets[state.final] then
244 string_offsets[state.final] = #strings
245 table.insert(strings, state.final)
250 -- build an ordered list of RTNs and gather the strings from them
251 local rtns = {{attributes.start, grammar[attributes.start]}}
252 local rtns_offsets = {}
253 rtns_offsets[grammar[attributes.start]] = 0
254 for name, rtn in pairs(grammar) do
255 if name ~= attributes.start then
256 rtns_offsets[rtn] = #rtns
257 if not string_offsets[name] then
258 string_offsets[name] = #strings
259 table.insert(strings, name)
262 table.insert(rtns, {name, rtn})
264 for rtn_state in each(rtn:states()) do
265 for edge_val, target_state, properties in rtn_state:transitions() do
266 if properties and not string_offsets[properties.name] then
267 string_offsets[properties.name] = #strings
268 table.insert(strings, properties.name)
275 -- emit the strings
276 print(string.format("Writing %d strings...", #strings))
277 bc_file:enter_subblock(BC_STRINGS)
278 for string in each(strings) do
279 bc_file:write_abbreviated_record(bc_string, string)
281 bc_file:end_subblock(BC_STRINGS)
283 -- emit the intfas
284 print(string.format("Writing %d IntFAs...", #intfas))
285 bc_file:enter_subblock(BC_INTFAS)
286 for intfa in each(intfas) do
287 bc_file:enter_subblock(BC_INTFA)
288 local intfa_state_offsets = {}
289 local intfa_transitions = {}
291 -- make sure the start state is emitted first
292 local states = intfa:states()
293 states:remove(intfa.start)
294 states = states:to_array()
295 table.insert(states, 1, intfa.start)
296 for i, state in ipairs(states) do
297 intfa_state_offsets[state] = i - 1
298 local this_state_transitions = {}
299 for edge_val, target_state, properties in state:transitions() do
300 for range in edge_val:each_range() do
301 table.insert(this_state_transitions, {range, target_state})
304 table.sort(this_state_transitions, function (a, b) return a[1].low < b[1].low end)
305 for t in each(this_state_transitions) do table.insert(intfa_transitions, t) end
306 if state.final then
307 bc_file:write_abbreviated_record(bc_intfa_final_state, #this_state_transitions, string_offsets[state.final])
308 else
309 bc_file:write_abbreviated_record(bc_intfa_state, #this_state_transitions)
313 print(string.format(" %d states, %d transitions", #states, #intfa_transitions))
315 for transition in each(intfa_transitions) do
316 local range, target_state = unpack(transition)
317 target_state_offset = intfa_state_offsets[target_state]
318 if range.low == range.high then
319 bc_file:write_abbreviated_record(bc_intfa_transition, range.low, target_state_offset)
320 else
321 if range.high == math.huge then range.high = 255 end -- temporary ASCII-specific hack
322 bc_file:write_abbreviated_record(bc_intfa_transition_range, range.low, range.high, target_state_offset)
326 bc_file:end_subblock(BC_INTFA)
328 bc_file:end_subblock(BC_INTFAS)
330 -- create a sorted list of transitions out of each RTN state, for all RTNs
331 local rtn_state_transitions = {}
332 for name_rtn_pair in each(rtns) do
333 local name, rtn = unpack(name_rtn_pair)
334 for rtn_state in each(rtn:states()) do
335 local this_state_transitions = {}
336 for edge_val, target_state, properties in rtn_state:transitions() do
337 if type(edge_val) == "table" and edge_val.class == fa.NonTerm then
338 table.insert(this_state_transitions, {NontermTransition, rtn_state, {edge_val, target_state, properties}})
339 else
340 table.insert(this_state_transitions, {TerminalTransition, rtn_state, {edge_val, target_state, properties}})
343 for terminal, stack in pairs(rtn_state.decisions) do
344 table.insert(this_state_transitions, {Decision, rtn_state, {terminal, stack}})
347 table.sort(this_state_transitions, function (a, b) return a[1].order < b[1].order end)
348 rtn_state_transitions[rtn_state] = this_state_transitions
352 -- emit the RTNs
353 bc_file:enter_subblock(BC_RTNS)
354 print(string.format("Writing %d RTNs...", #rtns))
355 for name_rtn_pair in each(rtns) do
356 local name, rtn = unpack(name_rtn_pair)
357 bc_file:enter_subblock(BC_RTN)
358 bc_file:write_abbreviated_record(bc_rtn_info, string_offsets[name], attributes.slot_counts[name]-1)
360 if attributes.ignore[name] then
361 for ign_terminal in each(attributes.ignore[name]) do
362 bc_file:write_abbreviated_record(bc_rtn_ignore, string_offsets[ign_terminal])
366 local rtn_states = {}
367 local rtn_state_offsets = {}
368 local rtn_transitions = {}
370 -- make sure the start state is emitted first
371 local states = rtn:states()
372 states:remove(rtn.start)
373 states = states:to_array()
374 table.insert(states, 1, rtn.start)
376 -- emit states
377 for i, rtn_state in pairs(states) do
378 rtn_state_offsets[rtn_state] = i - 1
379 local is_final = 0
380 if rtn_state.final then is_final = 1 end
381 -- states don't have an associated DFA if there are no outgoing transitions
382 rtn_state.dfa = rtn_state.dfa or 1
383 bc_file:write_abbreviated_record(bc_rtn_state, #rtn_state_transitions[rtn_state], rtn_state.dfa - 1, is_final)
384 for t in each(rtn_state_transitions[rtn_state]) do
385 table.insert(rtn_transitions, t)
389 print(string.format(" %s: %d states, %d transitions", name, #states, #rtn_transitions))
391 -- emit transitions
392 for transition in each(rtn_transitions) do
393 local transition_type, src_state, data = unpack(transition)
394 if transition_type == Decision then
395 local terminal, stack = unpack(data)
396 local stack_str = ""
397 for i, step in ipairs(stack) do
398 --print(string.format("Trying to find %s, %d transitions to consider", serialize(step), #rtn_state_transitions[src_state]))
399 for j, transition2 in ipairs(rtn_state_transitions[src_state]) do
400 local transition_type2, src_state2, data2 = unpack(transition2)
401 local edge_val, target_state, properties = unpack(data2)
402 --print(string.format("Is it %s?", serialize(edge_val)))
403 if edge_val == step then
404 --print("Transition found!")
405 stack_str = stack_str .. string.char(j-1)
406 if i < #stack then
407 src_state = grammar[edge_val.name].start
409 break
411 if j == #rtn_state_transitions[src_state] then
412 error(string.format("Transition not found! Step=%s", step))
416 -- print(serialize(stack))
417 bc_file:write_abbreviated_record(bc_rtn_decision, string_offsets[terminal], stack_str)
418 else
419 local edge_val, target_state, properties = unpack(data)
420 target_state_offset = rtn_state_offsets[target_state]
421 if transition_type == TerminalTransition then
422 bc_file:write_abbreviated_record(bc_rtn_transition_terminal, string_offsets[edge_val],
423 rtn_state_offsets[target_state], string_offsets[properties.name],
424 properties.slotnum-1)
425 else
426 bc_file:write_abbreviated_record(bc_rtn_transition_nonterm, rtns_offsets[grammar[edge_val.name]],
427 rtn_state_offsets[target_state], string_offsets[properties.name],
428 properties.slotnum-1)
433 bc_file:end_subblock(BC_RTN)
435 bc_file:end_subblock(BC_RTNS)
438 write_grammar(arg[1], arg[2])