Many changes to get RTNs and IntFAs properly emitted into bytecode again.
[gazelle.git] / compiler / bytecode.lua
blob08ec7e366cb9a46bceef1094222103c2e61aec86
1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
5 bytecode.lua
7 Code that takes the final optimized parsing structures and emits them
8 to bytecode (in Bitcode format).
10 Copyright (c) 2007 Joshua Haberman. See LICENSE for details.
12 --------------------------------------------------------------------]]--
14 require "bc"
16 -- See FILEFORMAT for details about what these constants mean
17 BC_INTFAS = 8
18 BC_INTFA = 9
19 BC_STRINGS = 10
20 BC_RTNS = 11
21 BC_RTN = 12
22 BC_GLAS = 13
23 BC_GLA = 14
25 BC_INTFA_STATE = 0
26 BC_INTFA_FINAL_STATE = 1
27 BC_INTFA_TRANSITION = 2
28 BC_INTFA_TRANSITION_RANGE = 3
30 BC_STRING = 0
32 BC_RTN_INFO = 0
33 BC_RTN_STATE = 1
34 BC_RTN_STATE_WITH_GLA = 4
35 BC_RTN_TRIVIAL_FINAL_STATE = 6
36 BC_RTN_TRANSITION_TERMINAL = 2
37 BC_RTN_TRANSITION_NONTERM = 3
38 BC_RTN_IGNORE = 5
40 BC_GLA_STATE = 0
41 BC_GLA_FINAL_STATE = 1
42 BC_GLA_TRANSITION = 2
45 function write_bytecode(grammar, outfilename)
46 -- write Bitcode header
47 bc_file = bc.File:new(outfilename, "GH")
48 abbrevs = define_abbrevs(bc_file)
50 print(string.format("Writing grammar to disk..."))
52 -- Obtain linearized representations of all the DFAs from the Grammar object.
53 local strings = grammar:get_strings()
54 local rtns = grammar:get_flattened_rtn_list()
55 local glas = grammar:get_flattened_gla_list()
56 local intfas = grammar.master_intfas
58 -- emit the strings
59 print(string.format("Writing %d strings...", strings:count()))
60 bc_file:enter_subblock(BC_STRINGS)
61 for string in each(strings) do
62 bc_file:write_abbreviated_record(abbrevs.bc_string, string)
63 end
64 bc_file:end_subblock(BC_STRINGS)
66 -- emit the intfas
67 print(string.format("Writing %d IntFAs...", intfas:count()))
68 bc_file:enter_subblock(BC_INTFAS)
69 for intfa in each(intfas) do
70 emit_intfa(intfa, strings, bc_file, abbrevs)
71 end
72 bc_file:end_subblock(BC_INTFAS)
74 -- emit the GLAs
75 print(string.format("Writing %d GLAs...", glas:count()))
76 bc_file:enter_subblock(BC_GLAS)
77 for gla in each(glas) do
78 emit_gla(gla, bc_file, abbrevs)
79 end
80 bc_file:end_subblock(BC_GLAS)
82 -- emit the RTNs
83 bc_file:enter_subblock(BC_RTNS)
84 print(string.format("Writing %d RTNs...", rtns:count()))
85 for name, rtn in each(rtns) do
86 emit_rtn(name, rtn, rtns, glas, intfas, strings, bc_file, abbrevs)
87 end
88 bc_file:end_subblock(BC_RTNS)
90 end
93 function emit_intfa(intfa, strings, bc_file, abbrevs)
94 bc_file:enter_subblock(BC_INTFA)
96 local intfa_state_offsets = {}
97 local intfa_transitions = {}
99 -- order the states such that the start state is emitted first
100 local states = intfa:states()
101 states:remove(intfa.start)
102 states = states:to_array()
103 table.insert(states, 1, intfa.start)
105 -- do a first pass over the states that records their order and builds
106 -- each state's list of transitions.
107 local state_transitions = {}
108 for i, state in ipairs(states) do
109 intfa_state_offsets[state] = i - 1
111 state_transitions[state] = {}
112 for edge_val, target_state, properties in state:transitions() do
113 for range in edge_val:each_range() do
114 table.insert(state_transitions[state], {range, target_state})
118 -- sort the transitions into a stable order, to make the output
119 -- more deterministic.
120 table.sort(state_transitions[state], function (a, b) return a[1].low < b[1].low end)
122 -- add this state's transitions to the global list of transitions for the IntFA
123 for t in each(state_transitions[state])
124 do table.insert(intfa_transitions, t)
128 print(string.format(" %d states, %d transitions", #states, #intfa_transitions))
130 -- emit the states
131 for state in each(states) do
132 if state.final then
133 bc_file:write_abbreviated_record(abbrevs.bc_intfa_final_state,
134 #state_transitions[state],
135 strings:offset_of(state.final))
136 else
137 bc_file:write_abbreviated_record(abbrevs.bc_intfa_state, #state_transitions[state])
141 -- emit the transitions
142 for transition in each(intfa_transitions) do
143 local range, target_state = unpack(transition)
144 target_state_offset = intfa_state_offsets[target_state]
145 if range.low == range.high then
146 bc_file:write_abbreviated_record(abbrevs.bc_intfa_transition, range.low, target_state_offset)
147 else
148 if range.high == math.huge then range.high = 255 end -- temporary ASCII-specific hack
149 bc_file:write_abbreviated_record(abbrevs.bc_intfa_transition_range, range.low, range.high, target_state_offset)
153 bc_file:end_subblock(BC_INTFA)
156 function emit_gla(gla, strings, bc_file, abbrevs)
157 bc_file:enter_subblock(BC_GLA)
159 local states = OrderedSet:new()
160 states:add(gla.start)
161 for state in each(gla:states()) do
162 if state ~= gla.start then
163 states:add(state)
167 for state in each(states) do
170 bc_file:end_subblock(BC_GLA)
173 function emit_rtn(name, rtn, rtns, glas, intfas, strings, bc_file, abbrevs)
174 -- emit RTN name and ignore info
175 bc_file:enter_subblock(BC_RTN)
176 bc_file:write_abbreviated_record(abbrevs.bc_rtn_info, strings:offset_of(name), rtn.slot_count)
178 if rtn.ignore then
179 for ign_terminal in each(rtn.ignore) do
180 bc_file:write_abbreviated_record(abbrevs.bc_rtn_ignore, strings:offset_of(ign_terminal))
184 -- emit states
185 for state in each(rtn.states) do
186 local is_final
187 if state.final then
188 is_final = 1
189 else
190 is_final = 0
193 if state.gla then
194 bc_file:write_abbreviated_record(abbrevs.bc_rtn_state_with_gla,
195 #rtn.transitions[state],
196 glas:offset_of(state.gla),
197 is_final)
198 elseif state.intfa then
199 bc_file:write_abbreviated_record(abbrevs.bc_rtn_state,
200 #rtn.transitions[state],
201 intfas:offset_of(state.intfa),
202 is_final)
203 else
204 bc_file:write_abbreviated_record(abbrevs.bc_rtn_trivial_final_state)
208 -- emit transitions
209 for state in each(rtn.states) do
210 for transition in each(rtn.transitions[state]) do
211 local edge_val, dest_state, properties = unpack(transition)
212 if fa.is_nonterm(edge_val) then
213 bc_file:write_abbreviated_record(abbrevs.bc_rtn_transition_nonterm,
214 rtns:offset_of_key(edge_val.name),
215 rtn.states:offset_of(dest_state),
216 strings:offset_of(properties.name),
217 properties.slotnum-1)
218 else
219 bc_file:write_abbreviated_record(abbrevs.bc_rtn_transition_terminal,
220 strings:offset_of(edge_val),
221 rtn.states:offset_of(dest_state),
222 strings:offset_of(properties.name),
223 properties.slotnum-1)
228 bc_file:end_subblock(BC_RTN)
232 function define_abbrevs(bc_file)
233 abbrevs = {}
235 -- Enter a BLOCKINFO record to define abbreviations for all our records.
236 -- See FILEFORMAT for a description of what all the record types mean.
237 bc_file:enter_subblock(bc.BLOCKINFO)
239 -- IntFA abbreviations
240 bc_file:write_unabbreviated_record(bc.SETBID, BC_INTFA)
242 abbrevs.bc_intfa_final_state = bc_file:define_abbreviation(4,
243 bc.LiteralOp:new(BC_INTFA_FINAL_STATE),
244 bc.VBROp:new(5),
245 bc.VBROp:new(5))
247 abbrevs.bc_intfa_state = bc_file:define_abbreviation(5,
248 bc.LiteralOp:new(BC_INTFA_STATE),
249 bc.VBROp:new(5))
251 abbrevs.bc_intfa_transition = bc_file:define_abbreviation(6,
252 bc.LiteralOp:new(BC_INTFA_TRANSITION),
253 bc.VBROp:new(8),
254 bc.VBROp:new(6))
256 abbrevs.bc_intfa_transition_range = bc_file:define_abbreviation(7,
257 bc.LiteralOp:new(BC_INTFA_TRANSITION_RANGE),
258 bc.VBROp:new(8),
259 bc.VBROp:new(8),
260 bc.VBROp:new(6))
262 -- Strings abbreviations
263 bc_file:write_unabbreviated_record(bc.SETBID, BC_STRINGS)
265 abbrevs.bc_string = bc_file:define_abbreviation(4,
266 bc.LiteralOp:new(BC_STRING),
267 bc.ArrayOp:new(bc.FixedOp:new(7)))
269 -- RTN abbreviations
270 bc_file:write_unabbreviated_record(bc.SETBID, BC_RTN)
272 abbrevs.bc_rtn_info = bc_file:define_abbreviation(4,
273 bc.LiteralOp:new(BC_RTN_INFO),
274 bc.VBROp:new(6),
275 bc.VBROp:new(4))
277 abbrevs.bc_rtn_state = bc_file:define_abbreviation(5,
278 bc.LiteralOp:new(BC_RTN_STATE),
279 bc.VBROp:new(4),
280 bc.VBROp:new(4),
281 bc.FixedOp:new(1))
283 abbrevs.bc_rtn_state_with_gla = bc_file:define_abbreviation(6,
284 bc.LiteralOp:new(BC_RTN_STATE_WITH_GLA),
285 bc.VBROp:new(4),
286 bc.VBROp:new(4),
287 bc.FixedOp:new(1))
289 abbrevs.bc_rtn_trivial_final_state = bc_file:define_abbreviation(7,
290 bc.LiteralOp:new(BC_RTN_TRIVIAL_FINAL_STATE))
292 abbrevs.bc_rtn_transition_terminal = bc_file:define_abbreviation(8,
293 bc.LiteralOp:new(BC_RTN_TRANSITION_TERMINAL),
294 bc.VBROp:new(6),
295 bc.VBROp:new(5),
296 bc.VBROp:new(5),
297 bc.VBROp:new(4))
299 abbrevs.bc_rtn_transition_nonterm = bc_file:define_abbreviation(9,
300 bc.LiteralOp:new(BC_RTN_TRANSITION_NONTERM),
301 bc.VBROp:new(6),
302 bc.VBROp:new(5),
303 bc.VBROp:new(5),
304 bc.VBROp:new(4))
306 abbrevs.bc_rtn_ignore = bc_file:define_abbreviation(10,
307 bc.LiteralOp:new(BC_RTN_IGNORE),
308 bc.VBROp:new(6))
310 -- GLA abbreviations
311 bc_file:write_unabbreviated_record(bc.SETBID, BC_GLA)
313 abbrevs.bc_gla_state = bc_file:define_abbreviation(4,
314 bc.LiteralOp:new(BC_GLA_STATE),
315 bc.VBROp:new(4),
316 bc.VBROp:new(4))
318 abbrevs.bc_final_state = bc_file:define_abbreviation(5,
319 bc.LiteralOp:new(BC_GLA_FINAL_STATE),
320 bc.FixedOp:new(1),
321 bc.ArrayOp:new(bc.VBROp:new(4)))
323 abbrevs.bc_gla_transition = bc_file:define_abbreviation(6,
324 bc.LiteralOp:new(BC_GLA_TRANSITION),
325 bc.VBROp:new(4),
326 bc.VBROp:new(4))
328 bc_file:end_subblock(bc.BLOCKINFO)
330 return abbrevs
333 -- vim:et:sts=2:sw=2