1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
7 Data structure for representing finite automata. Both NFAs and DFAs
8 can be represented using this class.
10 The base class (FA/FAState) has three child classes:
12 - IntFA/IntFAState represents a nonrecursive FA that transitions
13 on IntSets or Epsilon. These represent the machines that recognize
16 - RTN/RTNState represents a recursive transition network: the graph
17 that is built to recognize context-free grammars. These transition
18 on strings, regexes, epsilon, or on another RTN.
20 - GLA/GLAState represents a lookahead automaton that represents
21 LL lookahead information.
23 Either child class can be deterministic or nondeterministic. The only
24 difference is whether there are epsilons / redundant transtions.
26 This all needs to be refactored quite a bit, now that I have the insight
27 of understanding all the different ways NFAs and DFAs are used throughout
30 Copyright (c) 2007-2008 Joshua Haberman. See LICENSE for details.
32 --------------------------------------------------------------------]]--
36 module("fa", package
.seeall
)
38 -- Class for representing special-case edge values that have only one
39 -- instance for the whole program.
40 SingletonEdgeValue
= {name
="SingletonEdgeValue"}
41 function SingletonEdgeValue
:new(name
)
42 self
.singletons
= self
.singletons
or {}
43 -- we index each singleton by name, creating new ones lazily.
44 self
.singletons
[name
] = self
.singletons
[name
] or newobject(self
)
45 return self
.singletons
[name
]
48 -- This is a special edge value that represents a transition that can be
49 -- taken without consuming any input.
50 e
= SingletonEdgeValue
:new("Epsilon")
52 -- This is a special edge value that represents a GLA transition that can
53 -- be taken when EOF is seen.
54 eof
= SingletonEdgeValue
:new("EOF")
57 --[[--------------------------------------------------------------------
59 class State -- base class of FAState and RTNState
61 --------------------------------------------------------------------]]--
63 FAState
= {name
="FAState"}
64 function FAState
:new()
65 local obj
= newobject(self
)
70 function FAState
:child_states()
71 local children
= Set
:new()
72 for edge_value
, target_state
in self
:transitions() do
73 children
:add(target_state
)
78 function FAState
:add_transition(edge_value
, target_state
, edge_properties
)
79 for e_edge_value
, e_target_state
, e_edge_properties
in self
:transitions() do
80 if edge_value
== e_edge_value
and target_state
== e_target_state
81 and e_edge_properties
== edge_properties
then
85 table.insert(self
._transitions
, {edge_value
, target_state
, edge_properties
})
88 function FAState
:num_transitions()
89 return #self
._transitions
92 function FAState
:transitions()
96 if self
._transitions
[i
] then
97 return unpack(self
._transitions
[i
])
104 function FAState
:clear_transitions()
105 self
._transitions
= {}
108 function FAState
:transitions_for(val
, prop
)
110 if prop
== "ANY" then
116 for edge_val
, target_state
, properties
in self
:transitions() do
117 if edge_val
== val
and ((prop
== "ANY") or (prop
== "ALL") or (prop
== properties
)) then
118 if prop
== "ANY" then
119 table.insert(targets
, {target_state
, properties
})
121 targets
:add(target_state
)
128 function FAState
:dest_state_for(val
)
129 local states
= self
:transitions_for(val
, "ANY")
131 error(">1 transition found")
132 elseif #states
== 0 then
136 for dest_state_properties
in each(states
) do
137 dest_state
, properties
= unpack(dest_state_properties
)
144 --[[--------------------------------------------------------------------
146 class FA -- base class of IntFA and RTN
148 --------------------------------------------------------------------]]--
151 function FA
:new(init
)
152 local obj
= newobject(self
)
155 if obj
.new_state
then
156 obj
.start
= init
.start
or obj
:new_state()
157 obj
.final
= init
.final
or obj
:new_state() -- for all but Thompson NFA fragments we ignore this
159 obj
.start
:add_transition(init
.symbol
, obj
.final
, init
.properties
)
160 elseif init
.string then
161 local int_set
= IntSet
:new()
162 local char
= init
.string:sub(1, 1):byte()
163 int_set
:add(Range
:new(char
, char
))
164 local fa
= IntFA
:new
{symbol
=int_set
}
165 for i
=2,#init
.string do
166 int_set
= IntSet
:new()
167 char
= init
.string:sub(i
, i
):byte()
168 int_set
:add(Range
:new(char
, char
))
169 fa
= nfa_construct
.concat(fa
, IntFA
:new
{symbol
=int_set
})
181 return depth_first_traversal(self
.start
, function (s
) return s
:child_states() end)
185 local new_graph
= self
:new_graph()
186 local new_states
= {}
189 for state
in each(self
:states()) do
190 new_states
[state
] = new_states
[state
] or self
:new_state()
191 if self
.start
== state
then new_graph
.start
= new_states
[state
] end
192 if self
.final
== state
then new_graph
.final
= new_states
[state
] end
195 -- duplicate transitions
196 for state
in each(self
:states()) do
197 for edge_val
, target_state
, properties
in state
:transitions() do
198 new_states
[state
]:add_transition(edge_val
, new_states
[target_state
], properties
)
205 --[[--------------------------------------------------------------------
207 class IntFA/IntFAState: Classes for representing machines that recognize
210 --------------------------------------------------------------------]]--
214 function IntFA
:new_graph(init
)
215 return IntFA
:new(init
)
218 function IntFA
:new_state()
219 return IntFAState
:new()
222 function IntFA
:get_outgoing_edge_values(states
)
223 local symbol_sets
= Set
:new()
224 local properties_set
= Set
:new()
225 for state
in each(states
) do
226 for symbol_set
, target_state
, properties
in state
:transitions() do
227 if properties
~= nil then
228 properties_set
:add(properties
)
231 if type(symbol_set
) == "table" and symbol_set
.class
== IntSet
then
232 symbol_sets
:add(symbol_set
)
236 symbol_sets
= equivalence_classes(symbol_sets
)
238 -- for now, just cross symbol sets with properties. a bit wasteful,
239 -- but we'll worry about that later.
241 for set
in each(symbol_sets
) do
242 for properties
in each(properties_set
) do
243 table.insert(values
, {set
, properties
})
245 table.insert(values
, {set
, nil})
251 function IntFA
:to_dot()
253 states
= self
:states():to_array()
254 --table.sort(states, function (a, b) return a.statenum < b.statenum end)
255 for i
,state
in ipairs(states
) do
257 local peripheries
= 1
258 if state
== self
.start
then label
= "Begin" end
259 if state
== self
.final
or state
.final
then
261 label
= label
.. "NEWLINE" .. state
.final
267 label
= label
:gsub("[\"\\]", "\\%1")
268 label
= label
:gsub("NEWLINE", "\\n")
269 str
= str
.. string.format(' "%s" [label="%s", peripheries=%d];\n', tostring(state
), label
, peripheries
)
270 for char
, tostate
, attributes
in state
:transitions() do
274 -- elseif char == "(" then
275 -- print_char = "start capture"
276 -- elseif char == ")" then
277 -- print_char = "end capture"
278 elseif type(char
) == "string" then
280 elseif type(char
) == 'table' and char
.class
== IntSet
then
281 if char
:isunbounded() then char
= char
:invert() end
282 print_char
= char
:toasciistring()
284 print(serialize(char
, 3, true))
285 print_char
= string.char(char
)
287 print_char
= print_char
:gsub("[\"\\]", "\\%1")
288 print_char
= print_char
:gsub("NEWLINE", "\\n")
289 str
= str
.. string.format(' "%s" -> "%s" [label="%s"];\n', tostring(state
), tostring(tostate
), print_char
)
296 IntFAState
= FAState
:new()
297 IntFAState
.name
= "IntFAState"
299 function IntFAState
:add_transition(edge_value
, target_state
, edge_properties
)
300 -- as a special case, IntSet edge_values can be combined if two edge_values
301 -- have the same target_state and neither has any edge_properties.
302 if edge_value
.class
== IntSet
and edge_properties
== nil then
303 for existing_edge_value
, existing_target_state
, existing_edge_properties
in self
:transitions() do
304 if existing_edge_value
.class
== IntSet
and target_state
== existing_target_state
and existing_edge_properties
== nil then
305 existing_edge_value
:add_intset(edge_value
)
310 FAState
.add_transition(self
, edge_value
, target_state
, edge_properties
)
313 function IntFAState
:transitions_for(val
, prop
)
314 local targets
= Set
:new()
315 if type(val
) == "table" and val
.class
== IntSet
then
316 val
= val
:sampleint()
319 for edge_val
, target_state
, properties
in self
:transitions() do
320 if edge_val
== val
or (val
~= fa
.e
and edge_val
.class
== IntSet
and edge_val
:contains(val
)) then
321 if (prop
== "ANY") or (prop
== properties
) then
322 targets
:add(target_state
)
329 --[[--------------------------------------------------------------------
331 class GLA/GLAState: Classes for representing machines that represent
332 lookahead information.
334 --------------------------------------------------------------------]]--
338 function GLA
:new_graph(init
)
342 function GLA
:new_state()
343 return GLAState
:new()
346 function GLA
:get_outgoing_edge_values(states
)
348 for state
in each(states
) do
349 for edge_val
, target_state
, properties
in state
:transitions() do
350 if edge_val
~= fa
.e
then
351 table.insert(values
, {edge_val
, properties
})
358 function GLA
:to_dot(indent
, suffix
)
360 suffix
= suffix
or ""
361 indent
= indent
or ""
362 for state
in each(self
:states()) do
367 if state
.final
[1] == 0 then -- the special value that means "return"
368 extra_label
= "Return"
370 for edge_val
, dest_state
, properties
in self
.rtn_state
:transitions() do
371 if edge_val
== state
.final
[1] and dest_state
== state
.final
[2] then
372 extra_label
= tostring(properties
.slotnum
)
377 if self
.start
== state
then extra_label
= "Start" end
378 str
= str
.. string.format('%s"%s" [label="%s" peripheries=%d]\n',
379 indent
, tostring(state
) .. suffix
, escape(extra_label
), peripheries
)
380 for edge_val
, target_state
in state
:transitions() do
381 if edge_val
== fa
.eof
then
384 str
= str
.. string.format('%s"%s" -> "%s" [label="%s"]\n',
385 indent
, tostring(state
) .. suffix
, tostring(target_state
) .. suffix
,
393 GLAState
= FAState
:new()
394 GLAState
.name
= "GLAState"
395 function GLAState
:new(paths
)
396 local obj
= FAState
:new()
397 obj
.rtn_paths
= paths
400 for path
in each(paths
) do
401 if obj
.lookahead_k
and obj
.lookahead_k
~= path
.lookahead_k
then
402 error("Internal error: expected all paths for the GLA state to have the same length")
404 obj
.lookahead_k
= path
.lookahead_k
413 --[[--------------------------------------------------------------------
415 class RTN/RTNState: Classes for representing machines that represent
416 context-free grammars.
418 --------------------------------------------------------------------]]--
422 function RTN
:new_graph(init
)
426 function RTN
:new_state()
427 return RTNState
:new()
430 function RTN
:get_outgoing_edge_values(states
)
432 for state
in each(states
) do
433 for edge_val
, target_state
, properties
in state
:transitions() do
434 if edge_val
~= fa
.e
then
435 table.insert(values
, {edge_val
, properties
})
443 return str
:gsub("[\"\\]", "\\%1")
446 function RTN
:to_dot(indent
, suffix
, intfas
, glas
)
447 suffix
= suffix
or ""
448 str
= indent
.. "rankdir=LR;\n"
449 -- str = str .. indent .. string.format('label="%s"\n', self.name)
450 for state
in each(self
:states()) do
455 if state
.gla
.longest_path
== 1 then
456 color
= " fillcolor=\"cornflowerblue\""
457 elseif state
.gla
.longest_path
== 2 then
458 color
= " fillcolor=\"gold\""
459 elseif state
.gla
.longest_path
> 2 then
460 color
= " fillcolor=\"firebrick\""
462 color
= color
.. " style=\"filled\""
464 if state
.final
then peripheries
= 2 end
465 if self
.start
== state
then extra_label
= "Start" end
466 if intfas
and state
.intfa
then
467 if extra_label
~= "" then
468 extra_label
= extra_label
.. "\\n"
470 extra_label
= extra_label
.. "I: " .. tostring(intfas
:offset_of(state
.intfa
))
472 if glas
and state
.gla
then
473 if extra_label
~= "" then
474 extra_label
= extra_label
.. "\\n"
476 extra_label
= extra_label
.. "G: " .. tostring(glas
:offset_of(state
.gla
))
478 str
= str
.. string.format('%s"%s" [label="%s" peripheries=%d%s]\n',
479 indent
, tostring(state
) .. suffix
, extra_label
,
481 for edge_val
, target_state
in state
:transitions() do
482 if fa
.is_nonterm(edge_val
) then
483 str
= str
.. string.format('%s"%s" -> "%s" [label="<%s>"]\n',
484 indent
, tostring(state
) .. suffix
, tostring(target_state
) .. suffix
,
485 escape(edge_val
.name
))
487 --if attributes.regex_text[edge_val] then
488 -- edge_val = "/" .. attributes.regex_text[edge_val] .. "/"
490 if edge_val
== fa
.eof
then
493 str
= str
.. string.format('%s"%s" -> "%s" [label="%s"]\n',
494 indent
, tostring(state
) .. suffix
, tostring(target_state
) .. suffix
,
503 RTNState
= FAState
:new()
504 RTNState
.name
= "RTNState"
506 -- A trivial state is one where you can tell just by looking
507 -- at the state's transitions and its final status alone what
508 -- transition you should take for a given terminal.
509 function RTNState
:needs_gla()
511 -- a final state needs a GLA if it has any outgoing transitions
512 if self
:num_transitions() > 0 then
518 -- a nonfinal state needs a GLA if it has more than one
519 -- outgoing transition and either:
520 -- - at least one of those transitions is a nonterminal
521 -- - there are two or more terminal transitions for the same state
522 -- TODO: what about states with exactly 1 outgoing nonterminal
523 -- transition? We don't technically need a GLA's help to
524 -- figure out the right transition.
525 if self
:num_transitions() == 1 then
528 local edge_vals
= Set
:new()
529 for edge_val
in self
:transitions() do
530 if fa
.is_nonterm(edge_val
) then
533 if edge_vals
:contains(edge_val
) then
536 edge_vals
:add(edge_val
)
544 -- In most cases, a state needs an intfa if it doesn't have a GLA,
545 -- but there are a few exceptions. Final states with no transitions
546 -- don't need an intfa. Neither do states that have only one
547 -- nonterminal transition.
548 function RTNState
:needs_intfa()
549 if self
:needs_gla() then
554 elseif self
:num_transitions() == 1 and fa
.is_nonterm(self
._transitions
[1][1]) then
563 NonTerm
= {name
="NonTerm"}
564 function NonTerm
:new(name
)
565 -- keep a cache of nonterm objects, so that we always return the same object
566 -- for a given name. This lets us compare nonterms for equality.
567 if not self
.cache
then
571 if not self
.cache
[name
] then
572 obj
= newobject(self
)
574 self
.cache
[name
] = obj
577 return self
.cache
[name
]
580 function is_nonterm(thing
)
581 return (type(thing
) == "table" and thing
.class
== NonTerm
)