1 # Copyright (c) 2015-2018 Paul Sokolovsky
3 # This program is free software: you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation, either version 3 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 """Transformation passes on CFG"""
import copy
import logging
from collections import defaultdict

from cfgutils import *
from xform_expr import *
from xform_inst import *
from xform_bblock import *
from utils import set_union
# Module-level logger shared by all transformation passes below.
_log = logging.getLogger(__name__)
def check_prop(cfg, prop_name, err_msg):
    """Assert that a dataflow property is present on the CFG's entry node.

    Passes call this to declare their analysis prerequisites (e.g. that
    reaching-definitions or live-variable info was computed first).

    :param cfg: control flow graph to check
    :param prop_name: node property name expected on the entry node
    :param err_msg: message to report if the property is missing
    :raises AssertionError: if the entry node lacks prop_name
    """
    entry_addr = cfg.entry()
    if prop_name not in cfg[entry_addr]:
        # The guard's body was elided in this copy of the source; err_msg is
        # otherwise unused, so the missing line must report it. Fail loudly.
        raise AssertionError(err_msg)
def find_1st_def(cfg, reg, in_bblock=False):
    """Return the address of the first basic block defining *reg*.

    Walks blocks in cfg.nodes_props() order and returns the ``.addr`` of
    the first bblock whose defs() contain reg.

    NOTE(review): in_bblock is not consulted in the visible code - possibly
    truncated upstream; confirm intended semantics.

    :raises NotImplementedError: if no block defines reg
    """
    for _, node_info in cfg.nodes_props():
        bblock = node_info["val"]
        if reg in bblock.defs():
            return bblock.addr
    raise NotImplementedError
def number_postorder(cfg):
    """Number CFG nodes in postorder from the entry.

    Delegates to the graph's own numbering; later passes read the
    resulting per-node "postno" property (see remove_unreachable_nodes).
    """
    cfg.number_postorder()
def number_postorder_from_exit(cfg):
    """Number CFG nodes in postorder walking backwards from the "_EXIT_" node.

    Nodes with no path to exit stay unnumbered; cfg_infloops_exit relies
    on that via the "postno_exit" property.
    """
    cfg.number_postorder_from_exit("_EXIT_")
def remove_trailing_jumps(cfg):
    """Remove trailing jump instructions from each basic block.

    Per-block work is done by remove_trailing_jumps_bblock(); the fact that
    trailing jumps are gone is recorded in cfg.props["trailing_jumps"].

    NOTE(review): this copy of the source looks truncated - remove_returns
    is always False here; confirm whether a condition setting it was elided.
    """
    remove_returns = False
    foreach_bblock(cfg, remove_trailing_jumps_bblock, remove_returns=remove_returns)
    cfg.props["trailing_jumps"] = False
def remove_unreachable_entries(cfg):
    # Remove disconnected graph nodes.
    # NOTE(review): this function appears heavily truncated in this copy -
    # the loop binding `e` over entries, the guard's body, and the body of
    # remove_component() are all missing. Restore from upstream before use.
    entries = cfg.entries()
    if e == cfg.props["addr"]:
    def remove_component(e):
def remove_unreachable_nodes(cfg):
    """Remove CFG nodes not reachable from entry.

    Requires the postorder numbering pass: a node whose "postno" is None
    was never visited from the entry and is therefore unreachable.
    """
    assert "postno" in cfg[cfg.first_node], "Need number_postorder"
    # Snapshot the node list first so removal never mutates what we iterate.
    dead = [node for node, info in list(cfg.nodes_props()) if info["postno"] is None]
    for node in dead:
        cfg.remove_node(node)
# Remove any jumps to jumps, replacing destination of first jump
# to be destination of 2nd.
# This "simplifies" the graph, but may make it less structured. This is a
# useful transformation for generating machine code, but for decompilation,
# it actually makes sense to add some extra jump landing sites to make
# loops more structured.
#
# http://en.wikipedia.org/wiki/Jump_threading
def remove_jump_over_jump(cfg):
    """Jump threading: retarget predecessors of pure-jump nodes.

    An empty, non-entry node with exactly one successor only transfers
    control, so its predecessors can point straight at its successor.
    """
    for v in cfg.nodes():
        # If node is not entry, has a single exit and empty
        if cfg.degree_in(v) > 0 and cfg.degree_out(v) == 1 and not cfg.node(v)["val"].items:
            cfg.move_pred(v, cfg.succ(v)[0])
            # NOTE(review): the node itself is never removed in the visible
            # code even though the log says "removed" - a removal line looks
            # elided in this copy; confirm against upstream.
            _log.info("jump_over_jump: removed node: %s", v)
121 # If possible, make a single back-edge to a loop header, by introducing
122 # intermediate jump landing nodes.
def loop_single_entry(cfg):
    # NOTE(review): this pass appears truncated in this copy - `preds` and
    # `p` are referenced but never bound, one guard has no body, and the
    # loops over predecessors are missing. Documented as-is; restore from
    # upstream before relying on it.
    for v in cfg.nodes():
        if cfg.degree_in(v) >= 2:
            # Predecessors at addresses >= v are treated as back-edges.
            back_preds = list(filter(lambda x: v <= x, preds))
            if len(back_preds) < 2:
            _log.info("loop_single_entry: node: %s", v)
            _log.info("back_preds: %s", back_preds)
            # Back-edge predecessors ending in an unconditional jump.
            back_jumps = list(filter(lambda x: cfg.degree_out(x) == 1, back_preds))
            _log.info("back_jumps: %s", back_jumps)
            # find existing landing site
            b = cfg.node(p)["val"]
            # Otherwise, create a fresh landing node after the farthest
            # back-edge source, and funnel all back-edges through it.
            farthest = max(back_preds)
            _log.info("farthest: %s", farthest)
            newb = BBlock(farthest + "_1")
            cfg.add_node(newb.addr, val=newb)
            cfg.add_edge(newb.addr, v)
            landing_site = newb.addr
            _log.info("landing_site: %s", landing_site)
            # Redirect a back-edge through the landing site, carrying over
            # its edge condition.
            if p != landing_site:
                e = cfg.edge(p, v).get("cond")
                cfg.remove_edge(p, v)
                cfg.add_edge(p, landing_site, cond=e)
def cfg_single_entry(cfg):
    """Prepend a synthetic ".ENTRY" node in front of the current first node.

    NOTE(review): lines look elided in this copy - upstream presumably
    guards this on the entry having a backedge (see comment below) and
    updates cfg.first_node afterwards; confirm before use.
    """
    first = cfg.first_node
    # First (== entry) node has a backedge
    entryb = BBlock(".ENTRY")
    cfg.add_node(entryb.addr, val=entryb)
    cfg.add_edge(entryb.addr, first)
    # Can still have multiple entries at this point
    assert len(cfg.entries()) >= 1
169 # Unconditionally add a new empty entry node, to hold anything needed later
def cfg_preheader(cfg):
    """Unconditionally prepend an empty ".ENTRY" preheader node to the CFG.

    The new node becomes cfg.first_node and gives later passes a place to
    insert setup code (see insert_initial_regs / insert_params).
    """
    first = cfg.first_node
    # Deliberately unconditional; the commented-out condition shows the
    # conditional variant this was generalized from.
    if 1: #cfg.pred(first):
        entryb = BBlock(".ENTRY")
        cfg.add_node(entryb.addr, val=entryb)
        cfg.add_edge(entryb.addr, first)
        cfg.first_node = entryb.addr
    # Can still have multiple entries at this point
    assert len(cfg.entries()) >= 1
183 # Make sure that CFG has a single exit, as required for some algorithms.
184 # Note that this doesn't do anything to former individual exit BBlocks,
185 # so they likely still end with "return" instructions.
def cfg_single_exit(cfg):
    # Synthesize a single "_EXIT_" node holding a "return" instruction and
    # connect exits to it.
    # NOTE(review): truncated in this copy - the loop binding `e` over exit
    # nodes (and surrounding guards, e.g. the noreturn case) is missing;
    # restore from upstream before use.
    exitb = BBlock("_EXIT_")
    exitb.add(Inst(None, "return", [], addr=exitb.addr))
    cfg.add_node(exitb.addr, val=exitb)
    cfg.add_edge(e, exitb.addr)
    cfg.props["noreturn"] = True
201 # This pass finds infinite loops, and adds virtual exit edges from
202 # them (effectively turning infinite loops into "do {...} while (1)"
203 # loops) to a special "_DEADEND_" node, which in turn is connected to
204 # the single exit node. Using intermediate deadend node will allow
205 # later to remove these virtual edges easily, and also to have a
206 # single adhoc rule for live var analysis, that live-out of DEADEND
def cfg_infloops_exit(cfg):
    # NOTE(review): heavily truncated in this copy - deadend_nodes
    # initialization, the loop binding `s` over successors, and several
    # guard bodies are missing. Documented as-is.
    for addr, info in cfg.nodes_props():
        # We're interested only in nodes unreachable from exit
        if info["postno_exit"]:
        succ = cfg.succ(addr)
        my_postno = info["postno"]
        # A successor with a smaller postorder number indicates a back-edge,
        # i.e. this node closes a loop with no path to exit.
        if cfg[s]["postno"] < my_postno:
        #print("deadend:", addr)
        deadend_nodes.append(addr)
    if not cfg.props.get("noreturn"):
        cfg.props["has_infloops"] = True
        # Add intermediate node, so later we can remove it, and all
        # extra edges are gone
        bb = BBlock("_DEADEND_")
        cfg.add_node("_DEADEND_", val=bb)
        cfg.add_edge("_DEADEND_", "_EXIT_", cond=VALUE(0, 10))
        for d in deadend_nodes:
            cfg.add_edge(d, "_DEADEND_", cond=VALUE(0, 10))
def split_bblock(cfg, n, suffix, only_non_empty=False, before=False):
    # If a node is non-empty bblock, splits it in two, with 2nd one being
    # empty, and having all out edges, and returns this 2nd one. If bblock
    # is already empty, returns it directly.
    # NOTE(review): truncated in this copy - the early-return body, any
    # handling of the `before` flag, and the final return are missing;
    # restore from upstream before use.
    if only_non_empty and not cfg[n]["val"].items:
    new_addr = n + suffix
    new_bb = BBlock(new_addr)
    cfg.add_node(new_addr, val=new_bb)
    cfg.move_succ(n, new_addr)
    cfg.add_edge(n, new_addr)
    # To keep addresses lexicographically sorted, we now just move insts
    # to the new bblock, leaving original bblock empty.
    new_bb.items = cfg[n]["val"].items
    cfg[n]["val"].items = []
def split_crit_nodes(g):
    """Split "critical nodes".

    Critical nodes are those which have more than one predecessor and more
    than one successor. We split them so original node becomes empty, and
    add new which contains bblock content (this way it's compatible with
    trailing_jumps=True).
    """
    # NOTE(review): truncated in this copy - the loop binding `n` over
    # crit_nodes is missing (the original docstring was also cut short);
    # restore from upstream before use.
    crit_nodes = [n for n in g.nodes() if g.degree_in(n) > 1 and g.degree_out(n) > 1]
    split_bblock(g, n, ".critn", before=True)
def split_crit_edges(g):
    """Split "critical edges": edges from a multi-successor node to a
    multi-predecessor node, by inserting an empty bblock on each such edge.

    This gives every critical edge its own landing node, as required by
    passes that need per-edge insertion points.
    """
    # Collect first: we must not add/remove edges while iterating g.edges().
    # (The list initializer was elided in this copy of the source.)
    to_split = []
    for from_n, to_n in g.edges():
        if g.degree_out(from_n) > 1 and g.degree_in(to_n) > 1:
            to_split.append((from_n, to_n))

    for from_n, to_n in to_split:
        new_addr = from_n + ".crite"
        g.add_node(new_addr, val=BBlock(new_addr))
        # Preserve the original edge's attributes (e.g. its condition) on
        # the first half of the split edge.
        attrs = g[(from_n, to_n)]
        g.remove_edge(from_n, to_n)
        g.add_edge(from_n, new_addr, **attrs)
        g.add_edge(new_addr, to_n)
def collect_state_in(cfg):
    # This is pretty backwards actually. It uses ReachDef information,
    # but post-processes it pretty heavily, and instead should be done
    # as a dataflow analysis itself.
    # NOTE(review): truncated in this copy - `val_set` is never initialized,
    # part of the inner loop is missing, and propagate() treats this
    # function's result as a boolean although no return is visible here.
    for bblock_addr, node_props in cfg.iter_sorted_nodes():
        org_state = node_props["val"].props.get("state_in", {})
        state = org_state.copy()
        by_var = defaultdict(set)
        # Group reaching definitions by the variable they define.
        for reg, from_bblock in node_props["reachdef_in"]:
            by_var[reg].add(from_bblock)
        #print(bblock_addr, by_var)
        for var, from_bblocks in by_var.items():
            for bb in from_bblocks:
                val = cfg[bb]["val"].props["state_out"].get(var)
            # A variable with a single known incoming value can be assumed
            # to hold that value on entry to this block.
            if len(val_set) == 1:
                state[var] = val_set.pop()
            elif len(val_set) > 1:
                _log.debug("%s: in value set for %s are: %s" % (bblock_addr, var, val_set))
        if state != org_state:
            _log.debug("CHANGED: %s: %r ==VS== %r" % (node_props["val"], org_state, state))
        node_props["val"].props["state_in"] = state
def collect_cond_out(cfg):
    """Collect known conditions (constraints) on registers at the
    exit (and entry) of basic blocks."""
    # Just as collect_state_in, this implements adhoc partial dataflow
    # analysis as needed for trivial cases of jumptable analysis.
    # Rewrite to use real dataflow framework.
    # NOTE(review): heavily truncated in this copy - the loop binding `s`
    # over successors, the `last_cond` bookkeeping, several guards and the
    # cond_out construction are missing. Documented as-is.

    # Phase 1: push each edge's branch condition into the successor's
    # "cond_in" set.
    for bblock_addr, node_props in cfg.nodes_props():
        succ = cfg.succ(bblock_addr)
        cond = cfg.edge(bblock_addr, s).get("cond")
        # Fall-through edge carries the negation of the taken branch.
        cond = last_cond.neg()
        # As cond's are updated in-place (later), make a copy
        cond = copy.copy(cond)
        succ_bblock = cfg[s]["val"]
        succ_bblock.props.setdefault("cond_in", set()).add((bblock_addr, cond))

    # Phase 2: keep "cond_in" only where every predecessor contributes the
    # same single condition.
    for bblock_addr, node_props in cfg.nodes_props():
        bblock = node_props["val"]
        cond_in = bblock.props.get("cond_in")
        # Less conditions than input edges, we definitely don't meet
        if len(cond_in) != len(cfg.pred(bblock_addr)):
            del bblock.props["cond_in"]
        # This assumes each predecessor contributes only one condition (i.e.
        # conditions are compounded in complex conditions if needed).
        met_conds = {cond[1] for cond in cond_in}
        if len(met_conds) != 1:
            del bblock.props["cond_in"]
        bblock.props["cond_in"] = met_conds

    # Phase 3: a condition survives to block exit only if the block does
    # not redefine the register it constrains.
    for bblock_addr, node_props in cfg.nodes_props():
        bblock = node_props["val"]
        cond_in = bblock.props.get("cond_in")
        reachdef_gen_regs = {r[0] for r in node_props["reachdef_gen"]}
        if r in reachdef_gen_regs:
        bblock.props["cond_out"] = cond_out
def propagate(cfg, bblock_propagator):
    """Drive a per-bblock propagator over the whole CFG.

    Requires reaching-definitions info (checked up front).

    NOTE(review): truncated in this copy - the visible
    `if not collect_state_in(cfg):` has no body (likely `break` inside an
    elided fixed-point loop); restore from upstream before use.
    """
    check_prop(cfg, "reachdef_in", "This pass requires reaching defs information")
    foreach_bblock(cfg, lambda bb: bblock_propagator(bb, False))
    if not collect_state_in(cfg):
    foreach_bblock(cfg, lambda bb: bblock_propagator(bb, True))
def const_propagation(cfg):
    """Run constant propagation over the CFG (see propagate())."""
    propagate(cfg, bblock_const_propagation)
def copy_propagation(cfg):
    """Run copy propagation over the CFG (see propagate())."""
    propagate(cfg, bblock_copy_propagation)
def mem_propagation(cfg):
    """Run memory-reference propagation over the CFG (see propagate())."""
    propagate(cfg, bblock_mem_propagation)
def expr_propagation(cfg):
    """Run general expression propagation over the CFG (see propagate())."""
    propagate(cfg, bblock_expr_propagation)
# NOTE(review): the `def` header of this pass is missing from this copy of
# the source - judging by the following comment ("Generalization of
# insert_sp0"), these lines are the body of insert_sp0(cfg), which seeds the
# entry block with an `$sp = $sp_0` assignment. Restore the header upstream.
entry_addr = cfg.entry()
first_bblock = cfg[entry_addr]["val"]
first_bblock.items.insert(0, Inst(REG("sp"), "=", [REG("sp_0")], addr=entry_addr + ".init0"))
427 # Generalization of insert_sp0
428 # For each register live on entry to function, insert to function pre-header
429 # assignment of $r = $r_0, where $r_0 represents an input register, whereas $r
430 # is a work register. Overall, this implements poor-man's (but very practical)
432 # Requires analyze_live_vars
def insert_initial_regs(cfg):
    """Seed the entry block with `$r = $r_0` for every live-in register.

    Each register live on function entry gets an assignment from its
    "_0" pseudo-register (the input value), so later passes can tell work
    registers from input registers. Requires live-variable analysis.
    """
    entry_addr = cfg.entry()
    check_prop(cfg, "live_in", "This pass requires live variable information")
    entry_bblock = cfg[entry_addr]["val"]
    live_regs = sorted(cfg[entry_addr]["live_in"])
    for idx, reg in enumerate(live_regs):
        init = Inst(reg, "=", [REG(reg.name + "_0")],
                    addr=entry_addr + ".init%d" % idx)
        entry_bblock.items.insert(idx, init)
def insert_params(cfg):
    """Seed the entry block with `$reg = $arg_reg` for each parameter register.

    Parameter registers come from arch.call_params() for this function's
    name; each receives an "arg_" pseudo-register as its initial value.
    """
    entry_addr = cfg.entry()
    entry_bblock = cfg[entry_addr]["val"]
    param_regs = sorted(arch.call_params(cfg.props["name"]))
    for idx, reg in enumerate(param_regs):
        assign = Inst(reg, "=", [REG("arg_" + reg.name)],
                      addr=entry_addr + ".arg%d" % idx)
        entry_bblock.items.insert(idx, assign)
450 # Requires insert_initial_regs pass followed by expr_propagation passes
451 # (normal, then stack vars propagation), and should be run before DCE.
def collect_preserveds(cfg):
    """Collect registers this function preserves (callee-saved behavior).

    A register whose value at exit equals its own initial pseudo-register
    ($r == $r_0) was saved and restored by the function. Requires
    insert_initial_regs followed by expr_propagation passes; run before DCE.
    The result is stored via progdb as the "preserveds" CFG property.
    """
    exit_addr = cfg.exit()
    exit_bblock = cfg[exit_addr]["val"]
    state_out = exit_bblock.props["state_out"]
    # The accumulator initialization/update were elided in this copy of the
    # source; reconstructed as a set of preserved registers.
    preserveds = set()
    for k, v in state_out.items():
        if is_reg(k) and is_reg(v):
            if v.name == k.name + "_0":
                preserveds.add(k)
    progdb.update_cfg_prop(cfg, "preserveds", preserveds)
464 # Requires expr_propagation
def collect_calls(cfg):
    # Record direct and indirect call targets in cfg.props["calls"] /
    # cfg.props["calls_indir"]. Requires expr_propagation (see comment above).
    # NOTE(review): truncated in this copy - the `calls`/`calls_indir`
    # initialization and the definition of collect() (which binds `inst` and
    # `arg`) are missing; restore from upstream before use.
    if inst.op == "call":
        calls_indir.append(arg)
    foreach_inst(cfg, collect)
    cfg.props["calls"] = calls
    cfg.props["calls_indir"] = calls_indir
483 # While collect_calls collects direct calls, this pass
484 # collects function references by address. Requires
485 # funcdb to know what address is a function.
def collect_func_refs(cfg):
    # Record by-address references to known functions (per progdb.FUNC_DB)
    # in cfg.props["func_refs"].
    # NOTE(review): truncated in this copy - `refs` initialization and the
    # definition of collect() (binding `arg`) are missing; restore upstream.
    if arg.addr in progdb.FUNC_DB:
    foreach_inst(cfg, collect)
    cfg.props["func_refs"] = refs
def collect_mem_refs(cfg, pred, prop_name="mem_refs"):
    """Collect references to non-symbolic memory address (ones represented
    by VALUE objects). This is useful e.g. to see which function accesses
    which MMIO addresses. Takes a predicate telling which addresses should
    be captured and property name to store summary.
    """
    # NOTE(review): truncated in this copy - `refs` initialization, the
    # definition of collect(), the bindings of `mem`/`a`/`inst`, and the
    # bodies recording matches are missing; the original docstring's closing
    # was also elided. Restore from upstream before use.
    def collect_mmio(expr):
        if expr and is_mem(expr):
            if is_value(mem) and pred(mem.val):
            # TODO: This means that MMIO accessed as array
            if is_value(a) and pred(a.val):
        inst.foreach_subexpr(collect_mmio)
    foreach_inst(cfg, collect)
    #cfg.props[prop_name] = refs
    cfg.props[prop_name] = sorted(list(set(refs)))
def local_defines(cfg):
    """Locations defined by function's own instructions, disregarding
    calls to other functions. This may be useful e.g. to assess
    """
    # NOTE(review): truncated in this copy - the docstring is cut short,
    # `defs` is never initialized, and the definition of func() (binding
    # `inst`) is missing; restore from upstream before use.
    if inst.op != "call":
        defs.update(inst.defs())
    foreach_inst(cfg, func)
    cfg.props["local_defines"] = defs
def collect_reach_exit(cfg):
    """Collect all locations defined anywhere in the function and record
    them via progdb as the "reach_exit" CFG property.

    When reaching-definitions info is available on the exit node, it is
    cross-checked against the per-bblock union (except when virtual
    "_DEADEND_" edges exist, which skew reachdef at exit).
    """
    all_defs = foreach_bblock(cfg, lambda b: b.defs(True), set_union)
    # The exit-address binding was elided in this copy of the source;
    # without it, the builtin `exit` would be used as a node key.
    exit_addr = cfg.exit()
    if "reachdef_out" in cfg.node(exit_addr):
        # Defs actually reaching exit, ignoring the "undefined" marker (None).
        all_defs2 = set(x[0] for x in cfg.node(exit_addr)["reachdef_out"] if x[1] is not None)
        if "_DEADEND_" not in cfg:
            assert all_defs == all_defs2, "%r vs %r" % (all_defs, all_defs2)
    progdb.update_cfg_prop(cfg, "reach_exit", all_defs)
562 # Collect registers which may be either defined or not on the exit.
563 # These registers often represent parasitic arguments (as they
564 # may be either modified or not, the way to return the original value
565 # of the reg is take it as a param).
566 # Requires reachdef on raw CFG (before insert_initial_regs).
def collect_reach_exit_maybe(cfg):
    """Collect registers which may be either defined or not on the exit.

    These registers often represent parasitic arguments (as they may be
    either modified or not, the way to return the original value of the reg
    is take it as a param). Requires reachdef on raw CFG (before
    insert_initial_regs). Result stored as "reach_exit_maybe" via progdb.
    """
    # Bindings for exit address / accumulators were elided in this copy of
    # the source; reconstructed from the visible usage.
    exit_addr = cfg.exit()
    vardict = {}
    for var, addr in cfg.node(exit_addr)["reachdef_out"]:
        vardict.setdefault(var, set()).add(addr)
    mod_maybe = set()
    for var, addrs in vardict.items():
        # Defined at some address AND carrying the "undefined" marker (None)
        # means the reg is modified only along some paths.
        if len(addrs) > 1 and None in addrs:
            mod_maybe.add(var)
    if 1: #mod_maybe or "reach_exit_maybe" in cfg.props:
        progdb.update_cfg_prop(cfg, "reach_exit_maybe", mod_maybe)
def analyze_live_vars(cfg, **kwargs):
    """Run live-variable analysis over the CFG.

    NOTE(review): only the analysis object construction is visible in this
    copy; the line actually running it (presumably ana.solve()) appears
    elided - confirm against upstream.
    """
    ana = dataflow.LiveVarAnalysis(cfg, **kwargs)
def analyze_reach_defs(cfg):
    """Run reaching-definitions analysis over the CFG.

    NOTE(review): the line running the analysis (presumably ana.solve())
    appears elided in this copy - confirm against upstream.
    """
    ana = dataflow.ReachDefAnalysis(cfg)
def analyze_dom(cfg):
    """Run dominator analysis over the CFG.

    NOTE(review): the line running the analysis (presumably ana.solve())
    appears elided in this copy - confirm against upstream.
    """
    ana = dataflow.DominatorAnalysis(cfg)
596 # Regs in live_in set on function entry are estimated params
597 # (may miss something, e.g. if value in a reg is passed thru
598 # (without referencing it) to another function).
def estimate_params(cfg):
    """Estimate function params: registers live on entry to the real first
    block (may miss regs passed through to callees without direct use).

    Requires live-variable analysis and the cfg_preheader pass; the result
    is stored as the "estimated_params" CFG property via progdb.
    """
    #ana = dataflow.LiveVarAnalysis(cfg, skip_calls=True)
    check_prop(cfg, "live_in", "This pass requires live variable information")
    func_addr = cfg.entry()
    assert func_addr == ".ENTRY", "cfg_preheader pass required"
    real_entry = cfg.succ(func_addr)
    assert len(real_entry) == 1
    real_entry = real_entry[0]
    # The node-props binding was elided in this copy of the source;
    # real_entry is otherwise unused, so its props must be what's read below.
    entry_props = cfg[real_entry]
    # Strip the "_0" initial-value suffix to recover base register names.
    args = set(REG(r.name[:-2] if r.name.endswith("_0") else r.name) for r in entry_props["live_in"])
    # The stack pointer is always live but never a real parameter.
    args -= set([REG("sp")])
    progdb.update_cfg_prop(cfg, "estimated_params", args)
614 # Precisely compute func params
def collect_params(cfg):
    """Precisely compute function params: live-in registers at the entry,
    with the "_0" initial-value suffix stripped, filtered through the
    architecture's parameter-register convention.

    Result stored as the "params" CFG property via progdb.
    """
    func_addr = cfg.entry()
    # The node-props binding was elided in this copy of the source;
    # reconstructed as the entry node's props (matching estimate_params) -
    # TODO confirm against upstream.
    entry_props = cfg[func_addr]
    args = set(REG(r.name[:-2] if r.name.endswith("_0") else r.name) for r in entry_props["live_in"])
    args = arch.param_filter(args)
    progdb.update_cfg_prop(cfg, "params", args)
623 # Kind of "AI" telling why this or that reg has got into "params" list.
624 # The explanations are approximate, e.g. a register may be "modified only
625 # along some paths" (especially if overestimated as modified in the
626 # presense of unknown functions), and still be a genuine param.
def annotate_params(cfg):
    """Kind of "AI" telling why this or that reg has got into "params" list.

    The explanations are approximate, e.g. a register may be "modified only
    along some paths" (especially if overestimated as modified in the
    presense of unknown functions), and still be a genuine param. Result
    stored as cfg.props["params_why"].
    """
    # The dict initializer and the final `else:` were elided in this copy of
    # the source; without the else, the fallback text would overwrite every
    # entry - reconstructed as the tail of the elif chain.
    res = {}
    for reg in cfg.props["params"]:
        if reg in cfg.props.get("estimated_params", ()):
            res[reg] = "100% genuine param"
        elif reg == REG("sp"):
            res[reg] = "Param because address of object on stack is taken"
        elif reg in cfg.props.get("reach_exit_maybe", ()):
            res[reg] = "Param because modified only along some paths (and maybe param to some callee)"
        else:
            res[reg] = "Likely param passed down to some callee"
    cfg.props["params_why"] = res
    #progdb.update_cfg_prop(cfg, "params_why", res)
643 # Collect regs which are live after each function call within current
644 # function. Triples of (bblock_addr, func, live_out) are stored in CFG's
645 # "calls_live_out" property (to be later further unioned and stored in
646 # funcdb). This corresponds to Van Emmerik's callsite use collector.
def collect_call_live_out(cfg):
    # Collect regs live after each call: triples of (bblock_addr, func,
    # live_out) stored via progdb as "calls_live_out" (see comment above;
    # corresponds to Van Emmerik's callsite use collector).
    # NOTE(review): truncated in this copy - the calls_live_out
    # initialization and the definition of collect(node) (binding `bb` and
    # `arg`) are missing; restore from upstream before use.
    if bb.items and bb[-1].op == "call":
        # TODO: Perhaps filter in the real regs?
        regs = {r for r in node["live_out"] if not r.name.endswith("_0")}
        calls_live_out.append((bb.addr, arg, regs))
    foreach_node(cfg, collect)
    progdb.update_cfg_prop(cfg, "calls_live_out", calls_live_out)