# Copyright (c) 2015-2018 Paul Sokolovsky

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Transformation passes on CFG"""

import logging
from collections import defaultdict

from core import *
from cfgutils import *
from dce import *
from xform_expr import *
from xform_inst import *
from xform_bblock import *
from utils import set_union
import arch
import progdb

import copy


_log = logging.getLogger(__name__)

def check_prop(cfg, prop_name, err_msg):
    entry_addr = cfg.entry()
    if prop_name not in cfg[entry_addr]:
        assert 0, err_msg

def find_1st_def(cfg, reg, in_bblock=False):
    if in_bblock:
        for n, info in cfg.nodes_props():
            if reg in info["val"].defs():
                return info["val"].addr
    else:
        raise NotImplementedError

def number_postorder(cfg):
    cfg.number_postorder()


def number_postorder_from_exit(cfg):
    cfg.number_postorder_from_exit("_EXIT_")

def remove_trailing_jumps(cfg):
    remove_returns = False
    exits = cfg.exits()
    if len(exits) == 1:
        remove_returns = True

    foreach_bblock(cfg, remove_trailing_jumps_bblock, remove_returns=remove_returns)
    cfg.props["trailing_jumps"] = False

def remove_unreachable_entries(cfg):
    # Remove disconnected graph nodes.
    entries = cfg.entries()
    if len(entries) == 1:
        return
    for e in entries:
        if e == cfg.props["addr"]:
            continue

        def remove_component(e):
            if cfg.pred(e):
                return
            succ = cfg.succ(e)
            try:
                cfg.remove_node(e)
            except KeyError:
                # Already removed
                pass
            for e in succ:
                remove_component(e)

        remove_component(e)

def remove_unreachable_nodes(cfg):
    "Remove CFG nodes not reachable from entry."
    assert "postno" in cfg[cfg.first_node], "Need number_postorder"

    for node, info in list(cfg.nodes_props()):
        if info["postno"] is None:
            cfg.remove_node(node)

# Remove any jumps to jumps, replacing the destination of the first jump
# with the destination of the 2nd.
# This "simplifies" the graph, but may make it less structured. This is a useful
# transformation for generating machine code, but for decompilation,
# it actually makes sense to add some extra jump landing sites to make
# loops more structured.

# http://en.wikipedia.org/wiki/Jump_threading

def remove_jump_over_jump(cfg):
    for v in cfg.nodes():
        # If node is not the entry, has a single exit, and is empty
        if cfg.degree_in(v) > 0 and cfg.degree_out(v) == 1 and not cfg.node(v)["val"].items:
            cfg.move_pred(v, cfg.succ(v)[0])
            cfg.remove_node(v)
            _log.info("jump_over_jump: removed node: %s", v)
            return True

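# For example, if bblock B contains no instructions and only jumps on to C,
# then A -> B -> C collapses to A -> C and B is removed (one node per call,
# since the pass returns True after the first rewrite).
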
# If possible, make a single back-edge to a loop header, by introducing
# intermediate jump landing nodes.
def loop_single_entry(cfg):
    for v in cfg.nodes():
        if cfg.degree_in(v) >= 2:
            preds = cfg.pred(v)
            back_preds = list(filter(lambda x: v <= x, preds))
            if len(back_preds) < 2:
                continue
            _log.info("loop_single_entry: node: %s", v)
            _log.info("back_preds: %s", back_preds)
            back_jumps = list(filter(lambda x: cfg.degree_out(x) == 1, back_preds))
            _log.info("back_jumps: %s", back_jumps)
            # find existing landing site
            landing_site = None
            for p in back_jumps:
                b = cfg.node(p)["val"]
                if not b.items:
                    landing_site = p
            if not landing_site:
                farthest = max(back_preds)
                _log.info("farthest: %s", farthest)
                newb = BBlock(farthest + "_1")
                cfg.add_node(newb.addr, val=newb)
                cfg.add_edge(newb.addr, v)
                landing_site = newb.addr
            _log.info("landing_site: %s", landing_site)
            for p in back_preds:
                if p != landing_site:
                    e = cfg.edge(p, v).get("cond")
                    cfg.remove_edge(p, v)
                    cfg.add_edge(p, landing_site, cond=e)
            return True

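# For example, with two back-edges into loop header H:
#
#   B1 --> H, B2 --> H   ==>   B1 --> L, B2 --> L, L --> H
#
# where L is an empty "landing" bblock (an existing empty back-jump bblock is
# reused if there is one, otherwise a new <farthest>_1 bblock is created), so
# H ends up with a single back-edge.
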
def cfg_single_entry(cfg):
    first = cfg.first_node
    if cfg.pred(first):
        # First (== entry) node has a backedge
        entryb = BBlock(".ENTRY")
        entryb.cfg = cfg
        cfg.add_node(entryb.addr, val=entryb)
        cfg.add_edge(entryb.addr, first)

    # Can still have multiple entries at this point
    assert len(cfg.entries()) >= 1

# Unconditionally add a new empty entry node, to hold anything needed later
def cfg_preheader(cfg):
    first = cfg.first_node
    if 1:  #cfg.pred(first):
        entryb = BBlock(".ENTRY")
        entryb.cfg = cfg
        cfg.add_node(entryb.addr, val=entryb)
        cfg.add_edge(entryb.addr, first)
        cfg.first_node = entryb.addr

    # Can still have multiple entries at this point
    assert len(cfg.entries()) >= 1

# Make sure that CFG has a single exit, as required for some algorithms.
# Note that this doesn't do anything to former individual exit BBlocks,
# so they likely still end with "return" instructions.
def cfg_single_exit(cfg):
    exits = cfg.exits()

    exitb = BBlock("_EXIT_")
    exitb.cfg = cfg
    exitb.add(Inst(None, "return", [], addr=exitb.addr))
    cfg.add_node(exitb.addr, val=exitb)
    for e in exits:
        cfg.add_edge(e, exitb.addr)

    if not exits:
        # Infinite loop
        cfg.props["noreturn"] = True

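# E.g. if bblocks R1 and R2 both end the function, each gets an edge to the
# synthetic _EXIT_ bblock (R1 -> _EXIT_ <- R2), giving the CFG exactly one exit.
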
# This pass finds infinite loops, and adds virtual exit edges from
# them (effectively turning infinite loops into "do {...} while (1)"
# loops) to a special "_DEADEND_" node, which in turn is connected to
# the single exit node. Using an intermediate deadend node makes it easy
# to remove these virtual edges later, and also allows a single adhoc
# rule for live var analysis: live-out of DEADEND is empty.
def cfg_infloops_exit(cfg):

    deadend_nodes = []

    for addr, info in cfg.nodes_props():
        # We're interested only in nodes unreachable from exit
        if info["postno_exit"]:
            continue

        succ = cfg.succ(addr)
        if not succ:
            continue
        my_postno = info["postno"]
        deadend = True
        for s in succ:
            if cfg[s]["postno"] < my_postno:
                deadend = False
                break
        if deadend:
            #print("deadend:", addr)
            deadend_nodes.append(addr)

    if deadend_nodes:
        if not cfg.props.get("noreturn"):
            cfg.props["has_infloops"] = True

        # Add an intermediate node, so later we can remove it, and all
        # extra edges are gone
        bb = BBlock("_DEADEND_")
        bb.cfg = cfg
        cfg.add_node("_DEADEND_", val=bb)
        cfg.add_edge("_DEADEND_", "_EXIT_", cond=VALUE(0, 10))
        for d in deadend_nodes:
            cfg.add_edge(d, "_DEADEND_", cond=VALUE(0, 10))

        # Graph was modified
        return True

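# E.g. a bblock L that can never reach the exit gets a virtual edge
# L -> _DEADEND_ -> _EXIT_, both edges carrying a constant-0 (never taken)
# condition. Removing the single _DEADEND_ node later drops all such virtual
# edges in one go.
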
def split_bblock(cfg, n, suffix, only_non_empty=False, before=False):
    # If a node is a non-empty bblock, split it in two, with the 2nd one being
    # empty and having all the out edges, and return this 2nd one. If the bblock
    # is already empty, return it directly.
    if only_non_empty and not cfg[n]["val"].items:
        return n

    new_addr = n + suffix
    new_bb = BBlock(new_addr)
    cfg.add_node(new_addr, val=new_bb)

    cfg.move_succ(n, new_addr)
    cfg.add_edge(n, new_addr)

    # To keep addresses lexicographically sorted, we now just move insts
    # to the new bblock, leaving the original bblock empty.
    if before:
        new_bb.items = cfg[n]["val"].items
        cfg[n]["val"].items = []

    return new_addr

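# Example: splitting bblock "10" (with successors A and B) using suffix ".split"
# (suffix chosen just for illustration):
#
#   before:            10 [insts] -> A, B
#   after:             10 [insts] -> 10.split [] -> A, B
#   with before=True:  10 [] -> 10.split [insts] -> A, B
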
def split_crit_nodes(g):
    """Split "critical nodes".

    Critical nodes are those which have more than one predecessor and more than
    one successor. We split them so the original node becomes empty, and add a new
    one which contains the bblock content (this way it's compatible with
    trailing_jumps=True CFGs).
    """
    crit_nodes = [n for n in g.nodes() if g.degree_in(n) > 1 and g.degree_out(n) > 1]
    for n in crit_nodes:
        split_bblock(g, n, ".critn", before=True)

def split_crit_edges(g):
    to_split = []

    for from_n, to_n in g.edges():
        if g.degree_out(from_n) > 1 and g.degree_in(to_n) > 1:
            to_split.append((from_n, to_n))

    for from_n, to_n in to_split:
        new_addr = from_n + ".crite"
        g.add_node(new_addr, val=BBlock(new_addr))
        attrs = g[(from_n, to_n)]
        g.remove_edge(from_n, to_n)
        g.add_edge(from_n, new_addr, **attrs)
        g.add_edge(new_addr, to_n)

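# Example: a critical edge from A (multiple successors) to B (multiple
# predecessors):
#
#   A ==cond==> B   becomes   A ==cond==> A.crite [] --> B
#
# The original edge attributes (e.g. the branch condition) stay on the edge
# into the new empty A.crite bblock.
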
def collect_state_in(cfg):
    # This is pretty backwards actually. It uses ReachDef information,
    # but post-processes it pretty heavily, and instead should be done
    # as a dataflow analysis itself.
    changed = False
    for bblock_addr, node_props in cfg.iter_sorted_nodes():
        org_state = node_props["val"].props.get("state_in", {})
        state = org_state.copy()
        by_var = defaultdict(set)
        for reg, from_bblock in node_props["reachdef_in"]:
            by_var[reg].add(from_bblock)
        #print(bblock_addr, by_var)
        for var, from_bblocks in by_var.items():
            val_set = set()
            for bb in from_bblocks:
                val = None
                if bb is not None:
                    val = cfg[bb]["val"].props["state_out"].get(var)
                if val is None:
                    val_set = set()
                    break
                val_set.add(val)
            if len(val_set) == 1:
                state[var] = val_set.pop()
            elif len(val_set) > 1:
                _log.debug("%s: in value set for %s are: %s" % (bblock_addr, var, val_set))
        if state != org_state:
            _log.debug("CHANGED: %s: %r ==VS== %r" % (node_props["val"], org_state, state))
            node_props["val"].props["state_in"] = state
            changed = True

    return changed

def collect_cond_out(cfg):
    """Collect known conditions (constraints) on registers at the
    exit (and entry) of basic blocks."""

    # Just as collect_state_in, this implements an adhoc partial dataflow
    # analysis as needed for trivial cases of jumptable analysis.
    # Rewrite to use a real dataflow framework.

    for bblock_addr, node_props in cfg.nodes_props():
        succ = cfg.succ(bblock_addr)
        if len(succ) < 2:
            continue
        last_cond = None
        for s in succ:
            cond = cfg.edge(bblock_addr, s).get("cond")
            if cond is None:
                cond = last_cond.neg()
            # As cond's are updated in-place (later), make a copy
            cond = copy.copy(cond)

            succ_bblock = cfg[s]["val"]
            succ_bblock.props.setdefault("cond_in", set()).add((bblock_addr, cond))

            last_cond = cond

    for bblock_addr, node_props in cfg.nodes_props():
        bblock = node_props["val"]
        cond_in = bblock.props.get("cond_in")
        if not cond_in:
            continue

        # Fewer conditions than input edges: we definitely don't meet
        # over all paths.
        if len(cond_in) != len(cfg.pred(bblock_addr)):
            del bblock.props["cond_in"]
            continue

        # This assumes each predecessor contributes only one condition (i.e.
        # conditions are compounded into complex conditions if needed).
        met_conds = {cond[1] for cond in cond_in}
        if len(met_conds) != 1:
            del bblock.props["cond_in"]
            continue

        bblock.props["cond_in"] = met_conds

    for bblock_addr, node_props in cfg.nodes_props():
        bblock = node_props["val"]
        cond_in = bblock.props.get("cond_in")
        if not cond_in:
            continue

        reachdef_gen_regs = {r[0] for r in node_props["reachdef_gen"]}
        cond_out = set()
        for c in cond_in:
            killed = False
            for r in c.regs():
                if r in reachdef_gen_regs:
                    killed = True
            if not killed:
                cond_out.add(c)

        bblock.props["cond_out"] = cond_out

def propagate(cfg, bblock_propagator):
    check_prop(cfg, "reachdef_in", "This pass requires reaching defs information")
    while True:
        foreach_bblock(cfg, lambda bb: bblock_propagator(bb, False))
        if not collect_state_in(cfg):
            break
    foreach_bblock(cfg, lambda bb: bblock_propagator(bb, True))


def const_propagation(cfg):
    propagate(cfg, bblock_const_propagation)

def copy_propagation(cfg):
    propagate(cfg, bblock_copy_propagation)

def mem_propagation(cfg):
    propagate(cfg, bblock_mem_propagation)

def expr_propagation(cfg):
    propagate(cfg, bblock_expr_propagation)

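# The wrappers above all run to a fixed point: bblock-level propagation is
# repeated for as long as collect_state_in() keeps refining the per-bblock
# input state. A minimal usage sketch (assuming `cfg` is already loaded, and
# that analyze_reach_defs() is what provides the "reachdef_in" property
# checked by propagate()):
#
#   analyze_reach_defs(cfg)
#   expr_propagation(cfg)
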
def insert_sp0(cfg):
    entry_addr = cfg.entry()
    first_bblock = cfg[entry_addr]["val"]
    first_bblock.items.insert(0, Inst(REG("sp"), "=", [REG("sp_0")], addr=entry_addr + ".init0"))

# Generalization of insert_sp0.
# For each register live on entry to the function, insert into the function
# pre-header an assignment $r = $r_0, where $r_0 represents an input register,
# whereas $r is a work register. Overall, this implements a poor-man's (but
# very practical) SSA subset.
# Requires analyze_live_vars
def insert_initial_regs(cfg):
    entry_addr = cfg.entry()
    #used_regs = reversed(sorted([x[0] for x in cfg[entry_addr]["reachdef_in"]]))
    check_prop(cfg, "live_in", "This pass requires live variable information")
    used_regs = cfg[entry_addr]["live_in"]
    first_bblock = cfg[entry_addr]["val"]
    for i, r in enumerate(sorted(list(used_regs))):
        first_bblock.items.insert(i, Inst(r, "=", [REG(r.name + "_0")], addr=entry_addr + ".init%d" % i))

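# For example, if registers $a2 and $a3 (names made up for illustration) are
# live on entry, the entry bblock gains, in sorted register order:
#
#   $a2 = $a2_0
#   $a3 = $a3_0
#
# with the "_0" pseudo-registers standing for the function's input values.
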
def insert_params(cfg):
    entry_addr = cfg.entry()
    first_bblock = cfg[entry_addr]["val"]
    for i, reg in enumerate(sorted(list(arch.call_params(cfg.props["name"])))):
        first_bblock.items.insert(i, Inst(reg, "=", [REG("arg_" + reg.name)], addr=entry_addr + ".arg%d" % i))

# Requires insert_initial_regs pass followed by expr_propagation passes
# (normal, then stack vars propagation), and should be run before DCE.
def collect_preserveds(cfg):
    exit_addr = cfg.exit()
    exit_bblock = cfg[exit_addr]["val"]
    state_out = exit_bblock.props["state_out"]
    preserveds = set()
    for k, v in state_out.items():
        if is_reg(k) and is_reg(v):
            if v.name == k.name + "_0":
                preserveds.add(k)
    progdb.update_cfg_prop(cfg, "preserveds", preserveds)

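# E.g. if the exit state_out still maps $r5 to $r5_0, then $r5 holds its
# value from function entry and is recorded as preserved.
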
# Requires expr_propagation
def collect_calls(cfg):
    calls = []
    calls_indir = []

    def collect(inst):
        if inst.op == "call":
            arg = inst.args[0]
            if is_addr(arg):
                calls.append(arg)
            else:
                calls_indir.append(arg)

    foreach_inst(cfg, collect)
    cfg.props["calls"] = calls
    if calls_indir:
        cfg.props["calls_indir"] = calls_indir

# While collect_calls collects direct calls, this pass
# collects function references by address. Requires
# funcdb to know what address is a function.
def collect_func_refs(cfg):
    refs = []

    def collect(inst):
        if inst.op == "=":
            arg = inst.args[0]
            if is_addr(arg):
                import progdb
                if arg.addr in progdb.FUNC_DB:
                    refs.append(arg)

    foreach_inst(cfg, collect)
    cfg.props["func_refs"] = refs

def collect_mem_refs(cfg, pred, prop_name="mem_refs"):
    """Collect references to non-symbolic memory addresses (ones represented
    by VALUE objects). This is useful e.g. to see which function accesses
    which MMIO addresses. Takes a predicate telling which addresses should
    be captured, and a property name to store the summary.
    """
    refs = []

    def collect(inst):

        def collect_mmio(expr):
            if expr and is_mem(expr):
                mem = expr.expr
                if is_value(mem) and pred(mem.val):
                    refs.append(mem)
                elif is_expr(mem):
                    if mem.op != "+":
                        #print(mem.op, expr)
                        return
                    # TODO: This means that MMIO is accessed as an array
                    for a in mem.args:
                        if is_value(a) and pred(a.val):
                            #refs.append(a)
                            refs.append(mem)
                            break

        inst.foreach_subexpr(collect_mmio)

    foreach_inst(cfg, collect)
    if refs:
        #cfg.props[prop_name] = refs
        cfg.props[prop_name] = sorted(list(set(refs)))

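# A usage sketch (the address range below is made up): to collect accesses to a
# hypothetical MMIO window at 0x40000000-0x5fffffff:
#
#   collect_mem_refs(cfg, lambda a: 0x40000000 <= a <= 0x5fffffff, "mmio_refs")
#
# The matching address expressions end up de-duplicated and sorted in
# cfg.props["mmio_refs"].
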
def local_defines(cfg):
    """Locations defined by function's own instructions, disregarding
    calls to other functions. This may be useful e.g. to assess
    preserveds."""

    defs = set()

    def func(inst):
        if inst.op != "call":
            defs.update(inst.defs())

    foreach_inst(cfg, func)
    cfg.props["local_defines"] = defs
    return defs

def collect_reach_exit(cfg):
    all_defs = foreach_bblock(cfg, lambda b: b.defs(True), set_union)
    exit = cfg.exit()
    if "reachdef_out" in cfg.node(exit):
        all_defs2 = set(x[0] for x in cfg.node(exit)["reachdef_out"] if x[1] is not None)
        if "_DEADEND_" not in cfg:
            assert all_defs == all_defs2, "%r vs %r" % (all_defs, all_defs2)
        all_defs = all_defs2
    progdb.update_cfg_prop(cfg, "reach_exit", all_defs)

# Collect registers which may be either defined or not on the exit.
# These registers often represent parasitic arguments (as they
# may be either modified or not, the way to return the original value
# of the reg is to take it as a param).
# Requires reachdef on the raw CFG (before insert_initial_regs).
def collect_reach_exit_maybe(cfg):
    exit = cfg.exit()
    vardict = {}
    for var, addr in cfg.node(exit)["reachdef_out"]:
        vardict.setdefault(var, set()).add(addr)
    mod_maybe = set()

    for var, addrs in vardict.items():
        if len(addrs) > 1 and None in addrs:
            mod_maybe.add(var)

    if 1:  #mod_maybe or "reach_exit_maybe" in cfg.props:
        progdb.update_cfg_prop(cfg, "reach_exit_maybe", mod_maybe)

import dataflow

def analyze_live_vars(cfg, **kwargs):
    ana = dataflow.LiveVarAnalysis(cfg, **kwargs)
    ana.solve()

def analyze_reach_defs(cfg):
    ana = dataflow.ReachDefAnalysis(cfg)
    ana.solve()

def analyze_dom(cfg):
    ana = dataflow.DominatorAnalysis(cfg)
    ana.solve()

# Regs in the live_in set on function entry are estimated params
# (may miss something, e.g. if a value in a reg is passed thru
# (without referencing it) to another function).
def estimate_params(cfg):
    #ana = dataflow.LiveVarAnalysis(cfg, skip_calls=True)
    #ana.solve()
    check_prop(cfg, "live_in", "This pass requires live variable information")
    func_addr = cfg.entry()
    assert func_addr == ".ENTRY", "cfg_preheader pass required"
    real_entry = cfg.succ(func_addr)
    assert len(real_entry) == 1
    real_entry = real_entry[0]
    e = cfg[real_entry]
    args = set(REG(r.name[:-2] if r.name.endswith("_0") else r.name) for r in e["live_in"])
    args -= set([REG("sp")])
    progdb.update_cfg_prop(cfg, "estimated_params", args)

# Precisely compute func params
def collect_params(cfg):
    func_addr = cfg.entry()
    e = cfg[func_addr]
    args = set(REG(r.name[:-2] if r.name.endswith("_0") else r.name) for r in e["live_in"])
    args = arch.param_filter(args)
    progdb.update_cfg_prop(cfg, "params", args)

# Kind of "AI" telling why this or that reg got into the "params" list.
# The explanations are approximate, e.g. a register may be "modified only
# along some paths" (especially if overestimated as modified in the
# presence of unknown functions), and still be a genuine param.
def annotate_params(cfg):
    res = {}
    for reg in cfg.props["params"]:
        if reg in cfg.props.get("estimated_params", ()):
            res[reg] = "100% genuine param"
        elif reg == REG("sp"):
            res[reg] = "Param because address of object on stack is taken"
        elif reg in cfg.props.get("reach_exit_maybe", ()):
            res[reg] = "Param because modified only along some paths (and maybe param to some callee)"
        else:
            res[reg] = "Likely param passed down to some callee"

    cfg.props["params_why"] = res
    #progdb.update_cfg_prop(cfg, "params_why", res)

# Collect regs which are live after each function call within current
# function. Triples of (bblock_addr, func, live_out) are stored in CFG's
# "calls_live_out" property (to be later further unioned and stored in
# funcdb). This corresponds to Van Emmerik's callsite use collector.
def collect_call_live_out(cfg):

    calls_live_out = []
    def collect(node):
        bb = node["val"]
        if bb.items and bb[-1].op == "call":
            inst = bb[-1]
            arg = inst.args[0]
            # TODO: Perhaps filter in the real regs?
            regs = {r for r in node["live_out"] if not r.name.endswith("_0")}
            calls_live_out.append((bb.addr, arg, regs))

    foreach_node(cfg, collect)
    progdb.update_cfg_prop(cfg, "calls_live_out", calls_live_out)