apply_xform.py

   1 #!/usr/bin/env python3
   2 import sys
   3 import argparse
   4 import os.path
   5 import glob
   6 import logging
   7
   8 import yaml
   9 import yamlutils
  10
  11 import core
  12 from parser import *
  13 import dot
  14 from dataflow import *
  15 from xform import *
  16 from xform_utils import *
  17 from decomp import *
  18 from asmprinter import AsmPrinter
  19 import cprinter
  20 import progdb
  21 import bindata
  22
  23 # TODO: something above shadows "copy" otherwise
  24 import copy
  25
  26
# Logger for this module.
_log = logging.getLogger(__name__)


# Both names are rebound by one_iter() on every iteration: FUNC_DB to
# progdb.FUNC_DB_BY_ADDR, and FUNC_DB_ORG to a deep copy of it taken before
# the input is processed. one_iter() compares the two afterwards to detect
# whether the function database changed.
FUNC_DB = {}
FUNC_DB_ORG = {}
  32
  33
  34 def parse_args():
  35     argp = argparse.ArgumentParser(description="Parse PseudoC program, apply transformations, and dump result in various formats")
  36     argp.add_argument("file", help="input file in PseudoC format, or directory of such files")
  37     argp.add_argument("-o", "--output", help="output file/dir (default stdout for single file, *.out for directory)")
  38     argp.add_argument("--arch", default="xtensa", help="architecture to use")
  39     argp.add_argument("--script", action="append", help="apply script from file")
  40     argp.add_argument("--iter", action="store_true", help="apply transform iteratively until no changes to funcdb")
  41     argp.add_argument("--funcdb", help="function database file (default: funcdb.yaml in input file's dir)")
  42     argp.add_argument("--format", choices=["none", "bblocks", "asm", "c"], default="bblocks",
  43         help="output format (default: %(default)s)")
  44     argp.add_argument("--output-suffix", metavar="SUFFIX", default=".out", help="suffix for output files in same-dir mode (default: .out)")
  45     argp.add_argument("--no-dead", action="store_true", help="don't output DCE-eliminated instructions")
  46     argp.add_argument("--no-comments", action="store_true", help="don't output decompilation comments (annotations)")
  47     argp.add_argument("--no-graph-header", action="store_true", help="don't output graph properties")
  48     argp.add_argument("--annotate-calls", action="store_true", help="annotate calls with uses/defs")
  49     argp.add_argument("--inst-addr", action="store_true", help="output instruction addresses")
  50     argp.add_argument("--dot-inst", action="store_true", help="output instructions in .dot files")
  51     argp.add_argument("--repr", action="store_true", help="dump __repr__ format of instructions and other objects")
  52     argp.add_argument("--debug", action="store_true", help="produce debug files")
  53     argp.add_argument("--log-level", metavar="LEVEL", default="INFO", help="set logging level (default: %(default)s)")
  54     args = argp.parse_args()
  55
  56     if args.repr:
  57         core.SimpleExpr.simple_repr = False
  58     if args.inst_addr:
  59         core.Inst.show_addr = True
  60     if args.dot_inst:
  61         import dot
  62         dot.show_insts = True
  63
  64     return args
  65
  66
  67 def handle_file(args):
  68     try:
  69         handle_file_unprotected(args)
  70     except Exception as e:
  71         print("Error while processing file: " + args.file)
  72         raise e
  73
  74
  75 def handle_file_unprotected(args):
  76     p = Parser(args.file)
  77     cfg = p.parse()
  78     cfg.parser = p
  79
  80     # If we want to get asm back, i.e. stay close to the input, don't remove
  81     # trailing jumps. This will work OK for data flow algos, but will produce
  82     # broken or confusing output for control flow algos (for which asm output
  83     # shouldn't be used of course).
  84     # Update: it's unsafe to use this during dataflow analysis
  85     #if args.format != "asm":
  86     #    foreach_bblock(cfg, remove_trailing_jumps)
  87
  88     if args.debug:
  89         with open(args.file + ".0.bb", "w") as f:
  90             dump_bblocks(cfg, f, no_graph_header=args.no_graph_header)
  91         with open(args.file + ".0.dot", "w") as f:
  92             dot.dot(cfg, f)
  93
  94     if args.script:
  95         for s in args.script:
  96             mod = __import__(s)
  97             mod.apply(cfg)
  98     elif hasattr(p, "script"):
  99         for op_type, op_name in p.script:
 100             if op_type == "xform:":
 101                 func = globals()[op_name]
 102                 func(cfg)
 103             elif op_type == "xform_bblock:":
 104                 func = globals()[op_name]
 105                 foreach_bblock(cfg, func)
 106             elif op_type == "xform_inst:":
 107                 func = globals()[op_name]
 108                 foreach_inst(cfg, func)
 109             elif op_type == "script:":
 110                 mod = __import__(op_name)
 111                 mod.apply(cfg)
 112             else:
 113                 assert 0
 114
 115     if args.debug:
 116         with open(args.file + ".out.bb", "w") as f:
 117             dump_bblocks(cfg, f, no_graph_header=args.no_graph_header)
 118         with open(args.file + ".out.dot", "w") as f:
 119             dot.dot(cfg, f)
 120
 121     if args.output and args.format != "none":
 122         out = open(args.output, "w")
 123     else:
 124         out = sys.stdout
 125
 126     if args.no_comments:
 127         Inst.show_comments = False
 128
 129     if args.format == "bblocks":
 130         p = CFGPrinter(cfg, out)
 131         if args.no_graph_header:
 132             p.print_graph_header = lambda: None
 133         p.inst_printer = repr if args.repr else str
 134         p.no_dead = args.no_dead
 135         p.print()
 136     elif args.format == "asm":
 137         p = AsmPrinter(cfg, out)
 138         p.no_dead = args.no_dead
 139         p.print()
 140     elif args.format == "c":
 141         #foreach_bblock(cfg, remove_trailing_jumps)
 142         cfg.number_postorder()
 143         Inst.trail = ";"
 144         cprinter.no_dead = args.no_dead
 145         cprinter.dump_c(cfg, out)
 146
 147     if out is not sys.stdout:
 148         out.close()
 149
 150     progdb.update_funcdb(cfg)
 151
 152     return cfg
 153
 154
 155 def one_iter(input, output, iter_no):
 156     global FUNC_DB, FUNC_DB_ORG
 157
 158     if args.funcdb != "none":
 159         dbs = []
 160         if iter_no == 0 and os.path.exists(args.funcdb + ".in"):
 161             dbs.append(args.funcdb + ".in")
 162         if os.path.exists(args.funcdb):
 163             dbs.append(args.funcdb)
 164         progdb.load_funcdb(*dbs)
 165
 166     FUNC_DB = progdb.FUNC_DB_BY_ADDR
 167     FUNC_DB_ORG = copy.deepcopy(FUNC_DB)
 168
 169     if args.script:
 170         # If script has init() function, call it at the beginning of each
 171         # iteration, this is useful to reset some state. E.g., if some
 172         # funcdb property is calculated as a union, but we want to find
 173         # its lower bound, we need to reset it to empty set at each
 174         # iteration.
 175         for s in args.script:
 176             mod = __import__(s)
 177             if hasattr(mod, "init"):
 178                 mod.init()
 179
 180     if os.path.isdir(input):
 181         if output and not os.path.isdir(output):
 182             os.makedirs(output)
 183         for full_name in glob.glob(input + "/*"):
 184             if full_name.endswith(".lst") and os.path.isfile(full_name):
 185                 if args.debug:
 186                     print(full_name)
 187                 args.file = full_name
 188                 if output:
 189                     base_name = full_name.rsplit("/", 1)[-1]
 190                     args.output = output + "/" + base_name
 191                 else:
 192                     args.output = full_name + args.output_suffix
 193                 handle_file(args)
 194     else:
 195         handle_file(args)
 196
 197
 198     changed = FUNC_DB != FUNC_DB_ORG
 199     if changed and args.funcdb != "none":
 200         progdb.save_funcdb(args.funcdb)
 201
 202     return changed
 203
 204
 205 def __main__():
 206     if args.annotate_calls:
 207         core.Inst.annotate_calls = True
 208
 209     if not args.funcdb:
 210         if os.path.isdir(args.file):
 211             # For an input as directory, use this *input* directory
 212             proj_dir = args.file
 213         else:
 214             # For a single file, use containing directory
 215             proj_dir = os.path.dirname(args.file) or "."
 216
 217         args.funcdb = proj_dir + "/funcdb.yaml"
 218         _log.info("Using funcdb: %s", args.funcdb)
 219         # Load binary data
 220         bindata.init(proj_dir)
 221         # Load symtab
 222         if os.path.exists(proj_dir + "/symtab.txt"):
 223             _log.info("Using symtab:", proj_dir + "/symtab.txt")
 224             progdb.load_symtab(proj_dir + "/symtab.txt")
 225
 226     input = args.file
 227     output = args.output
 228
 229     iter_no = 0
 230     while True:
 231         changed = one_iter(input, output, iter_no)
 232         if not args.iter:
 233             break
 234         if args.debug:
 235             print("=== Done iteration %d ===" % iter_no)
 236         if not changed:
 237             break
 238         iter_no += 1
 239
 240
 241 # Module-level code
 242
 243 # As arch.load_arch() performs dynamic import, do it outside of __main__(),
 244 # i.e. at load-time, to work with Python "strict mode" semantics.
 245 args = parse_args()
 246
 247 if args.log_level:
 248     logging.basicConfig(level=getattr(logging, args.log_level))
 249
 250 import arch
 251 arch.load_arch(args.arch)
 252
 253
 254 def preparse_scripts(input):
 255     files = []
 256     scripts = []
 257
 258     if os.path.isdir(input):
 259         for full_name in glob.glob(input + "/*"):
 260             if full_name.endswith(".lst") and os.path.isfile(full_name):
 261                 files.append(full_name)
 262     else:
 263         files = [input]
 264
 265     for fname in files:
 266         with open(fname) as f:
 267             for l in f:
 268                 if l.startswith("#script: "):
 269                     l = l.rstrip()
 270                     scripts.append(l.split(None, 1)[1])
 271     return scripts
 272
 273
 274 # Preload scripts.
 275 if args.script:
 276     for s in args.script:
 277         __import__(s)
 278 for s in preparse_scripts(args.file):
 279     __import__(s)
 280
 281
# Run the driver only when this file is executed as a script. The
# module-level setup above (argument parsing, arch loading, script
# preloading) has already run by this point.
if __name__ == "__main__":
    __main__()