github/workflows/pycopy-test: Upgrade Pycopy to 3.6.1.
[ScratchABlock.git] / parser.py
blobd7b87799bd73c9dbc4a511672f899b861f3c3d15
1 import sys
2 import string
3 from graph import Graph
4 from core import *
5 import yaml
8 class ParseError(Exception):
9 pass
11 class UndefinedLabel(ParseError):
12 pass
15 class Lexer:
17 def __init__(self, l):
18 self.l = l
19 self.ws()
21 def peek(self):
22 if not self.l:
23 return self.l
24 return self.l[0]
26 def eol(self):
27 return not self.l
29 def is_punct(self, c):
30 return c in "&|^-+*/%=<>"
32 def match(self, tok):
33 if not self.l.startswith(tok):
34 return False
36 is_word = tok[0] in string.ascii_letters
37 rest = self.l[len(tok):]
38 if rest:
39 if is_word and rest[0] not in string.whitespace:
40 return False
41 elif self.is_punct(tok[0]) and self.is_punct(rest[0]):
42 return False
43 self.l = rest
44 self.ws()
45 return True
47 def match_till(self, tok):
48 res = ""
49 while self.l and not self.l.startswith(tok):
50 res += self.l[0]
51 self.l = self.l[1:]
52 return res
54 def match_bracket(self, open_b, close_b):
55 self.expect(open_b)
56 level = 1
57 res = ""
58 while self.l and level:
59 if self.match(open_b):
60 level += 1
61 elif self.match(close_b):
62 level -= 1
63 else:
64 res += self.l[0]
65 self.l = self.l[1:]
66 return res
68 def expect(self, tok, msg=None):
69 if not self.match(tok):
70 if msg is None:
71 msg = "Expected: '%s'" % tok
72 raise ParseError(msg, self.l)
74 def ws(self):
75 while self.l and self.l[0] == " ":
76 self.l = self.l[1:]
78 def rest(self):
79 return self.l.rstrip()
81 def word(self):
82 w = ""
83 while self.l and self.l[0] in (string.ascii_letters + string.digits + "_.$"):
84 w += self.l[0]
85 self.l = self.l[1:]
86 return w
88 def isident(self):
89 return self.l[0] in string.ascii_letters + "_."
91 def isdigit(self):
92 return self.l[0] in string.digits
94 def ident(self):
95 assert self.isident(), repr(self.l)
96 w = self.word()
97 self.ws()
98 return w
100 def num(self):
101 assert self.isdigit(), repr(self.l)
102 base = 10
103 if self.l.startswith("0x"):
104 base = 16
105 w = self.word()
106 self.ws()
107 return int(w, 0), base
109 def skip_comment(self):
110 if self.match("/*"):
111 self.match_till("*/")
112 self.expect("*/")
114 class Parser:
116 def __init__(self, fname):
117 self.fname = fname
118 self.expect_line_addr = None
119 self.cfg = None
120 self.labels = {}
121 self.curline = -1
122 self.script = []
123 self.pass_no = None
124 self.lex = None
126 def error(self, msg):
127 print("%s:%d: %s" % (self.fname, self.curline, msg))
128 sys.exit(1)
130 def parse_cond(self, lex):
131 self.lex = lex
132 lex.expect("(")
133 e = self.parse_expr()
134 lex.expect(")")
135 return COND(e)
137 # Get line from a file, expanding any "macro-like" features,
138 # like "if (cond) $r0 = $r1"
139 def get_expand_line(self, f):
140 while True:
141 l = f.readline()
142 if not l:
143 return [(None, None)]
144 self.curline += 1
145 l = l.rstrip()
146 if not l:
147 continue
149 if l[0] == "#":
150 if self.pass_no == 1:
151 l = l[1:]
152 if l.startswith("xform: "):
153 self.script.append(l.split(None, 1))
154 elif l.startswith("xform_bblock: "):
155 self.script.append(l.split(None, 1))
156 elif l.startswith("xform_inst: "):
157 self.script.append(l.split(None, 1))
158 elif l.startswith("script: "):
159 self.script.append(l.split(None, 1))
160 elif l.startswith("struct: "):
161 import progdb
162 data = yaml.safe_load(l.split(None, 1)[1])
163 progdb.set_struct_types(data)
164 elif l.startswith("struct_addr: "):
165 import progdb
166 data = yaml.safe_load(l.split(None, 1)[1])
167 data = {(start, start + max(progdb.get_struct_types()[name].keys()) + 4): name for start, name in data.items()}
168 progdb.set_struct_instances(data)
169 continue
171 if self.expect_line_addr is None:
172 self.expect_line_addr = self.detect_addr_field(l)
174 if self.expect_line_addr:
175 addr, l = l.split(" ", 1)
176 else:
177 addr = "%04d" % self.curline
178 l = l.lstrip()
180 if l.startswith("if "):
181 # May need to expand "if macro"
182 lex = Lexer(l)
183 lex.expect("if")
184 c = self.parse_cond(lex)
185 rest = lex.rest()
186 if not lex.match("goto"):
187 # Uncomment to treat as literal line for roundtrip testing
188 #return [(addr, "\x01" + l)]
189 out = [
190 (addr, "if %s goto %s.1.cond" % (str(c.neg()), addr)),
191 (addr + ".0", rest),
192 (addr + ".1", addr + ".1.cond:"),
194 return out
196 return [(addr, l)]
198 def parse_labels(self):
199 self.pass_no = 1
200 self.curline = 0
201 with open(self.fname) as f:
202 l = ""
203 while l is not None:
204 for addr, l in self.get_expand_line(f):
205 if l is None:
206 break
207 if l[-1] == ":":
208 self.labels[l[:-1]] = addr
210 def parse_reg(self, lex):
211 lex.expect("$")
212 return REG(lex.ident())
214 def parse_arglist(self):
215 args = []
216 self.lex.expect("(")
217 while not self.lex.match(")"):
218 comm = ""
219 if self.lex.match("/*"):
220 comm = "/*" + self.lex.match_till("*/") + "*/"
221 self.lex.expect("*/")
222 a = self.parse_expr()
223 assert a, (repr(a), repr(self.lex.l))
224 a.comment = comm
225 args.append(a)
226 if self.lex.match(","):
227 pass
228 return args
230 def parse_num(self):
231 if self.lex.isdigit():
232 return VALUE(*self.lex.num())
234 def parse_primary(self):
235 if self.lex.match("*"):
236 self.lex.expect("(", "Dereference requires explicit pointer cast")
237 type = self.lex.ident()
238 self.lex.expect("*")
239 self.lex.expect(")")
240 e = self.parse_primary()
241 return MEM(type, e)
242 elif self.lex.match("("):
243 e = self.parse_expr()
244 self.lex.expect(")")
245 if is_addr(e):
246 # If there was identifier, check it for being a type name
247 if e.addr in ("i8", "u8", "i16", "u16", "i32", "u32", "i64", "u64"):
248 tp = TYPE(e.addr)
249 e = self.parse_primary()
250 return EXPR("CAST", [tp, e])
251 return e
252 elif self.lex.peek() == "$":
253 return self.parse_reg(self.lex)
254 elif self.lex.isdigit():
255 return VALUE(*self.lex.num())
256 elif self.lex.match("-"):
257 e = self.parse_primary()
258 if is_value(e):
259 return VALUE(-e.val, e.base)
260 return EXPR("NEG", [e])
261 elif self.lex.match("!"):
262 e = self.parse_primary()
263 return EXPR("!", [e])
264 elif self.lex.isident():
265 id = self.lex.ident()
266 if self.lex.peek() == "(":
267 args = self.parse_arglist()
268 return EXPR("SFUNC", [SFUNC(id)] + args)
269 else:
270 return ADDR(self.labels.get(id, id))
271 else:
272 self.error("Cannot parse")
274 def parse_level(self, operators, next_level):
275 e = next_level()
276 match = True
277 while match:
278 match = False
279 for op in operators:
280 if self.lex.match(op):
281 e2 = next_level()
282 e = EXPR(op, [e, e2])
283 match = True
284 break
285 return e
287 def parse_mul(self):
288 return self.parse_level(("*", "/", "%"), self.parse_primary)
290 def parse_add(self):
291 return self.parse_level(("+", "-"), self.parse_mul)
293 def parse_shift(self):
294 return self.parse_level(("<<", ">>"), self.parse_add)
296 def parse_lt_gt(self):
297 return self.parse_level(("<", "<=", ">=", ">"), self.parse_shift)
298 def parse_eq(self):
299 return self.parse_level(("==", "!="), self.parse_lt_gt)
301 def parse_band(self):
302 return self.parse_level(("&",), self.parse_eq)
303 def parse_bxor(self):
304 return self.parse_level(("^",), self.parse_band)
305 def parse_bor(self):
306 return self.parse_level(("|",), self.parse_bxor)
308 def parse_land(self):
309 return self.parse_level(("&&",), self.parse_bor)
310 def parse_lor(self):
311 return self.parse_level(("||",), self.parse_land)
313 def parse_expr(self):
314 return self.parse_lor()
317 # If there's numeric value, treat it as address. Otherwise, parse expression.
318 def parse_local_addr_expr(self):
319 if self.lex.isdigit():
320 w = self.lex.word()
321 return ADDR(self.labels.get(w, w))
322 return self.parse_expr()
324 def get_label(self, label):
325 try:
326 return self.labels[label]
327 except KeyError:
328 raise UndefinedLabel(label)
330 def label_from_addr(self, addr):
331 labels = list(filter(lambda x: x[1] == addr, self.labels.items()))
332 if not labels:
333 return addr
334 return labels[0][0]
336 def parse_inst(self, l):
337 lex = self.lex = Lexer(l)
338 if lex.peek() == ".":
339 # Asm directive, ignore
340 return None
341 if lex.match("db") or lex.match("unk"):
342 arg = self.parse_num()
343 if arg is None:
344 arg = STR(lex.l)
345 return Inst(None, "SFUNC", [EXPR("SFUNC", [SFUNC("litbyte"), arg])])
346 if lex.match("goto"):
347 return Inst(None, "goto", [self.parse_local_addr_expr()])
348 if lex.match("call"):
349 return Inst(None, "call", [self.parse_expr()])
350 if lex.match("if"):
351 args = []
352 while True:
353 if lex.match("else"):
354 c = None
355 else:
356 c = self.parse_cond(lex)
357 lex.expect("goto")
358 label = lex.match_till(",").strip()
359 addr = ADDR(self.get_label(label))
360 args.extend([c, addr])
361 if not lex.match(","):
362 break
363 return Inst(None, "if", args)
364 if lex.match("return"):
365 args = []
366 while True:
367 lex.skip_comment()
368 if lex.eol():
369 break
370 a = self.parse_expr()
371 args.append(a)
372 if not lex.eol():
373 lex.expect(",")
374 return Inst(None, "return", args)
376 dest = self.parse_expr()
377 if not dest:
378 return Inst(None, "LIT", [l])
380 if is_expr(dest) and isinstance(dest.args[0], SFUNC) and lex.eol():
381 return Inst(None, "SFUNC", [dest])
383 def make_assign_inst(dest, op, args):
384 #return Inst(dest, op, args)
385 return Inst(dest, "=", [EXPR(op, args)])
387 lex.ws()
388 if lex.match("&="):
389 lex.ws()
390 src = self.parse_expr()
391 return make_assign_inst(dest, "&", [dest, src])
392 elif lex.match("|="):
393 lex.ws()
394 src = self.parse_expr()
395 return make_assign_inst(dest, "|", [dest, src])
396 elif lex.match("^="):
397 lex.ws()
398 src = self.parse_expr()
399 return make_assign_inst(dest, "^", [dest, src])
400 elif lex.match("+="):
401 lex.ws()
402 src = self.parse_expr()
403 return make_assign_inst(dest, "+", [dest, src])
404 elif lex.match("-="):
405 lex.ws()
406 src = self.parse_expr()
407 return make_assign_inst(dest, "-", [dest, src])
408 elif lex.match("*="):
409 lex.ws()
410 src = self.parse_expr()
411 return make_assign_inst(dest, "*", [dest, src])
412 elif lex.match("/="):
413 lex.ws()
414 src = self.parse_expr()
415 return make_assign_inst(dest, "/", [dest, src])
416 elif lex.match("%="):
417 lex.ws()
418 src = self.parse_expr()
419 return make_assign_inst(dest, "%", [dest, src])
420 elif lex.match(">>="):
421 lex.ws()
422 src = self.parse_expr()
423 return make_assign_inst(dest, ">>", [dest, src])
424 elif lex.match("<<="):
425 src = self.parse_expr()
426 return make_assign_inst(dest, "<<", [dest, src])
427 elif lex.match("="):
428 lex.ws()
429 src = self.parse_expr()
430 assert src, repr(lex.l)
431 if isinstance(src, SFUNC):
432 args = self.parse_arglist()
433 return Inst(dest, "SFUNC", [src] + args)
434 lex.ws()
435 if lex.eol():
436 return Inst(dest, "=", [src])
437 else:
438 assert 0, (lex.l, self.curline)
439 else:
440 assert False, (repr(lex.l), self.curline)
443 def detect_addr_field(self, l):
444 # Autodetect whether there's address as first field
445 # of line or not.
446 self.expect_line_addr = False
447 if l[0].isdigit():
448 # Can be either label or address
449 if l[-1] == ":":
450 if " " in l:
451 return True
452 else:
453 return True
454 return False
457 def _parse_bblocks(self, f):
458 self.cfg = Graph()
459 # TODO: not stored in props to avoid modifying tests
460 self.cfg.filename = self.fname
461 self.cfg.props["name"] = None
462 self.cfg.props["trailing_jumps"] = True
463 block = None
464 last_block = None
466 l = ""
467 while l is not None:
468 for addr, l in self.get_expand_line(f):
469 if l is None:
470 break
471 if self.should_stop(addr, l):
472 return
474 #print(addr, l)
475 l = l.split(";", 1)[0]
476 l = l.strip()
477 if not l:
478 continue
480 if l[-1] == ":":
481 # label
482 if self.cfg.props["name"] is None:
483 # First label is function name
484 if block is None and self.cfg.is_empty():
485 self.cfg.props["name"] = l[:-1]
486 self.cfg.props["addr"] = addr
488 if block is not None:
489 last_block = block
490 block = BBlock(addr)
491 block.cfg = self.cfg
492 block.label = l[:-1]
493 self.cfg.add_node(addr, val=block)
495 # Link empty blocks consisting of just 1 label
496 if last_block is not None:
497 self.cfg.add_edge(last_block.addr, block.addr)
498 last_block = None
500 continue
501 elif block is None:
502 block = BBlock(addr)
503 block.cfg = self.cfg
504 self.cfg.add_node(addr, val=block)
507 if last_block is not None:
508 self.cfg.add_edge(last_block.addr, block.addr)
509 #last_block.end.append(addr)
510 last_block = None
512 if l[0] == "\x01":
513 inst = Inst(None, "LIT", [l[1:]], addr=addr)
514 block.add(inst)
515 continue
517 inst = self.parse_inst(l)
518 if not inst:
519 continue
520 inst.addr = addr
522 if inst.op in ("goto", "if", "call"):
523 if inst.op != "if":
524 pairs = [None, inst.args[0]]
525 else:
526 pairs = inst.args
528 # Add out edge to each successor block, which can be
529 # many for generalized "if".
530 for i in range(0, len(pairs), 2):
531 cond = pairs[i]
532 addr = pairs[i + 1]
533 # Skip adding bblocks for indirect jumps
534 if isinstance(addr, ADDR):
535 addr = addr.addr
536 if addr not in self.cfg:
537 self.cfg.add_node(addr)
538 self.cfg.add_edge(block.addr, addr, cond=cond)
540 if cond or inst.op == "call":
541 last_block = block
542 else:
543 last_block = None
544 block.add(inst)
545 block = None
546 elif inst.op == "return":
547 block.add(inst)
548 block = None
549 last_block = None
550 else:
551 block.add(inst)
553 if last_block:
554 print("Warning: function %s was not properly terminated" % self.cfg.props["name"])
555 # block may be None e.g. if last instruction was call,
556 # and function abruptly ended without return.
557 if block:
558 self.cfg.add_edge(last_block.addr, block.addr)
560 def parse_bblocks(self, f):
561 self.pass_no = 2
562 try:
563 self._parse_bblocks(f)
564 except ParseError as e:
565 print("Error: %d: %r" % (self.curline + 1, e))
566 raise
567 sys.exit(1)
568 # External labels may produce empty CFG nodes during parsing.
569 # Make a pass over graph and remove such.
570 for node, info in list(self.cfg.nodes_props()):
571 if not info:
572 self.cfg.remove_node(node)
575 def parse(self):
576 self.parse_labels()
577 self.curline = 0
578 with open(self.fname) as f:
579 self.parse_bblocks(f)
580 return self.cfg
582 def should_stop(self, addr, l):
583 return False