From db9918eec341bab86b92ea64c477e6fa124812d6 Mon Sep 17 00:00:00 2001 From: Julian Squires Date: Mon, 27 Feb 2006 17:49:38 +0100 Subject: [PATCH] First stab at (possibly broken) better lexer based on cl-ppcre. --- README | 3 +- ast.lisp | 2 +- deflexer.lisp | 82 +++++++++++ lexer.lisp | 393 ++++++++++++++++++++------------------------------- m68k-assembler.asd | 6 +- machine.lisp | 20 ++- package.lisp | 2 +- system-tests.lisp | 11 +- tests/ymamoto.o.test | Bin 1657 -> 1657 bytes 9 files changed, 263 insertions(+), 256 deletions(-) create mode 100644 deflexer.lisp rewrite lexer.lisp (82%) diff --git a/README b/README index db1b312..dea7979 100644 --- a/README +++ b/README @@ -95,7 +95,8 @@ THINGS TO PUT IN TEST SUITE - conditional compilations; - ensure directives like ALIGN and EVEN don't emit unnecessary padding; -- random instruction streams tested against disassembly. +- random instruction streams tested against disassembly; +- test that * (as comment) and * (as multiply) are being handled correctly. KNOWN ISSUES diff --git a/ast.lisp b/ast.lisp index 5a6fb92..8f8a292 100644 --- a/ast.lisp +++ b/ast.lisp @@ -17,7 +17,7 @@ (when (eql (car x) 'operand) (push (cadr x) list)) (when (eql (car x) 'operands) - (setf list (append (inner-fn x) list))))) + (setf list (nconc (inner-fn x) list))))) list))) (reverse (inner-fn parse-tree)))) diff --git a/deflexer.lisp b/deflexer.lisp new file mode 100644 index 0000000..88f914e --- /dev/null +++ b/deflexer.lisp @@ -0,0 +1,82 @@ +;;; deflexer using cl-ppcre +;;; +;;; Heavily stolen from +;;; http://common-lisp.net/pipermail/cl-ppcre-devel/2004-June/000041.html +;;; by Edi Weitz, combined with other stuff found in the CL-PPCRE code. +;;; +;;; Hacked together by Julian Squires / 2006. +;;; XXX Needs heavy refactoring! + +(defpackage :cl-ppcre-lex + (:use :cl :cl-ppcre :anaphora) + (:import-from :cl-ppcre #:nsubseq) + (:export #:deflexer)) + +(in-package :cl-ppcre-lex) + +(eval-when (:compile-toplevel :load-toplevel :execute) + (defmacro with-unique-names ((&rest bindings) &body body) + ;; see + `(let ,(mapcar #'(lambda (binding) + (check-type binding (or cons symbol)) + (if (consp binding) + (destructuring-bind (var x) binding + (check-type var symbol) + `(,var (gensym ,(etypecase x + (symbol (symbol-name x)) + (character (string x)) + (string x))))) + `(,binding (gensym ,(symbol-name binding))))) + bindings) + ,@body)) + + (defun collect-bindings (var-list string reg-starts reg-ends) + (loop for var in var-list + for counter from 0 + when var + collect `(,var (awhen (aref ,reg-starts ,counter) + (nsubseq ,string it + (aref ,reg-ends ,counter)))))) + + (defun gather-fns (list) + (with-unique-names (scanner string start match-start match-end + reg-starts reg-ends next-pos-fn) + (loop for x in list + collect + (destructuring-bind (regex (&rest var-list) &body body) x + (let ((bindings (collect-bindings var-list string reg-starts reg-ends))) + `(cons ,regex + (lambda (,scanner ,string ,start ,next-pos-fn) + (multiple-value-bind + (,match-start ,match-end ,reg-starts ,reg-ends) + (cl-ppcre:scan ,scanner ,string :start ,start) + ,@(unless bindings + `((declare (ignore ,reg-starts ,reg-ends)))) + (when ,match-start + (prog1 + (progn ,@(if bindings + `((let (,@bindings) ,@body)) + body)) + (funcall ,next-pos-fn ,match-end)))))))))))) + +(defmacro deflexer (name &body body) + (with-unique-names (regex-table regex sexpr-regex anchored-regex function) + `(let ((,regex-table + (loop for (,regex . ,function) in (list ,@(gather-fns body)) + for ,sexpr-regex = + (etypecase ,regex + (function + (error "Compiled scanners are not allowed here")) + (string + (cl-ppcre::parse-string ,regex)) + (list + ,regex)) + for ,anchored-regex = + (cl-ppcre:create-scanner `(:sequence + :modeless-start-anchor + ,,sexpr-regex)) + collect (cons ,anchored-regex ,function)))) + (defun ,name (string next-pos-fn &key ((:start start) 0)) + (loop for (scanner . function) in ,regex-table + for value = (funcall function scanner string start next-pos-fn) + when value do (return value)))))) diff --git a/lexer.lisp b/lexer.lisp dissimilarity index 82% index 98eac4a..60e1460 100644 --- a/lexer.lisp +++ b/lexer.lisp @@ -1,237 +1,156 @@ - -(in-package :m68k-assembler) - -;;;; TOKEN PARAMETERS - -(defparameter *lexer-terminals* - '(open close colon comma hash + - / * or & ^ ~ << >> - constant symbol register opcode pseudo-op $)) - -(defparameter *lexer-single-char-tokens* - '((#\( open) - (#\) close) - (#\: colon) - (#\, comma) - (#\# hash) - (#\+ +) (#\- -) (#\/ /) (#\* *) - (#\| or) (#\& &) (#\^ ^) (#\~ ~))) - -(defun list-char-range (start end) - (do ((x (char-code start) (1+ x)) - (l nil)) - ((> x (char-code end)) l) - (push (code-char x) l))) - -(defparameter *lexer-word-characters* - `(,@(list-char-range #\A #\Z) - ,@(list-char-range #\a #\z) - ,@(list-char-range #\0 #\9) - #\_ #\. #\= #\\ #\@) - "Characters permitted in a symbol, register, or opcode.") -(defparameter *lexer-int-characters* `(,@(list-char-range #\0 #\9)) - "Characters permitted in an integer.") -(defparameter *lexer-whitespace-characters* '(#\Space #\Tab)) - - -;;;; LEXER BOOKKEEPING - -(defstruct lexer-state - (stream) - (filename) - (line 1) - (column 1)) - -(defvar *lexer-states* nil) -;;; *l-s-o-w-t-l-p* doesn't need to keep multi-file state. -(defvar *lexer-seen-only-whitespace-this-line-p*) - -(defmacro with-lexer ((filename) &body body) - `(unwind-protect (progn - (init-lexer ,filename) - ,@body) - (close-lexer))) - -(defun init-lexer (filename) - (setf *lexer-states* nil) - (nested-lexing filename)) - -(defun close-lexer () - (do ((s #1=(pop *lexer-states*) #1#)) - ((null s)) - (close (lexer-state-stream s)))) - -(defun lexer-next-line () - (setf *lexer-seen-only-whitespace-this-line-p* t - (lexer-state-column (first *lexer-states*)) 1) - (incf (lexer-state-line (first *lexer-states*)))) - -(defun lexer-next-column () - (incf (lexer-state-column (first *lexer-states*)))) - -(defun nested-lexing (filename) - (cond ((find filename *lexer-states* :test #'string-equal - :key #'lexer-state-filename) - (warn "~A is already in the chain of INCLUDES! Ignoring it..." - filename)) - (t - (setf *lexer-seen-only-whitespace-this-line-p* t) - (push (make-lexer-state :stream (open filename) - :filename filename) - *lexer-states*)))) - -;;;; INTERMEDIARY LEXING FUNCTIONS (EATERS) - -(defun eat-whitespace (stream) - (do ((next-char #1=(peek-char nil stream) #1#)) - ((not (find next-char *lexer-whitespace-characters*))) - (read-char stream) - (lexer-next-column))) - -(defun eat-string (stream &optional (start #\") (end #\")) - "Reads a backslash-escaped string from STREAM, delimited by the -characters START and END (which default to quotes)." - (assert (eql (read-char stream) start)) - (lexer-next-column) - (do ((next-char #1=(progn (lexer-next-column) - (read-char stream)) #1#) - (string (make-array '(0) :element-type 'character - :adjustable t :fill-pointer 0))) - ((eql next-char end) string) - (acase next-char - (#\\ (vector-push-extend (read-char stream) string) - (lexer-next-column)) - (t (vector-push-extend it string))))) - -(defconstant +int-conversion-table+ (load-time-value - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ")) -(defun digit-to-int (char &optional (radix 10)) - (position char +int-conversion-table+ :test #'char-equal :end radix)) - -(defun eat-integer (stream &optional (radix 10)) - "If the next character is a digit, read digits until the next -character is not a digit." - (do ((next-char #1=(peek-char nil stream) #1#) int) - ((not (digit-to-int next-char radix)) int) - (setf int (+ (* (or int 0) radix) - (digit-to-int (read-char stream) radix))) - (lexer-next-column))) - -(defun eat-hex (stream) - (assert (eql (read-char stream) #\$)) - (lexer-next-column) - (eat-integer stream 16)) - -(defun eat-binary (stream) - (assert (eql (read-char stream) #\%)) - (lexer-next-column) - (eat-integer stream 2)) - -(defun eat-symbol (stream) - (do ((next-char #1=(peek-char nil stream) #1#) - (symbol (make-array '(0) :element-type 'character - :adjustable t :fill-pointer 0))) - ((not (find next-char *lexer-word-characters*)) symbol) - (lexer-next-column) - (vector-push-extend (read-char stream) symbol))) - - -;;;; MAIN LEXER FUNCTION - -(defun next-token () - (handler-case - (if *lexer-states* - (next-token-1 (lexer-state-stream (first *lexer-states*))) - (signal 'end-of-file)) - (end-of-file nil - (pop *lexer-states*) - (if *lexer-states* (next-token) (signal 'end-of-file))))) - -(defun next-token-1 (stream) - (eat-whitespace stream) - (let ((lookahead (peek-char nil stream))) - ;; XXX ugly first-column asterix hack - (unless (member lookahead '(#\* #\Return #\Newline #\;)) - (setf *lexer-seen-only-whitespace-this-line-p* nil)) - (acond - ((find lookahead *lexer-single-char-tokens* :key #'car) - ;; XXX ugly first-column asterix hack - (cond (*lexer-seen-only-whitespace-this-line-p* - (read-line stream) - (lexer-next-line) - (next-token-1 stream)) - (t (read-char stream) - (make-token (cadr it) lookahead)))) - ;; The order of the following few cases is significant. - ((member lookahead *lexer-int-characters*) - (make-token 'constant (eat-integer stream))) - ((eql lookahead #\$) (make-token 'constant (eat-hex stream))) - ((eql lookahead #\%) (make-token 'constant (eat-binary stream))) - ((member lookahead *lexer-word-characters*) - (let ((token (eat-symbol stream))) - (multiple-value-bind (string modifier) (munge-modifier token) - (setf modifier (string-to-modifier modifier)) - (cond ((register-p (register-substitutions string)) - (make-token 'register (list (register-substitutions string) - modifier))) - ((opcode-p string) (make-token 'opcode - (list string modifier))) - ((pseudo-op-p string) (make-token 'pseudo-op - (list string modifier))) - (t (make-token 'symbol token)))))) - ((eql lookahead #\") ; string - (make-token 'constant (eat-string stream))) - - ;; Little special cases. - ((eql lookahead #\<) ; expect < or macro parameter - (read-char stream) - (cond ((eql (peek-char nil stream) #\<) - (read-char stream) - (make-token '<< nil)) - ;; XXX: one problem here is that we don't deal with - ;; escaped <'s inside the string, and I'm not too anxious - ;; to do so yet, either. - (t (unread-char #\< stream) - (make-token 'symbol (eat-string stream #\< #\>))))) - ((eql lookahead #\>) ; expect > - (read-char stream) - (assert (eql (read-char stream) #\>)) - (make-token '>> nil)) - ((eql lookahead #\\) - ;; if it's a macro parameter (\[1-9A-Za-z] or \@) store it as a - ;; symbol ... MACRO will know what to do with it. - ;; also there's \symbol and \$symbol but I'm not sure if we'll - ;; support them yet. - ;; otherwise, it might be a line continuation token. XXX - (read-char stream) - (when (or (char-equal (peek-char nil stream) #\@) - (member (peek-char nil stream) *lexer-word-characters*)) - (make-token 'symbol (concatenate 'string "\\" (eat-symbol stream))))) - ((eql lookahead #\;) ; comment - (read-line stream) - (maybe-return-$ stream)) - ((eql lookahead #\Return) ; stupid ^M - (read-line stream) - (maybe-return-$ stream)) - ((eql lookahead #\Newline) - (read-line stream) - (maybe-return-$ stream))))) - -;;;; LEXER HELPERS - -(defun maybe-return-$ (stream) - (cond (*lexer-seen-only-whitespace-this-line-p* - (lexer-next-line) (next-token-1 stream)) - (t (lexer-next-line) (make-token '$ nil)))) - -(defun make-token (symbol value) - (list symbol value (copy-lexer-state (first *lexer-states*)))) - -(defun terminal-p (symbol) (member symbol *lexer-terminals*)) - -(defun is-position-info-p (x) (lexer-state-p x)) - -(defun string-to-modifier (string) - (cond ((string-equal string "b") 'byte) - ((string-equal string "w") 'word) - ((string-equal string "l") 'long))) - + +(in-package :m68k-assembler) + +;;;; LEXER BOOKKEEPING + +(defstruct lexer-state + (stream) + (filename) + (line 0) + (column 0) + (current-string nil)) + +(defvar *lexer-states* nil) +;;; *l-s-o-w-t-l-p* doesn't need to keep multi-file state. +(defvar *lexer-seen-only-whitespace-this-line-p*) + +(defmacro with-lexer ((filename) &body body) + `(unwind-protect (progn + (init-lexer ,filename) + ,@body) + (close-lexer))) + +(defun init-lexer (filename) + (setf *lexer-states* nil) + (nested-lexing filename)) + +(defun close-lexer () + (do ((s #1=(pop *lexer-states*) #1#)) + ((null s)) + (close (lexer-state-stream s)))) + +(defun lexer-next-line () + (setf (lexer-state-column (first *lexer-states*)) 0) + (incf (lexer-state-line (first *lexer-states*)))) + +(defun lexer-next-column () + (incf (lexer-state-column (first *lexer-states*)))) + +(defun update-lexer-column (c) + (assert (<= (lexer-state-column (first *lexer-states*)) c)) + (setf (lexer-state-column (first *lexer-states*)) c)) + +(defun nested-lexing (filename) + (cond ((find filename *lexer-states* :test #'string-equal + :key #'lexer-state-filename) + (warn "~A is already in the chain of INCLUDES! Ignoring it..." + filename)) + (t + (setf *lexer-seen-only-whitespace-this-line-p* t) + (push (make-lexer-state :stream (open filename) + :filename filename) + *lexer-states*)))) + +;;;; MAIN LEXER FUNCTION + +(defun next-token () + (handler-bind ((end-of-file (lambda (condition) + (pop *lexer-states*) + (if *lexer-states* + (next-token) + (error condition))))) + (unless *lexer-states* + (error 'end-of-file :stream nil)) + (multiple-value-bind (string column) + (ensure-lexer-data (first *lexer-states*)) + (when (and (zerop column) (not *lexer-seen-only-whitespace-this-line-p*)) + (return-from next-token (maybe-return-$))) + (atypecase (devpac-lexer string #'update-lexer-column :start column) + (atom (next-token)) + (t (setf *lexer-seen-only-whitespace-this-line-p* nil) + it))))) + +(defun ensure-lexer-data (state) + (symbol-macrolet ((string (lexer-state-current-string state))) + (unless (and string + (< (lexer-state-column state) + (length string))) + (setf string + (read-possibly-escaped-line (lexer-state-stream state)))) + (values string (lexer-state-column state)))) + +(defun read-possibly-escaped-line (stream) + (loop for line = (read-line stream) then + (concatenate 'string + (make-array (list (1- (length line))) + :displaced-to line) + (read-line stream)) + do (lexer-next-line) + unless (and (plusp (length line)) + (char= (schar line (1- (length line))) #\\)) + return line)) + +(cl-ppcre-lex:deflexer devpac-lexer + ("^[ \\t]*\\*.*$" () 'whitespace) + ("[ \\t]+" () 'whitespace) + ("([\\r\\n\\f]+|$)" () 'whitespace) + ("[;]+.*$" () 'whitespace) + ("([\\-():,#+/*|&^~])" + (single) (assert single) + (let ((it (find (char single 0) + '((#\( open) (#\) close) (#\: colon) (#\, comma) + (#\# hash) (#\+ +) (#\- -) (#\/ /) (#\* *) (#\| or) + (#\& &) (#\^ ^) (#\~ ~)) + :key #'car))) + (assert it) + (make-token (second it) (first it)))) + ("([0-9]+)" (digits) (assert digits) + (make-token 'constant (parse-integer digits))) + ("\\$([0-9A-Fa-f]+)" (digits) (assert digits) + (make-token 'constant (parse-integer digits :radix 16))) + ("%([01]+)" (digits) (assert digits) + (make-token 'constant (parse-integer digits :radix 2))) + ("([A-Za-z0-9_@.]+)(\\.[bBwWlL])" + (string modifier) (assert (and string modifier)) + (setf modifier (when modifier (string-to-modifier modifier))) + (acond ((register-p (register-substitutions string)) + (make-token 'register (list (register-substitutions string) + modifier))) + ((opcode-p string) (make-token 'opcode (list string modifier))) + ((pseudo-op-p string) (make-token 'pseudo-op (list string modifier))) + (t (make-token 'symbol string)))) + ("([A-Za-z0-9_@.]+)" + (string) (assert string) + (acond ((register-p (register-substitutions string)) + (make-token 'register (list (register-substitutions string) nil))) + ((opcode-p string) (make-token 'opcode (list string nil))) + ((pseudo-op-p string) (make-token 'pseudo-op (list string nil))) + (t (make-token 'symbol string)))) + ("=" () (make-token 'pseudo-op (list "=" nil))) + ("\"([^\"]*)\"" (string) (make-token 'string string)) + ("<<" () (make-token '<< nil)) + (">>" () (make-token '>> nil)) + ("<([^<>]*)>" (string) (make-token 'symbol string)) + ("(\\\\[1-9A-Za-z@])" (string) (make-token 'symbol string))) + +;;;; LEXER HELPERS + +(defun make-token (symbol value) + (list symbol value (copy-lexer-state (first *lexer-states*)))) + +(defun terminal-p (symbol) + (member symbol '(open close colon comma hash + - / * or & ^ ~ << >> + constant symbol register opcode pseudo-op $))) + +(defun is-position-info-p (x) (lexer-state-p x)) + +(defun string-to-modifier (string) + (let ((start (if (char= (char string 0) #\.) 1 0))) + (cond ((string-equal string "b" :start1 start) 'byte) + ((string-equal string "w" :start1 start) 'word) + ((string-equal string "l" :start1 start) 'long)))) + +(defun maybe-return-$ () + (cond (*lexer-seen-only-whitespace-this-line-p* 'whitespace) + (t (setf *lexer-seen-only-whitespace-this-line-p* t) + (make-token '$ nil)))) diff --git a/m68k-assembler.asd b/m68k-assembler.asd index e1ce246..6388195 100644 --- a/m68k-assembler.asd +++ b/m68k-assembler.asd @@ -5,12 +5,16 @@ (defsystem m68k-assembler :depends-on (:anaphora :cl-ppcre :osicat :rt) + :author "Julian Squires " + :version "alpha zero" + :components ((:file "package") (:file "special-variables" :depends-on ("package")) (:file "utils" :depends-on ("package")) (:file "machine" :depends-on ("package" "utils")) - (:file "lexer" :depends-on ("package" "machine" "utils")) + (:file "deflexer") + (:file "lexer" :depends-on ("package" "machine" "utils" "deflexer")) (:file "parser" :depends-on ("package")) (:file "ast" :depends-on ("package" "utils" "lexer")) (:file "sections" :depends-on ("package" "utils" "special-variables")) diff --git a/machine.lisp b/machine.lisp index f56649c..e059366 100644 --- a/machine.lisp +++ b/machine.lisp @@ -221,10 +221,10 @@ ;; not. ("BTST" (((byte long) (data-register) (data-addressing-modes)) - ((4 #b0000) (3 (register-idx first-operand)) - (3 #b100) - (6 (effective-address-mode second-operand modifier)) - (? (effective-address-extra second-operand modifier)))) + ((4 #b0000) (3 (register-idx first-operand)) + (3 #b100) + (6 (effective-address-mode second-operand modifier)) + (? (effective-address-extra second-operand modifier)))) (((byte long) (immediate) (data-addressing-modes)) ((10 #b0000100000) (6 (effective-address-mode second-operand modifier)) @@ -238,18 +238,16 @@ (? (effective-address-extra first-operand modifier))))) ,@(mapcar - (lambda (x) + (lambda (x y) `(,x (((byte word long) (data-alterable-modes)) ((5 #b01000) - (3 ,(cond ((string= x "NOT") #b110) - ((string= x "NEG") #b100) - ((string= x "NEGX") #b000) - ((string= x "CLR") #b010))) + (3 ,y) (2 (modifier-bits modifier)) (6 (effective-address-mode first-operand modifier)) (? (effective-address-extra first-operand modifier)))))) - '("CLR" "NEG" "NEGX" "NOT")) + '("CLR" "NEG" "NEGX" "NOT") + '(#b010 #b100 #b000 #b110)) ("CMP" "CMPI" ; I is faster @@ -374,7 +372,7 @@ (3 (register-idx second-operand)) (3 #b001) (6 (effective-address-mode first-operand modifier)) (? (effective-address-extra first-operand modifier))))) - ("MOVEC") ;; XXX I don't understand the syntax for this one. +;; ("MOVEC") ;; XXX I don't understand the syntax for this one. ("MOVEM" (((word long) (register-list) (movem-pre-modes)) ((9 #b010010001) (1 (if-eql-word-p modifier 0 1)) diff --git a/package.lisp b/package.lisp index af89398..2940dc5 100644 --- a/package.lisp +++ b/package.lisp @@ -1,6 +1,6 @@ (defpackage #:m68k-assembler (:nicknames #:m68k-asm) - (:use #:cl #:anaphora) + (:use #:cl #:anaphora #:cl-ppcre) (:export #:assemble)) (defpackage #:m68k-assembler-tests diff --git a/system-tests.lisp b/system-tests.lisp index 0e4425d..002762e 100644 --- a/system-tests.lisp +++ b/system-tests.lisp @@ -2,12 +2,14 @@ (in-package :m68k-assembler-tests) +(defparameter *base-path* "/home/julian/projects/m68k-assembler/") + ;; XXX this doesn't actually work, because the symbol table outputs ;; differ between compilers. (deftest ymamoto.regress.1 - (let ((orig-filename "tests/ymamoto.o") - (our-filename "tests/ymamoto.o.test")) - (m68k-assembler:assemble "tests/ymamoto.s" + (let ((orig-filename (merge-pathnames "tests/ymamoto.o.test" *base-path*)) + (our-filename (merge-pathnames "tests/ymamoto.o" *base-path*))) + (m68k-assembler:assemble (merge-pathnames "tests/ymamoto.s" *base-path*) :object-name our-filename) (with-open-file (original orig-filename :direction :input :element-type 'unsigned-byte) @@ -22,5 +24,6 @@ while (and a b)) (unless (and (equal (read-byte new nil nil) nil) (equal (read-byte original nil nil) nil)) - (error "Outputs not the same at EOF"))))) + (error "Outputs not the same at EOF")))) + t) t) diff --git a/tests/ymamoto.o.test b/tests/ymamoto.o.test index c12a3cbe83bfe6e4afecdb0b39171b4efe1ce715..5249a3c9e9a87c9ccf2ada5e239052dcc39a0eac 100644 GIT binary patch delta 315 zcwYO-^OI+T2WveG3j+g_0s{l{8z}t@N>@Q>raKG_j3Al~!e`82U|;~zDNs5AO24t)w{F3;T)Fk85^!SwW{Gt?w%G|`<{F40m%)HDJ TIHxGJI2FMwEl5c$No4>4iySM2 delta 295 zcwYO-^OI+T2WvgUH3kMI1qKG@XHfbLgl2SMU|;~z0T7y@4Juy+p;=fUG(!(meFjv$ z1C;iG(h*SF07~0H&Eo>=t7l*kfEthjm3RQ*GqFMGJ3u**ya&{P7$_YAr4yj~7#JBC zrU0o15EIA-nFj()oD2*JP&PAAycI~>Rput<=9lEhi -- 2.11.4.GIT