urasm: fixed local labels processing
[urasm.git] / urflibs / urasm / lexer.f
blobe3e88acc3bc8c48d1bd37fd75258187507a4e0cc
1 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ;; and now for something completely different...
3 ;; UrAsm Forth Engine!
4 ;; GPLv3 ONLY
5 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
6 ;; Z80 Assembler: input stream lexer
7 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
9 (*
10 on left paren, lexer tries to look forward to create a better token:
11 (HL) -> uo-r8, kind-(hl)
12 (I[XY]) -> uo-mrx16 kind-i[xy]-none
13 (I[XY]+ -> uo-mrx16 kind-i[xy]-add
14 (I[XY]- -> uo-mrx16 kind-i[xy]-sub
15 (C) -> uo-portc
16 (BC), (DE), (SP) -> uo-mr16 with proper kind
18 "C" is always r8. to convert it to condition, use "token-c-to-cond".
19 to convert "(HL)" to r8 use "token-(hl)->r8"
23 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
24 ;; simple expression evaluator
27 ;; cannot include instrs here
28 vocab-if-none asm-instr
31 vocab-if-none asm-lexer
32 also-defs: asm-lexer
34 ;; options
35 true value allow-continuations
36 ;; for label detection
37 true value line-start
40 ;; one-char and two-char strings sets "TOK-NUM" too
41 ;; number sign is not parsed
42 enum{
43 def: tk-eos
44 def: tk-id
45 def: tk-num
46 def: tk-str
47 def: tk-punct ;; punctuation; sets both "TK-STR" and "TK-NUM" (to char code)
48 def: tk-label ;; line start, word ends with ":"; token is without ":"
49 def: tk-mnemo ;; mnemonics;; TK-STR contains a word (uppercased); TOK-MNEMO-CFA is CFA
50 def: tk-resw ;; reserved word; TK-STR contains a word (uppercased); TOK-UO is UO-xxx; TOK-KIND is kind
53 0 value tok-addr
54 0 value tok-len
55 0 value tok-type ;; TK-xxx
56 0 value tok-num
57 0 value tok-uo
58 0 value tok-kind ;; for reserved words
59 0 value tok-mnemo-cfa
62 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
63 ;; char classifiers
66 <public-words>
68 : is-digit ( ch -- bool )
69 [char] 0 [char] 9 bounds?
72 : is-bin-digit ( ch -- bool )
73 [char] 0 [char] 1 bounds?
76 : is-oct-digit ( ch -- bool )
77 [char] 0 [char] 7 bounds?
80 : is-hex-digit ( ch -- bool )
81 dup [char] 0 [char] 9 bounds?
82 over [char] A [char] F bounds? or
83 swap [char] a [char] f bounds? or
86 : is-alpha ( ch -- bool )
87 dup [char] A [char] Z bounds?
88 swap [char] a [char] z bounds? or
91 : is-under-dot ( ch -- bool )
92 dup [char] _ =
93 swap [char] . = or
96 : is-alpha-under-dot ( ch -- bool )
97 dup is-alpha swap is-under-dot or
100 : is-alnum ( ch -- bool )
101 dup is-alpha swap is-digit or
104 : is-id-start ( ch -- bool )
105 dup is-alpha swap is-under-dot or
108 : is-id-char ( ch -- bool )
109 dup is-id-start swap is-digit or
113 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
114 ;; token checks
117 : punct? ( char -- )
118 tok-type tk-punct = swap tok-num = and
121 : punct2? ( char -- )
122 tok-type tk-punct = swap 0xff and dup 8 ash or tok-num = and
126 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
127 ;; input stream tokenizer
129 <public-words>
131 ;; return current token string
132 ;; valid until "next-token" is called
133 : token ( -- addr count ) tok-addr tok-len ;
136 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137 ;; debug
140 : dump-token ( -- )
141 ." *** TOKEN: type="
142 tok-type case
143 tk-eos of ." <EOS>\n" endof
144 tk-id of ." <ID>; str=<" token xtype ." >\n" endof
145 tk-num of ." <NUM>; num=" tok-num . ." str=<" token xtype ." >\n" endof
146 tk-str of ." <STR>; str=<" token xtype ." >\n" endof
147 tk-punct of ." <PUNCT>; num=" tok-num . ." str=<" token xtype ." >\n" endof
148 tk-label of ." <LABEL>; str=<" token xtype ." >\n" endof
149 tk-mnemo of ." <MNEMO>; str=<" token xtype ." > cfa=" tok-mnemo-cfa 0 .r cr endof
150 tk-resw of ." <RES-WORD>; str=<" token xtype ." > uo=" tok-uo . ." kind=" tok-kind 0 .r cr endof
151 otherwise . " wutafuck?" error
152 endcase
156 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
157 ;; tokenizer helpers
159 <hidden-words>
161 : char-upper ( ch -- )
162 dup [char] a [char] z bounds? if 32 - endif
165 : (init-tok-addr) ( -- ) (tib-in) to tok-addr ;
166 : (init-tok) ( -- ) (init-tok-addr) 0 to tok-len ;
167 ;; set length according to the current TIB position
168 ;; (init-tok-addr) must be already called
169 : (tok-fix-length) ( -- ) (tib-in) tok-addr - to tok-len ;
171 : (set-eos) ( -- ) (init-tok) tk-eos to tok-type ;
173 : (collect-punct) ( -- )
174 (init-tok-addr)
175 tib-getch case
176 [char] & of
177 tib-peekch [char] & = if tib-skipch endif
178 endof
179 [char] | of
180 tib-peekch [char] | = if tib-skipch endif
181 endof
182 [char] < of
183 tib-peekch case
184 [char] = of tib-skipch endof
185 [char] < of tib-skipch endof
186 [char] > of tib-skipch endof
187 endcase
188 endof
189 [char] > of
190 tib-peekch case
191 [char] = of tib-skipch endof
192 [char] > of tib-skipch endof
193 endcase
194 endof
195 [char] ! of
196 tib-peekch [char] = = if tib-skipch endif
197 endof
198 endcase
199 (tok-fix-length)
200 tok-addr tok-len case
201 1 of c@ endof
202 2 of w@ endof
203 3 of dup w@ swap 2+ c@ 16 lsh or endof
204 4 of @ endof
205 otherwise " wut?!" error
206 endcase to tok-num
207 tk-punct to tok-type
210 : (collect) ( checkcfa -- )
211 >r begin tib-peekch dup 0<> swap r@ execute and
212 while tib-skipch
213 repeat rdrop
214 (tok-fix-length)
217 : (collect-id) ( -- )
218 (init-tok) ['] is-id-char (collect)
219 tk-id to tok-type
222 : (other-pfx-dblch) ( ch -- )
223 tk-id to tok-type
224 >r tib-peekch r@ = if ;; "$$", etc.
225 tib-skipch tib-peekch r> = " invalid identifier" ?error
226 tib-peekch is-id-char if
227 +1-to tok-addr ;; convert "@@f" to "@f"
228 ['] is-id-char (collect)
229 else (tok-fix-length) endif
230 line-start if tk-label to tok-type endif
231 else rdrop ['] is-id-char (collect) endif
234 ;; we are at one-char prefix, it may be a number
235 ;; it it isn't, colled as punctuation
236 : (try-pfx-num) ( checkcfa -- )
237 (collect-punct)
238 tib-peekch swap execute if
239 ['] is-id-char (collect)
240 ;; (XNUMBER) ( addr count allowsign? -- num TRUE / FALSE )
241 token false base @ (based-number) " invalid number" ?not-error
242 to tok-num tk-num to tok-type
243 else ;; "$" is identifier
244 tok-addr c@ [char] $ = if [char] $ (other-pfx-dblch) endif
245 endif
249 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
250 ;; token classifiers
253 : (classify-id-mnemo?) ( -- success-bool )
254 token vocid: asm-instr find-word-in-voc
255 dup if
256 swap to tok-mnemo-cfa
257 tk-mnemo to tok-type
258 \ token string:upper
259 endif
262 : (check-af') ( -- )
263 tok-kind asm-resw:kind-af = if
264 tib-peekch [char] ' = if
265 tib-skipch (tok-fix-length)
266 asm-resw:uo-afx to tok-uo
267 endif
268 endif
271 : (classify-id-resw?) ( -- success-bool )
272 token vocid: asm-resw find-word-in-voc
273 dup if drop ( cfa: UO-xxx kind )
274 cfa->pfa dup @ to tok-uo
275 cell+ @ to tok-kind
276 tk-resw to tok-type
277 (check-af')
278 \ token string:upper
279 true
280 endif
283 : (classify-id-label-colon?) ( -- success-bool )
284 line-start dup if drop
285 parse-skip-blanks
286 tib-peekch [char] : = if tib-skipch
287 tk-label to tok-type
288 tok-addr c@ is-digit " invalid label" ?error
289 true
290 else false
291 endif
292 endif
295 : (classify-id-label?) ( -- success-bool )
296 line-start dup if drop
297 tok-addr tib @ = if
298 tok-addr c@ is-digit not if
299 tk-label to tok-type
300 true
301 else false
302 endif
303 else false endif
304 endif
307 : (classify-id-number?) ( -- success-bool )
308 tok-addr c@ is-digit if
309 token false base @ (based-number) " invalid number" ?not-error
310 to tok-num
311 tk-num to tok-type
312 true
313 else false
314 endif
317 : (classify-id-maybe-number?) ( -- success-bool )
318 tok-addr c@ is-id-char not if
319 token false (based-number) if
320 to tok-num
321 tk-num to tok-type
322 true
323 else false
324 endif
325 else false
326 endif
329 create (classifiers)
330 ' (classify-id-mnemo?) ,
331 ' (classify-id-resw?) ,
332 ' (classify-id-label-colon?) ,
333 ' (classify-id-label?) ,
334 ' (classify-id-number?) ,
335 ' (classify-id-maybe-number?) ,
337 create;
339 : (classify-id) ( -- )
340 (classifiers)
341 begin
342 dup @ ?dup
343 while ( addr cfa )
344 execute if drop exit endif cell+
345 repeat drop
349 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
350 ;; quoted something
353 : byte-swap ( word -- word )
354 dup 0xff and 8 lsh
355 swap -8 lsh 0xff and or
358 ;;FIXME: escape support?
359 : (collect-char-string) ( -- )
360 tib-getch >r
361 (init-tok) begin tib-getch dup " unterminated string" ?not-error
362 r@ = until
363 (tok-fix-length) tok-len 0> " invalid string" ?not-error
364 -1-to tok-len ;; final quote fix
365 tk-str to tok-type
366 tok-len case
367 1 of tok-addr c@ to tok-num endof
368 2 of
369 tok-addr w@ r@ [char] " <> if byte-swap endif ;; "
370 to tok-num
371 endof
372 otherwise drop -1 to tok-num
373 endcase
374 rdrop
378 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
379 ;; parse "(" (extended)
382 : (skip-chars) ( count -- ) for tib-skipch endfor ;
384 : (blank-char?) ( ch -- ) dup 0> swap bl <= and ;
386 ;; skip blanks until non-blank or EOL
387 ;; eofs is at non-blank or EOL
388 : (lparen-skip-blanks) ( stofs -- eofs char )
389 begin 1+ dup 1- tib-peekch-ofs (blank-char?) not-until 1-
390 dup tib-peekch-ofs
393 ;; skip blanks, check for ")"
394 ;; eofs is AFTER ")"
395 : (check-rparen) ( stofs -- eofs TRUE / FALSE )
396 (lparen-skip-blanks)
397 [char] ) = if 1+ true else drop false endif
400 ;; "(c"; TIB is at "c"
401 : (try-lparen-c) ( -- )
402 1 (check-rparen) if
403 (skip-chars) (tok-fix-length)
404 tk-resw to tok-type
405 asm-resw:uo-portc to tok-uo
406 endif
409 ;; "(.."; TIB is at the first char
410 : (try-lparen-r16) ( kind char2 -- )
411 1 tib-peekch-ofs char-upper = if
412 2 (check-rparen) if
413 (skip-chars) (tok-fix-length)
414 tk-resw to tok-type
415 asm-resw:uo-mr16 to tok-uo
416 to tok-kind
417 else drop
418 endif
419 else drop
420 endif
423 : (try-lparen-bc) ( -- ) asm-resw:kind-bc [char] C (try-lparen-r16) ;
424 : (try-lparen-de) ( -- ) asm-resw:kind-de [char] E (try-lparen-r16) ;
425 : (try-lparen-sp) ( -- ) asm-resw:kind-sp [char] P (try-lparen-r16) ;
426 : (try-lparen-hl) ( -- )
427 asm-resw:kind-(hl) [char] L (try-lparen-r16)
428 tok-uo asm-resw:uo-mr16 = if asm-resw:uo-r8 to tok-uo endif
431 ;; most complex checker: "(i[xy][+-]"
432 ;; TIB is at "I"
433 : (try-lparen-ixy) ( -- )
434 ;; parse, put initial kind on the stack
435 1 tib-peekch-ofs char-upper case
436 [char] X of asm-resw:kind-ix-none endof
437 [char] Y of asm-resw:kind-iy-none endof
438 otherwise drop exit
439 endcase
440 ( kind; check if we have + or - )
441 2 (lparen-skip-blanks) ( base-kind ch-ofs char )
442 case
443 [char] + of asm-resw:kind-ix-add endof
444 [char] - of asm-resw:kind-ix-sub endof
445 [char] ) of 0 endof
446 otherwise drop exit
447 endcase
448 ( base-kind ch-ofs ext-kind )
449 rot + swap 1+ (skip-chars) (tok-fix-length)
450 tk-resw to tok-type
451 asm-resw:uo-mrx16 to tok-uo
452 to tok-kind
456 ;; something that starts with "(".
457 ;; it can be:
458 ;; uo-mr16 -- "(r16)"
459 ;; uo-mrx16 -- "(i[xy][+-]"
460 ;; uo-portc -- "(c)"
461 : (check-lparen) ( -- )
462 ;; check next char
463 parse-skip-blanks ;; why not? allow "( hl )"
464 tib-peekch char-upper case
465 [char] I of (try-lparen-ixy) endof
466 [char] C of (try-lparen-c) endof
467 [char] B of (try-lparen-bc) endof
468 [char] D of (try-lparen-de) endof
469 [char] H of (try-lparen-hl) endof
470 [char] S of (try-lparen-sp) endof
471 endcase
475 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
476 ;; read and classify next token from the input stream
478 <public-words>
480 : next-token ( -- )
481 \ ." =====NEXT-TOKEN!" cr debug:dump-stack
482 begin
483 tib-peekch case
484 0 of (set-eos) true endof
485 bl <=of tib-skipch false endof
486 [char] ; of (set-eos) parse-skip-line true endof
487 [char] % of ['] is-bin-digit (try-pfx-num) true endof
488 [char] $ of ['] is-hex-digit (try-pfx-num) true endof
489 [char] # of ['] is-hex-digit (try-pfx-num) tok-type tk-num <> " hex number expected" ?error true endof
490 [char] & of
491 (tib-in) 1+ c@ char-upper
492 case
493 [char] H of true endof
494 [char] B of (tib-in) 2+ c@ is-bin-digit endof
495 [char] O of true endof
496 otherwise drop false
497 endcase
498 if ['] is-hex-digit (try-pfx-num) tok-type tk-num <> " hex number expected" ?error
499 else (collect-punct) endif
500 true
501 endof
502 [char] ( of ;; )
503 (collect-punct) (check-lparen)
504 true
505 endof
506 [char] @ of
507 (collect-punct) [char] @ (other-pfx-dblch)
508 (classify-id-label-colon?) ifnot (classify-id-label?) drop endif
509 true
510 endof
511 [char] ' of (collect-char-string) true endof ;; '
512 [char] " of (collect-char-string) true endof ;; "
513 [char] ` of (collect-char-string) true endof ;; `
514 [char] \ of ;; possible continuation
515 (collect-punct)
516 allow-continuations if
517 parse-skip-blanks
518 tib-peekch if true
519 else refill-nocross " unexpected end of file in line continuation" ?not-error false
520 endif
521 else true
522 endif
523 endof
524 otherwise
525 is-id-char if (collect-id) (classify-id) else (collect-punct) endif
526 true
527 endcase
528 until
529 false to line-start
530 \ ." |" TOKEN XTYPE ." | -- " TIB-PEEKCH XEMIT CR
531 \ debug:dump-stack
535 : token-eol? ( -- bool ) tok-type tk-eos = ;
536 : token-colon? ( -- bool ) tok-type tk-punct = tok-num [char] : = and ;
537 : token-comma? ( -- bool ) tok-type tk-punct = tok-num [char] , = and ;
538 : token-lparen? ( -- bool ) tok-type tk-punct = tok-num [char] ( = and ; ;; )
539 : token-rparen? ( -- bool ) tok-type tk-punct = tok-num [char] ) = and ;
540 : token-=? ( -- bool ) tok-type tk-punct = tok-num [char] = = and ;
542 ;; are we at end of line (real EOL, or ":")?
543 : token-colon-eol? ( -- bool ) token-eol? token-colon? or ;
545 : token-resw? ( -- bool ) tok-type tk-resw = ;
546 : token-uo? ( uo -- bool ) tok-uo = token-resw? and ;
547 : token-uo-kind? ( uo kind -- bool ) tok-kind = swap tok-uo = and token-resw? and ;
549 : token-regA? ( -- bool ) asm-resw:uo-r8 asm-resw:kind-a token-uo-kind? ;
550 : token-regIR? ( -- bool ) asm-resw:uo-sr8 token-uo? ;
551 : token-r8? ( -- bool ) asm-resw:uo-r8 token-uo? ;
552 : token-rx8? ( -- bool ) asm-resw:uo-rx8 token-uo? ;
553 : token-r16? ( -- bool ) asm-resw:uo-r16 token-uo? ;
554 : token-af? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-af token-uo-kind? ;
555 : token-afx? ( -- bool ) asm-resw:uo-afx token-uo? ;
556 : token-bc? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-bc token-uo-kind? ;
557 : token-de? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-de token-uo-kind? ;
558 : token-hl? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-hl token-uo-kind? ;
559 : token-sp? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-sp token-uo-kind? ;
560 : token-ix? ( -- bool ) asm-resw:uo-rx16 asm-resw:kind-ix token-uo-kind? ;
561 : token-iy? ( -- bool ) asm-resw:uo-rx16 asm-resw:kind-iy token-uo-kind? ;
562 : token-ixy? ( -- bool ) asm-resw:uo-rx16 token-uo? ;
563 : token-mr16? ( -- bool ) asm-resw:uo-mr16 token-uo? ;
564 : token-(hl)? ( -- bool ) asm-resw:uo-r8 asm-resw:kind-(hl) token-uo-kind? ;
565 : token-(bc)? ( -- bool ) asm-resw:uo-mr16 asm-resw:kind-bc token-uo-kind? ;
566 : token-(sp)? ( -- bool ) asm-resw:uo-mr16 asm-resw:kind-sp token-uo-kind? ;
567 : token-(ixy)? ( -- bool ) asm-resw:uo-mrx16 token-uo? ;
568 : token-(c)? ( -- bool ) asm-resw:uo-portc token-uo? ;
569 : token-r16-n? ( -- bool ) asm-resw:uo-r16 token-uo? tok-kind asm-resw:kind-sp <= and ;
570 : token-r16-x? ( -- bool ) asm-resw:uo-r16 token-uo? tok-kind asm-resw:kind-hl <= and ;
571 : token-mr16-x? ( -- bool ) asm-resw:uo-mr16 token-uo? tok-kind asm-resw:kind-hl <= and ;
572 : token-reg-c? ( -- bool ) asm-resw:uo-r8 asm-resw:kind-c token-uo-kind? ;
573 : token-equ? ( -- bool ) asm-resw:uo-equ token-uo? ;
574 : token-(c)/(bc)? ( -- bool ) token-(c)? token-(bc)? or ;
576 : token-(hl)->mr16 ( -- )
577 token-(hl)? if asm-resw:uo-mr16 to tok-uo asm-resw:kind-hl to tok-kind endif
580 ;; converts C register to condition
581 : token-c-to-cond ( -- )
582 token-reg-c? if
583 asm-resw:uo-cond to tok-uo
584 asm-resw:kind-cc to tok-kind
585 endif
588 : token-cond? ( -- bool )
589 token-reg-c? if token-c-to-cond true
590 else asm-resw:uo-cond token-uo?
591 endif
595 : (build-expector) ( -- ) \ name checker msg|
596 parse-name dup " name expected" ?not-error
597 compiler:(create-forth-header) [compile] ]
598 [compile] [compile]
599 parse-skip-blanks [char] | parse " message expected" ?not-error strliteral
600 compile ?not-error
603 : (build-expector-finish) ( -- )
604 compile next-token compile forth:(exit) [compile] [
605 compiler:smudge
608 : build-expector: ( -- ) \ name checker msg|
609 (build-expector) (build-expector-finish)
612 : build-expector-res: ( -- ) \ name checker msg|
613 (build-expector) compile tok-kind (build-expector-finish)
616 build-expector: expect-eol token-eol? end of line expected|
617 build-expector: expect-comma token-comma? comma expected|
618 build-expector: expect-lparen token-lparen? `(` expected|
619 build-expector: expect-rparen token-rparen? `)` expected|
620 build-expector: expect-regA token-regA? `A` register expected|
621 build-expector: expect-(c) token-(c)? `(C)` expected|
622 build-expector: expect-(c)/(bc) token-(c)/(bc)? `(C)` expected|
624 build-expector-res: expect-cond token-cond? condition code expected|
625 build-expector-res: expect-r8 token-r8? 8-bit register expected|
626 build-expector-res: expect-r16 token-r16? 16-bit register expected|
628 prev-defs
630 \ debug:dump-stack