urasm: 2-char string literals now processed as in the C UrAsm
[urasm.git] / urflibs / urasm / lexer.f
blobc78585a2b1c6593c39484c79cddf9c84e9e04efe
1 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ;; and now for something completely different...
3 ;; UrAsm Forth Engine!
4 ;; GPLv3 ONLY
5 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
6 ;; Z80 Assembler: input stream lexer
7 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
9 (*
10 on left paren, lexer tries to look forward to create a better token:
11 (HL) -> uo-r8, kind-(hl)
12 (I[XY]) -> uo-mrx16 kind-i[xy]-none
13 (I[XY]+ -> uo-mrx16 kind-i[xy]-add
14 (I[XY]- -> uo-mrx16 kind-i[xy]-sub
15 (C) -> uo-portc
16 (BC), (DE), (SP) -> uo-mr16 with proper kind
18 "C" is always r8. to convert it to condition, use "token-c-to-cond".
19 to convert "(HL)" to r8 use "token-(hl)->r8"
23 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
24 ;; simple expression evaluator
27 ;; cannot include instrs here
28 vocab-if-none asm-instr
31 vocab-if-none asm-lexer
32 also-defs: asm-lexer
34 ;; options
35 true value allow-continuations
36 ;; for label detection
37 true value line-start
40 ;; one-char and two-char strings sets "TOK-NUM" too
41 ;; number sign is not parsed
42 enum{
43 def: tk-eos
44 def: tk-id
45 def: tk-num
46 def: tk-str
47 def: tk-punct ;; punctuation; sets both "TK-STR" and "TK-NUM" (to char code)
48 def: tk-label ;; line start, word ends with ":"; token is without ":"
49 def: tk-mnemo ;; mnemonics;; TK-STR contains a word (uppercased); TOK-MNEMO-CFA is CFA
50 def: tk-resw ;; reserved word; TK-STR contains a word (uppercased); TOK-UO is UO-xxx; TOK-KIND is kind
53 0 value tok-addr
54 0 value tok-len
55 0 value tok-type ;; TK-xxx
56 0 value tok-num
57 0 value tok-uo
58 0 value tok-kind ;; for reserved words
59 0 value tok-mnemo-cfa
62 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
63 ;; char classifiers
66 <public-words>
68 : is-digit ( ch -- bool )
69 [char] 0 [char] 9 bounds?
72 : is-bin-digit ( ch -- bool )
73 [char] 0 [char] 1 bounds?
76 : is-oct-digit ( ch -- bool )
77 [char] 0 [char] 7 bounds?
80 : is-hex-digit ( ch -- bool )
81 dup [char] 0 [char] 9 bounds?
82 over [char] A [char] F bounds? or
83 swap [char] a [char] f bounds? or
86 : is-alpha ( ch -- bool )
87 dup [char] A [char] Z bounds?
88 swap [char] a [char] z bounds? or
91 : is-under-dot ( ch -- bool )
92 dup [char] _ =
93 swap [char] . = or
96 : is-alpha-under-dot ( ch -- bool )
97 dup is-alpha swap is-under-dot or
100 : is-alnum ( ch -- bool )
101 dup is-alpha swap is-digit or
104 : is-id-start ( ch -- bool )
105 dup is-alpha swap is-under-dot or
108 : is-id-char ( ch -- bool )
109 dup is-id-start swap is-digit or
113 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
114 ;; token checks
117 : punct? ( char -- )
118 tok-type tk-punct = swap tok-num = and
121 : punct2? ( char -- )
122 tok-type tk-punct = swap 0xff and dup 8 ash or tok-num = and
126 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
127 ;; input stream tokenizer
129 <public-words>
131 ;; return current token string
132 ;; valid until "next-token" is called
133 : token ( -- addr count ) tok-addr tok-len ;
136 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137 ;; tokenizer helpers
139 <hidden-words>
141 : char-upper ( ch -- )
142 dup [char] a [char] z bounds? if 32 - endif
145 : (init-tok-addr) ( -- ) (tib-in) to tok-addr ;
146 : (init-tok) ( -- ) (init-tok-addr) 0 to tok-len ;
147 ;; set length according to the current TIB position
148 ;; (init-tok-addr) must be already called
149 : (tok-fix-length) ( -- ) (tib-in) tok-addr - to tok-len ;
151 : (set-eos) ( -- ) (init-tok) tk-eos to tok-type ;
153 : (collect-punct) ( -- )
154 (init-tok-addr)
155 tib-getch case
156 [char] & of
157 tib-peekch [char] & = if tib-skipch endif
158 endof
159 [char] | of
160 tib-peekch [char] | = if tib-skipch endif
161 endof
162 [char] < of
163 tib-peekch case
164 [char] = of tib-skipch endof
165 [char] < of tib-skipch endof
166 [char] > of tib-skipch endof
167 endcase
168 endof
169 [char] > of
170 tib-peekch case
171 [char] = of tib-skipch endof
172 [char] > of tib-skipch endof
173 endcase
174 endof
175 [char] ! of
176 tib-peekch [char] = = if tib-skipch endif
177 endof
178 endcase
179 (tok-fix-length)
180 tok-addr tok-len case
181 1 of c@ endof
182 2 of w@ endof
183 3 of dup w@ swap 2+ c@ 16 lsh or endof
184 4 of @ endof
185 otherwise " wut?!" error
186 endcase to tok-num
187 tk-punct to tok-type
190 : (collect) ( checkcfa -- )
191 >r begin tib-peekch dup 0<> swap r@ execute and
192 while tib-skipch
193 repeat rdrop
194 (tok-fix-length)
197 : (collect-id) ( -- )
198 (init-tok) ['] is-id-char (collect)
199 tk-id to tok-type
202 : (other-pfx-dblch) ( ch -- )
203 tk-id to tok-type
204 >r tib-peekch r@ = if ;; "$$", etc.
205 tib-skipch tib-peekch r> = " invalid identifier" ?error
206 tib-peekch is-id-char if
207 +1-to tok-addr ;; convert "@@f" to "@f"
208 ['] is-id-char (collect)
209 else (tok-fix-length) endif
210 line-start if tk-label to tok-type endif
211 else rdrop endif
214 ;; we are at one-char prefix, it may be a number
215 ;; it it isn't, colled as punctuation
216 : (try-pfx-num) ( checkcfa -- )
217 (collect-punct)
218 tib-peekch swap execute if
219 ['] is-id-char (collect)
220 ;; (XNUMBER) ( addr count allowsign? -- num TRUE / FALSE )
221 token false base @ (based-number) " invalid number" ?not-error
222 to tok-num tk-num to tok-type
223 else ;; "$" is identifier
224 tok-addr c@ [char] $ = if [char] $ (other-pfx-dblch) endif
225 endif
229 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
230 ;; token classifiers
233 : (classify-id-mnemo?) ( -- success-bool )
234 token vocid: asm-instr find-word-in-voc
235 dup if
236 swap to tok-mnemo-cfa
237 tk-mnemo to tok-type
238 \ token string:upper
239 endif
242 : (check-af') ( -- )
243 tok-kind asm-resw:kind-af = if
244 tib-peekch [char] ' = if
245 tib-skipch (tok-fix-length)
246 asm-resw:uo-afx to tok-uo
247 endif
248 endif
251 : (classify-id-resw?) ( -- success-bool )
252 token vocid: asm-resw find-word-in-voc
253 dup if drop ( cfa: UO-xxx kind )
254 cfa->pfa dup @ to tok-uo
255 cell+ @ to tok-kind
256 tk-resw to tok-type
257 (check-af')
258 \ token string:upper
259 true
260 endif
263 : (classify-id-label-colon?) ( -- success-bool )
264 line-start dup if drop
265 parse-skip-blanks
266 tib-peekch [char] : = if tib-skipch
267 tk-label to tok-type
268 tok-addr c@ is-digit " invalid label" ?error
269 true
270 else false
271 endif
272 endif
275 : (classify-id-label?) ( -- success-bool )
276 line-start dup if drop
277 tok-addr tib @ = if
278 tok-addr c@ is-digit not if
279 tk-label to tok-type
280 true
281 else false
282 endif
283 else false endif
284 endif
287 : (classify-id-number?) ( -- success-bool )
288 tok-addr c@ is-digit if
289 token false base @ (based-number) " invalid number" ?not-error
290 to tok-num
291 tk-num to tok-type
292 true
293 else false
294 endif
297 : (classify-id-maybe-number?) ( -- success-bool )
298 tok-addr c@ is-id-char not if
299 token false (based-number) if
300 to tok-num
301 tk-num to tok-type
302 true
303 else false
304 endif
305 else false
306 endif
309 create (classifiers)
310 ' (classify-id-mnemo?) ,
311 ' (classify-id-resw?) ,
312 ' (classify-id-label-colon?) ,
313 ' (classify-id-label?) ,
314 ' (classify-id-number?) ,
315 ' (classify-id-maybe-number?) ,
317 create;
319 : (classify-id) ( -- )
320 (classifiers)
321 begin
322 dup @ ?dup
323 while ( addr cfa )
324 execute if drop exit endif cell+
325 repeat drop
329 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
330 ;; quoted something
333 : byte-swap ( word -- word )
334 dup 0xff and 8 lsh
335 swap -8 lsh 0xff and or
338 ;;FIXME: escape support?
339 : (collect-char-string) ( -- )
340 tib-getch >r
341 (init-tok) begin tib-getch dup " unterminated string" ?not-error
342 r@ = until
343 (tok-fix-length) tok-len 0> " invalid string" ?not-error
344 -1-to tok-len ;; final quote fix
345 tk-str to tok-type
346 tok-len case
347 1 of tok-addr c@ to tok-num endof
348 2 of
349 tok-addr w@ r@ [char] " <> if byte-swap endif ;; "
350 to tok-num
351 endof
352 otherwise drop -1 to tok-num
353 endcase
354 rdrop
358 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
359 ;; parse "(" (extended)
362 : (skip-chars) ( count -- ) for tib-skipch endfor ;
364 : (blank-char?) ( ch -- ) dup 0> swap bl <= and ;
366 ;; skip blanks until non-blank or EOL
367 ;; eofs is at non-blank or EOL
368 : (lparen-skip-blanks) ( stofs -- eofs char )
369 begin 1+ dup 1- tib-peekch-ofs (blank-char?) not-until 1-
370 dup tib-peekch-ofs
373 ;; skip blanks, check for ")"
374 ;; eofs is AFTER ")"
375 : (check-rparen) ( stofs -- eofs TRUE / FALSE )
376 (lparen-skip-blanks)
377 [char] ) = if 1+ true else drop false endif
380 ;; "(c"; TIB is at "c"
381 : (try-lparen-c) ( -- )
382 1 (check-rparen) if
383 (skip-chars) (tok-fix-length)
384 tk-resw to tok-type
385 asm-resw:uo-portc to tok-uo
386 endif
389 ;; "(.."; TIB is at the first char
390 : (try-lparen-r16) ( kind char2 -- )
391 1 tib-peekch-ofs char-upper = if
392 2 (check-rparen) if
393 (skip-chars) (tok-fix-length)
394 tk-resw to tok-type
395 asm-resw:uo-mr16 to tok-uo
396 to tok-kind
397 else drop
398 endif
399 else drop
400 endif
403 : (try-lparen-bc) ( -- ) asm-resw:kind-bc [char] C (try-lparen-r16) ;
404 : (try-lparen-de) ( -- ) asm-resw:kind-de [char] E (try-lparen-r16) ;
405 : (try-lparen-sp) ( -- ) asm-resw:kind-sp [char] P (try-lparen-r16) ;
406 : (try-lparen-hl) ( -- )
407 asm-resw:kind-(hl) [char] L (try-lparen-r16)
408 tok-uo asm-resw:uo-mr16 = if asm-resw:uo-r8 to tok-uo endif
411 ;; most complex checker: "(i[xy][+-]"
412 ;; TIB is at "I"
413 : (try-lparen-ixy) ( -- )
414 ;; parse, put initial kind on the stack
415 1 tib-peekch-ofs char-upper case
416 [char] X of asm-resw:kind-ix-none endof
417 [char] Y of asm-resw:kind-iy-none endof
418 otherwise drop exit
419 endcase
420 ( kind; check if we have + or - )
421 2 (lparen-skip-blanks) ( base-kind ch-ofs char )
422 case
423 [char] + of asm-resw:kind-ix-add endof
424 [char] - of asm-resw:kind-ix-sub endof
425 [char] ) of 0 endof
426 otherwise drop exit
427 endcase
428 ( base-kind ch-ofs ext-kind )
429 rot + swap 1+ (skip-chars) (tok-fix-length)
430 tk-resw to tok-type
431 asm-resw:uo-mrx16 to tok-uo
432 to tok-kind
436 ;; something that starts with "(".
437 ;; it can be:
438 ;; uo-mr16 -- "(r16)"
439 ;; uo-mrx16 -- "(i[xy][+-]"
440 ;; uo-portc -- "(c)"
441 : (check-lparen) ( -- )
442 ;; check next char
443 parse-skip-blanks ;; why not? allow "( hl )"
444 tib-peekch char-upper case
445 [char] I of (try-lparen-ixy) endof
446 [char] C of (try-lparen-c) endof
447 [char] B of (try-lparen-bc) endof
448 [char] D of (try-lparen-de) endof
449 [char] H of (try-lparen-hl) endof
450 [char] S of (try-lparen-sp) endof
451 endcase
455 ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
456 ;; read and classify next token from the input stream
458 <public-words>
460 : next-token ( -- )
461 \ ." =====NEXT-TOKEN!" cr debug:dump-stack
462 begin
463 tib-peekch case
464 0 of (set-eos) true endof
465 bl <=of tib-skipch false endof
466 [char] ; of (set-eos) parse-skip-line true endof
467 [char] % of ['] is-bin-digit (try-pfx-num) true endof
468 [char] $ of ['] is-hex-digit (try-pfx-num) true endof
469 [char] # of ['] is-hex-digit (try-pfx-num) tok-type tk-num <> " hex number expected" ?error true endof
470 [char] & of
471 (tib-in) 1+ c@ char-upper
472 case
473 [char] H of true endof
474 [char] B of (tib-in) 2+ c@ is-bin-digit endof
475 [char] O of true endof
476 otherwise drop false
477 endcase
478 if ['] is-hex-digit (try-pfx-num) tok-type tk-num <> " hex number expected" ?error
479 else (collect-punct) endif
480 true
481 endof
482 [char] ( of ;; )
483 (collect-punct) (check-lparen)
484 true
485 endof
486 [char] @ of (collect-punct) [char] @ (other-pfx-dblch) true endof
487 [char] ' of (collect-char-string) true endof ;; '
488 [char] " of (collect-char-string) true endof ;; "
489 [char] ` of (collect-char-string) true endof ;; `
490 [char] \ of ;; possible continuation
491 (collect-punct)
492 allow-continuations if
493 parse-skip-blanks
494 tib-peekch if true
495 else refill-nocross " unexpected end of file in line continuation" ?not-error false
496 endif
497 else true
498 endif
499 endof
500 otherwise
501 is-id-char if (collect-id) (classify-id) else (collect-punct) endif
502 true
503 endcase
504 until
505 false to line-start
506 \ ." |" TOKEN XTYPE ." | -- " TIB-PEEKCH XEMIT CR
507 \ debug:dump-stack
511 : token-colon? ( -- bool ) tok-type tk-punct = tok-num [char] : = and ;
512 : token-comma? ( -- bool ) tok-type tk-punct = tok-num [char] , = and ;
513 : token-lparen? ( -- bool ) tok-type tk-punct = tok-num [char] ( = and ; ;; )
514 : token-rparen? ( -- bool ) tok-type tk-punct = tok-num [char] ) = and ;
515 : token-=? ( -- bool ) tok-type tk-punct = tok-num [char] = = and ;
517 ;; are we at end of line (real EOL, or ":")?
518 : token-colon-eol? ( -- bool ) tok-type tk-eos = token-colon? or ;
520 : token-resw? ( -- bool ) tok-type tk-resw = ;
521 : token-uo? ( uo -- bool ) tok-uo = token-resw? and ;
522 : token-uo-kind? ( uo kind -- bool ) tok-kind = swap tok-uo = and token-resw? and ;
524 : token-regA? ( -- bool ) asm-resw:uo-r8 asm-resw:kind-a token-uo-kind? ;
525 : token-regIR? ( -- bool ) asm-resw:uo-sr8 token-uo? ;
526 : token-r8? ( -- bool ) asm-resw:uo-r8 token-uo? ;
527 : token-rx8? ( -- bool ) asm-resw:uo-rx8 token-uo? ;
528 : token-r16? ( -- bool ) asm-resw:uo-r16 token-uo? ;
529 : token-af? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-af token-uo-kind? ;
530 : token-afx? ( -- bool ) asm-resw:uo-afx token-uo? ;
531 : token-bc? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-bc token-uo-kind? ;
532 : token-de? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-de token-uo-kind? ;
533 : token-hl? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-hl token-uo-kind? ;
534 : token-sp? ( -- bool ) asm-resw:uo-r16 asm-resw:kind-sp token-uo-kind? ;
535 : token-ix? ( -- bool ) asm-resw:uo-rx16 asm-resw:kind-ix token-uo-kind? ;
536 : token-iy? ( -- bool ) asm-resw:uo-rx16 asm-resw:kind-iy token-uo-kind? ;
537 : token-ixy? ( -- bool ) asm-resw:uo-rx16 token-uo? ;
538 : token-mr16? ( -- bool ) asm-resw:uo-mr16 token-uo? ;
539 : token-(hl)? ( -- bool ) asm-resw:uo-r8 asm-resw:kind-(hl) token-uo-kind? ;
540 : token-(bc)? ( -- bool ) asm-resw:uo-mr16 asm-resw:kind-bc token-uo-kind? ;
541 : token-(sp)? ( -- bool ) asm-resw:uo-mr16 asm-resw:kind-sp token-uo-kind? ;
542 : token-(ixy)? ( -- bool ) asm-resw:uo-mrx16 token-uo? ;
543 : token-(c)? ( -- bool ) asm-resw:uo-portc token-uo? ;
544 : token-r16-n? ( -- bool ) asm-resw:uo-r16 token-uo? tok-kind asm-resw:kind-sp <= and ;
545 : token-r16-x? ( -- bool ) asm-resw:uo-r16 token-uo? tok-kind asm-resw:kind-hl <= and ;
546 : token-mr16-x? ( -- bool ) asm-resw:uo-mr16 token-uo? tok-kind asm-resw:kind-hl <= and ;
547 : token-reg-c? ( -- bool ) asm-resw:uo-r8 asm-resw:kind-c token-uo-kind? ;
548 : token-equ? ( -- bool ) asm-resw:uo-equ token-uo? ;
549 : token-(c)/(bc)? ( -- bool ) token-(c)? token-(bc)? or ;
551 : token-(hl)->mr16 ( -- )
552 token-(hl)? if asm-resw:uo-mr16 to tok-uo asm-resw:kind-hl to tok-kind endif
555 ;; converts C register to condition
556 : token-c-to-cond ( -- )
557 token-reg-c? if
558 asm-resw:uo-cond to tok-uo
559 asm-resw:kind-cc to tok-kind
560 endif
563 : token-cond? ( -- bool )
564 token-reg-c? if token-c-to-cond true
565 else asm-resw:uo-cond token-uo?
566 endif
570 : (build-expector) ( -- ) \ name checker msg|
571 parse-name dup " name expected" ?not-error
572 compiler:(create-forth-header) [compile] ]
573 [compile] [compile]
574 parse-skip-blanks [char] | parse " message expected" ?not-error strliteral
575 compile ?not-error
578 : (build-expector-finish) ( -- )
579 compile next-token compile forth:(exit) [compile] [
580 compiler:smudge
583 : build-expector: ( -- ) \ name checker msg|
584 (build-expector) (build-expector-finish)
587 : build-expector-res: ( -- ) \ name checker msg|
588 (build-expector) compile tok-kind (build-expector-finish)
591 build-expector: expect-comma token-comma? comma expected|
592 build-expector: expect-lparen token-lparen? `(` expected|
593 build-expector: expect-rparen token-rparen? `)` expected|
594 build-expector: expect-regA token-regA? `A` register expected|
595 build-expector: expect-(c) token-(c)? `(C)` expected|
596 build-expector: expect-(c)/(bc) token-(c)/(bc)? `(C)` expected|
598 build-expector-res: expect-cond token-cond? condition code expected|
599 build-expector-res: expect-r8 token-r8? 8-bit register expected|
600 build-expector-res: expect-r16 token-r16? 16-bit register expected|
602 prev-defs
604 \ debug:dump-stack