Use `errno_t` in all uspace and kernel code.
[helenos.git] / uspace / app / sbi / src / lex.c
blobe668ae24d94e5686958345470d4f38fe2145bb93
1 /*
2 * Copyright (c) 2011 Jiri Svoboda
3 * All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * - Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * - The name of the author may not be used to endorse or promote products
15 * derived from this software without specific prior written permission.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 /** @file Lexer (lexical analyzer).
31 * Consumes a text file and produces a sequence of lexical elements (lems).
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include "bigint.h"
37 #include "cspan.h"
38 #include "mytypes.h"
39 #include "input.h"
40 #include "os/os.h"
41 #include "strtab.h"
43 #include "lex.h"
45 #define TAB_WIDTH 8
47 typedef enum {
48 cs_chr,
49 cs_str
50 } chr_str_t;
52 static void lex_touch(lex_t *lex);
53 static bool_t lex_read_try(lex_t *lex);
55 static void lex_skip_comment(lex_t *lex);
56 static void lex_skip_ws(lex_t *lex);
57 static bool_t is_wstart(char c);
58 static bool_t is_wcont(char c);
59 static bool_t is_digit(char c);
60 static void lex_word(lex_t *lex);
61 static void lex_char(lex_t *lex);
62 static void lex_number(lex_t *lex);
63 static void lex_string(lex_t *lex);
64 static void lex_char_string_core(lex_t *lex, chr_str_t cs);
65 static int digit_value(char c);
67 /* Note: This imposes an implementation limit on identifier length. */
68 #define IBUF_SIZE 128
69 static char ident_buf[IBUF_SIZE + 1];
71 /* XXX This imposes an implementation limit on string literal length. */
72 #define SLBUF_SIZE 128
73 static char strlit_buf[SLBUF_SIZE + 1];
75 /** Lclass-string pair */
76 struct lc_name {
77 lclass_t lclass;
78 const char *name;
81 /** Keyword names. Used both for printing and recognition. */
82 static struct lc_name keywords[] = {
83 { lc_and, "and" },
84 { lc_as, "as" },
85 { lc_bool, "bool" },
86 { lc_break, "break" },
87 { lc_builtin, "builtin" },
88 { lc_char, "char" },
89 { lc_class, "class" },
90 { lc_deleg, "deleg" },
91 { lc_do, "do" },
92 { lc_elif, "elif" },
93 { lc_else, "else" },
94 { lc_end, "end" },
95 { lc_enum, "enum" },
96 { lc_except, "except" },
97 { lc_false, "false" },
98 { lc_finally, "finally" },
99 { lc_for, "for" },
100 { lc_fun, "fun" },
101 { lc_get, "get" },
102 { lc_if, "if" },
103 { lc_in, "in" },
104 { lc_int, "int" },
105 { lc_interface, "interface" },
106 { lc_is, "is" },
107 { lc_new, "new" },
108 { lc_not, "not" },
109 { lc_nil, "nil" },
110 { lc_or, "or" },
111 { lc_override, "override" },
112 { lc_packed, "packed" },
113 { lc_private, "private" },
114 { lc_prop, "prop" },
115 { lc_protected, "protected" },
116 { lc_public, "public" },
117 { lc_raise, "raise" },
118 { lc_resource, "resource" },
119 { lc_return, "return" },
120 { lc_self, "self" },
121 { lc_set, "set" },
122 { lc_static, "static" },
123 { lc_string, "string" },
124 { lc_struct, "struct" },
125 { lc_switch, "switch" },
126 { lc_then, "then" },
127 { lc_this, "this" },
128 { lc_true, "true" },
129 { lc_var, "var" },
130 { lc_with, "with" },
131 { lc_when, "when" },
132 { lc_while, "while" },
133 { lc_yield, "yield" },
135 { 0, NULL }
138 /** Other simple lclasses. Only used for printing. */
139 static struct lc_name simple_lc[] = {
140 { lc_invalid, "INVALID" },
141 { lc_eof, "EOF" },
143 /* Operators */
144 { lc_period, "." },
145 { lc_slash, "/" },
146 { lc_lparen, "(" },
147 { lc_rparen, ")" },
148 { lc_lsbr, "[" },
149 { lc_rsbr, "]" },
150 { lc_equal, "==" },
151 { lc_notequal, "!=" },
152 { lc_lt, "<" },
153 { lc_gt, ">" },
154 { lc_lt_equal, "<=" },
155 { lc_gt_equal, ">=" },
156 { lc_assign, "=" },
157 { lc_plus, "+" },
158 { lc_minus, "-" },
159 { lc_mult, "*" },
160 { lc_increase, "+=" },
162 /* Punctuators */
163 { lc_comma, "," },
164 { lc_colon, ":" },
165 { lc_scolon, ";" },
167 { 0, NULL },
170 /** Print lclass value.
172 * Prints lclass (lexical element class) value in human-readable form
173 * (for debugging).
175 * @param lclass Lclass value for display.
177 void lclass_print(lclass_t lclass)
179 struct lc_name *dp;
181 dp = keywords;
182 while (dp->name != NULL) {
183 if (dp->lclass == lclass) {
184 printf("%s", dp->name);
185 return;
187 ++dp;
190 dp = simple_lc;
191 while (dp->name != NULL) {
192 if (dp->lclass == lclass) {
193 printf("%s", dp->name);
194 return;
196 ++dp;
199 switch (lclass) {
200 case lc_ident:
201 printf("ident");
202 break;
203 case lc_lit_int:
204 printf("int_literal");
205 break;
206 case lc_lit_string:
207 printf("string_literal");
208 break;
209 default:
210 printf("<unknown?>");
211 break;
215 /** Print lexical element.
217 * Prints lexical element in human-readable form (for debugging).
219 * @param lem Lexical element for display.
221 void lem_print(lem_t *lem)
223 lclass_print(lem->lclass);
225 switch (lem->lclass) {
226 case lc_ident:
227 printf("('%s')", strtab_get_str(lem->u.ident.sid));
228 break;
229 case lc_lit_int:
230 printf("(");
231 bigint_print(&lem->u.lit_int.value);
232 printf(")");
233 break;
234 case lc_lit_string:
235 printf("(\"%s\")", lem->u.lit_string.value);
236 default:
237 break;
241 /** Print lem coordinates.
243 * Print the coordinates (line number, column number) of a lexical element.
245 * @param lem Lexical element for coordinate printing.
247 void lem_print_coords(lem_t *lem)
249 cspan_print(lem->cspan);
252 /** Initialize lexer instance.
254 * @param lex Lexer object to initialize.
255 * @param input Input to associate with lexer.
257 void lex_init(lex_t *lex, struct input *input)
259 errno_t rc;
261 lex->input = input;
263 rc = input_get_line(lex->input, &lex->inbuf);
264 if (rc != EOK) {
265 printf("Error reading input.\n");
266 exit(1);
269 lex->ibp = lex->inbuf;
270 lex->col_adj = 0;
271 lex->prev_valid = b_false;
272 lex->current_valid = b_true;
275 /** Advance to next lexical element.
277 * The new element is read in lazily then it is actually accessed.
279 * @param lex Lexer object.
281 void lex_next(lex_t *lex)
283 /* Make sure the current lem has already been read in. */
284 lex_touch(lex);
286 /* Force a new lem to be read on next access. */
287 lex->current_valid = b_false;
290 /** Get current lem.
292 * The returned pointer is invalidated by next call to lex_next()
294 * @param lex Lexer object.
295 * @return Pointer to current lem. Owned by @a lex and only valid
296 * until next call to lex_xxx().
298 lem_t *lex_get_current(lex_t *lex)
300 lex_touch(lex);
301 return &lex->current;
304 /** Get previous lem if valid.
306 * The returned pointer is invalidated by next call to lex_next()
308 * @param lex Lexer object.
309 * @return Pointer to previous lem. Owned by @a lex and only valid
310 * until next call to lex_xxx().
312 lem_t *lex_peek_prev(lex_t *lex)
314 if (lex->current_valid == b_false) {
316 * This means the head is advanced but next lem was not read.
317 * Thus the previous lem is still in @a current.
319 return &lex->current;
322 if (lex->prev_valid != b_true) {
323 /* Looks like we are still at the first lem. */
324 return NULL;
328 * Current lem has been read in. Thus the previous lem was moved to
329 * @a previous.
331 return &lex->prev;
334 /** Read in the current lexical element (unless already read in).
336 * @param lex Lexer object.
338 static void lex_touch(lex_t *lex)
340 bool_t got_lem;
342 if (lex->current_valid == b_true)
343 return;
345 /* Copy previous lem */
346 lex->prev = lex->current;
347 lex->prev_valid = b_true;
349 do {
350 got_lem = lex_read_try(lex);
351 } while (got_lem == b_false);
353 lex->current_valid = b_true;
356 /** Try reading next lexical element.
358 * Attemps to read the next lexical element. In some cases (such as a comment)
359 * this function will need to give it another try and returns @c b_false
360 * in such case.
362 * @param lex Lexer object.
363 * @return @c b_true on success or @c b_false if it needs
364 * restarting. On success the lem is stored to
365 * the current lem in @a lex.
367 static bool_t lex_read_try(lex_t *lex)
369 char *bp, *lsp;
370 int line0, col0;
372 lex_skip_ws(lex);
375 * Record lem coordinates. Line number we already have. For column
376 * number we start with position in the input buffer. This works
377 * for all characters except tab. Thus we keep track of tabs
378 * separately using col_adj.
380 line0 = input_get_line_no(lex->input);
381 col0 = 1 + lex->col_adj + (lex->ibp - lex->inbuf);
383 lex->current.cspan = cspan_new(lex->input, line0, col0, line0, col0);
385 lsp = lex->ibp;
386 bp = lex->ibp;
388 if (bp[0] == '\0') {
389 /* End of input */
390 lex->current.lclass = lc_eof;
391 goto finish;
394 if (is_wstart(bp[0])) {
395 lex_word(lex);
396 goto finish;
399 if (bp[0] == '\'') {
400 lex_char(lex);
401 goto finish;
404 if (is_digit(bp[0])) {
405 lex_number(lex);
406 goto finish;
409 if (bp[0] == '"') {
410 lex_string(lex);
411 goto finish;
414 if (bp[0] == '-' && bp[1] == '-') {
415 lex_skip_comment(lex);
417 /* Compute ending column number */
418 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
420 /* Try again */
421 return b_false;
424 switch (bp[0]) {
425 case ',': lex->current.lclass = lc_comma; ++bp; break;
426 case ':': lex->current.lclass = lc_colon; ++bp; break;
427 case ';': lex->current.lclass = lc_scolon; ++bp; break;
429 case '.': lex->current.lclass = lc_period; ++bp; break;
430 case '/': lex->current.lclass = lc_slash; ++bp; break;
431 case '(': lex->current.lclass = lc_lparen; ++bp; break;
432 case ')': lex->current.lclass = lc_rparen; ++bp; break;
433 case '[': lex->current.lclass = lc_lsbr; ++bp; break;
434 case ']': lex->current.lclass = lc_rsbr; ++bp; break;
436 case '=':
437 if (bp[1] == '=') {
438 lex->current.lclass = lc_equal; bp += 2; break;
440 lex->current.lclass = lc_assign; ++bp; break;
442 case '!':
443 if (bp[1] == '=') {
444 lex->current.lclass = lc_notequal; bp += 2; break;
446 goto invalid;
448 case '+':
449 if (bp[1] == '=') {
450 lex->current.lclass = lc_increase; bp += 2; break;
452 lex->current.lclass = lc_plus; ++bp; break;
454 case '-':
455 lex->current.lclass = lc_minus; ++bp; break;
457 case '*':
458 lex->current.lclass = lc_mult; ++bp; break;
460 case '<':
461 if (bp[1] == '=') {
462 lex->current.lclass = lc_lt_equal; bp += 2; break;
464 lex->current.lclass = lc_lt; ++bp; break;
466 case '>':
467 if (bp[1] == '=') {
468 lex->current.lclass = lc_gt_equal; bp += 2; break;
470 lex->current.lclass = lc_gt; ++bp; break;
472 default:
473 goto invalid;
476 lex->ibp = bp;
478 finish:
479 /* Compute ending column number */
480 lex->current.cspan->col1 = col0 + (lex->ibp - lsp) - 1;
481 return b_true;
483 invalid:
484 lex->current.lclass = lc_invalid;
485 ++bp;
486 lex->ibp = bp;
488 return b_true;
491 /** Lex a word (identifier or keyword).
493 * Read in a word. This may later turn out to be a keyword or a regular
494 * identifier. It is stored in the current lem in @a lex.
496 * @param lex Lexer object.
498 static void lex_word(lex_t *lex)
500 struct lc_name *dp;
501 char *bp;
502 int idx;
504 bp = lex->ibp;
505 ident_buf[0] = bp[0];
506 idx = 1;
508 while (is_wcont(bp[idx])) {
509 if (idx >= IBUF_SIZE) {
510 printf("Error: Identifier too long.\n");
511 exit(1);
514 ident_buf[idx] = bp[idx];
515 ++idx;
518 lex->ibp = bp + idx;
520 ident_buf[idx] = '\0';
522 dp = keywords;
523 while (dp->name != NULL) {
524 if (os_str_cmp(ident_buf, dp->name) == 0) {
525 /* Match */
526 lex->current.lclass = dp->lclass;
527 return;
529 ++dp;
532 /* No matching keyword -- it must be an identifier. */
533 lex->current.lclass = lc_ident;
534 lex->current.u.ident.sid = strtab_get_sid(ident_buf);
537 /** Lex a character literal.
539 * Reads in a character literal and stores it in the current lem in @a lex.
541 * @param lex Lexer object.
543 static void lex_char(lex_t *lex)
545 size_t len;
546 int char_val;
548 lex_char_string_core(lex, cs_chr);
550 len = os_str_length(strlit_buf);
551 if (len != 1) {
552 printf("Character literal should contain one character, "
553 "but contains %u characters instead.\n", (unsigned) len);
554 exit(1);
557 os_str_get_char(strlit_buf, 0, &char_val);
558 lex->current.lclass = lc_lit_char;
559 bigint_init(&lex->current.u.lit_char.value, char_val);
562 /** Lex a numeric literal.
564 * Reads in a numeric literal and stores it in the current lem in @a lex.
566 * @param lex Lexer object.
568 static void lex_number(lex_t *lex)
570 char *bp;
571 bigint_t value;
572 bigint_t dgval;
573 bigint_t base;
574 bigint_t tprod;
576 bp = lex->ibp;
578 bigint_init(&value, 0);
579 bigint_init(&base, 10);
581 while (is_digit(*bp)) {
582 bigint_mul(&value, &base, &tprod);
583 bigint_init(&dgval, digit_value(*bp));
585 bigint_destroy(&value);
586 bigint_add(&tprod, &dgval, &value);
587 bigint_destroy(&tprod);
588 bigint_destroy(&dgval);
590 ++bp;
593 bigint_destroy(&base);
595 lex->ibp = bp;
597 lex->current.lclass = lc_lit_int;
598 bigint_shallow_copy(&value, &lex->current.u.lit_int.value);
601 /** Lex a string literal.
603 * Reads in a string literal and stores it in the current lem in @a lex.
605 * @param lex Lexer object.
607 static void lex_string(lex_t *lex)
609 lex_char_string_core(lex, cs_str);
611 lex->current.lclass = lc_lit_string;
612 lex->current.u.lit_string.value = os_str_dup(strlit_buf);
615 static void lex_char_string_core(lex_t *lex, chr_str_t cs)
617 char *bp;
618 int sidx, didx;
619 char term;
620 const char *descr, *cap_descr;
621 char spchar;
623 /* Make compiler happy */
624 term = '\0';
625 descr = NULL;
626 cap_descr = NULL;
628 switch (cs) {
629 case cs_chr:
630 term = '\'';
631 descr = "character";
632 cap_descr = "Character";
633 break;
634 case cs_str:
635 term = '"';
636 descr = "string";
637 cap_descr = "String";
638 break;
641 bp = lex->ibp + 1;
642 sidx = didx = 0;
644 while (bp[sidx] != term) {
645 if (didx >= SLBUF_SIZE) {
646 printf("Error: %s literal too long.\n", cap_descr);
647 exit(1);
650 if (bp[sidx] == '\0') {
651 printf("Error: Unterminated %s literal.\n", descr);
652 exit(1);
655 if (bp[sidx] == '\\') {
656 switch (bp[sidx + 1]) {
657 case '\\':
658 spchar = '\\';
659 break;
660 case '\'':
661 spchar = '\'';
662 break;
663 case '"':
664 spchar = '"';
665 break;
666 case 'n':
667 spchar = '\n';
668 break;
669 case 't':
670 spchar = '\t';
671 break;
672 default:
673 printf("Error: Unknown character escape sequence.\n");
674 exit(1);
677 strlit_buf[didx] = spchar;
678 ++didx;
679 sidx += 2;
680 } else {
681 strlit_buf[didx] = bp[sidx];
682 ++sidx; ++didx;
686 lex->ibp = bp + sidx + 1;
688 strlit_buf[didx] = '\0';
691 /** Lex a single-line comment.
693 * This does not produce any lem. The comment is just skipped.
695 * @param lex Lexer object.
697 static void lex_skip_comment(lex_t *lex)
699 char *bp;
701 bp = lex->ibp + 2;
703 while (*bp != '\n' && *bp != '\0') {
704 ++bp;
707 lex->ibp = bp;
710 /** Skip whitespace characters.
712 * This does not produce any lem. The whitespace is just skipped.
714 * @param lex Lexer object.
716 static void lex_skip_ws(lex_t *lex)
718 char *bp;
719 errno_t rc;
721 bp = lex->ibp;
723 while (b_true) {
724 while (*bp == ' ' || *bp == '\t') {
725 if (*bp == '\t') {
726 /* XXX This is too simplifed. */
727 lex->col_adj += (TAB_WIDTH - 1);
729 ++bp;
732 if (*bp != '\n')
733 break;
735 /* Read next line */
736 rc = input_get_line(lex->input, &lex->inbuf);
737 if (rc != EOK) {
738 printf("Error reading input.\n");
739 exit(1);
742 bp = lex->inbuf;
743 lex->col_adj = 0;
746 lex->ibp = bp;
749 /** Determine if character can start a word.
751 * @param c Character.
752 * @return @c b_true if @a c can start a word, @c b_false otherwise.
754 static bool_t is_wstart(char c)
756 return ((c >= 'a') && (c <= 'z')) || ((c >= 'A') && (c <= 'Z')) ||
757 (c == '_');
760 /** Determine if character can continue a word.
762 * @param c Character.
763 * @return @c b_true if @a c can start continue word, @c b_false
764 * otherwise.
766 static bool_t is_wcont(char c)
768 return is_digit(c) || is_wstart(c);
771 /** Determine if character is a numeric digit.
773 * @param c Character.
774 * @return @c b_true if @a c is a numeric digit, @c b_false otherwise.
776 static bool_t is_digit(char c)
778 return ((c >= '0') && (c <= '9'));
781 /** Determine numeric value of digit character.
783 * @param c Character, must be a valid decimal digit.
784 * @return Value of the digit (0-9).
786 static int digit_value(char c)
788 return (c - '0');