1 /* Alexey's C compiler. */
/*
 * Source position, printed as "line:column" by warning() and error_exit().
 * NOTE(review): presumably the members of struct pos; the enclosing
 * declaration is not visible next to these lines -- confirm.
 */
16 unsigned int line
, column
;
/*
 * Attribute shorthands attached to the prototypes of the diagnostic helpers.
 * NOTE(review): identifiers that start with two underscores are reserved for
 * the implementation; consider renaming.
 */
20 #define __noreturn __attribute((noreturn))
/* printf-style format checking: a is the index of the format-string
 * parameter, b the index of the first variadic argument. */
21 #define __printf(a, b) __attribute__((format(printf, a, b)))
/* Empty variant of the same macro.
 * NOTE(review): presumably the fallback branch of a compiler-detection
 * conditional that is not visible here -- confirm. */
24 #define __printf(a, b)
/*
 * Print a warning for source position pos to stderr:
 * "<line>:<column>: warning: " followed by the printf-style message.
 * Declared first with the printf format attribute (format string is
 * argument 2, variadic arguments start at argument 3), then defined.
 */
27 static void warning(struct pos
*pos
, const char *fmt
, ...) __printf(2, 3);
28 static void warning(struct pos
*pos
, const char *fmt
, ...)
/* Position prefix first ... */
32 fprintf(stderr
, "%u:%u: warning: ", pos
->line
, pos
->column
);
/* ... then the caller's formatted message. */
34 vfprintf(stderr
, fmt
, args
);
/*
 * Print an error for source position pos to stderr:
 * "<line>:<column>: error: " followed by the printf-style message.
 * Declared __noreturn, with printf format checking (format string is
 * argument 2, variadic arguments start at argument 3).
 */
39 static void error_exit(struct pos
*pos
, const char *fmt
, ...) __printf(2, 3) __noreturn
;
40 static void error_exit(struct pos
*pos
, const char *fmt
, ...)
/* Position prefix first ... */
44 fprintf(stderr
, "%u:%u: error: ", pos
->line
, pos
->column
);
/* ... then the caller's formatted message. */
46 vfprintf(stderr
, fmt
, args
);
/*
 * Print "acc: " followed by the printf-style message to stderr.
 * Declared __noreturn, with printf format checking (format string is
 * argument 1, variadic arguments start at argument 2).
 */
52 static void perror_exit(const char *fmt
, ...) __printf(1, 2) __noreturn
;
53 static void perror_exit(const char *fmt
, ...)
/* errno is captured before any output is produced.
 * NOTE(review): presumably so the stdio calls below cannot clobber the value
 * that is reported afterwards -- the use of old_errno is not visible here. */
55 int old_errno
= errno
;
58 fputs("acc: ", stderr
);
60 vfprintf(stderr
, fmt
, args
);
/*
 * Print "acc: error: " followed by the printf-style message to stderr.
 * Unlike error_exit() it takes no source position.
 * Declared __noreturn, with printf format checking (format string is
 * argument 1, variadic arguments start at argument 2).
 */
68 static void _error_exit(const char *fmt
, ...) __printf(1, 2) __noreturn
;
69 static void _error_exit(const char *fmt
, ...)
73 fputs("acc: error: ", stderr
);
75 vfprintf(stderr
, fmt
, args
);
/*
 * Allocation wrapper taking the requested size in bytes.
 * NOTE(review): presumably returns the malloc() result and takes the
 * perror_exit() path only when allocation fails -- the allocation call and
 * the failure test are not visible here; confirm.
 */
81 static void *xmalloc(size_t size
)
/* Report this function's name and the requested size; perror_exit() is
 * declared __noreturn. */
87 perror_exit("%s: size %zu", __func__
, size
);
/*
 * NOTE(review): presumably returns a newly allocated copy of the n bytes at
 * src (pp_token_add() stores the result as the token's id) -- the body is
 * not visible here; confirm.
 */
91 static void *xmemdup(const void *src
, size_t n
)
/*
 * read(2) wrapper: the call is repeated for as long as it fails with
 * EAGAIN or EINTR.
 * NOTE(review): presumably returns the final read() result -- the return
 * statement is not visible here; confirm.
 */
100 static ssize_t
_xread(int fd
, void *buf
, size_t count
)
105 rv
= read(fd
, buf
, count
);
/* Only transient failures are retried; any other outcome leaves the loop. */
106 } while (rv
== -1 && (errno
== EAGAIN
|| errno
== EINTR
));
/*
 * Read from fd into buf using _xread().
 * NOTE(review): presumably loops until exactly count bytes have arrived --
 * the loop header and the count bookkeeping are not visible here; confirm.
 */
110 static void xread(int fd
, void *buf
, size_t count
)
115 rv
= _xread(fd
, buf
, count
);
/* Read failure (presumably rv == -1; the test is not visible here). */
117 perror_exit("read fd %d, buf %p, count %zu", fd
, buf
, count
);
/* Input ended early (presumably rv == 0; the test is not visible here). */
119 _error_exit("fd %d truncated, buf %p, count %zu", fd
, buf
, count
);
/* Advance the destination past the rv bytes just read. */
121 buf
= (char *)buf
+ rv
;
/*
 * Decode the _nr_c UTF-8 octets at _c into 32-bit code points.
 * The output buffer is allocated with xmalloc() and returned through *c.
 * Malformed, truncated or overlong sequences are fatal (_error_exit()).
 * NOTE(review): *nr_c presumably receives the number of decoded code points
 * -- the store is not visible here; confirm.
 * NOTE(review): no visible check rejects surrogates (U+D800..U+DFFF) or
 * values above U+10FFFF; confirm whether that is intended.
 */
126 static void convert_from_utf8(uint8_t *_c
, unsigned int _nr_c
, uint32_t **c
, unsigned int *nr_c
)
/* Refuse input whose output size (_nr_c * sizeof(uint32_t)) could exceed
 * 32 bits. */
130 if (_nr_c
>= 0xffffffff / sizeof(uint32_t))
/* NOTE(review): _nr_c is unsigned int but is printed with PRIu32;
 * line_column() uses %u for the same kind of value. */
131 _error_exit("integer overflow _nr_c %"PRIu32
, _nr_c
);
133 /* At worst all data is ASCII: one code point per input octet. */
134 *c
= xmalloc(_nr_c
* sizeof(uint32_t));
/*
 * Lead-octet table, indexed by the number of continuation octets ("level").
 * The code below matches a lead octet with (octet & mask1) == res1 and
 * rejects a decoded value below min, i.e. an overlong encoding.
 * NOTE(review): the columns appear to be { mask1, res1, min } -- the member
 * declarations are not visible here; confirm.
 */
139 static const struct {
143 { 0x80, 0x00, 0 }, /* 0xxxxxxx */
144 { 0xe0, 0xc0, 0x80 }, /* 110xxxxx 10xxxxxx */
145 { 0xf0, 0xe0, 0x800 }, /* 1110xxxx 10xxxxxx 10xxxxxx */
146 { 0xf8, 0xf0, 0x10000 }, /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
148 unsigned int level
, j
;
/* Find the table row whose lead-octet pattern matches _c[i]. */
152 while (level
< sizeof(_mask
) / sizeof(_mask
[0])) {
153 if ((_c
[i
] & _mask
[level
].mask1
) == _mask
[level
].res1
)
/* No row matched: the octet cannot start a sequence. */
157 if (level
== sizeof(_mask
) / sizeof(_mask
[0]))
158 _error_exit("invalid UTF-8 octet sequence at %u: %02"PRIx8
, i
, _c
[i
]);
/* The sequence needs "level" more octets than the input still has. */
159 if (i
+ level
>= _nr_c
)
160 _error_exit("truncated UTF-8 octet sequence at %u: %02"PRIx8
, i
, _c
[i
]);
/* Every continuation octet must look like 10xxxxxx. */
161 for (j
= 0; j
< level
; j
++) {
162 if ((_c
[i
+ j
+ 1] & 0xc0) != 0x80)
163 _error_exit("invalid UTF-8 octet sequence at %u: %02"PRIx8
" ... %02"PRIx8
, i
+ j
+ 1, _c
[i
], _c
[i
+ j
+ 1]);
/* Payload bits of the lead octet ... */
166 ch
= _c
[i
] & ~_mask
[level
].mask1
;
/* ... followed by six payload bits from each continuation octet. */
167 for (j
= 0; j
< level
; j
++)
168 ch
= (ch
<< 6) | (_c
[i
+ j
+ 1] & ~0xc0);
/* A value below the minimum for this length is an overlong encoding. */
170 if (ch
< _mask
[level
].min
)
171 _error_exit("invalid UTF-8 octet sequence at %u: %02"PRIx8
, i
, _c
[i
]);
180 /* U+2028 LINE SEPARATOR; the code point this file tests for wherever it looks for an end of line (see delete_backslash_newline() and cpp_comment_end()). */
181 #define LS ((uint32_t)0x2028)
/*
 * Newline fix-up over the code-point array c, whose length is *nr_c.
 * A CR (0x0d) immediately followed by LF (0x0a) is collapsed by moving the
 * tail of the array, starting at the LF, down over the CR.
 */
183 static void fix_newline(uint32_t *c
, unsigned int *nr_c
)
189 if (c
[i
] == 0x0d && i
+ 1 < *nr_c
&& c
[i
+ 1] == 0x0a) {
/* memmove() counts bytes, not elements: scale by the element size so the
 * whole tail is moved, as delete_backslash_newline() does. */
190 memmove(&c
[i
], &c
[i
+ 1], (*nr_c
- i
- 1) * sizeof(uint32_t));
/*
 * Build a position array with one struct pos for each of the nr_c code
 * points in c. The array is allocated with xmalloc().
 * NOTE(review): presumably returns that array; the line assignment and the
 * updates of the running line/column counters are not visible here; confirm.
 */
207 static struct pos
*line_column(const uint32_t *c
, unsigned int nr_c
)
/* Running position while walking the input. */
210 unsigned int line
, column
;
/* Refuse counts whose allocation size could exceed 32 bits. */
213 if (nr_c
>= 0xffffffff / sizeof(struct pos
))
214 _error_exit("integer overflow nr_c %u", nr_c
);
215 pos
= xmalloc(nr_c
* sizeof(struct pos
));
219 for (i
= 0; i
< nr_c
; i
++) {
/* Record the current column for code point i. */
221 pos
[i
].column
= column
;
/*
 * Scan c for trigraph sequences (two question marks followed by one of the
 * nine characters listed below) and emit a warning at the position of the
 * first question mark. The input is const, so nothing is replaced.
 */
232 static void warn_trigraph(const uint32_t *c
, unsigned int nr_c
, struct pos
*pos
)
/* Three code points are examined at a time. */
237 while (i
+ 2 < nr_c
) {
238 if (c
[i
] == '?' && c
[i
+ 1] == '?') {
/* Third characters that complete a trigraph. */
240 case '=':case ')':case '!':
241 case '(':case '\'':case '>':
242 case '/':case '<':case '-':
243 warning(&pos
[i
], "trigraph sequence ??%c, ignoring", c
[i
+ 2]);
/*
 * Line splicing: wherever a backslash is immediately followed by LS, the
 * tail after the pair is moved down over it in both c and pos, so the two
 * arrays stay index-aligned.
 * NOTE(review): *nr_c is presumably reduced by 2 per removed pair -- the
 * update is not visible here; confirm.
 */
254 static void delete_backslash_newline(uint32_t *c
, unsigned int *nr_c
, struct pos
*pos
)
/* Two code points are examined at a time. */
259 while (i
+ 1 < *nr_c
) {
260 if (c
[i
] == '\\' && c
[i
+ 1] == LS
) {
/* Number of elements that follow the two-element pair. */
261 unsigned int nr_to_move
= *nr_c
- i
- 2;
/* Element counts are scaled to bytes for memmove(). */
263 memmove(&c
[i
], &c
[i
+ 2], nr_to_move
* sizeof(uint32_t));
264 memmove(&pos
[i
], &pos
[i
+ 2], nr_to_move
* sizeof(struct pos
));
/* Link to another token; main() walks the token list through it. */
272 struct pp_token
*next
;
/* Named token types start above the unsigned char range.
 * NOTE(review): presumably so that a single-character token can use the
 * character value itself as its type, as pp_tokenize() does with
 * pp_token_create(c[i], pos) -- confirm. */
274 PP_TOKEN_IDENTIFIER
= UCHAR_MAX
+ 1,
/*
 * Multi-character punctuators are encoded by packing their characters,
 * eight bits each, with the first character in the highest position;
 * pp_token_print() recovers the spelling by shifting and masking.
 */
279 #define _2(c1, c2) ((((uint32_t)c1) << 8) | ((uint32_t)c2))
280 #define _3(c1, c2, c3) ((((uint32_t)c1) << 16)| (((uint32_t)c2) << 8) | ((uint32_t)c3))
281 PP_TOKEN_DOTDOTDOT
= _3('.', '.', '.'),
282 PP_TOKEN_DEREFERENCE
= _2('-', '>'),
283 PP_TOKEN_SUB_EQ
= _2('-', '='),
284 PP_TOKEN_DEC
= _2('-', '-'),
285 PP_TOKEN_ADD_EQ
= _2('+', '='),
286 PP_TOKEN_INC
= _2('+', '+'),
287 PP_TOKEN_AND_EQ
= _2('&', '='),
288 PP_TOKEN_AND
= _2('&', '&'),
289 PP_TOKEN_MUL_EQ
= _2('*', '='),
290 PP_TOKEN_NOT_EQ
= _2('!', '='),
291 PP_TOKEN_DIV_EQ
= _2('/', '='),
292 PP_TOKEN_REM_EQ
= _2('%', '='),
293 PP_TOKEN_LSHIFT_EQ
= _3('<', '<', '='),
294 PP_TOKEN_LSHIFT
= _2('<', '<'),
295 PP_TOKEN_LEQ
= _2('<', '='),
296 PP_TOKEN_RSHIFT_EQ
= _3('>', '>', '='),
297 PP_TOKEN_RSHIFT
= _2('>', '>'),
298 PP_TOKEN_GEQ
= _2('>', '='),
299 PP_TOKEN_EQ
= _2('=', '='),
300 PP_TOKEN_XOR_EQ
= _2('^', '='),
301 PP_TOKEN_OR_EQ
= _2('|', '='),
302 PP_TOKEN_OR
= _2('|', '|'),
303 PP_TOKEN_SHARPSHARP
= _2('#', '#'),
/* Set by pp_token_add() to a copy of the token's code points. */
307 uint32_t *id
; /* string representation, if type is not enough */
/*
 * Allocate a token with xmalloc().
 * NOTE(review): presumably stores type and a copy of *pos in the new token
 * and returns it -- the field assignments and the return are not visible
 * here; confirm.
 */
311 static struct pp_token
*pp_token_create(enum pp_token_type type
, struct pos
*pos
)
313 struct pp_token
*ppt
;
315 ppt
= xmalloc(sizeof(struct pp_token
));
/*
 * Attach the token's text: ppt->id is set to the result of xmemdup() on the
 * code points c[start] .. c[end - 1] (end is exclusive).
 * NOTE(review): whether the length is also recorded is not visible here.
 */
324 static void pp_token_add(struct pp_token
*ppt
, const uint32_t *c
, unsigned int start
, unsigned int end
)
/* Element count scaled to bytes for xmemdup(). */
326 ppt
->id
= xmemdup(&c
[start
], (end
- start
) * sizeof(uint32_t));
/*
 * Takes the head of a token list; main() calls it last, after the tokens
 * have been walked.
 * NOTE(review): presumably frees every token in the list -- the loop and the
 * free() calls are not visible here; confirm.
 */
329 static void pp_token_free(struct pp_token
*ppt_head
)
331 struct pp_token
*ppt
;
/* NOTE(review): presumably holds the successor so it can still be reached
 * after the current token is released -- confirm. */
335 struct pp_token
*next
;
/*
 * Print one token to stdout: "<line>:<column>:" and a tab, then a rendering
 * that depends on the token type.
 */
345 static void pp_token_print(struct pp_token
*ppt
)
347 printf("%u:%u:\t", ppt
->pos
.line
, ppt
->pos
.column
);
355 case PP_TOKEN_IDENTIFIER
:
356 printf("pp-identifier");
358 case PP_TOKEN_NUMBER
:
361 case PP_TOKEN_STRING
:
/* Three-character punctuators: unpack the characters from bits 16..23,
 * 8..15 and 0..7 of the type value. */
367 case PP_TOKEN_DOTDOTDOT
:
368 case PP_TOKEN_LSHIFT_EQ
:
369 case PP_TOKEN_RSHIFT_EQ
:
370 printf("%c%c%c", (ppt
->type
>> 16) & 0xff, (ppt
->type
>> 8) & 0xff, ppt
->type
& 0xff);
/* Two-character punctuators: unpack from bits 8..15 and 0..7. */
372 case PP_TOKEN_DEREFERENCE
:
373 case PP_TOKEN_SUB_EQ
:
375 case PP_TOKEN_ADD_EQ
:
377 case PP_TOKEN_AND_EQ
:
379 case PP_TOKEN_MUL_EQ
:
380 case PP_TOKEN_NOT_EQ
:
381 case PP_TOKEN_DIV_EQ
:
382 case PP_TOKEN_REM_EQ
:
383 case PP_TOKEN_LSHIFT
:
385 case PP_TOKEN_RSHIFT
:
388 case PP_TOKEN_XOR_EQ
:
391 case PP_TOKEN_SHARPSHARP
:
392 printf("%c%c", (ppt
->type
>> 8) & 0xff, ppt
->type
& 0xff);
/* The type value is printed as a character.
 * NOTE(review): presumably the branch for single-character tokens -- its
 * label is not visible here; confirm. */
395 printf("%c", ppt
->type
);
/*
 * Identifier "nondigit" test: returns 1 when c is an ASCII letter (either
 * case) or an underscore, 0 otherwise.
 */
static int pp_nondigit(const uint32_t c)
{
	if (c == '_')
		return 1;
	if (c >= 'a' && c <= 'z')
		return 1;
	return c >= 'A' && c <= 'Z';
}
/* Octal digit test: returns 1 when c is in '0'..'7', 0 otherwise. */
static int pp_octdigit(const uint32_t c)
{
	if (c < '0')
		return 0;
	return c <= '7';
}
/* Decimal digit test: returns 1 when c is in '0'..'9', 0 otherwise. */
static int pp_digit(const uint32_t c)
{
	if (c < '0')
		return 0;
	return c <= '9';
}
/*
 * Hexadecimal digit test: returns 1 when c is a decimal digit (pp_digit())
 * or a letter in 'a'..'f' / 'A'..'F', 0 otherwise.
 */
static int pp_hexdigit(const uint32_t c)
{
	if (pp_digit(c))
		return 1;
	if (c >= 'a' && c <= 'f')
		return 1;
	return c >= 'A' && c <= 'F';
}
/*
 * Scan the remainder of an identifier in c, beginning at index start.
 * Accepted continuations: a letter/underscore or digit, a backslash-u
 * followed by 4 hex digits, or a backslash-U followed by 8 hex digits.
 * NOTE(review): presumably returns the index one past the identifier
 * (callers pass it as the exclusive end to pp_token_add()) -- the loop and
 * the return are not visible here; confirm.
 */
420 /* pp-identifier: ([a-zA-Z_]|\u[0-9a-fA-F]{4}|\U[0-9a-fA-F]{8})([a-zA-Z_0-9]|\u[0-9a-fA-F]{4}|\U[0-9a-fA-F]{8})* */
421 static unsigned int _pp_identifier_end(const uint32_t *c
, unsigned int nr_c
, unsigned int start
)
425 /* First identifier-nondigit is already "parsed". */
/* Plain identifier character. */
428 if (pp_nondigit(c
[i
]) || pp_digit(c
[i
])) {
/* Short universal character name: the bound i + 5 < nr_c keeps all six
 * inspected code points inside the array. */
430 } else if (i
+ 5 < nr_c
&& c
[i
] == '\\' && c
[i
+ 1] == 'u' &&
431 pp_hexdigit(c
[i
+ 2]) && pp_hexdigit(c
[i
+ 3]) && pp_hexdigit(c
[i
+ 4]) && pp_hexdigit(c
[i
+ 5])) {
/* Long universal character name: ten code points, hence i + 9 < nr_c. */
433 } else if (i
+ 9 < nr_c
&& c
[i
] == '\\' && c
[i
+ 1] == 'U' &&
434 pp_hexdigit(c
[i
+ 2]) && pp_hexdigit(c
[i
+ 3]) && pp_hexdigit(c
[i
+ 4]) && pp_hexdigit(c
[i
+ 5]) &&
435 pp_hexdigit(c
[i
+ 6]) && pp_hexdigit(c
[i
+ 7]) && pp_hexdigit(c
[i
+ 8]) && pp_hexdigit(c
[i
+ 9])) {
/*
 * Scan a preprocessing number in c, beginning at index start.
 * NOTE(review): presumably returns the index one past the number (callers
 * pass it as the exclusive end to pp_token_add()) -- the loop and the return
 * are not visible here; confirm.
 */
443 /* pp-number: \.?[0-9]([eEpP][+-]|[0-9a-zA-Z_.]|\u[0-9a-fA-F]{4}|\U[0-9a-fA-F]{8})* */
444 static unsigned int pp_number_end(const uint32_t *c
, unsigned int nr_c
, unsigned int start
)
/* Exponent marker with a sign: both code points belong to the number.
 * This is tested before the generic branch, which would accept the letter
 * alone but never the sign. */
450 if ((c
[i
] == 'e' || c
[i
] == 'E' || c
[i
] == 'p' || c
[i
] == 'P') &&
451 i
+ 1 < nr_c
&& (c
[i
+ 1] == '+' || c
[i
+ 1] == '-')) {
/* Digit, letter, underscore or dot. */
453 } else if (pp_digit(c
[i
]) || pp_nondigit(c
[i
]) || c
[i
] == '.') {
/* Short universal character name (backslash, 'u', 4 hex digits). */
455 } else if (c
[i
] == '\\' && i
+ 5 < nr_c
&& c
[i
+ 1] == 'u' &&
456 pp_hexdigit(c
[i
+ 2]) && pp_hexdigit(c
[i
+ 3]) && pp_hexdigit(c
[i
+ 4]) && pp_hexdigit(c
[i
+ 5])) {
/* Long universal character name (backslash, 'U', 8 hex digits). */
458 } else if (c
[i
] == '\\' && i
+ 9 < nr_c
&& c
[i
+ 1] == 'U' &&
459 pp_hexdigit(c
[i
+ 2]) && pp_hexdigit(c
[i
+ 3]) && pp_hexdigit(c
[i
+ 4]) && pp_hexdigit(c
[i
+ 5]) &&
460 pp_hexdigit(c
[i
+ 6]) && pp_hexdigit(c
[i
+ 7]) && pp_hexdigit(c
[i
+ 8]) && pp_hexdigit(c
[i
+ 9])) {
/*
 * Search c for the asterisk-slash pair that closes a block comment.
 * pp_tokenize() passes the index of the opening slash as start and assigns
 * the result back to its own index.
 * NOTE(review): the exact return value and the handling of an unterminated
 * comment are not visible here; confirm.
 */
468 static unsigned int c_comment_end(const uint32_t *c
, unsigned int nr_c
, unsigned int start
)
/* Two code points are examined at a time. */
473 while (i
+ 1 < nr_c
) {
474 if (c
[i
] == '*' && c
[i
+ 1] == '/')
/*
 * Advance through a line comment until LS or the end of the input.
 * pp_tokenize() passes the index of the first slash as start and assigns the
 * result back to its own index.
 * NOTE(review): the exact return value is not visible here; confirm.
 */
481 static unsigned int cpp_comment_end(const uint32_t *c
, unsigned int nr_c
, unsigned int start
)
486 while (i
< nr_c
&& c
[i
] != LS
)
/*
 * Validate one escape sequence inside a string literal or character
 * constant. Errors are reported at the position of index start through
 * error_exit(), which is declared __noreturn.
 * Recognised forms: simple escapes, octal escapes of up to three digits,
 * hexadecimal escapes, and universal character names with 4 or 8 hex
 * digits.
 * NOTE(review): presumably returns the index one past the escape sequence
 * (pp_string_end() and pp_char_end() assign the result to their scan index)
 * -- the return statements are not visible here; confirm.
 */
491 static unsigned int escape_sequence_end(const uint32_t *c
, unsigned int nr_c
, unsigned int start
, struct pos
*_pos
)
493 struct pos
*pos
= &_pos
[start
];
498 error_exit(pos
, "incomplete escape sequence");
/* Simple escapes. */
500 case '\'':case '"':case '?':case '\\':
501 case 'a':case 'b':case 'f':case 'n':case 'r':case 't':case 'v':
/* Octal escapes: every octal digit '0'..'7' may start one. '5' was missing
 * from this list, so a valid octal escape beginning with 5 was reported as
 * "invalid escape sequence". */
503 case '0':case '1':case '2':case '3':case '4':case '5':case '6':case '7':
/* Three octal digits in total ... */
504 if (i
+ 2 < nr_c
&& pp_octdigit(c
[i
+ 1]) && pp_octdigit(c
[i
+ 2]))
/* ... or two. */
506 if (i
+ 1 < nr_c
&& pp_octdigit(c
[i
+ 1]))
/* Hexadecimal escape: consume the run of hex digits. */
511 while (i
< nr_c
&& pp_hexdigit(c
[i
]))
514 error_exit(pos
, "invalid hexadecimal escape sequence");
/* Universal character name with four hex digits. */
518 pp_hexdigit(c
[i
+ 1]) && pp_hexdigit(c
[i
+ 2]) && pp_hexdigit(c
[i
+ 3]) && pp_hexdigit(c
[i
+ 4]))
520 error_exit(pos
, "invalid universal character name");
/* Universal character name with eight hex digits. */
523 pp_hexdigit(c
[i
+ 1]) && pp_hexdigit(c
[i
+ 2]) && pp_hexdigit(c
[i
+ 3]) && pp_hexdigit(c
[i
+ 4]) &&
524 pp_hexdigit(c
[i
+ 5]) && pp_hexdigit(c
[i
+ 6]) && pp_hexdigit(c
[i
+ 7]) && pp_hexdigit(c
[i
+ 8]))
526 error_exit(pos
, "invalid universal character name");
/* Anything else after the backslash. */
528 error_exit(pos
, "invalid escape sequence");
/*
 * Scan a string literal in c whose opening double quote is at index start.
 * Errors are reported at the position of the opening quote.
 * NOTE(review): presumably returns the index one past the closing quote
 * (pp_tokenize() uses start + 1 and the result minus 1 as the bounds of the
 * literal's contents) -- the return is not visible here; confirm.
 */
532 static unsigned int pp_string_end(const uint32_t *c
, unsigned int nr_c
, unsigned int start
, struct pos
*_pos
)
534 struct pos
*pos
= &_pos
[start
];
537 /* Opening " is already "parsed". */
/* Walk until the closing quote or the end of the input. */
539 while (i
< nr_c
&& c
[i
] != '"') {
/* Escape sequences are validated and skipped as a unit (presumably when
 * c[i] is a backslash; the test is not visible here). */
544 i
= escape_sequence_end(c
, nr_c
, i
, _pos
);
555 error_exit(pos
, "incomplete string literal");
/*
 * Scan a character constant in c whose opening single quote is at index
 * start. Errors are reported at the position of the opening quote.
 * NOTE(review): presumably returns the index one past the closing quote
 * (pp_tokenize() uses start + 1 and the result minus 1 as the bounds of the
 * constant's contents) -- the return is not visible here; confirm.
 */
558 static unsigned int pp_char_end(const uint32_t *c
, unsigned int nr_c
, unsigned int start
, struct pos
*_pos
)
560 struct pos
*pos
= &_pos
[start
];
563 /* Opening ' is already "parsed". */
/* Walk until the closing quote or the end of the input. */
565 while (i
< nr_c
&& c
[i
] != '\'') {
/* Escape sequences are validated and skipped as a unit (presumably when
 * c[i] is a backslash; the test is not visible here). */
570 i
= escape_sequence_end(c
, nr_c
, i
, _pos
);
583 error_exit(pos
, "incomplete character constant");
585 error_exit(pos
, "empty character constant");
/*
 * Split the code-point array c (nr_c elements, with the parallel position
 * array _pos) into a linked list of preprocessing tokens.
 * NOTE(review): presumably returns the list head; the main loop, the switch
 * on c[i] and most case labels are not visible here, so the per-branch
 * notes below are inferred from the visible conditions -- confirm.
 */
588 static struct pp_token
*pp_tokenize(const uint32_t *c
, unsigned int nr_c
, struct pos
*_pos
)
/* Head and tail of the list being built. */
590 struct pp_token
*ppt_head
, *ppt_tail
;
/* Position of the code point at which the current token starts. */
596 struct pos
*pos
= &_pos
[i
];
597 struct pp_token
*ppt
;
/* Token whose type is the space character. */
604 ppt
= pp_token_create(' ', pos
);
/* Token whose type is LS. */
608 ppt
= pp_token_create(LS
, pos
);
/* Single-character token: the code point itself is the token type. */
620 ppt
= pp_token_create(c
[i
], pos
);
/* A letter starts an identifier (an underscore presumably does too; its
 * label is not visible here). */
623 case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':case 'g':
624 case 'h':case 'i':case 'j':case 'k':case 'l':case 'm':case 'n':
625 case 'o':case 'p':case 'q':case 'r':case 's':case 't':case 'u':
626 case 'v':case 'w':case 'x':case 'y':case 'z':
627 case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':case 'G':
628 case 'H':case 'I':case 'J':case 'K':case 'L':case 'M':case 'N':
629 case 'O':case 'P':case 'Q':case 'R':case 'S':case 'T':case 'U':
630 case 'V':case 'W':case 'X':case 'Y':case 'Z':
632 ppt
= pp_token_create(PP_TOKEN_IDENTIFIER
, pos
);
/* The first character is already accepted; scan from the next one. */
633 j
= _pp_identifier_end(c
, nr_c
, i
+ 1);
634 pp_token_add(ppt
, c
, i
, j
);
/* Identifier that starts with a short universal character name
 * ('u' plus 4 hex digits after c[i], presumably a backslash). */
638 if (i
+ 5 < nr_c
&& c
[i
+ 1] == 'u' &&
639 pp_hexdigit(c
[i
+ 2]) && pp_hexdigit(c
[i
+ 3]) && pp_hexdigit(c
[i
+ 4]) && pp_hexdigit(c
[i
+ 5])) {
640 ppt
= pp_token_create(PP_TOKEN_IDENTIFIER
, pos
);
/* Continue after the 2 + 4 code points of the name. */
641 j
= _pp_identifier_end(c
, nr_c
, i
+ 2 + 4);
642 pp_token_add(ppt
, c
, i
, j
);
/* Identifier that starts with a long universal character name
 * ('U' plus 8 hex digits). */
644 } else if (i
+ 9 < nr_c
&& c
[i
+ 1] == 'U' &&
645 pp_hexdigit(c
[i
+ 2]) && pp_hexdigit(c
[i
+ 3]) && pp_hexdigit(c
[i
+ 4]) && pp_hexdigit(c
[i
+ 5]) &&
646 pp_hexdigit(c
[i
+ 6]) && pp_hexdigit(c
[i
+ 7]) && pp_hexdigit(c
[i
+ 8]) && pp_hexdigit(c
[i
+ 9])) {
647 ppt
= pp_token_create(PP_TOKEN_IDENTIFIER
, pos
);
/* Continue after the 2 + 8 code points of the name. */
648 j
= _pp_identifier_end(c
, nr_c
, i
+ 2 + 4 + 4);
649 pp_token_add(ppt
, c
, i
, j
);
652 error_exit(pos
, "unknown character %08"PRIx32
, c
[i
]);
/* A digit starts a preprocessing number. */
654 case '0':case '1':case '2':case '3':case '4':
655 case '5':case '6':case '7':case '8':case '9':
656 ppt
= pp_token_create(PP_TOKEN_NUMBER
, pos
);
657 j
= pp_number_end(c
, nr_c
, i
);
658 pp_token_add(ppt
, c
, i
, j
);
/* Two dots after c[i] (presumably itself a dot): the ellipsis token. */
662 if (i
+ 2 < nr_c
&& c
[i
+ 1] == '.' && c
[i
+ 2] == '.') {
663 ppt
= pp_token_create(PP_TOKEN_DOTDOTDOT
, pos
);
/* A digit after c[i]: a number with a leading dot. */
665 } else if (i
+ 1 < nr_c
&& pp_digit(c
[i
+ 1])) {
666 ppt
= pp_token_create(PP_TOKEN_NUMBER
, pos
);
667 j
= pp_number_end(c
, nr_c
, i
+ 1);
668 pp_token_add(ppt
, c
, i
, j
);
671 goto pp_token_simple
;
/* String literal: the stored text runs from i + 1 to j - 1 (exclusive),
 * i.e. presumably without the surrounding quotes. */
674 ppt
= pp_token_create(PP_TOKEN_STRING
, pos
);
675 j
= pp_string_end(c
, nr_c
, i
, _pos
);
676 pp_token_add(ppt
, c
, i
+ 1, j
- 1);
/* Character constant: stored the same way as a string literal. */
680 ppt
= pp_token_create(PP_TOKEN_CHAR
, pos
);
681 j
= pp_char_end(c
, nr_c
, i
, _pos
);
682 pp_token_add(ppt
, c
, i
+ 1, j
- 1);
/* c[i] followed by an asterisk: a block comment, which becomes a single
 * space token. */
686 if (i
+ 1 < nr_c
&& c
[i
+ 1] == '*') {
687 ppt
= pp_token_create(' ', pos
);
688 i
= c_comment_end(c
, nr_c
, i
);
/* c[i] followed by a slash: a line comment; warned about, then treated as
 * a single space token. */
689 } else if (i
+ 1 < nr_c
&& c
[i
+ 1] == '/') {
690 warning(pos
, "C++ comment");
691 ppt
= pp_token_create(' ', pos
);
692 i
= cpp_comment_end(c
, nr_c
, i
);
/* c[i] followed by '=': a two-character token packed as
 * (first << 8) | second, matching the _2() encoding of the token types. */
693 } else if (i
+ 1 < nr_c
&& c
[i
+ 1] == '=') {
694 ppt
= pp_token_create((c
[i
] << 8) | c
[i
+ 1], pos
);
697 goto pp_token_simple
;
/* c[i] followed by '>', '=' or '-' (presumably the '-' case: arrow,
 * subtract-assign and decrement). */
700 if (i
+ 1 < nr_c
&& (c
[i
+ 1] == '>' || c
[i
+ 1] == '=' || c
[i
+ 1] == '-')) {
701 ppt
= pp_token_create((c
[i
] << 8) | c
[i
+ 1], pos
);
704 goto pp_token_simple
;
/* c[i] followed by '=' or doubled. */
709 if (i
+ 1 < nr_c
&& (c
[i
+ 1] == '=' || c
[i
+ 1] == c
[i
])) {
710 ppt
= pp_token_create((c
[i
] << 8) | c
[i
+ 1], pos
);
713 goto pp_token_simple
;
/* c[i] followed by '=' only. */
720 if (i
+ 1 < nr_c
&& c
[i
+ 1] == '=') {
721 ppt
= pp_token_create((c
[i
] << 8) | c
[i
+ 1], pos
);
724 goto pp_token_simple
;
/* c[i] doubled and then '=': a three-character token, packed like _3(). */
728 if (i
+ 2 < nr_c
&& c
[i
+ 1] == c
[i
] && c
[i
+ 2] == '=') {
729 ppt
= pp_token_create((c
[i
] << 16) | (c
[i
+ 1] << 8) | c
[i
+ 2], pos
);
/* Otherwise doubled, or followed by '=': a two-character token. */
731 } else if (i
+ 1 < nr_c
&& (c
[i
+ 1] == c
[i
] || c
[i
+ 1] == '=')) {
732 ppt
= pp_token_create((c
[i
] << 8) | c
[i
+ 1], pos
);
735 goto pp_token_simple
;
/* c[i] followed by '#': a two-character token. */
738 if (i
+ 1 < nr_c
&& c
[i
+ 1] == '#') {
739 ppt
= pp_token_create((c
[i
] << 8) | c
[i
+ 1], pos
);
742 goto pp_token_simple
;
745 error_exit(pos
, "unknown character %08"PRIx32
, c
[i
]);
/* Append the new token to the end of the list. */
751 ppt_tail
->next
= ppt
;
/*
 * Entry point: read the file named by argv[1], decode it from UTF-8, run
 * the early translation steps and tokenize the result.
 * NOTE(review): argv[1] is used directly; a check of argc is not visible
 * here -- confirm one exists.
 */
757 int main(int argc
, char *argv
[])
761 unsigned int st_size
;
768 struct pp_token
*ppt_head
;
773 fd
= open(argv
[1], O_RDONLY
);
775 perror_exit("open %s", argv
[1]);
776 if (fstat(fd
, &st
) == -1)
777 perror_exit("fstat %s", argv
[1]);
779 _error_exit("%s: negative st_size %"PRIdMAX
, argv
[1], (intmax_t)st
.st_size
);
/* Narrow the file size to unsigned int; if converting back gives a
 * different value, the file is too large to handle. */
780 st_size
= (unsigned int)(uintmax_t)(intmax_t)st
.st_size
;
781 if ((uintmax_t)(intmax_t)st
.st_size
!= (uintmax_t)st_size
)
782 _error_exit("%s: too big st_size %"PRIdMAX
, argv
[1], (intmax_t)st
.st_size
);
/* Read the whole file into one buffer. */
784 buf
= xmalloc(st_size
);
785 xread(fd
, buf
, st_size
);
790 /* Skip UTF-8 "BOM" if any. */
791 if (st_size
>= 3 && _c
[0] == 0xef && _c
[1] == 0xbb && _c
[2] == 0xbf) {
/* Octets to code points. */
795 convert_from_utf8(_c
, _nr_c
, &c
, &nr_c
);
/* Newline fix-up, then positions are computed on the result. */
798 fix_newline(c
, &nr_c
);
799 pos
= line_column(c
, nr_c
);
800 warn_trigraph(c
, nr_c
, pos
);
/* Line splicing keeps c and pos aligned. */
801 delete_backslash_newline(c
, &nr_c
, pos
);
803 ppt_head
= pp_tokenize(c
, nr_c
, pos
);
808 struct pp_token
*ppt
;
/* Walk the token list (presumably printing each token; the loop body is not
 * visible here). */
810 for (ppt
= ppt_head
; ppt
; ppt
= ppt
->next
)
814 pp_token_free(ppt_head
);