acc: saner error reporting
[acc.git] / main.c
blobf62696c0d94a63e75fc0e5e05ac9da7a5555c8f1
1 /* Alexey's C compiler. */
2 #include <stdlib.h>
3 #include <sys/types.h>
4 #include <sys/stat.h>
5 #include <fcntl.h>
6 #include <stdio.h>
7 #include <errno.h>
8 #include <unistd.h>
9 #include <stdarg.h>
10 #include <stdint.h>
11 #include <inttypes.h>
12 #include <string.h>
13 #include <limits.h>
/* Position of one character in the source text. */
struct pos {
	unsigned int line;	/* line number, counted from 1 by line_column() */
	unsigned int column;	/* column number, counted from 1 by line_column() */
};
/*
 * Function attribute shorthands.
 *
 * With a GNU-compatible compiler they expand to the matching attribute,
 * otherwise they expand to nothing.
 */
#if defined(__GNUC__)
#define __noreturn	__attribute__((noreturn))
#define __printf(a, b)	__attribute__((format(printf, a, b)))
#else
#define __noreturn
#define __printf(a, b)
#endif
27 static void warning(struct pos *pos, const char *fmt, ...) __printf(2, 3);
28 static void warning(struct pos *pos, const char *fmt, ...)
30 va_list args;
32 fprintf(stderr, "%u:%u: warning: ", pos->line, pos->column);
33 va_start(args, fmt);
34 vfprintf(stderr, fmt, args);
35 va_end(args);
36 fputc('\n', stderr);
39 static void error_exit(struct pos *pos, const char *fmt, ...) __printf(2, 3) __noreturn;
40 static void error_exit(struct pos *pos, const char *fmt, ...)
42 va_list args;
44 fprintf(stderr, "%u:%u: error: ", pos->line, pos->column);
45 va_start(args, fmt);
46 vfprintf(stderr, fmt, args);
47 va_end(args);
48 fputc('\n', stderr);
49 exit(EXIT_FAILURE);
52 static void perror_exit(const char *fmt, ...) __printf(1, 2) __noreturn;
53 static void perror_exit(const char *fmt, ...)
55 int old_errno = errno;
56 va_list args;
58 fputs("acc: ", stderr);
59 va_start(args, fmt);
60 vfprintf(stderr, fmt, args);
61 va_end(args);
62 fputs(": ", stderr);
63 errno = old_errno;
64 perror(NULL);
65 exit(EXIT_FAILURE);
68 static void _error_exit(const char *fmt, ...) __printf(1, 2) __noreturn;
69 static void _error_exit(const char *fmt, ...)
71 va_list args;
73 fputs("acc: error: ", stderr);
74 va_start(args, fmt);
75 vfprintf(stderr, fmt, args);
76 va_end(args);
77 fputc('\n', stderr);
78 exit(EXIT_FAILURE);
/*
 * malloc() wrapper that never returns NULL.
 *
 * size: number of bytes requested
 *
 * Returns the allocated memory. If the allocation fails, the error is
 * reported through perror_exit() and the process terminates.
 *
 * A request for zero bytes is rounded up to one byte: malloc(0) is allowed
 * to return NULL without there being any error, which would otherwise be
 * reported as an allocation failure (e.g. for an empty input file).
 */
static void *xmalloc(size_t size)
{
	void *p;

	p = malloc(size ? size : 1);
	if (!p)
		perror_exit("%s: size %zu", __func__, size);
	return p;
}
/*
 * Duplicate a memory area.
 *
 * src: start of the area to copy
 * n:   number of bytes to copy
 *
 * Returns a copy of the n bytes at src in memory obtained from xmalloc().
 */
static void *xmemdup(const void *src, size_t n)
{
	void *copy = xmalloc(n);

	return memcpy(copy, src, n);
}
/*
 * read() that is restarted for as long as it fails with EAGAIN or EINTR.
 *
 * Returns what the last read() call returned: the number of bytes read,
 * 0 at end of file, or -1 with errno set to some other error.
 */
static ssize_t _xread(int fd, void *buf, size_t count)
{
	for (;;) {
		ssize_t got = read(fd, buf, count);

		if (got != -1)
			return got;
		if (errno != EAGAIN && errno != EINTR)
			return got;
	}
}
/*
 * Read exactly count bytes from fd into buf.
 *
 * A read error is reported through perror_exit(); reaching end of file
 * before count bytes have arrived is reported through _error_exit().
 * Both terminate the process.
 */
static void xread(int fd, void *buf, size_t count)
{
	char *cursor = buf;	/* where the next chunk is stored */
	size_t left = count;	/* bytes still missing */

	while (left > 0) {
		ssize_t got = _xread(fd, cursor, left);

		if (got == -1)
			perror_exit("read fd %d, buf %p, count %zu", fd, (void *)cursor, left);
		if (got == 0)
			_error_exit("fd %d truncated, buf %p, count %zu", fd, (void *)cursor, left);

		cursor += got;
		left -= got;
	}
}
/*
 * Decode UTF-8 octets into an array of code points.
 *
 * _c:    UTF-8 encoded input
 * _nr_c: number of octets in _c
 * c:     receives a buffer obtained from xmalloc() holding the code points
 * nr_c:  receives the number of code points stored in *c
 *
 * Any malformed input terminates the process through _error_exit():
 * an unknown lead octet, a sequence cut short by the end of input,
 * a bad continuation octet, an overlong encoding, a UTF-16 surrogate
 * (U+D800..U+DFFF) or a value above U+10FFFF.
 */
static void convert_from_utf8(uint8_t *_c, unsigned int _nr_c, uint32_t **c, unsigned int *nr_c)
{
	unsigned int i;

	if (_nr_c >= 0xffffffff / sizeof(uint32_t))
		_error_exit("integer overflow _nr_c %u", _nr_c);

	/* At worst all data is ASCII. */
	*c = xmalloc(_nr_c * sizeof(uint32_t));
	*nr_c = 0;

	i = 0;
	while (i < _nr_c) {
		/*
		 * One row per sequence length; the row index equals the number
		 * of continuation octets. "min" is the smallest code point
		 * that needs that many octets, used to reject overlong forms.
		 */
		static const struct {
			uint8_t mask1, res1;
			uint32_t min;
		} _mask[] = {
			{ 0x80, 0x00, 0 },		/* 0xxxxxxx */
			{ 0xe0, 0xc0, 0x80 },		/* 110xxxxx 10xxxxxx */
			{ 0xf0, 0xe0, 0x800 },		/* 1110xxxx 10xxxxxx 10xxxxxx */
			{ 0xf8, 0xf0, 0x10000 },	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		};
		unsigned int level, j;
		uint32_t ch;

		/* Classify the lead octet. */
		level = 0;
		while (level < sizeof(_mask) / sizeof(_mask[0])) {
			if ((_c[i] & _mask[level].mask1) == _mask[level].res1)
				break;
			level++;
		}
		if (level == sizeof(_mask) / sizeof(_mask[0]))
			_error_exit("invalid UTF-8 octet sequence at %u: %02"PRIx8, i, _c[i]);
		if (i + level >= _nr_c)
			_error_exit("truncated UTF-8 octet sequence at %u: %02"PRIx8, i, _c[i]);
		for (j = 0; j < level; j++) {
			if ((_c[i + j + 1] & 0xc0) != 0x80)
				_error_exit("invalid UTF-8 octet sequence at %u: %02"PRIx8" ... %02"PRIx8, i + j + 1, _c[i], _c[i + j + 1]);
		}

		/* Payload bits of the lead octet, then 6 bits per continuation octet. */
		ch = _c[i] & ~_mask[level].mask1;
		for (j = 0; j < level; j++)
			ch = (ch << 6) | (_c[i + j + 1] & ~0xc0);

		/* Overlong encoding. */
		if (ch < _mask[level].min)
			_error_exit("invalid UTF-8 octet sequence at %u: %02"PRIx8, i, _c[i]);
		/* Surrogates and values past U+10FFFF are not valid in UTF-8. */
		if (ch > 0x10ffff || (0xd800 <= ch && ch <= 0xdfff))
			_error_exit("invalid UTF-8 octet sequence at %u: %02"PRIx8, i, _c[i]);

		i += level + 1;

		(*c)[*nr_c] = ch;
		(*nr_c)++;
	}
}
/* LINE SEPARATOR to catch \n. */
#define LS ((uint32_t)0x2028)

/*
 * Normalize line terminators in place.
 *
 * c:    code points to normalize
 * nr_c: number of code points in c; decremented for every CR LF pair found
 *
 * A CR immediately followed by LF is collapsed into a single character,
 * and every CR, LF, NEL (0x85), VT, FF, U+2028 and U+2029 is replaced
 * with LS.
 */
static void fix_newline(uint32_t *c, unsigned int *nr_c)
{
	unsigned int i;

	i = 0;
	while (i < *nr_c) {
		if (c[i] == 0x0d && i + 1 < *nr_c && c[i + 1] == 0x0a) {
			/*
			 * Drop the CR by shifting the tail down by one element.
			 * memmove() counts bytes, so scale the element count.
			 */
			memmove(&c[i], &c[i + 1], (*nr_c - i - 1) * sizeof(uint32_t));
			(*nr_c)--;
		}
		switch (c[i]) {
		case 0x0d:
		case 0x0a:
		case 0x85:
		case 0x0b:
		case 0x0c:
		case 0x2028:
		case 0x2029:
			c[i] = LS;
		}
		i++;
	}
}
207 static struct pos *line_column(const uint32_t *c, unsigned int nr_c)
209 struct pos *pos;
210 unsigned int line, column;
211 unsigned int i;
213 if (nr_c >= 0xffffffff / sizeof(struct pos))
214 _error_exit("integer overflow nr_c %u", nr_c);
215 pos = xmalloc(nr_c * sizeof(struct pos));
217 line = 1;
218 column = 1;
219 for (i = 0; i < nr_c; i++) {
220 pos[i].line = line;
221 pos[i].column = column;
223 if (c[i] == LS) {
224 line++;
225 column = 1;
226 } else
227 column++;
229 return pos;
232 static void warn_trigraph(const uint32_t *c, unsigned int nr_c, struct pos *pos)
234 unsigned int i;
236 i = 0;
237 while (i + 2 < nr_c) {
238 if (c[i] == '?' && c[i + 1] == '?') {
239 switch (c[i + 2]) {
240 case '=':case ')':case '!':
241 case '(':case '\'':case '>':
242 case '/':case '<':case '-':
243 warning(&pos[i], "trigraph sequence ??%c, ignoring", c[i + 2]);
244 i += 3;
245 break;
246 default:
247 i++;
249 } else
250 i++;
254 static void delete_backslash_newline(uint32_t *c, unsigned int *nr_c, struct pos *pos)
256 unsigned int i;
258 i = 0;
259 while (i + 1 < *nr_c) {
260 if (c[i] == '\\' && c[i + 1] == LS) {
261 unsigned int nr_to_move = *nr_c - i - 2;
263 memmove(&c[i], &c[i + 2], nr_to_move * sizeof(uint32_t));
264 memmove(&pos[i], &pos[i + 2], nr_to_move * sizeof(struct pos));
265 (*nr_c) -= 2;
266 } else
267 i++;
271 struct pp_token {
272 struct pp_token *next;
273 enum pp_token_type {
274 PP_TOKEN_IDENTIFIER = UCHAR_MAX + 1,
275 PP_TOKEN_NUMBER,
276 PP_TOKEN_STRING,
277 PP_TOKEN_CHAR,
279 #define _2(c1, c2) ((((uint32_t)c1) << 8) | ((uint32_t)c2))
280 #define _3(c1, c2, c3) ((((uint32_t)c1) << 16)| (((uint32_t)c2) << 8) | ((uint32_t)c3))
281 PP_TOKEN_DOTDOTDOT = _3('.', '.', '.'),
282 PP_TOKEN_DEREFERENCE = _2('-', '>'),
283 PP_TOKEN_SUB_EQ = _2('-', '='),
284 PP_TOKEN_DEC = _2('-', '-'),
285 PP_TOKEN_ADD_EQ = _2('+', '='),
286 PP_TOKEN_INC = _2('+', '+'),
287 PP_TOKEN_AND_EQ = _2('&', '='),
288 PP_TOKEN_AND = _2('&', '&'),
289 PP_TOKEN_MUL_EQ = _2('*', '='),
290 PP_TOKEN_NOT_EQ = _2('!', '='),
291 PP_TOKEN_DIV_EQ = _2('/', '='),
292 PP_TOKEN_REM_EQ = _2('%', '='),
293 PP_TOKEN_LSHIFT_EQ = _3('<', '<', '='),
294 PP_TOKEN_LSHIFT = _2('<', '<'),
295 PP_TOKEN_LEQ = _2('<', '='),
296 PP_TOKEN_RSHIFT_EQ = _3('>', '>', '='),
297 PP_TOKEN_RSHIFT = _2('>', '>'),
298 PP_TOKEN_GEQ = _2('>', '='),
299 PP_TOKEN_EQ = _2('=', '='),
300 PP_TOKEN_XOR_EQ = _2('^', '='),
301 PP_TOKEN_OR_EQ = _2('|', '='),
302 PP_TOKEN_OR = _2('|', '|'),
303 PP_TOKEN_SHARPSHARP = _2('#', '#'),
304 #undef _2
305 #undef _3
306 } type;
307 uint32_t *id; /* string representation, if type is not enough */
308 struct pos pos;
311 static struct pp_token *pp_token_create(enum pp_token_type type, struct pos *pos)
313 struct pp_token *ppt;
315 ppt = xmalloc(sizeof(struct pp_token));
316 ppt->next = NULL;
317 ppt->type = type;
318 ppt->id = NULL;
319 ppt->pos = *pos;
320 return ppt;
323 /* [start, end) */
324 static void pp_token_add(struct pp_token *ppt, const uint32_t *c, unsigned int start, unsigned int end)
326 ppt->id = xmemdup(&c[start], (end - start) * sizeof(uint32_t));
329 static void pp_token_free(struct pp_token *ppt_head)
331 struct pp_token *ppt;
333 ppt = ppt_head;
334 while (ppt) {
335 struct pp_token *next;
337 next = ppt->next;
338 if (ppt->id)
339 free(ppt->id);
340 free(ppt);
341 ppt = next;
345 static void pp_token_print(struct pp_token *ppt)
347 printf("%u:%u:\t", ppt->pos.line, ppt->pos.column);
348 switch (ppt->type) {
349 case LS:
350 printf("\\n");
351 break;
352 case ' ':
353 printf("' '");
354 break;
355 case PP_TOKEN_IDENTIFIER:
356 printf("pp-identifier");
357 break;
358 case PP_TOKEN_NUMBER:
359 printf("pp-number");
360 break;
361 case PP_TOKEN_STRING:
362 printf("pp-string");
363 break;
364 case PP_TOKEN_CHAR:
365 printf("pp-char");
366 break;
367 case PP_TOKEN_DOTDOTDOT:
368 case PP_TOKEN_LSHIFT_EQ:
369 case PP_TOKEN_RSHIFT_EQ:
370 printf("%c%c%c", (ppt->type >> 16) & 0xff, (ppt->type >> 8) & 0xff, ppt->type & 0xff);
371 break;
372 case PP_TOKEN_DEREFERENCE:
373 case PP_TOKEN_SUB_EQ:
374 case PP_TOKEN_DEC:
375 case PP_TOKEN_ADD_EQ:
376 case PP_TOKEN_INC:
377 case PP_TOKEN_AND_EQ:
378 case PP_TOKEN_AND:
379 case PP_TOKEN_MUL_EQ:
380 case PP_TOKEN_NOT_EQ:
381 case PP_TOKEN_DIV_EQ:
382 case PP_TOKEN_REM_EQ:
383 case PP_TOKEN_LSHIFT:
384 case PP_TOKEN_LEQ:
385 case PP_TOKEN_RSHIFT:
386 case PP_TOKEN_GEQ:
387 case PP_TOKEN_EQ:
388 case PP_TOKEN_XOR_EQ:
389 case PP_TOKEN_OR_EQ:
390 case PP_TOKEN_OR:
391 case PP_TOKEN_SHARPSHARP:
392 printf("%c%c", (ppt->type >> 8) & 0xff, ppt->type & 0xff);
393 break;
394 default:
395 printf("%c", ppt->type);
397 putc('\n', stdout);
/* Return non-zero if c is an ASCII letter or an underscore. */
static int pp_nondigit(const uint32_t c)
{
	if (c == '_')
		return 1;
	if ('a' <= c && c <= 'z')
		return 1;
	return 'A' <= c && c <= 'Z';
}
/* Return non-zero if c is an octal digit, '0' through '7'. */
static int pp_octdigit(const uint32_t c)
{
	return c >= '0' && c <= '7';
}
/* Return non-zero if c is a decimal digit, '0' through '9'. */
static int pp_digit(const uint32_t c)
{
	return c >= '0' && c <= '9';
}
/* Return non-zero if c is a hexadecimal digit: 0-9, a-f or A-F. */
static int pp_hexdigit(const uint32_t c)
{
	if ('0' <= c && c <= '9')
		return 1;
	if ('a' <= c && c <= 'f')
		return 1;
	return 'A' <= c && c <= 'F';
}
/* pp-identifier: ([a-zA-Z_]|\u[0-9a-fA-F]{4}|\U[0-9a-fA-F]{8})([a-zA-Z_0-9]|\u[0-9a-fA-F]{4}|\U[0-9a-fA-F]{8})* */
/*
 * Find the end of an identifier.
 *
 * c:     characters
 * nr_c:  number of characters in c
 * start: index just past the first identifier-nondigit, which the caller
 *        has already "parsed"
 *
 * Returns the index of the first character that does not continue the
 * identifier, or nr_c if the identifier runs to the end of the text.
 */
static unsigned int _pp_identifier_end(const uint32_t *c, unsigned int nr_c, unsigned int start)
{
	unsigned int at = start;

	while (at < nr_c) {
		unsigned int nr_hex, k;

		if (pp_nondigit(c[at]) || pp_digit(c[at])) {
			at++;
			continue;
		}

		/* Anything else must be a universal character name. */
		if (c[at] != '\\' || at + 1 >= nr_c)
			return at;
		if (c[at + 1] == 'u')
			nr_hex = 4;
		else if (c[at + 1] == 'U')
			nr_hex = 8;
		else
			return at;

		/* As well as the hex digits, one more character has to follow. */
		if (at + 1 + nr_hex >= nr_c)
			return at;
		for (k = 0; k < nr_hex; k++) {
			if (!pp_hexdigit(c[at + 2 + k]))
				return at;
		}
		at += 2 + nr_hex;
	}
	return at;
}
/* pp-number: \.?[0-9]([eEpP][+-]|[a-zA-Z_.]|\u[0-9a-fA-F]{4}|\U[0-9a-fA-F]{8})* */
/*
 * Find the end of a preprocessing number.
 *
 * c:     characters
 * nr_c:  number of characters in c
 * start: index of the number's first digit
 *
 * Returns the index of the first character that does not continue the
 * number, or nr_c if the number runs to the end of the text.
 */
static unsigned int pp_number_end(const uint32_t *c, unsigned int nr_c, unsigned int start)
{
	unsigned int at = start + 1;

	while (at < nr_c) {
		const uint32_t ch = c[at];
		unsigned int nr_hex, k;

		/* Exponent marker with a sign; checked before the plain letter case. */
		if ((ch == 'e' || ch == 'E' || ch == 'p' || ch == 'P') &&
		    at + 1 < nr_c && (c[at + 1] == '+' || c[at + 1] == '-')) {
			at += 2;
			continue;
		}
		if (pp_digit(ch) || pp_nondigit(ch) || ch == '.') {
			at++;
			continue;
		}

		/* Anything else must be a universal character name. */
		if (ch != '\\' || at + 1 >= nr_c)
			return at;
		if (c[at + 1] == 'u')
			nr_hex = 4;
		else if (c[at + 1] == 'U')
			nr_hex = 8;
		else
			return at;

		/* As well as the hex digits, one more character has to follow. */
		if (at + 1 + nr_hex >= nr_c)
			return at;
		for (k = 0; k < nr_hex; k++) {
			if (!pp_hexdigit(c[at + 2 + k]))
				return at;
		}
		at += 2 + nr_hex;
	}
	return at;
}
/*
 * Find the end of a C comment.
 *
 * c:     characters
 * nr_c:  number of characters in c
 * start: index of the slash that opens the comment
 *
 * Returns the index just past the closing star-slash pair, or nr_c if the
 * comment is never closed.
 */
static unsigned int c_comment_end(const uint32_t *c, unsigned int nr_c, unsigned int start)
{
	unsigned int k;

	/*
	 * k is the candidate position of the closing slash. It starts one
	 * past the first character after the opener, so the opener's own
	 * star cannot serve as the closing star.
	 */
	for (k = start + 3; k < nr_c; k++) {
		if (c[k - 1] == '*' && c[k] == '/')
			return k + 1;
	}
	return nr_c;
}
481 static unsigned int cpp_comment_end(const uint32_t *c, unsigned int nr_c, unsigned int start)
483 unsigned int i;
485 i = start + 2;
486 while (i < nr_c && c[i] != LS)
487 i++;
488 return i;
491 static unsigned int escape_sequence_end(const uint32_t *c, unsigned int nr_c, unsigned int start, struct pos *_pos)
493 struct pos *pos = &_pos[start];
494 unsigned int i;
496 i = start + 1;
497 if (i >= nr_c)
498 error_exit(pos, "incomplete escape sequence");
499 switch (c[i]) {
500 case '\'':case '"':case '?':case '\\':
501 case 'a':case 'b':case 'f':case 'n':case 'r':case 't':case 'v':
502 return i + 1;
503 case '0':case '1':case '2':case '3':case '4':case '6':case '7':
504 if (i + 2 < nr_c && pp_octdigit(c[i + 1]) && pp_octdigit(c[i + 2]))
505 return i + 3;
506 if (i + 1 < nr_c && pp_octdigit(c[i + 1]))
507 return i + 2;
508 return i + 1;
509 case 'x':
510 i++;
511 while (i < nr_c && pp_hexdigit(c[i]))
512 i++;
513 if (i == start + 2)
514 error_exit(pos, "invalid hexadecimal escape sequence");
515 return i;
516 case 'u':
517 if (i + 4 < nr_c &&
518 pp_hexdigit(c[i + 1]) && pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]))
519 return i + 5;
520 error_exit(pos, "invalid universal character name");
521 case 'U':
522 if (i + 8 < nr_c &&
523 pp_hexdigit(c[i + 1]) && pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) &&
524 pp_hexdigit(c[i + 5]) && pp_hexdigit(c[i + 6]) && pp_hexdigit(c[i + 7]) && pp_hexdigit(c[i + 8]))
525 return i + 9;
526 error_exit(pos, "invalid universal character name");
527 default:
528 error_exit(pos, "invalid escape sequence");
532 static unsigned int pp_string_end(const uint32_t *c, unsigned int nr_c, unsigned int start, struct pos *_pos)
534 struct pos *pos = &_pos[start];
535 unsigned int i;
537 /* Opening " is already "parsed". */
538 i = start + 1;
539 while (i < nr_c && c[i] != '"') {
540 switch (c[i]) {
541 case LS:
542 goto incomplete;
543 case '\\':
544 i = escape_sequence_end(c, nr_c, i, _pos);
545 break;
546 default:
547 i++;
550 if (i >= nr_c)
551 goto incomplete;
552 return i + 1;
554 incomplete:
555 error_exit(pos, "incomplete string literal");
558 static unsigned int pp_char_end(const uint32_t *c, unsigned int nr_c, unsigned int start, struct pos *_pos)
560 struct pos *pos = &_pos[start];
561 unsigned int i;
563 /* Opening ' is already "parsed". */
564 i = start + 1;
565 while (i < nr_c && c[i] != '\'') {
566 switch (c[i]) {
567 case LS:
568 goto incomplete;
569 case '\\':
570 i = escape_sequence_end(c, nr_c, i, _pos);
571 break;
572 default:
573 i++;
576 if (i >= nr_c)
577 goto incomplete;
578 if (i == start + 1)
579 goto empty;
580 return i + 1;
582 incomplete:
583 error_exit(pos, "incomplete character constant");
584 empty:
585 error_exit(pos, "empty character constant");
588 static struct pp_token *pp_tokenize(const uint32_t *c, unsigned int nr_c, struct pos *_pos)
590 struct pp_token *ppt_head, *ppt_tail;
591 unsigned int i;
593 ppt_head = NULL;
594 i = 0;
595 while (i < nr_c) {
596 struct pos *pos = &_pos[i];
597 struct pp_token *ppt;
599 switch (c[i]) {
600 unsigned int j;
602 case '\t':
603 case ' ':
604 ppt = pp_token_create(' ', pos);
605 i++;
606 break;
607 case LS:
608 ppt = pp_token_create(LS, pos);
609 i++;
610 break;
611 case '[':case ']':
612 case '(':case ')':
613 case '{':case '}':
614 case '~':
615 case '?':
616 case ':':
617 case ';':
618 case ',':
619 pp_token_simple:
620 ppt = pp_token_create(c[i], pos);
621 i++;
622 break;
623 case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':case 'g':
624 case 'h':case 'i':case 'j':case 'k':case 'l':case 'm':case 'n':
625 case 'o':case 'p':case 'q':case 'r':case 's':case 't':case 'u':
626 case 'v':case 'w':case 'x':case 'y':case 'z':
627 case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':case 'G':
628 case 'H':case 'I':case 'J':case 'K':case 'L':case 'M':case 'N':
629 case 'O':case 'P':case 'Q':case 'R':case 'S':case 'T':case 'U':
630 case 'V':case 'W':case 'X':case 'Y':case 'Z':
631 case '_':
632 ppt = pp_token_create(PP_TOKEN_IDENTIFIER, pos);
633 j = _pp_identifier_end(c, nr_c, i + 1);
634 pp_token_add(ppt, c, i, j);
635 i = j;
636 break;
637 case '\\':
638 if (i + 5 < nr_c && c[i + 1] == 'u' &&
639 pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) && pp_hexdigit(c[i + 5])) {
640 ppt = pp_token_create(PP_TOKEN_IDENTIFIER, pos);
641 j = _pp_identifier_end(c, nr_c, i + 2 + 4);
642 pp_token_add(ppt, c, i, j);
643 i = j;
644 } else if (i + 9 < nr_c && c[i + 1] == 'U' &&
645 pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) && pp_hexdigit(c[i + 5]) &&
646 pp_hexdigit(c[i + 6]) && pp_hexdigit(c[i + 7]) && pp_hexdigit(c[i + 8]) && pp_hexdigit(c[i + 9])) {
647 ppt = pp_token_create(PP_TOKEN_IDENTIFIER, pos);
648 j = _pp_identifier_end(c, nr_c, i + 2 + 4 + 4);
649 pp_token_add(ppt, c, i, j);
650 i = j;
651 } else
652 error_exit(pos, "unknown character %08"PRIx32, c[i]);
653 break;
654 case '0':case '1':case '2':case '3':case '4':
655 case '5':case '6':case '7':case '8':case '9':
656 ppt = pp_token_create(PP_TOKEN_NUMBER, pos);
657 j = pp_number_end(c, nr_c, i);
658 pp_token_add(ppt, c, i, j);
659 i = j;
660 break;
661 case '.':
662 if (i + 2 < nr_c && c[i + 1] == '.' && c[i + 2] == '.') {
663 ppt = pp_token_create(PP_TOKEN_DOTDOTDOT, pos);
664 i += 3;
665 } else if (i + 1 < nr_c && pp_digit(c[i + 1])) {
666 ppt = pp_token_create(PP_TOKEN_NUMBER, pos);
667 j = pp_number_end(c, nr_c, i + 1);
668 pp_token_add(ppt, c, i, j);
669 i = j;
670 } else
671 goto pp_token_simple;
672 break;
673 case '"':
674 ppt = pp_token_create(PP_TOKEN_STRING, pos);
675 j = pp_string_end(c, nr_c, i, _pos);
676 pp_token_add(ppt, c, i + 1, j - 1);
677 i = j;
678 break;
679 case '\'':
680 ppt = pp_token_create(PP_TOKEN_CHAR, pos);
681 j = pp_char_end(c, nr_c, i, _pos);
682 pp_token_add(ppt, c, i + 1, j - 1);
683 i = j;
684 break;
685 case '/':
686 if (i + 1 < nr_c && c[i + 1] == '*') {
687 ppt = pp_token_create(' ', pos);
688 i = c_comment_end(c, nr_c, i);
689 } else if (i + 1 < nr_c && c[i + 1] == '/') {
690 warning(pos, "C++ comment");
691 ppt = pp_token_create(' ', pos);
692 i = cpp_comment_end(c, nr_c, i);
693 } else if (i + 1 < nr_c && c[i + 1] == '=') {
694 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
695 i += 2;
696 } else
697 goto pp_token_simple;
698 break;
699 case '-':
700 if (i + 1 < nr_c && (c[i + 1] == '>' || c[i + 1] == '=' || c[i + 1] == '-')) {
701 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
702 i += 2;
703 } else
704 goto pp_token_simple;
705 break;
706 case '+':
707 case '&':
708 case '|':
709 if (i + 1 < nr_c && (c[i + 1] == '=' || c[i + 1] == c[i])) {
710 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
711 i += 2;
712 } else
713 goto pp_token_simple;
714 break;
715 case '*':
716 case '!':
717 case '%':
718 case '=':
719 case '^':
720 if (i + 1 < nr_c && c[i + 1] == '=') {
721 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
722 i += 2;
723 } else
724 goto pp_token_simple;
725 break;
726 case '<':
727 case '>':
728 if (i + 2 < nr_c && c[i + 1] == c[i] && c[i + 2] == '=') {
729 ppt = pp_token_create((c[i] << 16) | (c[i + 1] << 8) | c[i + 2], pos);
730 i += 3;
731 } else if (i + 1 < nr_c && (c[i + 1] == c[i] || c[i + 1] == '=')) {
732 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
733 i += 2;
734 } else
735 goto pp_token_simple;
736 break;
737 case '#':
738 if (i + 1 < nr_c && c[i + 1] == '#') {
739 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
740 i += 2;
741 } else
742 goto pp_token_simple;
743 break;
744 default:
745 error_exit(pos, "unknown character %08"PRIx32, c[i]);
748 if (!ppt_head)
749 ppt_head = ppt;
750 else
751 ppt_tail->next = ppt;
752 ppt_tail = ppt;
754 return ppt_head;
757 int main(int argc, char *argv[])
759 int fd;
760 struct stat st;
761 unsigned int st_size;
762 void *buf;
763 uint8_t *_c;
764 unsigned int _nr_c;
765 uint32_t *c;
766 unsigned int nr_c;
767 struct pos *pos;
768 struct pp_token *ppt_head;
770 if (argc < 2)
771 return EXIT_FAILURE;
773 fd = open(argv[1], O_RDONLY);
774 if (fd == -1)
775 perror_exit("open %s", argv[1]);
776 if (fstat(fd, &st) == -1)
777 perror_exit("fstat %s", argv[1]);
778 if (st.st_size < 0)
779 _error_exit("%s: negative st_size %"PRIdMAX, argv[1], (intmax_t)st.st_size);
780 st_size = (unsigned int)(uintmax_t)(intmax_t)st.st_size;
781 if ((uintmax_t)(intmax_t)st.st_size != (uintmax_t)st_size)
782 _error_exit("%s: too big st_size %"PRIdMAX, argv[1], (intmax_t)st.st_size);
784 buf = xmalloc(st_size);
785 xread(fd, buf, st_size);
786 close(fd);
788 _c = buf;
789 _nr_c = st_size;
790 /* Skip UTF-8 "BOM" if any. */
791 if (st_size >= 3 && _c[0] == 0xef && _c[1] == 0xbb && _c[2] == 0xbf) {
792 _c += 3;
793 _nr_c -= 3;
795 convert_from_utf8(_c, _nr_c, &c, &nr_c);
796 free(buf);
798 fix_newline(c, &nr_c);
799 pos = line_column(c, nr_c);
800 warn_trigraph(c, nr_c, pos);
801 delete_backslash_newline(c, &nr_c, pos);
803 ppt_head = pp_tokenize(c, nr_c, pos);
804 free(c);
805 free(pos);
808 struct pp_token *ppt;
810 for (ppt = ppt_head; ppt; ppt = ppt->next)
811 pp_token_print(ppt);
814 pp_token_free(ppt_head);
816 return EXIT_SUCCESS;