acc: "parse" #ifdef, #ifndef
[acc.git] / main.c
blob31beb9b54388fea67ed0b67f7877b1177fc707b0
1 /* Alexey's C compiler. */
2 #include <stdlib.h>
3 #include <sys/types.h>
4 #include <sys/stat.h>
5 #include <fcntl.h>
6 #include <stdio.h>
7 #include <errno.h>
8 #include <unistd.h>
9 #include <stdarg.h>
10 #include <stdint.h>
11 #include <inttypes.h>
12 #include <string.h>
13 #include <limits.h>
/* Position of one character in the input: line and column, both counted from 1. */
struct pos {
	unsigned int line, column;
};
/*
 * Function attributes used on the diagnostic helpers: "never returns" and
 * printf-style format checking.  Both expand to nothing when __GNUC__ is
 * not defined.
 */
#ifdef __GNUC__
#define __noreturn	__attribute__((noreturn))
#define __printf(a, b)	__attribute__((format(printf, a, b)))
#else
#define __noreturn
#define __printf(a, b)
#endif
27 static void warning(struct pos *pos, const char *fmt, ...) __printf(2, 3);
28 static void warning(struct pos *pos, const char *fmt, ...)
30 va_list args;
32 fprintf(stderr, "%u:%u: warning: ", pos->line, pos->column);
33 va_start(args, fmt);
34 vfprintf(stderr, fmt, args);
35 va_end(args);
36 fputc('\n', stderr);
39 static void error_exit(struct pos *pos, const char *fmt, ...) __printf(2, 3) __noreturn;
40 static void error_exit(struct pos *pos, const char *fmt, ...)
42 va_list args;
44 fprintf(stderr, "%u:%u: error: ", pos->line, pos->column);
45 va_start(args, fmt);
46 vfprintf(stderr, fmt, args);
47 va_end(args);
48 fputc('\n', stderr);
49 exit(EXIT_FAILURE);
52 static void perror_exit(const char *fmt, ...) __printf(1, 2) __noreturn;
53 static void perror_exit(const char *fmt, ...)
55 int old_errno = errno;
56 va_list args;
58 fputs("acc: ", stderr);
59 va_start(args, fmt);
60 vfprintf(stderr, fmt, args);
61 va_end(args);
62 fputs(": ", stderr);
63 errno = old_errno;
64 perror(NULL);
65 exit(EXIT_FAILURE);
68 static void _error_exit(const char *fmt, ...) __printf(1, 2) __noreturn;
69 static void _error_exit(const char *fmt, ...)
71 va_list args;
73 fputs("acc: error: ", stderr);
74 va_start(args, fmt);
75 vfprintf(stderr, fmt, args);
76 va_end(args);
77 fputc('\n', stderr);
78 exit(EXIT_FAILURE);
/*
 * malloc() that never returns NULL: when the allocation fails the error is
 * reported through perror_exit() and the program terminates.
 *
 * malloc(0) is allowed to return NULL even though nothing went wrong, so a
 * zero-sized request is rounded up to one byte.  Without this an empty
 * input could be reported as an allocation failure.
 *
 * @size: number of bytes requested.
 *
 * Returns a pointer the caller must release with free().
 */
static void *xmalloc(size_t size)
{
	void *p;

	p = malloc(size ? size : 1);
	if (!p)
		perror_exit("%s: size %zu", __func__, size);
	return p;
}
/*
 * Return a freshly allocated copy of the @n bytes starting at @src.
 * Allocation goes through xmalloc(), so the result is never NULL.
 * The caller must release the copy with free().
 */
static void *xmemdup(const void *src, size_t n)
{
	void *dst;

	dst = xmalloc(n);
	memcpy(dst, src, n);
	return dst;
}
/*
 * read() that is restarted for as long as it fails with EAGAIN or EINTR.
 *
 * Returns what the last read() returned: the number of bytes read, 0 at
 * end of file, or -1 with errno set for any other error.
 */
static ssize_t _xread(int fd, void *buf, size_t count)
{
	ssize_t rv;

	do {
		rv = read(fd, buf, count);
	} while (rv == -1 && (errno == EAGAIN || errno == EINTR));
	return rv;
}
/*
 * Read exactly @count bytes from @fd into @buf, looping over short reads.
 *
 * A read error terminates the program through perror_exit(); reaching end
 * of file before @count bytes arrived terminates it through _error_exit().
 */
static void xread(int fd, void *buf, size_t count)
{
	while (count > 0) {
		ssize_t rv;

		rv = _xread(fd, buf, count);
		if (rv == -1)
			perror_exit("read fd %d, buf %p, count %zu", fd, buf, count);
		if (rv == 0)
			_error_exit("fd %d truncated, buf %p, count %zu", fd, buf, count);

		/* Advance past the part that has been read. */
		buf = (char *)buf + rv;
		count -= rv;
	}
}
/*
 * Normalise line endings in place: every "\r\n" pair and every lone '\r'
 * becomes a single '\n'.
 *
 * @c:    buffer of *@nr_c characters, rewritten in place.
 * @nr_c: in: number of characters; out: number of characters left.
 *
 * Single pass with separate read and write indices, so the cost is linear
 * in the input size no matter how many CRLF pairs it contains.
 */
static void fix_newline(char *c, unsigned int *nr_c)
{
	const unsigned int n = *nr_c;
	unsigned int rd, wr;

	wr = 0;
	for (rd = 0; rd < n; rd++) {
		if (c[rd] != '\r') {
			c[wr++] = c[rd];
			continue;
		}
		/* CR of a CRLF pair: drop it, the LF is copied next round. */
		if (rd + 1 < n && c[rd + 1] == '\n')
			continue;
		/* Lone CR is a line ending of its own. */
		c[wr++] = '\n';
	}
	*nr_c = wr;
}
143 static struct pos *line_column(const char *c, unsigned int nr_c)
145 struct pos *pos;
146 unsigned int line, column;
147 unsigned int i;
149 if (nr_c >= 0xffffffff / sizeof(struct pos))
150 _error_exit("integer overflow nr_c %u", nr_c);
151 pos = xmalloc(nr_c * sizeof(struct pos));
153 line = 1;
154 column = 1;
155 for (i = 0; i < nr_c; i++) {
156 pos[i].line = line;
157 pos[i].column = column;
159 if (c[i] == '\n') {
160 line++;
161 column = 1;
162 } else
163 column++;
165 return pos;
168 static void warn_trigraph(const char *c, unsigned int nr_c, struct pos *pos)
170 unsigned int i;
172 i = 0;
173 while (i + 2 < nr_c) {
174 if (c[i] == '?' && c[i + 1] == '?') {
175 switch (c[i + 2]) {
176 case '=':case ')':case '!':
177 case '(':case '\'':case '>':
178 case '/':case '<':case '-':
179 warning(&pos[i], "trigraph sequence ??%c, ignoring", (unsigned char)c[i + 2]);
180 i += 3;
181 break;
182 default:
183 i++;
185 } else
186 i++;
190 static void delete_backslash_newline(char *c, unsigned int *nr_c, struct pos *pos)
192 unsigned int i;
194 i = 0;
195 while (i + 1 < *nr_c) {
196 if (c[i] == '\\' && c[i + 1] == '\n') {
197 unsigned int nr_to_move = *nr_c - i - 2;
199 memmove(&c[i], &c[i + 2], nr_to_move);
200 memmove(&pos[i], &pos[i + 2], nr_to_move * sizeof(struct pos));
201 (*nr_c) -= 2;
202 } else
203 i++;
207 struct pp_token {
208 struct pp_token *next;
209 enum pp_token_type {
210 PP_TOKEN_IDENTIFIER = UCHAR_MAX + 1,
211 PP_TOKEN_NUMBER,
212 PP_TOKEN_STRING,
213 PP_TOKEN_CHAR,
214 PP_TOKEN_INCLUDE_GLOBAL, /* #include < */
215 PP_TOKEN_INCLUDE_LOCAL, /* #include " */
216 PP_TOKEN_DEFINE, /* #define */
217 PP_TOKEN_IFDEF, /* #ifdef */
218 PP_TOKEN_IFNDEF, /* #ifndef */
220 #define _2(c1, c2) (((c1) << 8) | (c2))
221 #define _3(c1, c2, c3) (((c1) << 16)| ((c2) << 8) | (c3))
222 PP_TOKEN_DOTDOTDOT = _3('.', '.', '.'),
223 PP_TOKEN_DEREFERENCE = _2('-', '>'),
224 PP_TOKEN_SUB_EQ = _2('-', '='),
225 PP_TOKEN_DEC = _2('-', '-'),
226 PP_TOKEN_ADD_EQ = _2('+', '='),
227 PP_TOKEN_INC = _2('+', '+'),
228 PP_TOKEN_AND_EQ = _2('&', '='),
229 PP_TOKEN_AND = _2('&', '&'),
230 PP_TOKEN_MUL_EQ = _2('*', '='),
231 PP_TOKEN_NOT_EQ = _2('!', '='),
232 PP_TOKEN_DIV_EQ = _2('/', '='),
233 PP_TOKEN_REM_EQ = _2('%', '='),
234 PP_TOKEN_LSHIFT_EQ = _3('<', '<', '='),
235 PP_TOKEN_LSHIFT = _2('<', '<'),
236 PP_TOKEN_LEQ = _2('<', '='),
237 PP_TOKEN_RSHIFT_EQ = _3('>', '>', '='),
238 PP_TOKEN_RSHIFT = _2('>', '>'),
239 PP_TOKEN_GEQ = _2('>', '='),
240 PP_TOKEN_EQ = _2('=', '='),
241 PP_TOKEN_XOR_EQ = _2('^', '='),
242 PP_TOKEN_OR_EQ = _2('|', '='),
243 PP_TOKEN_OR = _2('|', '|'),
244 PP_TOKEN_SHARPSHARP = _2('#', '#'),
245 #undef _2
246 #undef _3
247 } type;
248 char *str; /* string representation, if type is not enough */
249 struct pos pos;
252 static struct pp_token *pp_token_create(enum pp_token_type type, struct pos *pos)
254 struct pp_token *ppt;
256 ppt = xmalloc(sizeof(struct pp_token));
257 ppt->next = NULL;
258 ppt->type = type;
259 ppt->str = NULL;
260 ppt->pos = *pos;
261 return ppt;
264 /* [start, end) */
265 static void pp_token_add(struct pp_token *ppt, const char *c, unsigned int start, unsigned int end)
267 ppt->str = xmemdup(&c[start], end - start + 1);
268 ppt->str[end - start] = '\0';
271 static void pp_token_free(struct pp_token *ppt_head)
273 struct pp_token *ppt;
275 ppt = ppt_head;
276 while (ppt) {
277 struct pp_token *next;
279 next = ppt->next;
280 if (ppt->str)
281 free(ppt->str);
282 free(ppt);
283 ppt = next;
287 static void pp_token_print(struct pp_token *ppt)
289 printf("%u:%u:\t", ppt->pos.line, ppt->pos.column);
290 switch (ppt->type) {
291 case '\n':
292 printf("\\n");
293 break;
294 case ' ':
295 printf("' '");
296 break;
297 case PP_TOKEN_IDENTIFIER:
298 printf("pp-identifier %s", ppt->str);
299 break;
300 case PP_TOKEN_NUMBER:
301 printf("pp-number %s", ppt->str);
302 break;
303 case PP_TOKEN_STRING:
304 printf("pp-string \"%s\"", ppt->str);
305 break;
306 case PP_TOKEN_CHAR:
307 printf("pp-char '%s'", ppt->str);
308 break;
309 case PP_TOKEN_INCLUDE_GLOBAL:
310 printf("#include <%s>", ppt->str);
311 break;
312 case PP_TOKEN_INCLUDE_LOCAL:
313 printf("#include \"%s\"", ppt->str);
314 break;
315 case PP_TOKEN_DEFINE:
316 printf("#define %s", ppt->str);
317 break;
318 case PP_TOKEN_IFDEF:
319 printf("#ifdef %s", ppt->str);
320 break;
321 case PP_TOKEN_IFNDEF:
322 printf("#ifndef %s", ppt->str);
323 break;
324 case PP_TOKEN_DOTDOTDOT:
325 case PP_TOKEN_LSHIFT_EQ:
326 case PP_TOKEN_RSHIFT_EQ:
327 printf("%c%c%c", (ppt->type >> 16) & 0xff, (ppt->type >> 8) & 0xff, ppt->type & 0xff);
328 break;
329 case PP_TOKEN_DEREFERENCE:
330 case PP_TOKEN_SUB_EQ:
331 case PP_TOKEN_DEC:
332 case PP_TOKEN_ADD_EQ:
333 case PP_TOKEN_INC:
334 case PP_TOKEN_AND_EQ:
335 case PP_TOKEN_AND:
336 case PP_TOKEN_MUL_EQ:
337 case PP_TOKEN_NOT_EQ:
338 case PP_TOKEN_DIV_EQ:
339 case PP_TOKEN_REM_EQ:
340 case PP_TOKEN_LSHIFT:
341 case PP_TOKEN_LEQ:
342 case PP_TOKEN_RSHIFT:
343 case PP_TOKEN_GEQ:
344 case PP_TOKEN_EQ:
345 case PP_TOKEN_XOR_EQ:
346 case PP_TOKEN_OR_EQ:
347 case PP_TOKEN_OR:
348 case PP_TOKEN_SHARPSHARP:
349 printf("%c%c", (ppt->type >> 8) & 0xff, ppt->type & 0xff);
350 break;
351 default:
352 printf("%c", ppt->type);
354 putc('\n', stdout);
/* Return non-zero if @c is an ASCII letter or an underscore. */
static int pp_nondigit(const char c)
{
	return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
}
/* Return non-zero if @c is an octal digit, '0' to '7'. */
static int pp_octdigit(const char c)
{
	return '0' <= c && c <= '7';
}
/* Return non-zero if @c is a decimal digit, '0' to '9'. */
static int pp_digit(const char c)
{
	return '0' <= c && c <= '9';
}
/* Return non-zero if @c is a hexadecimal digit: 0-9, a-f or A-F. */
static int pp_hexdigit(const char c)
{
	return pp_digit(c) || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
}
/*
 * Find where an identifier ends.
 *
 * Scanning begins at @start and continues while the input holds letters,
 * underscores, digits, or a universal character name written as a
 * backslash, 'u' and four hexadecimal digits, or a backslash, 'U' and eight
 * hexadecimal digits.
 *
 * Returns the index of the first character that is not part of the
 * identifier, or @nr_c if the identifier runs to the end of the input.
 */
static unsigned int _pp_identifier_end(const char *c, unsigned int nr_c, unsigned int start)
{
	unsigned int i;

	i = start;
	while (i < nr_c) {
		if (pp_nondigit(c[i]) || pp_digit(c[i])) {
			i++;
		} else if (i + 5 < nr_c && c[i] == '\\' && c[i + 1] == 'u' &&
			   pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) && pp_hexdigit(c[i + 5])) {
			i += 2 + 4;
		} else if (i + 9 < nr_c && c[i] == '\\' && c[i + 1] == 'U' &&
			   pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) && pp_hexdigit(c[i + 5]) &&
			   pp_hexdigit(c[i + 6]) && pp_hexdigit(c[i + 7]) && pp_hexdigit(c[i + 8]) && pp_hexdigit(c[i + 9])) {
			i += 2 + 4 + 4;
		} else
			return i;
	}
	return i;
}
/*
 * Find where a preprocessing number ends.
 *
 * The character at @start is accepted without being looked at; scanning
 * continues from @start + 1 over digits, letters, underscores, '.',
 * an exponent marker (e, E, p or P) followed directly by a sign, and
 * universal character names.
 *
 * Returns the index of the first character that is not part of the
 * number, or @nr_c if the number runs to the end of the input.
 */
static unsigned int pp_number_end(const char *c, unsigned int nr_c, unsigned int start)
{
	unsigned int i;

	i = start + 1;
	while (i < nr_c) {
		/* The signed-exponent test must come first: 'e' alone is a letter. */
		if ((c[i] == 'e' || c[i] == 'E' || c[i] == 'p' || c[i] == 'P') &&
		    i + 1 < nr_c && (c[i + 1] == '+' || c[i + 1] == '-')) {
			i += 2;
		} else if (pp_digit(c[i]) || pp_nondigit(c[i]) || c[i] == '.') {
			i++;
		} else if (c[i] == '\\' && i + 5 < nr_c && c[i + 1] == 'u' &&
			   pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) && pp_hexdigit(c[i + 5])) {
			i += 2 + 4;
		} else if (c[i] == '\\' && i + 9 < nr_c && c[i + 1] == 'U' &&
			   pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) && pp_hexdigit(c[i + 5]) &&
			   pp_hexdigit(c[i + 6]) && pp_hexdigit(c[i + 7]) && pp_hexdigit(c[i + 8]) && pp_hexdigit(c[i + 9])) {
			i += 2 + 4 + 4;
		} else
			return i;
	}
	return i;
}
/*
 * Find where a block comment ends.  @start is the index of the slash that
 * opens the comment; the search for the closing star-slash pair begins
 * after the two opening characters, so the opening star cannot double as
 * the closing one.
 *
 * Returns the index just past the closing pair, or @nr_c if the comment is
 * never closed (no diagnostic is issued here in that case).
 */
static unsigned int c_comment_end(const char *c, unsigned int nr_c, unsigned int start)
{
	unsigned int i;

	for (i = start + 2; i + 1 < nr_c; i++) {
		if (c[i] == '*' && c[i + 1] == '/')
			return i + 2;
	}
	return nr_c;
}
/*
 * Find where a line comment ends.  @start is the index of the first of the
 * two slashes.
 *
 * Returns the index of the newline that ends the comment (the newline is
 * not consumed), or @nr_c if the input ends first.
 */
static unsigned int cpp_comment_end(const char *c, unsigned int nr_c, unsigned int start)
{
	unsigned int i;

	for (i = start + 2; i < nr_c; i++) {
		if (c[i] == '\n')
			break;
	}
	return i;
}
445 static unsigned int escape_sequence_end(const char *c, unsigned int nr_c, unsigned int start, struct pos *_pos)
447 struct pos *pos = &_pos[start];
448 unsigned int i;
450 i = start + 1;
451 if (i >= nr_c)
452 error_exit(pos, "incomplete escape sequence");
453 switch (c[i]) {
454 case '\'':case '"':case '?':case '\\':
455 case 'a':case 'b':case 'f':case 'n':case 'r':case 't':case 'v':
456 return i + 1;
457 case '0':case '1':case '2':case '3':case '4':case '6':case '7':
458 if (i + 2 < nr_c && pp_octdigit(c[i + 1]) && pp_octdigit(c[i + 2]))
459 return i + 3;
460 if (i + 1 < nr_c && pp_octdigit(c[i + 1]))
461 return i + 2;
462 return i + 1;
463 case 'x':
464 i++;
465 while (i < nr_c && pp_hexdigit(c[i]))
466 i++;
467 if (i == start + 2)
468 error_exit(pos, "invalid hexadecimal escape sequence");
469 return i;
470 case 'u':
471 if (i + 4 < nr_c &&
472 pp_hexdigit(c[i + 1]) && pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]))
473 return i + 5;
474 error_exit(pos, "invalid universal character name");
475 case 'U':
476 if (i + 8 < nr_c &&
477 pp_hexdigit(c[i + 1]) && pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) &&
478 pp_hexdigit(c[i + 5]) && pp_hexdigit(c[i + 6]) && pp_hexdigit(c[i + 7]) && pp_hexdigit(c[i + 8]))
479 return i + 9;
480 error_exit(pos, "invalid universal character name");
481 default:
482 error_exit(pos, "invalid escape sequence");
486 static unsigned int pp_string_end(const char *c, unsigned int nr_c, unsigned int start, struct pos *_pos)
488 struct pos *pos = &_pos[start];
489 unsigned int i;
491 i = start + 1;
492 while (i < nr_c && c[i] != '"') {
493 switch (c[i]) {
494 case '\n':
495 goto incomplete;
496 case '\\':
497 i = escape_sequence_end(c, nr_c, i, _pos);
498 break;
499 default:
500 i++;
503 if (i >= nr_c)
504 goto incomplete;
505 return i + 1;
507 incomplete:
508 error_exit(pos, "incomplete string literal");
511 static unsigned int pp_char_end(const char *c, unsigned int nr_c, unsigned int start, struct pos *_pos)
513 struct pos *pos = &_pos[start];
514 unsigned int i;
516 i = start + 1;
517 while (i < nr_c && c[i] != '\'') {
518 switch (c[i]) {
519 case '\n':
520 goto incomplete;
521 case '\\':
522 i = escape_sequence_end(c, nr_c, i, _pos);
523 break;
524 default:
525 i++;
528 if (i >= nr_c)
529 goto incomplete;
530 if (i == start + 1)
531 goto empty;
532 return i + 1;
534 incomplete:
535 error_exit(pos, "incomplete character constant");
536 empty:
537 error_exit(pos, "empty character constant");
/*
 * Skip horizontal whitespace.  Returns the index of the first character at
 * or after @start that is neither a space nor a tab, or @nr_c if there is
 * none.  Newlines are not skipped.
 */
static unsigned int whitespace_end(const char *c, unsigned int nr_c, unsigned int start)
{
	unsigned int i;

	for (i = start; i < nr_c; i++) {
		if (c[i] != ' ' && c[i] != '\t')
			break;
	}
	return i;
}
550 static struct pp_token *pp_define(const char *c, unsigned int nr_c, unsigned int *start, struct pos *pos)
552 struct pp_token *ppt;
553 unsigned int name_start, name_end;
554 unsigned int i;
556 i = *start;
557 if (i >= nr_c || !pp_nondigit(c[i]))
558 error_exit(pos, "invalid #define directive");
559 name_start = i;
560 i = _pp_identifier_end(c, nr_c, i + 1);
561 name_end = i;
563 ppt = pp_token_create(PP_TOKEN_DEFINE, pos);
564 pp_token_add(ppt, c, name_start, name_end);
566 while (i < nr_c && c[i] != '\n')
567 i++;
568 *start = i + 1;
569 return ppt;
/*
 * Handle an #error directive: report the rest of the line (from *@start up
 * to, not including, the newline) as an error at @pos and terminate the
 * program.  Never returns; the return type only matches the other
 * directive handlers.
 */
static struct pp_token *pp_error(const char *c, unsigned int nr_c, unsigned int *start, struct pos *pos)
{
	unsigned int j;

	j = *start;
	while (j < nr_c && c[j] != '\n')
		j++;
	/* The precision argument of "%.*s" has to be an int. */
	error_exit(pos, "%.*s", (int)(j - *start), &c[*start]);
}
582 static struct pp_token *pp_include(const char *c, unsigned int nr_c, unsigned int *start, struct pos *pos)
584 struct pp_token *ppt;
585 unsigned int name_start, name_end;
586 enum pp_token_type type;
587 unsigned int j;
589 j = *start;
590 if (j + 2 >= nr_c || (c[j] != '<' && c[j] != '"'))
591 error_exit(pos, "invalid #include directive");
592 j++;
593 name_start = j;
594 if (c[j - 1] == '<') {
595 while (j < nr_c && c[j] != '>')
596 j++;
597 type = PP_TOKEN_INCLUDE_GLOBAL;
598 } else {
599 while (j < nr_c && c[j] != '"')
600 j++;
601 type = PP_TOKEN_INCLUDE_LOCAL;
603 if (j >= nr_c)
604 error_exit(pos, "invalid #include directive");
605 name_end = j;
607 ppt = pp_token_create(type, pos);
608 pp_token_add(ppt, c, name_start, name_end);
610 j++;
611 if (j < nr_c && c[j] == '\n')
612 j++;
613 *start = j;
614 return ppt;
617 static struct pp_token *pp_ifdef(const char *c, unsigned int nr_c, unsigned int *start, struct pos *pos)
619 struct pp_token *ppt;
620 unsigned int name_start, name_end;
621 unsigned int i;
623 i = *start;
624 if (i >= nr_c || !pp_nondigit(c[i]))
625 error_exit(pos, "invalid #ifdef directive");
626 name_start = i;
627 i = _pp_identifier_end(c, nr_c, i + 1);
628 name_end = i;
630 ppt = pp_token_create(PP_TOKEN_IFDEF, pos);
631 pp_token_add(ppt, c, name_start, name_end);
633 while (i < nr_c && c[i] != '\n')
634 i++;
635 *start = i + 1;
636 return ppt;
639 static struct pp_token *pp_ifndef(const char *c, unsigned int nr_c, unsigned int *start, struct pos *pos)
641 struct pp_token *ppt;
642 unsigned int name_start, name_end;
643 unsigned int i;
645 i = *start;
646 if (i >= nr_c || !pp_nondigit(c[i]))
647 error_exit(pos, "invalid #ifndef directive");
648 name_start = i;
649 i = _pp_identifier_end(c, nr_c, i + 1);
650 name_end = i;
652 ppt = pp_token_create(PP_TOKEN_IFNDEF, pos);
653 pp_token_add(ppt, c, name_start, name_end);
655 while (i < nr_c && c[i] != '\n')
656 i++;
657 *start = i + 1;
658 return ppt;
661 static struct pp_token *pp_tokenize(const char *c, unsigned int nr_c, struct pos *_pos)
663 struct pp_token *ppt_head, *ppt_tail;
664 int pp_directive_allowed;
665 unsigned int i;
667 ppt_head = NULL;
668 pp_directive_allowed = 1;
669 i = 0;
670 while (i < nr_c) {
671 struct pos *pos;
672 struct pp_token *ppt;
674 if (pp_directive_allowed) {
675 static const struct __ppd {
676 unsigned int len;
677 const char *str;
678 struct pp_token * (*f)(const char *c, unsigned int nr_c, unsigned int *start, struct pos *pos);
679 } _ppd[] = {
680 { .len = 6, .str = "define", .f = pp_define, },
681 { .len = 5, .str = "error", .f = pp_error, },
682 { .len = 7, .str = "include", .f = pp_include, },
683 { .len = 5, .str = "ifdef", .f = pp_ifdef, },
684 { .len = 6, .str = "ifndef", .f = pp_ifndef, },
686 unsigned int sharp_start, ppd_start, ppd_end;
687 unsigned int j, k;
689 j = whitespace_end(c, nr_c, i);
690 if (j >= nr_c || c[j] != '#')
691 goto not_pp_directive;
692 sharp_start = j;
693 pos = &_pos[sharp_start];
694 j = whitespace_end(c, nr_c, j + 1);
695 if (j >= nr_c) {
696 warning(pos, "empty preprocessor directive");
697 i = j;
698 continue;
700 if (c[j] == '\n') {
701 warning(pos, "empty preprocessor directive");
702 /* Eat newline after # */
703 i = j + 1;
704 continue;
707 ppd_start = j;
708 while (j < nr_c && pp_nondigit(c[j]))
709 j++;
710 ppd_end = j;
712 for (k = 0; k < sizeof(_ppd) / sizeof(_ppd[0]); k++) {
713 const struct __ppd *ppd = &_ppd[k];
715 if (ppd_end - ppd_start == ppd->len && memcmp(&c[ppd_start], ppd->str, ppd->len) == 0)
716 break;
718 if (k >= sizeof(_ppd) / sizeof(_ppd[0]))
719 error_exit(pos, "unknown preprocessor directive '#%.*s'", ppd_end - ppd_start, &c[ppd_start]);
720 j = whitespace_end(c, nr_c, j);
721 ppt = _ppd[k].f(c, nr_c, &j, pos);
722 i = j;
723 goto pp_token_link;
726 not_pp_directive:
727 pos = &_pos[i];
728 switch (c[i]) {
729 unsigned int j;
731 case '\t':
732 case ' ':
733 ppt = pp_token_create(' ', pos);
734 i++;
735 break;
736 case '\n':
737 ppt = pp_token_create('\n', pos);
738 i++;
739 break;
740 case '[':case ']':
741 case '(':case ')':
742 case '{':case '}':
743 case '~':
744 case '?':
745 case ':':
746 case ';':
747 case ',':
748 pp_token_simple:
749 ppt = pp_token_create(c[i], pos);
750 i++;
751 break;
752 case 'a':case 'b':case 'c':case 'd':case 'e':case 'f':case 'g':
753 case 'h':case 'i':case 'j':case 'k':case 'l':case 'm':case 'n':
754 case 'o':case 'p':case 'q':case 'r':case 's':case 't':case 'u':
755 case 'v':case 'w':case 'x':case 'y':case 'z':
756 case 'A':case 'B':case 'C':case 'D':case 'E':case 'F':case 'G':
757 case 'H':case 'I':case 'J':case 'K':case 'L':case 'M':case 'N':
758 case 'O':case 'P':case 'Q':case 'R':case 'S':case 'T':case 'U':
759 case 'V':case 'W':case 'X':case 'Y':case 'Z':
760 case '_':
761 ppt = pp_token_create(PP_TOKEN_IDENTIFIER, pos);
762 j = _pp_identifier_end(c, nr_c, i + 1);
763 pp_token_add(ppt, c, i, j);
764 i = j;
765 break;
766 case '\\':
767 if (i + 5 < nr_c && c[i + 1] == 'u' &&
768 pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) && pp_hexdigit(c[i + 5])) {
769 ppt = pp_token_create(PP_TOKEN_IDENTIFIER, pos);
770 j = _pp_identifier_end(c, nr_c, i + 2 + 4);
771 pp_token_add(ppt, c, i, j);
772 i = j;
773 } else if (i + 9 < nr_c && c[i + 1] == 'U' &&
774 pp_hexdigit(c[i + 2]) && pp_hexdigit(c[i + 3]) && pp_hexdigit(c[i + 4]) && pp_hexdigit(c[i + 5]) &&
775 pp_hexdigit(c[i + 6]) && pp_hexdigit(c[i + 7]) && pp_hexdigit(c[i + 8]) && pp_hexdigit(c[i + 9])) {
776 ppt = pp_token_create(PP_TOKEN_IDENTIFIER, pos);
777 j = _pp_identifier_end(c, nr_c, i + 2 + 4 + 4);
778 pp_token_add(ppt, c, i, j);
779 i = j;
780 } else
781 error_exit(pos, "unknown character %08"PRIx32, c[i]);
782 break;
783 case '0':case '1':case '2':case '3':case '4':
784 case '5':case '6':case '7':case '8':case '9':
785 ppt = pp_token_create(PP_TOKEN_NUMBER, pos);
786 j = pp_number_end(c, nr_c, i);
787 pp_token_add(ppt, c, i, j);
788 i = j;
789 break;
790 case '.':
791 if (i + 2 < nr_c && c[i + 1] == '.' && c[i + 2] == '.') {
792 ppt = pp_token_create(PP_TOKEN_DOTDOTDOT, pos);
793 i += 3;
794 } else if (i + 1 < nr_c && pp_digit(c[i + 1])) {
795 ppt = pp_token_create(PP_TOKEN_NUMBER, pos);
796 j = pp_number_end(c, nr_c, i + 1);
797 pp_token_add(ppt, c, i, j);
798 i = j;
799 } else
800 goto pp_token_simple;
801 break;
802 case '"':
803 ppt = pp_token_create(PP_TOKEN_STRING, pos);
804 j = pp_string_end(c, nr_c, i, _pos);
805 pp_token_add(ppt, c, i + 1, j - 1);
806 i = j;
807 break;
808 case '\'':
809 ppt = pp_token_create(PP_TOKEN_CHAR, pos);
810 j = pp_char_end(c, nr_c, i, _pos);
811 pp_token_add(ppt, c, i + 1, j - 1);
812 i = j;
813 break;
814 case '/':
815 if (i + 1 < nr_c && c[i + 1] == '*') {
816 ppt = pp_token_create(' ', pos);
817 i = c_comment_end(c, nr_c, i);
818 } else if (i + 1 < nr_c && c[i + 1] == '/') {
819 warning(pos, "C++ comment");
820 ppt = pp_token_create(' ', pos);
821 i = cpp_comment_end(c, nr_c, i);
822 } else if (i + 1 < nr_c && c[i + 1] == '=') {
823 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
824 i += 2;
825 } else
826 goto pp_token_simple;
827 break;
828 case '-':
829 if (i + 1 < nr_c && (c[i + 1] == '>' || c[i + 1] == '=' || c[i + 1] == '-')) {
830 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
831 i += 2;
832 } else
833 goto pp_token_simple;
834 break;
835 case '+':
836 case '&':
837 case '|':
838 if (i + 1 < nr_c && (c[i + 1] == '=' || c[i + 1] == c[i])) {
839 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
840 i += 2;
841 } else
842 goto pp_token_simple;
843 break;
844 case '*':
845 case '!':
846 case '%':
847 case '=':
848 case '^':
849 if (i + 1 < nr_c && c[i + 1] == '=') {
850 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
851 i += 2;
852 } else
853 goto pp_token_simple;
854 break;
855 case '<':
856 case '>':
857 if (i + 2 < nr_c && c[i + 1] == c[i] && c[i + 2] == '=') {
858 ppt = pp_token_create((c[i] << 16) | (c[i + 1] << 8) | c[i + 2], pos);
859 i += 3;
860 } else if (i + 1 < nr_c && (c[i + 1] == c[i] || c[i + 1] == '=')) {
861 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
862 i += 2;
863 } else
864 goto pp_token_simple;
865 break;
866 case '#':
867 if (i + 1 < nr_c && c[i + 1] == '#') {
868 ppt = pp_token_create((c[i] << 8) | c[i + 1], pos);
869 i += 2;
870 } else
871 goto pp_token_simple;
872 break;
873 default:
874 error_exit(pos, "unknown character %08"PRIx32, c[i]);
877 if (ppt->type == '\n')
878 pp_directive_allowed = 1;
879 else if (ppt->type == ' ')
881 else
882 pp_directive_allowed = 0;
884 pp_token_link:
885 if (!ppt_head)
886 ppt_head = ppt;
887 else
888 ppt_tail->next = ppt;
889 ppt_tail = ppt;
891 return ppt_head;
894 int main(int argc, char *argv[])
896 int fd;
897 struct stat st;
898 unsigned int st_size;
899 void *buf;
900 char *c;
901 unsigned int nr_c;
902 struct pos *pos;
903 struct pp_token *ppt_head;
905 if (argc < 2)
906 return EXIT_FAILURE;
908 fd = open(argv[1], O_RDONLY);
909 if (fd == -1)
910 perror_exit("open %s", argv[1]);
911 if (fstat(fd, &st) == -1)
912 perror_exit("fstat %s", argv[1]);
913 if (st.st_size < 0)
914 _error_exit("%s: negative st_size %"PRIdMAX, argv[1], (intmax_t)st.st_size);
915 st_size = (unsigned int)(uintmax_t)(intmax_t)st.st_size;
916 if ((uintmax_t)(intmax_t)st.st_size != (uintmax_t)st_size)
917 _error_exit("%s: too big st_size %"PRIdMAX, argv[1], (intmax_t)st.st_size);
919 buf = xmalloc(st_size);
920 xread(fd, buf, st_size);
921 close(fd);
923 c = buf;
924 nr_c = st_size;
926 fix_newline(c, &nr_c);
927 pos = line_column(c, nr_c);
928 warn_trigraph(c, nr_c, pos);
929 delete_backslash_newline(c, &nr_c, pos);
931 ppt_head = pp_tokenize(c, nr_c, pos);
932 free(c);
933 free(pos);
936 struct pp_token *ppt;
938 for (ppt = ppt_head; ppt; ppt = ppt->next) {
939 pp_token_print(ppt);
943 pp_token_free(ppt_head);
945 return EXIT_SUCCESS;