added a bunch of gcc builtins
[smatch.git] / tokenize.c
blob497da13ccf46b5a9871c7aada60531d1e7e48a21
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
18 #include "lib.h"
19 #include "allocate.h"
20 #include "token.h"
21 #include "symbol.h"
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Index that the next init_stream() call will hand out. */
int input_stream_nr = 0;
/* Table of registered input streams, grown on demand by init_stream(). */
struct stream *input_streams;
/* Number of entries currently allocated in input_streams[]. */
static int input_streams_allocated;

/* Number of bytes requested from read() on every buffer refill. */
#define BUFSIZE (8192)

/*
 * Working state of one tokenizer run.
 */
typedef struct {
	int fd;			/* descriptor the buffer is refilled from */
	int offset;		/* index of the next unread byte in buffer[] */
	int size;		/* number of valid bytes in buffer[] */
	int pos;		/* position within the current line */
	int line;		/* current line number */
	int nr;			/* stream number stored in token positions */
	int newline;		/* newline flag copied into token positions */
	int whitespace;		/* whitespace flag copied into token positions */
	struct token **tokenlist;	/* link that receives the next token */
	struct token *token;		/* token currently being built */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
40 const char *stream_name(int stream)
42 if (stream < 0 || stream > input_stream_nr)
43 return "<bad stream>";
44 return input_streams[stream].name;
47 static struct position stream_pos(stream_t *stream)
49 struct position pos;
50 pos.type = 0;
51 pos.stream = stream->nr;
52 pos.newline = stream->newline;
53 pos.whitespace = stream->whitespace;
54 pos.pos = stream->pos;
55 pos.line = stream->line;
56 pos.noexpand = 0;
57 return pos;
60 const char *show_special(int val)
62 static const char *combinations[] = COMBINATION_STRINGS;
63 static char buffer[4];
65 buffer[0] = val;
66 buffer[1] = 0;
67 if (val >= SPECIAL_BASE)
68 strcpy(buffer, combinations[val - SPECIAL_BASE]);
69 return buffer;
72 const char *show_ident(const struct ident *ident)
74 static char buffer[256];
75 if (!ident)
76 return "<noident>";
77 sprintf(buffer, "%.*s", ident->len, ident->name);
78 return buffer;
/*
 * Append the source-level spelling of character "c" at "ptr" and return
 * the position just past what was written.
 *
 * Printable characters are copied as they are, preceded by a backslash
 * when they equal "escape" or are a backslash themselves.  Newline and
 * tab become \n and \t.  Everything else is written as an octal escape,
 * padded to three digits when "next" is a digit so that the following
 * character cannot be read as part of the escape.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	int min_digits;

	if (isprint(c)) {
		if (c == escape || c == '\\')
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n' || c == '\t') {
		*ptr++ = (c == '\n') ? 'n' : 't';
		return ptr;
	}

	min_digits = isdigit(next) ? 3 : 1;
	return ptr + sprintf(ptr, "%0*o", min_digits, c);
}
104 const char *show_string(const struct string *string)
106 static char buffer[4 * MAX_STRING + 3];
107 char *ptr;
108 int i;
110 if (!string->length)
111 return "<bad_string>";
112 ptr = buffer;
113 *ptr++ = '"';
114 for (i = 0; i < string->length-1; i++) {
115 const char *p = string->data + i;
116 ptr = charstr(ptr, p[0], '"', p[1]);
118 *ptr++ = '"';
119 *ptr = '\0';
120 return buffer;
123 const char *show_token(const struct token *token)
125 static char buffer[256];
127 if (!token)
128 return "<no token>";
129 switch (token_type(token)) {
130 case TOKEN_ERROR:
131 return "syntax error";
133 case TOKEN_EOF:
134 return "end-of-input";
136 case TOKEN_IDENT:
137 return show_ident(token->ident);
139 case TOKEN_STRING:
140 return show_string(token->string);
142 case TOKEN_NUMBER:
143 return token->number;
145 case TOKEN_SPECIAL:
146 return show_special(token->special);
148 case TOKEN_CHAR: {
149 char *ptr = buffer;
150 int c = token->character;
151 *ptr++ = '\'';
152 ptr = charstr(ptr, c, '\'', 0);
153 *ptr++ = '\'';
154 *ptr++ = '\0';
155 return buffer;
158 case TOKEN_STREAMBEGIN:
159 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
160 return buffer;
162 case TOKEN_STREAMEND:
163 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
164 return buffer;
166 default:
167 return "WTF???";
171 int init_stream(const char *name, int fd, const char **next_path)
173 int stream = input_stream_nr;
174 struct stream *current;
176 if (stream >= input_streams_allocated) {
177 int newalloc = stream * 4 / 3 + 10;
178 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
179 if (!input_streams)
180 die("Unable to allocate more streams space");
181 input_streams_allocated = newalloc;
183 current = input_streams + stream;
184 memset(current, 0, sizeof(*current));
185 current->name = name;
186 current->fd = fd;
187 current->next_path = next_path;
188 current->path = NULL;
189 current->constant = CONSTANT_FILE_MAYBE;
190 input_stream_nr = stream+1;
191 return stream;
194 static struct token * alloc_token(stream_t *stream)
196 struct token *token = __alloc_token(0);
197 token->pos = stream_pos(stream);
198 return token;
202 * Argh... That was surprisingly messy - handling '\r' complicates the
203 * things a _lot_.
205 static int nextchar_slow(stream_t *stream)
207 int offset = stream->offset;
208 int size = stream->size;
209 int c;
210 int spliced = 0, had_cr, had_backslash, complain;
212 restart:
213 had_cr = had_backslash = complain = 0;
215 repeat:
216 if (offset >= size) {
217 size = read(stream->fd, stream->buffer, BUFSIZE);
218 if (size <= 0)
219 goto got_eof;
220 stream->size = size;
221 stream->offset = offset = 0;
224 c = stream->buffer[offset++];
226 if (had_cr && c != '\n')
227 complain = 1;
229 if (c == '\r') {
230 had_cr = 1;
231 goto repeat;
234 stream->pos++;
236 if (c == '\n') {
237 stream->line++;
238 stream->pos = 0;
241 if (!had_backslash) {
242 if (c == '\\') {
243 had_backslash = 1;
244 goto repeat;
246 if (c == '\n')
247 stream->newline = 1;
248 } else {
249 if (c == '\n') {
250 if (complain)
251 warning(stream_pos(stream), "non-ASCII data stream");
252 spliced = 1;
253 goto restart;
255 stream->pos--;
256 offset--;
257 c = '\\';
260 out:
261 stream->offset = offset;
262 if (complain)
263 warning(stream_pos(stream), "non-ASCII data stream");
265 return c;
267 got_eof:
268 if (had_backslash) {
269 c = '\\';
270 goto out;
272 if (stream->pos)
273 warning(stream_pos(stream), "no newline at end of file");
274 else if (had_cr)
275 warning(stream_pos(stream), "non-ASCII data stream");
276 else if (spliced)
277 warning(stream_pos(stream), "backslash-newline at end of file");
278 return EOF;
282 * We want that as light as possible while covering all normal cases.
283 * Slow path (including the logics with line-splicing and EOF sanity
284 * checks) is in nextchar_slow().
286 static int nextchar(stream_t *stream)
288 int offset = stream->offset;
290 if (offset < stream->size) {
291 int c = stream->buffer[offset++];
292 static const char special[256] = {
293 ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
295 if (!special[c]) {
296 stream->offset = offset;
297 stream->pos++;
298 return c;
301 return nextchar_slow(stream);
304 struct token eof_token_entry;
306 static void mark_eof(stream_t *stream, struct token *end_token)
308 struct token *end;
310 end = alloc_token(stream);
311 token_type(end) = TOKEN_STREAMEND;
312 end->pos.newline = 1;
314 eof_token_entry.next = &eof_token_entry;
315 eof_token_entry.pos.newline = 1;
317 if (!end_token)
318 end_token = &eof_token_entry;
319 end->next = end_token;
320 *stream->tokenlist = end;
321 stream->tokenlist = NULL;
324 static void add_token(stream_t *stream)
326 struct token *token = stream->token;
328 stream->token = NULL;
329 token->next = NULL;
330 *stream->tokenlist = token;
331 stream->tokenlist = &token->next;
334 static void drop_token(stream_t *stream)
336 stream->newline |= stream->token->pos.newline;
337 stream->whitespace |= stream->token->pos.whitespace;
338 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter		= 1 << 0,
	Digit		= 1 << 1,
	Hex		= 1 << 2,
	Exp		= 1 << 3,
	Dot		= 1 << 4,
	ValidSecond	= 1 << 5,
};

/*
 * Class bits for every input character.  The table is indexed by
 * "character + 1" so that EOF (-1) maps to slot 0, which has no bits
 * set.
 */
static const long cclass[257] = {
	/* decimal digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* lower-case letters */
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,
	['q' + 1 ... 'z' + 1] = Letter,

	/* upper-case letters */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,
	['Q' + 1 ... 'Z' + 1] = Letter,

	['_' + 1] = Letter,

	/* characters that may come second in a multi-character operator */
	['.' + 1] = Dot | ValidSecond,
	['#' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['=' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
};
377 * pp-number:
378 * digit
379 * . digit
380 * pp-number digit
381 * pp-number identifier-nodigit
382 * pp-number e sign
383 * pp-number E sign
384 * pp-number p sign
385 * pp-number P sign
386 * pp-number .
388 static int get_one_number(int c, int next, stream_t *stream)
390 struct token *token;
391 static char buffer[4095];
392 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
393 int len;
395 *p++ = c;
396 for (;;) {
397 long class = cclass[next + 1];
398 if (!(class & (Dot | Digit | Letter)))
399 break;
400 if (p != buffer_end)
401 *p++ = next;
402 next = nextchar(stream);
403 if (class & Exp) {
404 if (next == '-' || next == '+') {
405 if (p != buffer_end)
406 *p++ = next;
407 next = nextchar(stream);
412 if (p == buffer_end) {
413 sparse_error(stream_pos(stream), "number token exceeds %td characters",
414 buffer_end - buffer);
415 // Pretend we saw just "1".
416 buffer[0] = '1';
417 p = buffer + 1;
420 *p++ = 0;
421 len = p - buffer;
422 buf = __alloc_bytes(len);
423 memcpy(buf, buffer, len);
425 token = stream->token;
426 token_type(token) = TOKEN_NUMBER;
427 token->number = buf;
428 add_token(stream);
430 return next;
433 static int escapechar(int first, int type, stream_t *stream, int *valp)
435 int next, value;
437 next = nextchar(stream);
438 value = first;
440 if (first == '\n')
441 warning(stream_pos(stream), "Newline in string or character constant");
443 if (first == '\\' && next != EOF) {
444 value = next;
445 next = nextchar(stream);
446 if (value != type) {
447 switch (value) {
448 case 'a':
449 value = '\a';
450 break;
451 case 'b':
452 value = '\b';
453 break;
454 case 't':
455 value = '\t';
456 break;
457 case 'n':
458 value = '\n';
459 break;
460 case 'v':
461 value = '\v';
462 break;
463 case 'f':
464 value = '\f';
465 break;
466 case 'r':
467 value = '\r';
468 break;
469 case 'e':
470 value = '\e';
471 break;
472 case '\\':
473 break;
474 case '\'':
475 break;
476 case '"':
477 break;
478 case '\n':
479 warning(stream_pos(stream), "Newline in string or character constant");
480 break;
481 case '0'...'7': {
482 int nr = 2;
483 value -= '0';
484 while (next >= '0' && next <= '9') {
485 value = (value << 3) + (next-'0');
486 next = nextchar(stream);
487 if (!--nr)
488 break;
490 value &= 0xff;
491 break;
493 case 'x': {
494 int hex = hexval(next);
495 if (hex < 16) {
496 value = hex;
497 next = nextchar(stream);
498 while ((hex = hexval(next)) < 16) {
499 value = (value << 4) + hex;
500 next = nextchar(stream);
502 value &= 0xff;
503 break;
506 /* Fallthrough */
507 default:
508 warning(stream_pos(stream), "Unknown escape '%c'", value);
511 /* Mark it as escaped */
512 value |= 0x100;
514 *valp = value;
515 return next;
518 static int get_char_token(int next, stream_t *stream)
520 int value;
521 struct token *token;
523 next = escapechar(next, '\'', stream, &value);
524 if (value == '\'' || next != '\'') {
525 sparse_error(stream_pos(stream), "Bad character constant");
526 drop_token(stream);
527 return next;
530 token = stream->token;
531 token_type(token) = TOKEN_CHAR;
532 token->character = value & 0xff;
534 add_token(stream);
535 return nextchar(stream);
538 static int get_string_token(int next, stream_t *stream)
540 static char buffer[MAX_STRING];
541 struct string *string;
542 struct token *token;
543 int len = 0;
545 for (;;) {
546 int val;
547 next = escapechar(next, '"', stream, &val);
548 if (val == '"')
549 break;
550 if (next == EOF) {
551 warning(stream_pos(stream), "End of file in middle of string");
552 return next;
554 if (len < MAX_STRING)
555 buffer[len] = val;
556 len++;
559 if (len > MAX_STRING) {
560 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
561 len = MAX_STRING;
564 string = __alloc_string(len+1);
565 memcpy(string->data, buffer, len);
566 string->data[len] = '\0';
567 string->length = len+1;
569 /* Pass it on.. */
570 token = stream->token;
571 token_type(token) = TOKEN_STRING;
572 token->string = string;
573 add_token(stream);
575 return next;
578 static int drop_stream_eoln(stream_t *stream)
580 int next = nextchar(stream);
581 drop_token(stream);
582 for (;;) {
583 int curr = next;
584 if (curr == EOF)
585 return next;
586 next = nextchar(stream);
587 if (curr == '\n')
588 return next;
592 static int drop_stream_comment(stream_t *stream)
594 int newline;
595 int next;
596 drop_token(stream);
597 newline = stream->newline;
599 next = nextchar(stream);
600 for (;;) {
601 int curr = next;
602 if (curr == EOF) {
603 warning(stream_pos(stream), "End of file in the middle of a comment");
604 return curr;
606 next = nextchar(stream);
607 if (curr == '*' && next == '/')
608 break;
610 stream->newline = newline;
611 return nextchar(stream);
614 unsigned char combinations[][3] = COMBINATION_STRINGS;
616 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
618 static int get_one_special(int c, stream_t *stream)
620 struct token *token;
621 unsigned char c1, c2, c3;
622 int next, value, i;
623 unsigned char *comb;
625 next = nextchar(stream);
628 * Check for numbers, strings, character constants, and comments
630 switch (c) {
631 case '.':
632 if (next >= '0' && next <= '9')
633 return get_one_number(c, next, stream);
634 break;
635 case '"':
636 return get_string_token(next, stream);
637 case '\'':
638 return get_char_token(next, stream);
639 case '/':
640 if (next == '/')
641 return drop_stream_eoln(stream);
642 if (next == '*')
643 return drop_stream_comment(stream);
647 * Check for combinations
649 value = c;
650 if (cclass[next + 1] & ValidSecond) {
651 comb = combinations[0];
652 c1 = c; c2 = next; c3 = 0;
653 for (i = 0; i < NR_COMBINATIONS; i++) {
654 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
655 value = i + SPECIAL_BASE;
656 next = nextchar(stream);
657 if (c3)
658 break;
659 c3 = next;
661 comb += 3;
665 /* Pass it on.. */
666 token = stream->token;
667 token_type(token) = TOKEN_SPECIAL;
668 token->special = value;
669 add_token(stream);
670 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1 << IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE - 1)

/*
 * Incremental identifier hash: seed with the first character, mix in
 * each further character, then fold the result down to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		(((hash) + ((hash) >> IDENT_HASH_BITS)) & IDENT_HASH_MASK)

/* Buckets of identifiers chained through their "next" pointers. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics reported by show_identifier_stats(). */
static int ident_hit, ident_miss, idents;
684 void show_identifier_stats(void)
686 int i;
687 int distribution[100];
689 fprintf(stderr, "identifiers: %d hits, %d misses\n",
690 ident_hit, ident_miss);
692 for (i = 0; i < 100; i++)
693 distribution[i] = 0;
695 for (i = 0; i < IDENT_HASH_SIZE; i++) {
696 struct ident * ident = hash_table[i];
697 int count = 0;
699 while (ident) {
700 count++;
701 ident = ident->next;
703 if (count > 99)
704 count = 99;
705 distribution[count]++;
708 for (i = 0; i < 100; i++) {
709 if (distribution[i])
710 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
714 static struct ident *alloc_ident(const char *name, int len)
716 struct ident *ident = __alloc_ident(len);
717 ident->symbols = NULL;
718 ident->len = len;
719 ident->tainted = 0;
720 memcpy(ident->name, name, len);
721 return ident;
724 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
726 ident->next = hash_table[hash];
727 hash_table[hash] = ident;
728 ident_miss++;
729 return ident;
732 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
734 struct ident *ident;
735 struct ident **p;
737 p = &hash_table[hash];
738 while ((ident = *p) != NULL) {
739 if (ident->len == (unsigned char) len) {
740 const char *n = name;
741 const char *m = ident->name;
742 int l = len;
743 do {
744 if (*n != *m)
745 goto next;
746 n++;
747 m++;
748 } while (--l);
750 ident_hit++;
751 return ident;
753 next:
754 //misses++;
755 p = &ident->next;
757 ident = alloc_ident(name, len);
758 *p = ident;
759 ident->next = NULL;
760 ident_miss++;
761 idents++;
762 return ident;
/*
 * Compute the hash bucket index for the "len" bytes at "name", using
 * the same ident_hash_*() steps as get_one_identifier().
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *cursor = (const unsigned char *)name;
	unsigned long hash = ident_hash_init(*cursor++);

	/* The first byte seeded the hash; mix in the remaining len-1. */
	for (len--; len; len--) {
		unsigned int byte = *cursor++;

		hash = ident_hash_add(hash, byte);
	}
	return ident_hash_end(hash);
}
778 struct ident *hash_ident(struct ident *ident)
780 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string "name", creating
 * it in the hash table if it is not there yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
789 struct token *built_in_token(int stream, const char *name)
791 struct token *token;
793 token = __alloc_token(0);
794 token->pos.stream = stream;
795 token_type(token) = TOKEN_IDENT;
796 token->ident = built_in_ident(name);
797 return token;
800 static int get_one_identifier(int c, stream_t *stream)
802 struct token *token;
803 struct ident *ident;
804 unsigned long hash;
805 char buf[256];
806 int len = 1;
807 int next;
809 hash = ident_hash_init(c);
810 buf[0] = c;
811 for (;;) {
812 next = nextchar(stream);
813 if (!(cclass[next + 1] & (Letter | Digit)))
814 break;
815 if (len >= sizeof(buf))
816 break;
817 hash = ident_hash_add(hash, next);
818 buf[len] = next;
819 len++;
821 hash = ident_hash_end(hash);
823 ident = create_hashed_ident(buf, len, hash);
825 /* Pass it on.. */
826 token = stream->token;
827 token_type(token) = TOKEN_IDENT;
828 token->ident = ident;
829 add_token(stream);
830 return next;
833 static int get_one_token(int c, stream_t *stream)
835 long class = cclass[c + 1];
836 if (class & Digit)
837 return get_one_number(c, nextchar(stream), stream);
838 if (class & Letter)
839 return get_one_identifier(c, stream);
840 return get_one_special(c, stream);
843 static struct token *setup_stream(stream_t *stream, int idx, int fd,
844 unsigned char *buf, unsigned int buf_size)
846 struct token *begin;
848 stream->nr = idx;
849 stream->line = 1;
850 stream->newline = 1;
851 stream->whitespace = 0;
852 stream->pos = 0;
854 stream->token = NULL;
855 stream->fd = fd;
856 stream->offset = 0;
857 stream->size = buf_size;
858 stream->buffer = buf;
860 begin = alloc_token(stream);
861 token_type(begin) = TOKEN_STREAMBEGIN;
862 stream->tokenlist = &begin->next;
863 return begin;
866 static void tokenize_stream(stream_t *stream, struct token *endtoken)
868 int c = nextchar(stream);
869 while (c != EOF) {
870 if (!isspace(c)) {
871 struct token *token = alloc_token(stream);
872 stream->token = token;
873 stream->newline = 0;
874 stream->whitespace = 0;
875 c = get_one_token(c, stream);
876 continue;
878 stream->whitespace = 1;
879 c = nextchar(stream);
881 mark_eof(stream, endtoken);
884 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token *endtoken)
886 stream_t stream;
887 struct token *begin;
889 begin = setup_stream(&stream, 0, -1, buffer, size);
890 tokenize_stream(&stream, endtoken);
891 return begin;
894 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
896 struct token *begin;
897 stream_t stream;
898 unsigned char buffer[BUFSIZE];
899 int idx;
901 idx = init_stream(name, fd, next_path);
902 if (idx < 0) {
903 // info(endtoken->pos, "File %s is const", name);
904 return endtoken;
907 begin = setup_stream(&stream, idx, fd, buffer, 0);
908 tokenize_stream(&stream, endtoken);
909 return begin;