Fix conditional branch to same target simplification.
[smatch.git] / tokenize.c
blob98ea9c5cf6c80805d718ddafa71acf2d9bd69ad7
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <sys/stat.h>
19 #include "lib.h"
20 #include "allocate.h"
21 #include "token.h"
22 #include "symbol.h"
/* Value the character readers in this file return at end of input. */
#define EOF (-1)

/* Index that the next successful init_stream() call will hand out. */
int input_stream_nr = 0;

/* Table of known input streams; init_stream() grows it on demand. */
struct stream *input_streams;

/* Number of entries input_streams currently has room for. */
static int input_streams_allocated;

/* Number of bytes requested from read() on each buffer refill. */
#define BUFSIZE (8192)

/* Per-stream tokenizer state. */
typedef struct {
	int fd;				/* descriptor that refills are read from */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* position within the current line */
	int line;			/* current line number */
	int nr;				/* stream index stored into token positions */
	int newline;			/* copied into pos.newline by stream_pos() */
	int whitespace;			/* copied into pos.whitespace by stream_pos() */
	struct token **tokenlist;	/* link slot the next token is stored through */
	struct token *token;		/* token currently being built, if any */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
41 struct position stream_pos(stream_t *stream)
43 struct position pos;
44 pos.type = 0;
45 pos.stream = stream->nr;
46 pos.newline = stream->newline;
47 pos.whitespace = stream->whitespace;
48 pos.pos = stream->pos;
49 pos.line = stream->line;
50 pos.noexpand = 0;
51 return pos;
54 const char *show_special(int val)
56 static const char *combinations[] = COMBINATION_STRINGS;
57 static char buffer[4];
59 buffer[0] = val;
60 buffer[1] = 0;
61 if (val >= SPECIAL_BASE)
62 strcpy(buffer, combinations[val - SPECIAL_BASE]);
63 return buffer;
66 const char *show_ident(const struct ident *ident)
68 static char buffer[256];
69 if (!ident)
70 return "<noident>";
71 sprintf(buffer, "%.*s", ident->len, ident->name);
72 return buffer;
/*
 * Append the source-level spelling of character 'c' at 'ptr'.
 *
 * Printable characters are copied, preceded by a backslash when they
 * equal 'escape' or are a backslash themselves.  Newline and tab become
 * \n and \t.  Everything else becomes an octal escape, padded to three
 * digits when 'next' is a digit so the following character cannot be
 * read as part of the escape.
 *
 * Returns the position just past the last character written.  A NUL is
 * stored there only on the octal path (by sprintf).
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			return ptr + sprintf(ptr, "%03o", c);
		return ptr + sprintf(ptr, "%o", c);
	}

	if (c == '\\' || c == escape)
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
98 const char *show_string(const struct string *string)
100 static char buffer[4 * MAX_STRING + 3];
101 char *ptr;
102 int i;
104 if (!string->length)
105 return "<bad_string>";
106 ptr = buffer;
107 *ptr++ = '"';
108 for (i = 0; i < string->length-1; i++) {
109 const unsigned char *p = string->data + i;
110 ptr = charstr(ptr, p[0], '"', p[1]);
112 *ptr++ = '"';
113 *ptr = '\0';
114 return buffer;
117 const char *show_token(const struct token *token)
119 static char buffer[256];
121 if (!token)
122 return "<no token>";
123 switch (token_type(token)) {
124 case TOKEN_ERROR:
125 return "syntax error";
127 case TOKEN_EOF:
128 return "end-of-input";
130 case TOKEN_IDENT:
131 return show_ident(token->ident);
133 case TOKEN_STRING:
134 return show_string(token->string);
136 case TOKEN_NUMBER:
137 return token->number;
139 case TOKEN_SPECIAL:
140 return show_special(token->special);
142 case TOKEN_CHAR: {
143 char *ptr = buffer;
144 int c = token->character;
145 *ptr++ = '\'';
146 ptr = charstr(ptr, c, '\'', 0);
147 *ptr++ = '\'';
148 *ptr++ = '\0';
149 return buffer;
152 case TOKEN_STREAMBEGIN:
153 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
154 return buffer;
156 case TOKEN_STREAMEND:
157 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
158 return buffer;
160 default:
161 return "WTF???";
165 int init_stream(const char *name, int fd, const char **next_path)
167 int stream = input_stream_nr;
168 struct stream *current;
169 struct stat st;
171 if (stream >= input_streams_allocated) {
172 int newalloc = stream * 4 / 3 + 10;
173 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
174 if (!input_streams)
175 die("Unable to allocate more streams space");
176 input_streams_allocated = newalloc;
178 current = input_streams + stream;
179 memset(current, 0, sizeof(*current));
180 current->name = name;
181 current->fd = fd;
182 current->next_path = next_path;
183 current->constant = CONSTANT_FILE_MAYBE;
184 if (fd >= 0 && fstat(fd, &st) == 0 && S_ISREG(st.st_mode)) {
185 int i;
187 for (i = 0; i < stream; i++) {
188 struct stream *s = input_streams + i;
189 if (s->constant == CONSTANT_FILE_YES &&
190 identical_files(s, &st, name) &&
191 lookup_symbol(s->protect, NS_MACRO))
192 return -1;
195 current->dev = st.st_dev;
196 current->ino = st.st_ino;
198 input_stream_nr = stream+1;
199 return stream;
202 static struct token * alloc_token(stream_t *stream)
204 struct token *token = __alloc_token(0);
205 token->pos = stream_pos(stream);
206 return token;
210 * Argh... That was surprisingly messy - handling '\r' complicates the
211 * things a _lot_.
213 static int nextchar_slow(stream_t *stream)
215 int offset = stream->offset;
216 int size = stream->size;
217 int c;
218 int spliced = 0, had_cr, had_backslash, complain;
220 restart:
221 had_cr = had_backslash = complain = 0;
223 repeat:
224 if (offset >= size) {
225 size = read(stream->fd, stream->buffer, BUFSIZE);
226 if (size <= 0)
227 goto got_eof;
228 stream->size = size;
229 stream->offset = offset = 0;
232 c = stream->buffer[offset++];
234 if (had_cr && c != '\n')
235 complain = 1;
237 if (c == '\r') {
238 had_cr = 1;
239 goto repeat;
242 stream->pos++;
244 if (c == '\n') {
245 stream->line++;
246 stream->pos = 0;
249 if (!had_backslash) {
250 if (c == '\\') {
251 had_backslash = 1;
252 goto repeat;
254 if (c == '\n')
255 stream->newline = 1;
256 } else {
257 if (c == '\n') {
258 if (complain)
259 warning(stream_pos(stream), "non-ASCII data stream");
260 spliced = 1;
261 goto restart;
263 stream->pos--;
264 offset--;
265 c = '\\';
268 out:
269 stream->offset = offset;
270 if (complain)
271 warning(stream_pos(stream), "non-ASCII data stream");
273 return c;
275 got_eof:
276 if (had_backslash) {
277 c = '\\';
278 goto out;
280 if (stream->pos)
281 warning(stream_pos(stream), "no newline at end of file");
282 else if (had_cr)
283 warning(stream_pos(stream), "non-ASCII data stream");
284 else if (spliced)
285 warning(stream_pos(stream), "backslash-newline at end of file");
286 return EOF;
290 * We want that as light as possible while covering all normal cases.
291 * Slow path (including the logics with line-splicing and EOF sanity
292 * checks) is in nextchar_slow().
294 static int nextchar(stream_t *stream)
296 int offset = stream->offset;
298 if (offset < stream->size) {
299 int c = stream->buffer[offset++];
300 static const char special[256] = {
301 ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
303 if (!special[c]) {
304 stream->offset = offset;
305 stream->pos++;
306 return c;
309 return nextchar_slow(stream);
312 struct token eof_token_entry;
314 static void mark_eof(stream_t *stream, struct token *end_token)
316 struct token *end;
318 end = alloc_token(stream);
319 token_type(end) = TOKEN_STREAMEND;
320 end->pos.newline = 1;
322 eof_token_entry.next = &eof_token_entry;
323 eof_token_entry.pos.newline = 1;
325 if (!end_token)
326 end_token = &eof_token_entry;
327 end->next = end_token;
328 *stream->tokenlist = end;
329 stream->tokenlist = NULL;
332 static void add_token(stream_t *stream)
334 struct token *token = stream->token;
336 stream->token = NULL;
337 token->next = NULL;
338 *stream->tokenlist = token;
339 stream->tokenlist = &token->next;
342 static void drop_token(stream_t *stream)
344 stream->newline |= stream->token->pos.newline;
345 stream->whitespace |= stream->token->pos.whitespace;
346 stream->token = NULL;
/* Character-class bits stored in cclass[]. */
enum {
	Letter = 1 << 0,
	Digit = 1 << 1,
	Hex = 1 << 2,
	Exp = 1 << 3,
	Dot = 1 << 4,
	ValidSecond = 1 << 5,
};

/*
 * Class bits for each input character, indexed by (character + 1) so
 * that EOF (-1) lands on entry 0.  Unlisted entries are 0.
 */
static const long cclass[257] = {
	/* decimal digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* letters that double as hex digits */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['F' + 1] = Letter | Hex,
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['f' + 1] = Letter | Hex,

	/* letters that may be followed by a sign inside a pp-number */
	['E' + 1] = Letter | Hex | Exp,
	['e' + 1] = Letter | Hex | Exp,
	['P' + 1] = Letter | Exp,
	['p' + 1] = Letter | Exp,

	/* remaining identifier characters */
	['G' + 1 ... 'O' + 1] = Letter,
	['Q' + 1 ... 'Z' + 1] = Letter,
	['g' + 1 ... 'o' + 1] = Letter,
	['q' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,

	/* characters that may be the second character of a combination */
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
};
385 * pp-number:
386 * digit
387 * . digit
388 * pp-number digit
389 * pp-number identifier-nodigit
390 * pp-number e sign
391 * pp-number E sign
392 * pp-number p sign
393 * pp-number P sign
394 * pp-number .
396 static int get_one_number(int c, int next, stream_t *stream)
398 struct token *token;
399 static char buffer[4095];
400 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
401 int len;
403 *p++ = c;
404 for (;;) {
405 long class = cclass[next + 1];
406 if (!(class & (Dot | Digit | Letter)))
407 break;
408 if (p != buffer_end)
409 *p++ = next;
410 next = nextchar(stream);
411 if (class & Exp) {
412 if (next == '-' || next == '+') {
413 if (p != buffer_end)
414 *p++ = next;
415 next = nextchar(stream);
420 if (p == buffer_end) {
421 error(stream_pos(stream), "number token exceeds %td characters",
422 buffer_end - buffer);
423 // Pretend we saw just "1".
424 buffer[0] = '1';
425 p = buffer + 1;
428 *p++ = 0;
429 len = p - buffer;
430 buf = __alloc_bytes(len);
431 memcpy(buf, buffer, len);
433 token = stream->token;
434 token_type(token) = TOKEN_NUMBER;
435 token->number = buf;
436 add_token(stream);
438 return next;
441 static int escapechar(int first, int type, stream_t *stream, int *valp)
443 int next, value;
445 next = nextchar(stream);
446 value = first;
448 if (first == '\n')
449 warning(stream_pos(stream), "Newline in string or character constant");
451 if (first == '\\' && next != EOF) {
452 value = next;
453 next = nextchar(stream);
454 if (value != type) {
455 switch (value) {
456 case 'a':
457 value = '\a';
458 break;
459 case 'b':
460 value = '\b';
461 break;
462 case 't':
463 value = '\t';
464 break;
465 case 'n':
466 value = '\n';
467 break;
468 case 'v':
469 value = '\v';
470 break;
471 case 'f':
472 value = '\f';
473 break;
474 case 'r':
475 value = '\r';
476 break;
477 case 'e':
478 value = '\e';
479 break;
480 case '\\':
481 break;
482 case '\'':
483 break;
484 case '"':
485 break;
486 case '\n':
487 warning(stream_pos(stream), "Newline in string or character constant");
488 break;
489 case '0'...'7': {
490 int nr = 2;
491 value -= '0';
492 while (next >= '0' && next <= '9') {
493 value = (value << 3) + (next-'0');
494 next = nextchar(stream);
495 if (!--nr)
496 break;
498 value &= 0xff;
499 break;
501 case 'x': {
502 int hex = hexval(next);
503 if (hex < 16) {
504 value = hex;
505 next = nextchar(stream);
506 while ((hex = hexval(next)) < 16) {
507 value = (value << 4) + hex;
508 next = nextchar(stream);
510 value &= 0xff;
511 break;
514 /* Fallthrough */
515 default:
516 warning(stream_pos(stream), "Unknown escape '%c'", value);
519 /* Mark it as escaped */
520 value |= 0x100;
522 *valp = value;
523 return next;
526 static int get_char_token(int next, stream_t *stream)
528 int value;
529 struct token *token;
531 next = escapechar(next, '\'', stream, &value);
532 if (value == '\'' || next != '\'') {
533 warning(stream_pos(stream), "Bad character constant");
534 drop_token(stream);
535 return next;
538 token = stream->token;
539 token_type(token) = TOKEN_CHAR;
540 token->character = value & 0xff;
542 add_token(stream);
543 return nextchar(stream);
546 static int get_string_token(int next, stream_t *stream)
548 static char buffer[MAX_STRING];
549 struct string *string;
550 struct token *token;
551 int len = 0;
553 for (;;) {
554 int val;
555 next = escapechar(next, '"', stream, &val);
556 if (val == '"')
557 break;
558 if (next == EOF) {
559 warning(stream_pos(stream), "End of file in middle of string");
560 return next;
562 if (len < MAX_STRING)
563 buffer[len] = val;
564 len++;
567 if (len > MAX_STRING) {
568 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
569 len = MAX_STRING;
572 string = __alloc_string(len+1);
573 memcpy(string->data, buffer, len);
574 string->data[len] = '\0';
575 string->length = len+1;
577 /* Pass it on.. */
578 token = stream->token;
579 token_type(token) = TOKEN_STRING;
580 token->string = string;
581 add_token(stream);
583 return next;
586 static int drop_stream_eoln(stream_t *stream)
588 int next = nextchar(stream);
589 drop_token(stream);
590 for (;;) {
591 int curr = next;
592 if (curr == EOF)
593 return next;
594 next = nextchar(stream);
595 if (curr == '\n')
596 return next;
600 static int drop_stream_comment(stream_t *stream)
602 int newline;
603 int next;
604 drop_token(stream);
605 newline = stream->newline;
607 next = nextchar(stream);
608 for (;;) {
609 int curr = next;
610 if (curr == EOF) {
611 warning(stream_pos(stream), "End of file in the middle of a comment");
612 return curr;
614 next = nextchar(stream);
615 if (curr == '*' && next == '/')
616 break;
618 stream->newline = newline;
619 return nextchar(stream);
622 unsigned char combinations[][3] = COMBINATION_STRINGS;
624 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
626 static int get_one_special(int c, stream_t *stream)
628 struct token *token;
629 unsigned char c1, c2, c3;
630 int next, value, i;
631 char *comb;
633 next = nextchar(stream);
636 * Check for numbers, strings, character constants, and comments
638 switch (c) {
639 case '.':
640 if (next >= '0' && next <= '9')
641 return get_one_number(c, next, stream);
642 break;
643 case '"':
644 return get_string_token(next, stream);
645 case '\'':
646 return get_char_token(next, stream);
647 case '/':
648 if (next == '/')
649 return drop_stream_eoln(stream);
650 if (next == '*')
651 return drop_stream_comment(stream);
655 * Check for combinations
657 value = c;
658 if (cclass[next + 1] & ValidSecond) {
659 comb = combinations[0];
660 c1 = c; c2 = next; c3 = 0;
661 for (i = 0; i < NR_COMBINATIONS; i++) {
662 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
663 value = i + SPECIAL_BASE;
664 next = nextchar(stream);
665 if (c3)
666 break;
667 c3 = next;
669 comb += 3;
673 /* Pass it on.. */
674 token = stream->token;
675 token_type(token) = TOKEN_SPECIAL;
676 token->special = value;
677 add_token(stream);
678 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: start from the first character, fold in
 * each further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c) (c)
#define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
#define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in a bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup counters reported by show_identifier_stats(). */
int ident_hit;
int ident_miss;
/* Number of identifiers created by create_hashed_ident(). */
int idents;
692 void show_identifier_stats(void)
694 int i;
695 int distribution[100];
697 fprintf(stderr, "identifiers: %d hits, %d misses\n",
698 ident_hit, ident_miss);
700 for (i = 0; i < 100; i++)
701 distribution[i] = 0;
703 for (i = 0; i < IDENT_HASH_SIZE; i++) {
704 struct ident * ident = hash_table[i];
705 int count = 0;
707 while (ident) {
708 count++;
709 ident = ident->next;
711 if (count > 99)
712 count = 99;
713 distribution[count]++;
716 for (i = 0; i < 100; i++) {
717 if (distribution[i])
718 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
722 static struct ident *alloc_ident(const char *name, int len)
724 struct ident *ident = __alloc_ident(len);
725 ident->symbols = NULL;
726 ident->len = len;
727 ident->tainted = 0;
728 memcpy(ident->name, name, len);
729 return ident;
732 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
734 ident->next = hash_table[hash];
735 hash_table[hash] = ident;
736 ident_miss++;
737 return ident;
740 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
742 struct ident *ident;
743 struct ident **p;
745 p = &hash_table[hash];
746 while ((ident = *p) != NULL) {
747 if (ident->len == (unsigned char) len) {
748 const char *n = name;
749 const char *m = ident->name;
750 int l = len;
751 do {
752 if (*n != *m)
753 goto next;
754 n++;
755 m++;
756 } while (--l);
758 ident_hit++;
759 return ident;
761 next:
762 //misses++;
763 p = &ident->next;
765 ident = alloc_ident(name, len);
766 *p = ident;
767 ident->next = NULL;
768 ident_miss++;
769 idents++;
770 return ident;
/*
 * Compute the hash-table bucket for the 'len' bytes at 'name', using
 * the same ident_hash_* steps as get_one_identifier().
 * The first byte is always read, so 'len' must be at least 1.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *cursor = (const unsigned char *)name;
	unsigned long hash = ident_hash_init(*cursor++);
	int remaining;

	for (remaining = len - 1; remaining; remaining--) {
		unsigned int ch = *cursor++;

		hash = ident_hash_add(hash, ch);
	}
	return ident_hash_end(hash);
}
786 struct ident *hash_ident(struct ident *ident)
788 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated 'name', creating and
 * hashing it if it does not exist yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
797 struct token *built_in_token(int stream, const char *name)
799 struct token *token;
801 token = __alloc_token(0);
802 token->pos.stream = stream;
803 token_type(token) = TOKEN_IDENT;
804 token->ident = built_in_ident(name);
805 return token;
808 static int get_one_identifier(int c, stream_t *stream)
810 struct token *token;
811 struct ident *ident;
812 unsigned long hash;
813 char buf[256];
814 int len = 1;
815 int next;
817 hash = ident_hash_init(c);
818 buf[0] = c;
819 for (;;) {
820 next = nextchar(stream);
821 if (!(cclass[next + 1] & (Letter | Digit)))
822 break;
823 if (len >= sizeof(buf))
824 break;
825 hash = ident_hash_add(hash, next);
826 buf[len] = next;
827 len++;
829 hash = ident_hash_end(hash);
831 ident = create_hashed_ident(buf, len, hash);
833 /* Pass it on.. */
834 token = stream->token;
835 token_type(token) = TOKEN_IDENT;
836 token->ident = ident;
837 add_token(stream);
838 return next;
841 static int get_one_token(int c, stream_t *stream)
843 long class = cclass[c + 1];
844 if (class & Digit)
845 return get_one_number(c, nextchar(stream), stream);
846 if (class & Letter)
847 return get_one_identifier(c, stream);
848 return get_one_special(c, stream);
851 static struct token *setup_stream(stream_t *stream, int idx, int fd,
852 unsigned char *buf, unsigned int buf_size)
854 struct token *begin;
856 stream->nr = idx;
857 stream->line = 1;
858 stream->newline = 1;
859 stream->whitespace = 0;
860 stream->pos = 0;
862 stream->token = NULL;
863 stream->fd = fd;
864 stream->offset = 0;
865 stream->size = buf_size;
866 stream->buffer = buf;
868 begin = alloc_token(stream);
869 token_type(begin) = TOKEN_STREAMBEGIN;
870 stream->tokenlist = &begin->next;
871 return begin;
874 static void tokenize_stream(stream_t *stream, struct token *endtoken)
876 int c = nextchar(stream);
877 while (c != EOF) {
878 if (!isspace(c)) {
879 struct token *token = alloc_token(stream);
880 stream->token = token;
881 stream->newline = 0;
882 stream->whitespace = 0;
883 c = get_one_token(c, stream);
884 continue;
886 stream->whitespace = 1;
887 c = nextchar(stream);
889 mark_eof(stream, endtoken);
892 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
894 stream_t stream;
895 struct token *begin;
897 begin = setup_stream(&stream, 0, -1, buffer, size);
898 tokenize_stream(&stream, endtoken);
899 return begin;
902 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
904 struct token *begin;
905 stream_t stream;
906 unsigned char buffer[BUFSIZE];
907 int idx;
909 idx = init_stream(name, fd, next_path);
910 if (idx < 0) {
911 // info(endtoken->pos, "File %s is const", name);
912 return endtoken;
915 begin = setup_stream(&stream, idx, fd, buffer, 0);
916 tokenize_stream(&stream, endtoken);
917 return begin;