[be] Handle 'break' and 'continue' inside loops.
[smatch.git] / tokenize.c
blob9f3b369b9c919e26f607a2a671acfaa5feb36a3e
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <sys/stat.h>
19 #include "lib.h"
20 #include "token.h"
21 #include "symbol.h"
/*
 * End-of-input marker returned by nextchar().  <stdio.h> normally provides
 * EOF already; only define it here when it is missing, so the standard
 * macro is never redefined with a different replacement list.
 */
#ifndef EOF
#define EOF (-1)
#endif

/* Number of stream slots handed out so far by init_stream(). */
int input_stream_nr = 0;
/* Growable table of stream descriptors, indexed by stream number. */
struct stream *input_streams;
/* Number of entries input_streams currently has room for. */
static int input_streams_allocated;

/* Number of bytes requested from read() each time nextchar() refills. */
#define BUFSIZE (8192)
31 typedef struct {
32 int fd, offset, size;
33 struct position pos;
34 struct token **tokenlist;
35 struct token *token;
36 unsigned char *buffer;
37 } stream_t;
40 const char *show_special(int val)
42 static const char *combinations[] = COMBINATION_STRINGS;
43 static char buffer[4];
45 buffer[0] = val;
46 buffer[1] = 0;
47 if (val >= SPECIAL_BASE)
48 strcpy(buffer, combinations[val - SPECIAL_BASE]);
49 return buffer;
52 const char *show_ident(const struct ident *ident)
54 static char buffer[256];
55 if (!ident)
56 return "<noident>";
57 sprintf(buffer, "%.*s", ident->len, ident->name);
58 return buffer;
/*
 * Write the source-level spelling of character 'c' at 'ptr'.
 *
 * ptr:    where to write
 * c:      the character to render
 * escape: the quote character that must be backslash-escaped
 * next:   the character that follows 'c'; if it is a digit, an octal escape
 *         is padded to three digits
 *
 * Returns a pointer just past the last character written.  Only the octal
 * escape paths leave a terminating NUL behind (written by sprintf and not
 * counted in the returned pointer).
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			return ptr + sprintf(ptr, "%03o", c);
		return ptr + sprintf(ptr, "%o", c);
	}

	/* Printable: only the quote character and backslash need escaping */
	if (c == '\\' || c == escape)
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
84 const char *show_string(const struct string *string)
86 static char buffer[256];
87 char *ptr;
88 int i;
90 ptr = buffer;
91 *ptr++ = '"';
92 for (i = 0; i < string->length-1; i++) {
93 const unsigned char *p = string->data + i;
94 ptr = charstr(ptr, p[0], '"', p[1]);
96 *ptr++ = '"';
97 *ptr = '\0';
98 return buffer;
101 const char *show_token(const struct token *token)
103 static char buffer[256];
105 if (!token)
106 return "<no token>";
107 switch (token_type(token)) {
108 case TOKEN_ERROR:
109 return "syntax error";
111 case TOKEN_EOF:
112 return "end-of-input";
114 case TOKEN_IDENT:
115 return show_ident(token->ident);
117 case TOKEN_STRING:
118 return show_string(token->string);
120 case TOKEN_INTEGER: {
121 const char *p = token->integer;
122 switch (*p) {
123 case 'o': // octal
124 case 'x': // hex
125 buffer[0] = '0';
126 strcpy(buffer+1, p+1);
127 return buffer;
128 default:
129 return p;
133 case TOKEN_FP:
134 return token->fp;
136 case TOKEN_SPECIAL:
137 return show_special(token->special);
139 case TOKEN_CHAR: {
140 char *ptr = buffer;
141 int c = token->character;
142 *ptr++ = '\'';
143 ptr = charstr(ptr, c, '\'', 0);
144 *ptr++ = '\'';
145 *ptr++ = '\0';
146 return buffer;
149 case TOKEN_STREAMBEGIN:
150 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
151 return buffer;
153 case TOKEN_STREAMEND:
154 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
155 return buffer;
157 default:
158 return "WTF???";
162 int init_stream(const char *name, int fd)
164 int stream = input_stream_nr;
165 struct stream *current;
167 if (stream >= input_streams_allocated) {
168 int newalloc = stream * 4 / 3 + 10;
169 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
170 if (!input_streams)
171 die("Unable to allocate more streams space");
172 input_streams_allocated = newalloc;
174 current = input_streams + stream;
175 memset(current, 0, sizeof(*current));
176 current->name = name;
177 current->fd = fd;
178 current->constant = -1; // "unknown"
179 if (fd > 0) {
180 int i;
181 struct stat st;
183 fstat(fd, &st);
184 current->dev = st.st_dev;
185 current->ino = st.st_ino;
186 for (i = 0; i < stream; i++) {
187 struct stream *s = input_streams + i;
188 if (s->dev == st.st_dev && s->ino == st.st_ino) {
189 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
190 return -1;
194 input_stream_nr = stream+1;
195 return stream;
198 static struct token * alloc_token(stream_t *stream)
200 struct token *token = __alloc_token(0);
201 token->pos = stream->pos;
202 return token;
205 static int nextchar(stream_t *stream)
207 int offset = stream->offset;
208 int size = stream->size;
209 int c;
210 int complain = -1;
212 repeat:
213 complain++;
214 if (offset >= size) {
215 size = read(stream->fd, stream->buffer, BUFSIZE);
216 if (size <= 0)
217 return EOF;
218 stream->size = size;
219 stream->offset = 0;
220 offset = 0;
222 c = stream->buffer[offset];
223 stream->offset = ++offset;
225 stream->pos.pos++;
227 /* Ignore DOS-stype '\r' characters */
228 if (c == '\r')
229 goto repeat;
231 if (c == '\n') {
232 stream->pos.line++;
233 stream->pos.newline = 1;
234 stream->pos.pos = 0;
235 complain = 0;
238 if (complain)
239 warn(stream->pos, "non-ASCII data stream");
241 return c;
244 struct token eof_token_entry;
246 static void mark_eof(stream_t *stream, struct token *end_token)
248 struct token *end;
250 end = alloc_token(stream);
251 token_type(end) = TOKEN_STREAMEND;
252 end->pos.newline = 1;
254 eof_token_entry.next = &eof_token_entry;
255 eof_token_entry.pos.newline = 1;
257 if (!end_token)
258 end_token = &eof_token_entry;
259 end->next = end_token;
260 *stream->tokenlist = end;
261 stream->tokenlist = NULL;
264 static void add_token(stream_t *stream)
266 struct token *token = stream->token;
268 stream->token = NULL;
269 token->next = NULL;
270 *stream->tokenlist = token;
271 stream->tokenlist = &token->next;
274 static void drop_token(stream_t *stream)
276 stream->pos.newline |= stream->token->pos.newline;
277 stream->pos.whitespace |= stream->token->pos.whitespace;
278 stream->token = NULL;
281 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
283 char *buf = *p;
285 *buf++ = next;
286 for (;;) {
287 unsigned int n;
288 next = nextchar(stream);
289 n = hexval(next);
290 if (n >= base)
291 break;
292 *buf++ = next;
294 *p = buf;
295 return next;
298 static int do_fp(char *buffer, int len, int next, stream_t *stream)
300 struct token *token = stream->token;
301 void *buf;
303 /* Get the decimal part */
304 if (next == '.') {
305 buffer[len++] = next;
306 next = nextchar(stream);
307 while (next >= '0' && next <= '9') {
308 buffer[len++] = next;
309 next = nextchar(stream);
313 /* Get the exponential part */
314 if (next == 'e' || next == 'E') {
315 buffer[len++] = next;
316 next = nextchar(stream);
317 while (next >= '0' && next <= '9') {
318 buffer[len++] = next;
319 next = nextchar(stream);
323 /* Get the 'lf' type specifiers */
324 while (next == 'f' || next == 'F' || next == 'l' || next == 'L') {
325 buffer[len++] = next;
326 next = nextchar(stream);
329 buffer[len++] = '\0';
330 buf = __alloc_bytes(len);
331 memcpy(buf, buffer, len);
332 token_type(token) = TOKEN_FP;
333 token->fp = buf;
334 add_token(stream);
335 return next;
338 static int do_integer(char *buffer, int len, int next, stream_t *stream)
340 struct token *token = stream->token;
341 void *buf;
343 if (next == '.' || next == 'e' || next == 'E')
344 return do_fp(buffer, len, next, stream);
346 while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
347 buffer[len++] = next;
348 next = nextchar(stream);
350 buffer[len++] = '\0';
351 buf = __alloc_bytes(len);
352 memcpy(buf, buffer, len);
353 token_type(token) = TOKEN_INTEGER;
354 token->integer = buf;
355 add_token(stream);
356 return next;
359 static int get_one_number(int c, stream_t *stream)
361 static char buffer[256];
362 int next = nextchar(stream);
363 char *p = buffer;
365 *p++ = c;
366 switch (next) {
367 case '0'...'7':
368 if (c == '0') {
369 buffer[0] = 'o';
370 next = get_base_number(8, &p, next, stream);
371 break;
373 /* fallthrough */
374 case '8'...'9':
375 next = get_base_number(10, &p, next, stream);
376 break;
377 case 'x': case 'X':
378 if (c == '0') {
379 buffer[0] = 'x';
380 next = get_base_number(16, &p, next, stream);
383 return do_integer(buffer, p - buffer, next, stream);
386 static int escapechar(int first, int type, stream_t *stream, int *valp)
388 int next, value;
390 next = nextchar(stream);
391 value = first;
393 if (first == '\n')
394 warn(stream->pos, "Newline in string or character constant");
396 if (first == '\\' && next != EOF) {
397 value = next;
398 next = nextchar(stream);
399 if (value != type) {
400 switch (value) {
401 case 'a':
402 value = '\a';
403 break;
404 case 'b':
405 value = '\b';
406 break;
407 case 't':
408 value = '\t';
409 break;
410 case 'n':
411 value = '\n';
412 break;
413 case 'v':
414 value = '\v';
415 break;
416 case 'f':
417 value = '\f';
418 break;
419 case 'r':
420 value = '\r';
421 break;
422 case 'e':
423 value = '\e';
424 break;
425 case '\\':
426 break;
427 case '\'':
428 break;
429 case '"':
430 break;
431 case '\n':
432 next = escapechar(next, type, stream, &value);
433 break;
434 case '0'...'7': {
435 int nr = 2;
436 value -= '0';
437 while (next >= '0' && next <= '9') {
438 value = (value << 3) + (next-'0');
439 next = nextchar(stream);
440 if (!--nr)
441 break;
443 value &= 0xff;
444 break;
446 case 'x': {
447 int hex = hexval(next);
448 if (hex < 16) {
449 value = hex;
450 next = nextchar(stream);
451 while ((hex = hexval(next)) < 16) {
452 value = (value << 4) + hex;
453 next = nextchar(stream);
455 value &= 0xff;
456 break;
459 /* Fallthrough */
460 default:
461 warn(stream->pos, "Unknown escape '%c'", value);
464 /* Mark it as escaped */
465 value |= 0x100;
467 *valp = value;
468 return next;
471 static int get_char_token(int next, stream_t *stream)
473 int value;
474 struct token *token;
476 next = escapechar(next, '\'', stream, &value);
477 if (value == '\'' || next != '\'') {
478 warn(stream->pos, "Bad character constant");
479 drop_token(stream);
480 return next;
483 token = stream->token;
484 token_type(token) = TOKEN_CHAR;
485 token->character = value & 0xff;
487 add_token(stream);
488 return nextchar(stream);
491 static int get_string_token(int next, stream_t *stream)
493 static char buffer[512];
494 struct string *string;
495 struct token *token;
496 int len = 0;
498 for (;;) {
499 int val;
500 next = escapechar(next, '"', stream, &val);
501 if (val == '"')
502 break;
503 if (next == EOF) {
504 warn(stream->pos, "Enf of file in middle of string");
505 return next;
507 if (len < sizeof(buffer)) {
508 buffer[len] = val;
509 len++;
514 if (len > 256)
515 warn(stream->pos, "String too long");
517 string = __alloc_string(len+1);
518 memcpy(string->data, buffer, len);
519 string->data[len] = '\0';
520 string->length = len+1;
522 /* Pass it on.. */
523 token = stream->token;
524 token_type(token) = TOKEN_STRING;
525 token->string = string;
526 add_token(stream);
528 return next;
531 static int drop_stream_eoln(stream_t *stream)
533 int next = nextchar(stream);
534 drop_token(stream);
535 for (;;) {
536 int curr = next;
537 if (curr == EOF)
538 return next;
539 next = nextchar(stream);
540 if (curr == '\n')
541 return next;
545 static int drop_stream_comment(stream_t *stream)
547 int next = nextchar(stream);
548 drop_token(stream);
549 for (;;) {
550 int curr = next;
551 if (curr == EOF) {
552 warn(stream->pos, "End of file in the middle of a comment");
553 return curr;
555 next = nextchar(stream);
556 if (curr == '*' && next == '/')
557 break;
559 return nextchar(stream);
562 unsigned char combinations[][3] = COMBINATION_STRINGS;
564 #define NR_COMBINATIONS (sizeof(combinations)/3)
566 static int get_one_special(int c, stream_t *stream)
568 struct token *token;
569 unsigned char c1, c2, c3;
570 int next, value, i;
571 char *comb;
573 next = nextchar(stream);
576 * Check for strings, character constants, and comments
578 switch (c) {
579 case '"':
580 return get_string_token(next, stream);
581 case '\'':
582 return get_char_token(next, stream);
583 case '/':
584 if (next == '/')
585 return drop_stream_eoln(stream);
586 if (next == '*')
587 return drop_stream_comment(stream);
591 * Check for combinations
593 value = c;
594 comb = combinations[0];
595 c1 = c; c2 = next; c3 = 0;
596 for (i = 0; i < NR_COMBINATIONS; i++) {
597 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
598 value = i + SPECIAL_BASE;
599 next = nextchar(stream);
600 if (c3)
601 break;
602 c3 = next;
604 comb += 3;
607 /* Pass it on.. */
608 token = stream->token;
609 token_type(token) = TOKEN_SPECIAL;
610 token->special = value;
611 add_token(stream);
612 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1 << IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE - 1)

/*
 * Incremental identifier hash: seed it with the first character, fold in
 * each further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(ch)		(ch)
#define ident_hash_add(hash, ch)	((hash)*11 + (ch))
#define ident_hash_end(h)		((((h) >> IDENT_HASH_BITS) + (h)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in one bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics, printed by show_identifier_stats(). */
int ident_hit, ident_miss;
626 void show_identifier_stats(void)
628 int i;
629 int distribution[100];
631 fprintf(stderr, "identifiers: %d hits, %d misses\n",
632 ident_hit, ident_miss);
634 for (i = 0; i < 100; i++)
635 distribution[i] = 0;
637 for (i = 0; i < IDENT_HASH_SIZE; i++) {
638 struct ident * ident = hash_table[i];
639 int count = 0;
641 while (ident) {
642 count++;
643 ident = ident->next;
645 if (count > 99)
646 count = 99;
647 distribution[count]++;
650 for (i = 0; i < 100; i++) {
651 if (distribution[i])
652 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
656 static struct ident *alloc_ident(const char *name, int len)
658 struct ident *ident = __alloc_ident(len);
659 ident->symbols = NULL;
660 ident->len = len;
661 memcpy(ident->name, name, len);
662 return ident;
665 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
667 ident->next = hash_table[hash];
668 hash_table[hash] = ident;
669 ident_miss++;
670 return ident;
673 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
675 struct ident *ident;
677 ident = hash_table[hash];
678 while (ident) {
679 if (ident->len == len && !memcmp(ident->name, name, len)) {
680 ident_hit++;
681 return ident;
683 ident = ident->next;
686 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash bucket index for the 'len' bytes at 'name', using the
 * same init/add/end steps that get_one_identifier() applies while reading.
 *
 * A length of zero or less hashes as if the name were a single NUL byte,
 * without touching 'name'; this avoids a runaway loop and an out-of-bounds
 * read for empty names.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;
	int i;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0UL));

	hash = ident_hash_init(p[0]);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, p[i]);
	return ident_hash_end(hash);
}
702 struct ident *hash_ident(struct ident *ident)
704 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string 'name', creating and
 * hashing it if it does not exist yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len;
	unsigned long bucket;

	len = strlen(name);
	bucket = hash_name(name, len);
	return create_hashed_ident(name, len, bucket);
}
713 struct token *built_in_token(int stream, const char *name)
715 struct token *token;
717 token = __alloc_token(0);
718 token->pos.stream = stream;
719 token_type(token) = TOKEN_IDENT;
720 token->ident = built_in_ident(name);
721 return token;
724 static int get_one_identifier(int c, stream_t *stream)
726 struct token *token;
727 struct ident *ident;
728 unsigned long hash;
729 char buf[256];
730 int len = 1;
731 int next;
733 hash = ident_hash_init(c);
734 buf[0] = c;
735 for (;;) {
736 next = nextchar(stream);
737 switch (next) {
738 case '0'...'9':
739 case 'a'...'z':
740 case 'A'...'Z':
741 case '_':
742 if (len < sizeof(buf)) {
743 hash = ident_hash_add(hash, next);
744 buf[len] = next;
745 len++;
747 continue;
749 break;
751 hash = ident_hash_end(hash);
753 ident = create_hashed_ident(buf, len, hash);
755 /* Pass it on.. */
756 token = stream->token;
757 token_type(token) = TOKEN_IDENT;
758 token->ident = ident;
759 add_token(stream);
760 return next;
763 static int get_one_token(int c, stream_t *stream)
765 switch (c) {
766 case '0'...'9':
767 return get_one_number(c, stream);
768 case 'a'...'z':
769 case 'A'...'Z':
770 case '_':
771 return get_one_identifier(c, stream);
772 default:
773 return get_one_special(c, stream);
777 static struct token *setup_stream(stream_t *stream, int idx, int fd,
778 unsigned char *buf, unsigned int buf_size)
780 struct token *begin;
782 stream->pos.stream = idx;
783 stream->pos.line = 1;
784 stream->pos.newline = 1;
785 stream->pos.whitespace = 0;
786 stream->pos.pos = 0;
788 stream->token = NULL;
789 stream->fd = fd;
790 stream->offset = 0;
791 stream->size = buf_size;
792 stream->buffer = buf;
794 begin = alloc_token(stream);
795 token_type(begin) = TOKEN_STREAMBEGIN;
796 stream->tokenlist = &begin->next;
797 return begin;
800 static void tokenize_stream(stream_t *stream, struct token *endtoken)
802 int c = nextchar(stream);
803 while (c != EOF) {
804 if (c == '\\') {
805 c = nextchar(stream);
806 stream->pos.newline = 0;
807 stream->pos.whitespace = 1;
808 continue;
810 if (!isspace(c)) {
811 struct token *token = alloc_token(stream);
812 stream->token = token;
813 stream->pos.newline = 0;
814 stream->pos.whitespace = 0;
815 c = get_one_token(c, stream);
816 continue;
818 stream->pos.whitespace = 1;
819 c = nextchar(stream);
821 mark_eof(stream, endtoken);
824 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
826 stream_t stream;
827 struct token *begin;
829 begin = setup_stream(&stream, 0, -1, buffer, size);
830 tokenize_stream(&stream, endtoken);
831 return begin;
834 struct token * tokenize(const char *name, int fd, struct token *endtoken)
836 struct token *begin;
837 stream_t stream;
838 unsigned char buffer[BUFSIZE];
839 int idx;
841 idx = init_stream(name, fd);
842 if (idx < 0)
843 return endtoken;
845 begin = setup_stream(&stream, idx, fd, buffer, 0);
846 tokenize_stream(&stream, endtoken);
847 return begin;