Handle '#' properly (well, _more_ properly) in macro expansion.
[smatch.git] / tokenize.c
blob a2a63bc9c55dc24a47e3ab0d60595525ba207228
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Linus Torvalds, all rights reserved.
6 */
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <stdarg.h>
10 #include <stddef.h>
11 #include <string.h>
12 #include <ctype.h>
13 #include <unistd.h>
14 #include <sys/stat.h>
16 #include "lib.h"
17 #include "token.h"
18 #include "symbol.h"
/* Value nextchar() returns once the input is exhausted. */
#define EOF (-1)

/* Number of entries of input_streams[] handed out by init_stream(). */
int input_stream_nr = 0;

/* Table describing every stream registered so far. */
struct stream *input_streams;

/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;

/* Size in bytes of the per-stream read buffer. */
#define BUFSIZE (8192)

/*
 * Tokenizer state for one input file while it is being scanned.
 */
typedef struct {
	int fd;				/* descriptor the text is read from */
	int stream;			/* index into input_streams[] */
	int line;			/* current line number */
	int pos;			/* position within the current line */
	int offset;			/* index of the next unread byte in buffer[] */
	int size;			/* number of valid bytes in buffer[] */
	unsigned int newline:1;		/* a newline was seen since the last token */
	unsigned int whitespace:1;	/* whitespace was seen since the last token */
	struct token **tokenlist;	/* where the next token gets linked in */
	struct token *token;		/* token currently being built, if any */
	unsigned char buffer[BUFSIZE];	/* raw bytes read from fd */
} stream_t;
36 const char *show_special(int val)
38 static const char *combinations[] = COMBINATION_STRINGS;
39 static char buffer[4];
41 buffer[0] = val;
42 buffer[1] = 0;
43 if (val >= SPECIAL_BASE)
44 strcpy(buffer, combinations[val - SPECIAL_BASE]);
45 return buffer;
48 const char *show_ident(const struct ident *ident)
50 static char buffer[256];
51 sprintf(buffer, "%.*s", ident->len, ident->name);
52 return buffer;
/*
 * Append the source-code spelling of one character to an output buffer.
 *
 * ptr:    where to write; no NUL is appended for printable characters
 * c:      the character to render
 * escape: the quote character in effect; it gets a backslash in front
 * next:   the character that follows c in the text being shown
 *
 * Printable characters are emitted as-is, with a leading backslash for
 * the quote character and for backslash itself.  Newline and tab become
 * \n and \t.  Everything else becomes an octal escape, padded to three
 * digits when the following character is a digit so that it cannot be
 * read as part of the escape.
 *
 * Returns the position just past the last character written.
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	const char *octal_format;

	if (isprint(c)) {
		if (c == '\\' || c == escape)
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	octal_format = isdigit(next) ? "%03o" : "%o";
	return ptr + sprintf(ptr, octal_format, c);
}
78 const char *show_token(const struct token *token)
80 static char buffer[256];
82 if (!token)
83 return "<no token>";
84 switch (token->type) {
85 case TOKEN_ERROR:
86 return "syntax error";
88 case TOKEN_EOF:
89 return "end-of-input";
91 case TOKEN_IDENT:
92 return show_ident(token->ident);
94 case TOKEN_STRING: {
95 char *ptr;
96 int i;
97 struct string *string = token->string;
99 ptr = buffer;
100 *ptr++ = '"';
101 for (i = 0; i < string->length-1; i++) {
102 unsigned char *p = string->data + i;
103 ptr = charstr(ptr, p[0], '"', p[1]);
105 *ptr++ = '"';
106 *ptr = '\0';
107 return buffer;
110 case TOKEN_INTEGER: {
111 const char *p = token->integer;
112 switch (*p) {
113 case 'o': // octal
114 case 'x': // hex
115 buffer[0] = '0';
116 strcpy(buffer+1, p+1);
117 return buffer;
118 default:
119 return p;
123 case TOKEN_FP:
124 return token->fp;
126 case TOKEN_SPECIAL:
127 return show_special(token->special);
129 case TOKEN_CHAR: {
130 char *ptr = buffer;
131 int c = token->character;
132 *ptr++ = '\'';
133 ptr = charstr(ptr, c, '\'', 0);
134 *ptr++ = '\'';
135 *ptr++ = '\0';
136 return buffer;
139 case TOKEN_STREAMBEGIN:
140 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->stream)->name);
141 return buffer;
143 case TOKEN_STREAMEND:
144 sprintf(buffer, "<end of '%s'>", (input_streams + token->stream)->name);
145 return buffer;
147 default:
148 return "WTF???";
152 int init_stream(const char *name, int fd)
154 int stream = input_stream_nr;
155 struct stream *current;
157 if (stream >= input_streams_allocated) {
158 int newalloc = stream * 4 / 3 + 10;
159 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
160 if (!input_streams)
161 die("Unable to allocate more streams space");
162 input_streams_allocated = newalloc;
164 current = input_streams + stream;
165 memset(current, 0, sizeof(*current));
166 current->name = name;
167 current->fd = fd;
168 current->constant = -1; // "unknown"
169 if (fd > 0) {
170 int i;
171 struct stat st;
173 fstat(fd, &st);
174 current->dev = st.st_dev;
175 current->ino = st.st_ino;
176 for (i = 0; i < stream; i++) {
177 struct stream *s = input_streams + i;
178 if (s->dev == st.st_dev && s->ino == st.st_ino) {
179 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
180 return -1;
184 input_stream_nr = stream+1;
185 return stream;
188 static struct token * alloc_token(stream_t *stream)
190 struct token *token = __alloc_token(0);
191 token->line = stream->line;
192 token->pos = stream->pos;
193 token->stream = stream->stream;
194 token->newline = stream->newline;
195 token->whitespace = stream->whitespace;
196 return token;
199 static int nextchar(stream_t *stream)
201 int offset = stream->offset;
202 int size = stream->size;
203 int c;
205 if (offset >= size) {
206 size = read(stream->fd, stream->buffer, sizeof(stream->buffer));
207 if (size <= 0)
208 return EOF;
209 stream->size = size;
210 stream->offset = 0;
211 offset = 0;
213 c = stream->buffer[offset];
214 stream->offset = offset + 1;
215 stream->pos++;
216 if (c == '\n') {
217 stream->line++;
218 stream->newline = 1;
219 stream->pos = 0;
221 return c;
224 struct token eof_token_entry;
226 static void mark_eof(stream_t *stream, struct token *end_token)
228 struct token *end;
230 end = alloc_token(stream);
231 end->type = TOKEN_STREAMEND;
232 end->newline = 1;
234 eof_token_entry.next = &eof_token_entry;
235 eof_token_entry.newline = 1;
237 if (!end_token)
238 end_token = &eof_token_entry;
239 end->next = end_token;
240 *stream->tokenlist = end;
241 stream->tokenlist = NULL;
244 static void add_token(stream_t *stream)
246 struct token *token = stream->token;
248 stream->token = NULL;
249 token->next = NULL;
250 *stream->tokenlist = token;
251 stream->tokenlist = &token->next;
254 static void drop_token(stream_t *stream)
256 stream->newline |= stream->token->newline;
257 stream->whitespace |= stream->token->whitespace;
258 stream->token = NULL;
261 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
263 char *buf = *p;
265 *buf++ = next;
266 for (;;) {
267 unsigned int n;
268 next = nextchar(stream);
269 n = hexval(next);
270 if (n >= base)
271 break;
272 *buf++ = next;
274 *p = buf;
275 return next;
278 static int do_integer(char *buffer, int len, int next, stream_t *stream)
280 struct token *token = stream->token;
281 void *buf;
283 while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
284 buffer[len++] = next;
285 next = nextchar(stream);
287 buffer[len++] = '\0';
288 buf = __alloc_bytes(len);
289 memcpy(buf, buffer, len);
290 token->type = TOKEN_INTEGER;
291 token->integer = buf;
292 add_token(stream);
293 return next;
296 static int get_one_number(int c, stream_t *stream)
298 static char buffer[256];
299 int next = nextchar(stream);
300 char *p = buffer;
302 *p++ = c;
303 switch (next) {
304 case '0'...'7':
305 if (c == '0') {
306 buffer[0] = 'o';
307 next = get_base_number(8, &p, next, stream);
308 break;
310 /* fallthrough */
311 case '8'...'9':
312 next = get_base_number(10, &p, next, stream);
313 break;
314 case 'x': case 'X':
315 if (c == '0') {
316 buffer[0] = 'x';
317 next = get_base_number(16, &p, next, stream);
320 return do_integer(buffer, p - buffer, next, stream);
323 static int escapechar(int first, int type, stream_t *stream, int *valp)
325 int next, value;
327 next = nextchar(stream);
328 value = first;
330 if (first == '\n')
331 warn(stream->token, "Newline in string or character constant");
333 if (first == '\\' && next != EOF) {
334 value = next;
335 next = nextchar(stream);
336 if (value != type) {
337 switch (value) {
338 case 'n':
339 value = '\n';
340 break;
341 case 't':
342 value = '\t';
343 break;
344 case '\\':
345 break;
346 case '0'...'7': {
347 int nr = 2;
348 value -= '0';
349 while (next >= '0' && next <= '9') {
350 value = (value << 3) + (next-'0');
351 next = nextchar(stream);
352 if (!--nr)
353 break;
355 value &= 0xff;
356 break;
358 case 'x': {
359 int hex = hexval(next);
360 if (hex < 16) {
361 value = hex;
362 next = nextchar(stream);
363 while ((hex = hexval(next)) < 16) {
364 value = (value << 4) + hex;
365 next = nextchar(stream);
367 value &= 0xff;
368 break;
371 /* Fallthrough */
372 default:
373 warn(stream->token, "Unknown escape '%c'", value);
376 /* Mark it as escaped */
377 value |= 0x100;
379 *valp = value;
380 return next;
383 static int get_char_token(int next, stream_t *stream)
385 int value;
386 struct token *token;
388 next = escapechar(next, '\'', stream, &value);
389 if (value == '\'' || next != '\'') {
390 warn(stream->token, "Bad character constant");
391 drop_token(stream);
392 return next;
395 token = stream->token;
396 token->type = TOKEN_CHAR;
397 token->character = value & 0xff;
399 add_token(stream);
400 return nextchar(stream);
403 static int get_string_token(int next, stream_t *stream)
405 static char buffer[512];
406 struct string *string;
407 struct token *token;
408 int len = 0;
410 for (;;) {
411 int val;
412 next = escapechar(next, '"', stream, &val);
413 if (val == '"')
414 break;
415 if (next == EOF) {
416 warn(stream->token, "Enf of file in middle of string");
417 return next;
419 if (len < sizeof(buffer)) {
420 buffer[len] = val;
421 len++;
426 if (len > 256)
427 warn(stream->token, "String too long");
429 string = __alloc_string(len+1);
430 memcpy(string->data, buffer, len);
431 string->data[len] = '\0';
432 string->length = len+1;
434 /* Pass it on.. */
435 token = stream->token;
436 token->type = TOKEN_STRING;
437 token->string = string;
438 add_token(stream);
440 return next;
443 static int drop_stream_eoln(stream_t *stream)
445 int next = nextchar(stream);
446 drop_token(stream);
447 for (;;) {
448 int curr = next;
449 if (curr == EOF)
450 return next;
451 next = nextchar(stream);
452 if (curr == '\n')
453 return next;
457 static int drop_stream_comment(stream_t *stream)
459 int next = nextchar(stream);
460 drop_token(stream);
461 for (;;) {
462 int curr = next;
463 if (curr == EOF) {
464 warn(stream->token, "End of file in the middle of a comment");
465 return curr;
467 next = nextchar(stream);
468 if (curr == '*' && next == '/')
469 break;
471 return nextchar(stream);
474 unsigned char combinations[][3] = COMBINATION_STRINGS;
476 #define NR_COMBINATIONS (sizeof(combinations)/3)
478 static int get_one_special(int c, stream_t *stream)
480 struct token *token;
481 unsigned char c1, c2, c3;
482 int next, value, i;
483 char *comb;
485 next = nextchar(stream);
488 * Check for strings, character constants, and comments
490 switch (c) {
491 case '"':
492 return get_string_token(next, stream);
493 case '\'':
494 return get_char_token(next, stream);
495 case '/':
496 if (next == '/')
497 return drop_stream_eoln(stream);
498 if (next == '*')
499 return drop_stream_comment(stream);
503 * Check for combinations
505 value = c;
506 comb = combinations[0];
507 c1 = c; c2 = next; c3 = 0;
508 for (i = 0; i < NR_COMBINATIONS; i++) {
509 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
510 value = i + SPECIAL_BASE;
511 next = nextchar(stream);
512 if (c3)
513 break;
514 c3 = next;
516 comb += 3;
519 /* Pass it on.. */
520 token = stream->token;
521 token->type = TOKEN_SPECIAL;
522 token->special = value;
523 add_token(stream);
524 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1 << IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE - 1)

/*
 * Incremental identifier hash: start with the first character, fold in
 * each further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash) * 11 + (c))
#define ident_hash_end(hash)		(((hash) + ((hash) >> IDENT_HASH_BITS)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in a bucket are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup statistics: existing identifiers found vs. new ones inserted. */
int ident_hit, ident_miss;
538 void show_identifier_stats(void)
540 int i;
541 int distribution[100];
543 fprintf(stderr, "identifiers: %d hits, %d misses\n",
544 ident_hit, ident_miss);
546 for (i = 0; i < 100; i++)
547 distribution[i] = 0;
549 for (i = 0; i < IDENT_HASH_SIZE; i++) {
550 struct ident * ident = hash_table[i];
551 int count = 0;
553 while (ident) {
554 count++;
555 ident = ident->next;
557 if (count > 99)
558 count = 99;
559 distribution[count]++;
562 for (i = 0; i < 100; i++) {
563 if (distribution[i])
564 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
568 static struct ident *alloc_ident(const char *name, int len)
570 struct ident *ident = __alloc_ident(len);
571 ident->symbols = NULL;
572 ident->len = len;
573 memcpy(ident->name, name, len);
574 return ident;
577 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
579 ident->next = hash_table[hash];
580 hash_table[hash] = ident;
581 ident_miss++;
582 return ident;
585 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
587 struct ident *ident;
589 ident = hash_table[hash];
590 while (ident) {
591 if (ident->len == len && !memcmp(ident->name, name, len)) {
592 ident_hit++;
593 return ident;
595 ident = ident->next;
598 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash-table bucket for the first 'len' bytes of 'name',
 * using the same init/add/end steps as the identifier scanner.
 *
 * An empty name (len <= 0) maps to bucket 0 without reading 'name';
 * otherwise the pre-decrementing loop would never see len reach zero and
 * would run far past the end of the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0)
		return 0;

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
614 struct ident *hash_ident(struct ident *ident)
616 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for a NUL-terminated name, creating it if it is
 * not in the hash table yet.
 */
struct ident *built_in_ident(const char *name)
{
	int length = strlen(name);
	unsigned long bucket = hash_name(name, length);

	return create_hashed_ident(name, length, bucket);
}
625 struct token *built_in_token(int stream, const char *name)
627 struct token *token;
629 token = __alloc_token(0);
630 token->stream = stream;
631 token->type = TOKEN_IDENT;
632 token->ident = built_in_ident(name);
633 return token;
636 static int get_one_identifier(int c, stream_t *stream)
638 struct token *token;
639 struct ident *ident;
640 unsigned long hash;
641 char buf[256];
642 int len = 1;
643 int next;
645 hash = ident_hash_init(c);
646 buf[0] = c;
647 for (;;) {
648 next = nextchar(stream);
649 switch (next) {
650 case '0'...'9':
651 case 'a'...'z':
652 case 'A'...'Z':
653 case '_':
654 if (len < sizeof(buf)) {
655 hash = ident_hash_add(hash, next);
656 buf[len] = next;
657 len++;
659 continue;
661 break;
663 hash = ident_hash_end(hash);
665 ident = create_hashed_ident(buf, len, hash);
667 /* Pass it on.. */
668 token = stream->token;
669 token->type = TOKEN_IDENT;
670 token->ident = ident;
671 add_token(stream);
672 return next;
675 static int get_one_token(int c, stream_t *stream)
677 switch (c) {
678 case '0'...'9':
679 return get_one_number(c, stream);
680 case 'a'...'z':
681 case 'A'...'Z':
682 case '_':
683 return get_one_identifier(c, stream);
684 default:
685 return get_one_special(c, stream);
689 struct token * tokenize(const char *name, int fd, struct token *endtoken)
691 struct token *begin;
692 stream_t stream;
693 int c, idx;
695 idx = init_stream(name, fd);
696 if (idx < 0)
697 return endtoken;
699 stream.stream = idx;
700 stream.token = NULL;
701 stream.line = 1;
702 stream.newline = 1;
703 stream.whitespace = 0;
704 stream.pos = 0;
705 stream.fd = fd;
706 stream.offset = 0;
707 stream.size = 0;
709 begin = alloc_token(&stream);
710 begin->type = TOKEN_STREAMBEGIN;
711 stream.tokenlist = &begin->next;
713 c = nextchar(&stream);
714 while (c != EOF) {
715 if (c == '\\') {
716 c = nextchar(&stream);
717 stream.newline = 0;
718 stream.whitespace = 1;
719 continue;
721 if (!isspace(c)) {
722 struct token *token = alloc_token(&stream);
723 token->newline = stream.newline;
724 token->whitespace = stream.whitespace;
725 stream.newline = 0;
726 stream.whitespace = 0;
727 stream.token = token;
728 c = get_one_token(c, &stream);
729 continue;
731 stream.whitespace = 1;
732 c = nextchar(&stream);
734 mark_eof(&stream, endtoken);
735 return begin;