Shrink "struct token" by moving "noexpand" into the position flags.
[smatch.git] / tokenize.c
blob eaeb888023745cc4e52f791cb265c3fbece7714e
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <sys/stat.h>
19 #include "lib.h"
20 #include "token.h"
21 #include "symbol.h"
/* Value nextchar() returns once read() yields no more input. */
#define EOF (-1)

/* Byte count nextchar() asks read() for; also the size of tokenize()'s buffer. */
#define BUFSIZE (8192)

/* Index that the next successful init_stream() call hands out. */
int input_stream_nr = 0;

/* Table of registered streams; init_stream() grows it with realloc(). */
struct stream *input_streams;

/* Number of entries input_streams currently has room for. */
static int input_streams_allocated;
31 typedef struct {
32 int fd, offset, size;
33 struct position pos;
34 struct token **tokenlist;
35 struct token *token;
36 unsigned char *buffer;
37 } stream_t;
40 const char *show_special(int val)
42 static const char *combinations[] = COMBINATION_STRINGS;
43 static char buffer[4];
45 buffer[0] = val;
46 buffer[1] = 0;
47 if (val >= SPECIAL_BASE)
48 strcpy(buffer, combinations[val - SPECIAL_BASE]);
49 return buffer;
52 const char *show_ident(const struct ident *ident)
54 static char buffer[256];
55 if (!ident)
56 return "<noident>";
57 sprintf(buffer, "%.*s", ident->len, ident->name);
58 return buffer;
/*
 * Append the C-source spelling of character 'c' at 'ptr'.
 *
 * ptr:    where to write; the caller must provide room for up to four
 *         characters plus the NUL that sprintf() adds for octal escapes.
 * c:      the character to show.
 * escape: a character that additionally needs a backslash (the quote
 *         character of the surrounding literal).
 * next:   the character that follows 'c' in the literal; when it is a
 *         digit, octal escapes are padded to three digits so that the
 *         digit cannot be read as part of the escape.
 *
 * Returns the position just past the last character written.
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			return ptr + sprintf(ptr, "%03o", c);
		return ptr + sprintf(ptr, "%o", c);
	}

	/* Printable: only the quote character and backslash need escaping. */
	if (c == '\\' || c == escape)
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
84 const char *show_string(const struct string *string)
86 static char buffer[256];
87 char *ptr;
88 int i;
90 ptr = buffer;
91 *ptr++ = '"';
92 for (i = 0; i < string->length-1; i++) {
93 const unsigned char *p = string->data + i;
94 ptr = charstr(ptr, p[0], '"', p[1]);
96 *ptr++ = '"';
97 *ptr = '\0';
98 return buffer;
101 const char *show_token(const struct token *token)
103 static char buffer[256];
105 if (!token)
106 return "<no token>";
107 switch (token_type(token)) {
108 case TOKEN_ERROR:
109 return "syntax error";
111 case TOKEN_EOF:
112 return "end-of-input";
114 case TOKEN_IDENT:
115 return show_ident(token->ident);
117 case TOKEN_STRING:
118 return show_string(token->string);
120 case TOKEN_NUMBER:
121 return token->number;
123 case TOKEN_SPECIAL:
124 return show_special(token->special);
126 case TOKEN_CHAR: {
127 char *ptr = buffer;
128 int c = token->character;
129 *ptr++ = '\'';
130 ptr = charstr(ptr, c, '\'', 0);
131 *ptr++ = '\'';
132 *ptr++ = '\0';
133 return buffer;
136 case TOKEN_STREAMBEGIN:
137 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
138 return buffer;
140 case TOKEN_STREAMEND:
141 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
142 return buffer;
144 default:
145 return "WTF???";
149 int init_stream(const char *name, int fd)
151 int stream = input_stream_nr;
152 struct stream *current;
154 if (stream >= input_streams_allocated) {
155 int newalloc = stream * 4 / 3 + 10;
156 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
157 if (!input_streams)
158 die("Unable to allocate more streams space");
159 input_streams_allocated = newalloc;
161 current = input_streams + stream;
162 memset(current, 0, sizeof(*current));
163 current->name = name;
164 current->fd = fd;
165 current->constant = -1; // "unknown"
166 if (fd > 0) {
167 int i;
168 struct stat st;
170 fstat(fd, &st);
171 current->dev = st.st_dev;
172 current->ino = st.st_ino;
173 for (i = 0; i < stream; i++) {
174 struct stream *s = input_streams + i;
175 if (s->dev == st.st_dev && s->ino == st.st_ino) {
176 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
177 return -1;
181 input_stream_nr = stream+1;
182 return stream;
185 static struct token * alloc_token(stream_t *stream)
187 struct token *token = __alloc_token(0);
188 token->pos = stream->pos;
189 return token;
192 static int nextchar(stream_t *stream)
194 int offset = stream->offset;
195 int size = stream->size;
196 int c;
197 int complain = -1;
199 repeat:
200 complain++;
201 if (offset >= size) {
202 size = read(stream->fd, stream->buffer, BUFSIZE);
203 if (size <= 0)
204 return EOF;
205 stream->size = size;
206 stream->offset = 0;
207 offset = 0;
209 c = stream->buffer[offset];
210 stream->offset = ++offset;
212 stream->pos.pos++;
214 /* Ignore DOS-stype '\r' characters */
215 if (c == '\r')
216 goto repeat;
218 if (c == '\n') {
219 stream->pos.line++;
220 stream->pos.newline = 1;
221 stream->pos.pos = 0;
222 complain = 0;
225 if (complain)
226 warn(stream->pos, "non-ASCII data stream");
228 return c;
231 struct token eof_token_entry;
233 static void mark_eof(stream_t *stream, struct token *end_token)
235 struct token *end;
237 end = alloc_token(stream);
238 token_type(end) = TOKEN_STREAMEND;
239 end->pos.newline = 1;
241 eof_token_entry.next = &eof_token_entry;
242 eof_token_entry.pos.newline = 1;
244 if (!end_token)
245 end_token = &eof_token_entry;
246 end->next = end_token;
247 *stream->tokenlist = end;
248 stream->tokenlist = NULL;
251 static void add_token(stream_t *stream)
253 struct token *token = stream->token;
255 stream->token = NULL;
256 token->next = NULL;
257 *stream->tokenlist = token;
258 stream->tokenlist = &token->next;
261 static void drop_token(stream_t *stream)
263 stream->pos.newline |= stream->token->pos.newline;
264 stream->pos.whitespace |= stream->token->pos.whitespace;
265 stream->token = NULL;
270 * pp-number:
271 * digit
272 * . digit
273 * pp-number digit
274 * pp-number identifier-nodigit
275 * pp-number e sign
276 * pp-number E sign
277 * pp-number p sign
278 * pp-number P sign
279 * pp-number .
281 static int get_one_number(int c, int next, stream_t *stream)
283 struct token *token;
284 static char buffer[256];
285 char *p = buffer, *buf;
286 int len;
288 *p++ = c;
289 for (;;) {
290 switch (next) {
291 case 'e': case 'E':
292 case 'p': case 'P':
293 *p++ = next;
294 next = nextchar(stream);
295 if (next != '-' && next != '+')
296 continue;
297 /* Fallthrough for sign of 'e'/'p' */
298 case '0'...'9':
299 case '.': case '_':
300 case 'a'...'d': case 'A'...'D':
301 case 'f'...'o': case 'F'...'O':
302 case 'q'...'z': case 'Q'...'Z':
303 *p++ = next;
304 next = nextchar(stream);
305 continue;
307 break;
309 *p++ = 0;
310 len = p - buffer;
311 buf = __alloc_bytes(len);
312 memcpy(buf, buffer, len);
314 token = stream->token;
315 token_type(token) = TOKEN_NUMBER;
316 token->number = buf;
317 add_token(stream);
319 return next;
322 static int escapechar(int first, int type, stream_t *stream, int *valp)
324 int next, value;
326 next = nextchar(stream);
327 value = first;
329 if (first == '\n')
330 warn(stream->pos, "Newline in string or character constant");
332 if (first == '\\' && next != EOF) {
333 value = next;
334 next = nextchar(stream);
335 if (value != type) {
336 switch (value) {
337 case 'a':
338 value = '\a';
339 break;
340 case 'b':
341 value = '\b';
342 break;
343 case 't':
344 value = '\t';
345 break;
346 case 'n':
347 value = '\n';
348 break;
349 case 'v':
350 value = '\v';
351 break;
352 case 'f':
353 value = '\f';
354 break;
355 case 'r':
356 value = '\r';
357 break;
358 case 'e':
359 value = '\e';
360 break;
361 case '\\':
362 break;
363 case '\'':
364 break;
365 case '"':
366 break;
367 case '\n':
368 next = escapechar(next, type, stream, &value);
369 break;
370 case '0'...'7': {
371 int nr = 2;
372 value -= '0';
373 while (next >= '0' && next <= '9') {
374 value = (value << 3) + (next-'0');
375 next = nextchar(stream);
376 if (!--nr)
377 break;
379 value &= 0xff;
380 break;
382 case 'x': {
383 int hex = hexval(next);
384 if (hex < 16) {
385 value = hex;
386 next = nextchar(stream);
387 while ((hex = hexval(next)) < 16) {
388 value = (value << 4) + hex;
389 next = nextchar(stream);
391 value &= 0xff;
392 break;
395 /* Fallthrough */
396 default:
397 warn(stream->pos, "Unknown escape '%c'", value);
400 /* Mark it as escaped */
401 value |= 0x100;
403 *valp = value;
404 return next;
407 static int get_char_token(int next, stream_t *stream)
409 int value;
410 struct token *token;
412 next = escapechar(next, '\'', stream, &value);
413 if (value == '\'' || next != '\'') {
414 warn(stream->pos, "Bad character constant");
415 drop_token(stream);
416 return next;
419 token = stream->token;
420 token_type(token) = TOKEN_CHAR;
421 token->character = value & 0xff;
423 add_token(stream);
424 return nextchar(stream);
427 static int get_string_token(int next, stream_t *stream)
429 static char buffer[512];
430 struct string *string;
431 struct token *token;
432 int len = 0;
434 for (;;) {
435 int val;
436 next = escapechar(next, '"', stream, &val);
437 if (val == '"')
438 break;
439 if (next == EOF) {
440 warn(stream->pos, "Enf of file in middle of string");
441 return next;
443 if (len < sizeof(buffer)) {
444 buffer[len] = val;
445 len++;
450 if (len > 256)
451 warn(stream->pos, "String too long");
453 string = __alloc_string(len+1);
454 memcpy(string->data, buffer, len);
455 string->data[len] = '\0';
456 string->length = len+1;
458 /* Pass it on.. */
459 token = stream->token;
460 token_type(token) = TOKEN_STRING;
461 token->string = string;
462 add_token(stream);
464 return next;
467 static int drop_stream_eoln(stream_t *stream)
469 int next = nextchar(stream);
470 drop_token(stream);
471 for (;;) {
472 int curr = next;
473 if (curr == EOF)
474 return next;
475 next = nextchar(stream);
476 if (curr == '\n')
477 return next;
481 static int drop_stream_comment(stream_t *stream)
483 int next = nextchar(stream);
484 drop_token(stream);
485 for (;;) {
486 int curr = next;
487 if (curr == EOF) {
488 warn(stream->pos, "End of file in the middle of a comment");
489 return curr;
491 next = nextchar(stream);
492 if (curr == '*' && next == '/')
493 break;
495 return nextchar(stream);
498 unsigned char combinations[][3] = COMBINATION_STRINGS;
500 #define NR_COMBINATIONS (sizeof(combinations)/3)
502 static int get_one_special(int c, stream_t *stream)
504 struct token *token;
505 unsigned char c1, c2, c3;
506 int next, value, i;
507 char *comb;
509 next = nextchar(stream);
512 * Check for numbers, strings, character constants, and comments
514 switch (c) {
515 case '.':
516 if (next >= '0' && next <= '9')
517 return get_one_number(c, next, stream);
518 break;
519 case '"':
520 return get_string_token(next, stream);
521 case '\'':
522 return get_char_token(next, stream);
523 case '/':
524 if (next == '/')
525 return drop_stream_eoln(stream);
526 if (next == '*')
527 return drop_stream_comment(stream);
531 * Check for combinations
533 value = c;
534 comb = combinations[0];
535 c1 = c; c2 = next; c3 = 0;
536 for (i = 0; i < NR_COMBINATIONS; i++) {
537 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
538 value = i + SPECIAL_BASE;
539 next = nextchar(stream);
540 if (c3)
541 break;
542 c3 = next;
544 comb += 3;
547 /* Pass it on.. */
548 token = stream->token;
549 token_type(token) = TOKEN_SPECIAL;
550 token->special = value;
551 add_token(stream);
552 return next;
/* Identifier hash table geometry: 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, fold in
 * every further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in a bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookups that found an existing identifier. */
int ident_hit;
/* Identifiers inserted into the table. */
int ident_miss;
566 void show_identifier_stats(void)
568 int i;
569 int distribution[100];
571 fprintf(stderr, "identifiers: %d hits, %d misses\n",
572 ident_hit, ident_miss);
574 for (i = 0; i < 100; i++)
575 distribution[i] = 0;
577 for (i = 0; i < IDENT_HASH_SIZE; i++) {
578 struct ident * ident = hash_table[i];
579 int count = 0;
581 while (ident) {
582 count++;
583 ident = ident->next;
585 if (count > 99)
586 count = 99;
587 distribution[count]++;
590 for (i = 0; i < 100; i++) {
591 if (distribution[i])
592 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
596 static struct ident *alloc_ident(const char *name, int len)
598 struct ident *ident = __alloc_ident(len);
599 ident->symbols = NULL;
600 ident->len = len;
601 ident->tainted = 0;
602 memcpy(ident->name, name, len);
603 return ident;
606 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
608 ident->next = hash_table[hash];
609 hash_table[hash] = ident;
610 ident_miss++;
611 return ident;
614 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
616 struct ident *ident;
618 ident = hash_table[hash];
619 while (ident) {
620 if (ident->len == len && !memcmp(ident->name, name, len)) {
621 ident_hit++;
622 return ident;
624 ident = ident->next;
627 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash bucket for the first 'len' bytes of 'name', using
 * the same init/add/end steps as get_one_identifier().
 *
 * An empty name (len <= 0) hashes as if seeded with 0 and reads
 * nothing from 'name'.  The original loop "while (--len)" never
 * terminated sensibly for len == 0 and read far beyond the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0UL));

	hash = ident_hash_init(*p++);
	while (--len > 0) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
643 struct ident *hash_ident(struct ident *ident)
645 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string 'name', creating
 * it if the hash table does not have it yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
654 struct token *built_in_token(int stream, const char *name)
656 struct token *token;
658 token = __alloc_token(0);
659 token->pos.stream = stream;
660 token_type(token) = TOKEN_IDENT;
661 token->ident = built_in_ident(name);
662 return token;
665 static int get_one_identifier(int c, stream_t *stream)
667 struct token *token;
668 struct ident *ident;
669 unsigned long hash;
670 char buf[256];
671 int len = 1;
672 int next;
674 hash = ident_hash_init(c);
675 buf[0] = c;
676 for (;;) {
677 next = nextchar(stream);
678 switch (next) {
679 case '0'...'9':
680 case 'a'...'z':
681 case 'A'...'Z':
682 case '_':
683 if (len < sizeof(buf)) {
684 hash = ident_hash_add(hash, next);
685 buf[len] = next;
686 len++;
688 continue;
690 break;
692 hash = ident_hash_end(hash);
694 ident = create_hashed_ident(buf, len, hash);
696 /* Pass it on.. */
697 token = stream->token;
698 token_type(token) = TOKEN_IDENT;
699 token->ident = ident;
700 add_token(stream);
701 return next;
704 static int get_one_token(int c, stream_t *stream)
706 switch (c) {
707 case '0'...'9':
708 return get_one_number(c, nextchar(stream), stream);
709 case 'a'...'z':
710 case 'A'...'Z':
711 case '_':
712 return get_one_identifier(c, stream);
713 default:
714 return get_one_special(c, stream);
718 static struct token *setup_stream(stream_t *stream, int idx, int fd,
719 unsigned char *buf, unsigned int buf_size)
721 struct token *begin;
723 stream->pos.stream = idx;
724 stream->pos.line = 1;
725 stream->pos.newline = 1;
726 stream->pos.whitespace = 0;
727 stream->pos.pos = 0;
728 stream->pos.noexpand = 0;
730 stream->token = NULL;
731 stream->fd = fd;
732 stream->offset = 0;
733 stream->size = buf_size;
734 stream->buffer = buf;
736 begin = alloc_token(stream);
737 token_type(begin) = TOKEN_STREAMBEGIN;
738 stream->tokenlist = &begin->next;
739 return begin;
742 static void tokenize_stream(stream_t *stream, struct token *endtoken)
744 int c = nextchar(stream);
745 while (c != EOF) {
746 if (c == '\\') {
747 c = nextchar(stream);
748 stream->pos.newline = 0;
749 stream->pos.whitespace = 1;
750 continue;
752 if (!isspace(c)) {
753 struct token *token = alloc_token(stream);
754 stream->token = token;
755 stream->pos.newline = 0;
756 stream->pos.whitespace = 0;
757 c = get_one_token(c, stream);
758 continue;
760 stream->pos.whitespace = 1;
761 c = nextchar(stream);
763 mark_eof(stream, endtoken);
766 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
768 stream_t stream;
769 struct token *begin;
771 begin = setup_stream(&stream, 0, -1, buffer, size);
772 tokenize_stream(&stream, endtoken);
773 return begin;
776 struct token * tokenize(const char *name, int fd, struct token *endtoken)
778 struct token *begin;
779 stream_t stream;
780 unsigned char buffer[BUFSIZE];
781 int idx;
783 idx = init_stream(name, fd);
784 if (idx < 0)
785 return endtoken;
787 begin = setup_stream(&stream, idx, fd, buffer, 0);
788 tokenize_stream(&stream, endtoken);
789 return begin;