Add a backend generic pointer to the symbol. Not that
[smatch.git] / tokenize.c
blob93218bbf1f70d3bff9519c4238b5ce67ca115de5
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
7 * Licensed under the Open Software License version 1.1
8 */
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <stdarg.h>
12 #include <stddef.h>
13 #include <string.h>
14 #include <ctype.h>
15 #include <unistd.h>
16 #include <sys/stat.h>
18 #include "lib.h"
19 #include "token.h"
20 #include "symbol.h"
/* Value nextchar() returns once no more input can be read. */
#define EOF (-1)

/* Number of entries of input_streams that are in use. */
int input_stream_nr = 0;
/* Table of registered input streams; grown by init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams currently has room for. */
static int input_streams_allocated;

/* Byte count requested from read() each time the input buffer is refilled. */
#define BUFSIZE (8192)
30 typedef struct {
31 int fd, offset, size;
32 struct position pos;
33 struct token **tokenlist;
34 struct token *token;
35 unsigned char *buffer;
36 } stream_t;
39 const char *show_special(int val)
41 static const char *combinations[] = COMBINATION_STRINGS;
42 static char buffer[4];
44 buffer[0] = val;
45 buffer[1] = 0;
46 if (val >= SPECIAL_BASE)
47 strcpy(buffer, combinations[val - SPECIAL_BASE]);
48 return buffer;
51 const char *show_ident(const struct ident *ident)
53 static char buffer[256];
54 if (!ident)
55 return "<noident>";
56 sprintf(buffer, "%.*s", ident->len, ident->name);
57 return buffer;
/*
 * Append the C-source spelling of character 'c' at 'ptr'.
 *
 * Printable characters are copied as-is, with a backslash in front
 * when the character equals 'escape' (the enclosing quote) or is a
 * backslash itself.  Newline and tab become \n and \t.  Anything else
 * is written as an octal escape: three digits when the following
 * character 'next' is a digit (so the escape cannot absorb it), the
 * shortest form otherwise.
 *
 * Returns the position just past the last character written.  The
 * octal forms are written with sprintf(), so a NUL follows them.
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	char *out = ptr;

	if (isprint(c)) {
		int needs_backslash = (c == escape) || (c == '\\');

		if (needs_backslash)
			*out++ = '\\';
		*out++ = c;
		return out;
	}

	*out++ = '\\';
	if (c == '\n') {
		*out++ = 'n';
		return out;
	}
	if (c == '\t') {
		*out++ = 't';
		return out;
	}

	if (isdigit(next))
		return out + sprintf(out, "%03o", c);
	return out + sprintf(out, "%o", c);
}
83 const char *show_string(const struct string *string)
85 static char buffer[256];
86 char *ptr;
87 int i;
89 ptr = buffer;
90 *ptr++ = '"';
91 for (i = 0; i < string->length-1; i++) {
92 const unsigned char *p = string->data + i;
93 ptr = charstr(ptr, p[0], '"', p[1]);
95 *ptr++ = '"';
96 *ptr = '\0';
97 return buffer;
100 const char *show_token(const struct token *token)
102 static char buffer[256];
104 if (!token)
105 return "<no token>";
106 switch (token_type(token)) {
107 case TOKEN_ERROR:
108 return "syntax error";
110 case TOKEN_EOF:
111 return "end-of-input";
113 case TOKEN_IDENT:
114 return show_ident(token->ident);
116 case TOKEN_STRING:
117 return show_string(token->string);
119 case TOKEN_INTEGER: {
120 const char *p = token->integer;
121 switch (*p) {
122 case 'o': // octal
123 case 'x': // hex
124 buffer[0] = '0';
125 strcpy(buffer+1, p+1);
126 return buffer;
127 default:
128 return p;
132 case TOKEN_FP:
133 return token->fp;
135 case TOKEN_SPECIAL:
136 return show_special(token->special);
138 case TOKEN_CHAR: {
139 char *ptr = buffer;
140 int c = token->character;
141 *ptr++ = '\'';
142 ptr = charstr(ptr, c, '\'', 0);
143 *ptr++ = '\'';
144 *ptr++ = '\0';
145 return buffer;
148 case TOKEN_STREAMBEGIN:
149 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
150 return buffer;
152 case TOKEN_STREAMEND:
153 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
154 return buffer;
156 default:
157 return "WTF???";
161 int init_stream(const char *name, int fd)
163 int stream = input_stream_nr;
164 struct stream *current;
166 if (stream >= input_streams_allocated) {
167 int newalloc = stream * 4 / 3 + 10;
168 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
169 if (!input_streams)
170 die("Unable to allocate more streams space");
171 input_streams_allocated = newalloc;
173 current = input_streams + stream;
174 memset(current, 0, sizeof(*current));
175 current->name = name;
176 current->fd = fd;
177 current->constant = -1; // "unknown"
178 if (fd > 0) {
179 int i;
180 struct stat st;
182 fstat(fd, &st);
183 current->dev = st.st_dev;
184 current->ino = st.st_ino;
185 for (i = 0; i < stream; i++) {
186 struct stream *s = input_streams + i;
187 if (s->dev == st.st_dev && s->ino == st.st_ino) {
188 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
189 return -1;
193 input_stream_nr = stream+1;
194 return stream;
197 static struct token * alloc_token(stream_t *stream)
199 struct token *token = __alloc_token(0);
200 token->pos = stream->pos;
201 return token;
204 static int nextchar(stream_t *stream)
206 int offset = stream->offset;
207 int size = stream->size;
208 int c;
210 if (offset >= size) {
211 size = read(stream->fd, stream->buffer, BUFSIZE);
212 if (size <= 0)
213 return EOF;
214 stream->size = size;
215 stream->offset = 0;
216 offset = 0;
218 c = stream->buffer[offset];
219 stream->offset = offset + 1;
220 stream->pos.pos++;
221 if (c == '\n') {
222 stream->pos.line++;
223 stream->pos.newline = 1;
224 stream->pos.pos = 0;
226 return c;
229 struct token eof_token_entry;
231 static void mark_eof(stream_t *stream, struct token *end_token)
233 struct token *end;
235 end = alloc_token(stream);
236 token_type(end) = TOKEN_STREAMEND;
237 end->pos.newline = 1;
239 eof_token_entry.next = &eof_token_entry;
240 eof_token_entry.pos.newline = 1;
242 if (!end_token)
243 end_token = &eof_token_entry;
244 end->next = end_token;
245 *stream->tokenlist = end;
246 stream->tokenlist = NULL;
249 static void add_token(stream_t *stream)
251 struct token *token = stream->token;
253 stream->token = NULL;
254 token->next = NULL;
255 *stream->tokenlist = token;
256 stream->tokenlist = &token->next;
259 static void drop_token(stream_t *stream)
261 stream->pos.newline |= stream->token->pos.newline;
262 stream->pos.whitespace |= stream->token->pos.whitespace;
263 stream->token = NULL;
266 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
268 char *buf = *p;
270 *buf++ = next;
271 for (;;) {
272 unsigned int n;
273 next = nextchar(stream);
274 n = hexval(next);
275 if (n >= base)
276 break;
277 *buf++ = next;
279 *p = buf;
280 return next;
283 static int do_integer(char *buffer, int len, int next, stream_t *stream)
285 struct token *token = stream->token;
286 void *buf;
288 while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
289 buffer[len++] = next;
290 next = nextchar(stream);
292 buffer[len++] = '\0';
293 buf = __alloc_bytes(len);
294 memcpy(buf, buffer, len);
295 token_type(token) = TOKEN_INTEGER;
296 token->integer = buf;
297 add_token(stream);
298 return next;
301 static int get_one_number(int c, stream_t *stream)
303 static char buffer[256];
304 int next = nextchar(stream);
305 char *p = buffer;
307 *p++ = c;
308 switch (next) {
309 case '0'...'7':
310 if (c == '0') {
311 buffer[0] = 'o';
312 next = get_base_number(8, &p, next, stream);
313 break;
315 /* fallthrough */
316 case '8'...'9':
317 next = get_base_number(10, &p, next, stream);
318 break;
319 case 'x': case 'X':
320 if (c == '0') {
321 buffer[0] = 'x';
322 next = get_base_number(16, &p, next, stream);
325 return do_integer(buffer, p - buffer, next, stream);
328 static int escapechar(int first, int type, stream_t *stream, int *valp)
330 int next, value;
332 next = nextchar(stream);
333 value = first;
335 if (first == '\n')
336 warn(stream->pos, "Newline in string or character constant");
338 if (first == '\\' && next != EOF) {
339 value = next;
340 next = nextchar(stream);
341 if (value != type) {
342 switch (value) {
343 case 'n':
344 value = '\n';
345 break;
346 case 't':
347 value = '\t';
348 break;
349 case '\\':
350 break;
351 case '\'':
352 break;
353 case '"':
354 break;
355 case '0'...'7': {
356 int nr = 2;
357 value -= '0';
358 while (next >= '0' && next <= '9') {
359 value = (value << 3) + (next-'0');
360 next = nextchar(stream);
361 if (!--nr)
362 break;
364 value &= 0xff;
365 break;
367 case 'x': {
368 int hex = hexval(next);
369 if (hex < 16) {
370 value = hex;
371 next = nextchar(stream);
372 while ((hex = hexval(next)) < 16) {
373 value = (value << 4) + hex;
374 next = nextchar(stream);
376 value &= 0xff;
377 break;
380 /* Fallthrough */
381 default:
382 warn(stream->pos, "Unknown escape '%c'", value);
385 /* Mark it as escaped */
386 value |= 0x100;
388 *valp = value;
389 return next;
392 static int get_char_token(int next, stream_t *stream)
394 int value;
395 struct token *token;
397 next = escapechar(next, '\'', stream, &value);
398 if (value == '\'' || next != '\'') {
399 warn(stream->pos, "Bad character constant");
400 drop_token(stream);
401 return next;
404 token = stream->token;
405 token_type(token) = TOKEN_CHAR;
406 token->character = value & 0xff;
408 add_token(stream);
409 return nextchar(stream);
412 static int get_string_token(int next, stream_t *stream)
414 static char buffer[512];
415 struct string *string;
416 struct token *token;
417 int len = 0;
419 for (;;) {
420 int val;
421 next = escapechar(next, '"', stream, &val);
422 if (val == '"')
423 break;
424 if (next == EOF) {
425 warn(stream->pos, "Enf of file in middle of string");
426 return next;
428 if (len < sizeof(buffer)) {
429 buffer[len] = val;
430 len++;
435 if (len > 256)
436 warn(stream->pos, "String too long");
438 string = __alloc_string(len+1);
439 memcpy(string->data, buffer, len);
440 string->data[len] = '\0';
441 string->length = len+1;
443 /* Pass it on.. */
444 token = stream->token;
445 token_type(token) = TOKEN_STRING;
446 token->string = string;
447 add_token(stream);
449 return next;
452 static int drop_stream_eoln(stream_t *stream)
454 int next = nextchar(stream);
455 drop_token(stream);
456 for (;;) {
457 int curr = next;
458 if (curr == EOF)
459 return next;
460 next = nextchar(stream);
461 if (curr == '\n')
462 return next;
466 static int drop_stream_comment(stream_t *stream)
468 int next = nextchar(stream);
469 drop_token(stream);
470 for (;;) {
471 int curr = next;
472 if (curr == EOF) {
473 warn(stream->pos, "End of file in the middle of a comment");
474 return curr;
476 next = nextchar(stream);
477 if (curr == '*' && next == '/')
478 break;
480 return nextchar(stream);
483 unsigned char combinations[][3] = COMBINATION_STRINGS;
485 #define NR_COMBINATIONS (sizeof(combinations)/3)
487 static int get_one_special(int c, stream_t *stream)
489 struct token *token;
490 unsigned char c1, c2, c3;
491 int next, value, i;
492 char *comb;
494 next = nextchar(stream);
497 * Check for strings, character constants, and comments
499 switch (c) {
500 case '"':
501 return get_string_token(next, stream);
502 case '\'':
503 return get_char_token(next, stream);
504 case '/':
505 if (next == '/')
506 return drop_stream_eoln(stream);
507 if (next == '*')
508 return drop_stream_comment(stream);
512 * Check for combinations
514 value = c;
515 comb = combinations[0];
516 c1 = c; c2 = next; c3 = 0;
517 for (i = 0; i < NR_COMBINATIONS; i++) {
518 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
519 value = i + SPECIAL_BASE;
520 next = nextchar(stream);
521 if (c3)
522 break;
523 c3 = next;
525 comb += 3;
528 /* Pass it on.. */
529 token = stream->token;
530 token_type(token) = TOKEN_SPECIAL;
531 token->special = value;
532 add_token(stream);
533 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, fold in
 * each further character, then reduce to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; entries are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookups that found an existing identifier / insertions of a new one. */
int ident_hit, ident_miss;
547 void show_identifier_stats(void)
549 int i;
550 int distribution[100];
552 fprintf(stderr, "identifiers: %d hits, %d misses\n",
553 ident_hit, ident_miss);
555 for (i = 0; i < 100; i++)
556 distribution[i] = 0;
558 for (i = 0; i < IDENT_HASH_SIZE; i++) {
559 struct ident * ident = hash_table[i];
560 int count = 0;
562 while (ident) {
563 count++;
564 ident = ident->next;
566 if (count > 99)
567 count = 99;
568 distribution[count]++;
571 for (i = 0; i < 100; i++) {
572 if (distribution[i])
573 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
577 static struct ident *alloc_ident(const char *name, int len)
579 struct ident *ident = __alloc_ident(len);
580 ident->symbols = NULL;
581 ident->len = len;
582 memcpy(ident->name, name, len);
583 return ident;
586 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
588 ident->next = hash_table[hash];
589 hash_table[hash] = ident;
590 ident_miss++;
591 return ident;
594 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
596 struct ident *ident;
598 ident = hash_table[hash];
599 while (ident) {
600 if (ident->len == len && !memcmp(ident->name, name, len)) {
601 ident_hit++;
602 return ident;
604 ident = ident->next;
607 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash bucket of the 'len' bytes at 'name', using the
 * same init/add/end steps that get_one_identifier() applies
 * incrementally.
 *
 * A non-positive length hashes as if the seed character were zero,
 * without reading 'name'; previously the loop counter went negative
 * and the loop ran far past the end of the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0) {
		hash = ident_hash_init(0);
		return ident_hash_end(hash);
	}

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
623 struct ident *hash_ident(struct ident *ident)
625 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for a NUL-terminated name, creating and
 * hashing it if it does not exist yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
634 struct token *built_in_token(int stream, const char *name)
636 struct token *token;
638 token = __alloc_token(0);
639 token->pos.stream = stream;
640 token_type(token) = TOKEN_IDENT;
641 token->ident = built_in_ident(name);
642 return token;
645 static int get_one_identifier(int c, stream_t *stream)
647 struct token *token;
648 struct ident *ident;
649 unsigned long hash;
650 char buf[256];
651 int len = 1;
652 int next;
654 hash = ident_hash_init(c);
655 buf[0] = c;
656 for (;;) {
657 next = nextchar(stream);
658 switch (next) {
659 case '0'...'9':
660 case 'a'...'z':
661 case 'A'...'Z':
662 case '_':
663 if (len < sizeof(buf)) {
664 hash = ident_hash_add(hash, next);
665 buf[len] = next;
666 len++;
668 continue;
670 break;
672 hash = ident_hash_end(hash);
674 ident = create_hashed_ident(buf, len, hash);
676 /* Pass it on.. */
677 token = stream->token;
678 token_type(token) = TOKEN_IDENT;
679 token->ident = ident;
680 add_token(stream);
681 return next;
684 static int get_one_token(int c, stream_t *stream)
686 switch (c) {
687 case '0'...'9':
688 return get_one_number(c, stream);
689 case 'a'...'z':
690 case 'A'...'Z':
691 case '_':
692 return get_one_identifier(c, stream);
693 default:
694 return get_one_special(c, stream);
698 static struct token *setup_stream(stream_t *stream, int idx, int fd,
699 unsigned char *buf, unsigned int buf_size)
701 struct token *begin;
703 stream->pos.stream = idx;
704 stream->pos.line = 1;
705 stream->pos.newline = 1;
706 stream->pos.whitespace = 0;
707 stream->pos.pos = 0;
709 stream->token = NULL;
710 stream->fd = fd;
711 stream->offset = 0;
712 stream->size = buf_size;
713 stream->buffer = buf;
715 begin = alloc_token(stream);
716 token_type(begin) = TOKEN_STREAMBEGIN;
717 stream->tokenlist = &begin->next;
718 return begin;
721 static void tokenize_stream(stream_t *stream, struct token *endtoken)
723 int c = nextchar(stream);
724 while (c != EOF) {
725 if (c == '\\') {
726 c = nextchar(stream);
727 stream->pos.newline = 0;
728 stream->pos.whitespace = 1;
729 continue;
731 if (!isspace(c)) {
732 struct token *token = alloc_token(stream);
733 stream->token = token;
734 stream->pos.newline = 0;
735 stream->pos.whitespace = 0;
736 c = get_one_token(c, stream);
737 continue;
739 stream->pos.whitespace = 1;
740 c = nextchar(stream);
742 mark_eof(stream, endtoken);
745 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
747 stream_t stream;
748 struct token *begin;
750 begin = setup_stream(&stream, 0, -1, buffer, size);
751 tokenize_stream(&stream, endtoken);
752 return begin;
755 struct token * tokenize(const char *name, int fd, struct token *endtoken)
757 struct token *begin;
758 stream_t stream;
759 unsigned char buffer[BUFSIZE];
760 int idx;
762 idx = init_stream(name, fd);
763 if (idx < 0)
764 return endtoken;
766 begin = setup_stream(&stream, idx, fd, buffer, 0);
767 tokenize_stream(&stream, endtoken);
768 return begin;