Start doing constant strings right: do proper concatenation of strings,
[smatch.git] / tokenize.c
blob27fc835463519ec3110c64426ec1e79101d5535a
/*
 * This is a really stupid C tokenizer. It doesn't do any include
 * files or anything complex at all. That's the pre-processor.
 *
 * Copyright (C) 2003 Linus Torvalds, all rights reserved.
 */
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <stdarg.h>
10 #include <stddef.h>
11 #include <string.h>
12 #include <ctype.h>
13 #include <unistd.h>
14 #include <sys/stat.h>
16 #include "lib.h"
17 #include "token.h"
18 #include "symbol.h"
20 #define EOF (-1)
22 int input_stream_nr = 0;
23 struct stream *input_streams;
24 static int input_streams_allocated;
26 #define BUFSIZE (8192)
28 typedef struct {
29 int fd, offset, size;
30 struct position pos;
31 struct token **tokenlist;
32 struct token *token;
33 unsigned char *buffer;
34 } stream_t;
37 const char *show_special(int val)
39 static const char *combinations[] = COMBINATION_STRINGS;
40 static char buffer[4];
42 buffer[0] = val;
43 buffer[1] = 0;
44 if (val >= SPECIAL_BASE)
45 strcpy(buffer, combinations[val - SPECIAL_BASE]);
46 return buffer;
49 const char *show_ident(const struct ident *ident)
51 static char buffer[256];
52 if (!ident)
53 return "<noident>";
54 sprintf(buffer, "%.*s", ident->len, ident->name);
55 return buffer;
/*
 * Append the source-code spelling of character 'c' at 'ptr'.
 *
 * Printable characters are copied, preceded by a backslash when they are
 * the backslash itself or the 'escape' (quote) character. Newline and tab
 * become \n and \t. Anything else becomes an octal escape, padded to three
 * digits when 'next' is a digit so the following character cannot be read
 * as part of the escape.
 *
 * Returns the position just past the last character written. Only the
 * octal form stores a terminating NUL (at the returned position).
 */
char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			return ptr + sprintf(ptr, "%03o", c);
		return ptr + sprintf(ptr, "%o", c);
	}

	if (c == '\\' || c == escape)
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
81 const char *show_string(const struct string *string)
83 static char buffer[256];
84 char *ptr;
85 int i;
87 ptr = buffer;
88 *ptr++ = '"';
89 for (i = 0; i < string->length-1; i++) {
90 const unsigned char *p = string->data + i;
91 ptr = charstr(ptr, p[0], '"', p[1]);
93 *ptr++ = '"';
94 *ptr = '\0';
95 return buffer;
98 const char *show_token(const struct token *token)
100 static char buffer[256];
102 if (!token)
103 return "<no token>";
104 switch (token_type(token)) {
105 case TOKEN_ERROR:
106 return "syntax error";
108 case TOKEN_EOF:
109 return "end-of-input";
111 case TOKEN_IDENT:
112 return show_ident(token->ident);
114 case TOKEN_STRING:
115 return show_string(token->string);
117 case TOKEN_INTEGER: {
118 const char *p = token->integer;
119 switch (*p) {
120 case 'o': // octal
121 case 'x': // hex
122 buffer[0] = '0';
123 strcpy(buffer+1, p+1);
124 return buffer;
125 default:
126 return p;
130 case TOKEN_FP:
131 return token->fp;
133 case TOKEN_SPECIAL:
134 return show_special(token->special);
136 case TOKEN_CHAR: {
137 char *ptr = buffer;
138 int c = token->character;
139 *ptr++ = '\'';
140 ptr = charstr(ptr, c, '\'', 0);
141 *ptr++ = '\'';
142 *ptr++ = '\0';
143 return buffer;
146 case TOKEN_STREAMBEGIN:
147 sprintf(buffer, "<beginning of '%s'>", (input_streams + token->pos.stream)->name);
148 return buffer;
150 case TOKEN_STREAMEND:
151 sprintf(buffer, "<end of '%s'>", (input_streams + token->pos.stream)->name);
152 return buffer;
154 default:
155 return "WTF???";
159 int init_stream(const char *name, int fd)
161 int stream = input_stream_nr;
162 struct stream *current;
164 if (stream >= input_streams_allocated) {
165 int newalloc = stream * 4 / 3 + 10;
166 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
167 if (!input_streams)
168 die("Unable to allocate more streams space");
169 input_streams_allocated = newalloc;
171 current = input_streams + stream;
172 memset(current, 0, sizeof(*current));
173 current->name = name;
174 current->fd = fd;
175 current->constant = -1; // "unknown"
176 if (fd > 0) {
177 int i;
178 struct stat st;
180 fstat(fd, &st);
181 current->dev = st.st_dev;
182 current->ino = st.st_ino;
183 for (i = 0; i < stream; i++) {
184 struct stream *s = input_streams + i;
185 if (s->dev == st.st_dev && s->ino == st.st_ino) {
186 if (s->constant > 0 && lookup_symbol(s->protect, NS_PREPROCESSOR))
187 return -1;
191 input_stream_nr = stream+1;
192 return stream;
195 static struct token * alloc_token(stream_t *stream)
197 struct token *token = __alloc_token(0);
198 token->pos = stream->pos;
199 return token;
202 static int nextchar(stream_t *stream)
204 int offset = stream->offset;
205 int size = stream->size;
206 int c;
208 if (offset >= size) {
209 size = read(stream->fd, stream->buffer, BUFSIZE);
210 if (size <= 0)
211 return EOF;
212 stream->size = size;
213 stream->offset = 0;
214 offset = 0;
216 c = stream->buffer[offset];
217 stream->offset = offset + 1;
218 stream->pos.pos++;
219 if (c == '\n') {
220 stream->pos.line++;
221 stream->pos.newline = 1;
222 stream->pos.pos = 0;
224 return c;
227 struct token eof_token_entry;
229 static void mark_eof(stream_t *stream, struct token *end_token)
231 struct token *end;
233 end = alloc_token(stream);
234 token_type(end) = TOKEN_STREAMEND;
235 end->pos.newline = 1;
237 eof_token_entry.next = &eof_token_entry;
238 eof_token_entry.pos.newline = 1;
240 if (!end_token)
241 end_token = &eof_token_entry;
242 end->next = end_token;
243 *stream->tokenlist = end;
244 stream->tokenlist = NULL;
247 static void add_token(stream_t *stream)
249 struct token *token = stream->token;
251 stream->token = NULL;
252 token->next = NULL;
253 *stream->tokenlist = token;
254 stream->tokenlist = &token->next;
257 static void drop_token(stream_t *stream)
259 stream->pos.newline |= stream->token->pos.newline;
260 stream->pos.whitespace |= stream->token->pos.whitespace;
261 stream->token = NULL;
264 static int get_base_number(unsigned int base, char **p, int next, stream_t *stream)
266 char *buf = *p;
268 *buf++ = next;
269 for (;;) {
270 unsigned int n;
271 next = nextchar(stream);
272 n = hexval(next);
273 if (n >= base)
274 break;
275 *buf++ = next;
277 *p = buf;
278 return next;
281 static int do_integer(char *buffer, int len, int next, stream_t *stream)
283 struct token *token = stream->token;
284 void *buf;
286 while (next == 'u' || next == 'U' || next == 'l' || next == 'L') {
287 buffer[len++] = next;
288 next = nextchar(stream);
290 buffer[len++] = '\0';
291 buf = __alloc_bytes(len);
292 memcpy(buf, buffer, len);
293 token_type(token) = TOKEN_INTEGER;
294 token->integer = buf;
295 add_token(stream);
296 return next;
299 static int get_one_number(int c, stream_t *stream)
301 static char buffer[256];
302 int next = nextchar(stream);
303 char *p = buffer;
305 *p++ = c;
306 switch (next) {
307 case '0'...'7':
308 if (c == '0') {
309 buffer[0] = 'o';
310 next = get_base_number(8, &p, next, stream);
311 break;
313 /* fallthrough */
314 case '8'...'9':
315 next = get_base_number(10, &p, next, stream);
316 break;
317 case 'x': case 'X':
318 if (c == '0') {
319 buffer[0] = 'x';
320 next = get_base_number(16, &p, next, stream);
323 return do_integer(buffer, p - buffer, next, stream);
326 static int escapechar(int first, int type, stream_t *stream, int *valp)
328 int next, value;
330 next = nextchar(stream);
331 value = first;
333 if (first == '\n')
334 warn(stream->pos, "Newline in string or character constant");
336 if (first == '\\' && next != EOF) {
337 value = next;
338 next = nextchar(stream);
339 if (value != type) {
340 switch (value) {
341 case 'n':
342 value = '\n';
343 break;
344 case 't':
345 value = '\t';
346 break;
347 case '\\':
348 break;
349 case '\'':
350 break;
351 case '"':
352 break;
353 case '0'...'7': {
354 int nr = 2;
355 value -= '0';
356 while (next >= '0' && next <= '9') {
357 value = (value << 3) + (next-'0');
358 next = nextchar(stream);
359 if (!--nr)
360 break;
362 value &= 0xff;
363 break;
365 case 'x': {
366 int hex = hexval(next);
367 if (hex < 16) {
368 value = hex;
369 next = nextchar(stream);
370 while ((hex = hexval(next)) < 16) {
371 value = (value << 4) + hex;
372 next = nextchar(stream);
374 value &= 0xff;
375 break;
378 /* Fallthrough */
379 default:
380 warn(stream->pos, "Unknown escape '%c'", value);
383 /* Mark it as escaped */
384 value |= 0x100;
386 *valp = value;
387 return next;
390 static int get_char_token(int next, stream_t *stream)
392 int value;
393 struct token *token;
395 next = escapechar(next, '\'', stream, &value);
396 if (value == '\'' || next != '\'') {
397 warn(stream->pos, "Bad character constant");
398 drop_token(stream);
399 return next;
402 token = stream->token;
403 token_type(token) = TOKEN_CHAR;
404 token->character = value & 0xff;
406 add_token(stream);
407 return nextchar(stream);
410 static int get_string_token(int next, stream_t *stream)
412 static char buffer[512];
413 struct string *string;
414 struct token *token;
415 int len = 0;
417 for (;;) {
418 int val;
419 next = escapechar(next, '"', stream, &val);
420 if (val == '"')
421 break;
422 if (next == EOF) {
423 warn(stream->pos, "Enf of file in middle of string");
424 return next;
426 if (len < sizeof(buffer)) {
427 buffer[len] = val;
428 len++;
433 if (len > 256)
434 warn(stream->pos, "String too long");
436 string = __alloc_string(len+1);
437 memcpy(string->data, buffer, len);
438 string->data[len] = '\0';
439 string->length = len+1;
441 /* Pass it on.. */
442 token = stream->token;
443 token_type(token) = TOKEN_STRING;
444 token->string = string;
445 add_token(stream);
447 return next;
450 static int drop_stream_eoln(stream_t *stream)
452 int next = nextchar(stream);
453 drop_token(stream);
454 for (;;) {
455 int curr = next;
456 if (curr == EOF)
457 return next;
458 next = nextchar(stream);
459 if (curr == '\n')
460 return next;
464 static int drop_stream_comment(stream_t *stream)
466 int next = nextchar(stream);
467 drop_token(stream);
468 for (;;) {
469 int curr = next;
470 if (curr == EOF) {
471 warn(stream->pos, "End of file in the middle of a comment");
472 return curr;
474 next = nextchar(stream);
475 if (curr == '*' && next == '/')
476 break;
478 return nextchar(stream);
481 unsigned char combinations[][3] = COMBINATION_STRINGS;
483 #define NR_COMBINATIONS (sizeof(combinations)/3)
485 static int get_one_special(int c, stream_t *stream)
487 struct token *token;
488 unsigned char c1, c2, c3;
489 int next, value, i;
490 char *comb;
492 next = nextchar(stream);
495 * Check for strings, character constants, and comments
497 switch (c) {
498 case '"':
499 return get_string_token(next, stream);
500 case '\'':
501 return get_char_token(next, stream);
502 case '/':
503 if (next == '/')
504 return drop_stream_eoln(stream);
505 if (next == '*')
506 return drop_stream_comment(stream);
510 * Check for combinations
512 value = c;
513 comb = combinations[0];
514 c1 = c; c2 = next; c3 = 0;
515 for (i = 0; i < NR_COMBINATIONS; i++) {
516 if (comb[0] == c1 && comb[1] == c2 && comb[2] == c3) {
517 value = i + SPECIAL_BASE;
518 next = nextchar(stream);
519 if (c3)
520 break;
521 c3 = next;
523 comb += 3;
526 /* Pass it on.. */
527 token = stream->token;
528 token_type(token) = TOKEN_SPECIAL;
529 token->special = value;
530 add_token(stream);
531 return next;
/* Identifier hash table: 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1 << IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE - 1)

/*
 * Incremental hash: start from the first character, fold in each later
 * one, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((c) + 11*(oldhash))
#define ident_hash_end(hash)		(((hash) + ((hash) >> IDENT_HASH_BITS)) & IDENT_HASH_MASK)

/* Bucket heads, chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup statistics reported by show_identifier_stats(). */
int ident_hit, ident_miss;
545 void show_identifier_stats(void)
547 int i;
548 int distribution[100];
550 fprintf(stderr, "identifiers: %d hits, %d misses\n",
551 ident_hit, ident_miss);
553 for (i = 0; i < 100; i++)
554 distribution[i] = 0;
556 for (i = 0; i < IDENT_HASH_SIZE; i++) {
557 struct ident * ident = hash_table[i];
558 int count = 0;
560 while (ident) {
561 count++;
562 ident = ident->next;
564 if (count > 99)
565 count = 99;
566 distribution[count]++;
569 for (i = 0; i < 100; i++) {
570 if (distribution[i])
571 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
575 static struct ident *alloc_ident(const char *name, int len)
577 struct ident *ident = __alloc_ident(len);
578 ident->symbols = NULL;
579 ident->len = len;
580 memcpy(ident->name, name, len);
581 return ident;
584 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
586 ident->next = hash_table[hash];
587 hash_table[hash] = ident;
588 ident_miss++;
589 return ident;
592 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
594 struct ident *ident;
596 ident = hash_table[hash];
597 while (ident) {
598 if (ident->len == len && !memcmp(ident->name, name, len)) {
599 ident_hit++;
600 return ident;
602 ident = ident->next;
605 return insert_hash(alloc_ident(name, len), hash);
/*
 * Compute the hash bucket index for the 'len' bytes at 'name'.
 *
 * An empty name (len <= 0) hashes as if from an initial value of zero;
 * no byte of 'name' is read in that case, and the loop below cannot run
 * with a non-positive count.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash = 0;
	const unsigned char *p = (const unsigned char *)name;

	if (len > 0) {
		hash = ident_hash_init(*p++);
		while (--len) {
			unsigned int i = *p++;
			hash = ident_hash_add(hash, i);
		}
	}
	return ident_hash_end(hash);
}
621 struct ident *hash_ident(struct ident *ident)
623 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for a NUL-terminated name, creating it in the
 * hash table on first use.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
632 struct token *built_in_token(int stream, const char *name)
634 struct token *token;
636 token = __alloc_token(0);
637 token->pos.stream = stream;
638 token_type(token) = TOKEN_IDENT;
639 token->ident = built_in_ident(name);
640 return token;
643 static int get_one_identifier(int c, stream_t *stream)
645 struct token *token;
646 struct ident *ident;
647 unsigned long hash;
648 char buf[256];
649 int len = 1;
650 int next;
652 hash = ident_hash_init(c);
653 buf[0] = c;
654 for (;;) {
655 next = nextchar(stream);
656 switch (next) {
657 case '0'...'9':
658 case 'a'...'z':
659 case 'A'...'Z':
660 case '_':
661 if (len < sizeof(buf)) {
662 hash = ident_hash_add(hash, next);
663 buf[len] = next;
664 len++;
666 continue;
668 break;
670 hash = ident_hash_end(hash);
672 ident = create_hashed_ident(buf, len, hash);
674 /* Pass it on.. */
675 token = stream->token;
676 token_type(token) = TOKEN_IDENT;
677 token->ident = ident;
678 add_token(stream);
679 return next;
682 static int get_one_token(int c, stream_t *stream)
684 switch (c) {
685 case '0'...'9':
686 return get_one_number(c, stream);
687 case 'a'...'z':
688 case 'A'...'Z':
689 case '_':
690 return get_one_identifier(c, stream);
691 default:
692 return get_one_special(c, stream);
696 static struct token *setup_stream(stream_t *stream, int idx, int fd,
697 unsigned char *buf, unsigned int buf_size)
699 struct token *begin;
701 stream->pos.stream = idx;
702 stream->pos.line = 1;
703 stream->pos.newline = 1;
704 stream->pos.whitespace = 0;
705 stream->pos.pos = 0;
707 stream->token = NULL;
708 stream->fd = fd;
709 stream->offset = 0;
710 stream->size = buf_size;
711 stream->buffer = buf;
713 begin = alloc_token(stream);
714 token_type(begin) = TOKEN_STREAMBEGIN;
715 stream->tokenlist = &begin->next;
716 return begin;
719 static void tokenize_stream(stream_t *stream, struct token *endtoken)
721 int c = nextchar(stream);
722 while (c != EOF) {
723 if (c == '\\') {
724 c = nextchar(stream);
725 stream->pos.newline = 0;
726 stream->pos.whitespace = 1;
727 continue;
729 if (!isspace(c)) {
730 struct token *token = alloc_token(stream);
731 stream->token = token;
732 stream->pos.newline = 0;
733 stream->pos.whitespace = 0;
734 c = get_one_token(c, stream);
735 continue;
737 stream->pos.whitespace = 1;
738 c = nextchar(stream);
740 mark_eof(stream, endtoken);
743 struct token * tokenize_buffer(unsigned char *buffer, unsigned long size, struct token *endtoken)
745 stream_t stream;
746 struct token *begin;
748 begin = setup_stream(&stream, 0, -1, buffer, size);
749 tokenize_stream(&stream, endtoken);
750 return begin;
753 struct token * tokenize(const char *name, int fd, struct token *endtoken)
755 struct token *begin;
756 stream_t stream;
757 unsigned char buffer[BUFSIZE];
758 int idx;
760 idx = init_stream(name, fd);
761 if (idx < 0)
762 return endtoken;
764 begin = setup_stream(&stream, idx, fd, buffer, 0);
765 tokenize_stream(&stream, endtoken);
766 return begin;