warn on unknown escapes after preprocessing
[smatch.git] / tokenize.c
blob632413f0c614e02b19675579215d8779c5f0cdde
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <unistd.h>
33 #include <stdint.h>
35 #include "lib.h"
36 #include "allocate.h"
37 #include "token.h"
38 #include "symbol.h"
/* Value the character readers in this file return at end of input. */
#define EOF (-1)

/* Number of entries of input_streams[] that are in use. */
int input_stream_nr = 0;

/* Table of registered input streams; grown by init_stream(). */
struct stream *input_streams;

/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;

/* Column width a tab character advances the position counter to. */
unsigned int tabstop = 8;

/* Number of bytes requested per read() when refilling a stream buffer. */
#define BUFSIZE (8192)
/*
 * State of one stream while it is being tokenized: the byte source,
 * the read cursor, the current source position and the token list
 * under construction.
 */
typedef struct {
	int fd;				/* refill source; negative means "no refills" */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* current column */
	int line;			/* current line */
	int nr;				/* stream number copied into token positions */
	int newline;			/* pending "starts a line" flag */
	int whitespace;			/* pending "preceded by whitespace" flag */
	struct token **tokenlist;	/* where the next finished token is linked */
	struct token *token;		/* token currently being filled in */
	unsigned char *buffer;		/* raw input bytes */
} stream_t;
58 const char *stream_name(int stream)
60 if (stream < 0 || stream > input_stream_nr)
61 return "<bad stream>";
62 return input_streams[stream].name;
65 static struct position stream_pos(stream_t *stream)
67 struct position pos;
68 pos.type = 0;
69 pos.stream = stream->nr;
70 pos.newline = stream->newline;
71 pos.whitespace = stream->whitespace;
72 pos.pos = stream->pos;
73 pos.line = stream->line;
74 pos.noexpand = 0;
75 return pos;
78 const char *show_special(int val)
80 static char buffer[4];
82 buffer[0] = val;
83 buffer[1] = 0;
84 if (val >= SPECIAL_BASE)
85 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
86 return buffer;
89 const char *show_ident(const struct ident *ident)
91 static char buffer[256];
92 if (!ident)
93 return "<noident>";
94 sprintf(buffer, "%.*s", ident->len, ident->name);
95 return buffer;
/*
 * Append a C-source rendering of character @c at @ptr and return the
 * new end of the output.
 *
 * Printable characters are copied, preceded by a backslash when they
 * equal @escape or are a backslash themselves.  Newline and tab become
 * \n and \t.  Everything else becomes an octal escape, padded to three
 * digits when @next is a digit so that the following character cannot
 * be read as part of the escape.
 *
 * Only the octal paths leave a terminating NUL behind (from sprintf).
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (isprint(c)) {
		if (c == '\\' || c == escape)
			*ptr++ = '\\';
		*ptr++ = c;
		return ptr;
	}

	*ptr++ = '\\';
	if (c == '\n') {
		*ptr++ = 'n';
		return ptr;
	}
	if (c == '\t') {
		*ptr++ = 't';
		return ptr;
	}

	if (isdigit(next))
		return ptr + sprintf(ptr, "%03o", c);
	return ptr + sprintf(ptr, "%o", c);
}
121 const char *show_string(const struct string *string)
123 static char buffer[4 * MAX_STRING + 3];
124 char *ptr;
125 int i;
127 if (!string->length)
128 return "<bad_string>";
129 ptr = buffer;
130 *ptr++ = '"';
131 for (i = 0; i < string->length-1; i++) {
132 const char *p = string->data + i;
133 ptr = charstr(ptr, p[0], '"', p[1]);
135 *ptr++ = '"';
136 *ptr = '\0';
137 return buffer;
140 static const char *show_char(const char *s, size_t len, char prefix, char delim)
142 static char buffer[MAX_STRING + 4];
143 char *p = buffer;
144 if (prefix)
145 *p++ = prefix;
146 *p++ = delim;
147 memcpy(p, s, len);
148 p += len;
149 *p++ = delim;
150 *p++ = '\0';
151 return buffer;
154 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
156 static char buffer[2*MAX_STRING + 6];
157 size_t i;
158 char *p = buffer;
159 if (prefix)
160 *p++ = prefix;
161 if (delim == '"')
162 *p++ = '\\';
163 *p++ = delim;
164 for (i = 0; i < len; i++) {
165 if (s[i] == '"' || s[i] == '\\')
166 *p++ = '\\';
167 *p++ = s[i];
169 if (delim == '"')
170 *p++ = '\\';
171 *p++ = delim;
172 *p++ = '\0';
173 return buffer;
176 const char *show_token(const struct token *token)
178 static char buffer[256];
180 if (!token)
181 return "<no token>";
182 switch (token_type(token)) {
183 case TOKEN_ERROR:
184 return "syntax error";
186 case TOKEN_EOF:
187 return "end-of-input";
189 case TOKEN_IDENT:
190 return show_ident(token->ident);
192 case TOKEN_NUMBER:
193 return token->number;
195 case TOKEN_SPECIAL:
196 return show_special(token->special);
198 case TOKEN_CHAR:
199 return show_char(token->string->data,
200 token->string->length - 1, 0, '\'');
201 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
202 return show_char(token->embedded,
203 token_type(token) - TOKEN_CHAR, 0, '\'');
204 case TOKEN_WIDE_CHAR:
205 return show_char(token->string->data,
206 token->string->length - 1, 'L', '\'');
207 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
208 return show_char(token->embedded,
209 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
210 case TOKEN_STRING:
211 return show_char(token->string->data,
212 token->string->length - 1, 0, '"');
213 case TOKEN_WIDE_STRING:
214 return show_char(token->string->data,
215 token->string->length - 1, 'L', '"');
217 case TOKEN_STREAMBEGIN:
218 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
219 return buffer;
221 case TOKEN_STREAMEND:
222 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
223 return buffer;
225 case TOKEN_UNTAINT:
226 sprintf(buffer, "<untaint>");
227 return buffer;
229 case TOKEN_ARG_COUNT:
230 sprintf(buffer, "<argcnt>");
231 return buffer;
233 default:
234 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
235 return buffer;
239 const char *quote_token(const struct token *token)
241 static char buffer[256];
243 switch (token_type(token)) {
244 case TOKEN_ERROR:
245 return "syntax error";
247 case TOKEN_IDENT:
248 return show_ident(token->ident);
250 case TOKEN_NUMBER:
251 return token->number;
253 case TOKEN_SPECIAL:
254 return show_special(token->special);
256 case TOKEN_CHAR:
257 return quote_char(token->string->data,
258 token->string->length - 1, 0, '\'');
259 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
260 return quote_char(token->embedded,
261 token_type(token) - TOKEN_CHAR, 0, '\'');
262 case TOKEN_WIDE_CHAR:
263 return quote_char(token->string->data,
264 token->string->length - 1, 'L', '\'');
265 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
266 return quote_char(token->embedded,
267 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
268 case TOKEN_STRING:
269 return quote_char(token->string->data,
270 token->string->length - 1, 0, '"');
271 case TOKEN_WIDE_STRING:
272 return quote_char(token->string->data,
273 token->string->length - 1, 'L', '"');
274 default:
275 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
276 return buffer;
/* Stream names are hashed into 2^HASHED_INPUT_BITS buckets. */
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One chain head per bucket; every bucket starts out as -1. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Return a pointer to the bucket of input_stream_hashes[] that the
 * stream name @name hashes to.
 */
int *hash_stream(const char *name)
{
	uint32_t hash = 0;

	for (;;) {
		unsigned char c = *name++;

		if (c == 0)
			break;
		hash = (hash + (c << 4) + (c >> 4)) * 11;
	}

	hash *= HASH_PRIME;
	/* keep only the top HASHED_INPUT_BITS bits of the 32-bit product */
	hash >>= 32 - HASHED_INPUT_BITS;
	return &input_stream_hashes[hash];
}
299 int init_stream(const char *name, int fd, const char **next_path)
301 int stream = input_stream_nr, *hash;
302 struct stream *current;
304 if (stream >= input_streams_allocated) {
305 int newalloc = stream * 4 / 3 + 10;
306 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
307 if (!input_streams)
308 die("Unable to allocate more streams space");
309 input_streams_allocated = newalloc;
311 current = input_streams + stream;
312 memset(current, 0, sizeof(*current));
313 current->name = name;
314 current->fd = fd;
315 current->next_path = next_path;
316 current->path = NULL;
317 current->constant = CONSTANT_FILE_MAYBE;
318 input_stream_nr = stream+1;
319 hash = hash_stream(name);
320 current->next_stream = *hash;
321 *hash = stream;
322 return stream;
325 static struct token * alloc_token(stream_t *stream)
327 struct token *token = __alloc_token(0);
328 token->pos = stream_pos(stream);
329 return token;
333 * Argh... That was surprisingly messy - handling '\r' complicates the
334 * things a _lot_.
336 static int nextchar_slow(stream_t *stream)
338 int offset = stream->offset;
339 int size = stream->size;
340 int c;
341 int spliced = 0, had_cr, had_backslash;
343 restart:
344 had_cr = had_backslash = 0;
346 repeat:
347 if (offset >= size) {
348 if (stream->fd < 0)
349 goto got_eof;
350 size = read(stream->fd, stream->buffer, BUFSIZE);
351 if (size <= 0)
352 goto got_eof;
353 stream->size = size;
354 stream->offset = offset = 0;
357 c = stream->buffer[offset++];
358 if (had_cr)
359 goto check_lf;
361 if (c == '\r') {
362 had_cr = 1;
363 goto repeat;
366 norm:
367 if (!had_backslash) {
368 switch (c) {
369 case '\t':
370 stream->pos += tabstop - stream->pos % tabstop;
371 break;
372 case '\n':
373 stream->line++;
374 stream->pos = 0;
375 stream->newline = 1;
376 break;
377 case '\\':
378 had_backslash = 1;
379 stream->pos++;
380 goto repeat;
381 default:
382 stream->pos++;
384 } else {
385 if (c == '\n') {
386 stream->line++;
387 stream->pos = 0;
388 spliced = 1;
389 goto restart;
391 offset--;
392 c = '\\';
394 out:
395 stream->offset = offset;
397 return c;
399 check_lf:
400 if (c != '\n')
401 offset--;
402 c = '\n';
403 goto norm;
405 got_eof:
406 if (had_backslash) {
407 c = '\\';
408 goto out;
410 if (stream->pos)
411 warning(stream_pos(stream), "no newline at end of file");
412 else if (spliced)
413 warning(stream_pos(stream), "backslash-newline at end of file");
414 return EOF;
418 * We want that as light as possible while covering all normal cases.
419 * Slow path (including the logics with line-splicing and EOF sanity
420 * checks) is in nextchar_slow().
422 static inline int nextchar(stream_t *stream)
424 int offset = stream->offset;
426 if (offset < stream->size) {
427 int c = stream->buffer[offset++];
428 static const char special[256] = {
429 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
431 if (!special[c]) {
432 stream->offset = offset;
433 stream->pos++;
434 return c;
437 return nextchar_slow(stream);
440 struct token eof_token_entry;
442 static struct token *mark_eof(stream_t *stream)
444 struct token *end;
446 end = alloc_token(stream);
447 token_type(end) = TOKEN_STREAMEND;
448 end->pos.newline = 1;
450 eof_token_entry.next = &eof_token_entry;
451 eof_token_entry.pos.newline = 1;
453 end->next = &eof_token_entry;
454 *stream->tokenlist = end;
455 stream->tokenlist = NULL;
456 return end;
459 static void add_token(stream_t *stream)
461 struct token *token = stream->token;
463 stream->token = NULL;
464 token->next = NULL;
465 *stream->tokenlist = token;
466 stream->tokenlist = &token->next;
469 static void drop_token(stream_t *stream)
471 stream->newline |= stream->token->pos.newline;
472 stream->whitespace |= stream->token->pos.whitespace;
473 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter      = 1 << 0,
	Digit       = 1 << 1,
	Hex         = 1 << 2,
	Exp         = 1 << 3,
	Dot         = 1 << 4,
	ValidSecond = 1 << 5,
	Quote       = 1 << 6,
	Escape      = 1 << 7,
};

/*
 * Class bits per character.  The table is indexed by (character + 1),
 * so index 0 is the slot for EOF (-1); it carries no class bits.
 */
#define AT(ch) ((ch) + 1)
static const long cclass[257] = {
	/* digits */
	[AT('0') ... AT('7')] = Digit | Hex | Escape,	/* \<octal> */
	[AT('8') ... AT('9')] = Digit | Hex,

	/* upper case */
	[AT('A') ... AT('D')] = Letter | Hex,
	[AT('E')] = Letter | Hex | Exp,			/* E<exp> */
	[AT('F')] = Letter | Hex,
	[AT('G') ... AT('O')] = Letter,
	[AT('P')] = Letter | Exp,			/* P<exp> */
	[AT('Q') ... AT('Z')] = Letter,

	/* lower case */
	[AT('a') ... AT('b')] = Letter | Hex | Escape,	/* \a, \b */
	[AT('c') ... AT('d')] = Letter | Hex,
	[AT('e')] = Letter | Hex | Exp | Escape,	/* \e, e<exp> */
	[AT('f')] = Letter | Hex | Escape,		/* \f */
	[AT('g') ... AT('m')] = Letter,
	[AT('n')] = Letter | Escape,			/* \n */
	[AT('o')] = Letter,
	[AT('p')] = Letter | Exp,			/* p<exp> */
	[AT('q')] = Letter,
	[AT('r')] = Letter | Escape,			/* \r */
	[AT('s')] = Letter,
	[AT('t')] = Letter | Escape,			/* \t */
	[AT('u')] = Letter,
	[AT('v')] = Letter | Escape,			/* \v */
	[AT('w')] = Letter,
	[AT('x')] = Letter | Escape,			/* \x<hex> */
	[AT('y') ... AT('z')] = Letter,
	[AT('_')] = Letter,

	/* characters that may be the second half of a punctuator */
	[AT('.')] = Dot | ValidSecond,
	[AT('=')] = ValidSecond,
	[AT('+')] = ValidSecond,
	[AT('-')] = ValidSecond,
	[AT('>')] = ValidSecond,
	[AT('<')] = ValidSecond,
	[AT('&')] = ValidSecond,
	[AT('|')] = ValidSecond,
	[AT('#')] = ValidSecond,

	/* quotes and the remaining simple escapes */
	[AT('\'')] = Quote | Escape,
	[AT('"')] = Quote | Escape,
	[AT('\\')] = Escape,
	[AT('?')] = Escape,
};
#undef AT
530 * pp-number:
531 * digit
532 * . digit
533 * pp-number digit
534 * pp-number identifier-nodigit
535 * pp-number e sign
536 * pp-number E sign
537 * pp-number p sign
538 * pp-number P sign
539 * pp-number .
541 static int get_one_number(int c, int next, stream_t *stream)
543 struct token *token;
544 static char buffer[4095];
545 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
546 int len;
548 *p++ = c;
549 for (;;) {
550 long class = cclass[next + 1];
551 if (!(class & (Dot | Digit | Letter)))
552 break;
553 if (p != buffer_end)
554 *p++ = next;
555 next = nextchar(stream);
556 if (class & Exp) {
557 if (next == '-' || next == '+') {
558 if (p != buffer_end)
559 *p++ = next;
560 next = nextchar(stream);
565 if (p == buffer_end) {
566 sparse_error(stream_pos(stream), "number token exceeds %td characters",
567 buffer_end - buffer);
568 // Pretend we saw just "1".
569 buffer[0] = '1';
570 p = buffer + 1;
573 *p++ = 0;
574 len = p - buffer;
575 buf = __alloc_bytes(len);
576 memcpy(buf, buffer, len);
578 token = stream->token;
579 token_type(token) = TOKEN_NUMBER;
580 token->number = buf;
581 add_token(stream);
583 return next;
586 static int eat_string(int next, stream_t *stream, enum token_type type)
588 static char buffer[MAX_STRING];
589 struct string *string;
590 struct token *token = stream->token;
591 int len = 0;
592 int escape;
593 int want_hex = 0;
594 char delim = type < TOKEN_STRING ? '\'' : '"';
596 for (escape = 0; escape || next != delim; next = nextchar(stream)) {
597 if (len < MAX_STRING)
598 buffer[len] = next;
599 len++;
600 if (next == '\n') {
601 warning(stream_pos(stream),
602 "Newline in string or character constant");
603 if (delim == '\'') /* assume it's lost ' */
604 break;
606 if (next == EOF) {
607 warning(stream_pos(stream),
608 "End of file in middle of string");
609 return next;
611 if (!escape) {
612 if (want_hex && !(cclass[next + 1] & Hex))
613 warning(stream_pos(stream),
614 "\\x used with no following hex digits");
615 want_hex = 0;
616 escape = next == '\\';
617 } else {
618 escape = 0;
619 want_hex = next == 'x';
622 if (want_hex)
623 warning(stream_pos(stream),
624 "\\x used with no following hex digits");
625 if (len > MAX_STRING) {
626 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
627 len = MAX_STRING;
629 if (delim == '\'' && len <= 4) {
630 if (len == 0) {
631 sparse_error(stream_pos(stream),
632 "empty character constant");
633 return nextchar(stream);
635 token_type(token) = type + len;
636 memset(buffer + len, '\0', 4 - len);
637 memcpy(token->embedded, buffer, 4);
638 } else {
639 token_type(token) = type;
640 string = __alloc_string(len+1);
641 memcpy(string->data, buffer, len);
642 string->data[len] = '\0';
643 string->length = len+1;
644 token->string = string;
647 /* Pass it on.. */
648 token = stream->token;
649 add_token(stream);
650 return nextchar(stream);
653 static int drop_stream_eoln(stream_t *stream)
655 drop_token(stream);
656 for (;;) {
657 switch (nextchar(stream)) {
658 case EOF:
659 return EOF;
660 case '\n':
661 return nextchar(stream);
666 static int drop_stream_comment(stream_t *stream)
668 int newline;
669 int next;
670 drop_token(stream);
671 newline = stream->newline;
673 next = nextchar(stream);
674 for (;;) {
675 int curr = next;
676 if (curr == EOF) {
677 warning(stream_pos(stream), "End of file in the middle of a comment");
678 return curr;
680 next = nextchar(stream);
681 if (curr == '*' && next == '/')
682 break;
684 stream->newline = newline;
685 return nextchar(stream);
688 unsigned char combinations[][4] = COMBINATION_STRINGS;
690 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
/*
 * Hash function for two-character punctuators - all of the pairs in
 * the tables below give unique values in 0..31.
 *
 * Both arguments are fully parenthesized so that expression arguments
 * hash the same as their value; each argument is evaluated twice, so
 * do not pass expressions with side effects.
 */
#define special_hash(c0, c1) \
	(((((c0) * 8) + ((c1) * 2)) + ((((c0) * 8) + ((c1) * 2)) >> 5)) & 31)
696 * note that we won't get false positives - special_hash(0,0) is 0 and
697 * entry 0 is filled (by +=), so all the missing ones are OK.
699 static unsigned char hash_results[32][2] = {
700 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
701 RES('+', '='), /* 00 */
702 RES('/', '='), /* 01 */
703 RES('^', '='), /* 05 */
704 RES('&', '&'), /* 07 */
705 RES('#', '#'), /* 08 */
706 RES('<', '<'), /* 0a */
707 RES('<', '='), /* 0c */
708 RES('!', '='), /* 0e */
709 RES('%', '='), /* 0f */
710 RES('-', '-'), /* 10 */
711 RES('-', '='), /* 11 */
712 RES('-', '>'), /* 13 */
713 RES('=', '='), /* 15 */
714 RES('&', '='), /* 17 */
715 RES('*', '='), /* 18 */
716 RES('.', '.'), /* 1a */
717 RES('+', '+'), /* 1b */
718 RES('|', '='), /* 1c */
719 RES('>', '='), /* 1d */
720 RES('|', '|'), /* 1e */
721 RES('>', '>') /* 1f */
722 #undef RES
724 static int code[32] = {
725 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
726 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
727 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
728 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
729 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
730 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
731 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
732 CODE('<', '=', SPECIAL_LTE), /* 0c */
733 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
734 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
735 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
736 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
737 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
738 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
739 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
740 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
741 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
742 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
743 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
744 CODE('>', '=', SPECIAL_GTE), /* 1d */
745 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
746 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
747 #undef CODE
750 static int get_one_special(int c, stream_t *stream)
752 struct token *token;
753 int next, value, i;
755 next = nextchar(stream);
758 * Check for numbers, strings, character constants, and comments
760 switch (c) {
761 case '.':
762 if (next >= '0' && next <= '9')
763 return get_one_number(c, next, stream);
764 break;
765 case '"':
766 return eat_string(next, stream, TOKEN_STRING);
767 case '\'':
768 return eat_string(next, stream, TOKEN_CHAR);
769 case '/':
770 if (next == '/')
771 return drop_stream_eoln(stream);
772 if (next == '*')
773 return drop_stream_comment(stream);
777 * Check for combinations
779 value = c;
780 if (cclass[next + 1] & ValidSecond) {
781 i = special_hash(c, next);
782 if (hash_results[i][0] == c && hash_results[i][1] == next) {
783 value = code[i];
784 next = nextchar(stream);
785 if (value >= SPECIAL_LEFTSHIFT &&
786 next == "==."[value - SPECIAL_LEFTSHIFT]) {
787 value += 3;
788 next = nextchar(stream);
793 /* Pass it on.. */
794 token = stream->token;
795 token_type(token) = TOKEN_SPECIAL;
796 token->special = value;
797 add_token(stream);
798 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, fold in
 * each further character, then reduce to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in a bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup statistics reported by show_identifier_stats(). */
static int ident_hit, ident_miss, idents;
812 void show_identifier_stats(void)
814 int i;
815 int distribution[100];
817 fprintf(stderr, "identifiers: %d hits, %d misses\n",
818 ident_hit, ident_miss);
820 for (i = 0; i < 100; i++)
821 distribution[i] = 0;
823 for (i = 0; i < IDENT_HASH_SIZE; i++) {
824 struct ident * ident = hash_table[i];
825 int count = 0;
827 while (ident) {
828 count++;
829 ident = ident->next;
831 if (count > 99)
832 count = 99;
833 distribution[count]++;
836 for (i = 0; i < 100; i++) {
837 if (distribution[i])
838 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
842 static struct ident *alloc_ident(const char *name, int len)
844 struct ident *ident = __alloc_ident(len);
845 ident->symbols = NULL;
846 ident->len = len;
847 ident->tainted = 0;
848 memcpy(ident->name, name, len);
849 return ident;
852 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
854 ident->next = hash_table[hash];
855 hash_table[hash] = ident;
856 ident_miss++;
857 return ident;
860 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
862 struct ident *ident;
863 struct ident **p;
865 p = &hash_table[hash];
866 while ((ident = *p) != NULL) {
867 if (ident->len == (unsigned char) len) {
868 if (strncmp(name, ident->name, len) != 0)
869 goto next;
871 ident_hit++;
872 return ident;
874 next:
875 //misses++;
876 p = &ident->next;
878 ident = alloc_ident(name, len);
879 *p = ident;
880 ident->next = NULL;
881 ident_miss++;
882 idents++;
883 return ident;
/*
 * Compute the identifier-table bucket for the @len bytes at @name,
 * using the same init/add/end steps that get_one_identifier() applies
 * incrementally.
 *
 * A @len of zero or less hashes as a zero seed and reads nothing from
 * @name.  (The unguarded "while (--len)" form would count down from -1
 * and read far past the name.)
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0UL));

	hash = ident_hash_init(*p++);
	while (--len > 0) {
		unsigned int ch = *p++;

		hash = ident_hash_add(hash, ch);
	}
	return ident_hash_end(hash);
}
899 struct ident *hash_ident(struct ident *ident)
901 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string @name, creating
 * it in the identifier table when it is not there yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
910 struct token *built_in_token(int stream, const char *name)
912 struct token *token;
914 token = __alloc_token(0);
915 token->pos.stream = stream;
916 token_type(token) = TOKEN_IDENT;
917 token->ident = built_in_ident(name);
918 return token;
921 static int get_one_identifier(int c, stream_t *stream)
923 struct token *token;
924 struct ident *ident;
925 unsigned long hash;
926 char buf[256];
927 int len = 1;
928 int next;
930 hash = ident_hash_init(c);
931 buf[0] = c;
932 for (;;) {
933 next = nextchar(stream);
934 if (!(cclass[next + 1] & (Letter | Digit)))
935 break;
936 if (len >= sizeof(buf))
937 break;
938 hash = ident_hash_add(hash, next);
939 buf[len] = next;
940 len++;
942 if (cclass[next + 1] & Quote) {
943 if (len == 1 && buf[0] == 'L') {
944 if (next == '\'')
945 return eat_string(nextchar(stream), stream,
946 TOKEN_WIDE_CHAR);
947 else
948 return eat_string(nextchar(stream), stream,
949 TOKEN_WIDE_STRING);
952 hash = ident_hash_end(hash);
953 ident = create_hashed_ident(buf, len, hash);
955 /* Pass it on.. */
956 token = stream->token;
957 token_type(token) = TOKEN_IDENT;
958 token->ident = ident;
959 add_token(stream);
960 return next;
963 static int get_one_token(int c, stream_t *stream)
965 long class = cclass[c + 1];
966 if (class & Digit)
967 return get_one_number(c, nextchar(stream), stream);
968 if (class & Letter)
969 return get_one_identifier(c, stream);
970 return get_one_special(c, stream);
973 static struct token *setup_stream(stream_t *stream, int idx, int fd,
974 unsigned char *buf, unsigned int buf_size)
976 struct token *begin;
978 stream->nr = idx;
979 stream->line = 1;
980 stream->newline = 1;
981 stream->whitespace = 0;
982 stream->pos = 0;
984 stream->token = NULL;
985 stream->fd = fd;
986 stream->offset = 0;
987 stream->size = buf_size;
988 stream->buffer = buf;
990 begin = alloc_token(stream);
991 token_type(begin) = TOKEN_STREAMBEGIN;
992 stream->tokenlist = &begin->next;
993 return begin;
996 static struct token *tokenize_stream(stream_t *stream)
998 int c = nextchar(stream);
999 while (c != EOF) {
1000 if (!isspace(c)) {
1001 struct token *token = alloc_token(stream);
1002 stream->token = token;
1003 stream->newline = 0;
1004 stream->whitespace = 0;
1005 c = get_one_token(c, stream);
1006 continue;
1008 stream->whitespace = 1;
1009 c = nextchar(stream);
1011 return mark_eof(stream);
1014 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1016 stream_t stream;
1017 struct token *begin;
1019 begin = setup_stream(&stream, 0, -1, buffer, size);
1020 *endtoken = tokenize_stream(&stream);
1021 return begin;
1024 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1026 struct token *begin, *end;
1027 stream_t stream;
1028 unsigned char buffer[BUFSIZE];
1029 int idx;
1031 idx = init_stream(name, fd, next_path);
1032 if (idx < 0) {
1033 // info(endtoken->pos, "File %s is const", name);
1034 return endtoken;
1037 begin = setup_stream(&stream, idx, fd, buffer, 0);
1038 end = tokenize_stream(&stream);
1039 if (endtoken)
1040 end->next = endtoken;
1041 return begin;