canonicalize constant signed compares toward zero
[smatch.git] / tokenize.c
blobea71054382706a7275c5a81d0464752f2728c5ba
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <unistd.h>
33 #include <stdint.h>
35 #include "lib.h"
36 #include "allocate.h"
37 #include "token.h"
38 #include "symbol.h"
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Number of entries of input_streams[] that are in use. */
int input_stream_nr = 0;
/* Table of registered input streams; grown on demand by init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;
/* Tab stop width used when advancing the column counter over a '\t'. */
unsigned int tabstop = 8;

/* Maximum number of bytes requested per read() when refilling a stream. */
#define BUFSIZE (8192)
/*
 * Working state of the tokenizer for one input stream.
 */
typedef struct {
	int fd;				/* descriptor to refill from; negative: no refill */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* current column */
	int line;			/* current line */
	int nr;				/* stream index, copied into token positions */
	int newline;			/* flag copied into the next token's position */
	int whitespace;			/* flag copied into the next token's position */
	struct token **tokenlist;	/* link where the next token gets attached */
	struct token *token;		/* token currently being built */
	unsigned char *buffer;		/* input bytes */
} stream_t;
58 const char *stream_name(int stream)
60 if (stream < 0 || stream > input_stream_nr)
61 return "<bad stream>";
62 return input_streams[stream].name;
65 int stream_prev(int stream)
67 if (stream < 0 || stream > input_stream_nr)
68 return -1;
69 stream = input_streams[stream].pos.stream;
70 if (stream > input_stream_nr)
71 return -1;
72 return stream;
75 static struct position stream_pos(stream_t *stream)
77 struct position pos;
78 pos.type = 0;
79 pos.stream = stream->nr;
80 pos.newline = stream->newline;
81 pos.whitespace = stream->whitespace;
82 pos.pos = stream->pos;
83 pos.line = stream->line;
84 pos.noexpand = 0;
85 return pos;
88 const char *show_special(int val)
90 static char buffer[4];
92 buffer[0] = val;
93 buffer[1] = 0;
94 if (val >= SPECIAL_BASE)
95 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
96 return buffer;
99 const char *show_ident(const struct ident *ident)
101 static char buff[4][256];
102 static int n;
103 char *buffer;
105 if (!ident)
106 return "<noident>";
107 buffer = buff[3 & ++n];
108 sprintf(buffer, "%.*s", ident->len, ident->name);
109 return buffer;
/*
 * Append the source-level spelling of character @c at @ptr and return
 * the new end of the output.
 *
 * Printable characters are copied, preceded by a backslash when they
 * equal @escape or are a backslash themselves.  Newline and tab become
 * \n and \t.  Everything else becomes an octal escape; it is padded to
 * three digits when @next is a digit, so that the following character
 * cannot be read as part of the escape.
 *
 * Only the octal paths NUL-terminate the output (through sprintf).
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	const char *octal_fmt;

	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		octal_fmt = isdigit(next) ? "%03o" : "%o";
		return ptr + sprintf(ptr, octal_fmt, c);
	}

	if (c == '\\' || c == escape)
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
135 const char *show_string(const struct string *string)
137 static char buffer[4 * MAX_STRING + 3];
138 char *ptr;
139 int i;
141 if (!string || !string->length)
142 return "<bad_string>";
143 ptr = buffer;
144 *ptr++ = '"';
145 for (i = 0; i < string->length-1; i++) {
146 const char *p = string->data + i;
147 ptr = charstr(ptr, p[0], '"', p[1]);
149 *ptr++ = '"';
150 *ptr = '\0';
151 return buffer;
154 static const char *show_char(const char *s, size_t len, char prefix, char delim)
156 static char buffer[MAX_STRING + 4];
157 char *p = buffer;
158 if (prefix)
159 *p++ = prefix;
160 *p++ = delim;
161 memcpy(p, s, len);
162 p += len;
163 *p++ = delim;
164 *p++ = '\0';
165 return buffer;
168 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
170 static char buffer[2*MAX_STRING + 6];
171 size_t i;
172 char *p = buffer;
173 if (prefix)
174 *p++ = prefix;
175 if (delim == '"')
176 *p++ = '\\';
177 *p++ = delim;
178 for (i = 0; i < len; i++) {
179 if (s[i] == '"' || s[i] == '\\')
180 *p++ = '\\';
181 *p++ = s[i];
183 if (delim == '"')
184 *p++ = '\\';
185 *p++ = delim;
186 *p++ = '\0';
187 return buffer;
190 const char *show_token(const struct token *token)
192 static char buffer[256];
194 if (!token)
195 return "<no token>";
196 switch (token_type(token)) {
197 case TOKEN_ERROR:
198 return "syntax error";
200 case TOKEN_EOF:
201 return "end-of-input";
203 case TOKEN_IDENT:
204 return show_ident(token->ident);
206 case TOKEN_NUMBER:
207 return token->number;
209 case TOKEN_SPECIAL:
210 return show_special(token->special);
212 case TOKEN_CHAR:
213 return show_char(token->string->data,
214 token->string->length - 1, 0, '\'');
215 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
216 return show_char(token->embedded,
217 token_type(token) - TOKEN_CHAR, 0, '\'');
218 case TOKEN_WIDE_CHAR:
219 return show_char(token->string->data,
220 token->string->length - 1, 'L', '\'');
221 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
222 return show_char(token->embedded,
223 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
224 case TOKEN_STRING:
225 return show_char(token->string->data,
226 token->string->length - 1, 0, '"');
227 case TOKEN_WIDE_STRING:
228 return show_char(token->string->data,
229 token->string->length - 1, 'L', '"');
231 case TOKEN_STREAMBEGIN:
232 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
233 return buffer;
235 case TOKEN_STREAMEND:
236 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
237 return buffer;
239 case TOKEN_UNTAINT:
240 sprintf(buffer, "<untaint>");
241 return buffer;
243 case TOKEN_ARG_COUNT:
244 sprintf(buffer, "<argcnt>");
245 return buffer;
247 default:
248 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
249 return buffer;
253 const char *quote_token(const struct token *token)
255 static char buffer[256];
257 switch (token_type(token)) {
258 case TOKEN_ERROR:
259 return "syntax error";
261 case TOKEN_IDENT:
262 return show_ident(token->ident);
264 case TOKEN_NUMBER:
265 return token->number;
267 case TOKEN_SPECIAL:
268 return show_special(token->special);
270 case TOKEN_CHAR:
271 return quote_char(token->string->data,
272 token->string->length - 1, 0, '\'');
273 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
274 return quote_char(token->embedded,
275 token_type(token) - TOKEN_CHAR, 0, '\'');
276 case TOKEN_WIDE_CHAR:
277 return quote_char(token->string->data,
278 token->string->length - 1, 'L', '\'');
279 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
280 return quote_char(token->embedded,
281 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
282 case TOKEN_STRING:
283 return quote_char(token->string->data,
284 token->string->length - 1, 0, '"');
285 case TOKEN_WIDE_STRING:
286 return quote_char(token->string->data,
287 token->string->length - 1, 'L', '"');
288 default:
289 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
290 return buffer;
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* Head of each bucket's chain of stream numbers; -1 marks an empty bucket. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Hash the stream name @name and return a pointer to the matching
 * bucket of input_stream_hashes[].
 *
 * The bucket index is the top HASHED_INPUT_BITS bits of the 32-bit
 * hash after the final multiplication by HASH_PRIME.
 */
int *hash_stream(const char *name)
{
	const unsigned char *s = (const unsigned char *)name;
	uint32_t hash = 0;

	for (; *s != 0; s++) {
		unsigned char c = *s;

		hash = (hash + (c << 4) + (c >> 4)) * 11;
	}

	hash *= HASH_PRIME;
	return &input_stream_hashes[hash >> (32 - HASHED_INPUT_BITS)];
}
313 int init_stream(const struct position *pos, const char *name, int fd, const char **next_path)
315 int stream = input_stream_nr, *hash;
316 struct stream *current;
318 if (stream >= input_streams_allocated) {
319 int newalloc = stream * 4 / 3 + 10;
320 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
321 if (!input_streams)
322 die("Unable to allocate more streams space");
323 input_streams_allocated = newalloc;
325 current = input_streams + stream;
326 memset(current, 0, sizeof(*current));
327 current->name = name;
328 current->fd = fd;
329 current->next_path = next_path;
330 current->path = NULL;
331 current->constant = CONSTANT_FILE_MAYBE;
332 if (pos)
333 current->pos = *pos;
334 else
335 current->pos.stream = -1;
336 input_stream_nr = stream+1;
337 hash = hash_stream(name);
338 current->next_stream = *hash;
339 *hash = stream;
340 return stream;
343 static struct token * alloc_token(stream_t *stream)
345 struct token *token = __alloc_token(0);
346 token->pos = stream_pos(stream);
347 return token;
351 * Argh... That was surprisingly messy - handling '\r' complicates the
352 * things a _lot_.
354 static int nextchar_slow(stream_t *stream)
356 int offset = stream->offset;
357 int size = stream->size;
358 int c;
359 int spliced = 0, had_cr, had_backslash;
361 restart:
362 had_cr = had_backslash = 0;
364 repeat:
365 if (offset >= size) {
366 if (stream->fd < 0)
367 goto got_eof;
368 size = read(stream->fd, stream->buffer, BUFSIZE);
369 if (size <= 0)
370 goto got_eof;
371 stream->size = size;
372 stream->offset = offset = 0;
375 c = stream->buffer[offset++];
376 if (had_cr)
377 goto check_lf;
379 if (c == '\r') {
380 had_cr = 1;
381 goto repeat;
384 norm:
385 if (!had_backslash) {
386 switch (c) {
387 case '\t':
388 stream->pos += tabstop - stream->pos % tabstop;
389 break;
390 case '\n':
391 stream->line++;
392 stream->pos = 0;
393 stream->newline = 1;
394 break;
395 case '\\':
396 had_backslash = 1;
397 stream->pos++;
398 goto repeat;
399 default:
400 stream->pos++;
402 } else {
403 if (c == '\n') {
404 stream->line++;
405 stream->pos = 0;
406 spliced = 1;
407 goto restart;
409 offset--;
410 c = '\\';
412 out:
413 stream->offset = offset;
415 return c;
417 check_lf:
418 if (c != '\n')
419 offset--;
420 c = '\n';
421 goto norm;
423 got_eof:
424 if (had_backslash) {
425 c = '\\';
426 goto out;
428 if (stream->pos & Wnewline_eof)
429 warning(stream_pos(stream), "no newline at end of file");
430 else if (spliced)
431 warning(stream_pos(stream), "backslash-newline at end of file");
432 return EOF;
436 * We want that as light as possible while covering all normal cases.
437 * Slow path (including the logics with line-splicing and EOF sanity
438 * checks) is in nextchar_slow().
440 static inline int nextchar(stream_t *stream)
442 int offset = stream->offset;
444 if (offset < stream->size) {
445 int c = stream->buffer[offset++];
446 static const char special[256] = {
447 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
449 if (!special[c]) {
450 stream->offset = offset;
451 stream->pos++;
452 return c;
455 return nextchar_slow(stream);
458 struct token eof_token_entry;
460 static struct token *mark_eof(stream_t *stream)
462 struct token *end;
464 end = alloc_token(stream);
465 eof_token_entry.pos = end->pos;
466 token_type(end) = TOKEN_STREAMEND;
467 end->pos.newline = 1;
469 eof_token_entry.next = &eof_token_entry;
470 eof_token_entry.pos.newline = 1;
472 end->next = &eof_token_entry;
473 *stream->tokenlist = end;
474 stream->tokenlist = NULL;
475 return end;
478 static void add_token(stream_t *stream)
480 struct token *token = stream->token;
482 stream->token = NULL;
483 token->next = NULL;
484 *stream->tokenlist = token;
485 stream->tokenlist = &token->next;
488 static void drop_token(stream_t *stream)
490 stream->newline |= stream->token->pos.newline;
491 stream->whitespace |= stream->token->pos.whitespace;
492 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter = 1,
	Digit = 2,
	Hex = 4,
	Exp = 8,
	Dot = 16,
	ValidSecond = 32,
	Quote = 64,
};

/*
 * Class bits of every input character.  The table is indexed with
 * (character + 1) so that EOF (-1) maps to slot 0, which has no bits.
 */
static const char cclass[257] = {
	/* digits */
	['0' + 1 ... '9' + 1] = Digit | Hex,

	/* letters that are also hex digits; E/e may introduce an exponent */
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,	/* E<exp> */
	['e' + 1] = Letter | Hex | Exp,	/* e<exp> */
	['F' + 1] = Letter | Hex,
	['f' + 1] = Letter | Hex,

	/* remaining letters; P/p may introduce an exponent */
	['G' + 1 ... 'O' + 1] = Letter,
	['g' + 1 ... 'o' + 1] = Letter,
	['P' + 1] = Letter | Exp,	/* P<exp> */
	['p' + 1] = Letter | Exp,	/* p<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,
	['q' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,

	/* characters that can be the second one of a two-character punctuator */
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,

	/* quote characters */
	['\'' + 1] = Quote,
	['"' + 1] = Quote,
};
534 * pp-number:
535 * digit
536 * . digit
537 * pp-number digit
538 * pp-number identifier-nodigit
539 * pp-number e sign
540 * pp-number E sign
541 * pp-number p sign
542 * pp-number P sign
543 * pp-number .
545 static int get_one_number(int c, int next, stream_t *stream)
547 struct token *token;
548 static char buffer[4095];
549 char *p = buffer, *buffer_end = buffer + sizeof (buffer);
551 *p++ = c;
552 for (;;) {
553 long class = cclass[next + 1];
554 if (!(class & (Dot | Digit | Letter)))
555 break;
556 if (p != buffer_end)
557 *p++ = next;
558 next = nextchar(stream);
559 if (class & Exp) {
560 if (next == '-' || next == '+') {
561 if (p != buffer_end)
562 *p++ = next;
563 next = nextchar(stream);
568 if (p == buffer_end) {
569 sparse_error(stream_pos(stream), "number token exceeds %td characters",
570 buffer_end - buffer);
571 // Pretend we saw just "1".
572 buffer[0] = '1';
573 p = buffer + 1;
576 *p++ = 0;
577 token = stream->token;
578 token_type(token) = TOKEN_NUMBER;
579 token->number = xmemdup(buffer, p - buffer);
580 add_token(stream);
582 return next;
585 static int eat_string(int next, stream_t *stream, enum token_type type)
587 static char buffer[MAX_STRING];
588 struct string *string;
589 struct token *token = stream->token;
590 int len = 0;
591 int escape;
592 int want_hex = 0;
593 char delim = type < TOKEN_STRING ? '\'' : '"';
595 for (escape = 0; escape || next != delim; next = nextchar(stream)) {
596 if (len < MAX_STRING)
597 buffer[len] = next;
598 len++;
599 if (next == '\n') {
600 warning(stream_pos(stream),
601 "missing terminating %c character", delim);
602 /* assume delimiter is lost */
603 break;
605 if (next == EOF) {
606 warning(stream_pos(stream),
607 "End of file in middle of string");
608 return next;
610 if (!escape) {
611 if (want_hex && !(cclass[next + 1] & Hex))
612 warning(stream_pos(stream),
613 "\\x used with no following hex digits");
614 want_hex = 0;
615 escape = next == '\\';
616 } else {
617 escape = 0;
618 want_hex = next == 'x';
621 if (want_hex)
622 warning(stream_pos(stream),
623 "\\x used with no following hex digits");
624 if (len > MAX_STRING) {
625 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
626 len = MAX_STRING;
628 if (delim == '\'' && len && len <= 4) {
629 token_type(token) = type + len;
630 memset(buffer + len, '\0', 4 - len);
631 memcpy(token->embedded, buffer, 4);
632 } else {
633 token_type(token) = type;
634 string = __alloc_string(len+1);
635 memcpy(string->data, buffer, len);
636 string->data[len] = '\0';
637 string->length = len+1;
638 token->string = string;
641 /* Pass it on.. */
642 token = stream->token;
643 add_token(stream);
644 return nextchar(stream);
647 static int drop_stream_eoln(stream_t *stream)
649 drop_token(stream);
650 for (;;) {
651 switch (nextchar(stream)) {
652 case EOF:
653 return EOF;
654 case '\n':
655 return nextchar(stream);
660 static int drop_stream_comment(stream_t *stream)
662 int newline;
663 int next;
664 drop_token(stream);
665 newline = stream->newline;
667 next = nextchar(stream);
668 for (;;) {
669 int curr = next;
670 if (curr == EOF) {
671 warning(stream_pos(stream), "End of file in the middle of a comment");
672 return curr;
674 next = nextchar(stream);
675 if (curr == '*' && next == '/')
676 break;
678 stream->newline = newline;
679 return nextchar(stream);
/*
 * Spellings of the multi-character punctuators; show_special() indexes
 * this table with (value - SPECIAL_BASE).
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
/*
 * hash function for two-character punctuators - all give unique values
 *
 * The arguments are parenthesized so that expression arguments hash
 * correctly.  Each argument is evaluated more than once, so do not pass
 * expressions with side effects.  This stays a macro because it is used
 * in designated initializers, which need a constant expression.
 */
#define special_hash(c0, c1) \
	(((((c0) * 8) + ((c1) * 2)) + ((((c0) * 8) + ((c1) * 2)) >> 5)) & 31)

/*
 * Character pair expected in each hash slot.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
#undef RES
};
718 static int code[32] = {
719 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
720 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
721 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
722 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
723 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
724 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
725 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
726 CODE('<', '=', SPECIAL_LTE), /* 0c */
727 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
728 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
729 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
730 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
731 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
732 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
733 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
734 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
735 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
736 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
737 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
738 CODE('>', '=', SPECIAL_GTE), /* 1d */
739 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
740 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
741 #undef CODE
744 static int get_one_special(int c, stream_t *stream)
746 struct token *token;
747 int next, value, i;
749 next = nextchar(stream);
752 * Check for numbers, strings, character constants, and comments
754 switch (c) {
755 case '.':
756 if (next >= '0' && next <= '9')
757 return get_one_number(c, next, stream);
758 break;
759 case '"':
760 return eat_string(next, stream, TOKEN_STRING);
761 case '\'':
762 return eat_string(next, stream, TOKEN_CHAR);
763 case '/':
764 if (next == '/')
765 return drop_stream_eoln(stream);
766 if (next == '*')
767 return drop_stream_comment(stream);
771 * Check for combinations
773 value = c;
774 if (cclass[next + 1] & ValidSecond) {
775 i = special_hash(c, next);
776 if (hash_results[i][0] == c && hash_results[i][1] == next) {
777 value = code[i];
778 next = nextchar(stream);
779 if (value >= SPECIAL_LEFTSHIFT &&
780 next == "==."[value - SPECIAL_LEFTSHIFT]) {
781 value += 3;
782 next = nextchar(stream);
787 /* Pass it on.. */
788 token = stream->token;
789 token_type(token) = TOKEN_SPECIAL;
790 token->special = value;
791 add_token(stream);
792 return next;
/* The identifier hash table has 1 << IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: start with the first character, fold in
 * each following character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets of the identifier table; each is a chain linked through 'next'. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics, reported by show_identifier_stats(). */
static int ident_hit, ident_miss, idents;
806 void show_identifier_stats(void)
808 int i;
809 int distribution[100];
811 fprintf(stderr, "identifiers: %d hits, %d misses\n",
812 ident_hit, ident_miss);
814 for (i = 0; i < 100; i++)
815 distribution[i] = 0;
817 for (i = 0; i < IDENT_HASH_SIZE; i++) {
818 struct ident * ident = hash_table[i];
819 int count = 0;
821 while (ident) {
822 count++;
823 ident = ident->next;
825 if (count > 99)
826 count = 99;
827 distribution[count]++;
830 for (i = 0; i < 100; i++) {
831 if (distribution[i])
832 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
836 static struct ident *alloc_ident(const char *name, int len)
838 struct ident *ident = __alloc_ident(len);
839 ident->symbols = NULL;
840 ident->len = len;
841 ident->tainted = 0;
842 memcpy(ident->name, name, len);
843 return ident;
846 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
848 ident->next = hash_table[hash];
849 hash_table[hash] = ident;
850 ident_miss++;
851 return ident;
854 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
856 struct ident *ident;
857 struct ident **p;
859 p = &hash_table[hash];
860 while ((ident = *p) != NULL) {
861 if (ident->len == (unsigned char) len) {
862 if (strncmp(name, ident->name, len) != 0)
863 goto next;
865 ident_hit++;
866 return ident;
868 next:
869 //misses++;
870 p = &ident->next;
872 ident = alloc_ident(name, len);
873 *p = ident;
874 ident->next = NULL;
875 ident_miss++;
876 idents++;
877 return ident;
/*
 * Compute the identifier table bucket for the @len bytes at @name,
 * using the same incremental hash as get_one_identifier().
 *
 * An empty name (@len <= 0) hashes to bucket 0.  Without this guard the
 * '--len' loop below would start from 0, go negative and read far
 * beyond the end of @name.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0)
		return ident_hash_end(0UL);

	hash = ident_hash_init(*p++);
	while (--len) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
893 struct ident *hash_ident(struct ident *ident)
895 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string @name, creating
 * it when the identifier table does not have it yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
904 struct token *built_in_token(int stream, struct ident *ident)
906 struct token *token;
908 token = __alloc_token(0);
909 token->pos.stream = stream;
910 token_type(token) = TOKEN_IDENT;
911 token->ident = ident;
912 return token;
915 static int get_one_identifier(int c, stream_t *stream)
917 struct token *token;
918 struct ident *ident;
919 unsigned long hash;
920 char buf[256];
921 int len = 1;
922 int next;
924 hash = ident_hash_init(c);
925 buf[0] = c;
926 for (;;) {
927 next = nextchar(stream);
928 if (!(cclass[next + 1] & (Letter | Digit)))
929 break;
930 if (len >= sizeof(buf))
931 break;
932 hash = ident_hash_add(hash, next);
933 buf[len] = next;
934 len++;
936 if (cclass[next + 1] & Quote) {
937 if (len == 1 && buf[0] == 'L') {
938 if (next == '\'')
939 return eat_string(nextchar(stream), stream,
940 TOKEN_WIDE_CHAR);
941 else
942 return eat_string(nextchar(stream), stream,
943 TOKEN_WIDE_STRING);
946 hash = ident_hash_end(hash);
947 ident = create_hashed_ident(buf, len, hash);
949 /* Pass it on.. */
950 token = stream->token;
951 token_type(token) = TOKEN_IDENT;
952 token->ident = ident;
953 add_token(stream);
954 return next;
957 static int get_one_token(int c, stream_t *stream)
959 long class = cclass[c + 1];
960 if (class & Digit)
961 return get_one_number(c, nextchar(stream), stream);
962 if (class & Letter)
963 return get_one_identifier(c, stream);
964 return get_one_special(c, stream);
967 static struct token *setup_stream(stream_t *stream, int idx, int fd,
968 unsigned char *buf, unsigned int buf_size)
970 struct token *begin;
972 stream->nr = idx;
973 stream->line = 1;
974 stream->newline = 1;
975 stream->whitespace = 0;
976 stream->pos = 0;
978 stream->token = NULL;
979 stream->fd = fd;
980 stream->offset = 0;
981 stream->size = buf_size;
982 stream->buffer = buf;
984 begin = alloc_token(stream);
985 token_type(begin) = TOKEN_STREAMBEGIN;
986 stream->tokenlist = &begin->next;
987 return begin;
990 static struct token *tokenize_stream(stream_t *stream)
992 int c = nextchar(stream);
993 while (c != EOF) {
994 if (!isspace(c)) {
995 struct token *token = alloc_token(stream);
996 stream->token = token;
997 stream->newline = 0;
998 stream->whitespace = 0;
999 c = get_one_token(c, stream);
1000 continue;
1002 stream->whitespace = 1;
1003 c = nextchar(stream);
1005 return mark_eof(stream);
1008 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1010 stream_t stream;
1011 struct token *begin;
1013 begin = setup_stream(&stream, 0, -1, buffer, size);
1014 *endtoken = tokenize_stream(&stream);
1015 return begin;
1018 struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path)
1020 struct token *begin, *end;
1021 stream_t stream;
1022 unsigned char buffer[BUFSIZE];
1023 int idx;
1025 idx = init_stream(pos, name, fd, next_path);
1026 if (idx < 0) {
1027 // info(endtoken->pos, "File %s is const", name);
1028 return endtoken;
1031 begin = setup_stream(&stream, idx, fd, buffer, 0);
1032 end = tokenize_stream(&stream);
1033 if (endtoken)
1034 end->next = endtoken;
1035 return begin;