slist: introduce merge_fake_stree()
[smatch.git] / tokenize.c
blobdddc7d20325f1a0d5cf79f18ae942dd5df4a117b
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <ctype.h>
32 #include <unistd.h>
33 #include <stdint.h>
35 #include "lib.h"
36 #include "allocate.h"
37 #include "token.h"
38 #include "symbol.h"
40 #define EOF (-1)
/* Number of slots of input_streams[] in use; advanced by init_stream(). */
int input_stream_nr = 0;

/* Table of registered input streams, grown on demand by init_stream(). */
struct stream *input_streams;

/* Number of slots currently allocated for input_streams[]. */
static int input_streams_allocated;

/* Tab width used by nextchar_slow() when advancing the column on '\t'. */
unsigned int tabstop = 8;

/* When non-zero, stream_pos() reports the fixed line number 123456. */
int no_lineno = 0;
/* Size of the read buffer handed to read() in nextchar_slow(). */
#define BUFSIZE (8192)

/* Tokenizer state for one input stream. */
typedef struct {
	int fd;				/* descriptor to read from; < 0 means no refills */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* current column */
	int line;			/* current line */
	int nr;				/* stream index recorded in token positions */
	int newline;			/* pending "starts a line" flag for the next token */
	int whitespace;			/* pending "preceded by whitespace" flag */
	struct token **tokenlist;	/* where the next finished token gets linked */
	struct token *token;		/* token currently being built, or NULL */
	unsigned char *buffer;		/* input bytes */
} stream_t;
59 const char *stream_name(int stream)
61 if (stream < 0 || stream > input_stream_nr)
62 return "<bad stream>";
63 return input_streams[stream].name;
66 static struct position stream_pos(stream_t *stream)
68 struct position pos;
69 pos.type = 0;
70 pos.stream = stream->nr;
71 pos.newline = stream->newline;
72 pos.whitespace = stream->whitespace;
73 pos.pos = stream->pos;
75 pos.line = stream->line;
76 if (no_lineno)
77 pos.line = 123456;
79 pos.noexpand = 0;
80 return pos;
83 const char *show_special(int val)
85 static char buffer[4];
87 buffer[0] = val;
88 buffer[1] = 0;
89 if (val >= SPECIAL_BASE)
90 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
91 return buffer;
94 const char *show_ident(const struct ident *ident)
96 static char buffer[256];
97 if (!ident)
98 return "<noident>";
99 sprintf(buffer, "%.*s", ident->len, ident->name);
100 return buffer;
/*
 * Append the printable representation of byte 'c' at 'ptr' and return
 * the new end pointer.
 *
 * Printable bytes are copied verbatim, with a backslash in front of
 * 'escape' and of backslash itself.  Newline and tab become \n and \t.
 * Anything else becomes an octal escape, padded to three digits when
 * the following byte 'next' is a digit so the two cannot run together.
 * The octal case also writes a terminating NUL after the digits.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	if (!isprint(c)) {
		*ptr++ = '\\';
		if (c == '\n') {
			*ptr++ = 'n';
			return ptr;
		}
		if (c == '\t') {
			*ptr++ = 't';
			return ptr;
		}
		if (isdigit(next))
			return ptr + sprintf(ptr, "%03o", c);
		return ptr + sprintf(ptr, "%o", c);
	}

	if (c == escape || c == '\\')
		*ptr++ = '\\';
	*ptr++ = c;
	return ptr;
}
126 const char *show_string(const struct string *string)
128 static char buffer[4 * MAX_STRING + 3];
129 char *ptr;
130 int i;
132 if (!string->length)
133 return "<bad_string>";
134 ptr = buffer;
135 *ptr++ = '"';
136 for (i = 0; i < string->length-1; i++) {
137 const char *p = string->data + i;
138 ptr = charstr(ptr, p[0], '"', p[1]);
140 *ptr++ = '"';
141 *ptr = '\0';
142 return buffer;
145 static const char *show_char(const char *s, size_t len, char prefix, char delim)
147 static char buffer[MAX_STRING + 4];
148 char *p = buffer;
149 if (prefix)
150 *p++ = prefix;
151 *p++ = delim;
152 memcpy(p, s, len);
153 p += len;
154 *p++ = delim;
155 *p++ = '\0';
156 return buffer;
159 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
161 static char buffer[2*MAX_STRING + 6];
162 size_t i;
163 char *p = buffer;
164 if (prefix)
165 *p++ = prefix;
166 if (delim == '"')
167 *p++ = '\\';
168 *p++ = delim;
169 for (i = 0; i < len; i++) {
170 if (s[i] == '"' || s[i] == '\\')
171 *p++ = '\\';
172 *p++ = s[i];
174 if (delim == '"')
175 *p++ = '\\';
176 *p++ = delim;
177 *p++ = '\0';
178 return buffer;
181 const char *show_token(const struct token *token)
183 static char buffer[256];
185 if (!token)
186 return "<no token>";
187 switch (token_type(token)) {
188 case TOKEN_ERROR:
189 return "syntax error";
191 case TOKEN_EOF:
192 return "end-of-input";
194 case TOKEN_IDENT:
195 return show_ident(token->ident);
197 case TOKEN_NUMBER:
198 return token->number;
200 case TOKEN_SPECIAL:
201 return show_special(token->special);
203 case TOKEN_CHAR:
204 return show_char(token->string->data,
205 token->string->length - 1, 0, '\'');
206 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
207 return show_char(token->embedded,
208 token_type(token) - TOKEN_CHAR, 0, '\'');
209 case TOKEN_WIDE_CHAR:
210 return show_char(token->string->data,
211 token->string->length - 1, 'L', '\'');
212 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
213 return show_char(token->embedded,
214 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
215 case TOKEN_STRING:
216 return show_char(token->string->data,
217 token->string->length - 1, 0, '"');
218 case TOKEN_WIDE_STRING:
219 return show_char(token->string->data,
220 token->string->length - 1, 'L', '"');
222 case TOKEN_STREAMBEGIN:
223 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
224 return buffer;
226 case TOKEN_STREAMEND:
227 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
228 return buffer;
230 case TOKEN_UNTAINT:
231 sprintf(buffer, "<untaint>");
232 return buffer;
234 case TOKEN_ARG_COUNT:
235 sprintf(buffer, "<argcnt>");
236 return buffer;
238 default:
239 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
240 return buffer;
244 const char *quote_token(const struct token *token)
246 static char buffer[256];
248 switch (token_type(token)) {
249 case TOKEN_ERROR:
250 return "syntax error";
252 case TOKEN_IDENT:
253 return show_ident(token->ident);
255 case TOKEN_NUMBER:
256 return token->number;
258 case TOKEN_SPECIAL:
259 return show_special(token->special);
261 case TOKEN_CHAR:
262 return quote_char(token->string->data,
263 token->string->length - 1, 0, '\'');
264 case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
265 return quote_char(token->embedded,
266 token_type(token) - TOKEN_CHAR, 0, '\'');
267 case TOKEN_WIDE_CHAR:
268 return quote_char(token->string->data,
269 token->string->length - 1, 'L', '\'');
270 case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
271 return quote_char(token->embedded,
272 token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
273 case TOKEN_STRING:
274 return quote_char(token->string->data,
275 token->string->length - 1, 0, '"');
276 case TOKEN_WIDE_STRING:
277 return quote_char(token->string->data,
278 token->string->length - 1, 'L', '"');
279 default:
280 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
281 return buffer;
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One chain head per bucket; -1 marks an empty bucket. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Hash a stream name and return a pointer to its bucket in
 * input_stream_hashes[].  The bucket index is the top HASHED_INPUT_BITS
 * bits of the 32-bit hash.
 */
int *hash_stream(const char *name)
{
	const unsigned char *p;
	uint32_t hash = 0;

	for (p = (const unsigned char *)name; *p != 0; p++)
		hash = (hash + (*p << 4) + (*p >> 4)) * 11;

	hash *= HASH_PRIME;
	return &input_stream_hashes[hash >> (32 - HASHED_INPUT_BITS)];
}
304 int init_stream(const char *name, int fd, const char **next_path)
306 int stream = input_stream_nr, *hash;
307 struct stream *current;
309 if (stream >= input_streams_allocated) {
310 int newalloc = stream * 4 / 3 + 10;
311 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
312 if (!input_streams)
313 die("Unable to allocate more streams space");
314 input_streams_allocated = newalloc;
316 current = input_streams + stream;
317 memset(current, 0, sizeof(*current));
318 current->name = name;
319 current->fd = fd;
320 current->next_path = next_path;
321 current->path = NULL;
322 current->constant = CONSTANT_FILE_MAYBE;
323 input_stream_nr = stream+1;
324 hash = hash_stream(name);
325 current->next_stream = *hash;
326 *hash = stream;
327 return stream;
330 static struct token * alloc_token(stream_t *stream)
332 struct token *token = __alloc_token(0);
333 token->pos = stream_pos(stream);
334 return token;
338 * Argh... That was surprisingly messy - handling '\r' complicates the
339 * things a _lot_.
341 static int nextchar_slow(stream_t *stream)
343 int offset = stream->offset;
344 int size = stream->size;
345 int c;
346 int spliced = 0, had_cr, had_backslash;
348 restart:
349 had_cr = had_backslash = 0;
351 repeat:
352 if (offset >= size) {
353 if (stream->fd < 0)
354 goto got_eof;
355 size = read(stream->fd, stream->buffer, BUFSIZE);
356 if (size <= 0)
357 goto got_eof;
358 stream->size = size;
359 stream->offset = offset = 0;
362 c = stream->buffer[offset++];
363 if (had_cr)
364 goto check_lf;
366 if (c == '\r') {
367 had_cr = 1;
368 goto repeat;
371 norm:
372 if (!had_backslash) {
373 switch (c) {
374 case '\t':
375 stream->pos += tabstop - stream->pos % tabstop;
376 break;
377 case '\n':
378 stream->line++;
379 stream->pos = 0;
380 stream->newline = 1;
381 break;
382 case '\\':
383 had_backslash = 1;
384 stream->pos++;
385 goto repeat;
386 default:
387 stream->pos++;
389 } else {
390 if (c == '\n') {
391 stream->line++;
392 stream->pos = 0;
393 spliced = 1;
394 goto restart;
396 offset--;
397 c = '\\';
399 out:
400 stream->offset = offset;
402 return c;
404 check_lf:
405 if (c != '\n')
406 offset--;
407 c = '\n';
408 goto norm;
410 got_eof:
411 if (had_backslash) {
412 c = '\\';
413 goto out;
415 if (stream->pos)
416 warning(stream_pos(stream), "no newline at end of file");
417 else if (spliced)
418 warning(stream_pos(stream), "backslash-newline at end of file");
419 return EOF;
423 * We want that as light as possible while covering all normal cases.
424 * Slow path (including the logics with line-splicing and EOF sanity
425 * checks) is in nextchar_slow().
427 static inline int nextchar(stream_t *stream)
429 int offset = stream->offset;
431 if (offset < stream->size) {
432 int c = stream->buffer[offset++];
433 static const char special[256] = {
434 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
436 if (!special[c]) {
437 stream->offset = offset;
438 stream->pos++;
439 return c;
442 return nextchar_slow(stream);
445 struct token eof_token_entry;
447 static struct token *mark_eof(stream_t *stream)
449 struct token *end;
451 end = alloc_token(stream);
452 token_type(end) = TOKEN_STREAMEND;
453 end->pos.newline = 1;
455 eof_token_entry.next = &eof_token_entry;
456 eof_token_entry.pos.newline = 1;
458 end->next = &eof_token_entry;
459 *stream->tokenlist = end;
460 stream->tokenlist = NULL;
461 return end;
464 static void add_token(stream_t *stream)
466 struct token *token = stream->token;
468 stream->token = NULL;
469 token->next = NULL;
470 *stream->tokenlist = token;
471 stream->tokenlist = &token->next;
474 static void drop_token(stream_t *stream)
476 stream->newline |= stream->token->pos.newline;
477 stream->whitespace |= stream->token->pos.whitespace;
478 stream->token = NULL;
/* Character-class bits stored in cclass[]. */
enum {
	Letter		= 1 << 0,
	Digit		= 1 << 1,
	Hex		= 1 << 2,
	Exp		= 1 << 3,
	Dot		= 1 << 4,
	ValidSecond	= 1 << 5,
	Quote		= 1 << 6,
	Escape		= 1 << 7,
};

/*
 * Class bits for every input value.  The table is indexed by
 * (character + 1) so that EOF (-1) lands on slot 0, which carries no
 * class bits, like every other character not listed here.
 */
static const long cclass[257] = {
	['0' + 1 ... '7' + 1] = Digit | Hex | Escape,	/* \<octal> */
	['8' + 1 ... '9' + 1] = Digit | Hex,
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,			/* E<exp> */
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,			/* P<exp> */
	['Q' + 1 ... 'Z' + 1] = Letter,
	['a' + 1 ... 'b' + 1] = Letter | Hex | Escape,	/* \a, \b */
	['c' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp | Escape,	/* \e, e<exp> */
	['f' + 1] = Letter | Hex | Escape,		/* \f */
	['g' + 1 ... 'm' + 1] = Letter,
	['n' + 1] = Letter | Escape,			/* \n */
	['o' + 1] = Letter,
	['p' + 1] = Letter | Exp,			/* p<exp> */
	['q' + 1] = Letter,
	['r' + 1] = Letter | Escape,			/* \r */
	['s' + 1] = Letter,
	['t' + 1] = Letter | Escape,			/* \t */
	['u' + 1] = Letter,
	['v' + 1] = Letter | Escape,			/* \v */
	['w' + 1] = Letter,
	['x' + 1] = Letter | Escape,			/* \x<hex> */
	['y' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
	['\'' + 1] = Quote | Escape,
	['"' + 1] = Quote | Escape,
	['\\' + 1] = Escape,
	['?' + 1] = Escape,
};
535 * pp-number:
536 * digit
537 * . digit
538 * pp-number digit
539 * pp-number identifier-nodigit
540 * pp-number e sign
541 * pp-number E sign
542 * pp-number p sign
543 * pp-number P sign
544 * pp-number .
546 static int get_one_number(int c, int next, stream_t *stream)
548 struct token *token;
549 static char buffer[4095];
550 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
551 int len;
553 *p++ = c;
554 for (;;) {
555 long class = cclass[next + 1];
556 if (!(class & (Dot | Digit | Letter)))
557 break;
558 if (p != buffer_end)
559 *p++ = next;
560 next = nextchar(stream);
561 if (class & Exp) {
562 if (next == '-' || next == '+') {
563 if (p != buffer_end)
564 *p++ = next;
565 next = nextchar(stream);
570 if (p == buffer_end) {
571 sparse_error(stream_pos(stream), "number token exceeds %td characters",
572 buffer_end - buffer);
573 // Pretend we saw just "1".
574 buffer[0] = '1';
575 p = buffer + 1;
578 *p++ = 0;
579 len = p - buffer;
580 buf = __alloc_bytes(len);
581 memcpy(buf, buffer, len);
583 token = stream->token;
584 token_type(token) = TOKEN_NUMBER;
585 token->number = buf;
586 add_token(stream);
588 return next;
591 static int eat_string(int next, stream_t *stream, enum token_type type)
593 static char buffer[MAX_STRING];
594 struct string *string;
595 struct token *token = stream->token;
596 int len = 0;
597 int escape;
598 int want_hex = 0;
599 char delim = type < TOKEN_STRING ? '\'' : '"';
601 for (escape = 0; escape || next != delim; next = nextchar(stream)) {
602 if (len < MAX_STRING)
603 buffer[len] = next;
604 len++;
605 if (next == '\n') {
606 warning(stream_pos(stream),
607 "Newline in string or character constant");
608 if (delim == '\'') /* assume it's lost ' */
609 break;
611 if (next == EOF) {
612 warning(stream_pos(stream),
613 "End of file in middle of string");
614 return next;
616 if (!escape) {
617 if (want_hex && !(cclass[next + 1] & Hex))
618 warning(stream_pos(stream),
619 "\\x used with no following hex digits");
620 want_hex = 0;
621 escape = next == '\\';
622 } else {
623 if (!(cclass[next + 1] & Escape))
624 warning(stream_pos(stream),
625 "Unknown escape '%c'", next);
626 escape = 0;
627 want_hex = next == 'x';
630 if (want_hex)
631 warning(stream_pos(stream),
632 "\\x used with no following hex digits");
633 if (len > MAX_STRING) {
634 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
635 len = MAX_STRING;
637 if (delim == '\'' && len <= 4) {
638 if (len == 0) {
639 sparse_error(stream_pos(stream),
640 "empty character constant");
641 return nextchar(stream);
643 token_type(token) = type + len;
644 memset(buffer + len, '\0', 4 - len);
645 memcpy(token->embedded, buffer, 4);
646 } else {
647 token_type(token) = type;
648 string = __alloc_string(len+1);
649 memcpy(string->data, buffer, len);
650 string->data[len] = '\0';
651 string->length = len+1;
652 token->string = string;
655 /* Pass it on.. */
656 token = stream->token;
657 add_token(stream);
658 return nextchar(stream);
661 static int drop_stream_eoln(stream_t *stream)
663 drop_token(stream);
664 for (;;) {
665 switch (nextchar(stream)) {
666 case EOF:
667 return EOF;
668 case '\n':
669 return nextchar(stream);
674 static int drop_stream_comment(stream_t *stream)
676 int newline;
677 int next;
678 drop_token(stream);
679 newline = stream->newline;
681 next = nextchar(stream);
682 for (;;) {
683 int curr = next;
684 if (curr == EOF) {
685 warning(stream_pos(stream), "End of file in the middle of a comment");
686 return curr;
688 next = nextchar(stream);
689 if (curr == '*' && next == '/')
690 break;
692 stream->newline = newline;
693 return nextchar(stream);
696 unsigned char combinations[][4] = COMBINATION_STRINGS;
698 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
/*
 * Hash function for two-character punctuators - all give unique values
 * in the range 0..31.
 *
 * Every argument is parenthesised so that expression arguments such as
 * special_hash(a + 1, b) are hashed as a unit.  Each argument is still
 * evaluated twice, so do not pass expressions with side effects.
 */
#define special_hash(c0, c1) \
	(((((c0) * 8) + ((c1) * 2)) + ((((c0) * 8) + ((c1) * 2)) >> 5)) & 31)

/*
 * The character pair that owns each hash slot; a lookup only counts
 * when both characters match.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static const unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>')  /* 1f */
#undef RES
};
732 static int code[32] = {
733 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
734 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
735 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
736 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
737 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
738 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
739 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
740 CODE('<', '=', SPECIAL_LTE), /* 0c */
741 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
742 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
743 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
744 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
745 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
746 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
747 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
748 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
749 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
750 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
751 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
752 CODE('>', '=', SPECIAL_GTE), /* 1d */
753 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
754 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
755 #undef CODE
758 static int get_one_special(int c, stream_t *stream)
760 struct token *token;
761 int next, value, i;
763 next = nextchar(stream);
766 * Check for numbers, strings, character constants, and comments
768 switch (c) {
769 case '.':
770 if (next >= '0' && next <= '9')
771 return get_one_number(c, next, stream);
772 break;
773 case '"':
774 return eat_string(next, stream, TOKEN_STRING);
775 case '\'':
776 return eat_string(next, stream, TOKEN_CHAR);
777 case '/':
778 if (next == '/')
779 return drop_stream_eoln(stream);
780 if (next == '*')
781 return drop_stream_comment(stream);
785 * Check for combinations
787 value = c;
788 if (cclass[next + 1] & ValidSecond) {
789 i = special_hash(c, next);
790 if (hash_results[i][0] == c && hash_results[i][1] == next) {
791 value = code[i];
792 next = nextchar(stream);
793 if (value >= SPECIAL_LEFTSHIFT &&
794 next == "==."[value - SPECIAL_LEFTSHIFT]) {
795 value += 3;
796 next = nextchar(stream);
801 /* Pass it on.. */
802 token = stream->token;
803 token_type(token) = TOKEN_SPECIAL;
804 token->special = value;
805 add_token(stream);
806 return next;
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed it with the first character, add
 * the following ones one at a time, then fold the result down to a
 * bucket index below IDENT_HASH_SIZE.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads of the identifier hash table. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Counters reported or maintained by the identifier lookup code. */
static int ident_hit;
static int ident_miss;
static int idents;
820 void show_identifier_stats(void)
822 int i;
823 int distribution[100];
825 fprintf(stderr, "identifiers: %d hits, %d misses\n",
826 ident_hit, ident_miss);
828 for (i = 0; i < 100; i++)
829 distribution[i] = 0;
831 for (i = 0; i < IDENT_HASH_SIZE; i++) {
832 struct ident * ident = hash_table[i];
833 int count = 0;
835 while (ident) {
836 count++;
837 ident = ident->next;
839 if (count > 99)
840 count = 99;
841 distribution[count]++;
844 for (i = 0; i < 100; i++) {
845 if (distribution[i])
846 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
850 static struct ident *alloc_ident(const char *name, int len)
852 struct ident *ident = __alloc_ident(len);
853 ident->symbols = NULL;
854 ident->len = len;
855 ident->tainted = 0;
856 memcpy(ident->name, name, len);
857 return ident;
860 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
862 ident->next = hash_table[hash];
863 hash_table[hash] = ident;
864 ident_miss++;
865 return ident;
868 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
870 struct ident *ident;
871 struct ident **p;
873 p = &hash_table[hash];
874 while ((ident = *p) != NULL) {
875 if (ident->len == (unsigned char) len) {
876 if (strncmp(name, ident->name, len) != 0)
877 goto next;
879 ident_hit++;
880 return ident;
882 next:
883 //misses++;
884 p = &ident->next;
886 ident = alloc_ident(name, len);
887 *p = ident;
888 ident->next = NULL;
889 ident_miss++;
890 idents++;
891 return ident;
/*
 * Compute the hash bucket for the first 'len' bytes of 'name', using
 * the same ident_hash_*() steps as the tokenizer's identifier scanner.
 *
 * An empty name (len <= 0) hashes like a zero seed and no byte is read.
 * Without this guard the "--len" countdown would start below zero and
 * walk far past the end of the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0UL));

	hash = ident_hash_init(*p++);
	while (--len > 0) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
907 struct ident *hash_ident(struct ident *ident)
909 return insert_hash(ident, hash_name(ident->name, ident->len));
/*
 * Return the identifier for the NUL-terminated string 'name', creating
 * it if it is not in the hash table yet.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
918 struct token *built_in_token(int stream, const char *name)
920 struct token *token;
922 token = __alloc_token(0);
923 token->pos.stream = stream;
924 token_type(token) = TOKEN_IDENT;
925 token->ident = built_in_ident(name);
926 return token;
929 static int get_one_identifier(int c, stream_t *stream)
931 struct token *token;
932 struct ident *ident;
933 unsigned long hash;
934 char buf[256];
935 int len = 1;
936 int next;
938 hash = ident_hash_init(c);
939 buf[0] = c;
940 for (;;) {
941 next = nextchar(stream);
942 if (!(cclass[next + 1] & (Letter | Digit)))
943 break;
944 if (len >= sizeof(buf))
945 break;
946 hash = ident_hash_add(hash, next);
947 buf[len] = next;
948 len++;
950 if (cclass[next + 1] & Quote) {
951 if (len == 1 && buf[0] == 'L') {
952 if (next == '\'')
953 return eat_string(nextchar(stream), stream,
954 TOKEN_WIDE_CHAR);
955 else
956 return eat_string(nextchar(stream), stream,
957 TOKEN_WIDE_STRING);
960 hash = ident_hash_end(hash);
961 ident = create_hashed_ident(buf, len, hash);
963 /* Pass it on.. */
964 token = stream->token;
965 token_type(token) = TOKEN_IDENT;
966 token->ident = ident;
967 add_token(stream);
968 return next;
971 static int get_one_token(int c, stream_t *stream)
973 long class = cclass[c + 1];
974 if (class & Digit)
975 return get_one_number(c, nextchar(stream), stream);
976 if (class & Letter)
977 return get_one_identifier(c, stream);
978 return get_one_special(c, stream);
981 static struct token *setup_stream(stream_t *stream, int idx, int fd,
982 unsigned char *buf, unsigned int buf_size)
984 struct token *begin;
986 stream->nr = idx;
987 stream->line = 1;
988 stream->newline = 1;
989 stream->whitespace = 0;
990 stream->pos = 0;
992 stream->token = NULL;
993 stream->fd = fd;
994 stream->offset = 0;
995 stream->size = buf_size;
996 stream->buffer = buf;
998 begin = alloc_token(stream);
999 token_type(begin) = TOKEN_STREAMBEGIN;
1000 stream->tokenlist = &begin->next;
1001 return begin;
1004 static struct token *tokenize_stream(stream_t *stream)
1006 int c = nextchar(stream);
1007 while (c != EOF) {
1008 if (!isspace(c)) {
1009 struct token *token = alloc_token(stream);
1010 stream->token = token;
1011 stream->newline = 0;
1012 stream->whitespace = 0;
1013 c = get_one_token(c, stream);
1014 continue;
1016 stream->whitespace = 1;
1017 c = nextchar(stream);
1019 return mark_eof(stream);
1022 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1024 stream_t stream;
1025 struct token *begin;
1027 begin = setup_stream(&stream, 0, -1, buffer, size);
1028 *endtoken = tokenize_stream(&stream);
1029 return begin;
1032 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1034 struct token *begin, *end;
1035 stream_t stream;
1036 unsigned char buffer[BUFSIZE];
1037 int idx;
1039 idx = init_stream(name, fd, next_path);
1040 if (idx < 0) {
1041 // info(endtoken->pos, "File %s is const", name);
1042 return endtoken;
1045 begin = setup_stream(&stream, idx, fd, buffer, 0);
1046 end = tokenize_stream(&stream);
1047 if (endtoken)
1048 end->next = endtoken;
1049 return begin;