tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17
  18 #include "lib.h"
  19 #include "allocate.h"
  20 #include "token.h"
  21 #include "symbol.h"
  22
/*
 * End-of-input marker returned by the character readers below.
 * NOTE(review): <stdio.h> is included above and normally defines EOF
 * too; confirm the two definitions agree on the target libc.
 */
#define EOF (-1)

/* Number of stream slots handed out so far by init_stream(). */
int input_stream_nr = 0;
/* Array of stream records, grown with realloc() in init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams currently has room for. */
static int input_streams_allocated;
/* Tab width used by nextchar_slow() when advancing the column. */
unsigned int tabstop = 8;

/* Byte count passed to read() when refilling a stream buffer. */
#define BUFSIZE (8192)

/* Per-input scanning state used by the tokenizer functions in this file. */
typedef struct {
        /*
         * fd: descriptor to read() from (negative: never refill);
         * offset: index of the next unread byte in buffer;
         * size: number of valid bytes in buffer.
         */
        int fd, offset, size;
        /* pos: current column; line: current line; nr: stream index. */
        int pos, line, nr;
        /* Flags copied into each token's position by stream_pos(). */
        int newline, whitespace;
        /* Where the next finished token gets linked in (see add_token()). */
        struct token **tokenlist;
        /* Token currently being built, or NULL. */
        struct token *token;
        /* Input bytes being scanned. */
        unsigned char *buffer;
} stream_t;
  40
  41 const char *stream_name(int stream)
  42 {
  43         if (stream < 0 || stream > input_stream_nr)
  44                 return "<bad stream>";
  45         return input_streams[stream].name;
  46 }
  47
  48 static struct position stream_pos(stream_t *stream)
  49 {
  50         struct position pos;
  51         pos.type = 0;
  52         pos.stream = stream->nr;
  53         pos.newline = stream->newline;
  54         pos.whitespace = stream->whitespace;
  55         pos.pos = stream->pos;
  56         pos.line = stream->line;
  57         pos.noexpand = 0;
  58         return pos;
  59 }
  60
  61 const char *show_special(int val)
  62 {
  63         static char buffer[4];
  64
  65         buffer[0] = val;
  66         buffer[1] = 0;
  67         if (val >= SPECIAL_BASE)
  68                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  69         return buffer;
  70 }
  71
  72 const char *show_ident(const struct ident *ident)
  73 {
  74         static char buffer[256];
  75         if (!ident)
  76                 return "<noident>";
  77         sprintf(buffer, "%.*s", ident->len, ident->name);
  78         return buffer;
  79 }
  80
  81 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  82 {
  83         if (isprint(c)) {
  84                 if (c == escape || c == '\\')
  85                         *ptr++ = '\\';
  86                 *ptr++ = c;
  87                 return ptr;
  88         }
  89         *ptr++ = '\\';
  90         switch (c) {
  91         case '\n':
  92                 *ptr++ = 'n';
  93                 return ptr;
  94         case '\t':
  95                 *ptr++ = 't';
  96                 return ptr;
  97         }
  98         if (!isdigit(next))
  99                 return ptr + sprintf(ptr, "%o", c);
 100
 101         return ptr + sprintf(ptr, "%03o", c);
 102 }
 103
 104 const char *show_string(const struct string *string)
 105 {
 106         static char buffer[4 * MAX_STRING + 3];
 107         char *ptr;
 108         int i;
 109
 110         if (!string->length)
 111                 return "<bad_string>";
 112         ptr = buffer;
 113         *ptr++ = '"';
 114         for (i = 0; i < string->length-1; i++) {
 115                 const char *p = string->data + i;
 116                 ptr = charstr(ptr, p[0], '"', p[1]);
 117         }
 118         *ptr++ = '"';
 119         *ptr = '\0';
 120         return buffer;
 121 }
 122
 123 const char *show_token(const struct token *token)
 124 {
 125         static char buffer[256];
 126
 127         if (!token)
 128                 return "<no token>";
 129         switch (token_type(token)) {
 130         case TOKEN_ERROR:
 131                 return "syntax error";
 132
 133         case TOKEN_EOF:
 134                 return "end-of-input";
 135
 136         case TOKEN_IDENT:
 137                 return show_ident(token->ident);
 138
 139         case TOKEN_STRING:
 140         case TOKEN_WIDE_STRING:
 141                 return show_string(token->string);
 142
 143         case TOKEN_NUMBER:
 144                 return token->number;
 145
 146         case TOKEN_SPECIAL:
 147                 return show_special(token->special);
 148
 149         case TOKEN_CHAR:
 150         case TOKEN_WIDE_CHAR: {
 151                 char *ptr = buffer;
 152                 int c = token->character;
 153                 *ptr++ = '\'';
 154                 ptr = charstr(ptr, c, '\'', 0);
 155                 *ptr++ = '\'';
 156                 *ptr++ = '\0';
 157                 return buffer;
 158         }
 159
 160         case TOKEN_STREAMBEGIN:
 161                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 162                 return buffer;
 163
 164         case TOKEN_STREAMEND:
 165                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 166                 return buffer;
 167
 168         case TOKEN_UNTAINT:
 169                 sprintf(buffer, "<untaint>");
 170                 return buffer;
 171
 172         case TOKEN_ARG_COUNT:
 173                 sprintf(buffer, "<argcnt>");
 174                 return buffer;
 175
 176         default:
 177                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 178                 return buffer;
 179         }
 180 }
 181
 182 int init_stream(const char *name, int fd, const char **next_path)
 183 {
 184         int stream = input_stream_nr;
 185         struct stream *current;
 186
 187         if (stream >= input_streams_allocated) {
 188                 int newalloc = stream * 4 / 3 + 10;
 189                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 190                 if (!input_streams)
 191                         die("Unable to allocate more streams space");
 192                 input_streams_allocated = newalloc;
 193         }
 194         current = input_streams + stream;
 195         memset(current, 0, sizeof(*current));
 196         current->name = name;
 197         current->fd = fd;
 198         current->next_path = next_path;
 199         current->path = NULL;
 200         current->constant = CONSTANT_FILE_MAYBE;
 201         input_stream_nr = stream+1;
 202         return stream;
 203 }
 204
 205 static struct token * alloc_token(stream_t *stream)
 206 {
 207         struct token *token = __alloc_token(0);
 208         token->pos = stream_pos(stream);
 209         return token;
 210 }
 211
/*
 * Slow-path character reader, called by nextchar() for everything its
 * fast path declines: tab, CR, LF, backslash, and an exhausted buffer.
 *
 * In one pass it handles:
 *  - refilling stream->buffer with read() once the buffer is used up
 *    (only when stream->fd is non-negative);
 *  - skipping '\r', and flagging a '\r' that is not followed by '\n';
 *  - column/line accounting (a tab advances to the next tabstop multiple);
 *  - backslash-newline splicing: the pair is dropped and reading restarts;
 *  - end-of-input diagnostics.
 *
 * Returns the next character, or EOF when no more input is available.
 * Handling '\r' is what makes this surprisingly messy.
 */
static int nextchar_slow(stream_t *stream)
{
        int offset = stream->offset;
        int size = stream->size;
        int c;
        int spliced = 0, had_cr, had_backslash, complain;

restart:
        /* Fresh start, also re-entered after each spliced backslash-newline. */
        had_cr = had_backslash = complain = 0;

repeat:
        if (offset >= size) {
                /* Buffer exhausted: refill from the descriptor if there is one. */
                if (stream->fd < 0)
                        goto got_eof;
                size = read(stream->fd, stream->buffer, BUFSIZE);
                if (size <= 0)
                        goto got_eof;
                stream->size = size;
                stream->offset = offset = 0;
        }

        c = stream->buffer[offset++];

        /* A CR must be immediately followed by LF; anything else is reported. */
        if (had_cr && c != '\n')
                complain = 1;

        if (c == '\r') {
                /* Swallow the CR and look at what follows it. */
                had_cr = 1;
                goto repeat;
        }

        /* Advance the column: a tab jumps to the next multiple of tabstop. */
        stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;

        if (c == '\n') {
                stream->line++;
                stream->pos = 0;
        }

        if (!had_backslash) {
                if (c == '\\') {
                        /* Might start a line splice: peek at the next character. */
                        had_backslash = 1;
                        goto repeat;
                }
                if (c == '\n')
                        stream->newline = 1;
        } else {
                if (c == '\n') {
                        /* Backslash-newline: drop both and start over. */
                        if (complain)
                                warning(stream_pos(stream), "non-ASCII data stream");
                        spliced = 1;
                        goto restart;
                }
                /*
                 * Plain backslash: un-read the character that followed it
                 * and hand back the backslash itself.
                 */
                stream->pos--;
                offset--;
                c = '\\';
        }

out:
        stream->offset = offset;
        if (complain)
                warning(stream_pos(stream), "non-ASCII data stream");

        return c;

got_eof:
        if (had_backslash) {
                /* Input ended right after a backslash: still deliver it. */
                c = '\\';
                goto out;
        }
        /* A non-zero column means the last line had no terminating newline. */
        if (stream->pos)
                warning(stream_pos(stream), "no newline at end of file");
        else if (had_cr)
                warning(stream_pos(stream), "non-ASCII data stream");
        else if (spliced)
                warning(stream_pos(stream), "backslash-newline at end of file");
        return EOF;
}
 293
 294 /*
 295  *  We want that as light as possible while covering all normal cases.
 296  *  Slow path (including the logics with line-splicing and EOF sanity
 297  *  checks) is in nextchar_slow().
 298  */
 299 static inline int nextchar(stream_t *stream)
 300 {
 301         int offset = stream->offset;
 302
 303         if (offset < stream->size) {
 304                 int c = stream->buffer[offset++];
 305                 static const char special[256] = {
 306                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 307                 };
 308                 if (!special[c]) {
 309                         stream->offset = offset;
 310                         stream->pos++;
 311                         return c;
 312                 }
 313         }
 314         return nextchar_slow(stream);
 315 }
 316
 317 struct token eof_token_entry;
 318
 319 static struct token *mark_eof(stream_t *stream)
 320 {
 321         struct token *end;
 322
 323         end = alloc_token(stream);
 324         token_type(end) = TOKEN_STREAMEND;
 325         end->pos.newline = 1;
 326
 327         eof_token_entry.next = &eof_token_entry;
 328         eof_token_entry.pos.newline = 1;
 329
 330         end->next =  &eof_token_entry;
 331         *stream->tokenlist = end;
 332         stream->tokenlist = NULL;
 333         return end;
 334 }
 335
 336 static void add_token(stream_t *stream)
 337 {
 338         struct token *token = stream->token;
 339
 340         stream->token = NULL;
 341         token->next = NULL;
 342         *stream->tokenlist = token;
 343         stream->tokenlist = &token->next;
 344 }
 345
 346 static void drop_token(stream_t *stream)
 347 {
 348         stream->newline |= stream->token->pos.newline;
 349         stream->whitespace |= stream->token->pos.whitespace;
 350         stream->token = NULL;
 351 }
 352
/* Character class bits stored in cclass[]. */
enum {
        Letter = 1,             /* may appear in an identifier */
        Digit = 2,              /* decimal digit */
        Hex = 4,                /* hexadecimal digit */
        Exp = 8,                /* e/E/p/P: a sign directly after it stays in a number */
        Dot = 16,               /* '.' */
        ValidSecond = 32,       /* can be the second character of a punctuator */
};

/*
 * Class bits per character.  The table is indexed by (character + 1)
 * so that EOF (-1) lands on entry 0, which has no bits set; hence 257
 * entries.
 */
static const long cclass[257] = {
        ['0' + 1 ... '9' + 1] = Digit | Hex,
        ['A' + 1 ... 'D' + 1] = Letter | Hex,
        ['E' + 1] = Letter | Hex | Exp,
        ['F' + 1] = Letter | Hex,
        ['G' + 1 ... 'O' + 1] = Letter,
        ['P' + 1] = Letter | Exp,
        ['Q' + 1 ... 'Z' + 1] = Letter,
        ['a' + 1 ... 'd' + 1] = Letter | Hex,
        ['e' + 1] = Letter | Hex | Exp,
        ['f' + 1] = Letter | Hex,
        ['g' + 1 ... 'o' + 1] = Letter,
        ['p' + 1] = Letter | Exp,
        ['q' + 1 ... 'z' + 1] = Letter,
        ['_' + 1] = Letter,
        ['.' + 1] = Dot | ValidSecond,
        ['=' + 1] = ValidSecond,
        ['+' + 1] = ValidSecond,
        ['-' + 1] = ValidSecond,
        ['>' + 1] = ValidSecond,
        ['<' + 1] = ValidSecond,
        ['&' + 1] = ValidSecond,
        ['|' + 1] = ValidSecond,
        ['#' + 1] = ValidSecond,
};
 387
 388 /*
 389  * pp-number:
 390  *      digit
 391  *      . digit
 392  *      pp-number digit
 393  *      pp-number identifier-nodigit
 394  *      pp-number e sign
 395  *      pp-number E sign
 396  *      pp-number p sign
 397  *      pp-number P sign
 398  *      pp-number .
 399  */
 400 static int get_one_number(int c, int next, stream_t *stream)
 401 {
 402         struct token *token;
 403         static char buffer[4095];
 404         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 405         int len;
 406
 407         *p++ = c;
 408         for (;;) {
 409                 long class =  cclass[next + 1];
 410                 if (!(class & (Dot | Digit | Letter)))
 411                         break;
 412                 if (p != buffer_end)
 413                         *p++ = next;
 414                 next = nextchar(stream);
 415                 if (class & Exp) {
 416                         if (next == '-' || next == '+') {
 417                                 if (p != buffer_end)
 418                                         *p++ = next;
 419                                 next = nextchar(stream);
 420                         }
 421                 }
 422         }
 423
 424         if (p == buffer_end) {
 425                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 426                       buffer_end - buffer);
 427                 // Pretend we saw just "1".
 428                 buffer[0] = '1';
 429                 p = buffer + 1;
 430         }
 431
 432         *p++ = 0;
 433         len = p - buffer;
 434         buf = __alloc_bytes(len);
 435         memcpy(buf, buffer, len);
 436
 437         token = stream->token;
 438         token_type(token) = TOKEN_NUMBER;
 439         token->number = buf;
 440         add_token(stream);
 441
 442         return next;
 443 }
 444
/*
 * Decode one (possibly backslash-escaped) character of a string or
 * character constant.
 *
 * first  - the character the caller has already read
 * type   - the delimiter of the enclosing literal; an escaped delimiter
 *          skips the escape table and is kept as-is
 * stream - input to pull further characters from
 * valp   - receives the decoded value; bit 0x100 is set when the value
 *          came from a backslash escape, so an escaped quote never
 *          compares equal to the bare closing quote
 *
 * Returns the lookahead character following what was consumed.
 */
static int escapechar(int first, int type, stream_t *stream, int *valp)
{
        int next, value;

        next = nextchar(stream);
        value = first;

        if (first == '\n')
                warning(stream_pos(stream), "Newline in string or character constant");

        /* A backslash at the very end of input is returned unchanged. */
        if (first == '\\' && next != EOF) {
                value = next;
                next = nextchar(stream);
                if (value != type) {
                        switch (value) {
                        case 'a':
                                value = '\a';
                                break;
                        case 'b':
                                value = '\b';
                                break;
                        case 't':
                                value = '\t';
                                break;
                        case 'n':
                                value = '\n';
                                break;
                        case 'v':
                                value = '\v';
                                break;
                        case 'f':
                                value = '\f';
                                break;
                        case 'r':
                                value = '\r';
                                break;
                        /* '\e' is not a standard C escape; it is accepted here. */
                        case 'e':
                                value = '\e';
                                break;
                        /* These escapes stand for the character itself. */
                        case '\\':
                                break;
                        case '?':
                                break;
                        case '\'':
                                break;
                        case '"':
                                break;
                        case '\n':
                                warning(stream_pos(stream), "Newline in string or character constant");
                                break;
                        /* Octal escape: this digit plus at most two more. */
                        case '0'...'7': {
                                int nr = 2;
                                value -= '0';
                                while (next >= '0' && next <= '7') {
                                        value = (value << 3) + (next-'0');
                                        next = nextchar(stream);
                                        if (!--nr)
                                                break;
                                }
                                value &= 0xff;
                                break;
                        }
                        /*
                         * Hex escape: any number of hex digits, of which only
                         * the low 8 bits are kept.  An 'x' with no hex digit
                         * after it falls through to the unknown-escape warning.
                         */
                        case 'x': {
                                int hex = hexval(next);
                                if (hex < 16) {
                                        value = hex;
                                        next = nextchar(stream);
                                        while ((hex = hexval(next)) < 16) {
                                                value = (value << 4) + hex;
                                                next = nextchar(stream);
                                        }
                                        value &= 0xff;
                                        break;
                                }
                        }
                        /* Fall through */
                        default:
                                warning(stream_pos(stream), "Unknown escape '%c'", value);
                        }
                }
                /* Mark it as escaped */
                value |= 0x100;
        }
        *valp = value;
        return next;
}
 531
 532 static int get_char_token(int next, stream_t *stream, enum token_type type)
 533 {
 534         int value;
 535         struct token *token;
 536
 537         next = escapechar(next, '\'', stream, &value);
 538         if (value == '\'' || next != '\'') {
 539                 sparse_error(stream_pos(stream), "Bad character constant");
 540                 drop_token(stream);
 541                 return next;
 542         }
 543
 544         token = stream->token;
 545         token_type(token) = type;
 546         token->character = value & 0xff;
 547
 548         add_token(stream);
 549         return nextchar(stream);
 550 }
 551
 552 static int get_string_token(int next, stream_t *stream, enum token_type type)
 553 {
 554         static char buffer[MAX_STRING];
 555         struct string *string;
 556         struct token *token;
 557         int len = 0;
 558
 559         for (;;) {
 560                 int val;
 561                 next = escapechar(next, '"', stream, &val);
 562                 if (val == '"')
 563                         break;
 564                 if (next == EOF) {
 565                         warning(stream_pos(stream), "End of file in middle of string");
 566                         return next;
 567                 }
 568                 if (len < MAX_STRING)
 569                         buffer[len] = val;
 570                 len++;
 571         }
 572
 573         if (len > MAX_STRING) {
 574                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 575                 len = MAX_STRING;
 576         }
 577
 578         string = __alloc_string(len+1);
 579         memcpy(string->data, buffer, len);
 580         string->data[len] = '\0';
 581         string->length = len+1;
 582
 583         /* Pass it on.. */
 584         token = stream->token;
 585         token_type(token) = type;
 586         token->string = string;
 587         add_token(stream);
 588
 589         return next;
 590 }
 591
 592 static int drop_stream_eoln(stream_t *stream)
 593 {
 594         drop_token(stream);
 595         for (;;) {
 596                 switch (nextchar(stream)) {
 597                 case EOF:
 598                         return EOF;
 599                 case '\n':
 600                         return nextchar(stream);
 601                 }
 602         }
 603 }
 604
 605 static int drop_stream_comment(stream_t *stream)
 606 {
 607         int newline;
 608         int next;
 609         drop_token(stream);
 610         newline = stream->newline;
 611
 612         next = nextchar(stream);
 613         for (;;) {
 614                 int curr = next;
 615                 if (curr == EOF) {
 616                         warning(stream_pos(stream), "End of file in the middle of a comment");
 617                         return curr;
 618                 }
 619                 next = nextchar(stream);
 620                 if (curr == '*' && next == '/')
 621                         break;
 622         }
 623         stream->newline = newline;
 624         return nextchar(stream);
 625 }
 626
/* Spellings of the multi-character punctuators, indexed by (value - SPECIAL_BASE). */
unsigned char combinations[][4] = COMBINATION_STRINGS;

#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)

/* hash function for two-character punctuators - all give unique values */
#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)

/*
 * For each hash slot, the two characters that legitimately hash there;
 * get_one_special() compares against this to confirm a match.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
        RES('+', '='), /* 00 */
        RES('/', '='), /* 01 */
        RES('^', '='), /* 05 */
        RES('&', '&'), /* 07 */
        RES('#', '#'), /* 08 */
        RES('<', '<'), /* 0a */
        RES('<', '='), /* 0c */
        RES('!', '='), /* 0e */
        RES('%', '='), /* 0f */
        RES('-', '-'), /* 10 */
        RES('-', '='), /* 11 */
        RES('-', '>'), /* 13 */
        RES('=', '='), /* 15 */
        RES('&', '='), /* 17 */
        RES('*', '='), /* 18 */
        RES('.', '.'), /* 1a */
        RES('+', '+'), /* 1b */
        RES('|', '='), /* 1c */
        RES('>', '='), /* 1d */
        RES('|', '|'), /* 1e */
        RES('>', '>')  /* 1f */
#undef RES
};
/* SPECIAL_* token value for each hash slot, parallel to hash_results[]. */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
        CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
        CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
        CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
        CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
        CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
        CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
        CODE('<', '=', SPECIAL_LTE), /* 0c */
        CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
        CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
        CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
        CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
        CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
        CODE('=', '=', SPECIAL_EQUAL), /* 15 */
        CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
        CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
        CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
        CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
        CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
        CODE('>', '=', SPECIAL_GTE), /* 1d */
        CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
        CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
 688
 689 static int get_one_special(int c, stream_t *stream)
 690 {
 691         struct token *token;
 692         int next, value, i;
 693
 694         next = nextchar(stream);
 695
 696         /*
 697          * Check for numbers, strings, character constants, and comments
 698          */
 699         switch (c) {
 700         case '.':
 701                 if (next >= '0' && next <= '9')
 702                         return get_one_number(c, next, stream);
 703                 break;
 704         case '"':
 705                 return get_string_token(next, stream, TOKEN_STRING);
 706         case '\'':
 707                 return get_char_token(next, stream, TOKEN_CHAR);
 708         case '/':
 709                 if (next == '/')
 710                         return drop_stream_eoln(stream);
 711                 if (next == '*')
 712                         return drop_stream_comment(stream);
 713         }
 714
 715         /*
 716          * Check for combinations
 717          */
 718         value = c;
 719         if (cclass[next + 1] & ValidSecond) {
 720                 i = special_hash(c, next);
 721                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 722                         value = code[i];
 723                         next = nextchar(stream);
 724                         if (value >= SPECIAL_LEFTSHIFT &&
 725                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 726                                 value += 3;
 727                                 next = nextchar(stream);
 728                         }
 729                 }
 730         }
 731
 732         /* Pass it on.. */
 733         token = stream->token;
 734         token_type(token) = TOKEN_SPECIAL;
 735         token->special = value;
 736         add_token(stream);
 737         return next;
 738 }
 739
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, mix in
 * each following character, then fold the result down to a bucket index.
 */
#define ident_hash_init(c)              (c)
#define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
#define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets of identifiers, chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics: hits, misses, and identifiers created by lookup. */
static int ident_hit, ident_miss, idents;
 750
 751 void show_identifier_stats(void)
 752 {
 753         int i;
 754         int distribution[100];
 755
 756         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 757                 ident_hit, ident_miss);
 758
 759         for (i = 0; i < 100; i++)
 760                 distribution[i] = 0;
 761
 762         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 763                 struct ident * ident = hash_table[i];
 764                 int count = 0;
 765
 766                 while (ident) {
 767                         count++;
 768                         ident = ident->next;
 769                 }
 770                 if (count > 99)
 771                         count = 99;
 772                 distribution[count]++;
 773         }
 774
 775         for (i = 0; i < 100; i++) {
 776                 if (distribution[i])
 777                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 778         }
 779 }
 780
 781 static struct ident *alloc_ident(const char *name, int len)
 782 {
 783         struct ident *ident = __alloc_ident(len);
 784         ident->symbols = NULL;
 785         ident->len = len;
 786         ident->tainted = 0;
 787         memcpy(ident->name, name, len);
 788         return ident;
 789 }
 790
 791 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 792 {
 793         ident->next = hash_table[hash];
 794         hash_table[hash] = ident;
 795         ident_miss++;
 796         return ident;
 797 }
 798
 799 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 800 {
 801         struct ident *ident;
 802         struct ident **p;
 803
 804         p = &hash_table[hash];
 805         while ((ident = *p) != NULL) {
 806                 if (ident->len == (unsigned char) len) {
 807                         if (strncmp(name, ident->name, len) != 0)
 808                                 goto next;
 809
 810                         ident_hit++;
 811                         return ident;
 812                 }
 813 next:
 814                 //misses++;
 815                 p = &ident->next;
 816         }
 817         ident = alloc_ident(name, len);
 818         *p = ident;
 819         ident->next = NULL;
 820         ident_miss++;
 821         idents++;
 822         return ident;
 823 }
 824
 825 static unsigned long hash_name(const char *name, int len)
 826 {
 827         unsigned long hash;
 828         const unsigned char *p = (const unsigned char *)name;
 829
 830         hash = ident_hash_init(*p++);
 831         while (--len) {
 832                 unsigned int i = *p++;
 833                 hash = ident_hash_add(hash, i);
 834         }
 835         return ident_hash_end(hash);
 836 }
 837
 838 struct ident *hash_ident(struct ident *ident)
 839 {
 840         return insert_hash(ident, hash_name(ident->name, ident->len));
 841 }
 842
 843 struct ident *built_in_ident(const char *name)
 844 {
 845         int len = strlen(name);
 846         return create_hashed_ident(name, len, hash_name(name, len));
 847 }
 848
 849 struct token *built_in_token(int stream, const char *name)
 850 {
 851         struct token *token;
 852
 853         token = __alloc_token(0);
 854         token->pos.stream = stream;
 855         token_type(token) = TOKEN_IDENT;
 856         token->ident = built_in_ident(name);
 857         return token;
 858 }
 859
 860 static int get_one_identifier(int c, stream_t *stream)
 861 {
 862         struct token *token;
 863         struct ident *ident;
 864         unsigned long hash;
 865         char buf[256];
 866         int len = 1;
 867         int next;
 868
 869         hash = ident_hash_init(c);
 870         buf[0] = c;
 871         for (;;) {
 872                 next = nextchar(stream);
 873                 if (!(cclass[next + 1] & (Letter | Digit)))
 874                         break;
 875                 if (len >= sizeof(buf))
 876                         break;
 877                 hash = ident_hash_add(hash, next);
 878                 buf[len] = next;
 879                 len++;
 880         };
 881         hash = ident_hash_end(hash);
 882
 883         ident = create_hashed_ident(buf, len, hash);
 884
 885         if (ident == &L_ident) {
 886                 if (next == '\'')
 887                         return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
 888                 if (next == '\"')
 889                         return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
 890         }
 891
 892         /* Pass it on.. */
 893         token = stream->token;
 894         token_type(token) = TOKEN_IDENT;
 895         token->ident = ident;
 896         add_token(stream);
 897         return next;
 898 }
 899
 900 static int get_one_token(int c, stream_t *stream)
 901 {
 902         long class = cclass[c + 1];
 903         if (class & Digit)
 904                 return get_one_number(c, nextchar(stream), stream);
 905         if (class & Letter)
 906                 return get_one_identifier(c, stream);
 907         return get_one_special(c, stream);
 908 }
 909
 910 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 911         unsigned char *buf, unsigned int buf_size)
 912 {
 913         struct token *begin;
 914
 915         stream->nr = idx;
 916         stream->line = 1;
 917         stream->newline = 1;
 918         stream->whitespace = 0;
 919         stream->pos = 0;
 920
 921         stream->token = NULL;
 922         stream->fd = fd;
 923         stream->offset = 0;
 924         stream->size = buf_size;
 925         stream->buffer = buf;
 926
 927         begin = alloc_token(stream);
 928         token_type(begin) = TOKEN_STREAMBEGIN;
 929         stream->tokenlist = &begin->next;
 930         return begin;
 931 }
 932
 933 static struct token *tokenize_stream(stream_t *stream)
 934 {
 935         int c = nextchar(stream);
 936         while (c != EOF) {
 937                 if (!isspace(c)) {
 938                         struct token *token = alloc_token(stream);
 939                         stream->token = token;
 940                         stream->newline = 0;
 941                         stream->whitespace = 0;
 942                         c = get_one_token(c, stream);
 943                         continue;
 944                 }
 945                 stream->whitespace = 1;
 946                 c = nextchar(stream);
 947         }
 948         return mark_eof(stream);
 949 }
 950
 951 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
 952 {
 953         stream_t stream;
 954         struct token *begin;
 955
 956         begin = setup_stream(&stream, 0, -1, buffer, size);
 957         *endtoken = tokenize_stream(&stream);
 958         return begin;
 959 }
 960
 961 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
 962 {
 963         struct token *begin, *end;
 964         stream_t stream;
 965         unsigned char buffer[BUFSIZE];
 966         int idx;
 967
 968         idx = init_stream(name, fd, next_path);
 969         if (idx < 0) {
 970                 // info(endtoken->pos, "File %s is const", name);
 971                 return endtoken;
 972         }
 973
 974         begin = setup_stream(&stream, idx, fd, buffer, 0);
 975         end = tokenize_stream(&stream);
 976         if (endtoken)
 977                 end->next = endtoken;
 978         return begin;
 979 }