tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17
  18 #include "lib.h"
  19 #include "allocate.h"
  20 #include "token.h"
  21 #include "symbol.h"
  22
  23 #define EOF (-1)
  24
  25 int input_stream_nr = 0;
  26 struct stream *input_streams;
  27 static int input_streams_allocated;
  28 unsigned int tabstop = 8;
  29
  30 #define BUFSIZE (8192)
  31
  32 typedef struct {
  33         int fd, offset, size;
  34         int pos, line, nr;
  35         int newline, whitespace;
  36         struct token **tokenlist;
  37         struct token *token;
  38         unsigned char *buffer;
  39 } stream_t;
  40
  41 const char *stream_name(int stream)
  42 {
  43         if (stream < 0 || stream > input_stream_nr)
  44                 return "<bad stream>";
  45         return input_streams[stream].name;
  46 }
  47
  48 static struct position stream_pos(stream_t *stream)
  49 {
  50         struct position pos;
  51         pos.type = 0;
  52         pos.stream = stream->nr;
  53         pos.newline = stream->newline;
  54         pos.whitespace = stream->whitespace;
  55         pos.pos = stream->pos;
  56         pos.line = stream->line;
  57         pos.noexpand = 0;
  58         return pos;
  59 }
  60
  61 const char *show_special(int val)
  62 {
  63         static char buffer[4];
  64
  65         buffer[0] = val;
  66         buffer[1] = 0;
  67         if (val >= SPECIAL_BASE)
  68                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  69         return buffer;
  70 }
  71
  72 const char *show_ident(const struct ident *ident)
  73 {
  74         static char buffer[256];
  75         if (!ident)
  76                 return "<noident>";
  77         sprintf(buffer, "%.*s", ident->len, ident->name);
  78         return buffer;
  79 }
  80
  81 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  82 {
  83         if (isprint(c)) {
  84                 if (c == escape || c == '\\')
  85                         *ptr++ = '\\';
  86                 *ptr++ = c;
  87                 return ptr;
  88         }
  89         *ptr++ = '\\';
  90         switch (c) {
  91         case '\n':
  92                 *ptr++ = 'n';
  93                 return ptr;
  94         case '\t':
  95                 *ptr++ = 't';
  96                 return ptr;
  97         }
  98         if (!isdigit(next))
  99                 return ptr + sprintf(ptr, "%o", c);
 100
 101         return ptr + sprintf(ptr, "%03o", c);
 102 }
 103
 104 const char *show_string(const struct string *string)
 105 {
 106         static char buffer[4 * MAX_STRING + 3];
 107         char *ptr;
 108         int i;
 109
 110         if (!string->length)
 111                 return "<bad_string>";
 112         ptr = buffer;
 113         *ptr++ = '"';
 114         for (i = 0; i < string->length-1; i++) {
 115                 const char *p = string->data + i;
 116                 ptr = charstr(ptr, p[0], '"', p[1]);
 117         }
 118         *ptr++ = '"';
 119         *ptr = '\0';
 120         return buffer;
 121 }
 122
 123 const char *show_token(const struct token *token)
 124 {
 125         static char buffer[256];
 126
 127         if (!token)
 128                 return "<no token>";
 129         switch (token_type(token)) {
 130         case TOKEN_ERROR:
 131                 return "syntax error";
 132
 133         case TOKEN_EOF:
 134                 return "end-of-input";
 135
 136         case TOKEN_IDENT:
 137                 return show_ident(token->ident);
 138
 139         case TOKEN_STRING:
 140                 return show_string(token->string);
 141
 142         case TOKEN_NUMBER:
 143                 return token->number;
 144
 145         case TOKEN_SPECIAL:
 146                 return show_special(token->special);
 147
 148         case TOKEN_CHAR: {
 149                 char *ptr = buffer;
 150                 int c = token->character;
 151                 *ptr++ = '\'';
 152                 ptr = charstr(ptr, c, '\'', 0);
 153                 *ptr++ = '\'';
 154                 *ptr++ = '\0';
 155                 return buffer;
 156         }
 157
 158         case TOKEN_STREAMBEGIN:
 159                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 160                 return buffer;
 161
 162         case TOKEN_STREAMEND:
 163                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 164                 return buffer;
 165
 166         case TOKEN_UNTAINT:
 167                 sprintf(buffer, "<untaint>");
 168                 return buffer;
 169
 170         case TOKEN_ARG_COUNT:
 171                 sprintf(buffer, "<argcnt>");
 172                 return buffer;
 173
 174         default:
 175                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 176                 return buffer;
 177         }
 178 }
 179
 180 int init_stream(const char *name, int fd, const char **next_path)
 181 {
 182         int stream = input_stream_nr;
 183         struct stream *current;
 184
 185         if (stream >= input_streams_allocated) {
 186                 int newalloc = stream * 4 / 3 + 10;
 187                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 188                 if (!input_streams)
 189                         die("Unable to allocate more streams space");
 190                 input_streams_allocated = newalloc;
 191         }
 192         current = input_streams + stream;
 193         memset(current, 0, sizeof(*current));
 194         current->name = name;
 195         current->fd = fd;
 196         current->next_path = next_path;
 197         current->path = NULL;
 198         current->constant = CONSTANT_FILE_MAYBE;
 199         input_stream_nr = stream+1;
 200         return stream;
 201 }
 202
 203 static struct token * alloc_token(stream_t *stream)
 204 {
 205         struct token *token = __alloc_token(0);
 206         token->pos = stream_pos(stream);
 207         return token;
 208 }
 209
 210 /*
 211  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 212  *  things a _lot_.
 213  */
 214 static int nextchar_slow(stream_t *stream)
 215 {
 216         int offset = stream->offset;
 217         int size = stream->size;
 218         int c;
 219         int spliced = 0, had_cr, had_backslash, complain;
 220
 221 restart:
 222         had_cr = had_backslash = complain = 0;
 223
 224 repeat:
 225         if (offset >= size) {
 226                 if (stream->fd < 0)
 227                         goto got_eof;
 228                 size = read(stream->fd, stream->buffer, BUFSIZE);
 229                 if (size <= 0)
 230                         goto got_eof;
 231                 stream->size = size;
 232                 stream->offset = offset = 0;
 233         }
 234
 235         c = stream->buffer[offset++];
 236
 237         if (had_cr && c != '\n')
 238                 complain = 1;
 239
 240         if (c == '\r') {
 241                 had_cr = 1;
 242                 goto repeat;
 243         }
 244
 245         stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;
 246
 247         if (c == '\n') {
 248                 stream->line++;
 249                 stream->pos = 0;
 250         }
 251
 252         if (!had_backslash) {
 253                 if (c == '\\') {
 254                         had_backslash = 1;
 255                         goto repeat;
 256                 }
 257                 if (c == '\n')
 258                         stream->newline = 1;
 259         } else {
 260                 if (c == '\n') {
 261                         if (complain)
 262                                 warning(stream_pos(stream), "non-ASCII data stream");
 263                         spliced = 1;
 264                         goto restart;
 265                 }
 266                 stream->pos--;
 267                 offset--;
 268                 c = '\\';
 269         }
 270
 271 out:
 272         stream->offset = offset;
 273         if (complain)
 274                 warning(stream_pos(stream), "non-ASCII data stream");
 275
 276         return c;
 277
 278 got_eof:
 279         if (had_backslash) {
 280                 c = '\\';
 281                 goto out;
 282         }
 283         if (stream->pos)
 284                 warning(stream_pos(stream), "no newline at end of file");
 285         else if (had_cr)
 286                 warning(stream_pos(stream), "non-ASCII data stream");
 287         else if (spliced)
 288                 warning(stream_pos(stream), "backslash-newline at end of file");
 289         return EOF;
 290 }
 291
 292 /*
 293  *  We want that as light as possible while covering all normal cases.
 294  *  Slow path (including the logics with line-splicing and EOF sanity
 295  *  checks) is in nextchar_slow().
 296  */
 297 static inline int nextchar(stream_t *stream)
 298 {
 299         int offset = stream->offset;
 300
 301         if (offset < stream->size) {
 302                 int c = stream->buffer[offset++];
 303                 static const char special[256] = {
 304                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 305                 };
 306                 if (!special[c]) {
 307                         stream->offset = offset;
 308                         stream->pos++;
 309                         return c;
 310                 }
 311         }
 312         return nextchar_slow(stream);
 313 }
 314
 315 struct token eof_token_entry;
 316
 317 static struct token *mark_eof(stream_t *stream)
 318 {
 319         struct token *end;
 320
 321         end = alloc_token(stream);
 322         token_type(end) = TOKEN_STREAMEND;
 323         end->pos.newline = 1;
 324
 325         eof_token_entry.next = &eof_token_entry;
 326         eof_token_entry.pos.newline = 1;
 327
 328         end->next =  &eof_token_entry;
 329         *stream->tokenlist = end;
 330         stream->tokenlist = NULL;
 331         return end;
 332 }
 333
 334 static void add_token(stream_t *stream)
 335 {
 336         struct token *token = stream->token;
 337
 338         stream->token = NULL;
 339         token->next = NULL;
 340         *stream->tokenlist = token;
 341         stream->tokenlist = &token->next;
 342 }
 343
 344 static void drop_token(stream_t *stream)
 345 {
 346         stream->newline |= stream->token->pos.newline;
 347         stream->whitespace |= stream->token->pos.whitespace;
 348         stream->token = NULL;
 349 }
 350
 351 enum {
 352         Letter = 1,
 353         Digit = 2,
 354         Hex = 4,
 355         Exp = 8,
 356         Dot = 16,
 357         ValidSecond = 32,
 358 };
 359
 360 static const long cclass[257] = {
 361         ['0' + 1 ... '9' + 1] = Digit | Hex,
 362         ['A' + 1 ... 'D' + 1] = Letter | Hex,
 363         ['E' + 1] = Letter | Hex | Exp,
 364         ['F' + 1] = Letter | Hex,
 365         ['G' + 1 ... 'O' + 1] = Letter,
 366         ['P' + 1] = Letter | Exp,
 367         ['Q' + 1 ... 'Z' + 1] = Letter,
 368         ['a' + 1 ... 'd' + 1] = Letter | Hex,
 369         ['e' + 1] = Letter | Hex | Exp,
 370         ['f' + 1] = Letter | Hex,
 371         ['g' + 1 ... 'o' + 1] = Letter,
 372         ['p' + 1] = Letter | Exp,
 373         ['q' + 1 ... 'z' + 1] = Letter,
 374         ['_' + 1] = Letter,
 375         ['.' + 1] = Dot | ValidSecond,
 376         ['=' + 1] = ValidSecond,
 377         ['+' + 1] = ValidSecond,
 378         ['-' + 1] = ValidSecond,
 379         ['>' + 1] = ValidSecond,
 380         ['<' + 1] = ValidSecond,
 381         ['&' + 1] = ValidSecond,
 382         ['|' + 1] = ValidSecond,
 383         ['#' + 1] = ValidSecond,
 384 };
 385
 386 /*
 387  * pp-number:
 388  *      digit
 389  *      . digit
 390  *      pp-number digit
 391  *      pp-number identifier-nodigit
 392  *      pp-number e sign
 393  *      pp-number E sign
 394  *      pp-number p sign
 395  *      pp-number P sign
 396  *      pp-number .
 397  */
 398 static int get_one_number(int c, int next, stream_t *stream)
 399 {
 400         struct token *token;
 401         static char buffer[4095];
 402         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 403         int len;
 404
 405         *p++ = c;
 406         for (;;) {
 407                 long class =  cclass[next + 1];
 408                 if (!(class & (Dot | Digit | Letter)))
 409                         break;
 410                 if (p != buffer_end)
 411                         *p++ = next;
 412                 next = nextchar(stream);
 413                 if (class & Exp) {
 414                         if (next == '-' || next == '+') {
 415                                 if (p != buffer_end)
 416                                         *p++ = next;
 417                                 next = nextchar(stream);
 418                         }
 419                 }
 420         }
 421
 422         if (p == buffer_end) {
 423                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 424                       buffer_end - buffer);
 425                 // Pretend we saw just "1".
 426                 buffer[0] = '1';
 427                 p = buffer + 1;
 428         }
 429
 430         *p++ = 0;
 431         len = p - buffer;
 432         buf = __alloc_bytes(len);
 433         memcpy(buf, buffer, len);
 434
 435         token = stream->token;
 436         token_type(token) = TOKEN_NUMBER;
 437         token->number = buf;
 438         add_token(stream);
 439
 440         return next;
 441 }
 442
 443 static int escapechar(int first, int type, stream_t *stream, int *valp)
 444 {
 445         int next, value;
 446
 447         next = nextchar(stream);
 448         value = first;
 449
 450         if (first == '\n')
 451                 warning(stream_pos(stream), "Newline in string or character constant");
 452
 453         if (first == '\\' && next != EOF) {
 454                 value = next;
 455                 next = nextchar(stream);
 456                 if (value != type) {
 457                         switch (value) {
 458                         case 'a':
 459                                 value = '\a';
 460                                 break;
 461                         case 'b':
 462                                 value = '\b';
 463                                 break;
 464                         case 't':
 465                                 value = '\t';
 466                                 break;
 467                         case 'n':
 468                                 value = '\n';
 469                                 break;
 470                         case 'v':
 471                                 value = '\v';
 472                                 break;
 473                         case 'f':
 474                                 value = '\f';
 475                                 break;
 476                         case 'r':
 477                                 value = '\r';
 478                                 break;
 479                         case 'e':
 480                                 value = '\e';
 481                                 break;
 482                         case '\\':
 483                                 break;
 484                         case '?':
 485                                 break;
 486                         case '\'':
 487                                 break;
 488                         case '"':
 489                                 break;
 490                         case '\n':
 491                                 warning(stream_pos(stream), "Newline in string or character constant");
 492                                 break;
 493                         case '0'...'7': {
 494                                 int nr = 2;
 495                                 value -= '0';
 496                                 while (next >= '0' && next <= '9') {
 497                                         value = (value << 3) + (next-'0');
 498                                         next = nextchar(stream);
 499                                         if (!--nr)
 500                                                 break;
 501                                 }
 502                                 value &= 0xff;
 503                                 break;
 504                         }
 505                         case 'x': {
 506                                 int hex = hexval(next);
 507                                 if (hex < 16) {
 508                                         value = hex;
 509                                         next = nextchar(stream);
 510                                         while ((hex = hexval(next)) < 16) {
 511                                                 value = (value << 4) + hex;
 512                                                 next = nextchar(stream);
 513                                         }
 514                                         value &= 0xff;
 515                                         break;
 516                                 }
 517                         }
 518                         /* Fall through */
 519                         default:
 520                                 warning(stream_pos(stream), "Unknown escape '%c'", value);
 521                         }
 522                 }
 523                 /* Mark it as escaped */
 524                 value |= 0x100;
 525         }
 526         *valp = value;
 527         return next;
 528 }
 529
 530 static int get_char_token(int next, stream_t *stream)
 531 {
 532         int value;
 533         struct token *token;
 534
 535         next = escapechar(next, '\'', stream, &value);
 536         if (value == '\'' || next != '\'') {
 537                 sparse_error(stream_pos(stream), "Bad character constant");
 538                 drop_token(stream);
 539                 return next;
 540         }
 541
 542         token = stream->token;
 543         token_type(token) = TOKEN_CHAR;
 544         token->character = value & 0xff;
 545
 546         add_token(stream);
 547         return nextchar(stream);
 548 }
 549
 550 static int get_string_token(int next, stream_t *stream)
 551 {
 552         static char buffer[MAX_STRING];
 553         struct string *string;
 554         struct token *token;
 555         int len = 0;
 556
 557         for (;;) {
 558                 int val;
 559                 next = escapechar(next, '"', stream, &val);
 560                 if (val == '"')
 561                         break;
 562                 if (next == EOF) {
 563                         warning(stream_pos(stream), "End of file in middle of string");
 564                         return next;
 565                 }
 566                 if (len < MAX_STRING)
 567                         buffer[len] = val;
 568                 len++;
 569         }
 570
 571         if (len > MAX_STRING) {
 572                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 573                 len = MAX_STRING;
 574         }
 575
 576         string = __alloc_string(len+1);
 577         memcpy(string->data, buffer, len);
 578         string->data[len] = '\0';
 579         string->length = len+1;
 580
 581         /* Pass it on.. */
 582         token = stream->token;
 583         token_type(token) = TOKEN_STRING;
 584         token->string = string;
 585         add_token(stream);
 586
 587         return next;
 588 }
 589
 590 static int drop_stream_eoln(stream_t *stream)
 591 {
 592         drop_token(stream);
 593         for (;;) {
 594                 switch (nextchar(stream)) {
 595                 case EOF:
 596                         return EOF;
 597                 case '\n':
 598                         return nextchar(stream);
 599                 }
 600         }
 601 }
 602
 603 static int drop_stream_comment(stream_t *stream)
 604 {
 605         int newline;
 606         int next;
 607         drop_token(stream);
 608         newline = stream->newline;
 609
 610         next = nextchar(stream);
 611         for (;;) {
 612                 int curr = next;
 613                 if (curr == EOF) {
 614                         warning(stream_pos(stream), "End of file in the middle of a comment");
 615                         return curr;
 616                 }
 617                 next = nextchar(stream);
 618                 if (curr == '*' && next == '/')
 619                         break;
 620         }
 621         stream->newline = newline;
 622         return nextchar(stream);
 623 }
 624
/*
 * Spellings of the multi-character punctuators, at most three characters
 * plus NUL each.  show_special() indexes this with (value - SPECIAL_BASE).
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* NOTE(review): presumably the number of entries in combinations[] - confirm against the header that defines SPECIAL_ARG_SEPARATOR. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 628
 629 /* hash function for two-character punctuators - all give unique values */
 630 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 631
 632 /*
 633  * note that we won't get false positives - special_hash(0,0) is 0 and
 634  * entry 0 is filled (by +=), so all the missing ones are OK.
 635  */
 636 static unsigned char hash_results[32][2] = {
 637 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
 638         RES('+', '='), /* 00 */
 639         RES('/', '='), /* 01 */
 640         RES('^', '='), /* 05 */
 641         RES('&', '&'), /* 07 */
 642         RES('#', '#'), /* 08 */
 643         RES('<', '<'), /* 0a */
 644         RES('<', '='), /* 0c */
 645         RES('!', '='), /* 0e */
 646         RES('%', '='), /* 0f */
 647         RES('-', '-'), /* 10 */
 648         RES('-', '='), /* 11 */
 649         RES('-', '>'), /* 13 */
 650         RES('=', '='), /* 15 */
 651         RES('&', '='), /* 17 */
 652         RES('*', '='), /* 18 */
 653         RES('.', '.'), /* 1a */
 654         RES('+', '+'), /* 1b */
 655         RES('|', '='), /* 1c */
 656         RES('>', '='), /* 1d */
 657         RES('|', '|'), /* 1e */
 658         RES('>', '>')  /* 1f */
 659 #undef RES
 660 };
 661 static int code[32] = {
 662 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
 663         CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
 664         CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
 665         CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
 666         CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
 667         CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
 668         CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
 669         CODE('<', '=', SPECIAL_LTE), /* 0c */
 670         CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
 671         CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
 672         CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
 673         CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
 674         CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
 675         CODE('=', '=', SPECIAL_EQUAL), /* 15 */
 676         CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
 677         CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
 678         CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
 679         CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
 680         CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
 681         CODE('>', '=', SPECIAL_GTE), /* 1d */
 682         CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
 683         CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
 684 #undef CODE
 685 };
 686
 687 static int get_one_special(int c, stream_t *stream)
 688 {
 689         struct token *token;
 690         int next, value, i;
 691
 692         next = nextchar(stream);
 693
 694         /*
 695          * Check for numbers, strings, character constants, and comments
 696          */
 697         switch (c) {
 698         case '.':
 699                 if (next >= '0' && next <= '9')
 700                         return get_one_number(c, next, stream);
 701                 break;
 702         case '"':
 703                 return get_string_token(next, stream);
 704         case '\'':
 705                 return get_char_token(next, stream);
 706         case '/':
 707                 if (next == '/')
 708                         return drop_stream_eoln(stream);
 709                 if (next == '*')
 710                         return drop_stream_comment(stream);
 711         }
 712
 713         /*
 714          * Check for combinations
 715          */
 716         value = c;
 717         if (cclass[next + 1] & ValidSecond) {
 718                 i = special_hash(c, next);
 719                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 720                         value = code[i];
 721                         next = nextchar(stream);
 722                         if (value >= SPECIAL_LEFTSHIFT &&
 723                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 724                                 value += 3;
 725                                 next = nextchar(stream);
 726                         }
 727                 }
 728         }
 729
 730         /* Pass it on.. */
 731         token = stream->token;
 732         token_type(token) = TOKEN_SPECIAL;
 733         token->special = value;
 734         add_token(stream);
 735         return next;
 736 }
 737
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed with the first character, fold in
 * each further character as hash*11 + c, then ident_hash_end() adds the
 * bits above IDENT_HASH_BITS back in and masks the sum down to a bucket
 * index.
 */
#define ident_hash_init(c)              (c)
#define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
#define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets of identifiers, chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/*
 * Counters: ident_hit and ident_miss are printed by
 * show_identifier_stats(); idents counts identifiers created by
 * create_hashed_ident().
 */
static int ident_hit, ident_miss, idents;
 748
 749 void show_identifier_stats(void)
 750 {
 751         int i;
 752         int distribution[100];
 753
 754         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 755                 ident_hit, ident_miss);
 756
 757         for (i = 0; i < 100; i++)
 758                 distribution[i] = 0;
 759
 760         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 761                 struct ident * ident = hash_table[i];
 762                 int count = 0;
 763
 764                 while (ident) {
 765                         count++;
 766                         ident = ident->next;
 767                 }
 768                 if (count > 99)
 769                         count = 99;
 770                 distribution[count]++;
 771         }
 772
 773         for (i = 0; i < 100; i++) {
 774                 if (distribution[i])
 775                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 776         }
 777 }
 778
 779 static struct ident *alloc_ident(const char *name, int len)
 780 {
 781         struct ident *ident = __alloc_ident(len);
 782         ident->symbols = NULL;
 783         ident->len = len;
 784         ident->tainted = 0;
 785         memcpy(ident->name, name, len);
 786         return ident;
 787 }
 788
 789 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 790 {
 791         ident->next = hash_table[hash];
 792         hash_table[hash] = ident;
 793         ident_miss++;
 794         return ident;
 795 }
 796
 797 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 798 {
 799         struct ident *ident;
 800         struct ident **p;
 801
 802         p = &hash_table[hash];
 803         while ((ident = *p) != NULL) {
 804                 if (ident->len == (unsigned char) len) {
 805                         if (strncmp(name, ident->name, len) != 0)
 806                                 goto next;
 807
 808                         ident_hit++;
 809                         return ident;
 810                 }
 811 next:
 812                 //misses++;
 813                 p = &ident->next;
 814         }
 815         ident = alloc_ident(name, len);
 816         *p = ident;
 817         ident->next = NULL;
 818         ident_miss++;
 819         idents++;
 820         return ident;
 821 }
 822
 823 static unsigned long hash_name(const char *name, int len)
 824 {
 825         unsigned long hash;
 826         const unsigned char *p = (const unsigned char *)name;
 827
 828         hash = ident_hash_init(*p++);
 829         while (--len) {
 830                 unsigned int i = *p++;
 831                 hash = ident_hash_add(hash, i);
 832         }
 833         return ident_hash_end(hash);
 834 }
 835
 836 struct ident *hash_ident(struct ident *ident)
 837 {
 838         return insert_hash(ident, hash_name(ident->name, ident->len));
 839 }
 840
 841 struct ident *built_in_ident(const char *name)
 842 {
 843         int len = strlen(name);
 844         return create_hashed_ident(name, len, hash_name(name, len));
 845 }
 846
 847 struct token *built_in_token(int stream, const char *name)
 848 {
 849         struct token *token;
 850
 851         token = __alloc_token(0);
 852         token->pos.stream = stream;
 853         token_type(token) = TOKEN_IDENT;
 854         token->ident = built_in_ident(name);
 855         return token;
 856 }
 857
 858 static int get_one_identifier(int c, stream_t *stream)
 859 {
 860         struct token *token;
 861         struct ident *ident;
 862         unsigned long hash;
 863         char buf[256];
 864         int len = 1;
 865         int next;
 866
 867         hash = ident_hash_init(c);
 868         buf[0] = c;
 869         for (;;) {
 870                 next = nextchar(stream);
 871                 if (!(cclass[next + 1] & (Letter | Digit)))
 872                         break;
 873                 if (len >= sizeof(buf))
 874                         break;
 875                 hash = ident_hash_add(hash, next);
 876                 buf[len] = next;
 877                 len++;
 878         };
 879         hash = ident_hash_end(hash);
 880
 881         ident = create_hashed_ident(buf, len, hash);
 882
 883         /* Pass it on.. */
 884         token = stream->token;
 885         token_type(token) = TOKEN_IDENT;
 886         token->ident = ident;
 887         add_token(stream);
 888         return next;
 889 }
 890
 891 static int get_one_token(int c, stream_t *stream)
 892 {
 893         long class = cclass[c + 1];
 894         if (class & Digit)
 895                 return get_one_number(c, nextchar(stream), stream);
 896         if (class & Letter)
 897                 return get_one_identifier(c, stream);
 898         return get_one_special(c, stream);
 899 }
 900
 901 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 902         unsigned char *buf, unsigned int buf_size)
 903 {
 904         struct token *begin;
 905
 906         stream->nr = idx;
 907         stream->line = 1;
 908         stream->newline = 1;
 909         stream->whitespace = 0;
 910         stream->pos = 0;
 911
 912         stream->token = NULL;
 913         stream->fd = fd;
 914         stream->offset = 0;
 915         stream->size = buf_size;
 916         stream->buffer = buf;
 917
 918         begin = alloc_token(stream);
 919         token_type(begin) = TOKEN_STREAMBEGIN;
 920         stream->tokenlist = &begin->next;
 921         return begin;
 922 }
 923
 924 static struct token *tokenize_stream(stream_t *stream)
 925 {
 926         int c = nextchar(stream);
 927         while (c != EOF) {
 928                 if (!isspace(c)) {
 929                         struct token *token = alloc_token(stream);
 930                         stream->token = token;
 931                         stream->newline = 0;
 932                         stream->whitespace = 0;
 933                         c = get_one_token(c, stream);
 934                         continue;
 935                 }
 936                 stream->whitespace = 1;
 937                 c = nextchar(stream);
 938         }
 939         return mark_eof(stream);
 940 }
 941
 942 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
 943 {
 944         stream_t stream;
 945         struct token *begin;
 946
 947         begin = setup_stream(&stream, 0, -1, buffer, size);
 948         *endtoken = tokenize_stream(&stream);
 949         return begin;
 950 }
 951
 952 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
 953 {
 954         struct token *begin, *end;
 955         stream_t stream;
 956         unsigned char buffer[BUFSIZE];
 957         int idx;
 958
 959         idx = init_stream(name, fd, next_path);
 960         if (idx < 0) {
 961                 // info(endtoken->pos, "File %s is const", name);
 962                 return endtoken;
 963         }
 964
 965         begin = setup_stream(&stream, idx, fd, buffer, 0);
 966         end = tokenize_stream(&stream);
 967         if (endtoken)
 968                 end->next = endtoken;
 969         return begin;
 970 }