tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17
  18 #include "lib.h"
  19 #include "allocate.h"
  20 #include "token.h"
  21 #include "symbol.h"
  22
  23 #define EOF (-1)
  24
  25 int input_stream_nr = 0;
  26 struct stream *input_streams;
  27 static int input_streams_allocated;
  28 unsigned int tabstop = 8;
  29
  30 #define BUFSIZE (8192)
  31
  32 typedef struct {
  33         int fd, offset, size;
  34         int pos, line, nr;
  35         int newline, whitespace;
  36         struct token **tokenlist;
  37         struct token *token;
  38         unsigned char *buffer;
  39 } stream_t;
  40
  41 const char *stream_name(int stream)
  42 {
  43         if (stream < 0 || stream > input_stream_nr)
  44                 return "<bad stream>";
  45         return input_streams[stream].name;
  46 }
  47
  48 static struct position stream_pos(stream_t *stream)
  49 {
  50         struct position pos;
  51         pos.type = 0;
  52         pos.stream = stream->nr;
  53         pos.newline = stream->newline;
  54         pos.whitespace = stream->whitespace;
  55         pos.pos = stream->pos;
  56         pos.line = stream->line;
  57         pos.noexpand = 0;
  58         return pos;
  59 }
  60
  61 const char *show_special(int val)
  62 {
  63         static char buffer[4];
  64
  65         buffer[0] = val;
  66         buffer[1] = 0;
  67         if (val >= SPECIAL_BASE)
  68                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  69         return buffer;
  70 }
  71
  72 const char *show_ident(const struct ident *ident)
  73 {
  74         static char buffer[256];
  75         if (!ident)
  76                 return "<noident>";
  77         sprintf(buffer, "%.*s", ident->len, ident->name);
  78         return buffer;
  79 }
  80
  81 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  82 {
  83         if (isprint(c)) {
  84                 if (c == escape || c == '\\')
  85                         *ptr++ = '\\';
  86                 *ptr++ = c;
  87                 return ptr;
  88         }
  89         *ptr++ = '\\';
  90         switch (c) {
  91         case '\n':
  92                 *ptr++ = 'n';
  93                 return ptr;
  94         case '\t':
  95                 *ptr++ = 't';
  96                 return ptr;
  97         }
  98         if (!isdigit(next))
  99                 return ptr + sprintf(ptr, "%o", c);
 100
 101         return ptr + sprintf(ptr, "%03o", c);
 102 }
 103
 104 const char *show_string(const struct string *string)
 105 {
 106         static char buffer[4 * MAX_STRING + 3];
 107         char *ptr;
 108         int i;
 109
 110         if (!string->length)
 111                 return "<bad_string>";
 112         ptr = buffer;
 113         *ptr++ = '"';
 114         for (i = 0; i < string->length-1; i++) {
 115                 const char *p = string->data + i;
 116                 ptr = charstr(ptr, p[0], '"', p[1]);
 117         }
 118         *ptr++ = '"';
 119         *ptr = '\0';
 120         return buffer;
 121 }
 122
 123 const char *show_token(const struct token *token)
 124 {
 125         static char buffer[256];
 126
 127         if (!token)
 128                 return "<no token>";
 129         switch (token_type(token)) {
 130         case TOKEN_ERROR:
 131                 return "syntax error";
 132
 133         case TOKEN_EOF:
 134                 return "end-of-input";
 135
 136         case TOKEN_IDENT:
 137                 return show_ident(token->ident);
 138
 139         case TOKEN_STRING:
 140                 return show_string(token->string);
 141
 142         case TOKEN_NUMBER:
 143                 return token->number;
 144
 145         case TOKEN_SPECIAL:
 146                 return show_special(token->special);
 147
 148         case TOKEN_CHAR: {
 149                 char *ptr = buffer;
 150                 int c = token->character;
 151                 *ptr++ = '\'';
 152                 ptr = charstr(ptr, c, '\'', 0);
 153                 *ptr++ = '\'';
 154                 *ptr++ = '\0';
 155                 return buffer;
 156         }
 157
 158         case TOKEN_STREAMBEGIN:
 159                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 160                 return buffer;
 161
 162         case TOKEN_STREAMEND:
 163                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 164                 return buffer;
 165
 166         default:
 167                 return "WTF???";
 168         }
 169 }
 170
 171 int init_stream(const char *name, int fd, const char **next_path)
 172 {
 173         int stream = input_stream_nr;
 174         struct stream *current;
 175
 176         if (stream >= input_streams_allocated) {
 177                 int newalloc = stream * 4 / 3 + 10;
 178                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 179                 if (!input_streams)
 180                         die("Unable to allocate more streams space");
 181                 input_streams_allocated = newalloc;
 182         }
 183         current = input_streams + stream;
 184         memset(current, 0, sizeof(*current));
 185         current->name = name;
 186         current->fd = fd;
 187         current->next_path = next_path;
 188         current->path = NULL;
 189         current->constant = CONSTANT_FILE_MAYBE;
 190         input_stream_nr = stream+1;
 191         return stream;
 192 }
 193
 194 static struct token * alloc_token(stream_t *stream)
 195 {
 196         struct token *token = __alloc_token(0);
 197         token->pos = stream_pos(stream);
 198         return token;
 199 }
 200
 201 /*
 202  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 203  *  things a _lot_.
 204  */
 205 static int nextchar_slow(stream_t *stream)
 206 {
 207         int offset = stream->offset;
 208         int size = stream->size;
 209         int c;
 210         int spliced = 0, had_cr, had_backslash, complain;
 211
 212 restart:
 213         had_cr = had_backslash = complain = 0;
 214
 215 repeat:
 216         if (offset >= size) {
 217                 if (stream->fd < 0)
 218                         goto got_eof;
 219                 size = read(stream->fd, stream->buffer, BUFSIZE);
 220                 if (size <= 0)
 221                         goto got_eof;
 222                 stream->size = size;
 223                 stream->offset = offset = 0;
 224         }
 225
 226         c = stream->buffer[offset++];
 227
 228         if (had_cr && c != '\n')
 229                 complain = 1;
 230
 231         if (c == '\r') {
 232                 had_cr = 1;
 233                 goto repeat;
 234         }
 235
 236         stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;
 237
 238         if (c == '\n') {
 239                 stream->line++;
 240                 stream->pos = 0;
 241         }
 242
 243         if (!had_backslash) {
 244                 if (c == '\\') {
 245                         had_backslash = 1;
 246                         goto repeat;
 247                 }
 248                 if (c == '\n')
 249                         stream->newline = 1;
 250         } else {
 251                 if (c == '\n') {
 252                         if (complain)
 253                                 warning(stream_pos(stream), "non-ASCII data stream");
 254                         spliced = 1;
 255                         goto restart;
 256                 }
 257                 stream->pos--;
 258                 offset--;
 259                 c = '\\';
 260         }
 261
 262 out:
 263         stream->offset = offset;
 264         if (complain)
 265                 warning(stream_pos(stream), "non-ASCII data stream");
 266
 267         return c;
 268
 269 got_eof:
 270         if (had_backslash) {
 271                 c = '\\';
 272                 goto out;
 273         }
 274         if (stream->pos)
 275                 warning(stream_pos(stream), "no newline at end of file");
 276         else if (had_cr)
 277                 warning(stream_pos(stream), "non-ASCII data stream");
 278         else if (spliced)
 279                 warning(stream_pos(stream), "backslash-newline at end of file");
 280         return EOF;
 281 }
 282
 283 /*
 284  *  We want that as light as possible while covering all normal cases.
 285  *  Slow path (including the logics with line-splicing and EOF sanity
 286  *  checks) is in nextchar_slow().
 287  */
 288 static inline int nextchar(stream_t *stream)
 289 {
 290         int offset = stream->offset;
 291
 292         if (offset < stream->size) {
 293                 int c = stream->buffer[offset++];
 294                 static const char special[256] = {
 295                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 296                 };
 297                 if (!special[c]) {
 298                         stream->offset = offset;
 299                         stream->pos++;
 300                         return c;
 301                 }
 302         }
 303         return nextchar_slow(stream);
 304 }
 305
 306 struct token eof_token_entry;
 307
 308 static struct token *mark_eof(stream_t *stream)
 309 {
 310         struct token *end;
 311
 312         end = alloc_token(stream);
 313         token_type(end) = TOKEN_STREAMEND;
 314         end->pos.newline = 1;
 315
 316         eof_token_entry.next = &eof_token_entry;
 317         eof_token_entry.pos.newline = 1;
 318
 319         end->next =  &eof_token_entry;
 320         *stream->tokenlist = end;
 321         stream->tokenlist = NULL;
 322         return end;
 323 }
 324
 325 static void add_token(stream_t *stream)
 326 {
 327         struct token *token = stream->token;
 328
 329         stream->token = NULL;
 330         token->next = NULL;
 331         *stream->tokenlist = token;
 332         stream->tokenlist = &token->next;
 333 }
 334
 335 static void drop_token(stream_t *stream)
 336 {
 337         stream->newline |= stream->token->pos.newline;
 338         stream->whitespace |= stream->token->pos.whitespace;
 339         stream->token = NULL;
 340 }
 341
 342 enum {
 343         Letter = 1,
 344         Digit = 2,
 345         Hex = 4,
 346         Exp = 8,
 347         Dot = 16,
 348         ValidSecond = 32,
 349 };
 350
 351 static const long cclass[257] = {
 352         ['0' + 1 ... '9' + 1] = Digit | Hex,
 353         ['A' + 1 ... 'D' + 1] = Letter | Hex,
 354         ['E' + 1] = Letter | Hex | Exp,
 355         ['F' + 1] = Letter | Hex,
 356         ['G' + 1 ... 'O' + 1] = Letter,
 357         ['P' + 1] = Letter | Exp,
 358         ['Q' + 1 ... 'Z' + 1] = Letter,
 359         ['a' + 1 ... 'd' + 1] = Letter | Hex,
 360         ['e' + 1] = Letter | Hex | Exp,
 361         ['f' + 1] = Letter | Hex,
 362         ['g' + 1 ... 'o' + 1] = Letter,
 363         ['p' + 1] = Letter | Exp,
 364         ['q' + 1 ... 'z' + 1] = Letter,
 365         ['_' + 1] = Letter,
 366         ['.' + 1] = Dot | ValidSecond,
 367         ['=' + 1] = ValidSecond,
 368         ['+' + 1] = ValidSecond,
 369         ['-' + 1] = ValidSecond,
 370         ['>' + 1] = ValidSecond,
 371         ['<' + 1] = ValidSecond,
 372         ['&' + 1] = ValidSecond,
 373         ['|' + 1] = ValidSecond,
 374         ['#' + 1] = ValidSecond,
 375 };
 376
 377 /*
 378  * pp-number:
 379  *      digit
 380  *      . digit
 381  *      pp-number digit
 382  *      pp-number identifier-nodigit
 383  *      pp-number e sign
 384  *      pp-number E sign
 385  *      pp-number p sign
 386  *      pp-number P sign
 387  *      pp-number .
 388  */
 389 static int get_one_number(int c, int next, stream_t *stream)
 390 {
 391         struct token *token;
 392         static char buffer[4095];
 393         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 394         int len;
 395
 396         *p++ = c;
 397         for (;;) {
 398                 long class =  cclass[next + 1];
 399                 if (!(class & (Dot | Digit | Letter)))
 400                         break;
 401                 if (p != buffer_end)
 402                         *p++ = next;
 403                 next = nextchar(stream);
 404                 if (class & Exp) {
 405                         if (next == '-' || next == '+') {
 406                                 if (p != buffer_end)
 407                                         *p++ = next;
 408                                 next = nextchar(stream);
 409                         }
 410                 }
 411         }
 412
 413         if (p == buffer_end) {
 414                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 415                       buffer_end - buffer);
 416                 // Pretend we saw just "1".
 417                 buffer[0] = '1';
 418                 p = buffer + 1;
 419         }
 420
 421         *p++ = 0;
 422         len = p - buffer;
 423         buf = __alloc_bytes(len);
 424         memcpy(buf, buffer, len);
 425
 426         token = stream->token;
 427         token_type(token) = TOKEN_NUMBER;
 428         token->number = buf;
 429         add_token(stream);
 430
 431         return next;
 432 }
 433
 434 static int escapechar(int first, int type, stream_t *stream, int *valp)
 435 {
 436         int next, value;
 437
 438         next = nextchar(stream);
 439         value = first;
 440
 441         if (first == '\n')
 442                 warning(stream_pos(stream), "Newline in string or character constant");
 443
 444         if (first == '\\' && next != EOF) {
 445                 value = next;
 446                 next = nextchar(stream);
 447                 if (value != type) {
 448                         switch (value) {
 449                         case 'a':
 450                                 value = '\a';
 451                                 break;
 452                         case 'b':
 453                                 value = '\b';
 454                                 break;
 455                         case 't':
 456                                 value = '\t';
 457                                 break;
 458                         case 'n':
 459                                 value = '\n';
 460                                 break;
 461                         case 'v':
 462                                 value = '\v';
 463                                 break;
 464                         case 'f':
 465                                 value = '\f';
 466                                 break;
 467                         case 'r':
 468                                 value = '\r';
 469                                 break;
 470                         case 'e':
 471                                 value = '\e';
 472                                 break;
 473                         case '\\':
 474                                 break;
 475                         case '?':
 476                                 break;
 477                         case '\'':
 478                                 break;
 479                         case '"':
 480                                 break;
 481                         case '\n':
 482                                 warning(stream_pos(stream), "Newline in string or character constant");
 483                                 break;
 484                         case '0'...'7': {
 485                                 int nr = 2;
 486                                 value -= '0';
 487                                 while (next >= '0' && next <= '9') {
 488                                         value = (value << 3) + (next-'0');
 489                                         next = nextchar(stream);
 490                                         if (!--nr)
 491                                                 break;
 492                                 }
 493                                 value &= 0xff;
 494                                 break;
 495                         }
 496                         case 'x': {
 497                                 int hex = hexval(next);
 498                                 if (hex < 16) {
 499                                         value = hex;
 500                                         next = nextchar(stream);
 501                                         while ((hex = hexval(next)) < 16) {
 502                                                 value = (value << 4) + hex;
 503                                                 next = nextchar(stream);
 504                                         }
 505                                         value &= 0xff;
 506                                         break;
 507                                 }
 508                         }
 509                         /* Fall through */
 510                         default:
 511                                 warning(stream_pos(stream), "Unknown escape '%c'", value);
 512                         }
 513                 }
 514                 /* Mark it as escaped */
 515                 value |= 0x100;
 516         }
 517         *valp = value;
 518         return next;
 519 }
 520
 521 static int get_char_token(int next, stream_t *stream)
 522 {
 523         int value;
 524         struct token *token;
 525
 526         next = escapechar(next, '\'', stream, &value);
 527         if (value == '\'' || next != '\'') {
 528                 sparse_error(stream_pos(stream), "Bad character constant");
 529                 drop_token(stream);
 530                 return next;
 531         }
 532
 533         token = stream->token;
 534         token_type(token) = TOKEN_CHAR;
 535         token->character = value & 0xff;
 536
 537         add_token(stream);
 538         return nextchar(stream);
 539 }
 540
 541 static int get_string_token(int next, stream_t *stream)
 542 {
 543         static char buffer[MAX_STRING];
 544         struct string *string;
 545         struct token *token;
 546         int len = 0;
 547
 548         for (;;) {
 549                 int val;
 550                 next = escapechar(next, '"', stream, &val);
 551                 if (val == '"')
 552                         break;
 553                 if (next == EOF) {
 554                         warning(stream_pos(stream), "End of file in middle of string");
 555                         return next;
 556                 }
 557                 if (len < MAX_STRING)
 558                         buffer[len] = val;
 559                 len++;
 560         }
 561
 562         if (len > MAX_STRING) {
 563                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 564                 len = MAX_STRING;
 565         }
 566
 567         string = __alloc_string(len+1);
 568         memcpy(string->data, buffer, len);
 569         string->data[len] = '\0';
 570         string->length = len+1;
 571
 572         /* Pass it on.. */
 573         token = stream->token;
 574         token_type(token) = TOKEN_STRING;
 575         token->string = string;
 576         add_token(stream);
 577
 578         return next;
 579 }
 580
 581 static int drop_stream_eoln(stream_t *stream)
 582 {
 583         drop_token(stream);
 584         for (;;) {
 585                 switch (nextchar(stream)) {
 586                 case EOF:
 587                         return EOF;
 588                 case '\n':
 589                         return nextchar(stream);
 590                 }
 591         }
 592 }
 593
 594 static int drop_stream_comment(stream_t *stream)
 595 {
 596         int newline;
 597         int next;
 598         drop_token(stream);
 599         newline = stream->newline;
 600
 601         next = nextchar(stream);
 602         for (;;) {
 603                 int curr = next;
 604                 if (curr == EOF) {
 605                         warning(stream_pos(stream), "End of file in the middle of a comment");
 606                         return curr;
 607                 }
 608                 next = nextchar(stream);
 609                 if (curr == '*' && next == '/')
 610                         break;
 611         }
 612         stream->newline = newline;
 613         return nextchar(stream);
 614 }
 615
 616 unsigned char combinations[][4] = COMBINATION_STRINGS;
 617
 618 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 619
 620 /* hash function for two-character punctuators - all give unique values */
 621 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 622
 623 /*
 624  * note that we won't get false positives - special_hash(0,0) is 0 and
 625  * entry 0 is filled (by +=), so all the missing ones are OK.
 626  */
 627 static unsigned char hash_results[32][2] = {
 628 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
 629         RES('+', '='), /* 00 */
 630         RES('/', '='), /* 01 */
 631         RES('^', '='), /* 05 */
 632         RES('&', '&'), /* 07 */
 633         RES('#', '#'), /* 08 */
 634         RES('<', '<'), /* 0a */
 635         RES('<', '='), /* 0c */
 636         RES('!', '='), /* 0e */
 637         RES('%', '='), /* 0f */
 638         RES('-', '-'), /* 10 */
 639         RES('-', '='), /* 11 */
 640         RES('-', '>'), /* 13 */
 641         RES('=', '='), /* 15 */
 642         RES('&', '='), /* 17 */
 643         RES('*', '='), /* 18 */
 644         RES('.', '.'), /* 1a */
 645         RES('+', '+'), /* 1b */
 646         RES('|', '='), /* 1c */
 647         RES('>', '='), /* 1d */
 648         RES('|', '|'), /* 1e */
 649         RES('>', '>')  /* 1f */
 650 #undef RES
 651 };
 652 static int code[32] = {
 653 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
 654         CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
 655         CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
 656         CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
 657         CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
 658         CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
 659         CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
 660         CODE('<', '=', SPECIAL_LTE), /* 0c */
 661         CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
 662         CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
 663         CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
 664         CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
 665         CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
 666         CODE('=', '=', SPECIAL_EQUAL), /* 15 */
 667         CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
 668         CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
 669         CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
 670         CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
 671         CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
 672         CODE('>', '=', SPECIAL_GTE), /* 1d */
 673         CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
 674         CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
 675 #undef CODE
 676 };
 677
 678 static int get_one_special(int c, stream_t *stream)
 679 {
 680         struct token *token;
 681         int next, value, i;
 682
 683         next = nextchar(stream);
 684
 685         /*
 686          * Check for numbers, strings, character constants, and comments
 687          */
 688         switch (c) {
 689         case '.':
 690                 if (next >= '0' && next <= '9')
 691                         return get_one_number(c, next, stream);
 692                 break;
 693         case '"':
 694                 return get_string_token(next, stream);
 695         case '\'':
 696                 return get_char_token(next, stream);
 697         case '/':
 698                 if (next == '/')
 699                         return drop_stream_eoln(stream);
 700                 if (next == '*')
 701                         return drop_stream_comment(stream);
 702         }
 703
 704         /*
 705          * Check for combinations
 706          */
 707         value = c;
 708         if (cclass[next + 1] & ValidSecond) {
 709                 i = special_hash(c, next);
 710                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 711                         value = code[i];
 712                         next = nextchar(stream);
 713                         if (value >= SPECIAL_LEFTSHIFT &&
 714                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 715                                 value += 3;
 716                                 next = nextchar(stream);
 717                         }
 718                 }
 719         }
 720
 721         /* Pass it on.. */
 722         token = stream->token;
 723         token_type(token) = TOKEN_SPECIAL;
 724         token->special = value;
 725         add_token(stream);
 726         return next;
 727 }
 728
 729 #define IDENT_HASH_BITS (13)
 730 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 731 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 732
 733 #define ident_hash_init(c)              (c)
 734 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 735 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 736
 737 static struct ident *hash_table[IDENT_HASH_SIZE];
 738 static int ident_hit, ident_miss, idents;
 739
 740 void show_identifier_stats(void)
 741 {
 742         int i;
 743         int distribution[100];
 744
 745         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 746                 ident_hit, ident_miss);
 747
 748         for (i = 0; i < 100; i++)
 749                 distribution[i] = 0;
 750
 751         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 752                 struct ident * ident = hash_table[i];
 753                 int count = 0;
 754
 755                 while (ident) {
 756                         count++;
 757                         ident = ident->next;
 758                 }
 759                 if (count > 99)
 760                         count = 99;
 761                 distribution[count]++;
 762         }
 763
 764         for (i = 0; i < 100; i++) {
 765                 if (distribution[i])
 766                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 767         }
 768 }
 769
 770 static struct ident *alloc_ident(const char *name, int len)
 771 {
 772         struct ident *ident = __alloc_ident(len);
 773         ident->symbols = NULL;
 774         ident->len = len;
 775         ident->tainted = 0;
 776         memcpy(ident->name, name, len);
 777         return ident;
 778 }
 779
 780 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 781 {
 782         ident->next = hash_table[hash];
 783         hash_table[hash] = ident;
 784         ident_miss++;
 785         return ident;
 786 }
 787
 788 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 789 {
 790         struct ident *ident;
 791         struct ident **p;
 792
 793         p = &hash_table[hash];
 794         while ((ident = *p) != NULL) {
 795                 if (ident->len == (unsigned char) len) {
 796                         if (strncmp(name, ident->name, len) != 0)
 797                                 goto next;
 798
 799                         ident_hit++;
 800                         return ident;
 801                 }
 802 next:
 803                 //misses++;
 804                 p = &ident->next;
 805         }
 806         ident = alloc_ident(name, len);
 807         *p = ident;
 808         ident->next = NULL;
 809         ident_miss++;
 810         idents++;
 811         return ident;
 812 }
 813
 814 static unsigned long hash_name(const char *name, int len)
 815 {
 816         unsigned long hash;
 817         const unsigned char *p = (const unsigned char *)name;
 818
 819         hash = ident_hash_init(*p++);
 820         while (--len) {
 821                 unsigned int i = *p++;
 822                 hash = ident_hash_add(hash, i);
 823         }
 824         return ident_hash_end(hash);
 825 }
 826
 827 struct ident *hash_ident(struct ident *ident)
 828 {
 829         return insert_hash(ident, hash_name(ident->name, ident->len));
 830 }
 831
 832 struct ident *built_in_ident(const char *name)
 833 {
 834         int len = strlen(name);
 835         return create_hashed_ident(name, len, hash_name(name, len));
 836 }
 837
 838 struct token *built_in_token(int stream, const char *name)
 839 {
 840         struct token *token;
 841
 842         token = __alloc_token(0);
 843         token->pos.stream = stream;
 844         token_type(token) = TOKEN_IDENT;
 845         token->ident = built_in_ident(name);
 846         return token;
 847 }
 848
 849 static int get_one_identifier(int c, stream_t *stream)
 850 {
 851         struct token *token;
 852         struct ident *ident;
 853         unsigned long hash;
 854         char buf[256];
 855         int len = 1;
 856         int next;
 857
 858         hash = ident_hash_init(c);
 859         buf[0] = c;
 860         for (;;) {
 861                 next = nextchar(stream);
 862                 if (!(cclass[next + 1] & (Letter | Digit)))
 863                         break;
 864                 if (len >= sizeof(buf))
 865                         break;
 866                 hash = ident_hash_add(hash, next);
 867                 buf[len] = next;
 868                 len++;
 869         };
 870         hash = ident_hash_end(hash);
 871
 872         ident = create_hashed_ident(buf, len, hash);
 873
 874         /* Pass it on.. */
 875         token = stream->token;
 876         token_type(token) = TOKEN_IDENT;
 877         token->ident = ident;
 878         add_token(stream);
 879         return next;
 880 }
 881
 882 static int get_one_token(int c, stream_t *stream)
 883 {
 884         long class = cclass[c + 1];
 885         if (class & Digit)
 886                 return get_one_number(c, nextchar(stream), stream);
 887         if (class & Letter)
 888                 return get_one_identifier(c, stream);
 889         return get_one_special(c, stream);
 890 }
 891
 892 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 893         unsigned char *buf, unsigned int buf_size)
 894 {
 895         struct token *begin;
 896
 897         stream->nr = idx;
 898         stream->line = 1;
 899         stream->newline = 1;
 900         stream->whitespace = 0;
 901         stream->pos = 0;
 902
 903         stream->token = NULL;
 904         stream->fd = fd;
 905         stream->offset = 0;
 906         stream->size = buf_size;
 907         stream->buffer = buf;
 908
 909         begin = alloc_token(stream);
 910         token_type(begin) = TOKEN_STREAMBEGIN;
 911         stream->tokenlist = &begin->next;
 912         return begin;
 913 }
 914
 915 static struct token *tokenize_stream(stream_t *stream)
 916 {
 917         int c = nextchar(stream);
 918         while (c != EOF) {
 919                 if (!isspace(c)) {
 920                         struct token *token = alloc_token(stream);
 921                         stream->token = token;
 922                         stream->newline = 0;
 923                         stream->whitespace = 0;
 924                         c = get_one_token(c, stream);
 925                         continue;
 926                 }
 927                 stream->whitespace = 1;
 928                 c = nextchar(stream);
 929         }
 930         return mark_eof(stream);
 931 }
 932
 933 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
 934 {
 935         stream_t stream;
 936         struct token *begin;
 937
 938         begin = setup_stream(&stream, 0, -1, buffer, size);
 939         *endtoken = tokenize_stream(&stream);
 940         return begin;
 941 }
 942
 943 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
 944 {
 945         struct token *begin, *end;
 946         stream_t stream;
 947         unsigned char buffer[BUFSIZE];
 948         int idx;
 949
 950         idx = init_stream(name, fd, next_path);
 951         if (idx < 0) {
 952                 // info(endtoken->pos, "File %s is const", name);
 953                 return endtoken;
 954         }
 955
 956         begin = setup_stream(&stream, idx, fd, buffer, 0);
 957         end = tokenize_stream(&stream);
 958         if (endtoken)
 959                 end->next = endtoken;
 960         return begin;
 961 }