tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  *  Licensed under the Open Software License version 1.1
   9  */
  10 #include <stdio.h>
  11 #include <stdlib.h>
  12 #include <stdarg.h>
  13 #include <stddef.h>
  14 #include <string.h>
  15 #include <ctype.h>
  16 #include <unistd.h>
  17 #include <stdint.h>
  18
  19 #include "lib.h"
  20 #include "allocate.h"
  21 #include "token.h"
  22 #include "symbol.h"
  23
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Number of stream slots handed out so far; advanced by init_stream(). */
int input_stream_nr = 0;
/* Growable array of stream descriptors, (re)allocated by init_stream(). */
struct stream *input_streams;
/* Capacity, in entries, of the current input_streams allocation. */
static int input_streams_allocated;
/* Tab width used by nextchar_slow() when advancing the column on '\t'. */
unsigned int tabstop = 8;

/* Number of bytes requested from read() when a stream buffer is refilled. */
#define BUFSIZE (8192)
  32
/*
 * Per-stream tokenizer state: where the input comes from, the current
 * source position, and the token list that is being built.
 */
typedef struct {
        /* input descriptor; read index into, and valid byte count of, buffer */
        int fd, offset, size;
        /* current column and line; stream number copied into positions */
        int pos, line, nr;
        /* pending flags copied into every position by stream_pos() */
        int newline, whitespace;
        /* link through which add_token() appends the next finished token */
        struct token **tokenlist;
        /* token currently being filled in, or NULL when there is none */
        struct token *token;
        /* raw input bytes */
        unsigned char *buffer;
} stream_t;
  41
  42 const char *stream_name(int stream)
  43 {
  44         if (stream < 0 || stream > input_stream_nr)
  45                 return "<bad stream>";
  46         return input_streams[stream].name;
  47 }
  48
  49 static struct position stream_pos(stream_t *stream)
  50 {
  51         struct position pos;
  52         pos.type = 0;
  53         pos.stream = stream->nr;
  54         pos.newline = stream->newline;
  55         pos.whitespace = stream->whitespace;
  56         pos.pos = stream->pos;
  57         pos.line = stream->line;
  58         pos.noexpand = 0;
  59         return pos;
  60 }
  61
  62 const char *show_special(int val)
  63 {
  64         static char buffer[4];
  65
  66         buffer[0] = val;
  67         buffer[1] = 0;
  68         if (val >= SPECIAL_BASE)
  69                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  70         return buffer;
  71 }
  72
  73 const char *show_ident(const struct ident *ident)
  74 {
  75         static char buffer[256];
  76         if (!ident)
  77                 return "<noident>";
  78         sprintf(buffer, "%.*s", ident->len, ident->name);
  79         return buffer;
  80 }
  81
  82 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
  83 {
  84         if (isprint(c)) {
  85                 if (c == escape || c == '\\')
  86                         *ptr++ = '\\';
  87                 *ptr++ = c;
  88                 return ptr;
  89         }
  90         *ptr++ = '\\';
  91         switch (c) {
  92         case '\n':
  93                 *ptr++ = 'n';
  94                 return ptr;
  95         case '\t':
  96                 *ptr++ = 't';
  97                 return ptr;
  98         }
  99         if (!isdigit(next))
 100                 return ptr + sprintf(ptr, "%o", c);
 101
 102         return ptr + sprintf(ptr, "%03o", c);
 103 }
 104
 105 const char *show_string(const struct string *string)
 106 {
 107         static char buffer[4 * MAX_STRING + 3];
 108         char *ptr;
 109         int i;
 110
 111         if (!string->length)
 112                 return "<bad_string>";
 113         ptr = buffer;
 114         *ptr++ = '"';
 115         for (i = 0; i < string->length-1; i++) {
 116                 const char *p = string->data + i;
 117                 ptr = charstr(ptr, p[0], '"', p[1]);
 118         }
 119         *ptr++ = '"';
 120         *ptr = '\0';
 121         return buffer;
 122 }
 123
 124 static const char *show_char(const char *s, size_t len, char prefix, char delim)
 125 {
 126         static char buffer[MAX_STRING + 4];
 127         char *p = buffer;
 128         if (prefix)
 129                 *p++ = prefix;
 130         *p++ = delim;
 131         memcpy(p, s, len);
 132         p += len;
 133         *p++ = delim;
 134         *p++ = '\0';
 135         return buffer;
 136 }
 137
 138 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 139 {
 140         static char buffer[2*MAX_STRING + 6];
 141         size_t i;
 142         char *p = buffer;
 143         if (prefix)
 144                 *p++ = prefix;
 145         if (delim == '"')
 146                 *p++ = '\\';
 147         *p++ = delim;
 148         for (i = 0; i < len; i++) {
 149                 if (s[i] == '"' || s[i] == '\\')
 150                         *p++ = '\\';
 151                 *p++ = s[i];
 152         }
 153         if (delim == '"')
 154                 *p++ = '\\';
 155         *p++ = delim;
 156         *p++ = '\0';
 157         return buffer;
 158 }
 159
 160 const char *show_token(const struct token *token)
 161 {
 162         static char buffer[256];
 163
 164         if (!token)
 165                 return "<no token>";
 166         switch (token_type(token)) {
 167         case TOKEN_ERROR:
 168                 return "syntax error";
 169
 170         case TOKEN_EOF:
 171                 return "end-of-input";
 172
 173         case TOKEN_IDENT:
 174                 return show_ident(token->ident);
 175
 176         case TOKEN_NUMBER:
 177                 return token->number;
 178
 179         case TOKEN_SPECIAL:
 180                 return show_special(token->special);
 181
 182         case TOKEN_CHAR:
 183                 return show_char(token->string->data,
 184                         token->string->length - 1, 0, '\'');
 185         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 186                 return show_char(token->embedded,
 187                         token_type(token) - TOKEN_CHAR, 0, '\'');
 188         case TOKEN_WIDE_CHAR:
 189                 return show_char(token->string->data,
 190                         token->string->length - 1, 'L', '\'');
 191         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 192                 return show_char(token->embedded,
 193                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 194         case TOKEN_STRING:
 195                 return show_char(token->string->data,
 196                         token->string->length - 1, 0, '"');
 197         case TOKEN_WIDE_STRING:
 198                 return show_char(token->string->data,
 199                         token->string->length - 1, 'L', '"');
 200
 201         case TOKEN_STREAMBEGIN:
 202                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 203                 return buffer;
 204
 205         case TOKEN_STREAMEND:
 206                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 207                 return buffer;
 208
 209         case TOKEN_UNTAINT:
 210                 sprintf(buffer, "<untaint>");
 211                 return buffer;
 212
 213         case TOKEN_ARG_COUNT:
 214                 sprintf(buffer, "<argcnt>");
 215                 return buffer;
 216
 217         default:
 218                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 219                 return buffer;
 220         }
 221 }
 222
 223 const char *quote_token(const struct token *token)
 224 {
 225         static char buffer[256];
 226
 227         switch (token_type(token)) {
 228         case TOKEN_ERROR:
 229                 return "syntax error";
 230
 231         case TOKEN_IDENT:
 232                 return show_ident(token->ident);
 233
 234         case TOKEN_NUMBER:
 235                 return token->number;
 236
 237         case TOKEN_SPECIAL:
 238                 return show_special(token->special);
 239
 240         case TOKEN_CHAR:
 241                 return quote_char(token->string->data,
 242                         token->string->length - 1, 0, '\'');
 243         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 244                 return quote_char(token->embedded,
 245                         token_type(token) - TOKEN_CHAR, 0, '\'');
 246         case TOKEN_WIDE_CHAR:
 247                 return quote_char(token->string->data,
 248                         token->string->length - 1, 'L', '\'');
 249         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 250                 return quote_char(token->embedded,
 251                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 252         case TOKEN_STRING:
 253                 return quote_char(token->string->data,
 254                         token->string->length - 1, 0, '"');
 255         case TOKEN_WIDE_STRING:
 256                 return quote_char(token->string->data,
 257                         token->string->length - 1, 'L', '"');
 258         default:
 259                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 260                 return buffer;
 261         }
 262 }
 263
 264 #define HASHED_INPUT_BITS (6)
 265 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 266 #define HASH_PRIME 0x9e370001UL
 267
 268 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 269
 270 int *hash_stream(const char *name)
 271 {
 272         uint32_t hash = 0;
 273         unsigned char c;
 274
 275         while ((c = *name++) != 0)
 276                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 277
 278         hash *= HASH_PRIME;
 279         hash >>= 32 - HASHED_INPUT_BITS;
 280         return input_stream_hashes + hash;
 281 }
 282
 283 int init_stream(const char *name, int fd, const char **next_path)
 284 {
 285         int stream = input_stream_nr, *hash;
 286         struct stream *current;
 287
 288         if (stream >= input_streams_allocated) {
 289                 int newalloc = stream * 4 / 3 + 10;
 290                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 291                 if (!input_streams)
 292                         die("Unable to allocate more streams space");
 293                 input_streams_allocated = newalloc;
 294         }
 295         current = input_streams + stream;
 296         memset(current, 0, sizeof(*current));
 297         current->name = name;
 298         current->fd = fd;
 299         current->next_path = next_path;
 300         current->path = NULL;
 301         current->constant = CONSTANT_FILE_MAYBE;
 302         input_stream_nr = stream+1;
 303         hash = hash_stream(name);
 304         current->next_stream = *hash;
 305         *hash = stream;
 306         return stream;
 307 }
 308
 309 static struct token * alloc_token(stream_t *stream)
 310 {
 311         struct token *token = __alloc_token(0);
 312         token->pos = stream_pos(stream);
 313         return token;
 314 }
 315
/*
 *  Argh...  That was surprisingly messy - handling '\r' complicates the
 *  things a _lot_.
 *
 *  Slow-path character reader.  Returns the next character of the
 *  stream, or EOF, and takes care of everything the fast path in
 *  nextchar() does not:
 *   - refilling the buffer with read() when it is exhausted;
 *   - turning "\r\n", and a '\r' followed by anything else, into '\n';
 *   - dropping backslash-newline pairs (line splicing);
 *   - keeping stream->pos / stream->line / stream->newline up to date,
 *     including tab stops;
 *   - warning about a missing newline, or a trailing backslash-newline,
 *     at end of file.
 */
static int nextchar_slow(stream_t *stream)
{
        int offset = stream->offset;
        int size = stream->size;
        int c;
        int spliced = 0, had_cr, had_backslash;

restart:
        /* start (or start over after a splice) with no pending '\r' or '\\' */
        had_cr = had_backslash = 0;

repeat:
        if (offset >= size) {
                /* buffer exhausted: a negative fd means there is nothing to read */
                if (stream->fd < 0)
                        goto got_eof;
                size = read(stream->fd, stream->buffer, BUFSIZE);
                /* both end of file and a read error end the stream */
                if (size <= 0)
                        goto got_eof;
                stream->size = size;
                stream->offset = offset = 0;
        }

        c = stream->buffer[offset++];
        /* the previous character was '\r': decide what to do with this one */
        if (had_cr)
                goto check_lf;

        if (c == '\r') {
                had_cr = 1;
                goto repeat;
        }

norm:
        if (!had_backslash) {
                switch (c) {
                case '\t':
                        /* advance to the next tab stop */
                        stream->pos += tabstop - stream->pos % tabstop;
                        break;
                case '\n':
                        stream->line++;
                        stream->pos = 0;
                        stream->newline = 1;
                        break;
                case '\\':
                        /* might start a line splice: look at the next character */
                        had_backslash = 1;
                        stream->pos++;
                        goto repeat;
                default:
                        stream->pos++;
                }
        } else {
                if (c == '\n') {
                        /* backslash-newline: drop both and keep reading */
                        stream->line++;
                        stream->pos = 0;
                        spliced = 1;
                        goto restart;
                }
                /* ordinary backslash: un-read the character after it */
                offset--;
                c = '\\';
        }
out:
        stream->offset = offset;

        return c;

check_lf:
        /* after '\r': swallow a following '\n', otherwise un-read the
         * character; either way the '\r' is reported as '\n' */
        if (c != '\n')
                offset--;
        c = '\n';
        goto norm;

got_eof:
        /* a backslash that was read just before the end is still returned */
        if (had_backslash) {
                c = '\\';
                goto out;
        }
        /* a non-zero column means the last line had no terminating newline */
        if (stream->pos)
                warning(stream_pos(stream), "no newline at end of file");
        else if (spliced)
                warning(stream_pos(stream), "backslash-newline at end of file");
        return EOF;
}
 400
 401 /*
 402  *  We want that as light as possible while covering all normal cases.
 403  *  Slow path (including the logics with line-splicing and EOF sanity
 404  *  checks) is in nextchar_slow().
 405  */
 406 static inline int nextchar(stream_t *stream)
 407 {
 408         int offset = stream->offset;
 409
 410         if (offset < stream->size) {
 411                 int c = stream->buffer[offset++];
 412                 static const char special[256] = {
 413                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 414                 };
 415                 if (!special[c]) {
 416                         stream->offset = offset;
 417                         stream->pos++;
 418                         return c;
 419                 }
 420         }
 421         return nextchar_slow(stream);
 422 }
 423
 424 struct token eof_token_entry;
 425
 426 static struct token *mark_eof(stream_t *stream)
 427 {
 428         struct token *end;
 429
 430         end = alloc_token(stream);
 431         token_type(end) = TOKEN_STREAMEND;
 432         end->pos.newline = 1;
 433
 434         eof_token_entry.next = &eof_token_entry;
 435         eof_token_entry.pos.newline = 1;
 436
 437         end->next =  &eof_token_entry;
 438         *stream->tokenlist = end;
 439         stream->tokenlist = NULL;
 440         return end;
 441 }
 442
 443 static void add_token(stream_t *stream)
 444 {
 445         struct token *token = stream->token;
 446
 447         stream->token = NULL;
 448         token->next = NULL;
 449         *stream->tokenlist = token;
 450         stream->tokenlist = &token->next;
 451 }
 452
 453 static void drop_token(stream_t *stream)
 454 {
 455         stream->newline |= stream->token->pos.newline;
 456         stream->whitespace |= stream->token->pos.whitespace;
 457         stream->token = NULL;
 458 }
 459
/* Character-class bits stored in the cclass[] table. */
enum {
        Letter = 1,             /* may appear in an identifier */
        Digit = 2,              /* decimal digit */
        Hex = 4,                /* hexadecimal digit */
        Exp = 8,                /* exponent marker (e, E, p, P); may be followed by a sign */
        Dot = 16,               /* '.' */
        ValidSecond = 32,       /* may be the second character of a two-character punctuator */
        Quote = 64,             /* ' or " */
        Escape = 128,           /* accepted after a backslash in a string or character constant */
};
 470
/*
 * Character classification table.  It is indexed by (character + 1), so
 * that EOF (-1) lands on entry 0, which - like every entry not listed
 * below - has no class bits set.
 */
static const long cclass[257] = {
        ['0' + 1 ... '7' + 1] = Digit | Hex | Escape,   /* \<octal> */
        ['8' + 1 ... '9' + 1] = Digit | Hex,
        ['A' + 1 ... 'D' + 1] = Letter | Hex,
        ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
        ['F' + 1] = Letter | Hex,
        ['G' + 1 ... 'O' + 1] = Letter,
        ['P' + 1] = Letter | Exp,       /* P<exp> */
        ['Q' + 1 ... 'Z' + 1] = Letter,
        ['a' + 1 ... 'b' + 1] = Letter | Hex | Escape, /* \a, \b */
        ['c' + 1 ... 'd' + 1] = Letter | Hex,
        ['e' + 1] = Letter | Hex | Exp | Escape,/* \e, e<exp> */
        ['f' + 1] = Letter | Hex | Escape,      /* \f */
        ['g' + 1 ... 'm' + 1] = Letter,
        ['n' + 1] = Letter | Escape,    /* \n */
        ['o' + 1] = Letter,
        ['p' + 1] = Letter | Exp,       /* p<exp> */
        ['q' + 1] = Letter,
        ['r' + 1] = Letter | Escape,    /* \r */
        ['s' + 1] = Letter,
        ['t' + 1] = Letter | Escape,    /* \t */
        ['u' + 1] = Letter,
        ['v' + 1] = Letter | Escape,    /* \v */
        ['w' + 1] = Letter,
        ['x' + 1] = Letter | Escape,    /* \x<hex> */
        ['y' + 1 ... 'z' + 1] = Letter,
        ['_' + 1] = Letter,
        ['.' + 1] = Dot | ValidSecond,
        ['=' + 1] = ValidSecond,
        ['+' + 1] = ValidSecond,
        ['-' + 1] = ValidSecond,
        ['>' + 1] = ValidSecond,
        ['<' + 1] = ValidSecond,
        ['&' + 1] = ValidSecond,
        ['|' + 1] = ValidSecond,
        ['#' + 1] = ValidSecond,
        ['\'' + 1] = Quote | Escape,
        ['"' + 1] = Quote | Escape,
        ['\\' + 1] = Escape,
        ['?' + 1] = Escape,
};
 512
 513 /*
 514  * pp-number:
 515  *      digit
 516  *      . digit
 517  *      pp-number digit
 518  *      pp-number identifier-nodigit
 519  *      pp-number e sign
 520  *      pp-number E sign
 521  *      pp-number p sign
 522  *      pp-number P sign
 523  *      pp-number .
 524  */
 525 static int get_one_number(int c, int next, stream_t *stream)
 526 {
 527         struct token *token;
 528         static char buffer[4095];
 529         char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
 530         int len;
 531
 532         *p++ = c;
 533         for (;;) {
 534                 long class =  cclass[next + 1];
 535                 if (!(class & (Dot | Digit | Letter)))
 536                         break;
 537                 if (p != buffer_end)
 538                         *p++ = next;
 539                 next = nextchar(stream);
 540                 if (class & Exp) {
 541                         if (next == '-' || next == '+') {
 542                                 if (p != buffer_end)
 543                                         *p++ = next;
 544                                 next = nextchar(stream);
 545                         }
 546                 }
 547         }
 548
 549         if (p == buffer_end) {
 550                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 551                       buffer_end - buffer);
 552                 // Pretend we saw just "1".
 553                 buffer[0] = '1';
 554                 p = buffer + 1;
 555         }
 556
 557         *p++ = 0;
 558         len = p - buffer;
 559         buf = __alloc_bytes(len);
 560         memcpy(buf, buffer, len);
 561
 562         token = stream->token;
 563         token_type(token) = TOKEN_NUMBER;
 564         token->number = buf;
 565         add_token(stream);
 566
 567         return next;
 568 }
 569
 570 static int eat_string(int next, stream_t *stream, enum token_type type)
 571 {
 572         static char buffer[MAX_STRING];
 573         struct string *string;
 574         struct token *token = stream->token;
 575         int len = 0;
 576         int escape;
 577         int want_hex = 0;
 578         char delim = type < TOKEN_STRING ? '\'' : '"';
 579
 580         for (escape = 0; escape || next != delim; next = nextchar(stream)) {
 581                 if (len < MAX_STRING)
 582                         buffer[len] = next;
 583                 len++;
 584                 if (next == '\n') {
 585                         warning(stream_pos(stream),
 586                                 "Newline in string or character constant");
 587                         if (delim == '\'') /* assume it's lost ' */
 588                                 break;
 589                 }
 590                 if (next == EOF) {
 591                         warning(stream_pos(stream),
 592                                 "End of file in middle of string");
 593                         return next;
 594                 }
 595                 if (!escape) {
 596                         if (want_hex && !(cclass[next + 1] & Hex))
 597                                 warning(stream_pos(stream),
 598                                         "\\x used with no following hex digits");
 599                         want_hex = 0;
 600                         escape = next == '\\';
 601                 } else {
 602                         if (!(cclass[next + 1] & Escape))
 603                                 warning(stream_pos(stream),
 604                                         "Unknown escape '%c'", next);
 605                         escape = 0;
 606                         want_hex = next == 'x';
 607                 }
 608         }
 609         if (want_hex)
 610                 warning(stream_pos(stream),
 611                         "\\x used with no following hex digits");
 612         if (len > MAX_STRING) {
 613                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 614                 len = MAX_STRING;
 615         }
 616         if (delim == '\'' && len <= 4) {
 617                 if (len == 0) {
 618                         sparse_error(stream_pos(stream),
 619                                 "empty character constant");
 620                         return nextchar(stream);
 621                 }
 622                 token_type(token) = type + len;
 623                 memset(buffer + len, '\0', 4 - len);
 624                 memcpy(token->embedded, buffer, 4);
 625         } else {
 626                 token_type(token) = type;
 627                 string = __alloc_string(len+1);
 628                 memcpy(string->data, buffer, len);
 629                 string->data[len] = '\0';
 630                 string->length = len+1;
 631                 token->string = string;
 632         }
 633
 634         /* Pass it on.. */
 635         token = stream->token;
 636         add_token(stream);
 637         return nextchar(stream);
 638 }
 639
 640 static int drop_stream_eoln(stream_t *stream)
 641 {
 642         drop_token(stream);
 643         for (;;) {
 644                 switch (nextchar(stream)) {
 645                 case EOF:
 646                         return EOF;
 647                 case '\n':
 648                         return nextchar(stream);
 649                 }
 650         }
 651 }
 652
 653 static int drop_stream_comment(stream_t *stream)
 654 {
 655         int newline;
 656         int next;
 657         drop_token(stream);
 658         newline = stream->newline;
 659
 660         next = nextchar(stream);
 661         for (;;) {
 662                 int curr = next;
 663                 if (curr == EOF) {
 664                         warning(stream_pos(stream), "End of file in the middle of a comment");
 665                         return curr;
 666                 }
 667                 next = nextchar(stream);
 668                 if (curr == '*' && next == '/')
 669                         break;
 670         }
 671         stream->newline = newline;
 672         return nextchar(stream);
 673 }
 674
/* Spellings of the multi-character punctuators; show_special() indexes
 * this table with (code - SPECIAL_BASE). */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* NOTE(review): presumably the number of entries in combinations[] -
 * confirm against the SPECIAL_* enum, which is declared elsewhere. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)

/* hash function for two-character punctuators - all give unique values
 * in the range 0..31 for the pairs listed in hash_results[] */
#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 681
/*
 * Character pair stored in each special_hash() slot; get_one_special()
 * compares the pair it has read against the stored one to confirm a hit.
 *
 * Note that we won't get false positives: slots not listed below hold
 * {0, 0}, which could only match the pair (0, 0).  special_hash(0,0) is
 * 0, and entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
        RES('+', '='), /* 00 */
        RES('/', '='), /* 01 */
        RES('^', '='), /* 05 */
        RES('&', '&'), /* 07 */
        RES('#', '#'), /* 08 */
        RES('<', '<'), /* 0a */
        RES('<', '='), /* 0c */
        RES('!', '='), /* 0e */
        RES('%', '='), /* 0f */
        RES('-', '-'), /* 10 */
        RES('-', '='), /* 11 */
        RES('-', '>'), /* 13 */
        RES('=', '='), /* 15 */
        RES('&', '='), /* 17 */
        RES('*', '='), /* 18 */
        RES('.', '.'), /* 1a */
        RES('+', '+'), /* 1b */
        RES('|', '='), /* 1c */
        RES('>', '='), /* 1d */
        RES('|', '|'), /* 1e */
        RES('>', '>')  /* 1f */
#undef RES
};
/*
 * SPECIAL_* code for each two-character punctuator, stored in the same
 * special_hash() slot as the matching pair in hash_results[].
 */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
        CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
        CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
        CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
        CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
        CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
        CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
        CODE('<', '=', SPECIAL_LTE), /* 0c */
        CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
        CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
        CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
        CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
        CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
        CODE('=', '=', SPECIAL_EQUAL), /* 15 */
        CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
        CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
        CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
        CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
        CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
        CODE('>', '=', SPECIAL_GTE), /* 1d */
        CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
        CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
 736
 737 static int get_one_special(int c, stream_t *stream)
 738 {
 739         struct token *token;
 740         int next, value, i;
 741
 742         next = nextchar(stream);
 743
 744         /*
 745          * Check for numbers, strings, character constants, and comments
 746          */
 747         switch (c) {
 748         case '.':
 749                 if (next >= '0' && next <= '9')
 750                         return get_one_number(c, next, stream);
 751                 break;
 752         case '"':
 753                 return eat_string(next, stream, TOKEN_STRING);
 754         case '\'':
 755                 return eat_string(next, stream, TOKEN_CHAR);
 756         case '/':
 757                 if (next == '/')
 758                         return drop_stream_eoln(stream);
 759                 if (next == '*')
 760                         return drop_stream_comment(stream);
 761         }
 762
 763         /*
 764          * Check for combinations
 765          */
 766         value = c;
 767         if (cclass[next + 1] & ValidSecond) {
 768                 i = special_hash(c, next);
 769                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 770                         value = code[i];
 771                         next = nextchar(stream);
 772                         if (value >= SPECIAL_LEFTSHIFT &&
 773                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 774                                 value += 3;
 775                                 next = nextchar(stream);
 776                         }
 777                 }
 778         }
 779
 780         /* Pass it on.. */
 781         token = stream->token;
 782         token_type(token) = TOKEN_SPECIAL;
 783         token->special = value;
 784         add_token(stream);
 785         return next;
 786 }
 787
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: seed it with the first character, mix in
 * each further character, then fold the result down to a bucket index.
 */
#define ident_hash_init(c)              (c)
#define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
#define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in a bucket are chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup hits, insertions/misses, and identifiers created so far. */
static int ident_hit, ident_miss, idents;
 798
 799 void show_identifier_stats(void)
 800 {
 801         int i;
 802         int distribution[100];
 803
 804         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 805                 ident_hit, ident_miss);
 806
 807         for (i = 0; i < 100; i++)
 808                 distribution[i] = 0;
 809
 810         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 811                 struct ident * ident = hash_table[i];
 812                 int count = 0;
 813
 814                 while (ident) {
 815                         count++;
 816                         ident = ident->next;
 817                 }
 818                 if (count > 99)
 819                         count = 99;
 820                 distribution[count]++;
 821         }
 822
 823         for (i = 0; i < 100; i++) {
 824                 if (distribution[i])
 825                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 826         }
 827 }
 828
 829 static struct ident *alloc_ident(const char *name, int len)
 830 {
 831         struct ident *ident = __alloc_ident(len);
 832         ident->symbols = NULL;
 833         ident->len = len;
 834         ident->tainted = 0;
 835         memcpy(ident->name, name, len);
 836         return ident;
 837 }
 838
 839 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 840 {
 841         ident->next = hash_table[hash];
 842         hash_table[hash] = ident;
 843         ident_miss++;
 844         return ident;
 845 }
 846
 847 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 848 {
 849         struct ident *ident;
 850         struct ident **p;
 851
 852         p = &hash_table[hash];
 853         while ((ident = *p) != NULL) {
 854                 if (ident->len == (unsigned char) len) {
 855                         if (strncmp(name, ident->name, len) != 0)
 856                                 goto next;
 857
 858                         ident_hit++;
 859                         return ident;
 860                 }
 861 next:
 862                 //misses++;
 863                 p = &ident->next;
 864         }
 865         ident = alloc_ident(name, len);
 866         *p = ident;
 867         ident->next = NULL;
 868         ident_miss++;
 869         idents++;
 870         return ident;
 871 }
 872
 873 static unsigned long hash_name(const char *name, int len)
 874 {
 875         unsigned long hash;
 876         const unsigned char *p = (const unsigned char *)name;
 877
 878         hash = ident_hash_init(*p++);
 879         while (--len) {
 880                 unsigned int i = *p++;
 881                 hash = ident_hash_add(hash, i);
 882         }
 883         return ident_hash_end(hash);
 884 }
 885
 886 struct ident *hash_ident(struct ident *ident)
 887 {
 888         return insert_hash(ident, hash_name(ident->name, ident->len));
 889 }
 890
 891 struct ident *built_in_ident(const char *name)
 892 {
 893         int len = strlen(name);
 894         return create_hashed_ident(name, len, hash_name(name, len));
 895 }
 896
 897 struct token *built_in_token(int stream, const char *name)
 898 {
 899         struct token *token;
 900
 901         token = __alloc_token(0);
 902         token->pos.stream = stream;
 903         token_type(token) = TOKEN_IDENT;
 904         token->ident = built_in_ident(name);
 905         return token;
 906 }
 907
 908 static int get_one_identifier(int c, stream_t *stream)
 909 {
 910         struct token *token;
 911         struct ident *ident;
 912         unsigned long hash;
 913         char buf[256];
 914         int len = 1;
 915         int next;
 916
 917         hash = ident_hash_init(c);
 918         buf[0] = c;
 919         for (;;) {
 920                 next = nextchar(stream);
 921                 if (!(cclass[next + 1] & (Letter | Digit)))
 922                         break;
 923                 if (len >= sizeof(buf))
 924                         break;
 925                 hash = ident_hash_add(hash, next);
 926                 buf[len] = next;
 927                 len++;
 928         };
 929         if (cclass[next + 1] & Quote) {
 930                 if (len == 1 && buf[0] == 'L') {
 931                         if (next == '\'')
 932                                 return eat_string(nextchar(stream), stream,
 933                                                         TOKEN_WIDE_CHAR);
 934                         else
 935                                 return eat_string(nextchar(stream), stream,
 936                                                         TOKEN_WIDE_STRING);
 937                 }
 938         }
 939         hash = ident_hash_end(hash);
 940         ident = create_hashed_ident(buf, len, hash);
 941
 942         /* Pass it on.. */
 943         token = stream->token;
 944         token_type(token) = TOKEN_IDENT;
 945         token->ident = ident;
 946         add_token(stream);
 947         return next;
 948 }
 949
 950 static int get_one_token(int c, stream_t *stream)
 951 {
 952         long class = cclass[c + 1];
 953         if (class & Digit)
 954                 return get_one_number(c, nextchar(stream), stream);
 955         if (class & Letter)
 956                 return get_one_identifier(c, stream);
 957         return get_one_special(c, stream);
 958 }
 959
 960 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 961         unsigned char *buf, unsigned int buf_size)
 962 {
 963         struct token *begin;
 964
 965         stream->nr = idx;
 966         stream->line = 1;
 967         stream->newline = 1;
 968         stream->whitespace = 0;
 969         stream->pos = 0;
 970
 971         stream->token = NULL;
 972         stream->fd = fd;
 973         stream->offset = 0;
 974         stream->size = buf_size;
 975         stream->buffer = buf;
 976
 977         begin = alloc_token(stream);
 978         token_type(begin) = TOKEN_STREAMBEGIN;
 979         stream->tokenlist = &begin->next;
 980         return begin;
 981 }
 982
 983 static struct token *tokenize_stream(stream_t *stream)
 984 {
 985         int c = nextchar(stream);
 986         while (c != EOF) {
 987                 if (!isspace(c)) {
 988                         struct token *token = alloc_token(stream);
 989                         stream->token = token;
 990                         stream->newline = 0;
 991                         stream->whitespace = 0;
 992                         c = get_one_token(c, stream);
 993                         continue;
 994                 }
 995                 stream->whitespace = 1;
 996                 c = nextchar(stream);
 997         }
 998         return mark_eof(stream);
 999 }
1000
1001 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1002 {
1003         stream_t stream;
1004         struct token *begin;
1005
1006         begin = setup_stream(&stream, 0, -1, buffer, size);
1007         *endtoken = tokenize_stream(&stream);
1008         return begin;
1009 }
1010
1011 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1012 {
1013         struct token *begin, *end;
1014         stream_t stream;
1015         unsigned char buffer[BUFSIZE];
1016         int idx;
1017
1018         idx = init_stream(name, fd, next_path);
1019         if (idx < 0) {
1020                 // info(endtoken->pos, "File %s is const", name);
1021                 return endtoken;
1022         }
1023
1024         begin = setup_stream(&stream, idx, fd, buffer, 0);
1025         end = tokenize_stream(&stream);
1026         if (endtoken)
1027                 end->next = endtoken;
1028         return begin;
1029 }