tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a copy
   9  * of this software and associated documentation files (the "Software"), to deal
  10  * in the Software without restriction, including without limitation the rights
  11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12  * copies of the Software, and to permit persons to whom the Software is
  13  * furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included in
  16  * all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24  * THE SOFTWARE.
  25  */
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <stdarg.h>
  29 #include <stddef.h>
  30 #include <string.h>
  31 #include <ctype.h>
  32 #include <unistd.h>
  33 #include <stdint.h>
  34
  35 #include "lib.h"
  36 #include "allocate.h"
  37 #include "token.h"
  38 #include "symbol.h"
  39
  40 #define EOF (-1)
  41
  42 int input_stream_nr = 0;
  43 struct stream *input_streams;
  44 static int input_streams_allocated;
  45 unsigned int tabstop = 8;
  46 int no_lineno = 0;
  47
  48 #define BUFSIZE (8192)
  49
  50 typedef struct {
  51         int fd, offset, size;
  52         int pos, line, nr;
  53         int newline, whitespace;
  54         struct token **tokenlist;
  55         struct token *token;
  56         unsigned char *buffer;
  57 } stream_t;
  58
  59 const char *stream_name(int stream)
  60 {
  61         if (stream < 0 || stream > input_stream_nr)
  62                 return "<bad stream>";
  63         return input_streams[stream].name;
  64 }
  65
  66 static struct position stream_pos(stream_t *stream)
  67 {
  68         struct position pos;
  69         pos.type = 0;
  70         pos.stream = stream->nr;
  71         pos.newline = stream->newline;
  72         pos.whitespace = stream->whitespace;
  73         pos.pos = stream->pos;
  74
  75         pos.line = stream->line;
  76         if (no_lineno)
  77                 pos.line = 123456;
  78
  79         pos.noexpand = 0;
  80         return pos;
  81 }
  82
  83 const char *show_special(int val)
  84 {
  85         static char buffer[4];
  86
  87         buffer[0] = val;
  88         buffer[1] = 0;
  89         if (val >= SPECIAL_BASE)
  90                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  91         return buffer;
  92 }
  93
  94 const char *show_ident(const struct ident *ident)
  95 {
  96         static char buff[4][256];
  97         static int n;
  98         char *buffer;
  99
 100         if (!ident)
 101                 return "<noident>";
 102         buffer = buff[3 & ++n];
 103         sprintf(buffer, "%.*s", ident->len, ident->name);
 104         return buffer;
 105 }
 106
 107 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
 108 {
 109         if (isprint(c)) {
 110                 if (c == escape || c == '\\')
 111                         *ptr++ = '\\';
 112                 *ptr++ = c;
 113                 return ptr;
 114         }
 115         *ptr++ = '\\';
 116         switch (c) {
 117         case '\n':
 118                 *ptr++ = 'n';
 119                 return ptr;
 120         case '\t':
 121                 *ptr++ = 't';
 122                 return ptr;
 123         }
 124         if (!isdigit(next))
 125                 return ptr + sprintf(ptr, "%o", c);
 126
 127         return ptr + sprintf(ptr, "%03o", c);
 128 }
 129
 130 const char *show_string(const struct string *string)
 131 {
 132         static char buffer[4 * MAX_STRING + 3];
 133         char *ptr;
 134         int i;
 135
 136         if (!string || !string->length)
 137                 return "<bad_string>";
 138         ptr = buffer;
 139         *ptr++ = '"';
 140         for (i = 0; i < string->length-1; i++) {
 141                 const char *p = string->data + i;
 142                 ptr = charstr(ptr, p[0], '"', p[1]);
 143         }
 144         *ptr++ = '"';
 145         *ptr = '\0';
 146         return buffer;
 147 }
 148
 149 static const char *show_char(const char *s, size_t len, char prefix, char delim)
 150 {
 151         static char buffer[MAX_STRING + 4];
 152         char *p = buffer;
 153         if (prefix)
 154                 *p++ = prefix;
 155         *p++ = delim;
 156         memcpy(p, s, len);
 157         p += len;
 158         *p++ = delim;
 159         *p++ = '\0';
 160         return buffer;
 161 }
 162
 163 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 164 {
 165         static char buffer[2*MAX_STRING + 6];
 166         size_t i;
 167         char *p = buffer;
 168         if (prefix)
 169                 *p++ = prefix;
 170         if (delim == '"')
 171                 *p++ = '\\';
 172         *p++ = delim;
 173         for (i = 0; i < len; i++) {
 174                 if (s[i] == '"' || s[i] == '\\')
 175                         *p++ = '\\';
 176                 *p++ = s[i];
 177         }
 178         if (delim == '"')
 179                 *p++ = '\\';
 180         *p++ = delim;
 181         *p++ = '\0';
 182         return buffer;
 183 }
 184
 185 const char *show_token(const struct token *token)
 186 {
 187         static char buffer[256];
 188
 189         if (!token)
 190                 return "<no token>";
 191         switch (token_type(token)) {
 192         case TOKEN_ERROR:
 193                 return "syntax error";
 194
 195         case TOKEN_EOF:
 196                 return "end-of-input";
 197
 198         case TOKEN_IDENT:
 199                 return show_ident(token->ident);
 200
 201         case TOKEN_NUMBER:
 202                 return token->number;
 203
 204         case TOKEN_SPECIAL:
 205                 return show_special(token->special);
 206
 207         case TOKEN_CHAR:
 208                 return show_char(token->string->data,
 209                         token->string->length - 1, 0, '\'');
 210         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 211                 return show_char(token->embedded,
 212                         token_type(token) - TOKEN_CHAR, 0, '\'');
 213         case TOKEN_WIDE_CHAR:
 214                 return show_char(token->string->data,
 215                         token->string->length - 1, 'L', '\'');
 216         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 217                 return show_char(token->embedded,
 218                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 219         case TOKEN_STRING:
 220                 return show_char(token->string->data,
 221                         token->string->length - 1, 0, '"');
 222         case TOKEN_WIDE_STRING:
 223                 return show_char(token->string->data,
 224                         token->string->length - 1, 'L', '"');
 225
 226         case TOKEN_STREAMBEGIN:
 227                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 228                 return buffer;
 229
 230         case TOKEN_STREAMEND:
 231                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 232                 return buffer;
 233
 234         case TOKEN_UNTAINT:
 235                 sprintf(buffer, "<untaint>");
 236                 return buffer;
 237
 238         case TOKEN_ARG_COUNT:
 239                 sprintf(buffer, "<argcnt>");
 240                 return buffer;
 241
 242         default:
 243                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 244                 return buffer;
 245         }
 246 }
 247
 248 const char *quote_token(const struct token *token)
 249 {
 250         static char buffer[256];
 251
 252         switch (token_type(token)) {
 253         case TOKEN_ERROR:
 254                 return "syntax error";
 255
 256         case TOKEN_IDENT:
 257                 return show_ident(token->ident);
 258
 259         case TOKEN_NUMBER:
 260                 return token->number;
 261
 262         case TOKEN_SPECIAL:
 263                 return show_special(token->special);
 264
 265         case TOKEN_CHAR:
 266                 return quote_char(token->string->data,
 267                         token->string->length - 1, 0, '\'');
 268         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 269                 return quote_char(token->embedded,
 270                         token_type(token) - TOKEN_CHAR, 0, '\'');
 271         case TOKEN_WIDE_CHAR:
 272                 return quote_char(token->string->data,
 273                         token->string->length - 1, 'L', '\'');
 274         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 275                 return quote_char(token->embedded,
 276                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 277         case TOKEN_STRING:
 278                 return quote_char(token->string->data,
 279                         token->string->length - 1, 0, '"');
 280         case TOKEN_WIDE_STRING:
 281                 return quote_char(token->string->data,
 282                         token->string->length - 1, 'L', '"');
 283         default:
 284                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 285                 return buffer;
 286         }
 287 }
 288
 289 #define HASHED_INPUT_BITS (6)
 290 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 291 #define HASH_PRIME 0x9e370001UL
 292
 293 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 294
 295 int *hash_stream(const char *name)
 296 {
 297         uint32_t hash = 0;
 298         unsigned char c;
 299
 300         while ((c = *name++) != 0)
 301                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 302
 303         hash *= HASH_PRIME;
 304         hash >>= 32 - HASHED_INPUT_BITS;
 305         return input_stream_hashes + hash;
 306 }
 307
 308 int init_stream(const char *name, int fd, const char **next_path)
 309 {
 310         int stream = input_stream_nr, *hash;
 311         struct stream *current;
 312
 313         if (stream >= input_streams_allocated) {
 314                 int newalloc = stream * 4 / 3 + 10;
 315                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 316                 if (!input_streams)
 317                         die("Unable to allocate more streams space");
 318                 input_streams_allocated = newalloc;
 319         }
 320         current = input_streams + stream;
 321         memset(current, 0, sizeof(*current));
 322         current->name = name;
 323         current->fd = fd;
 324         current->next_path = next_path;
 325         current->path = NULL;
 326         current->constant = CONSTANT_FILE_MAYBE;
 327         input_stream_nr = stream+1;
 328         hash = hash_stream(name);
 329         current->next_stream = *hash;
 330         *hash = stream;
 331         return stream;
 332 }
 333
 334 static struct token * alloc_token(stream_t *stream)
 335 {
 336         struct token *token = __alloc_token(0);
 337         token->pos = stream_pos(stream);
 338         return token;
 339 }
 340
 341 /*
 342  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 343  *  things a _lot_.
 344  */
 345 static int nextchar_slow(stream_t *stream)
 346 {
 347         int offset = stream->offset;
 348         int size = stream->size;
 349         int c;
 350         int spliced = 0, had_cr, had_backslash;
 351
 352 restart:
 353         had_cr = had_backslash = 0;
 354
 355 repeat:
 356         if (offset >= size) {
 357                 if (stream->fd < 0)
 358                         goto got_eof;
 359                 size = read(stream->fd, stream->buffer, BUFSIZE);
 360                 if (size <= 0)
 361                         goto got_eof;
 362                 stream->size = size;
 363                 stream->offset = offset = 0;
 364         }
 365
 366         c = stream->buffer[offset++];
 367         if (had_cr)
 368                 goto check_lf;
 369
 370         if (c == '\r') {
 371                 had_cr = 1;
 372                 goto repeat;
 373         }
 374
 375 norm:
 376         if (!had_backslash) {
 377                 switch (c) {
 378                 case '\t':
 379                         stream->pos += tabstop - stream->pos % tabstop;
 380                         break;
 381                 case '\n':
 382                         stream->line++;
 383                         stream->pos = 0;
 384                         stream->newline = 1;
 385                         break;
 386                 case '\\':
 387                         had_backslash = 1;
 388                         stream->pos++;
 389                         goto repeat;
 390                 default:
 391                         stream->pos++;
 392                 }
 393         } else {
 394                 if (c == '\n') {
 395                         stream->line++;
 396                         stream->pos = 0;
 397                         spliced = 1;
 398                         goto restart;
 399                 }
 400                 offset--;
 401                 c = '\\';
 402         }
 403 out:
 404         stream->offset = offset;
 405
 406         return c;
 407
 408 check_lf:
 409         if (c != '\n')
 410                 offset--;
 411         c = '\n';
 412         goto norm;
 413
 414 got_eof:
 415         if (had_backslash) {
 416                 c = '\\';
 417                 goto out;
 418         }
 419         if (stream->pos)
 420                 warning(stream_pos(stream), "no newline at end of file");
 421         else if (spliced)
 422                 warning(stream_pos(stream), "backslash-newline at end of file");
 423         return EOF;
 424 }
 425
 426 /*
 427  *  We want that as light as possible while covering all normal cases.
 428  *  Slow path (including the logics with line-splicing and EOF sanity
 429  *  checks) is in nextchar_slow().
 430  */
 431 static inline int nextchar(stream_t *stream)
 432 {
 433         int offset = stream->offset;
 434
 435         if (offset < stream->size) {
 436                 int c = stream->buffer[offset++];
 437                 static const char special[256] = {
 438                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 439                 };
 440                 if (!special[c]) {
 441                         stream->offset = offset;
 442                         stream->pos++;
 443                         return c;
 444                 }
 445         }
 446         return nextchar_slow(stream);
 447 }
 448
 449 struct token eof_token_entry;
 450
 451 static struct token *mark_eof(stream_t *stream)
 452 {
 453         struct token *end;
 454
 455         end = alloc_token(stream);
 456         eof_token_entry.pos = end->pos;
 457         token_type(end) = TOKEN_STREAMEND;
 458         end->pos.newline = 1;
 459
 460         eof_token_entry.next = &eof_token_entry;
 461         eof_token_entry.pos.newline = 1;
 462
 463         end->next =  &eof_token_entry;
 464         *stream->tokenlist = end;
 465         stream->tokenlist = NULL;
 466         return end;
 467 }
 468
 469 static void add_token(stream_t *stream)
 470 {
 471         struct token *token = stream->token;
 472
 473         stream->token = NULL;
 474         token->next = NULL;
 475         *stream->tokenlist = token;
 476         stream->tokenlist = &token->next;
 477 }
 478
 479 static void drop_token(stream_t *stream)
 480 {
 481         stream->newline |= stream->token->pos.newline;
 482         stream->whitespace |= stream->token->pos.whitespace;
 483         stream->token = NULL;
 484 }
 485
 486 enum {
 487         Letter = 1,
 488         Digit = 2,
 489         Hex = 4,
 490         Exp = 8,
 491         Dot = 16,
 492         ValidSecond = 32,
 493         Quote = 64,
 494 };
 495
 496 static const char cclass[257] = {
 497         ['0' + 1 ... '9' + 1] = Digit | Hex,
 498         ['A' + 1 ... 'D' + 1] = Letter | Hex,
 499         ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
 500         ['F' + 1] = Letter | Hex,
 501         ['G' + 1 ... 'O' + 1] = Letter,
 502         ['P' + 1] = Letter | Exp,       /* P<exp> */
 503         ['Q' + 1 ... 'Z' + 1] = Letter,
 504         ['a' + 1 ... 'd' + 1] = Letter | Hex,
 505         ['e' + 1] = Letter | Hex | Exp, /* e<exp> */
 506         ['f' + 1] = Letter | Hex,
 507         ['g' + 1 ... 'o' + 1] = Letter,
 508         ['p' + 1] = Letter | Exp,       /* p<exp> */
 509         ['q' + 1 ... 'z' + 1] = Letter,
 510         ['_' + 1] = Letter,
 511         ['.' + 1] = Dot | ValidSecond,
 512         ['=' + 1] = ValidSecond,
 513         ['+' + 1] = ValidSecond,
 514         ['-' + 1] = ValidSecond,
 515         ['>' + 1] = ValidSecond,
 516         ['<' + 1] = ValidSecond,
 517         ['&' + 1] = ValidSecond,
 518         ['|' + 1] = ValidSecond,
 519         ['#' + 1] = ValidSecond,
 520         ['\'' + 1] = Quote,
 521         ['"' + 1] = Quote,
 522 };
 523
 524 /*
 525  * pp-number:
 526  *      digit
 527  *      . digit
 528  *      pp-number digit
 529  *      pp-number identifier-nodigit
 530  *      pp-number e sign
 531  *      pp-number E sign
 532  *      pp-number p sign
 533  *      pp-number P sign
 534  *      pp-number .
 535  */
 536 static int get_one_number(int c, int next, stream_t *stream)
 537 {
 538         struct token *token;
 539         static char buffer[4095];
 540         char *p = buffer, *buffer_end = buffer + sizeof (buffer);
 541
 542         *p++ = c;
 543         for (;;) {
 544                 long class =  cclass[next + 1];
 545                 if (!(class & (Dot | Digit | Letter)))
 546                         break;
 547                 if (p != buffer_end)
 548                         *p++ = next;
 549                 next = nextchar(stream);
 550                 if (class & Exp) {
 551                         if (next == '-' || next == '+') {
 552                                 if (p != buffer_end)
 553                                         *p++ = next;
 554                                 next = nextchar(stream);
 555                         }
 556                 }
 557         }
 558
 559         if (p == buffer_end) {
 560                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 561                       buffer_end - buffer);
 562                 // Pretend we saw just "1".
 563                 buffer[0] = '1';
 564                 p = buffer + 1;
 565         }
 566
 567         *p++ = 0;
 568         token = stream->token;
 569         token_type(token) = TOKEN_NUMBER;
 570         token->number = xmemdup(buffer, p - buffer);
 571         add_token(stream);
 572
 573         return next;
 574 }
 575
 576 static int eat_string(int next, stream_t *stream, enum token_type type)
 577 {
 578         static char buffer[MAX_STRING];
 579         struct string *string;
 580         struct token *token = stream->token;
 581         int len = 0;
 582         int escape;
 583         int want_hex = 0;
 584         char delim = type < TOKEN_STRING ? '\'' : '"';
 585
 586         for (escape = 0; escape || next != delim; next = nextchar(stream)) {
 587                 if (len < MAX_STRING)
 588                         buffer[len] = next;
 589                 len++;
 590                 if (next == '\n') {
 591                         warning(stream_pos(stream),
 592                                 "missing terminating %c character", delim);
 593                         /* assume delimiter is lost */
 594                         break;
 595                 }
 596                 if (next == EOF) {
 597                         warning(stream_pos(stream),
 598                                 "End of file in middle of string");
 599                         return next;
 600                 }
 601                 if (!escape) {
 602                         if (want_hex && !(cclass[next + 1] & Hex))
 603                                 warning(stream_pos(stream),
 604                                         "\\x used with no following hex digits");
 605                         want_hex = 0;
 606                         escape = next == '\\';
 607                 } else {
 608                         escape = 0;
 609                         want_hex = next == 'x';
 610                 }
 611         }
 612         if (want_hex)
 613                 warning(stream_pos(stream),
 614                         "\\x used with no following hex digits");
 615         if (len > MAX_STRING) {
 616                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 617                 len = MAX_STRING;
 618         }
 619         if (delim == '\'' && len <= 4) {
 620                 if (len == 0) {
 621                         sparse_error(stream_pos(stream),
 622                                 "empty character constant");
 623                         return nextchar(stream);
 624                 }
 625                 token_type(token) = type + len;
 626                 memset(buffer + len, '\0', 4 - len);
 627                 memcpy(token->embedded, buffer, 4);
 628         } else {
 629                 token_type(token) = type;
 630                 string = __alloc_string(len+1);
 631                 memcpy(string->data, buffer, len);
 632                 string->data[len] = '\0';
 633                 string->length = len+1;
 634                 token->string = string;
 635         }
 636
 637         /* Pass it on.. */
 638         token = stream->token;
 639         add_token(stream);
 640         return nextchar(stream);
 641 }
 642
 643 static int drop_stream_eoln(stream_t *stream)
 644 {
 645         drop_token(stream);
 646         for (;;) {
 647                 switch (nextchar(stream)) {
 648                 case EOF:
 649                         return EOF;
 650                 case '\n':
 651                         return nextchar(stream);
 652                 }
 653         }
 654 }
 655
 656 static int drop_stream_comment(stream_t *stream)
 657 {
 658         int newline;
 659         int next;
 660         drop_token(stream);
 661         newline = stream->newline;
 662
 663         next = nextchar(stream);
 664         for (;;) {
 665                 int curr = next;
 666                 if (curr == EOF) {
 667                         warning(stream_pos(stream), "End of file in the middle of a comment");
 668                         return curr;
 669                 }
 670                 next = nextchar(stream);
 671                 if (curr == '*' && next == '/')
 672                         break;
 673         }
 674         stream->newline = newline;
 675         return nextchar(stream);
 676 }
 677
/*
 * Spellings of the multi-character punctuators.  show_special() indexes
 * this table with (value - SPECIAL_BASE).
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

/* NOTE(review): presumably the number of entries in combinations[] - confirm against token.h. */
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
 681
 682 /* hash function for two-character punctuators - all give unique values */
 683 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
 684
 685 /*
 686  * note that we won't get false positives - special_hash(0,0) is 0 and
 687  * entry 0 is filled (by +=), so all the missing ones are OK.
 688  */
 689 static unsigned char hash_results[32][2] = {
 690 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
 691         RES('+', '='), /* 00 */
 692         RES('/', '='), /* 01 */
 693         RES('^', '='), /* 05 */
 694         RES('&', '&'), /* 07 */
 695         RES('#', '#'), /* 08 */
 696         RES('<', '<'), /* 0a */
 697         RES('<', '='), /* 0c */
 698         RES('!', '='), /* 0e */
 699         RES('%', '='), /* 0f */
 700         RES('-', '-'), /* 10 */
 701         RES('-', '='), /* 11 */
 702         RES('-', '>'), /* 13 */
 703         RES('=', '='), /* 15 */
 704         RES('&', '='), /* 17 */
 705         RES('*', '='), /* 18 */
 706         RES('.', '.'), /* 1a */
 707         RES('+', '+'), /* 1b */
 708         RES('|', '='), /* 1c */
 709         RES('>', '='), /* 1d */
 710         RES('|', '|'), /* 1e */
 711         RES('>', '>')  /* 1f */
 712 #undef RES
 713 };
 714 static int code[32] = {
 715 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
 716         CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
 717         CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
 718         CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
 719         CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
 720         CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
 721         CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
 722         CODE('<', '=', SPECIAL_LTE), /* 0c */
 723         CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
 724         CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
 725         CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
 726         CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
 727         CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
 728         CODE('=', '=', SPECIAL_EQUAL), /* 15 */
 729         CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
 730         CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
 731         CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
 732         CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
 733         CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
 734         CODE('>', '=', SPECIAL_GTE), /* 1d */
 735         CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
 736         CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
 737 #undef CODE
 738 };
 739
 740 static int get_one_special(int c, stream_t *stream)
 741 {
 742         struct token *token;
 743         int next, value, i;
 744
 745         next = nextchar(stream);
 746
 747         /*
 748          * Check for numbers, strings, character constants, and comments
 749          */
 750         switch (c) {
 751         case '.':
 752                 if (next >= '0' && next <= '9')
 753                         return get_one_number(c, next, stream);
 754                 break;
 755         case '"':
 756                 return eat_string(next, stream, TOKEN_STRING);
 757         case '\'':
 758                 return eat_string(next, stream, TOKEN_CHAR);
 759         case '/':
 760                 if (next == '/')
 761                         return drop_stream_eoln(stream);
 762                 if (next == '*')
 763                         return drop_stream_comment(stream);
 764         }
 765
 766         /*
 767          * Check for combinations
 768          */
 769         value = c;
 770         if (cclass[next + 1] & ValidSecond) {
 771                 i = special_hash(c, next);
 772                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 773                         value = code[i];
 774                         next = nextchar(stream);
 775                         if (value >= SPECIAL_LEFTSHIFT &&
 776                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 777                                 value += 3;
 778                                 next = nextchar(stream);
 779                         }
 780                 }
 781         }
 782
 783         /* Pass it on.. */
 784         token = stream->token;
 785         token_type(token) = TOKEN_SPECIAL;
 786         token->special = value;
 787         add_token(stream);
 788         return next;
 789 }
 790
 791 #define IDENT_HASH_BITS (13)
 792 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 793 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 794
 795 #define ident_hash_init(c)              (c)
 796 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 797 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 798
 799 static struct ident *hash_table[IDENT_HASH_SIZE];
 800 static int ident_hit, ident_miss, idents;
 801
 802 void show_identifier_stats(void)
 803 {
 804         int i;
 805         int distribution[100];
 806
 807         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 808                 ident_hit, ident_miss);
 809
 810         for (i = 0; i < 100; i++)
 811                 distribution[i] = 0;
 812
 813         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 814                 struct ident * ident = hash_table[i];
 815                 int count = 0;
 816
 817                 while (ident) {
 818                         count++;
 819                         ident = ident->next;
 820                 }
 821                 if (count > 99)
 822                         count = 99;
 823                 distribution[count]++;
 824         }
 825
 826         for (i = 0; i < 100; i++) {
 827                 if (distribution[i])
 828                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 829         }
 830 }
 831
 832 struct ident *alloc_ident(const char *name, int len)
 833 {
 834         struct ident *ident = __alloc_ident(len);
 835         ident->symbols = NULL;
 836         ident->len = len;
 837         ident->tainted = 0;
 838         memcpy(ident->name, name, len);
 839         return ident;
 840 }
 841
 842 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 843 {
 844         ident->next = hash_table[hash];
 845         hash_table[hash] = ident;
 846         ident_miss++;
 847         return ident;
 848 }
 849
 850 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 851 {
 852         struct ident *ident;
 853         struct ident **p;
 854
 855         p = &hash_table[hash];
 856         while ((ident = *p) != NULL) {
 857                 if (ident->len == (unsigned char) len) {
 858                         if (strncmp(name, ident->name, len) != 0)
 859                                 goto next;
 860
 861                         ident_hit++;
 862                         return ident;
 863                 }
 864 next:
 865                 //misses++;
 866                 p = &ident->next;
 867         }
 868         ident = alloc_ident(name, len);
 869         *p = ident;
 870         ident->next = NULL;
 871         ident_miss++;
 872         idents++;
 873         return ident;
 874 }
 875
 876 static unsigned long hash_name(const char *name, int len)
 877 {
 878         unsigned long hash;
 879         const unsigned char *p = (const unsigned char *)name;
 880
 881         hash = ident_hash_init(*p++);
 882         while (--len) {
 883                 unsigned int i = *p++;
 884                 hash = ident_hash_add(hash, i);
 885         }
 886         return ident_hash_end(hash);
 887 }
 888
 889 struct ident *hash_ident(struct ident *ident)
 890 {
 891         return insert_hash(ident, hash_name(ident->name, ident->len));
 892 }
 893
 894 struct ident *built_in_ident(const char *name)
 895 {
 896         int len = strlen(name);
 897         return create_hashed_ident(name, len, hash_name(name, len));
 898 }
 899
 900 struct token *built_in_token(int stream, struct ident *ident)
 901 {
 902         struct token *token;
 903
 904         token = __alloc_token(0);
 905         token->pos.stream = stream;
 906         token_type(token) = TOKEN_IDENT;
 907         token->ident = ident;
 908         return token;
 909 }
 910
 911 static int get_one_identifier(int c, stream_t *stream)
 912 {
 913         struct token *token;
 914         struct ident *ident;
 915         unsigned long hash;
 916         char buf[256];
 917         int len = 1;
 918         int next;
 919
 920         hash = ident_hash_init(c);
 921         buf[0] = c;
 922         for (;;) {
 923                 next = nextchar(stream);
 924                 if (!(cclass[next + 1] & (Letter | Digit)))
 925                         break;
 926                 if (len >= sizeof(buf))
 927                         break;
 928                 hash = ident_hash_add(hash, next);
 929                 buf[len] = next;
 930                 len++;
 931         };
 932         if (cclass[next + 1] & Quote) {
 933                 if (len == 1 && buf[0] == 'L') {
 934                         if (next == '\'')
 935                                 return eat_string(nextchar(stream), stream,
 936                                                         TOKEN_WIDE_CHAR);
 937                         else
 938                                 return eat_string(nextchar(stream), stream,
 939                                                         TOKEN_WIDE_STRING);
 940                 }
 941         }
 942         hash = ident_hash_end(hash);
 943         ident = create_hashed_ident(buf, len, hash);
 944
 945         /* Pass it on.. */
 946         token = stream->token;
 947         token_type(token) = TOKEN_IDENT;
 948         token->ident = ident;
 949         add_token(stream);
 950         return next;
 951 }
 952
 953 static int get_one_token(int c, stream_t *stream)
 954 {
 955         long class = cclass[c + 1];
 956         if (class & Digit)
 957                 return get_one_number(c, nextchar(stream), stream);
 958         if (class & Letter)
 959                 return get_one_identifier(c, stream);
 960         return get_one_special(c, stream);
 961 }
 962
 963 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 964         unsigned char *buf, unsigned int buf_size)
 965 {
 966         struct token *begin;
 967
 968         stream->nr = idx;
 969         stream->line = 1;
 970         stream->newline = 1;
 971         stream->whitespace = 0;
 972         stream->pos = 0;
 973
 974         stream->token = NULL;
 975         stream->fd = fd;
 976         stream->offset = 0;
 977         stream->size = buf_size;
 978         stream->buffer = buf;
 979
 980         begin = alloc_token(stream);
 981         token_type(begin) = TOKEN_STREAMBEGIN;
 982         stream->tokenlist = &begin->next;
 983         return begin;
 984 }
 985
 986 static struct token *tokenize_stream(stream_t *stream)
 987 {
 988         int c = nextchar(stream);
 989         while (c != EOF) {
 990                 if (!isspace(c)) {
 991                         struct token *token = alloc_token(stream);
 992                         stream->token = token;
 993                         stream->newline = 0;
 994                         stream->whitespace = 0;
 995                         c = get_one_token(c, stream);
 996                         continue;
 997                 }
 998                 stream->whitespace = 1;
 999                 c = nextchar(stream);
1000         }
1001         return mark_eof(stream);
1002 }
1003
1004 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1005 {
1006         stream_t stream;
1007         struct token *begin;
1008
1009         begin = setup_stream(&stream, 0, -1, buffer, size);
1010         *endtoken = tokenize_stream(&stream);
1011         return begin;
1012 }
1013
1014 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
1015 {
1016         struct token *begin, *end;
1017         stream_t stream;
1018         unsigned char buffer[BUFSIZE];
1019         int idx;
1020
1021         idx = init_stream(name, fd, next_path);
1022         if (idx < 0) {
1023                 // info(endtoken->pos, "File %s is const", name);
1024                 return endtoken;
1025         }
1026
1027         begin = setup_stream(&stream, idx, fd, buffer, 0);
1028         end = tokenize_stream(&stream);
1029         if (endtoken)
1030                 end->next = endtoken;
1031         return begin;
1032 }