tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a copy
   9  * of this software and associated documentation files (the "Software"), to deal
  10  * in the Software without restriction, including without limitation the rights
  11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12  * copies of the Software, and to permit persons to whom the Software is
  13  * furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included in
  16  * all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24  * THE SOFTWARE.
  25  */
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <stdarg.h>
  29 #include <stddef.h>
  30 #include <string.h>
  31 #include <ctype.h>
  32 #include <unistd.h>
  33 #include <stdint.h>
  34
  35 #include "lib.h"
  36 #include "allocate.h"
  37 #include "token.h"
  38 #include "symbol.h"
  39
  40 #define EOF (-1)
  41
  42 int input_stream_nr = 0;
  43 struct stream *input_streams;
  44 static int input_streams_allocated;
  45 unsigned int tabstop = 8;
  46
  47 #define BUFSIZE (8192)
  48
  49 typedef struct {
  50         int fd, offset, size;
  51         int pos, line, nr;
  52         int newline, whitespace;
  53         struct token **tokenlist;
  54         struct token *token;
  55         unsigned char *buffer;
  56 } stream_t;
  57
  58 const char *stream_name(int stream)
  59 {
  60         if (stream < 0 || stream > input_stream_nr)
  61                 return "<bad stream>";
  62         return input_streams[stream].name;
  63 }
  64
  65 int stream_prev(int stream)
  66 {
  67         if (stream < 0 || stream > input_stream_nr)
  68                 return -1;
  69         stream = input_streams[stream].pos.stream;
  70         if (stream > input_stream_nr)
  71                 return -1;
  72         return stream;
  73 }
  74
  75 static struct position stream_pos(stream_t *stream)
  76 {
  77         struct position pos;
  78         pos.type = 0;
  79         pos.stream = stream->nr;
  80         pos.newline = stream->newline;
  81         pos.whitespace = stream->whitespace;
  82         pos.pos = stream->pos;
  83         pos.line = stream->line;
  84         pos.noexpand = 0;
  85         return pos;
  86 }
  87
  88 const char *show_special(int val)
  89 {
  90         static char buffer[4];
  91
  92         buffer[0] = val;
  93         buffer[1] = 0;
  94         if (val >= SPECIAL_BASE)
  95                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
  96         return buffer;
  97 }
  98
  99 const char *show_ident(const struct ident *ident)
 100 {
 101         static char buff[4][256];
 102         static int n;
 103         char *buffer;
 104
 105         if (!ident)
 106                 return "<noident>";
 107         buffer = buff[3 & ++n];
 108         sprintf(buffer, "%.*s", ident->len, ident->name);
 109         return buffer;
 110 }
 111
 112 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
 113 {
 114         if (isprint(c)) {
 115                 if (c == escape || c == '\\')
 116                         *ptr++ = '\\';
 117                 *ptr++ = c;
 118                 return ptr;
 119         }
 120         *ptr++ = '\\';
 121         switch (c) {
 122         case '\n':
 123                 *ptr++ = 'n';
 124                 return ptr;
 125         case '\t':
 126                 *ptr++ = 't';
 127                 return ptr;
 128         }
 129         if (!isdigit(next))
 130                 return ptr + sprintf(ptr, "%o", c);
 131
 132         return ptr + sprintf(ptr, "%03o", c);
 133 }
 134
 135 const char *show_string(const struct string *string)
 136 {
 137         static char buffer[4 * MAX_STRING + 3];
 138         char *ptr;
 139         int i;
 140
 141         if (!string || !string->length)
 142                 return "<bad_string>";
 143         ptr = buffer;
 144         *ptr++ = '"';
 145         for (i = 0; i < string->length-1; i++) {
 146                 const char *p = string->data + i;
 147                 ptr = charstr(ptr, p[0], '"', p[1]);
 148         }
 149         *ptr++ = '"';
 150         *ptr = '\0';
 151         return buffer;
 152 }
 153
 154 static const char *show_char(const char *s, size_t len, char prefix, char delim)
 155 {
 156         static char buffer[MAX_STRING + 4];
 157         char *p = buffer;
 158         if (prefix)
 159                 *p++ = prefix;
 160         *p++ = delim;
 161         memcpy(p, s, len);
 162         p += len;
 163         *p++ = delim;
 164         *p++ = '\0';
 165         return buffer;
 166 }
 167
 168 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 169 {
 170         static char buffer[2*MAX_STRING + 6];
 171         size_t i;
 172         char *p = buffer;
 173         if (prefix)
 174                 *p++ = prefix;
 175         if (delim == '"')
 176                 *p++ = '\\';
 177         *p++ = delim;
 178         for (i = 0; i < len; i++) {
 179                 if (s[i] == '"' || s[i] == '\\')
 180                         *p++ = '\\';
 181                 *p++ = s[i];
 182         }
 183         if (delim == '"')
 184                 *p++ = '\\';
 185         *p++ = delim;
 186         *p++ = '\0';
 187         return buffer;
 188 }
 189
 190 const char *show_token(const struct token *token)
 191 {
 192         static char buffer[256];
 193
 194         if (!token)
 195                 return "<no token>";
 196         switch (token_type(token)) {
 197         case TOKEN_ERROR:
 198                 return "syntax error";
 199
 200         case TOKEN_EOF:
 201                 return "end-of-input";
 202
 203         case TOKEN_IDENT:
 204                 return show_ident(token->ident);
 205
 206         case TOKEN_NUMBER:
 207                 return token->number;
 208
 209         case TOKEN_SPECIAL:
 210                 return show_special(token->special);
 211
 212         case TOKEN_CHAR:
 213                 return show_char(token->string->data,
 214                         token->string->length - 1, 0, '\'');
 215         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 216                 return show_char(token->embedded,
 217                         token_type(token) - TOKEN_CHAR, 0, '\'');
 218         case TOKEN_WIDE_CHAR:
 219                 return show_char(token->string->data,
 220                         token->string->length - 1, 'L', '\'');
 221         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 222                 return show_char(token->embedded,
 223                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 224         case TOKEN_STRING:
 225                 return show_char(token->string->data,
 226                         token->string->length - 1, 0, '"');
 227         case TOKEN_WIDE_STRING:
 228                 return show_char(token->string->data,
 229                         token->string->length - 1, 'L', '"');
 230
 231         case TOKEN_STREAMBEGIN:
 232                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 233                 return buffer;
 234
 235         case TOKEN_STREAMEND:
 236                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 237                 return buffer;
 238
 239         case TOKEN_UNTAINT:
 240                 sprintf(buffer, "<untaint>");
 241                 return buffer;
 242
 243         case TOKEN_ARG_COUNT:
 244                 sprintf(buffer, "<argcnt>");
 245                 return buffer;
 246
 247         default:
 248                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 249                 return buffer;
 250         }
 251 }
 252
 253 const char *quote_token(const struct token *token)
 254 {
 255         static char buffer[256];
 256
 257         switch (token_type(token)) {
 258         case TOKEN_ERROR:
 259                 return "syntax error";
 260
 261         case TOKEN_IDENT:
 262                 return show_ident(token->ident);
 263
 264         case TOKEN_NUMBER:
 265                 return token->number;
 266
 267         case TOKEN_SPECIAL:
 268                 return show_special(token->special);
 269
 270         case TOKEN_CHAR:
 271                 return quote_char(token->string->data,
 272                         token->string->length - 1, 0, '\'');
 273         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 274                 return quote_char(token->embedded,
 275                         token_type(token) - TOKEN_CHAR, 0, '\'');
 276         case TOKEN_WIDE_CHAR:
 277                 return quote_char(token->string->data,
 278                         token->string->length - 1, 'L', '\'');
 279         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 280                 return quote_char(token->embedded,
 281                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 282         case TOKEN_STRING:
 283                 return quote_char(token->string->data,
 284                         token->string->length - 1, 0, '"');
 285         case TOKEN_WIDE_STRING:
 286                 return quote_char(token->string->data,
 287                         token->string->length - 1, 'L', '"');
 288         default:
 289                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 290                 return buffer;
 291         }
 292 }
 293
 294 #define HASHED_INPUT_BITS (6)
 295 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 296 #define HASH_PRIME 0x9e370001UL
 297
 298 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 299
 300 int *hash_stream(const char *name)
 301 {
 302         uint32_t hash = 0;
 303         unsigned char c;
 304
 305         while ((c = *name++) != 0)
 306                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 307
 308         hash *= HASH_PRIME;
 309         hash >>= 32 - HASHED_INPUT_BITS;
 310         return input_stream_hashes + hash;
 311 }
 312
 313 int init_stream(const struct position *pos, const char *name, int fd, const char **next_path)
 314 {
 315         int stream = input_stream_nr, *hash;
 316         struct stream *current;
 317
 318         if (stream >= input_streams_allocated) {
 319                 int newalloc = stream * 4 / 3 + 10;
 320                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 321                 if (!input_streams)
 322                         die("Unable to allocate more streams space");
 323                 input_streams_allocated = newalloc;
 324         }
 325         current = input_streams + stream;
 326         memset(current, 0, sizeof(*current));
 327         current->name = name;
 328         current->fd = fd;
 329         current->next_path = next_path;
 330         current->path = NULL;
 331         current->constant = CONSTANT_FILE_MAYBE;
 332         if (pos)
 333                 current->pos = *pos;
 334         else
 335                 current->pos.stream = -1;
 336         input_stream_nr = stream+1;
 337         hash = hash_stream(name);
 338         current->next_stream = *hash;
 339         *hash = stream;
 340         return stream;
 341 }
 342
 343 static struct token * alloc_token(stream_t *stream)
 344 {
 345         struct token *token = __alloc_token(0);
 346         token->pos = stream_pos(stream);
 347         return token;
 348 }
 349
 350 /*
 351  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 352  *  things a _lot_.
 353  */
 354 static int nextchar_slow(stream_t *stream)
 355 {
 356         int offset = stream->offset;
 357         int size = stream->size;
 358         int c;
 359         int spliced = 0, had_cr, had_backslash;
 360
 361 restart:
 362         had_cr = had_backslash = 0;
 363
 364 repeat:
 365         if (offset >= size) {
 366                 if (stream->fd < 0)
 367                         goto got_eof;
 368                 size = read(stream->fd, stream->buffer, BUFSIZE);
 369                 if (size <= 0)
 370                         goto got_eof;
 371                 stream->size = size;
 372                 stream->offset = offset = 0;
 373         }
 374
 375         c = stream->buffer[offset++];
 376         if (had_cr)
 377                 goto check_lf;
 378
 379         if (c == '\r') {
 380                 had_cr = 1;
 381                 goto repeat;
 382         }
 383
 384 norm:
 385         if (!had_backslash) {
 386                 switch (c) {
 387                 case '\t':
 388                         stream->pos += tabstop - stream->pos % tabstop;
 389                         break;
 390                 case '\n':
 391                         stream->line++;
 392                         stream->pos = 0;
 393                         stream->newline = 1;
 394                         break;
 395                 case '\\':
 396                         had_backslash = 1;
 397                         stream->pos++;
 398                         goto repeat;
 399                 default:
 400                         stream->pos++;
 401                 }
 402         } else {
 403                 if (c == '\n') {
 404                         stream->line++;
 405                         stream->pos = 0;
 406                         spliced = 1;
 407                         goto restart;
 408                 }
 409                 offset--;
 410                 c = '\\';
 411         }
 412 out:
 413         stream->offset = offset;
 414
 415         return c;
 416
 417 check_lf:
 418         if (c != '\n')
 419                 offset--;
 420         c = '\n';
 421         goto norm;
 422
 423 got_eof:
 424         if (had_backslash) {
 425                 c = '\\';
 426                 goto out;
 427         }
 428         if (stream->pos & Wnewline_eof)
 429                 warning(stream_pos(stream), "no newline at end of file");
 430         else if (spliced)
 431                 warning(stream_pos(stream), "backslash-newline at end of file");
 432         return EOF;
 433 }
 434
 435 /*
 436  *  We want that as light as possible while covering all normal cases.
 437  *  Slow path (including the logics with line-splicing and EOF sanity
 438  *  checks) is in nextchar_slow().
 439  */
 440 static inline int nextchar(stream_t *stream)
 441 {
 442         int offset = stream->offset;
 443
 444         if (offset < stream->size) {
 445                 int c = stream->buffer[offset++];
 446                 static const char special[256] = {
 447                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 448                 };
 449                 if (!special[c]) {
 450                         stream->offset = offset;
 451                         stream->pos++;
 452                         return c;
 453                 }
 454         }
 455         return nextchar_slow(stream);
 456 }
 457
 458 struct token eof_token_entry;
 459
 460 static struct token *mark_eof(stream_t *stream)
 461 {
 462         struct token *end;
 463
 464         end = alloc_token(stream);
 465         eof_token_entry.pos = end->pos;
 466         token_type(end) = TOKEN_STREAMEND;
 467         end->pos.newline = 1;
 468
 469         eof_token_entry.next = &eof_token_entry;
 470         eof_token_entry.pos.newline = 1;
 471
 472         end->next =  &eof_token_entry;
 473         *stream->tokenlist = end;
 474         stream->tokenlist = NULL;
 475         return end;
 476 }
 477
 478 static void add_token(stream_t *stream)
 479 {
 480         struct token *token = stream->token;
 481
 482         stream->token = NULL;
 483         token->next = NULL;
 484         *stream->tokenlist = token;
 485         stream->tokenlist = &token->next;
 486 }
 487
 488 static void drop_token(stream_t *stream)
 489 {
 490         stream->newline |= stream->token->pos.newline;
 491         stream->whitespace |= stream->token->pos.whitespace;
 492         stream->token = NULL;
 493 }
 494
 495 enum {
 496         Letter = 1,
 497         Digit = 2,
 498         Hex = 4,
 499         Exp = 8,
 500         Dot = 16,
 501         ValidSecond = 32,
 502         Quote = 64,
 503 };
 504
 505 static const char cclass[257] = {
 506         ['0' + 1 ... '9' + 1] = Digit | Hex,
 507         ['A' + 1 ... 'D' + 1] = Letter | Hex,
 508         ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
 509         ['F' + 1] = Letter | Hex,
 510         ['G' + 1 ... 'O' + 1] = Letter,
 511         ['P' + 1] = Letter | Exp,       /* P<exp> */
 512         ['Q' + 1 ... 'Z' + 1] = Letter,
 513         ['a' + 1 ... 'd' + 1] = Letter | Hex,
 514         ['e' + 1] = Letter | Hex | Exp, /* e<exp> */
 515         ['f' + 1] = Letter | Hex,
 516         ['g' + 1 ... 'o' + 1] = Letter,
 517         ['p' + 1] = Letter | Exp,       /* p<exp> */
 518         ['q' + 1 ... 'z' + 1] = Letter,
 519         ['_' + 1] = Letter,
 520         ['.' + 1] = Dot | ValidSecond,
 521         ['=' + 1] = ValidSecond,
 522         ['+' + 1] = ValidSecond,
 523         ['-' + 1] = ValidSecond,
 524         ['>' + 1] = ValidSecond,
 525         ['<' + 1] = ValidSecond,
 526         ['&' + 1] = ValidSecond,
 527         ['|' + 1] = ValidSecond,
 528         ['#' + 1] = ValidSecond,
 529         ['\'' + 1] = Quote,
 530         ['"' + 1] = Quote,
 531 };
 532
 533 /*
 534  * pp-number:
 535  *      digit
 536  *      . digit
 537  *      pp-number digit
 538  *      pp-number identifier-nodigit
 539  *      pp-number e sign
 540  *      pp-number E sign
 541  *      pp-number p sign
 542  *      pp-number P sign
 543  *      pp-number .
 544  */
 545 static int get_one_number(int c, int next, stream_t *stream)
 546 {
 547         struct token *token;
 548         static char buffer[4095];
 549         char *p = buffer, *buffer_end = buffer + sizeof (buffer);
 550
 551         *p++ = c;
 552         for (;;) {
 553                 long class =  cclass[next + 1];
 554                 if (!(class & (Dot | Digit | Letter)))
 555                         break;
 556                 if (p != buffer_end)
 557                         *p++ = next;
 558                 next = nextchar(stream);
 559                 if (class & Exp) {
 560                         if (next == '-' || next == '+') {
 561                                 if (p != buffer_end)
 562                                         *p++ = next;
 563                                 next = nextchar(stream);
 564                         }
 565                 }
 566         }
 567
 568         if (p == buffer_end) {
 569                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 570                       buffer_end - buffer);
 571                 // Pretend we saw just "1".
 572                 buffer[0] = '1';
 573                 p = buffer + 1;
 574         }
 575
 576         *p++ = 0;
 577         token = stream->token;
 578         token_type(token) = TOKEN_NUMBER;
 579         token->number = xmemdup(buffer, p - buffer);
 580         add_token(stream);
 581
 582         return next;
 583 }
 584
 585 static int eat_string(int next, stream_t *stream, enum token_type type)
 586 {
 587         static char buffer[MAX_STRING];
 588         struct string *string;
 589         struct token *token = stream->token;
 590         int len = 0;
 591         int escape;
 592         int want_hex = 0;
 593         char delim = type < TOKEN_STRING ? '\'' : '"';
 594
 595         for (escape = 0; escape || next != delim; next = nextchar(stream)) {
 596                 if (len < MAX_STRING)
 597                         buffer[len] = next;
 598                 len++;
 599                 if (next == '\n') {
 600                         warning(stream_pos(stream),
 601                                 "missing terminating %c character", delim);
 602                         /* assume delimiter is lost */
 603                         break;
 604                 }
 605                 if (next == EOF) {
 606                         warning(stream_pos(stream),
 607                                 "End of file in middle of string");
 608                         return next;
 609                 }
 610                 if (!escape) {
 611                         if (want_hex && !(cclass[next + 1] & Hex))
 612                                 warning(stream_pos(stream),
 613                                         "\\x used with no following hex digits");
 614                         want_hex = 0;
 615                         escape = next == '\\';
 616                 } else {
 617                         escape = 0;
 618                         want_hex = next == 'x';
 619                 }
 620         }
 621         if (want_hex)
 622                 warning(stream_pos(stream),
 623                         "\\x used with no following hex digits");
 624         if (len > MAX_STRING) {
 625                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 626                 len = MAX_STRING;
 627         }
 628         if (delim == '\'' && len && len <= 4) {
 629                 token_type(token) = type + len;
 630                 memset(buffer + len, '\0', 4 - len);
 631                 memcpy(token->embedded, buffer, 4);
 632         } else {
 633                 token_type(token) = type;
 634                 string = __alloc_string(len+1);
 635                 memcpy(string->data, buffer, len);
 636                 string->data[len] = '\0';
 637                 string->length = len+1;
 638                 token->string = string;
 639         }
 640
 641         /* Pass it on.. */
 642         token = stream->token;
 643         add_token(stream);
 644         return nextchar(stream);
 645 }
 646
 647 static int drop_stream_eoln(stream_t *stream)
 648 {
 649         drop_token(stream);
 650         for (;;) {
 651                 switch (nextchar(stream)) {
 652                 case EOF:
 653                         return EOF;
 654                 case '\n':
 655                         return nextchar(stream);
 656                 }
 657         }
 658 }
 659
 660 static int drop_stream_comment(stream_t *stream)
 661 {
 662         int newline;
 663         int next;
 664         drop_token(stream);
 665         newline = stream->newline;
 666
 667         next = nextchar(stream);
 668         for (;;) {
 669                 int curr = next;
 670                 if (curr == EOF) {
 671                         warning(stream_pos(stream), "End of file in the middle of a comment");
 672                         return curr;
 673                 }
 674                 next = nextchar(stream);
 675                 if (curr == '*' && next == '/')
 676                         break;
 677         }
 678         stream->newline = newline;
 679         return nextchar(stream);
 680 }
 681
      /*
       * Spellings of the multi-character punctuators, four bytes per entry.
       * show_special() copies entry (val - SPECIAL_BASE) for values at or
       * above SPECIAL_BASE.
       */
  682 unsigned char combinations[][4] = COMBINATION_STRINGS;
  683
      /* NOTE(review): presumably the number of entries in combinations[] - confirm against token.h */
  684 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
  685
  686 /* hash function for two-character punctuators - all give unique values */
  687 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
  688
  689 /*
  690  * note that we won't get false positives - special_hash(0,0) is 0 and
  691  * entry 0 is filled (by +=), so all the missing ones are OK.
  692  */
      /*
       * hash_results[h] holds the character pair whose special_hash() is h.
       * get_one_special() compares the input pair against it before using
       * code[h].  The trailing comments give each entry's hash in hex.
       */
  693 static unsigned char hash_results[32][2] = {
  694 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
  695         RES('+', '='), /* 00 */
  696         RES('/', '='), /* 01 */
  697         RES('^', '='), /* 05 */
  698         RES('&', '&'), /* 07 */
  699         RES('#', '#'), /* 08 */
  700         RES('<', '<'), /* 0a */
  701         RES('<', '='), /* 0c */
  702         RES('!', '='), /* 0e */
  703         RES('%', '='), /* 0f */
  704         RES('-', '-'), /* 10 */
  705         RES('-', '='), /* 11 */
  706         RES('-', '>'), /* 13 */
  707         RES('=', '='), /* 15 */
  708         RES('&', '='), /* 17 */
  709         RES('*', '='), /* 18 */
  710         RES('.', '.'), /* 1a */
  711         RES('+', '+'), /* 1b */
  712         RES('|', '='), /* 1c */
  713         RES('>', '='), /* 1d */
  714         RES('|', '|'), /* 1e */
  715         RES('>', '>')  /* 1f */
  716 #undef RES
  717 };
      /*
       * code[h] is the SPECIAL_* value for the character pair stored in
       * hash_results[h]; the two tables list the same pairs in the same order.
       */
  718 static int code[32] = {
  719 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
  720         CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
  721         CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
  722         CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
  723         CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
  724         CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
  725         CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
  726         CODE('<', '=', SPECIAL_LTE), /* 0c */
  727         CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
  728         CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
  729         CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
  730         CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
  731         CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
  732         CODE('=', '=', SPECIAL_EQUAL), /* 15 */
  733         CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
  734         CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
  735         CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
  736         CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
  737         CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
  738         CODE('>', '=', SPECIAL_GTE), /* 1d */
  739         CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
  740         CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
  741 #undef CODE
  742 };
 743
 744 static int get_one_special(int c, stream_t *stream)
 745 {
 746         struct token *token;
 747         int next, value, i;
 748
 749         next = nextchar(stream);
 750
 751         /*
 752          * Check for numbers, strings, character constants, and comments
 753          */
 754         switch (c) {
 755         case '.':
 756                 if (next >= '0' && next <= '9')
 757                         return get_one_number(c, next, stream);
 758                 break;
 759         case '"':
 760                 return eat_string(next, stream, TOKEN_STRING);
 761         case '\'':
 762                 return eat_string(next, stream, TOKEN_CHAR);
 763         case '/':
 764                 if (next == '/')
 765                         return drop_stream_eoln(stream);
 766                 if (next == '*')
 767                         return drop_stream_comment(stream);
 768         }
 769
 770         /*
 771          * Check for combinations
 772          */
 773         value = c;
 774         if (cclass[next + 1] & ValidSecond) {
 775                 i = special_hash(c, next);
 776                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 777                         value = code[i];
 778                         next = nextchar(stream);
 779                         if (value >= SPECIAL_LEFTSHIFT &&
 780                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 781                                 value += 3;
 782                                 next = nextchar(stream);
 783                         }
 784                 }
 785         }
 786
 787         /* Pass it on.. */
 788         token = stream->token;
 789         token_type(token) = TOKEN_SPECIAL;
 790         token->special = value;
 791         add_token(stream);
 792         return next;
 793 }
 794
 795 #define IDENT_HASH_BITS (13)
 796 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
 797 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
 798
 799 #define ident_hash_init(c)              (c)
 800 #define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
 801 #define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
 802
 803 static struct ident *hash_table[IDENT_HASH_SIZE];
 804 static int ident_hit, ident_miss, idents;
 805
 806 void show_identifier_stats(void)
 807 {
 808         int i;
 809         int distribution[100];
 810
 811         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 812                 ident_hit, ident_miss);
 813
 814         for (i = 0; i < 100; i++)
 815                 distribution[i] = 0;
 816
 817         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 818                 struct ident * ident = hash_table[i];
 819                 int count = 0;
 820
 821                 while (ident) {
 822                         count++;
 823                         ident = ident->next;
 824                 }
 825                 if (count > 99)
 826                         count = 99;
 827                 distribution[count]++;
 828         }
 829
 830         for (i = 0; i < 100; i++) {
 831                 if (distribution[i])
 832                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 833         }
 834 }
 835
 836 static struct ident *alloc_ident(const char *name, int len)
 837 {
 838         struct ident *ident = __alloc_ident(len);
 839         ident->symbols = NULL;
 840         ident->len = len;
 841         ident->tainted = 0;
 842         memcpy(ident->name, name, len);
 843         return ident;
 844 }
 845
 846 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 847 {
 848         ident->next = hash_table[hash];
 849         hash_table[hash] = ident;
 850         ident_miss++;
 851         return ident;
 852 }
 853
 854 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 855 {
 856         struct ident *ident;
 857         struct ident **p;
 858
 859         p = &hash_table[hash];
 860         while ((ident = *p) != NULL) {
 861                 if (ident->len == (unsigned char) len) {
 862                         if (strncmp(name, ident->name, len) != 0)
 863                                 goto next;
 864
 865                         ident_hit++;
 866                         return ident;
 867                 }
 868 next:
 869                 //misses++;
 870                 p = &ident->next;
 871         }
 872         ident = alloc_ident(name, len);
 873         *p = ident;
 874         ident->next = NULL;
 875         ident_miss++;
 876         idents++;
 877         return ident;
 878 }
 879
 880 static unsigned long hash_name(const char *name, int len)
 881 {
 882         unsigned long hash;
 883         const unsigned char *p = (const unsigned char *)name;
 884
 885         hash = ident_hash_init(*p++);
 886         while (--len) {
 887                 unsigned int i = *p++;
 888                 hash = ident_hash_add(hash, i);
 889         }
 890         return ident_hash_end(hash);
 891 }
 892
 893 struct ident *hash_ident(struct ident *ident)
 894 {
 895         return insert_hash(ident, hash_name(ident->name, ident->len));
 896 }
 897
 898 struct ident *built_in_ident(const char *name)
 899 {
 900         int len = strlen(name);
 901         return create_hashed_ident(name, len, hash_name(name, len));
 902 }
 903
 904 struct token *built_in_token(int stream, struct ident *ident)
 905 {
 906         struct token *token;
 907
 908         token = __alloc_token(0);
 909         token->pos.stream = stream;
 910         token_type(token) = TOKEN_IDENT;
 911         token->ident = ident;
 912         return token;
 913 }
 914
 915 static int get_one_identifier(int c, stream_t *stream)
 916 {
 917         struct token *token;
 918         struct ident *ident;
 919         unsigned long hash;
 920         char buf[256];
 921         int len = 1;
 922         int next;
 923
 924         hash = ident_hash_init(c);
 925         buf[0] = c;
 926         for (;;) {
 927                 next = nextchar(stream);
 928                 if (!(cclass[next + 1] & (Letter | Digit)))
 929                         break;
 930                 if (len >= sizeof(buf))
 931                         break;
 932                 hash = ident_hash_add(hash, next);
 933                 buf[len] = next;
 934                 len++;
 935         };
 936         if (cclass[next + 1] & Quote) {
 937                 if (len == 1 && buf[0] == 'L') {
 938                         if (next == '\'')
 939                                 return eat_string(nextchar(stream), stream,
 940                                                         TOKEN_WIDE_CHAR);
 941                         else
 942                                 return eat_string(nextchar(stream), stream,
 943                                                         TOKEN_WIDE_STRING);
 944                 }
 945         }
 946         hash = ident_hash_end(hash);
 947         ident = create_hashed_ident(buf, len, hash);
 948
 949         /* Pass it on.. */
 950         token = stream->token;
 951         token_type(token) = TOKEN_IDENT;
 952         token->ident = ident;
 953         add_token(stream);
 954         return next;
 955 }
 956
 957 static int get_one_token(int c, stream_t *stream)
 958 {
 959         long class = cclass[c + 1];
 960         if (class & Digit)
 961                 return get_one_number(c, nextchar(stream), stream);
 962         if (class & Letter)
 963                 return get_one_identifier(c, stream);
 964         return get_one_special(c, stream);
 965 }
 966
 967 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 968         unsigned char *buf, unsigned int buf_size)
 969 {
 970         struct token *begin;
 971
 972         stream->nr = idx;
 973         stream->line = 1;
 974         stream->newline = 1;
 975         stream->whitespace = 0;
 976         stream->pos = 0;
 977
 978         stream->token = NULL;
 979         stream->fd = fd;
 980         stream->offset = 0;
 981         stream->size = buf_size;
 982         stream->buffer = buf;
 983
 984         begin = alloc_token(stream);
 985         token_type(begin) = TOKEN_STREAMBEGIN;
 986         stream->tokenlist = &begin->next;
 987         return begin;
 988 }
 989
 990 static struct token *tokenize_stream(stream_t *stream)
 991 {
 992         int c = nextchar(stream);
 993         while (c != EOF) {
 994                 if (!isspace(c)) {
 995                         struct token *token = alloc_token(stream);
 996                         stream->token = token;
 997                         stream->newline = 0;
 998                         stream->whitespace = 0;
 999                         c = get_one_token(c, stream);
1000                         continue;
1001                 }
1002                 stream->whitespace = 1;
1003                 c = nextchar(stream);
1004         }
1005         return mark_eof(stream);
1006 }
1007
1008 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1009 {
1010         stream_t stream;
1011         struct token *begin;
1012
1013         begin = setup_stream(&stream, 0, -1, buffer, size);
1014         *endtoken = tokenize_stream(&stream);
1015         return begin;
1016 }
1017
1018 struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path)
1019 {
1020         struct token *begin, *end;
1021         stream_t stream;
1022         unsigned char buffer[BUFSIZE];
1023         int idx;
1024
1025         idx = init_stream(pos, name, fd, next_path);
1026         if (idx < 0) {
1027                 // info(endtoken->pos, "File %s is const", name);
1028                 return endtoken;
1029         }
1030
1031         begin = setup_stream(&stream, idx, fd, buffer, 0);
1032         end = tokenize_stream(&stream);
1033         if (endtoken)
1034                 end->next = endtoken;
1035         return begin;
1036 }