tokenize.c

   1 /*
   2  * This is a really stupid C tokenizer. It doesn't do any include
   3  * files or anything complex at all. That's the preprocessor.
   4  *
   5  * Copyright (C) 2003 Transmeta Corp.
   6  *               2003 Linus Torvalds
   7  *
   8  * Permission is hereby granted, free of charge, to any person obtaining a copy
   9  * of this software and associated documentation files (the "Software"), to deal
  10  * in the Software without restriction, including without limitation the rights
  11  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12  * copies of the Software, and to permit persons to whom the Software is
  13  * furnished to do so, subject to the following conditions:
  14  *
  15  * The above copyright notice and this permission notice shall be included in
  16  * all copies or substantial portions of the Software.
  17  *
  18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24  * THE SOFTWARE.
  25  */
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <stdarg.h>
  29 #include <stddef.h>
  30 #include <string.h>
  31 #include <ctype.h>
  32 #include <unistd.h>
  33 #include <stdint.h>
  34
  35 #include "lib.h"
  36 #include "allocate.h"
  37 #include "token.h"
  38 #include "symbol.h"
  39
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Number of streams registered so far; init_stream() bumps it by one per call. */
int input_stream_nr = 0;
/* Table of registered streams, grown with realloc() by init_stream(). */
struct stream *input_streams;
/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;
/* Tab width used by nextchar_slow() when advancing the column on '\t'. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports the fixed line number 123456. */
int no_lineno = 0;

/* Number of bytes requested from read() each time a stream buffer is refilled. */
#define BUFSIZE (8192)

/* Working state of the tokenizer for one input stream. */
typedef struct {
        int fd, offset, size;           /* descriptor (< 0: nothing to read); next byte and byte count in buffer */
        int pos, line, nr;              /* current column, current line, stream number */
        int newline, whitespace;        /* flags copied into each position by stream_pos() */
        struct token **tokenlist;       /* where add_token() links the next finished token */
        struct token *token;            /* token currently being built, or NULL */
        unsigned char *buffer;          /* raw input bytes */
} stream_t;
  58
  59 const char *stream_name(int stream)
  60 {
  61         if (stream < 0 || stream > input_stream_nr)
  62                 return "<bad stream>";
  63         return input_streams[stream].name;
  64 }
  65
  66 int stream_prev(int stream)
  67 {
  68         if (stream < 0 || stream > input_stream_nr)
  69                 return -1;
  70         stream = input_streams[stream].pos.stream;
  71         if (stream > input_stream_nr)
  72                 return -1;
  73         return stream;
  74 }
  75
  76 static struct position stream_pos(stream_t *stream)
  77 {
  78         struct position pos;
  79         pos.type = 0;
  80         pos.stream = stream->nr;
  81         pos.newline = stream->newline;
  82         pos.whitespace = stream->whitespace;
  83         pos.pos = stream->pos;
  84
  85         pos.line = stream->line;
  86         if (no_lineno)
  87                 pos.line = 123456;
  88
  89         pos.noexpand = 0;
  90         return pos;
  91 }
  92
  93 const char *show_special(int val)
  94 {
  95         static char buffer[4];
  96
  97         buffer[0] = val;
  98         buffer[1] = 0;
  99         if (val >= SPECIAL_BASE)
 100                 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
 101         return buffer;
 102 }
 103
 104 const char *show_ident(const struct ident *ident)
 105 {
 106         static char buff[4][256];
 107         static int n;
 108         char *buffer;
 109
 110         if (!ident)
 111                 return "<noident>";
 112         buffer = buff[3 & ++n];
 113         sprintf(buffer, "%.*s", ident->len, ident->name);
 114         return buffer;
 115 }
 116
 117 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
 118 {
 119         if (isprint(c)) {
 120                 if (c == escape || c == '\\')
 121                         *ptr++ = '\\';
 122                 *ptr++ = c;
 123                 return ptr;
 124         }
 125         *ptr++ = '\\';
 126         switch (c) {
 127         case '\n':
 128                 *ptr++ = 'n';
 129                 return ptr;
 130         case '\t':
 131                 *ptr++ = 't';
 132                 return ptr;
 133         }
 134         if (!isdigit(next))
 135                 return ptr + sprintf(ptr, "%o", c);
 136
 137         return ptr + sprintf(ptr, "%03o", c);
 138 }
 139
 140 const char *show_string(const struct string *string)
 141 {
 142         static char buffer[4 * MAX_STRING + 3];
 143         char *ptr;
 144         int i;
 145
 146         if (!string || !string->length)
 147                 return "<bad_string>";
 148         ptr = buffer;
 149         *ptr++ = '"';
 150         for (i = 0; i < string->length-1; i++) {
 151                 const char *p = string->data + i;
 152                 ptr = charstr(ptr, p[0], '"', p[1]);
 153         }
 154         *ptr++ = '"';
 155         *ptr = '\0';
 156         return buffer;
 157 }
 158
 159 static const char *show_char(const char *s, size_t len, char prefix, char delim)
 160 {
 161         static char buffer[MAX_STRING + 4];
 162         char *p = buffer;
 163         if (prefix)
 164                 *p++ = prefix;
 165         *p++ = delim;
 166         memcpy(p, s, len);
 167         p += len;
 168         *p++ = delim;
 169         *p++ = '\0';
 170         return buffer;
 171 }
 172
 173 static const char *quote_char(const char *s, size_t len, char prefix, char delim)
 174 {
 175         static char buffer[2*MAX_STRING + 6];
 176         size_t i;
 177         char *p = buffer;
 178         if (prefix)
 179                 *p++ = prefix;
 180         if (delim == '"')
 181                 *p++ = '\\';
 182         *p++ = delim;
 183         for (i = 0; i < len; i++) {
 184                 if (s[i] == '"' || s[i] == '\\')
 185                         *p++ = '\\';
 186                 *p++ = s[i];
 187         }
 188         if (delim == '"')
 189                 *p++ = '\\';
 190         *p++ = delim;
 191         *p++ = '\0';
 192         return buffer;
 193 }
 194
 195 const char *show_token(const struct token *token)
 196 {
 197         static char buffer[256];
 198
 199         if (!token)
 200                 return "<no token>";
 201         switch (token_type(token)) {
 202         case TOKEN_ERROR:
 203                 return "syntax error";
 204
 205         case TOKEN_EOF:
 206                 return "end-of-input";
 207
 208         case TOKEN_IDENT:
 209                 return show_ident(token->ident);
 210
 211         case TOKEN_NUMBER:
 212                 return token->number;
 213
 214         case TOKEN_SPECIAL:
 215                 return show_special(token->special);
 216
 217         case TOKEN_CHAR:
 218                 return show_char(token->string->data,
 219                         token->string->length - 1, 0, '\'');
 220         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 221                 return show_char(token->embedded,
 222                         token_type(token) - TOKEN_CHAR, 0, '\'');
 223         case TOKEN_WIDE_CHAR:
 224                 return show_char(token->string->data,
 225                         token->string->length - 1, 'L', '\'');
 226         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 227                 return show_char(token->embedded,
 228                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 229         case TOKEN_STRING:
 230                 return show_char(token->string->data,
 231                         token->string->length - 1, 0, '"');
 232         case TOKEN_WIDE_STRING:
 233                 return show_char(token->string->data,
 234                         token->string->length - 1, 'L', '"');
 235
 236         case TOKEN_STREAMBEGIN:
 237                 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
 238                 return buffer;
 239
 240         case TOKEN_STREAMEND:
 241                 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
 242                 return buffer;
 243
 244         case TOKEN_UNTAINT:
 245                 sprintf(buffer, "<untaint>");
 246                 return buffer;
 247
 248         case TOKEN_ARG_COUNT:
 249                 sprintf(buffer, "<argcnt>");
 250                 return buffer;
 251
 252         default:
 253                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 254                 return buffer;
 255         }
 256 }
 257
 258 const char *quote_token(const struct token *token)
 259 {
 260         static char buffer[256];
 261
 262         switch (token_type(token)) {
 263         case TOKEN_ERROR:
 264                 return "syntax error";
 265
 266         case TOKEN_IDENT:
 267                 return show_ident(token->ident);
 268
 269         case TOKEN_NUMBER:
 270                 return token->number;
 271
 272         case TOKEN_SPECIAL:
 273                 return show_special(token->special);
 274
 275         case TOKEN_CHAR:
 276                 return quote_char(token->string->data,
 277                         token->string->length - 1, 0, '\'');
 278         case TOKEN_CHAR_EMBEDDED_0 ... TOKEN_CHAR_EMBEDDED_3:
 279                 return quote_char(token->embedded,
 280                         token_type(token) - TOKEN_CHAR, 0, '\'');
 281         case TOKEN_WIDE_CHAR:
 282                 return quote_char(token->string->data,
 283                         token->string->length - 1, 'L', '\'');
 284         case TOKEN_WIDE_CHAR_EMBEDDED_0 ... TOKEN_WIDE_CHAR_EMBEDDED_3:
 285                 return quote_char(token->embedded,
 286                         token_type(token) - TOKEN_WIDE_CHAR, 'L', '\'');
 287         case TOKEN_STRING:
 288                 return quote_char(token->string->data,
 289                         token->string->length - 1, 0, '"');
 290         case TOKEN_WIDE_STRING:
 291                 return quote_char(token->string->data,
 292                         token->string->length - 1, 'L', '"');
 293         default:
 294                 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
 295                 return buffer;
 296         }
 297 }
 298
 299 #define HASHED_INPUT_BITS (6)
 300 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
 301 #define HASH_PRIME 0x9e370001UL
 302
 303 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
 304
 305 int *hash_stream(const char *name)
 306 {
 307         uint32_t hash = 0;
 308         unsigned char c;
 309
 310         while ((c = *name++) != 0)
 311                 hash = (hash + (c << 4) + (c >> 4)) * 11;
 312
 313         hash *= HASH_PRIME;
 314         hash >>= 32 - HASHED_INPUT_BITS;
 315         return input_stream_hashes + hash;
 316 }
 317
 318 int init_stream(const struct position *pos, const char *name, int fd, const char **next_path)
 319 {
 320         int stream = input_stream_nr, *hash;
 321         struct stream *current;
 322
 323         if (stream >= input_streams_allocated) {
 324                 int newalloc = stream * 4 / 3 + 10;
 325                 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
 326                 if (!input_streams)
 327                         die("Unable to allocate more streams space");
 328                 input_streams_allocated = newalloc;
 329         }
 330         current = input_streams + stream;
 331         memset(current, 0, sizeof(*current));
 332         current->name = name;
 333         current->fd = fd;
 334         current->next_path = next_path;
 335         current->path = NULL;
 336         current->constant = CONSTANT_FILE_MAYBE;
 337         if (pos)
 338                 current->pos = *pos;
 339         else
 340                 current->pos.stream = -1;
 341         input_stream_nr = stream+1;
 342         hash = hash_stream(name);
 343         current->next_stream = *hash;
 344         *hash = stream;
 345         return stream;
 346 }
 347
 348 static struct token * alloc_token(stream_t *stream)
 349 {
 350         struct token *token = __alloc_token(0);
 351         token->pos = stream_pos(stream);
 352         return token;
 353 }
 354
 355 /*
 356  *  Argh...  That was surprisingly messy - handling '\r' complicates the
 357  *  things a _lot_.
 358  */
 359 static int nextchar_slow(stream_t *stream)
 360 {
 361         int offset = stream->offset;
 362         int size = stream->size;
 363         int c;
 364         int spliced = 0, had_cr, had_backslash;
 365
 366 restart:
 367         had_cr = had_backslash = 0;
 368
 369 repeat:
 370         if (offset >= size) {
 371                 if (stream->fd < 0)
 372                         goto got_eof;
 373                 size = read(stream->fd, stream->buffer, BUFSIZE);
 374                 if (size <= 0)
 375                         goto got_eof;
 376                 stream->size = size;
 377                 stream->offset = offset = 0;
 378         }
 379
 380         c = stream->buffer[offset++];
 381         if (had_cr)
 382                 goto check_lf;
 383
 384         if (c == '\r') {
 385                 had_cr = 1;
 386                 goto repeat;
 387         }
 388
 389 norm:
 390         if (!had_backslash) {
 391                 switch (c) {
 392                 case '\t':
 393                         stream->pos += tabstop - stream->pos % tabstop;
 394                         break;
 395                 case '\n':
 396                         stream->line++;
 397                         stream->pos = 0;
 398                         stream->newline = 1;
 399                         break;
 400                 case '\\':
 401                         had_backslash = 1;
 402                         stream->pos++;
 403                         goto repeat;
 404                 default:
 405                         stream->pos++;
 406                 }
 407         } else {
 408                 if (c == '\n') {
 409                         stream->line++;
 410                         stream->pos = 0;
 411                         spliced = 1;
 412                         goto restart;
 413                 }
 414                 offset--;
 415                 c = '\\';
 416         }
 417 out:
 418         stream->offset = offset;
 419
 420         return c;
 421
 422 check_lf:
 423         if (c != '\n')
 424                 offset--;
 425         c = '\n';
 426         goto norm;
 427
 428 got_eof:
 429         if (had_backslash) {
 430                 c = '\\';
 431                 goto out;
 432         }
 433         if (stream->pos & Wnewline_eof)
 434                 warning(stream_pos(stream), "no newline at end of file");
 435         else if (spliced)
 436                 warning(stream_pos(stream), "backslash-newline at end of file");
 437         return EOF;
 438 }
 439
 440 /*
 441  *  We want that as light as possible while covering all normal cases.
 442  *  Slow path (including the logics with line-splicing and EOF sanity
 443  *  checks) is in nextchar_slow().
 444  */
 445 static inline int nextchar(stream_t *stream)
 446 {
 447         int offset = stream->offset;
 448
 449         if (offset < stream->size) {
 450                 int c = stream->buffer[offset++];
 451                 static const char special[256] = {
 452                         ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
 453                 };
 454                 if (!special[c]) {
 455                         stream->offset = offset;
 456                         stream->pos++;
 457                         return c;
 458                 }
 459         }
 460         return nextchar_slow(stream);
 461 }
 462
 463 struct token eof_token_entry;
 464
 465 static struct token *mark_eof(stream_t *stream)
 466 {
 467         struct token *end;
 468
 469         end = alloc_token(stream);
 470         eof_token_entry.pos = end->pos;
 471         token_type(end) = TOKEN_STREAMEND;
 472         end->pos.newline = 1;
 473
 474         eof_token_entry.next = &eof_token_entry;
 475         eof_token_entry.pos.newline = 1;
 476
 477         end->next =  &eof_token_entry;
 478         *stream->tokenlist = end;
 479         stream->tokenlist = NULL;
 480         return end;
 481 }
 482
 483 static void add_token(stream_t *stream)
 484 {
 485         struct token *token = stream->token;
 486
 487         stream->token = NULL;
 488         token->next = NULL;
 489         *stream->tokenlist = token;
 490         stream->tokenlist = &token->next;
 491 }
 492
 493 static void drop_token(stream_t *stream)
 494 {
 495         stream->newline |= stream->token->pos.newline;
 496         stream->whitespace |= stream->token->pos.whitespace;
 497         stream->token = NULL;
 498 }
 499
/* Character-class bits stored in cclass[]. */
enum {
        Letter = 1,             /* A-Z, a-z and '_' */
        Digit = 2,              /* 0-9 */
        Hex = 4,                /* 0-9, A-F, a-f */
        Exp = 8,                /* e, E, p, P: may be followed by a sign inside a number */
        Dot = 16,               /* '.' */
        ValidSecond = 32,       /* tested on the second character in get_one_special() */
        Quote = 64,             /* ' and " */
};

/*
 * Class bits for every input character.  The table is indexed with
 * (character + 1) so that EOF (-1) lands on slot 0, which has no bits set.
 */
static const char cclass[257] = {
        ['0' + 1 ... '9' + 1] = Digit | Hex,
        ['A' + 1 ... 'D' + 1] = Letter | Hex,
        ['E' + 1] = Letter | Hex | Exp, /* E<exp> */
        ['F' + 1] = Letter | Hex,
        ['G' + 1 ... 'O' + 1] = Letter,
        ['P' + 1] = Letter | Exp,       /* P<exp> */
        ['Q' + 1 ... 'Z' + 1] = Letter,
        ['a' + 1 ... 'd' + 1] = Letter | Hex,
        ['e' + 1] = Letter | Hex | Exp, /* e<exp> */
        ['f' + 1] = Letter | Hex,
        ['g' + 1 ... 'o' + 1] = Letter,
        ['p' + 1] = Letter | Exp,       /* p<exp> */
        ['q' + 1 ... 'z' + 1] = Letter,
        ['_' + 1] = Letter,
        ['.' + 1] = Dot | ValidSecond,
        ['=' + 1] = ValidSecond,
        ['+' + 1] = ValidSecond,
        ['-' + 1] = ValidSecond,
        ['>' + 1] = ValidSecond,
        ['<' + 1] = ValidSecond,
        ['&' + 1] = ValidSecond,
        ['|' + 1] = ValidSecond,
        ['#' + 1] = ValidSecond,
        ['\'' + 1] = Quote,
        ['"' + 1] = Quote,
};
 537
 538 /*
 539  * pp-number:
 540  *      digit
 541  *      . digit
 542  *      pp-number digit
 543  *      pp-number identifier-nodigit
 544  *      pp-number e sign
 545  *      pp-number E sign
 546  *      pp-number p sign
 547  *      pp-number P sign
 548  *      pp-number .
 549  */
 550 static int get_one_number(int c, int next, stream_t *stream)
 551 {
 552         struct token *token;
 553         static char buffer[4095];
 554         char *p = buffer, *buffer_end = buffer + sizeof (buffer);
 555
 556         *p++ = c;
 557         for (;;) {
 558                 long class =  cclass[next + 1];
 559                 if (!(class & (Dot | Digit | Letter)))
 560                         break;
 561                 if (p != buffer_end)
 562                         *p++ = next;
 563                 next = nextchar(stream);
 564                 if (class & Exp) {
 565                         if (next == '-' || next == '+') {
 566                                 if (p != buffer_end)
 567                                         *p++ = next;
 568                                 next = nextchar(stream);
 569                         }
 570                 }
 571         }
 572
 573         if (p == buffer_end) {
 574                 sparse_error(stream_pos(stream), "number token exceeds %td characters",
 575                       buffer_end - buffer);
 576                 // Pretend we saw just "1".
 577                 buffer[0] = '1';
 578                 p = buffer + 1;
 579         }
 580
 581         *p++ = 0;
 582         token = stream->token;
 583         token_type(token) = TOKEN_NUMBER;
 584         token->number = xmemdup(buffer, p - buffer);
 585         add_token(stream);
 586
 587         return next;
 588 }
 589
 590 static int eat_string(int next, stream_t *stream, enum token_type type)
 591 {
 592         static char buffer[MAX_STRING];
 593         struct string *string;
 594         struct token *token = stream->token;
 595         int len = 0;
 596         int escape;
 597         int want_hex = 0;
 598         char delim = type < TOKEN_STRING ? '\'' : '"';
 599
 600         for (escape = 0; escape || next != delim; next = nextchar(stream)) {
 601                 if (len < MAX_STRING)
 602                         buffer[len] = next;
 603                 len++;
 604                 if (next == '\n') {
 605                         warning(stream_pos(stream),
 606                                 "missing terminating %c character", delim);
 607                         /* assume delimiter is lost */
 608                         break;
 609                 }
 610                 if (next == EOF) {
 611                         warning(stream_pos(stream),
 612                                 "End of file in middle of string");
 613                         return next;
 614                 }
 615                 if (!escape) {
 616                         if (want_hex && !(cclass[next + 1] & Hex))
 617                                 warning(stream_pos(stream),
 618                                         "\\x used with no following hex digits");
 619                         want_hex = 0;
 620                         escape = next == '\\';
 621                 } else {
 622                         escape = 0;
 623                         want_hex = next == 'x';
 624                 }
 625         }
 626         if (want_hex)
 627                 warning(stream_pos(stream),
 628                         "\\x used with no following hex digits");
 629         if (len > MAX_STRING) {
 630                 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
 631                 len = MAX_STRING;
 632         }
 633         if (delim == '\'' && len && len <= 4) {
 634                 token_type(token) = type + len;
 635                 memset(buffer + len, '\0', 4 - len);
 636                 memcpy(token->embedded, buffer, 4);
 637         } else {
 638                 token_type(token) = type;
 639                 string = __alloc_string(len+1);
 640                 memcpy(string->data, buffer, len);
 641                 string->data[len] = '\0';
 642                 string->length = len+1;
 643                 token->string = string;
 644         }
 645
 646         /* Pass it on.. */
 647         token = stream->token;
 648         add_token(stream);
 649         return nextchar(stream);
 650 }
 651
 652 static int drop_stream_eoln(stream_t *stream)
 653 {
 654         drop_token(stream);
 655         for (;;) {
 656                 switch (nextchar(stream)) {
 657                 case EOF:
 658                         return EOF;
 659                 case '\n':
 660                         return nextchar(stream);
 661                 }
 662         }
 663 }
 664
 665 static int drop_stream_comment(stream_t *stream)
 666 {
 667         int newline;
 668         int next;
 669         drop_token(stream);
 670         newline = stream->newline;
 671
 672         next = nextchar(stream);
 673         for (;;) {
 674                 int curr = next;
 675                 if (curr == EOF) {
 676                         warning(stream_pos(stream), "End of file in the middle of a comment");
 677                         return curr;
 678                 }
 679                 next = nextchar(stream);
 680                 if (curr == '*' && next == '/')
 681                         break;
 682         }
 683         stream->newline = newline;
 684         return nextchar(stream);
 685 }
 686
/*
 * Spellings of the multi-character punctuators; show_special() indexes
 * this with (value - SPECIAL_BASE).
 */
unsigned char combinations[][4] = COMBINATION_STRINGS;

#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)

/* hash function for two-character punctuators - all give unique values */
#define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)

/*
 * The two characters that belong in each hash slot; get_one_special()
 * compares against them to confirm a hit.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
        RES('+', '='), /* 00 */
        RES('/', '='), /* 01 */
        RES('^', '='), /* 05 */
        RES('&', '&'), /* 07 */
        RES('#', '#'), /* 08 */
        RES('<', '<'), /* 0a */
        RES('<', '='), /* 0c */
        RES('!', '='), /* 0e */
        RES('%', '='), /* 0f */
        RES('-', '-'), /* 10 */
        RES('-', '='), /* 11 */
        RES('-', '>'), /* 13 */
        RES('=', '='), /* 15 */
        RES('&', '='), /* 17 */
        RES('*', '='), /* 18 */
        RES('.', '.'), /* 1a */
        RES('+', '+'), /* 1b */
        RES('|', '='), /* 1c */
        RES('>', '='), /* 1d */
        RES('|', '|'), /* 1e */
        RES('>', '>')  /* 1f */
#undef RES
};
/* The SPECIAL_* value for each hash slot, parallel to hash_results[]. */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
        CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
        CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
        CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
        CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
        CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
        CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
        CODE('<', '=', SPECIAL_LTE), /* 0c */
        CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
        CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
        CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
        CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
        CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
        CODE('=', '=', SPECIAL_EQUAL), /* 15 */
        CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
        CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
        CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
        CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
        CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
        CODE('>', '=', SPECIAL_GTE), /* 1d */
        CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
        CODE('>', '>', SPECIAL_RIGHTSHIFT)  /* 1f */
#undef CODE
};
 748
 749 static int get_one_special(int c, stream_t *stream)
 750 {
 751         struct token *token;
 752         int next, value, i;
 753
 754         next = nextchar(stream);
 755
 756         /*
 757          * Check for numbers, strings, character constants, and comments
 758          */
 759         switch (c) {
 760         case '.':
 761                 if (next >= '0' && next <= '9')
 762                         return get_one_number(c, next, stream);
 763                 break;
 764         case '"':
 765                 return eat_string(next, stream, TOKEN_STRING);
 766         case '\'':
 767                 return eat_string(next, stream, TOKEN_CHAR);
 768         case '/':
 769                 if (next == '/')
 770                         return drop_stream_eoln(stream);
 771                 if (next == '*')
 772                         return drop_stream_comment(stream);
 773         }
 774
 775         /*
 776          * Check for combinations
 777          */
 778         value = c;
 779         if (cclass[next + 1] & ValidSecond) {
 780                 i = special_hash(c, next);
 781                 if (hash_results[i][0] == c && hash_results[i][1] == next) {
 782                         value = code[i];
 783                         next = nextchar(stream);
 784                         if (value >= SPECIAL_LEFTSHIFT &&
 785                             next == "==."[value - SPECIAL_LEFTSHIFT]) {
 786                                 value += 3;
 787                                 next = nextchar(stream);
 788                         }
 789                 }
 790         }
 791
 792         /* Pass it on.. */
 793         token = stream->token;
 794         token_type(token) = TOKEN_SPECIAL;
 795         token->special = value;
 796         add_token(stream);
 797         return next;
 798 }
 799
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: start from the first character, fold in
 * every further character as hash*11 + c, then add the bits above
 * IDENT_HASH_BITS back in and mask the result to a bucket index.
 */
#define ident_hash_init(c)              (c)
#define ident_hash_add(oldhash,c)       ((oldhash)*11 + (c))
#define ident_hash_end(hash)            ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets of identifiers, chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Counters: lookups that found an entry, entries added, identifiers created by lookup. */
static int ident_hit, ident_miss, idents;
 810
 811 void show_identifier_stats(void)
 812 {
 813         int i;
 814         int distribution[100];
 815
 816         fprintf(stderr, "identifiers: %d hits, %d misses\n",
 817                 ident_hit, ident_miss);
 818
 819         for (i = 0; i < 100; i++)
 820                 distribution[i] = 0;
 821
 822         for (i = 0; i < IDENT_HASH_SIZE; i++) {
 823                 struct ident * ident = hash_table[i];
 824                 int count = 0;
 825
 826                 while (ident) {
 827                         count++;
 828                         ident = ident->next;
 829                 }
 830                 if (count > 99)
 831                         count = 99;
 832                 distribution[count]++;
 833         }
 834
 835         for (i = 0; i < 100; i++) {
 836                 if (distribution[i])
 837                         fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
 838         }
 839 }
 840
 841 struct ident *alloc_ident(const char *name, int len)
 842 {
 843         struct ident *ident = __alloc_ident(len);
 844         ident->symbols = NULL;
 845         ident->len = len;
 846         ident->tainted = 0;
 847         memcpy(ident->name, name, len);
 848         return ident;
 849 }
 850
 851 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
 852 {
 853         ident->next = hash_table[hash];
 854         hash_table[hash] = ident;
 855         ident_miss++;
 856         return ident;
 857 }
 858
 859 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
 860 {
 861         struct ident *ident;
 862         struct ident **p;
 863
 864         p = &hash_table[hash];
 865         while ((ident = *p) != NULL) {
 866                 if (ident->len == (unsigned char) len) {
 867                         if (strncmp(name, ident->name, len) != 0)
 868                                 goto next;
 869
 870                         ident_hit++;
 871                         return ident;
 872                 }
 873 next:
 874                 //misses++;
 875                 p = &ident->next;
 876         }
 877         ident = alloc_ident(name, len);
 878         *p = ident;
 879         ident->next = NULL;
 880         ident_miss++;
 881         idents++;
 882         return ident;
 883 }
 884
 885 static unsigned long hash_name(const char *name, int len)
 886 {
 887         unsigned long hash;
 888         const unsigned char *p = (const unsigned char *)name;
 889
 890         hash = ident_hash_init(*p++);
 891         while (--len) {
 892                 unsigned int i = *p++;
 893                 hash = ident_hash_add(hash, i);
 894         }
 895         return ident_hash_end(hash);
 896 }
 897
 898 struct ident *hash_ident(struct ident *ident)
 899 {
 900         return insert_hash(ident, hash_name(ident->name, ident->len));
 901 }
 902
 903 struct ident *built_in_ident(const char *name)
 904 {
 905         int len = strlen(name);
 906         return create_hashed_ident(name, len, hash_name(name, len));
 907 }
 908
 909 struct token *built_in_token(int stream, struct ident *ident)
 910 {
 911         struct token *token;
 912
 913         token = __alloc_token(0);
 914         token->pos.stream = stream;
 915         token_type(token) = TOKEN_IDENT;
 916         token->ident = ident;
 917         return token;
 918 }
 919
 920 static int get_one_identifier(int c, stream_t *stream)
 921 {
 922         struct token *token;
 923         struct ident *ident;
 924         unsigned long hash;
 925         char buf[256];
 926         int len = 1;
 927         int next;
 928
 929         hash = ident_hash_init(c);
 930         buf[0] = c;
 931         for (;;) {
 932                 next = nextchar(stream);
 933                 if (!(cclass[next + 1] & (Letter | Digit)))
 934                         break;
 935                 if (len >= sizeof(buf))
 936                         break;
 937                 hash = ident_hash_add(hash, next);
 938                 buf[len] = next;
 939                 len++;
 940         };
 941         if (cclass[next + 1] & Quote) {
 942                 if (len == 1 && buf[0] == 'L') {
 943                         if (next == '\'')
 944                                 return eat_string(nextchar(stream), stream,
 945                                                         TOKEN_WIDE_CHAR);
 946                         else
 947                                 return eat_string(nextchar(stream), stream,
 948                                                         TOKEN_WIDE_STRING);
 949                 }
 950         }
 951         hash = ident_hash_end(hash);
 952         ident = create_hashed_ident(buf, len, hash);
 953
 954         /* Pass it on.. */
 955         token = stream->token;
 956         token_type(token) = TOKEN_IDENT;
 957         token->ident = ident;
 958         add_token(stream);
 959         return next;
 960 }
 961
 962 static int get_one_token(int c, stream_t *stream)
 963 {
 964         long class = cclass[c + 1];
 965         if (class & Digit)
 966                 return get_one_number(c, nextchar(stream), stream);
 967         if (class & Letter)
 968                 return get_one_identifier(c, stream);
 969         return get_one_special(c, stream);
 970 }
 971
 972 static struct token *setup_stream(stream_t *stream, int idx, int fd,
 973         unsigned char *buf, unsigned int buf_size)
 974 {
 975         struct token *begin;
 976
 977         stream->nr = idx;
 978         stream->line = 1;
 979         stream->newline = 1;
 980         stream->whitespace = 0;
 981         stream->pos = 0;
 982
 983         stream->token = NULL;
 984         stream->fd = fd;
 985         stream->offset = 0;
 986         stream->size = buf_size;
 987         stream->buffer = buf;
 988
 989         begin = alloc_token(stream);
 990         token_type(begin) = TOKEN_STREAMBEGIN;
 991         stream->tokenlist = &begin->next;
 992         return begin;
 993 }
 994
 995 static struct token *tokenize_stream(stream_t *stream)
 996 {
 997         int c = nextchar(stream);
 998         while (c != EOF) {
 999                 if (!isspace(c)) {
1000                         struct token *token = alloc_token(stream);
1001                         stream->token = token;
1002                         stream->newline = 0;
1003                         stream->whitespace = 0;
1004                         c = get_one_token(c, stream);
1005                         continue;
1006                 }
1007                 stream->whitespace = 1;
1008                 c = nextchar(stream);
1009         }
1010         return mark_eof(stream);
1011 }
1012
1013 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
1014 {
1015         stream_t stream;
1016         struct token *begin;
1017
1018         begin = setup_stream(&stream, 0, -1, buffer, size);
1019         *endtoken = tokenize_stream(&stream);
1020         return begin;
1021 }
1022
1023 struct token * tokenize(const struct position *pos, const char *name, int fd, struct token *endtoken, const char **next_path)
1024 {
1025         struct token *begin, *end;
1026         stream_t stream;
1027         unsigned char buffer[BUFSIZE];
1028         int idx;
1029
1030         idx = init_stream(pos, name, fd, next_path);
1031         if (idx < 0) {
1032                 // info(endtoken->pos, "File %s is const", name);
1033                 return endtoken;
1034         }
1035
1036         begin = setup_stream(&stream, idx, fd, buffer, 0);
1037         end = tokenize_stream(&stream);
1038         if (endtoken)
1039                 end->next = endtoken;
1040         return begin;
1041 }