param_limit: fix read beyond end of array (segfault)
[smatch.git] / tokenize.c
blob61782754f4331b2b49ac84f7b2b73c39294bd0b7
/*
 * This is a really stupid C tokenizer. It doesn't do any include
 * files or anything complex at all. That's the preprocessor.
 *
 * Copyright (C) 2003 Transmeta Corp.
 *               2003 Linus Torvalds
 *
 * Licensed under the Open Software License version 1.1
 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <stdint.h>
19 #include "lib.h"
20 #include "allocate.h"
21 #include "token.h"
22 #include "symbol.h"
24 #define EOF (-1)
26 int input_stream_nr = 0;
27 struct stream *input_streams;
28 static int input_streams_allocated;
29 unsigned int tabstop = 8;
30 int no_lineno = 0;
32 #define BUFSIZE (8192)
34 typedef struct {
35 int fd, offset, size;
36 int pos, line, nr;
37 int newline, whitespace;
38 struct token **tokenlist;
39 struct token *token;
40 unsigned char *buffer;
41 } stream_t;
43 const char *stream_name(int stream)
45 if (stream < 0 || stream > input_stream_nr)
46 return "<bad stream>";
47 return input_streams[stream].name;
50 static struct position stream_pos(stream_t *stream)
52 struct position pos;
53 pos.type = 0;
54 pos.stream = stream->nr;
55 pos.newline = stream->newline;
56 pos.whitespace = stream->whitespace;
57 pos.pos = stream->pos;
59 pos.line = stream->line;
60 if (no_lineno)
61 pos.line = 123456;
63 pos.noexpand = 0;
64 return pos;
67 const char *show_special(int val)
69 static char buffer[4];
71 buffer[0] = val;
72 buffer[1] = 0;
73 if (val >= SPECIAL_BASE)
74 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
75 return buffer;
78 const char *show_ident(const struct ident *ident)
80 static char buffer[256];
81 if (!ident)
82 return "<noident>";
83 sprintf(buffer, "%.*s", ident->len, ident->name);
84 return buffer;
87 static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
89 if (isprint(c)) {
90 if (c == escape || c == '\\')
91 *ptr++ = '\\';
92 *ptr++ = c;
93 return ptr;
95 *ptr++ = '\\';
96 switch (c) {
97 case '\n':
98 *ptr++ = 'n';
99 return ptr;
100 case '\t':
101 *ptr++ = 't';
102 return ptr;
104 if (!isdigit(next))
105 return ptr + sprintf(ptr, "%o", c);
107 return ptr + sprintf(ptr, "%03o", c);
110 const char *show_string(const struct string *string)
112 static char buffer[4 * MAX_STRING + 3];
113 char *ptr;
114 int i;
116 if (!string->length)
117 return "<bad_string>";
118 ptr = buffer;
119 *ptr++ = '"';
120 for (i = 0; i < string->length-1; i++) {
121 const char *p = string->data + i;
122 ptr = charstr(ptr, p[0], '"', p[1]);
124 *ptr++ = '"';
125 *ptr = '\0';
126 return buffer;
129 const char *show_token(const struct token *token)
131 static char buffer[256];
133 if (!token)
134 return "<no token>";
135 switch (token_type(token)) {
136 case TOKEN_ERROR:
137 return "syntax error";
139 case TOKEN_EOF:
140 return "end-of-input";
142 case TOKEN_IDENT:
143 return show_ident(token->ident);
145 case TOKEN_STRING:
146 case TOKEN_WIDE_STRING:
147 return show_string(token->string);
149 case TOKEN_NUMBER:
150 return token->number;
152 case TOKEN_SPECIAL:
153 return show_special(token->special);
155 case TOKEN_CHAR:
156 case TOKEN_WIDE_CHAR: {
157 char *ptr = buffer;
158 int c = token->character;
159 *ptr++ = '\'';
160 ptr = charstr(ptr, c, '\'', 0);
161 *ptr++ = '\'';
162 *ptr++ = '\0';
163 return buffer;
166 case TOKEN_STREAMBEGIN:
167 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
168 return buffer;
170 case TOKEN_STREAMEND:
171 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
172 return buffer;
174 case TOKEN_UNTAINT:
175 sprintf(buffer, "<untaint>");
176 return buffer;
178 case TOKEN_ARG_COUNT:
179 sprintf(buffer, "<argcnt>");
180 return buffer;
182 default:
183 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
184 return buffer;
188 #define HASHED_INPUT_BITS (6)
189 #define HASHED_INPUT (1 << HASHED_INPUT_BITS)
190 #define HASH_PRIME 0x9e370001UL
192 static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };
194 int *hash_stream(const char *name)
196 uint32_t hash = 0;
197 unsigned char c;
199 while ((c = *name++) != 0)
200 hash = (hash + (c << 4) + (c >> 4)) * 11;
202 hash *= HASH_PRIME;
203 hash >>= 32 - HASHED_INPUT_BITS;
204 return input_stream_hashes + hash;
207 int init_stream(const char *name, int fd, const char **next_path)
209 int stream = input_stream_nr, *hash;
210 struct stream *current;
212 if (stream >= input_streams_allocated) {
213 int newalloc = stream * 4 / 3 + 10;
214 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
215 if (!input_streams)
216 die("Unable to allocate more streams space");
217 input_streams_allocated = newalloc;
219 current = input_streams + stream;
220 memset(current, 0, sizeof(*current));
221 current->name = name;
222 current->fd = fd;
223 current->next_path = next_path;
224 current->path = NULL;
225 current->constant = CONSTANT_FILE_MAYBE;
226 input_stream_nr = stream+1;
227 hash = hash_stream(name);
228 current->next_stream = *hash;
229 *hash = stream;
230 return stream;
233 static struct token * alloc_token(stream_t *stream)
235 struct token *token = __alloc_token(0);
236 token->pos = stream_pos(stream);
237 return token;
241 * Argh... That was surprisingly messy - handling '\r' complicates the
242 * things a _lot_.
244 static int nextchar_slow(stream_t *stream)
246 int offset = stream->offset;
247 int size = stream->size;
248 int c;
249 int spliced = 0, had_cr, had_backslash, complain;
251 restart:
252 had_cr = had_backslash = complain = 0;
254 repeat:
255 if (offset >= size) {
256 if (stream->fd < 0)
257 goto got_eof;
258 size = read(stream->fd, stream->buffer, BUFSIZE);
259 if (size <= 0)
260 goto got_eof;
261 stream->size = size;
262 stream->offset = offset = 0;
265 c = stream->buffer[offset++];
267 if (had_cr && c != '\n')
268 complain = 1;
270 if (c == '\r') {
271 had_cr = 1;
272 goto repeat;
275 stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;
277 if (c == '\n') {
278 stream->line++;
279 stream->pos = 0;
282 if (!had_backslash) {
283 if (c == '\\') {
284 had_backslash = 1;
285 goto repeat;
287 if (c == '\n')
288 stream->newline = 1;
289 } else {
290 if (c == '\n') {
291 if (complain)
292 warning(stream_pos(stream), "non-ASCII data stream");
293 spliced = 1;
294 goto restart;
296 stream->pos--;
297 offset--;
298 c = '\\';
301 out:
302 stream->offset = offset;
303 if (complain)
304 warning(stream_pos(stream), "non-ASCII data stream");
306 return c;
308 got_eof:
309 if (had_backslash) {
310 c = '\\';
311 goto out;
313 if (stream->pos)
314 warning(stream_pos(stream), "no newline at end of file");
315 else if (had_cr)
316 warning(stream_pos(stream), "non-ASCII data stream");
317 else if (spliced)
318 warning(stream_pos(stream), "backslash-newline at end of file");
319 return EOF;
323 * We want that as light as possible while covering all normal cases.
324 * Slow path (including the logics with line-splicing and EOF sanity
325 * checks) is in nextchar_slow().
327 static inline int nextchar(stream_t *stream)
329 int offset = stream->offset;
331 if (offset < stream->size) {
332 int c = stream->buffer[offset++];
333 static const char special[256] = {
334 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
336 if (!special[c]) {
337 stream->offset = offset;
338 stream->pos++;
339 return c;
342 return nextchar_slow(stream);
345 struct token eof_token_entry;
347 static struct token *mark_eof(stream_t *stream)
349 struct token *end;
351 end = alloc_token(stream);
352 token_type(end) = TOKEN_STREAMEND;
353 end->pos.newline = 1;
355 eof_token_entry.next = &eof_token_entry;
356 eof_token_entry.pos.newline = 1;
358 end->next = &eof_token_entry;
359 *stream->tokenlist = end;
360 stream->tokenlist = NULL;
361 return end;
364 static void add_token(stream_t *stream)
366 struct token *token = stream->token;
368 stream->token = NULL;
369 token->next = NULL;
370 *stream->tokenlist = token;
371 stream->tokenlist = &token->next;
374 static void drop_token(stream_t *stream)
376 stream->newline |= stream->token->pos.newline;
377 stream->whitespace |= stream->token->pos.whitespace;
378 stream->token = NULL;
381 enum {
382 Letter = 1,
383 Digit = 2,
384 Hex = 4,
385 Exp = 8,
386 Dot = 16,
387 ValidSecond = 32,
390 static const long cclass[257] = {
391 ['0' + 1 ... '9' + 1] = Digit | Hex,
392 ['A' + 1 ... 'D' + 1] = Letter | Hex,
393 ['E' + 1] = Letter | Hex | Exp,
394 ['F' + 1] = Letter | Hex,
395 ['G' + 1 ... 'O' + 1] = Letter,
396 ['P' + 1] = Letter | Exp,
397 ['Q' + 1 ... 'Z' + 1] = Letter,
398 ['a' + 1 ... 'd' + 1] = Letter | Hex,
399 ['e' + 1] = Letter | Hex | Exp,
400 ['f' + 1] = Letter | Hex,
401 ['g' + 1 ... 'o' + 1] = Letter,
402 ['p' + 1] = Letter | Exp,
403 ['q' + 1 ... 'z' + 1] = Letter,
404 ['_' + 1] = Letter,
405 ['.' + 1] = Dot | ValidSecond,
406 ['=' + 1] = ValidSecond,
407 ['+' + 1] = ValidSecond,
408 ['-' + 1] = ValidSecond,
409 ['>' + 1] = ValidSecond,
410 ['<' + 1] = ValidSecond,
411 ['&' + 1] = ValidSecond,
412 ['|' + 1] = ValidSecond,
413 ['#' + 1] = ValidSecond,
417 * pp-number:
418 * digit
419 * . digit
420 * pp-number digit
421 * pp-number identifier-nodigit
422 * pp-number e sign
423 * pp-number E sign
424 * pp-number p sign
425 * pp-number P sign
426 * pp-number .
428 static int get_one_number(int c, int next, stream_t *stream)
430 struct token *token;
431 static char buffer[4095];
432 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
433 int len;
435 *p++ = c;
436 for (;;) {
437 long class = cclass[next + 1];
438 if (!(class & (Dot | Digit | Letter)))
439 break;
440 if (p != buffer_end)
441 *p++ = next;
442 next = nextchar(stream);
443 if (class & Exp) {
444 if (next == '-' || next == '+') {
445 if (p != buffer_end)
446 *p++ = next;
447 next = nextchar(stream);
452 if (p == buffer_end) {
453 sparse_error(stream_pos(stream), "number token exceeds %td characters",
454 buffer_end - buffer);
455 // Pretend we saw just "1".
456 buffer[0] = '1';
457 p = buffer + 1;
460 *p++ = 0;
461 len = p - buffer;
462 buf = __alloc_bytes(len);
463 memcpy(buf, buffer, len);
465 token = stream->token;
466 token_type(token) = TOKEN_NUMBER;
467 token->number = buf;
468 add_token(stream);
470 return next;
473 static int escapechar(int first, int type, stream_t *stream, int *valp)
475 int next, value;
477 next = nextchar(stream);
478 value = first;
480 if (first == '\n')
481 warning(stream_pos(stream), "Newline in string or character constant");
483 if (first == '\\' && next != EOF) {
484 value = next;
485 next = nextchar(stream);
486 if (value != type) {
487 switch (value) {
488 case 'a':
489 value = '\a';
490 break;
491 case 'b':
492 value = '\b';
493 break;
494 case 't':
495 value = '\t';
496 break;
497 case 'n':
498 value = '\n';
499 break;
500 case 'v':
501 value = '\v';
502 break;
503 case 'f':
504 value = '\f';
505 break;
506 case 'r':
507 value = '\r';
508 break;
509 case 'e':
510 value = '\e';
511 break;
512 case '\\':
513 break;
514 case '?':
515 break;
516 case '\'':
517 break;
518 case '"':
519 break;
520 case '\n':
521 warning(stream_pos(stream), "Newline in string or character constant");
522 break;
523 case '0'...'7': {
524 int nr = 2;
525 value -= '0';
526 while (next >= '0' && next <= '7') {
527 value = (value << 3) + (next-'0');
528 next = nextchar(stream);
529 if (!--nr)
530 break;
532 value &= 0xff;
533 break;
535 case 'x': {
536 int hex = hexval(next);
537 if (hex < 16) {
538 value = hex;
539 next = nextchar(stream);
540 while ((hex = hexval(next)) < 16) {
541 value = (value << 4) + hex;
542 next = nextchar(stream);
544 value &= 0xff;
545 break;
548 /* Fall through */
549 default:
550 warning(stream_pos(stream), "Unknown escape '%c'", value);
553 /* Mark it as escaped */
554 value |= 0x100;
556 *valp = value;
557 return next;
560 static int get_char_token(int next, stream_t *stream, enum token_type type)
562 int value;
563 struct token *token;
565 next = escapechar(next, '\'', stream, &value);
566 if (value == '\'' || next != '\'') {
567 sparse_error(stream_pos(stream), "Bad character constant");
568 drop_token(stream);
569 return next;
572 token = stream->token;
573 token_type(token) = type;
574 token->character = value & 0xff;
576 add_token(stream);
577 return nextchar(stream);
580 static int get_string_token(int next, stream_t *stream, enum token_type type)
582 static char buffer[MAX_STRING];
583 struct string *string;
584 struct token *token;
585 int len = 0;
587 for (;;) {
588 int val;
589 next = escapechar(next, '"', stream, &val);
590 if (val == '"')
591 break;
592 if (next == EOF) {
593 warning(stream_pos(stream), "End of file in middle of string");
594 return next;
596 if (len < MAX_STRING)
597 buffer[len] = val;
598 len++;
601 if (len > MAX_STRING) {
602 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
603 len = MAX_STRING;
606 string = __alloc_string(len+1);
607 memcpy(string->data, buffer, len);
608 string->data[len] = '\0';
609 string->length = len+1;
611 /* Pass it on.. */
612 token = stream->token;
613 token_type(token) = type;
614 token->string = string;
615 add_token(stream);
617 return next;
620 static int drop_stream_eoln(stream_t *stream)
622 drop_token(stream);
623 for (;;) {
624 switch (nextchar(stream)) {
625 case EOF:
626 return EOF;
627 case '\n':
628 return nextchar(stream);
633 static int drop_stream_comment(stream_t *stream)
635 int newline;
636 int next;
637 drop_token(stream);
638 newline = stream->newline;
640 next = nextchar(stream);
641 for (;;) {
642 int curr = next;
643 if (curr == EOF) {
644 warning(stream_pos(stream), "End of file in the middle of a comment");
645 return curr;
647 next = nextchar(stream);
648 if (curr == '*' && next == '/')
649 break;
651 stream->newline = newline;
652 return nextchar(stream);
655 unsigned char combinations[][4] = COMBINATION_STRINGS;
657 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
659 /* hash function for two-character punctuators - all give unique values */
660 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
663 * note that we won't get false positives - special_hash(0,0) is 0 and
664 * entry 0 is filled (by +=), so all the missing ones are OK.
666 static unsigned char hash_results[32][2] = {
667 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
668 RES('+', '='), /* 00 */
669 RES('/', '='), /* 01 */
670 RES('^', '='), /* 05 */
671 RES('&', '&'), /* 07 */
672 RES('#', '#'), /* 08 */
673 RES('<', '<'), /* 0a */
674 RES('<', '='), /* 0c */
675 RES('!', '='), /* 0e */
676 RES('%', '='), /* 0f */
677 RES('-', '-'), /* 10 */
678 RES('-', '='), /* 11 */
679 RES('-', '>'), /* 13 */
680 RES('=', '='), /* 15 */
681 RES('&', '='), /* 17 */
682 RES('*', '='), /* 18 */
683 RES('.', '.'), /* 1a */
684 RES('+', '+'), /* 1b */
685 RES('|', '='), /* 1c */
686 RES('>', '='), /* 1d */
687 RES('|', '|'), /* 1e */
688 RES('>', '>') /* 1f */
689 #undef RES
691 static int code[32] = {
692 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
693 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
694 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
695 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
696 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
697 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
698 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
699 CODE('<', '=', SPECIAL_LTE), /* 0c */
700 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
701 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
702 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
703 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
704 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
705 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
706 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
707 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
708 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
709 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
710 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
711 CODE('>', '=', SPECIAL_GTE), /* 1d */
712 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
713 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
714 #undef CODE
717 static int get_one_special(int c, stream_t *stream)
719 struct token *token;
720 int next, value, i;
722 next = nextchar(stream);
725 * Check for numbers, strings, character constants, and comments
727 switch (c) {
728 case '.':
729 if (next >= '0' && next <= '9')
730 return get_one_number(c, next, stream);
731 break;
732 case '"':
733 return get_string_token(next, stream, TOKEN_STRING);
734 case '\'':
735 return get_char_token(next, stream, TOKEN_CHAR);
736 case '/':
737 if (next == '/')
738 return drop_stream_eoln(stream);
739 if (next == '*')
740 return drop_stream_comment(stream);
744 * Check for combinations
746 value = c;
747 if (cclass[next + 1] & ValidSecond) {
748 i = special_hash(c, next);
749 if (hash_results[i][0] == c && hash_results[i][1] == next) {
750 value = code[i];
751 next = nextchar(stream);
752 if (value >= SPECIAL_LEFTSHIFT &&
753 next == "==."[value - SPECIAL_LEFTSHIFT]) {
754 value += 3;
755 next = nextchar(stream);
760 /* Pass it on.. */
761 token = stream->token;
762 token_type(token) = TOKEN_SPECIAL;
763 token->special = value;
764 add_token(stream);
765 return next;
768 #define IDENT_HASH_BITS (13)
769 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
770 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
772 #define ident_hash_init(c) (c)
773 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
774 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
776 static struct ident *hash_table[IDENT_HASH_SIZE];
777 static int ident_hit, ident_miss, idents;
779 void show_identifier_stats(void)
781 int i;
782 int distribution[100];
784 fprintf(stderr, "identifiers: %d hits, %d misses\n",
785 ident_hit, ident_miss);
787 for (i = 0; i < 100; i++)
788 distribution[i] = 0;
790 for (i = 0; i < IDENT_HASH_SIZE; i++) {
791 struct ident * ident = hash_table[i];
792 int count = 0;
794 while (ident) {
795 count++;
796 ident = ident->next;
798 if (count > 99)
799 count = 99;
800 distribution[count]++;
803 for (i = 0; i < 100; i++) {
804 if (distribution[i])
805 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
809 static struct ident *alloc_ident(const char *name, int len)
811 struct ident *ident = __alloc_ident(len);
812 ident->symbols = NULL;
813 ident->len = len;
814 ident->tainted = 0;
815 memcpy(ident->name, name, len);
816 return ident;
819 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
821 ident->next = hash_table[hash];
822 hash_table[hash] = ident;
823 ident_miss++;
824 return ident;
827 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
829 struct ident *ident;
830 struct ident **p;
832 p = &hash_table[hash];
833 while ((ident = *p) != NULL) {
834 if (ident->len == (unsigned char) len) {
835 if (strncmp(name, ident->name, len) != 0)
836 goto next;
838 ident_hit++;
839 return ident;
841 next:
842 //misses++;
843 p = &ident->next;
845 ident = alloc_ident(name, len);
846 *p = ident;
847 ident->next = NULL;
848 ident_miss++;
849 idents++;
850 return ident;
853 static unsigned long hash_name(const char *name, int len)
855 unsigned long hash;
856 const unsigned char *p = (const unsigned char *)name;
858 hash = ident_hash_init(*p++);
859 while (--len) {
860 unsigned int i = *p++;
861 hash = ident_hash_add(hash, i);
863 return ident_hash_end(hash);
866 struct ident *hash_ident(struct ident *ident)
868 return insert_hash(ident, hash_name(ident->name, ident->len));
871 struct ident *built_in_ident(const char *name)
873 int len = strlen(name);
874 return create_hashed_ident(name, len, hash_name(name, len));
877 struct token *built_in_token(int stream, const char *name)
879 struct token *token;
881 token = __alloc_token(0);
882 token->pos.stream = stream;
883 token_type(token) = TOKEN_IDENT;
884 token->ident = built_in_ident(name);
885 return token;
888 static int get_one_identifier(int c, stream_t *stream)
890 struct token *token;
891 struct ident *ident;
892 unsigned long hash;
893 char buf[256];
894 int len = 1;
895 int next;
897 hash = ident_hash_init(c);
898 buf[0] = c;
899 for (;;) {
900 next = nextchar(stream);
901 if (!(cclass[next + 1] & (Letter | Digit)))
902 break;
903 if (len >= sizeof(buf))
904 break;
905 hash = ident_hash_add(hash, next);
906 buf[len] = next;
907 len++;
909 hash = ident_hash_end(hash);
911 ident = create_hashed_ident(buf, len, hash);
913 if (ident == &L_ident) {
914 if (next == '\'')
915 return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
916 if (next == '\"')
917 return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
920 /* Pass it on.. */
921 token = stream->token;
922 token_type(token) = TOKEN_IDENT;
923 token->ident = ident;
924 add_token(stream);
925 return next;
928 static int get_one_token(int c, stream_t *stream)
930 long class = cclass[c + 1];
931 if (class & Digit)
932 return get_one_number(c, nextchar(stream), stream);
933 if (class & Letter)
934 return get_one_identifier(c, stream);
935 return get_one_special(c, stream);
938 static struct token *setup_stream(stream_t *stream, int idx, int fd,
939 unsigned char *buf, unsigned int buf_size)
941 struct token *begin;
943 stream->nr = idx;
944 stream->line = 1;
945 stream->newline = 1;
946 stream->whitespace = 0;
947 stream->pos = 0;
949 stream->token = NULL;
950 stream->fd = fd;
951 stream->offset = 0;
952 stream->size = buf_size;
953 stream->buffer = buf;
955 begin = alloc_token(stream);
956 token_type(begin) = TOKEN_STREAMBEGIN;
957 stream->tokenlist = &begin->next;
958 return begin;
961 static struct token *tokenize_stream(stream_t *stream)
963 int c = nextchar(stream);
964 while (c != EOF) {
965 if (!isspace(c)) {
966 struct token *token = alloc_token(stream);
967 stream->token = token;
968 stream->newline = 0;
969 stream->whitespace = 0;
970 c = get_one_token(c, stream);
971 continue;
973 stream->whitespace = 1;
974 c = nextchar(stream);
976 return mark_eof(stream);
979 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
981 stream_t stream;
982 struct token *begin;
984 begin = setup_stream(&stream, 0, -1, buffer, size);
985 *endtoken = tokenize_stream(&stream);
986 return begin;
989 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
991 struct token *begin, *end;
992 stream_t stream;
993 unsigned char buffer[BUFSIZE];
994 int idx;
996 idx = init_stream(name, fd, next_path);
997 if (idx < 0) {
998 // info(endtoken->pos, "File %s is const", name);
999 return endtoken;
1002 begin = setup_stream(&stream, idx, fd, buffer, 0);
1003 end = tokenize_stream(&stream);
1004 if (endtoken)
1005 end->next = endtoken;
1006 return begin;