Sparse 0.4.4-rc2
[smatch.git] / tokenize.c
blobd4f05e5637706c3dd998aa1ef8386f8a8da30d0a
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
17 #include <stdint.h>
19 #include "lib.h"
20 #include "allocate.h"
21 #include "token.h"
22 #include "symbol.h"
/* Value the character readers return once the input is exhausted. */
#define EOF (-1)

/* Size, in bytes, of the read buffer used for file-backed streams. */
#define BUFSIZE (8192)

/* Number of entries of input_streams[] that are in use. */
int input_stream_nr;

/* Table of every stream registered so far; grown by init_stream(). */
struct stream *input_streams;

/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;

/* Width of a tab stop, used when advancing the column for '\t'. */
unsigned int tabstop = 8;
/*
 * State of one stream while it is being tokenized.
 */
typedef struct {
	int fd;				/* descriptor to refill from; negative: no refills */
	int offset;			/* index of the next unread byte in buffer */
	int size;			/* number of valid bytes in buffer */
	int pos;			/* current column */
	int line;			/* current line number */
	int nr;				/* stream index recorded in token positions */
	int newline;			/* pending "newline seen" flag */
	int whitespace;			/* pending "whitespace seen" flag */
	struct token **tokenlist;	/* link to store the next finished token in */
	struct token *token;		/* token currently being built, if any */
	unsigned char *buffer;		/* input bytes */
} stream_t;
42 const char *stream_name(int stream)
44 if (stream < 0 || stream > input_stream_nr)
45 return "<bad stream>";
46 return input_streams[stream].name;
49 static struct position stream_pos(stream_t *stream)
51 struct position pos;
52 pos.type = 0;
53 pos.stream = stream->nr;
54 pos.newline = stream->newline;
55 pos.whitespace = stream->whitespace;
56 pos.pos = stream->pos;
57 pos.line = stream->line;
58 pos.noexpand = 0;
59 return pos;
62 const char *show_special(int val)
64 static char buffer[4];
66 buffer[0] = val;
67 buffer[1] = 0;
68 if (val >= SPECIAL_BASE)
69 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
70 return buffer;
73 const char *show_ident(const struct ident *ident)
75 static char buffer[256];
76 if (!ident)
77 return "<noident>";
78 sprintf(buffer, "%.*s", ident->len, ident->name);
79 return buffer;
/*
 * Append the source spelling of character 'c' at 'ptr' and return the
 * position just past what was written.
 *
 * Printable characters are copied, preceded by a backslash when they
 * are a backslash or the 'escape' (quote) character.  Newline and tab
 * become \n and \t.  Anything else becomes an octal escape, padded to
 * three digits when the following character 'next' is a digit.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	char *out = ptr;

	if (isprint(c)) {
		if (c == '\\' || c == escape)
			*out++ = '\\';
		*out++ = c;
		return out;
	}

	*out++ = '\\';
	if (c == '\n')
		*out++ = 'n';
	else if (c == '\t')
		*out++ = 't';
	else if (isdigit(next))
		out += sprintf(out, "%03o", c);
	else
		out += sprintf(out, "%o", c);
	return out;
}
105 const char *show_string(const struct string *string)
107 static char buffer[4 * MAX_STRING + 3];
108 char *ptr;
109 int i;
111 if (!string->length)
112 return "<bad_string>";
113 ptr = buffer;
114 *ptr++ = '"';
115 for (i = 0; i < string->length-1; i++) {
116 const char *p = string->data + i;
117 ptr = charstr(ptr, p[0], '"', p[1]);
119 *ptr++ = '"';
120 *ptr = '\0';
121 return buffer;
124 const char *show_token(const struct token *token)
126 static char buffer[256];
128 if (!token)
129 return "<no token>";
130 switch (token_type(token)) {
131 case TOKEN_ERROR:
132 return "syntax error";
134 case TOKEN_EOF:
135 return "end-of-input";
137 case TOKEN_IDENT:
138 return show_ident(token->ident);
140 case TOKEN_STRING:
141 case TOKEN_WIDE_STRING:
142 return show_string(token->string);
144 case TOKEN_NUMBER:
145 return token->number;
147 case TOKEN_SPECIAL:
148 return show_special(token->special);
150 case TOKEN_CHAR:
151 case TOKEN_WIDE_CHAR: {
152 char *ptr = buffer;
153 int c = token->character;
154 *ptr++ = '\'';
155 ptr = charstr(ptr, c, '\'', 0);
156 *ptr++ = '\'';
157 *ptr++ = '\0';
158 return buffer;
161 case TOKEN_STREAMBEGIN:
162 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
163 return buffer;
165 case TOKEN_STREAMEND:
166 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
167 return buffer;
169 case TOKEN_UNTAINT:
170 sprintf(buffer, "<untaint>");
171 return buffer;
173 case TOKEN_ARG_COUNT:
174 sprintf(buffer, "<argcnt>");
175 return buffer;
177 default:
178 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
179 return buffer;
#define HASHED_INPUT_BITS (6)
#define HASHED_INPUT (1 << HASHED_INPUT_BITS)
#define HASH_PRIME 0x9e370001UL

/* One slot per hash value; every slot starts out as -1. */
static int input_stream_hashes[HASHED_INPUT] = { [0 ... HASHED_INPUT-1] = -1 };

/*
 * Hash a stream name and return a pointer to the matching slot of
 * input_stream_hashes[].
 */
int *hash_stream(const char *name)
{
	const unsigned char *s = (const unsigned char *)name;
	uint32_t hash = 0;

	for (; *s != 0; s++)
		hash = (hash + (*s << 4) + (*s >> 4)) * 11;

	hash *= HASH_PRIME;
	/* Keep only the top HASHED_INPUT_BITS bits as the slot index. */
	hash >>= 32 - HASHED_INPUT_BITS;
	return &input_stream_hashes[hash];
}
202 int init_stream(const char *name, int fd, const char **next_path)
204 int stream = input_stream_nr, *hash;
205 struct stream *current;
207 if (stream >= input_streams_allocated) {
208 int newalloc = stream * 4 / 3 + 10;
209 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
210 if (!input_streams)
211 die("Unable to allocate more streams space");
212 input_streams_allocated = newalloc;
214 current = input_streams + stream;
215 memset(current, 0, sizeof(*current));
216 current->name = name;
217 current->fd = fd;
218 current->next_path = next_path;
219 current->path = NULL;
220 current->constant = CONSTANT_FILE_MAYBE;
221 input_stream_nr = stream+1;
222 hash = hash_stream(name);
223 current->next_stream = *hash;
224 *hash = stream;
225 return stream;
228 static struct token * alloc_token(stream_t *stream)
230 struct token *token = __alloc_token(0);
231 token->pos = stream_pos(stream);
232 return token;
236 * Argh... That was surprisingly messy - handling '\r' complicates the
237 * things a _lot_.
239 static int nextchar_slow(stream_t *stream)
241 int offset = stream->offset;
242 int size = stream->size;
243 int c;
244 int spliced = 0, had_cr, had_backslash, complain;
246 restart:
247 had_cr = had_backslash = complain = 0;
249 repeat:
250 if (offset >= size) {
251 if (stream->fd < 0)
252 goto got_eof;
253 size = read(stream->fd, stream->buffer, BUFSIZE);
254 if (size <= 0)
255 goto got_eof;
256 stream->size = size;
257 stream->offset = offset = 0;
260 c = stream->buffer[offset++];
262 if (had_cr && c != '\n')
263 complain = 1;
265 if (c == '\r') {
266 had_cr = 1;
267 goto repeat;
270 stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;
272 if (c == '\n') {
273 stream->line++;
274 stream->pos = 0;
277 if (!had_backslash) {
278 if (c == '\\') {
279 had_backslash = 1;
280 goto repeat;
282 if (c == '\n')
283 stream->newline = 1;
284 } else {
285 if (c == '\n') {
286 if (complain)
287 warning(stream_pos(stream), "non-ASCII data stream");
288 spliced = 1;
289 goto restart;
291 stream->pos--;
292 offset--;
293 c = '\\';
296 out:
297 stream->offset = offset;
298 if (complain)
299 warning(stream_pos(stream), "non-ASCII data stream");
301 return c;
303 got_eof:
304 if (had_backslash) {
305 c = '\\';
306 goto out;
308 if (stream->pos)
309 warning(stream_pos(stream), "no newline at end of file");
310 else if (had_cr)
311 warning(stream_pos(stream), "non-ASCII data stream");
312 else if (spliced)
313 warning(stream_pos(stream), "backslash-newline at end of file");
314 return EOF;
318 * We want that as light as possible while covering all normal cases.
319 * Slow path (including the logics with line-splicing and EOF sanity
320 * checks) is in nextchar_slow().
322 static inline int nextchar(stream_t *stream)
324 int offset = stream->offset;
326 if (offset < stream->size) {
327 int c = stream->buffer[offset++];
328 static const char special[256] = {
329 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
331 if (!special[c]) {
332 stream->offset = offset;
333 stream->pos++;
334 return c;
337 return nextchar_slow(stream);
340 struct token eof_token_entry;
342 static struct token *mark_eof(stream_t *stream)
344 struct token *end;
346 end = alloc_token(stream);
347 token_type(end) = TOKEN_STREAMEND;
348 end->pos.newline = 1;
350 eof_token_entry.next = &eof_token_entry;
351 eof_token_entry.pos.newline = 1;
353 end->next = &eof_token_entry;
354 *stream->tokenlist = end;
355 stream->tokenlist = NULL;
356 return end;
359 static void add_token(stream_t *stream)
361 struct token *token = stream->token;
363 stream->token = NULL;
364 token->next = NULL;
365 *stream->tokenlist = token;
366 stream->tokenlist = &token->next;
369 static void drop_token(stream_t *stream)
371 stream->newline |= stream->token->pos.newline;
372 stream->whitespace |= stream->token->pos.whitespace;
373 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter = 1,
	Digit = 2,
	Hex = 4,
	Exp = 8,
	Dot = 16,
	ValidSecond = 32,
};

/*
 * Class of each input character.  The table is indexed by character
 * value plus one, so that EOF (-1) lands on entry 0, which has no
 * class bits.
 */
#define CCLASS_AT(ch) ((ch) + 1)
static const long cclass[257] = {
	[CCLASS_AT('0') ... CCLASS_AT('9')] = Digit | Hex,
	[CCLASS_AT('A') ... CCLASS_AT('D')] = Letter | Hex,
	[CCLASS_AT('E')] = Letter | Hex | Exp,
	[CCLASS_AT('F')] = Letter | Hex,
	[CCLASS_AT('G') ... CCLASS_AT('O')] = Letter,
	[CCLASS_AT('P')] = Letter | Exp,
	[CCLASS_AT('Q') ... CCLASS_AT('Z')] = Letter,
	[CCLASS_AT('a') ... CCLASS_AT('d')] = Letter | Hex,
	[CCLASS_AT('e')] = Letter | Hex | Exp,
	[CCLASS_AT('f')] = Letter | Hex,
	[CCLASS_AT('g') ... CCLASS_AT('o')] = Letter,
	[CCLASS_AT('p')] = Letter | Exp,
	[CCLASS_AT('q') ... CCLASS_AT('z')] = Letter,
	[CCLASS_AT('_')] = Letter,
	/* Characters that can be the second half of a two-character punctuator. */
	[CCLASS_AT('.')] = Dot | ValidSecond,
	[CCLASS_AT('=')] = ValidSecond,
	[CCLASS_AT('+')] = ValidSecond,
	[CCLASS_AT('-')] = ValidSecond,
	[CCLASS_AT('>')] = ValidSecond,
	[CCLASS_AT('<')] = ValidSecond,
	[CCLASS_AT('&')] = ValidSecond,
	[CCLASS_AT('|')] = ValidSecond,
	[CCLASS_AT('#')] = ValidSecond,
};
#undef CCLASS_AT
412 * pp-number:
413 * digit
414 * . digit
415 * pp-number digit
416 * pp-number identifier-nodigit
417 * pp-number e sign
418 * pp-number E sign
419 * pp-number p sign
420 * pp-number P sign
421 * pp-number .
423 static int get_one_number(int c, int next, stream_t *stream)
425 struct token *token;
426 static char buffer[4095];
427 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
428 int len;
430 *p++ = c;
431 for (;;) {
432 long class = cclass[next + 1];
433 if (!(class & (Dot | Digit | Letter)))
434 break;
435 if (p != buffer_end)
436 *p++ = next;
437 next = nextchar(stream);
438 if (class & Exp) {
439 if (next == '-' || next == '+') {
440 if (p != buffer_end)
441 *p++ = next;
442 next = nextchar(stream);
447 if (p == buffer_end) {
448 sparse_error(stream_pos(stream), "number token exceeds %td characters",
449 buffer_end - buffer);
450 // Pretend we saw just "1".
451 buffer[0] = '1';
452 p = buffer + 1;
455 *p++ = 0;
456 len = p - buffer;
457 buf = __alloc_bytes(len);
458 memcpy(buf, buffer, len);
460 token = stream->token;
461 token_type(token) = TOKEN_NUMBER;
462 token->number = buf;
463 add_token(stream);
465 return next;
468 static int escapechar(int first, int type, stream_t *stream, int *valp)
470 int next, value;
472 next = nextchar(stream);
473 value = first;
475 if (first == '\n')
476 warning(stream_pos(stream), "Newline in string or character constant");
478 if (first == '\\' && next != EOF) {
479 value = next;
480 next = nextchar(stream);
481 if (value != type) {
482 switch (value) {
483 case 'a':
484 value = '\a';
485 break;
486 case 'b':
487 value = '\b';
488 break;
489 case 't':
490 value = '\t';
491 break;
492 case 'n':
493 value = '\n';
494 break;
495 case 'v':
496 value = '\v';
497 break;
498 case 'f':
499 value = '\f';
500 break;
501 case 'r':
502 value = '\r';
503 break;
504 case 'e':
505 value = '\e';
506 break;
507 case '\\':
508 break;
509 case '?':
510 break;
511 case '\'':
512 break;
513 case '"':
514 break;
515 case '\n':
516 warning(stream_pos(stream), "Newline in string or character constant");
517 break;
518 case '0'...'7': {
519 int nr = 2;
520 value -= '0';
521 while (next >= '0' && next <= '7') {
522 value = (value << 3) + (next-'0');
523 next = nextchar(stream);
524 if (!--nr)
525 break;
527 value &= 0xff;
528 break;
530 case 'x': {
531 int hex = hexval(next);
532 if (hex < 16) {
533 value = hex;
534 next = nextchar(stream);
535 while ((hex = hexval(next)) < 16) {
536 value = (value << 4) + hex;
537 next = nextchar(stream);
539 value &= 0xff;
540 break;
543 /* Fall through */
544 default:
545 warning(stream_pos(stream), "Unknown escape '%c'", value);
548 /* Mark it as escaped */
549 value |= 0x100;
551 *valp = value;
552 return next;
555 static int get_char_token(int next, stream_t *stream, enum token_type type)
557 int value;
558 struct token *token;
560 next = escapechar(next, '\'', stream, &value);
561 if (value == '\'' || next != '\'') {
562 sparse_error(stream_pos(stream), "Bad character constant");
563 drop_token(stream);
564 return next;
567 token = stream->token;
568 token_type(token) = type;
569 token->character = value & 0xff;
571 add_token(stream);
572 return nextchar(stream);
575 static int get_string_token(int next, stream_t *stream, enum token_type type)
577 static char buffer[MAX_STRING];
578 struct string *string;
579 struct token *token;
580 int len = 0;
582 for (;;) {
583 int val;
584 next = escapechar(next, '"', stream, &val);
585 if (val == '"')
586 break;
587 if (next == EOF) {
588 warning(stream_pos(stream), "End of file in middle of string");
589 return next;
591 if (len < MAX_STRING)
592 buffer[len] = val;
593 len++;
596 if (len > MAX_STRING) {
597 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
598 len = MAX_STRING;
601 string = __alloc_string(len+1);
602 memcpy(string->data, buffer, len);
603 string->data[len] = '\0';
604 string->length = len+1;
606 /* Pass it on.. */
607 token = stream->token;
608 token_type(token) = type;
609 token->string = string;
610 add_token(stream);
612 return next;
615 static int drop_stream_eoln(stream_t *stream)
617 drop_token(stream);
618 for (;;) {
619 switch (nextchar(stream)) {
620 case EOF:
621 return EOF;
622 case '\n':
623 return nextchar(stream);
628 static int drop_stream_comment(stream_t *stream)
630 int newline;
631 int next;
632 drop_token(stream);
633 newline = stream->newline;
635 next = nextchar(stream);
636 for (;;) {
637 int curr = next;
638 if (curr == EOF) {
639 warning(stream_pos(stream), "End of file in the middle of a comment");
640 return curr;
642 next = nextchar(stream);
643 if (curr == '*' && next == '/')
644 break;
646 stream->newline = newline;
647 return nextchar(stream);
650 unsigned char combinations[][4] = COMBINATION_STRINGS;
652 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
654 /* hash function for two-character punctuators - all give unique values */
655 #define special_hash(c0, c1) (((c0*8+c1*2)+((c0*8+c1*2)>>5))&31)
658 * note that we won't get false positives - special_hash(0,0) is 0 and
659 * entry 0 is filled (by +=), so all the missing ones are OK.
661 static unsigned char hash_results[32][2] = {
662 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
663 RES('+', '='), /* 00 */
664 RES('/', '='), /* 01 */
665 RES('^', '='), /* 05 */
666 RES('&', '&'), /* 07 */
667 RES('#', '#'), /* 08 */
668 RES('<', '<'), /* 0a */
669 RES('<', '='), /* 0c */
670 RES('!', '='), /* 0e */
671 RES('%', '='), /* 0f */
672 RES('-', '-'), /* 10 */
673 RES('-', '='), /* 11 */
674 RES('-', '>'), /* 13 */
675 RES('=', '='), /* 15 */
676 RES('&', '='), /* 17 */
677 RES('*', '='), /* 18 */
678 RES('.', '.'), /* 1a */
679 RES('+', '+'), /* 1b */
680 RES('|', '='), /* 1c */
681 RES('>', '='), /* 1d */
682 RES('|', '|'), /* 1e */
683 RES('>', '>') /* 1f */
684 #undef RES
686 static int code[32] = {
687 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
688 CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
689 CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
690 CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
691 CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
692 CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
693 CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
694 CODE('<', '=', SPECIAL_LTE), /* 0c */
695 CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
696 CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
697 CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
698 CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
699 CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
700 CODE('=', '=', SPECIAL_EQUAL), /* 15 */
701 CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
702 CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
703 CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
704 CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
705 CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
706 CODE('>', '=', SPECIAL_GTE), /* 1d */
707 CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
708 CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
709 #undef CODE
712 static int get_one_special(int c, stream_t *stream)
714 struct token *token;
715 int next, value, i;
717 next = nextchar(stream);
720 * Check for numbers, strings, character constants, and comments
722 switch (c) {
723 case '.':
724 if (next >= '0' && next <= '9')
725 return get_one_number(c, next, stream);
726 break;
727 case '"':
728 return get_string_token(next, stream, TOKEN_STRING);
729 case '\'':
730 return get_char_token(next, stream, TOKEN_CHAR);
731 case '/':
732 if (next == '/')
733 return drop_stream_eoln(stream);
734 if (next == '*')
735 return drop_stream_comment(stream);
739 * Check for combinations
741 value = c;
742 if (cclass[next + 1] & ValidSecond) {
743 i = special_hash(c, next);
744 if (hash_results[i][0] == c && hash_results[i][1] == next) {
745 value = code[i];
746 next = nextchar(stream);
747 if (value >= SPECIAL_LEFTSHIFT &&
748 next == "==."[value - SPECIAL_LEFTSHIFT]) {
749 value += 3;
750 next = nextchar(stream);
755 /* Pass it on.. */
756 token = stream->token;
757 token_type(token) = TOKEN_SPECIAL;
758 token->special = value;
759 add_token(stream);
760 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: start from the first character, fold in
 * each further character, then reduce the result to a bucket index.
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers in a bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Lookup counters; hits and misses are reported by show_identifier_stats(). */
static int ident_hit;
static int ident_miss;
static int idents;
774 void show_identifier_stats(void)
776 int i;
777 int distribution[100];
779 fprintf(stderr, "identifiers: %d hits, %d misses\n",
780 ident_hit, ident_miss);
782 for (i = 0; i < 100; i++)
783 distribution[i] = 0;
785 for (i = 0; i < IDENT_HASH_SIZE; i++) {
786 struct ident * ident = hash_table[i];
787 int count = 0;
789 while (ident) {
790 count++;
791 ident = ident->next;
793 if (count > 99)
794 count = 99;
795 distribution[count]++;
798 for (i = 0; i < 100; i++) {
799 if (distribution[i])
800 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
804 static struct ident *alloc_ident(const char *name, int len)
806 struct ident *ident = __alloc_ident(len);
807 ident->symbols = NULL;
808 ident->len = len;
809 ident->tainted = 0;
810 memcpy(ident->name, name, len);
811 return ident;
814 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
816 ident->next = hash_table[hash];
817 hash_table[hash] = ident;
818 ident_miss++;
819 return ident;
822 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
824 struct ident *ident;
825 struct ident **p;
827 p = &hash_table[hash];
828 while ((ident = *p) != NULL) {
829 if (ident->len == (unsigned char) len) {
830 if (strncmp(name, ident->name, len) != 0)
831 goto next;
833 ident_hit++;
834 return ident;
836 next:
837 //misses++;
838 p = &ident->next;
840 ident = alloc_ident(name, len);
841 *p = ident;
842 ident->next = NULL;
843 ident_miss++;
844 idents++;
845 return ident;
/*
 * Compute the identifier hash-table bucket for the first 'len' bytes
 * of 'name'.
 *
 * An empty name (len <= 0) hashes like a lone NUL character and no
 * byte of 'name' is read.  Without this guard the "--len" loop would
 * start from a negative count and run far past the end of the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	unsigned long hash;
	const unsigned char *p = (const unsigned char *)name;

	if (len <= 0)
		return ident_hash_end((unsigned long)ident_hash_init(0));

	hash = ident_hash_init(*p++);
	while (--len > 0) {
		unsigned int i = *p++;
		hash = ident_hash_add(hash, i);
	}
	return ident_hash_end(hash);
}
861 struct ident *hash_ident(struct ident *ident)
863 return insert_hash(ident, hash_name(ident->name, ident->len));
/* Find or create the identifier for the NUL-terminated string 'name'. */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	unsigned long bucket = hash_name(name, len);

	return create_hashed_ident(name, len, bucket);
}
872 struct token *built_in_token(int stream, const char *name)
874 struct token *token;
876 token = __alloc_token(0);
877 token->pos.stream = stream;
878 token_type(token) = TOKEN_IDENT;
879 token->ident = built_in_ident(name);
880 return token;
883 static int get_one_identifier(int c, stream_t *stream)
885 struct token *token;
886 struct ident *ident;
887 unsigned long hash;
888 char buf[256];
889 int len = 1;
890 int next;
892 hash = ident_hash_init(c);
893 buf[0] = c;
894 for (;;) {
895 next = nextchar(stream);
896 if (!(cclass[next + 1] & (Letter | Digit)))
897 break;
898 if (len >= sizeof(buf))
899 break;
900 hash = ident_hash_add(hash, next);
901 buf[len] = next;
902 len++;
904 hash = ident_hash_end(hash);
906 ident = create_hashed_ident(buf, len, hash);
908 if (ident == &L_ident) {
909 if (next == '\'')
910 return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
911 if (next == '\"')
912 return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
915 /* Pass it on.. */
916 token = stream->token;
917 token_type(token) = TOKEN_IDENT;
918 token->ident = ident;
919 add_token(stream);
920 return next;
923 static int get_one_token(int c, stream_t *stream)
925 long class = cclass[c + 1];
926 if (class & Digit)
927 return get_one_number(c, nextchar(stream), stream);
928 if (class & Letter)
929 return get_one_identifier(c, stream);
930 return get_one_special(c, stream);
933 static struct token *setup_stream(stream_t *stream, int idx, int fd,
934 unsigned char *buf, unsigned int buf_size)
936 struct token *begin;
938 stream->nr = idx;
939 stream->line = 1;
940 stream->newline = 1;
941 stream->whitespace = 0;
942 stream->pos = 0;
944 stream->token = NULL;
945 stream->fd = fd;
946 stream->offset = 0;
947 stream->size = buf_size;
948 stream->buffer = buf;
950 begin = alloc_token(stream);
951 token_type(begin) = TOKEN_STREAMBEGIN;
952 stream->tokenlist = &begin->next;
953 return begin;
956 static struct token *tokenize_stream(stream_t *stream)
958 int c = nextchar(stream);
959 while (c != EOF) {
960 if (!isspace(c)) {
961 struct token *token = alloc_token(stream);
962 stream->token = token;
963 stream->newline = 0;
964 stream->whitespace = 0;
965 c = get_one_token(c, stream);
966 continue;
968 stream->whitespace = 1;
969 c = nextchar(stream);
971 return mark_eof(stream);
974 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
976 stream_t stream;
977 struct token *begin;
979 begin = setup_stream(&stream, 0, -1, buffer, size);
980 *endtoken = tokenize_stream(&stream);
981 return begin;
984 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
986 struct token *begin, *end;
987 stream_t stream;
988 unsigned char buffer[BUFSIZE];
989 int idx;
991 idx = init_stream(name, fd, next_path);
992 if (idx < 0) {
993 // info(endtoken->pos, "File %s is const", name);
994 return endtoken;
997 begin = setup_stream(&stream, idx, fd, buffer, 0);
998 end = tokenize_stream(&stream);
999 if (endtoken)
1000 end->next = endtoken;
1001 return begin;