flow: fix segfault on parse error
[smatch.git] / tokenize.c
blob68f9bb6726b83e74d03453bf608f2018058a3a72
1 /*
2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
6 * 2003 Linus Torvalds
8 * Licensed under the Open Software License version 1.1
9 */
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <stdarg.h>
13 #include <stddef.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <unistd.h>
18 #include "lib.h"
19 #include "allocate.h"
20 #include "token.h"
21 #include "symbol.h"
/* End-of-input marker returned by the character readers in this file. */
#define EOF (-1)

/* Number of stream slots handed out so far by init_stream(). */
int input_stream_nr = 0;
/* Growable table of stream descriptors, indexed by stream number. */
struct stream *input_streams;
/* Number of entries input_streams[] currently has room for. */
static int input_streams_allocated;
/* Tab width used when advancing the column position over a '\t'. */
unsigned int tabstop = 8;
/* When non-zero, stream_pos() reports a fixed placeholder line number. */
int no_lineno = 0;

/* Size of the read buffer used by tokenize() and nextchar_slow(). */
#define BUFSIZE (8192)
/* Working state of the tokenizer for one input stream. */
typedef struct {
	/* input descriptor (< 0: no refills), read cursor and fill level of buffer */
	int fd, offset, size;
	/* current column, current line, stream number */
	int pos, line, nr;
	/* flags copied into the position of the next token */
	int newline, whitespace;
	/* link through which the next finished token is appended */
	struct token **tokenlist;
	/* token currently being built, NULL when none */
	struct token *token;
	/* raw input bytes */
	unsigned char *buffer;
} stream_t;
42 const char *stream_name(int stream)
44 if (stream < 0 || stream > input_stream_nr)
45 return "<bad stream>";
46 return input_streams[stream].name;
49 static struct position stream_pos(stream_t *stream)
51 struct position pos;
52 pos.type = 0;
53 pos.stream = stream->nr;
54 pos.newline = stream->newline;
55 pos.whitespace = stream->whitespace;
56 pos.pos = stream->pos;
58 pos.line = stream->line;
59 if (no_lineno)
60 pos.line = 123456;
62 pos.noexpand = 0;
63 return pos;
66 const char *show_special(int val)
68 static char buffer[4];
70 buffer[0] = val;
71 buffer[1] = 0;
72 if (val >= SPECIAL_BASE)
73 strcpy(buffer, (char *) combinations[val - SPECIAL_BASE]);
74 return buffer;
77 const char *show_ident(const struct ident *ident)
79 static char buffer[256];
80 if (!ident)
81 return "<noident>";
82 sprintf(buffer, "%.*s", ident->len, ident->name);
83 return buffer;
/*
 * Append the source-level spelling of character @c at @ptr and return the
 * new end of the output (no NUL terminator is written).
 *
 * Printable characters are copied, preceded by a backslash when they are
 * a backslash or the @escape character.  Newline and tab become \n and \t.
 * Everything else becomes an octal escape, padded to three digits when the
 * following character @next is a digit.
 */
static char *charstr(char *ptr, unsigned char c, unsigned char escape, unsigned char next)
{
	char *out = ptr;

	if (isprint(c)) {
		int quote_it = (c == '\\' || c == escape);

		if (quote_it)
			*out++ = '\\';
		*out++ = c;
		return out;
	}

	*out++ = '\\';
	if (c == '\n') {
		*out++ = 'n';
		return out;
	}
	if (c == '\t') {
		*out++ = 't';
		return out;
	}

	if (isdigit(next))
		out += sprintf(out, "%03o", c);
	else
		out += sprintf(out, "%o", c);
	return out;
}
109 const char *show_string(const struct string *string)
111 static char buffer[4 * MAX_STRING + 3];
112 char *ptr;
113 int i;
115 if (!string->length)
116 return "<bad_string>";
117 ptr = buffer;
118 *ptr++ = '"';
119 for (i = 0; i < string->length-1; i++) {
120 const char *p = string->data + i;
121 ptr = charstr(ptr, p[0], '"', p[1]);
123 *ptr++ = '"';
124 *ptr = '\0';
125 return buffer;
128 const char *show_token(const struct token *token)
130 static char buffer[256];
132 if (!token)
133 return "<no token>";
134 switch (token_type(token)) {
135 case TOKEN_ERROR:
136 return "syntax error";
138 case TOKEN_EOF:
139 return "end-of-input";
141 case TOKEN_IDENT:
142 return show_ident(token->ident);
144 case TOKEN_STRING:
145 case TOKEN_WIDE_STRING:
146 return show_string(token->string);
148 case TOKEN_NUMBER:
149 return token->number;
151 case TOKEN_SPECIAL:
152 return show_special(token->special);
154 case TOKEN_CHAR:
155 case TOKEN_WIDE_CHAR: {
156 char *ptr = buffer;
157 int c = token->character;
158 *ptr++ = '\'';
159 ptr = charstr(ptr, c, '\'', 0);
160 *ptr++ = '\'';
161 *ptr++ = '\0';
162 return buffer;
165 case TOKEN_STREAMBEGIN:
166 sprintf(buffer, "<beginning of '%s'>", stream_name(token->pos.stream));
167 return buffer;
169 case TOKEN_STREAMEND:
170 sprintf(buffer, "<end of '%s'>", stream_name(token->pos.stream));
171 return buffer;
173 case TOKEN_UNTAINT:
174 sprintf(buffer, "<untaint>");
175 return buffer;
177 case TOKEN_ARG_COUNT:
178 sprintf(buffer, "<argcnt>");
179 return buffer;
181 default:
182 sprintf(buffer, "unhandled token type '%d' ", token_type(token));
183 return buffer;
187 int init_stream(const char *name, int fd, const char **next_path)
189 int stream = input_stream_nr;
190 struct stream *current;
192 if (stream >= input_streams_allocated) {
193 int newalloc = stream * 4 / 3 + 10;
194 input_streams = realloc(input_streams, newalloc * sizeof(struct stream));
195 if (!input_streams)
196 die("Unable to allocate more streams space");
197 input_streams_allocated = newalloc;
199 current = input_streams + stream;
200 memset(current, 0, sizeof(*current));
201 current->name = name;
202 current->fd = fd;
203 current->next_path = next_path;
204 current->path = NULL;
205 current->constant = CONSTANT_FILE_MAYBE;
206 input_stream_nr = stream+1;
207 return stream;
210 static struct token * alloc_token(stream_t *stream)
212 struct token *token = __alloc_token(0);
213 token->pos = stream_pos(stream);
214 return token;
218 * Argh... That was surprisingly messy - handling '\r' complicates the
219 * things a _lot_.
221 static int nextchar_slow(stream_t *stream)
223 int offset = stream->offset;
224 int size = stream->size;
225 int c;
226 int spliced = 0, had_cr, had_backslash, complain;
228 restart:
229 had_cr = had_backslash = complain = 0;
231 repeat:
232 if (offset >= size) {
233 if (stream->fd < 0)
234 goto got_eof;
235 size = read(stream->fd, stream->buffer, BUFSIZE);
236 if (size <= 0)
237 goto got_eof;
238 stream->size = size;
239 stream->offset = offset = 0;
242 c = stream->buffer[offset++];
244 if (had_cr && c != '\n')
245 complain = 1;
247 if (c == '\r') {
248 had_cr = 1;
249 goto repeat;
252 stream->pos += (c == '\t') ? (tabstop - stream->pos % tabstop) : 1;
254 if (c == '\n') {
255 stream->line++;
256 stream->pos = 0;
259 if (!had_backslash) {
260 if (c == '\\') {
261 had_backslash = 1;
262 goto repeat;
264 if (c == '\n')
265 stream->newline = 1;
266 } else {
267 if (c == '\n') {
268 if (complain)
269 warning(stream_pos(stream), "non-ASCII data stream");
270 spliced = 1;
271 goto restart;
273 stream->pos--;
274 offset--;
275 c = '\\';
278 out:
279 stream->offset = offset;
280 if (complain)
281 warning(stream_pos(stream), "non-ASCII data stream");
283 return c;
285 got_eof:
286 if (had_backslash) {
287 c = '\\';
288 goto out;
290 if (stream->pos)
291 warning(stream_pos(stream), "no newline at end of file");
292 else if (had_cr)
293 warning(stream_pos(stream), "non-ASCII data stream");
294 else if (spliced)
295 warning(stream_pos(stream), "backslash-newline at end of file");
296 return EOF;
300 * We want that as light as possible while covering all normal cases.
301 * Slow path (including the logics with line-splicing and EOF sanity
302 * checks) is in nextchar_slow().
304 static inline int nextchar(stream_t *stream)
306 int offset = stream->offset;
308 if (offset < stream->size) {
309 int c = stream->buffer[offset++];
310 static const char special[256] = {
311 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
313 if (!special[c]) {
314 stream->offset = offset;
315 stream->pos++;
316 return c;
319 return nextchar_slow(stream);
322 struct token eof_token_entry;
324 static struct token *mark_eof(stream_t *stream)
326 struct token *end;
328 end = alloc_token(stream);
329 token_type(end) = TOKEN_STREAMEND;
330 end->pos.newline = 1;
332 eof_token_entry.next = &eof_token_entry;
333 eof_token_entry.pos.newline = 1;
335 end->next = &eof_token_entry;
336 *stream->tokenlist = end;
337 stream->tokenlist = NULL;
338 return end;
341 static void add_token(stream_t *stream)
343 struct token *token = stream->token;
345 stream->token = NULL;
346 token->next = NULL;
347 *stream->tokenlist = token;
348 stream->tokenlist = &token->next;
351 static void drop_token(stream_t *stream)
353 stream->newline |= stream->token->pos.newline;
354 stream->whitespace |= stream->token->pos.whitespace;
355 stream->token = NULL;
/* Character class bits stored in cclass[]. */
enum {
	Letter = 1,		/* may appear in an identifier */
	Digit = 2,		/* decimal digit */
	Hex = 4,		/* hexadecimal digit */
	Exp = 8,		/* exponent letter; a following sign stays in the number */
	Dot = 16,		/* '.' */
	ValidSecond = 32,	/* may be the second character of a punctuator */
};
/*
 * Character class table.  It is indexed by "character + 1" so that EOF
 * (-1) lands on entry 0, which carries no class bits.
 */
static const long cclass[257] = {
	['0' + 1 ... '9' + 1] = Digit | Hex,
	['A' + 1 ... 'D' + 1] = Letter | Hex,
	['E' + 1] = Letter | Hex | Exp,
	['F' + 1] = Letter | Hex,
	['G' + 1 ... 'O' + 1] = Letter,
	['P' + 1] = Letter | Exp,
	['Q' + 1 ... 'Z' + 1] = Letter,
	['a' + 1 ... 'd' + 1] = Letter | Hex,
	['e' + 1] = Letter | Hex | Exp,
	['f' + 1] = Letter | Hex,
	['g' + 1 ... 'o' + 1] = Letter,
	['p' + 1] = Letter | Exp,
	['q' + 1 ... 'z' + 1] = Letter,
	['_' + 1] = Letter,
	['.' + 1] = Dot | ValidSecond,
	['=' + 1] = ValidSecond,
	['+' + 1] = ValidSecond,
	['-' + 1] = ValidSecond,
	['>' + 1] = ValidSecond,
	['<' + 1] = ValidSecond,
	['&' + 1] = ValidSecond,
	['|' + 1] = ValidSecond,
	['#' + 1] = ValidSecond,
};
394 * pp-number:
395 * digit
396 * . digit
397 * pp-number digit
398 * pp-number identifier-nodigit
399 * pp-number e sign
400 * pp-number E sign
401 * pp-number p sign
402 * pp-number P sign
403 * pp-number .
405 static int get_one_number(int c, int next, stream_t *stream)
407 struct token *token;
408 static char buffer[4095];
409 char *p = buffer, *buf, *buffer_end = buffer + sizeof (buffer);
410 int len;
412 *p++ = c;
413 for (;;) {
414 long class = cclass[next + 1];
415 if (!(class & (Dot | Digit | Letter)))
416 break;
417 if (p != buffer_end)
418 *p++ = next;
419 next = nextchar(stream);
420 if (class & Exp) {
421 if (next == '-' || next == '+') {
422 if (p != buffer_end)
423 *p++ = next;
424 next = nextchar(stream);
429 if (p == buffer_end) {
430 sparse_error(stream_pos(stream), "number token exceeds %td characters",
431 buffer_end - buffer);
432 // Pretend we saw just "1".
433 buffer[0] = '1';
434 p = buffer + 1;
437 *p++ = 0;
438 len = p - buffer;
439 buf = __alloc_bytes(len);
440 memcpy(buf, buffer, len);
442 token = stream->token;
443 token_type(token) = TOKEN_NUMBER;
444 token->number = buf;
445 add_token(stream);
447 return next;
450 static int escapechar(int first, int type, stream_t *stream, int *valp)
452 int next, value;
454 next = nextchar(stream);
455 value = first;
457 if (first == '\n')
458 warning(stream_pos(stream), "Newline in string or character constant");
460 if (first == '\\' && next != EOF) {
461 value = next;
462 next = nextchar(stream);
463 if (value != type) {
464 switch (value) {
465 case 'a':
466 value = '\a';
467 break;
468 case 'b':
469 value = '\b';
470 break;
471 case 't':
472 value = '\t';
473 break;
474 case 'n':
475 value = '\n';
476 break;
477 case 'v':
478 value = '\v';
479 break;
480 case 'f':
481 value = '\f';
482 break;
483 case 'r':
484 value = '\r';
485 break;
486 case 'e':
487 value = '\e';
488 break;
489 case '\\':
490 break;
491 case '?':
492 break;
493 case '\'':
494 break;
495 case '"':
496 break;
497 case '\n':
498 warning(stream_pos(stream), "Newline in string or character constant");
499 break;
500 case '0'...'7': {
501 int nr = 2;
502 value -= '0';
503 while (next >= '0' && next <= '9') {
504 value = (value << 3) + (next-'0');
505 next = nextchar(stream);
506 if (!--nr)
507 break;
509 value &= 0xff;
510 break;
512 case 'x': {
513 int hex = hexval(next);
514 if (hex < 16) {
515 value = hex;
516 next = nextchar(stream);
517 while ((hex = hexval(next)) < 16) {
518 value = (value << 4) + hex;
519 next = nextchar(stream);
521 value &= 0xff;
522 break;
525 /* Fall through */
526 default:
527 warning(stream_pos(stream), "Unknown escape '%c'", value);
530 /* Mark it as escaped */
531 value |= 0x100;
533 *valp = value;
534 return next;
537 static int get_char_token(int next, stream_t *stream, enum token_type type)
539 int value;
540 struct token *token;
542 next = escapechar(next, '\'', stream, &value);
543 if (value == '\'' || next != '\'') {
544 sparse_error(stream_pos(stream), "Bad character constant");
545 drop_token(stream);
546 return next;
549 token = stream->token;
550 token_type(token) = type;
551 token->character = value & 0xff;
553 add_token(stream);
554 return nextchar(stream);
557 static int get_string_token(int next, stream_t *stream, enum token_type type)
559 static char buffer[MAX_STRING];
560 struct string *string;
561 struct token *token;
562 int len = 0;
564 for (;;) {
565 int val;
566 next = escapechar(next, '"', stream, &val);
567 if (val == '"')
568 break;
569 if (next == EOF) {
570 warning(stream_pos(stream), "End of file in middle of string");
571 return next;
573 if (len < MAX_STRING)
574 buffer[len] = val;
575 len++;
578 if (len > MAX_STRING) {
579 warning(stream_pos(stream), "string too long (%d bytes, %d bytes max)", len, MAX_STRING);
580 len = MAX_STRING;
583 string = __alloc_string(len+1);
584 memcpy(string->data, buffer, len);
585 string->data[len] = '\0';
586 string->length = len+1;
588 /* Pass it on.. */
589 token = stream->token;
590 token_type(token) = type;
591 token->string = string;
592 add_token(stream);
594 return next;
597 static int drop_stream_eoln(stream_t *stream)
599 drop_token(stream);
600 for (;;) {
601 switch (nextchar(stream)) {
602 case EOF:
603 return EOF;
604 case '\n':
605 return nextchar(stream);
610 static int drop_stream_comment(stream_t *stream)
612 int newline;
613 int next;
614 drop_token(stream);
615 newline = stream->newline;
617 next = nextchar(stream);
618 for (;;) {
619 int curr = next;
620 if (curr == EOF) {
621 warning(stream_pos(stream), "End of file in the middle of a comment");
622 return curr;
624 next = nextchar(stream);
625 if (curr == '*' && next == '/')
626 break;
628 stream->newline = newline;
629 return nextchar(stream);
/* Spellings of the multi-character punctuators, indexed by "code - SPECIAL_BASE". */
unsigned char combinations[][4] = COMBINATION_STRINGS;
#define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)

/*
 * Hash function for two-character punctuators - all of the pairs that
 * form a punctuator give unique values in the range 0..31.
 * The arguments are fully parenthesized so that expression arguments
 * are hashed as a whole.
 */
#define special_hash(c0, c1) ((((c0)*8+(c1)*2)+(((c0)*8+(c1)*2)>>5))&31)
/*
 * Character pairs of the two-character punctuators, stored at the index
 * given by special_hash() so that a lookup can verify its candidate.
 *
 * note that we won't get false positives - special_hash(0,0) is 0 and
 * entry 0 is filled (by +=), so all the missing ones are OK.
 */
static unsigned char hash_results[32][2] = {
#define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
	RES('+', '='), /* 00 */
	RES('/', '='), /* 01 */
	RES('^', '='), /* 05 */
	RES('&', '&'), /* 07 */
	RES('#', '#'), /* 08 */
	RES('<', '<'), /* 0a */
	RES('<', '='), /* 0c */
	RES('!', '='), /* 0e */
	RES('%', '='), /* 0f */
	RES('-', '-'), /* 10 */
	RES('-', '='), /* 11 */
	RES('-', '>'), /* 13 */
	RES('=', '='), /* 15 */
	RES('&', '='), /* 17 */
	RES('*', '='), /* 18 */
	RES('.', '.'), /* 1a */
	RES('+', '+'), /* 1b */
	RES('|', '='), /* 1c */
	RES('>', '='), /* 1d */
	RES('|', '|'), /* 1e */
	RES('>', '>') /* 1f */
#undef RES
};
/* Punctuator code for each pair in hash_results[], at the same index. */
static int code[32] = {
#define CODE(c0, c1, value) [special_hash(c0, c1)] = value
	CODE('+', '=', SPECIAL_ADD_ASSIGN), /* 00 */
	CODE('/', '=', SPECIAL_DIV_ASSIGN), /* 01 */
	CODE('^', '=', SPECIAL_XOR_ASSIGN), /* 05 */
	CODE('&', '&', SPECIAL_LOGICAL_AND), /* 07 */
	CODE('#', '#', SPECIAL_HASHHASH), /* 08 */
	CODE('<', '<', SPECIAL_LEFTSHIFT), /* 0a */
	CODE('<', '=', SPECIAL_LTE), /* 0c */
	CODE('!', '=', SPECIAL_NOTEQUAL), /* 0e */
	CODE('%', '=', SPECIAL_MOD_ASSIGN), /* 0f */
	CODE('-', '-', SPECIAL_DECREMENT), /* 10 */
	CODE('-', '=', SPECIAL_SUB_ASSIGN), /* 11 */
	CODE('-', '>', SPECIAL_DEREFERENCE), /* 13 */
	CODE('=', '=', SPECIAL_EQUAL), /* 15 */
	CODE('&', '=', SPECIAL_AND_ASSIGN), /* 17 */
	CODE('*', '=', SPECIAL_MUL_ASSIGN), /* 18 */
	CODE('.', '.', SPECIAL_DOTDOT), /* 1a */
	CODE('+', '+', SPECIAL_INCREMENT), /* 1b */
	CODE('|', '=', SPECIAL_OR_ASSIGN), /* 1c */
	CODE('>', '=', SPECIAL_GTE), /* 1d */
	CODE('|', '|', SPECIAL_LOGICAL_OR), /* 1e */
	CODE('>', '>', SPECIAL_RIGHTSHIFT) /* 1f */
#undef CODE
};
694 static int get_one_special(int c, stream_t *stream)
696 struct token *token;
697 int next, value, i;
699 next = nextchar(stream);
702 * Check for numbers, strings, character constants, and comments
704 switch (c) {
705 case '.':
706 if (next >= '0' && next <= '9')
707 return get_one_number(c, next, stream);
708 break;
709 case '"':
710 return get_string_token(next, stream, TOKEN_STRING);
711 case '\'':
712 return get_char_token(next, stream, TOKEN_CHAR);
713 case '/':
714 if (next == '/')
715 return drop_stream_eoln(stream);
716 if (next == '*')
717 return drop_stream_comment(stream);
721 * Check for combinations
723 value = c;
724 if (cclass[next + 1] & ValidSecond) {
725 i = special_hash(c, next);
726 if (hash_results[i][0] == c && hash_results[i][1] == next) {
727 value = code[i];
728 next = nextchar(stream);
729 if (value >= SPECIAL_LEFTSHIFT &&
730 next == "==."[value - SPECIAL_LEFTSHIFT]) {
731 value += 3;
732 next = nextchar(stream);
737 /* Pass it on.. */
738 token = stream->token;
739 token_type(token) = TOKEN_SPECIAL;
740 token->special = value;
741 add_token(stream);
742 return next;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (13)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash: start from the first character, mix in
 * each further one, then fold the result down to a bucket index.
 */
#define ident_hash_init(c) (c)
#define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
#define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Buckets of identifiers chained through ident->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];
/* Lookup statistics, reported by show_identifier_stats(). */
static int ident_hit, ident_miss, idents;
756 void show_identifier_stats(void)
758 int i;
759 int distribution[100];
761 fprintf(stderr, "identifiers: %d hits, %d misses\n",
762 ident_hit, ident_miss);
764 for (i = 0; i < 100; i++)
765 distribution[i] = 0;
767 for (i = 0; i < IDENT_HASH_SIZE; i++) {
768 struct ident * ident = hash_table[i];
769 int count = 0;
771 while (ident) {
772 count++;
773 ident = ident->next;
775 if (count > 99)
776 count = 99;
777 distribution[count]++;
780 for (i = 0; i < 100; i++) {
781 if (distribution[i])
782 fprintf(stderr, "%2d: %d buckets\n", i, distribution[i]);
786 static struct ident *alloc_ident(const char *name, int len)
788 struct ident *ident = __alloc_ident(len);
789 ident->symbols = NULL;
790 ident->len = len;
791 ident->tainted = 0;
792 memcpy(ident->name, name, len);
793 return ident;
796 static struct ident * insert_hash(struct ident *ident, unsigned long hash)
798 ident->next = hash_table[hash];
799 hash_table[hash] = ident;
800 ident_miss++;
801 return ident;
804 static struct ident *create_hashed_ident(const char *name, int len, unsigned long hash)
806 struct ident *ident;
807 struct ident **p;
809 p = &hash_table[hash];
810 while ((ident = *p) != NULL) {
811 if (ident->len == (unsigned char) len) {
812 if (strncmp(name, ident->name, len) != 0)
813 goto next;
815 ident_hit++;
816 return ident;
818 next:
819 //misses++;
820 p = &ident->next;
822 ident = alloc_ident(name, len);
823 *p = ident;
824 ident->next = NULL;
825 ident_miss++;
826 idents++;
827 return ident;
/*
 * Compute the hash bucket for the @len bytes at @name, using the same
 * ident_hash_* steps as the tokenizer applies character by character.
 *
 * A non-positive @len hashes like a single NUL character and does not
 * touch @name; the loop is bounded by @len so it cannot run past the
 * end of the name.
 */
static unsigned long hash_name(const char *name, int len)
{
	const unsigned char *p = (const unsigned char *)name;
	unsigned long hash;
	int i;

	if (len <= 0)
		return ident_hash_end(ident_hash_init(0));

	hash = ident_hash_init(p[0]);
	for (i = 1; i < len; i++)
		hash = ident_hash_add(hash, (unsigned int)p[i]);
	return ident_hash_end(hash);
}
843 struct ident *hash_ident(struct ident *ident)
845 return insert_hash(ident, hash_name(ident->name, ident->len));
/* Return the identifier for the C string @name, creating it if needed. */
struct ident *built_in_ident(const char *name)
{
	int length = strlen(name);
	unsigned long bucket = hash_name(name, length);

	return create_hashed_ident(name, length, bucket);
}
854 struct token *built_in_token(int stream, const char *name)
856 struct token *token;
858 token = __alloc_token(0);
859 token->pos.stream = stream;
860 token_type(token) = TOKEN_IDENT;
861 token->ident = built_in_ident(name);
862 return token;
865 static int get_one_identifier(int c, stream_t *stream)
867 struct token *token;
868 struct ident *ident;
869 unsigned long hash;
870 char buf[256];
871 int len = 1;
872 int next;
874 hash = ident_hash_init(c);
875 buf[0] = c;
876 for (;;) {
877 next = nextchar(stream);
878 if (!(cclass[next + 1] & (Letter | Digit)))
879 break;
880 if (len >= sizeof(buf))
881 break;
882 hash = ident_hash_add(hash, next);
883 buf[len] = next;
884 len++;
886 hash = ident_hash_end(hash);
888 ident = create_hashed_ident(buf, len, hash);
890 if (ident == &L_ident) {
891 if (next == '\'')
892 return get_char_token(nextchar(stream), stream, TOKEN_WIDE_CHAR);
893 if (next == '\"')
894 return get_string_token(nextchar(stream), stream, TOKEN_WIDE_STRING);
897 /* Pass it on.. */
898 token = stream->token;
899 token_type(token) = TOKEN_IDENT;
900 token->ident = ident;
901 add_token(stream);
902 return next;
905 static int get_one_token(int c, stream_t *stream)
907 long class = cclass[c + 1];
908 if (class & Digit)
909 return get_one_number(c, nextchar(stream), stream);
910 if (class & Letter)
911 return get_one_identifier(c, stream);
912 return get_one_special(c, stream);
915 static struct token *setup_stream(stream_t *stream, int idx, int fd,
916 unsigned char *buf, unsigned int buf_size)
918 struct token *begin;
920 stream->nr = idx;
921 stream->line = 1;
922 stream->newline = 1;
923 stream->whitespace = 0;
924 stream->pos = 0;
926 stream->token = NULL;
927 stream->fd = fd;
928 stream->offset = 0;
929 stream->size = buf_size;
930 stream->buffer = buf;
932 begin = alloc_token(stream);
933 token_type(begin) = TOKEN_STREAMBEGIN;
934 stream->tokenlist = &begin->next;
935 return begin;
938 static struct token *tokenize_stream(stream_t *stream)
940 int c = nextchar(stream);
941 while (c != EOF) {
942 if (!isspace(c)) {
943 struct token *token = alloc_token(stream);
944 stream->token = token;
945 stream->newline = 0;
946 stream->whitespace = 0;
947 c = get_one_token(c, stream);
948 continue;
950 stream->whitespace = 1;
951 c = nextchar(stream);
953 return mark_eof(stream);
956 struct token * tokenize_buffer(void *buffer, unsigned long size, struct token **endtoken)
958 stream_t stream;
959 struct token *begin;
961 begin = setup_stream(&stream, 0, -1, buffer, size);
962 *endtoken = tokenize_stream(&stream);
963 return begin;
966 struct token * tokenize(const char *name, int fd, struct token *endtoken, const char **next_path)
968 struct token *begin, *end;
969 stream_t stream;
970 unsigned char buffer[BUFSIZE];
971 int idx;
973 idx = init_stream(name, fd, next_path);
974 if (idx < 0) {
975 // info(endtoken->pos, "File %s is const", name);
976 return endtoken;
979 begin = setup_stream(&stream, idx, fd, buffer, 0);
980 end = tokenize_stream(&stream);
981 if (endtoken)
982 end->next = endtoken;
983 return begin;