2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
8 * Licensed under the Open Software License version 1.1
25 int input_stream_nr
= 0;
26 struct stream
*input_streams
;
27 static int input_streams_allocated
;
29 #define BUFSIZE (8192)
34 int newline
, whitespace
;
35 struct token
**tokenlist
;
37 unsigned char *buffer
;
40 struct position
stream_pos(stream_t
*stream
)
44 pos
.stream
= stream
->nr
;
45 pos
.newline
= stream
->newline
;
46 pos
.whitespace
= stream
->whitespace
;
47 pos
.pos
= stream
->pos
;
48 pos
.line
= stream
->line
;
53 const char *show_special(int val
)
55 static const char *combinations
[] = COMBINATION_STRINGS
;
56 static char buffer
[4];
60 if (val
>= SPECIAL_BASE
)
61 strcpy(buffer
, combinations
[val
- SPECIAL_BASE
]);
65 const char *show_ident(const struct ident
*ident
)
67 static char buffer
[256];
70 sprintf(buffer
, "%.*s", ident
->len
, ident
->name
);
74 char *charstr(char *ptr
, unsigned char c
, unsigned char escape
, unsigned char next
)
77 if (c
== escape
|| c
== '\\')
92 return ptr
+ sprintf(ptr
, "%o", c
);
94 return ptr
+ sprintf(ptr
, "%03o", c
);
97 const char *show_string(const struct string
*string
)
99 static char buffer
[4 * MAX_STRING
+ 3];
104 return "<bad_string>";
107 for (i
= 0; i
< string
->length
-1; i
++) {
108 const unsigned char *p
= string
->data
+ i
;
109 ptr
= charstr(ptr
, p
[0], '"', p
[1]);
116 const char *show_token(const struct token
*token
)
118 static char buffer
[256];
122 switch (token_type(token
)) {
124 return "syntax error";
127 return "end-of-input";
130 return show_ident(token
->ident
);
133 return show_string(token
->string
);
136 return token
->number
;
139 return show_special(token
->special
);
143 int c
= token
->character
;
145 ptr
= charstr(ptr
, c
, '\'', 0);
151 case TOKEN_STREAMBEGIN
:
152 sprintf(buffer
, "<beginning of '%s'>", (input_streams
+ token
->pos
.stream
)->name
);
155 case TOKEN_STREAMEND
:
156 sprintf(buffer
, "<end of '%s'>", (input_streams
+ token
->pos
.stream
)->name
);
164 int init_stream(const char *name
, int fd
, const char **next_path
)
166 int stream
= input_stream_nr
;
167 struct stream
*current
;
170 if (stream
>= input_streams_allocated
) {
171 int newalloc
= stream
* 4 / 3 + 10;
172 input_streams
= realloc(input_streams
, newalloc
* sizeof(struct stream
));
174 die("Unable to allocate more streams space");
175 input_streams_allocated
= newalloc
;
177 current
= input_streams
+ stream
;
178 memset(current
, 0, sizeof(*current
));
179 current
->name
= name
;
181 current
->next_path
= next_path
;
182 current
->constant
= CONSTANT_FILE_MAYBE
;
183 if (fd
>= 0 && fstat(fd
, &st
) == 0 && S_ISREG(st
.st_mode
)) {
186 for (i
= 0; i
< stream
; i
++) {
187 struct stream
*s
= input_streams
+ i
;
188 if (s
->constant
== CONSTANT_FILE_YES
&&
189 identical_files(s
, &st
, name
) &&
190 lookup_symbol(s
->protect
, NS_MACRO
))
194 current
->dev
= st
.st_dev
;
195 current
->ino
= st
.st_ino
;
197 input_stream_nr
= stream
+1;
201 static struct token
* alloc_token(stream_t
*stream
)
203 struct token
*token
= __alloc_token(0);
204 token
->pos
= stream_pos(stream
);
209 * Argh... That was surprisingly messy - handling '\r' complicates the
212 static int nextchar_slow(stream_t
*stream
)
214 int offset
= stream
->offset
;
215 int size
= stream
->size
;
217 int spliced
= 0, had_cr
, had_backslash
, complain
;
220 had_cr
= had_backslash
= complain
= 0;
223 if (offset
>= size
) {
224 size
= read(stream
->fd
, stream
->buffer
, BUFSIZE
);
228 stream
->offset
= offset
= 0;
231 c
= stream
->buffer
[offset
++];
233 if (had_cr
&& c
!= '\n')
248 if (!had_backslash
) {
258 warning(stream_pos(stream
), "non-ASCII data stream");
268 stream
->offset
= offset
;
270 warning(stream_pos(stream
), "non-ASCII data stream");
280 warning(stream_pos(stream
), "no newline at end of file");
282 warning(stream_pos(stream
), "non-ASCII data stream");
284 warning(stream_pos(stream
), "backslash-newline at end of file");
289 * We want that as light as possible while covering all normal cases.
290 * Slow path (including the logics with line-splicing and EOF sanity
291 * checks) is in nextchar_slow().
293 static int nextchar(stream_t
*stream
)
295 int offset
= stream
->offset
;
297 if (offset
< stream
->size
) {
298 int c
= stream
->buffer
[offset
++];
299 static const char special
[256] = {
300 ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
303 stream
->offset
= offset
;
308 return nextchar_slow(stream
);
311 struct token eof_token_entry
;
313 static void mark_eof(stream_t
*stream
, struct token
*end_token
)
317 end
= alloc_token(stream
);
318 token_type(end
) = TOKEN_STREAMEND
;
319 end
->pos
.newline
= 1;
321 eof_token_entry
.next
= &eof_token_entry
;
322 eof_token_entry
.pos
.newline
= 1;
325 end_token
= &eof_token_entry
;
326 end
->next
= end_token
;
327 *stream
->tokenlist
= end
;
328 stream
->tokenlist
= NULL
;
331 static void add_token(stream_t
*stream
)
333 struct token
*token
= stream
->token
;
335 stream
->token
= NULL
;
337 *stream
->tokenlist
= token
;
338 stream
->tokenlist
= &token
->next
;
341 static void drop_token(stream_t
*stream
)
343 stream
->newline
|= stream
->token
->pos
.newline
;
344 stream
->whitespace
|= stream
->token
->pos
.whitespace
;
345 stream
->token
= NULL
;
357 static const long cclass
[257] = {
358 ['0' + 1 ... '9' + 1] = Digit
| Hex
,
359 ['A' + 1 ... 'D' + 1] = Letter
| Hex
,
360 ['E' + 1] = Letter
| Hex
| Exp
,
361 ['F' + 1] = Letter
| Hex
,
362 ['G' + 1 ... 'O' + 1] = Letter
,
363 ['P' + 1] = Letter
| Exp
,
364 ['Q' + 1 ... 'Z' + 1] = Letter
,
365 ['a' + 1 ... 'd' + 1] = Letter
| Hex
,
366 ['e' + 1] = Letter
| Hex
| Exp
,
367 ['f' + 1] = Letter
| Hex
,
368 ['g' + 1 ... 'o' + 1] = Letter
,
369 ['p' + 1] = Letter
| Exp
,
370 ['q' + 1 ... 'z' + 1] = Letter
,
372 ['.' + 1] = Dot
| ValidSecond
,
373 ['=' + 1] = ValidSecond
,
374 ['+' + 1] = ValidSecond
,
375 ['-' + 1] = ValidSecond
,
376 ['>' + 1] = ValidSecond
,
377 ['<' + 1] = ValidSecond
,
378 ['&' + 1] = ValidSecond
,
379 ['|' + 1] = ValidSecond
,
380 ['#' + 1] = ValidSecond
,
388 * pp-number identifier-nodigit
395 static int get_one_number(int c
, int next
, stream_t
*stream
)
398 static char buffer
[4095];
399 char *p
= buffer
, *buf
, *buffer_end
= buffer
+ sizeof (buffer
);
404 long class = cclass
[next
+ 1];
405 if (!(class & (Dot
| Digit
| Letter
)))
409 next
= nextchar(stream
);
411 if (next
== '-' || next
== '+') {
414 next
= nextchar(stream
);
419 if (p
== buffer_end
) {
420 error(stream_pos(stream
), "number token exceeds %td characters",
421 buffer_end
- buffer
);
422 // Pretend we saw just "1".
429 buf
= __alloc_bytes(len
);
430 memcpy(buf
, buffer
, len
);
432 token
= stream
->token
;
433 token_type(token
) = TOKEN_NUMBER
;
440 static int escapechar(int first
, int type
, stream_t
*stream
, int *valp
)
444 next
= nextchar(stream
);
448 warning(stream_pos(stream
), "Newline in string or character constant");
450 if (first
== '\\' && next
!= EOF
) {
452 next
= nextchar(stream
);
486 warning(stream_pos(stream
), "Newline in string or character constant");
491 while (next
>= '0' && next
<= '9') {
492 value
= (value
<< 3) + (next
-'0');
493 next
= nextchar(stream
);
501 int hex
= hexval(next
);
504 next
= nextchar(stream
);
505 while ((hex
= hexval(next
)) < 16) {
506 value
= (value
<< 4) + hex
;
507 next
= nextchar(stream
);
515 warning(stream_pos(stream
), "Unknown escape '%c'", value
);
518 /* Mark it as escaped */
525 static int get_char_token(int next
, stream_t
*stream
)
530 next
= escapechar(next
, '\'', stream
, &value
);
531 if (value
== '\'' || next
!= '\'') {
532 warning(stream_pos(stream
), "Bad character constant");
537 token
= stream
->token
;
538 token_type(token
) = TOKEN_CHAR
;
539 token
->character
= value
& 0xff;
542 return nextchar(stream
);
545 static int get_string_token(int next
, stream_t
*stream
)
547 static char buffer
[MAX_STRING
];
548 struct string
*string
;
554 next
= escapechar(next
, '"', stream
, &val
);
558 warning(stream_pos(stream
), "End of file in middle of string");
561 if (len
< MAX_STRING
)
566 if (len
> MAX_STRING
) {
567 warning(stream_pos(stream
), "string too long (%d bytes, %d bytes max)", len
, MAX_STRING
);
571 string
= __alloc_string(len
+1);
572 memcpy(string
->data
, buffer
, len
);
573 string
->data
[len
] = '\0';
574 string
->length
= len
+1;
577 token
= stream
->token
;
578 token_type(token
) = TOKEN_STRING
;
579 token
->string
= string
;
585 static int drop_stream_eoln(stream_t
*stream
)
587 int next
= nextchar(stream
);
593 next
= nextchar(stream
);
599 static int drop_stream_comment(stream_t
*stream
)
604 newline
= stream
->newline
;
606 next
= nextchar(stream
);
610 warning(stream_pos(stream
), "End of file in the middle of a comment");
613 next
= nextchar(stream
);
614 if (curr
== '*' && next
== '/')
617 stream
->newline
= newline
;
618 return nextchar(stream
);
621 unsigned char combinations
[][3] = COMBINATION_STRINGS
;
623 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
625 static int get_one_special(int c
, stream_t
*stream
)
628 unsigned char c1
, c2
, c3
;
632 next
= nextchar(stream
);
635 * Check for numbers, strings, character constants, and comments
639 if (next
>= '0' && next
<= '9')
640 return get_one_number(c
, next
, stream
);
643 return get_string_token(next
, stream
);
645 return get_char_token(next
, stream
);
648 return drop_stream_eoln(stream
);
650 return drop_stream_comment(stream
);
654 * Check for combinations
657 if (cclass
[next
+ 1] & ValidSecond
) {
658 comb
= combinations
[0];
659 c1
= c
; c2
= next
; c3
= 0;
660 for (i
= 0; i
< NR_COMBINATIONS
; i
++) {
661 if (comb
[0] == c1
&& comb
[1] == c2
&& comb
[2] == c3
) {
662 value
= i
+ SPECIAL_BASE
;
663 next
= nextchar(stream
);
673 token
= stream
->token
;
674 token_type(token
) = TOKEN_SPECIAL
;
675 token
->special
= value
;
680 #define IDENT_HASH_BITS (13)
681 #define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
682 #define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)
684 #define ident_hash_init(c) (c)
685 #define ident_hash_add(oldhash,c) ((oldhash)*11 + (c))
686 #define ident_hash_end(hash) ((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)
688 static struct ident
*hash_table
[IDENT_HASH_SIZE
];
689 int ident_hit
, ident_miss
, idents
;
691 void show_identifier_stats(void)
694 int distribution
[100];
696 fprintf(stderr
, "identifiers: %d hits, %d misses\n",
697 ident_hit
, ident_miss
);
699 for (i
= 0; i
< 100; i
++)
702 for (i
= 0; i
< IDENT_HASH_SIZE
; i
++) {
703 struct ident
* ident
= hash_table
[i
];
712 distribution
[count
]++;
715 for (i
= 0; i
< 100; i
++) {
717 fprintf(stderr
, "%2d: %d buckets\n", i
, distribution
[i
]);
721 static struct ident
*alloc_ident(const char *name
, int len
)
723 struct ident
*ident
= __alloc_ident(len
);
724 ident
->symbols
= NULL
;
727 memcpy(ident
->name
, name
, len
);
731 static struct ident
* insert_hash(struct ident
*ident
, unsigned long hash
)
733 ident
->next
= hash_table
[hash
];
734 hash_table
[hash
] = ident
;
739 static struct ident
*create_hashed_ident(const char *name
, int len
, unsigned long hash
)
744 p
= &hash_table
[hash
];
745 while ((ident
= *p
) != NULL
) {
746 if (ident
->len
== (unsigned char) len
) {
747 const char *n
= name
;
748 const char *m
= ident
->name
;
764 ident
= alloc_ident(name
, len
);
772 static unsigned long hash_name(const char *name
, int len
)
775 const unsigned char *p
= (const unsigned char *)name
;
777 hash
= ident_hash_init(*p
++);
779 unsigned int i
= *p
++;
780 hash
= ident_hash_add(hash
, i
);
782 return ident_hash_end(hash
);
785 struct ident
*hash_ident(struct ident
*ident
)
787 return insert_hash(ident
, hash_name(ident
->name
, ident
->len
));
790 struct ident
*built_in_ident(const char *name
)
792 int len
= strlen(name
);
793 return create_hashed_ident(name
, len
, hash_name(name
, len
));
796 struct token
*built_in_token(int stream
, const char *name
)
800 token
= __alloc_token(0);
801 token
->pos
.stream
= stream
;
802 token_type(token
) = TOKEN_IDENT
;
803 token
->ident
= built_in_ident(name
);
807 static int get_one_identifier(int c
, stream_t
*stream
)
816 hash
= ident_hash_init(c
);
819 next
= nextchar(stream
);
820 if (!(cclass
[next
+ 1] & (Letter
| Digit
)))
822 if (len
>= sizeof(buf
))
824 hash
= ident_hash_add(hash
, next
);
828 hash
= ident_hash_end(hash
);
830 ident
= create_hashed_ident(buf
, len
, hash
);
833 token
= stream
->token
;
834 token_type(token
) = TOKEN_IDENT
;
835 token
->ident
= ident
;
840 static int get_one_token(int c
, stream_t
*stream
)
842 long class = cclass
[c
+ 1];
844 return get_one_number(c
, nextchar(stream
), stream
);
846 return get_one_identifier(c
, stream
);
847 return get_one_special(c
, stream
);
850 static struct token
*setup_stream(stream_t
*stream
, int idx
, int fd
,
851 unsigned char *buf
, unsigned int buf_size
)
858 stream
->whitespace
= 0;
861 stream
->token
= NULL
;
864 stream
->size
= buf_size
;
865 stream
->buffer
= buf
;
867 begin
= alloc_token(stream
);
868 token_type(begin
) = TOKEN_STREAMBEGIN
;
869 stream
->tokenlist
= &begin
->next
;
873 static void tokenize_stream(stream_t
*stream
, struct token
*endtoken
)
875 int c
= nextchar(stream
);
878 struct token
*token
= alloc_token(stream
);
879 stream
->token
= token
;
881 stream
->whitespace
= 0;
882 c
= get_one_token(c
, stream
);
885 stream
->whitespace
= 1;
886 c
= nextchar(stream
);
888 mark_eof(stream
, endtoken
);
891 struct token
* tokenize_buffer(unsigned char *buffer
, unsigned long size
, struct token
*endtoken
)
896 begin
= setup_stream(&stream
, 0, -1, buffer
, size
);
897 tokenize_stream(&stream
, endtoken
);
901 struct token
* tokenize(const char *name
, int fd
, struct token
*endtoken
, const char **next_path
)
905 unsigned char buffer
[BUFSIZE
];
908 idx
= init_stream(name
, fd
, next_path
);
910 // info(endtoken->pos, "File %s is const", name);
914 begin
= setup_stream(&stream
, idx
, fd
, buffer
, 0);
915 tokenize_stream(&stream
, endtoken
);