2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the pre-processor.
5 * Copyright (C) 2003 Transmeta Corp.
8 * Licensed under the Open Software License version 1.1
/*
 * Index that the next init_stream() call will hand out; init_stream()
 * stores (index + 1) back here once the new entry has been set up.
 */
int input_stream_nr = 0;

/* Table of stream descriptors, grown with realloc() in init_stream(). */
struct stream *input_streams;

/* Number of entries input_streams currently has room for. */
static int input_streams_allocated;

/* Size in bytes of the read buffer used when tokenizing a file descriptor. */
#define BUFSIZE (8192)
34 struct token
**tokenlist
;
36 unsigned char *buffer
;
40 const char *show_special(int val
)
42 static const char *combinations
[] = COMBINATION_STRINGS
;
43 static char buffer
[4];
47 if (val
>= SPECIAL_BASE
)
48 strcpy(buffer
, combinations
[val
- SPECIAL_BASE
]);
52 const char *show_ident(const struct ident
*ident
)
54 static char buffer
[256];
57 sprintf(buffer
, "%.*s", ident
->len
, ident
->name
);
61 char *charstr(char *ptr
, unsigned char c
, unsigned char escape
, unsigned char next
)
64 if (c
== escape
|| c
== '\\')
79 return ptr
+ sprintf(ptr
, "%o", c
);
81 return ptr
+ sprintf(ptr
, "%03o", c
);
84 const char *show_string(const struct string
*string
)
86 static char buffer
[256];
92 for (i
= 0; i
< string
->length
-1; i
++) {
93 const unsigned char *p
= string
->data
+ i
;
94 ptr
= charstr(ptr
, p
[0], '"', p
[1]);
101 const char *show_token(const struct token
*token
)
103 static char buffer
[256];
107 switch (token_type(token
)) {
109 return "syntax error";
112 return "end-of-input";
115 return show_ident(token
->ident
);
118 return show_string(token
->string
);
121 return token
->number
;
124 return show_special(token
->special
);
128 int c
= token
->character
;
130 ptr
= charstr(ptr
, c
, '\'', 0);
136 case TOKEN_STREAMBEGIN
:
137 sprintf(buffer
, "<beginning of '%s'>", (input_streams
+ token
->pos
.stream
)->name
);
140 case TOKEN_STREAMEND
:
141 sprintf(buffer
, "<end of '%s'>", (input_streams
+ token
->pos
.stream
)->name
);
149 int init_stream(const char *name
, int fd
)
151 int stream
= input_stream_nr
;
152 struct stream
*current
;
154 if (stream
>= input_streams_allocated
) {
155 int newalloc
= stream
* 4 / 3 + 10;
156 input_streams
= realloc(input_streams
, newalloc
* sizeof(struct stream
));
158 die("Unable to allocate more streams space");
159 input_streams_allocated
= newalloc
;
161 current
= input_streams
+ stream
;
162 memset(current
, 0, sizeof(*current
));
163 current
->name
= name
;
165 current
->constant
= -1; // "unknown"
171 current
->dev
= st
.st_dev
;
172 current
->ino
= st
.st_ino
;
173 for (i
= 0; i
< stream
; i
++) {
174 struct stream
*s
= input_streams
+ i
;
175 if (s
->dev
== st
.st_dev
&& s
->ino
== st
.st_ino
) {
176 if (s
->constant
> 0 && lookup_symbol(s
->protect
, NS_PREPROCESSOR
))
181 input_stream_nr
= stream
+1;
185 static struct token
* alloc_token(stream_t
*stream
)
187 struct token
*token
= __alloc_token(0);
188 token
->pos
= stream
->pos
;
193 * Argh... That was surprisingly messy - handling '\r' complicates the
196 static int nextchar_slow(stream_t
*stream
)
198 int offset
= stream
->offset
;
199 int size
= stream
->size
;
201 int spliced
= 0, had_cr
, had_backslash
, complain
;
204 had_cr
= had_backslash
= complain
= 0;
207 if (offset
>= size
) {
208 size
= read(stream
->fd
, stream
->buffer
, BUFSIZE
);
212 stream
->offset
= offset
= 0;
215 c
= stream
->buffer
[offset
++];
217 if (had_cr
&& c
!= '\n')
232 if (!had_backslash
) {
238 stream
->pos
.newline
= 1;
242 warn(stream
->pos
, "non-ASCII data stream");
252 stream
->offset
= offset
;
254 warn(stream
->pos
, "non-ASCII data stream");
264 warn(stream
->pos
, "no newline at end of file");
266 warn(stream
->pos
, "non-ASCII data stream");
268 warn(stream
->pos
, "backslash-newline at end of file");
273 * We want that as light as possible while covering all normal cases.
274 * Slow path (including the logics with line-splicing and EOF sanity
275 * checks) is in nextchar_slow().
277 static inline int nextchar(stream_t
*stream
)
279 int offset
= stream
->offset
;
281 if (offset
< stream
->size
) {
282 int c
= stream
->buffer
[offset
++];
288 stream
->offset
= offset
;
290 stream
->pos
.newline
= 1;
294 if (offset
>= stream
->size
)
296 next
= stream
->buffer
[offset
];
297 if (next
== '\n' || next
== '\r')
301 stream
->offset
= offset
;
306 return nextchar_slow(stream
);
309 struct token eof_token_entry
;
311 static void mark_eof(stream_t
*stream
, struct token
*end_token
)
315 end
= alloc_token(stream
);
316 token_type(end
) = TOKEN_STREAMEND
;
317 end
->pos
.newline
= 1;
319 eof_token_entry
.next
= &eof_token_entry
;
320 eof_token_entry
.pos
.newline
= 1;
323 end_token
= &eof_token_entry
;
324 end
->next
= end_token
;
325 *stream
->tokenlist
= end
;
326 stream
->tokenlist
= NULL
;
329 static void add_token(stream_t
*stream
)
331 struct token
*token
= stream
->token
;
333 stream
->token
= NULL
;
335 *stream
->tokenlist
= token
;
336 stream
->tokenlist
= &token
->next
;
339 static void drop_token(stream_t
*stream
)
341 stream
->pos
.newline
|= stream
->token
->pos
.newline
;
342 stream
->pos
.whitespace
|= stream
->token
->pos
.whitespace
;
343 stream
->token
= NULL
;
355 static const long cclass
[257] = {
356 ['0' + 1 ... '9' + 1] = Digit
| Hex
,
357 ['A' + 1 ... 'D' + 1] = Letter
| Hex
,
358 ['E' + 1] = Letter
| Hex
| Exp
,
359 ['F' + 1] = Letter
| Hex
,
360 ['G' + 1 ... 'O' + 1] = Letter
,
361 ['P' + 1] = Letter
| Exp
,
362 ['Q' + 1 ... 'Z' + 1] = Letter
,
363 ['a' + 1 ... 'd' + 1] = Letter
| Hex
,
364 ['e' + 1] = Letter
| Hex
| Exp
,
365 ['f' + 1] = Letter
| Hex
,
366 ['g' + 1 ... 'o' + 1] = Letter
,
367 ['p' + 1] = Letter
| Exp
,
368 ['q' + 1 ... 'z' + 1] = Letter
,
370 ['.' + 1] = Dot
| ValidSecond
,
371 ['=' + 1] = ValidSecond
,
372 ['+' + 1] = ValidSecond
,
373 ['-' + 1] = ValidSecond
,
374 ['>' + 1] = ValidSecond
,
375 ['<' + 1] = ValidSecond
,
376 ['&' + 1] = ValidSecond
,
377 ['|' + 1] = ValidSecond
,
378 ['#' + 1] = ValidSecond
,
386 * pp-number identifier-nodigit
393 static int get_one_number(int c
, int next
, stream_t
*stream
)
396 static char buffer
[256];
397 char *p
= buffer
, *buf
;
402 long class = cclass
[next
+ 1];
403 if (!(class & (Dot
| Digit
| Letter
)))
406 next
= nextchar(stream
);
408 if (next
== '-' || next
== '+') {
410 next
= nextchar(stream
);
416 buf
= __alloc_bytes(len
);
417 memcpy(buf
, buffer
, len
);
419 token
= stream
->token
;
420 token_type(token
) = TOKEN_NUMBER
;
427 static int escapechar(int first
, int type
, stream_t
*stream
, int *valp
)
431 next
= nextchar(stream
);
435 warn(stream
->pos
, "Newline in string or character constant");
437 if (first
== '\\' && next
!= EOF
) {
439 next
= nextchar(stream
);
473 warn(stream
->pos
, "Newline in string or character constant");
478 while (next
>= '0' && next
<= '9') {
479 value
= (value
<< 3) + (next
-'0');
480 next
= nextchar(stream
);
488 int hex
= hexval(next
);
491 next
= nextchar(stream
);
492 while ((hex
= hexval(next
)) < 16) {
493 value
= (value
<< 4) + hex
;
494 next
= nextchar(stream
);
502 warn(stream
->pos
, "Unknown escape '%c'", value
);
505 /* Mark it as escaped */
512 static int get_char_token(int next
, stream_t
*stream
)
517 next
= escapechar(next
, '\'', stream
, &value
);
518 if (value
== '\'' || next
!= '\'') {
519 warn(stream
->pos
, "Bad character constant");
524 token
= stream
->token
;
525 token_type(token
) = TOKEN_CHAR
;
526 token
->character
= value
& 0xff;
529 return nextchar(stream
);
532 static int get_string_token(int next
, stream_t
*stream
)
534 static char buffer
[MAX_STRING
];
535 struct string
*string
;
541 next
= escapechar(next
, '"', stream
, &val
);
545 warn(stream
->pos
, "End of file in middle of string");
548 if (len
< MAX_STRING
)
553 if (len
> MAX_STRING
) {
554 warn(stream
->pos
, "string too long (%d bytes, %d bytes max)", len
, MAX_STRING
);
558 string
= __alloc_string(len
+1);
559 memcpy(string
->data
, buffer
, len
);
560 string
->data
[len
] = '\0';
561 string
->length
= len
+1;
564 token
= stream
->token
;
565 token_type(token
) = TOKEN_STRING
;
566 token
->string
= string
;
572 static int drop_stream_eoln(stream_t
*stream
)
574 int next
= nextchar(stream
);
580 next
= nextchar(stream
);
586 static int drop_stream_comment(stream_t
*stream
)
591 newline
= stream
->pos
.newline
;
593 next
= nextchar(stream
);
597 warn(stream
->pos
, "End of file in the middle of a comment");
600 next
= nextchar(stream
);
601 if (curr
== '*' && next
== '/')
604 stream
->pos
.newline
= newline
;
605 return nextchar(stream
);
608 unsigned char combinations
[][3] = COMBINATION_STRINGS
;
610 #define NR_COMBINATIONS (sizeof(combinations)/3)
612 static int get_one_special(int c
, stream_t
*stream
)
615 unsigned char c1
, c2
, c3
;
619 next
= nextchar(stream
);
622 * Check for numbers, strings, character constants, and comments
626 if (next
>= '0' && next
<= '9')
627 return get_one_number(c
, next
, stream
);
630 return get_string_token(next
, stream
);
632 return get_char_token(next
, stream
);
635 return drop_stream_eoln(stream
);
637 return drop_stream_comment(stream
);
641 * Check for combinations
644 if (cclass
[next
+ 1] & ValidSecond
) {
645 comb
= combinations
[0];
646 c1
= c
; c2
= next
; c3
= 0;
647 for (i
= 0; i
< NR_COMBINATIONS
; i
++) {
648 if (comb
[0] == c1
&& comb
[1] == c2
&& comb
[2] == c3
) {
649 value
= i
+ SPECIAL_BASE
;
650 next
= nextchar(stream
);
660 token
= stream
->token
;
661 token_type(token
) = TOKEN_SPECIAL
;
662 token
->special
= value
;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS (10)
#define IDENT_HASH_SIZE (1<<IDENT_HASH_BITS)
#define IDENT_HASH_MASK (IDENT_HASH_SIZE-1)

/*
 * Incremental identifier hash:
 *   ident_hash_init(c)      - seed the hash with the first character
 *   ident_hash_add(h, c)    - mix in one more character (h*11 + c)
 *   ident_hash_end(h)       - fold the bits above IDENT_HASH_BITS back in
 *                             and mask down to a bucket index
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash,c)	((oldhash)*11 + (c))
#define ident_hash_end(hash)		((((hash) >> IDENT_HASH_BITS) + (hash)) & IDENT_HASH_MASK)

/* Bucket heads; identifiers within a bucket are chained through ->next. */
static struct ident *hash_table[IDENT_HASH_SIZE];

/* Counters reported by show_identifier_stats(). */
int ident_hit, ident_miss;
678 void show_identifier_stats(void)
681 int distribution
[100];
683 fprintf(stderr
, "identifiers: %d hits, %d misses\n",
684 ident_hit
, ident_miss
);
686 for (i
= 0; i
< 100; i
++)
689 for (i
= 0; i
< IDENT_HASH_SIZE
; i
++) {
690 struct ident
* ident
= hash_table
[i
];
699 distribution
[count
]++;
702 for (i
= 0; i
< 100; i
++) {
704 fprintf(stderr
, "%2d: %d buckets\n", i
, distribution
[i
]);
708 static struct ident
*alloc_ident(const char *name
, int len
)
710 struct ident
*ident
= __alloc_ident(len
);
711 ident
->symbols
= NULL
;
714 memcpy(ident
->name
, name
, len
);
718 static struct ident
* insert_hash(struct ident
*ident
, unsigned long hash
)
720 ident
->next
= hash_table
[hash
];
721 hash_table
[hash
] = ident
;
726 static struct ident
*create_hashed_ident(const char *name
, int len
, unsigned long hash
)
731 p
= &hash_table
[hash
];
732 while ((ident
= *p
) != NULL
) {
733 if (ident
->len
== len
&& !memcmp(ident
->name
, name
, len
)) {
740 ident
= alloc_ident(name
, len
);
747 static unsigned long hash_name(const char *name
, int len
)
750 const unsigned char *p
= (const unsigned char *)name
;
752 hash
= ident_hash_init(*p
++);
754 unsigned int i
= *p
++;
755 hash
= ident_hash_add(hash
, i
);
757 return ident_hash_end(hash
);
760 struct ident
*hash_ident(struct ident
*ident
)
762 return insert_hash(ident
, hash_name(ident
->name
, ident
->len
));
/*
 * Look up or create the identifier for a NUL-terminated name.
 *
 * The length is taken with strlen(), the bucket with hash_name(), and
 * the result is whatever create_hashed_ident() returns for that
 * (name, length, hash) triple.
 */
struct ident *built_in_ident(const char *name)
{
	int len = strlen(name);
	return create_hashed_ident(name, len, hash_name(name, len));
}
771 struct token
*built_in_token(int stream
, const char *name
)
775 token
= __alloc_token(0);
776 token
->pos
.stream
= stream
;
777 token_type(token
) = TOKEN_IDENT
;
778 token
->ident
= built_in_ident(name
);
782 static int get_one_identifier(int c
, stream_t
*stream
)
791 hash
= ident_hash_init(c
);
794 next
= nextchar(stream
);
795 if (!(cclass
[next
+ 1] & (Letter
| Digit
)))
797 if (len
>= sizeof(buf
))
799 hash
= ident_hash_add(hash
, next
);
803 hash
= ident_hash_end(hash
);
805 ident
= create_hashed_ident(buf
, len
, hash
);
808 token
= stream
->token
;
809 token_type(token
) = TOKEN_IDENT
;
810 token
->ident
= ident
;
815 static int get_one_token(int c
, stream_t
*stream
)
817 long class = cclass
[c
+ 1];
819 return get_one_number(c
, nextchar(stream
), stream
);
821 return get_one_identifier(c
, stream
);
822 return get_one_special(c
, stream
);
825 static struct token
*setup_stream(stream_t
*stream
, int idx
, int fd
,
826 unsigned char *buf
, unsigned int buf_size
)
830 stream
->pos
.stream
= idx
;
831 stream
->pos
.line
= 1;
832 stream
->pos
.newline
= 1;
833 stream
->pos
.whitespace
= 0;
835 stream
->pos
.noexpand
= 0;
837 stream
->token
= NULL
;
840 stream
->size
= buf_size
;
841 stream
->buffer
= buf
;
843 begin
= alloc_token(stream
);
844 token_type(begin
) = TOKEN_STREAMBEGIN
;
845 stream
->tokenlist
= &begin
->next
;
849 static void tokenize_stream(stream_t
*stream
, struct token
*endtoken
)
851 int c
= nextchar(stream
);
854 struct token
*token
= alloc_token(stream
);
855 stream
->token
= token
;
856 stream
->pos
.newline
= 0;
857 stream
->pos
.whitespace
= 0;
858 c
= get_one_token(c
, stream
);
861 stream
->pos
.whitespace
= 1;
862 c
= nextchar(stream
);
864 mark_eof(stream
, endtoken
);
867 struct token
* tokenize_buffer(unsigned char *buffer
, unsigned long size
, struct token
*endtoken
)
872 begin
= setup_stream(&stream
, 0, -1, buffer
, size
);
873 tokenize_stream(&stream
, endtoken
);
877 struct token
* tokenize(const char *name
, int fd
, struct token
*endtoken
)
881 unsigned char buffer
[BUFSIZE
];
884 idx
= init_stream(name
, fd
);
888 begin
= setup_stream(&stream
, idx
, fd
, buffer
, 0);
889 tokenize_stream(&stream
, endtoken
);