2 * This is a really stupid C tokenizer. It doesn't do any include
3 * files or anything complex at all. That's the preprocessor.
5 * Copyright (C) 2003 Transmeta Corp.
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
/* Number of slots currently allocated in input_streams; grown by init_stream(). */
static int input_streams_allocated;

/* Array of stream records, indexed by stream number. */
struct stream *input_streams;

/*
 * Number of stream records handed out so far.  init_stream() uses the
 * current value as the index of the new stream and then bumps it by one.
 */
int input_stream_nr = 0;

/*
 * Tab width in columns.  nextchar_slow() uses it to round a stream's
 * column position up to the next multiple of this value.
 */
unsigned int tabstop = 8;
48 #define BUFSIZE (8192)
53 int newline
, whitespace
;
54 struct token
**tokenlist
;
56 unsigned char *buffer
;
59 const char *stream_name(int stream
)
61 if (stream
< 0 || stream
> input_stream_nr
)
62 return "<bad stream>";
63 return input_streams
[stream
].name
;
66 static struct position
stream_pos(stream_t
*stream
)
70 pos
.stream
= stream
->nr
;
71 pos
.newline
= stream
->newline
;
72 pos
.whitespace
= stream
->whitespace
;
73 pos
.pos
= stream
->pos
;
75 pos
.line
= stream
->line
;
83 const char *show_special(int val
)
85 static char buffer
[4];
89 if (val
>= SPECIAL_BASE
)
90 strcpy(buffer
, (char *) combinations
[val
- SPECIAL_BASE
]);
94 const char *show_ident(const struct ident
*ident
)
96 static char buff
[4][256];
102 buffer
= buff
[3 & ++n
];
103 sprintf(buffer
, "%.*s", ident
->len
, ident
->name
);
107 static char *charstr(char *ptr
, unsigned char c
, unsigned char escape
, unsigned char next
)
110 if (c
== escape
|| c
== '\\')
125 return ptr
+ sprintf(ptr
, "%o", c
);
127 return ptr
+ sprintf(ptr
, "%03o", c
);
130 const char *show_string(const struct string
*string
)
132 static char buffer
[4 * MAX_STRING
+ 3];
136 if (!string
|| !string
->length
)
137 return "<bad_string>";
140 for (i
= 0; i
< string
->length
-1; i
++) {
141 const char *p
= string
->data
+ i
;
142 ptr
= charstr(ptr
, p
[0], '"', p
[1]);
149 static const char *show_char(const char *s
, size_t len
, char prefix
, char delim
)
151 static char buffer
[MAX_STRING
+ 4];
163 static const char *quote_char(const char *s
, size_t len
, char prefix
, char delim
)
165 static char buffer
[2*MAX_STRING
+ 6];
173 for (i
= 0; i
< len
; i
++) {
174 if (s
[i
] == '"' || s
[i
] == '\\')
185 const char *show_token(const struct token
*token
)
187 static char buffer
[256];
191 switch (token_type(token
)) {
193 return "syntax error";
196 return "end-of-input";
199 return show_ident(token
->ident
);
202 return token
->number
;
205 return show_special(token
->special
);
208 return show_char(token
->string
->data
,
209 token
->string
->length
- 1, 0, '\'');
210 case TOKEN_CHAR_EMBEDDED_0
... TOKEN_CHAR_EMBEDDED_3
:
211 return show_char(token
->embedded
,
212 token_type(token
) - TOKEN_CHAR
, 0, '\'');
213 case TOKEN_WIDE_CHAR
:
214 return show_char(token
->string
->data
,
215 token
->string
->length
- 1, 'L', '\'');
216 case TOKEN_WIDE_CHAR_EMBEDDED_0
... TOKEN_WIDE_CHAR_EMBEDDED_3
:
217 return show_char(token
->embedded
,
218 token_type(token
) - TOKEN_WIDE_CHAR
, 'L', '\'');
220 return show_char(token
->string
->data
,
221 token
->string
->length
- 1, 0, '"');
222 case TOKEN_WIDE_STRING
:
223 return show_char(token
->string
->data
,
224 token
->string
->length
- 1, 'L', '"');
226 case TOKEN_STREAMBEGIN
:
227 sprintf(buffer
, "<beginning of '%s'>", stream_name(token
->pos
.stream
));
230 case TOKEN_STREAMEND
:
231 sprintf(buffer
, "<end of '%s'>", stream_name(token
->pos
.stream
));
235 sprintf(buffer
, "<untaint>");
238 case TOKEN_ARG_COUNT
:
239 sprintf(buffer
, "<argcnt>");
243 sprintf(buffer
, "unhandled token type '%d' ", token_type(token
));
248 const char *quote_token(const struct token
*token
)
250 static char buffer
[256];
252 switch (token_type(token
)) {
254 return "syntax error";
257 return show_ident(token
->ident
);
260 return token
->number
;
263 return show_special(token
->special
);
266 return quote_char(token
->string
->data
,
267 token
->string
->length
- 1, 0, '\'');
268 case TOKEN_CHAR_EMBEDDED_0
... TOKEN_CHAR_EMBEDDED_3
:
269 return quote_char(token
->embedded
,
270 token_type(token
) - TOKEN_CHAR
, 0, '\'');
271 case TOKEN_WIDE_CHAR
:
272 return quote_char(token
->string
->data
,
273 token
->string
->length
- 1, 'L', '\'');
274 case TOKEN_WIDE_CHAR_EMBEDDED_0
... TOKEN_WIDE_CHAR_EMBEDDED_3
:
275 return quote_char(token
->embedded
,
276 token_type(token
) - TOKEN_WIDE_CHAR
, 'L', '\'');
278 return quote_char(token
->string
->data
,
279 token
->string
->length
- 1, 0, '"');
280 case TOKEN_WIDE_STRING
:
281 return quote_char(token
->string
->data
,
282 token
->string
->length
- 1, 'L', '"');
284 sprintf(buffer
, "unhandled token type '%d' ", token_type(token
));
/* The stream hash table has 2^HASHED_INPUT_BITS slots. */
#define HASHED_INPUT_BITS	(6)
#define HASHED_INPUT		(1 << HASHED_INPUT_BITS)
#define HASH_PRIME		0x9e370001UL

/*
 * One slot per hash value; hash_stream() returns a pointer to one of them.
 * Every slot starts out as -1.
 */
static int input_stream_hashes[HASHED_INPUT] = {
	[0 ... HASHED_INPUT-1] = -1
};
295 int *hash_stream(const char *name
)
300 while ((c
= *name
++) != 0)
301 hash
= (hash
+ (c
<< 4) + (c
>> 4)) * 11;
304 hash
>>= 32 - HASHED_INPUT_BITS
;
305 return input_stream_hashes
+ hash
;
308 int init_stream(const char *name
, int fd
, const char **next_path
)
310 int stream
= input_stream_nr
, *hash
;
311 struct stream
*current
;
313 if (stream
>= input_streams_allocated
) {
314 int newalloc
= stream
* 4 / 3 + 10;
315 input_streams
= realloc(input_streams
, newalloc
* sizeof(struct stream
));
317 die("Unable to allocate more streams space");
318 input_streams_allocated
= newalloc
;
320 current
= input_streams
+ stream
;
321 memset(current
, 0, sizeof(*current
));
322 current
->name
= name
;
324 current
->next_path
= next_path
;
325 current
->path
= NULL
;
326 current
->constant
= CONSTANT_FILE_MAYBE
;
327 input_stream_nr
= stream
+1;
328 hash
= hash_stream(name
);
329 current
->next_stream
= *hash
;
334 static struct token
* alloc_token(stream_t
*stream
)
336 struct token
*token
= __alloc_token(0);
337 token
->pos
= stream_pos(stream
);
342 * Argh... That was surprisingly messy - handling '\r' complicates the
345 static int nextchar_slow(stream_t
*stream
)
347 int offset
= stream
->offset
;
348 int size
= stream
->size
;
350 int spliced
= 0, had_cr
, had_backslash
;
353 had_cr
= had_backslash
= 0;
356 if (offset
>= size
) {
359 size
= read(stream
->fd
, stream
->buffer
, BUFSIZE
);
363 stream
->offset
= offset
= 0;
366 c
= stream
->buffer
[offset
++];
376 if (!had_backslash
) {
379 stream
->pos
+= tabstop
- stream
->pos
% tabstop
;
404 stream
->offset
= offset
;
420 warning(stream_pos(stream
), "no newline at end of file");
422 warning(stream_pos(stream
), "backslash-newline at end of file");
427 * We want that as light as possible while covering all normal cases.
428 * Slow path (including the logics with line-splicing and EOF sanity
429 * checks) is in nextchar_slow().
431 static inline int nextchar(stream_t
*stream
)
433 int offset
= stream
->offset
;
435 if (offset
< stream
->size
) {
436 int c
= stream
->buffer
[offset
++];
437 static const char special
[256] = {
438 ['\t'] = 1, ['\r'] = 1, ['\n'] = 1, ['\\'] = 1
441 stream
->offset
= offset
;
446 return nextchar_slow(stream
);
449 struct token eof_token_entry
;
451 static struct token
*mark_eof(stream_t
*stream
)
455 end
= alloc_token(stream
);
456 eof_token_entry
.pos
= end
->pos
;
457 token_type(end
) = TOKEN_STREAMEND
;
458 end
->pos
.newline
= 1;
460 eof_token_entry
.next
= &eof_token_entry
;
461 eof_token_entry
.pos
.newline
= 1;
463 end
->next
= &eof_token_entry
;
464 *stream
->tokenlist
= end
;
465 stream
->tokenlist
= NULL
;
469 static void add_token(stream_t
*stream
)
471 struct token
*token
= stream
->token
;
473 stream
->token
= NULL
;
475 *stream
->tokenlist
= token
;
476 stream
->tokenlist
= &token
->next
;
479 static void drop_token(stream_t
*stream
)
481 stream
->newline
|= stream
->token
->pos
.newline
;
482 stream
->whitespace
|= stream
->token
->pos
.whitespace
;
483 stream
->token
= NULL
;
496 static const char cclass
[257] = {
497 ['0' + 1 ... '9' + 1] = Digit
| Hex
,
498 ['A' + 1 ... 'D' + 1] = Letter
| Hex
,
499 ['E' + 1] = Letter
| Hex
| Exp
, /* E<exp> */
500 ['F' + 1] = Letter
| Hex
,
501 ['G' + 1 ... 'O' + 1] = Letter
,
502 ['P' + 1] = Letter
| Exp
, /* P<exp> */
503 ['Q' + 1 ... 'Z' + 1] = Letter
,
504 ['a' + 1 ... 'd' + 1] = Letter
| Hex
,
505 ['e' + 1] = Letter
| Hex
| Exp
, /* e<exp> */
506 ['f' + 1] = Letter
| Hex
,
507 ['g' + 1 ... 'o' + 1] = Letter
,
508 ['p' + 1] = Letter
| Exp
, /* p<exp> */
509 ['q' + 1 ... 'z' + 1] = Letter
,
511 ['.' + 1] = Dot
| ValidSecond
,
512 ['=' + 1] = ValidSecond
,
513 ['+' + 1] = ValidSecond
,
514 ['-' + 1] = ValidSecond
,
515 ['>' + 1] = ValidSecond
,
516 ['<' + 1] = ValidSecond
,
517 ['&' + 1] = ValidSecond
,
518 ['|' + 1] = ValidSecond
,
519 ['#' + 1] = ValidSecond
,
529 * pp-number identifier-nodigit
536 static int get_one_number(int c
, int next
, stream_t
*stream
)
539 static char buffer
[4095];
540 char *p
= buffer
, *buffer_end
= buffer
+ sizeof (buffer
);
544 long class = cclass
[next
+ 1];
545 if (!(class & (Dot
| Digit
| Letter
)))
549 next
= nextchar(stream
);
551 if (next
== '-' || next
== '+') {
554 next
= nextchar(stream
);
559 if (p
== buffer_end
) {
560 sparse_error(stream_pos(stream
), "number token exceeds %td characters",
561 buffer_end
- buffer
);
562 // Pretend we saw just "1".
568 token
= stream
->token
;
569 token_type(token
) = TOKEN_NUMBER
;
570 token
->number
= xmemdup(buffer
, p
- buffer
);
576 static int eat_string(int next
, stream_t
*stream
, enum token_type type
)
578 static char buffer
[MAX_STRING
];
579 struct string
*string
;
580 struct token
*token
= stream
->token
;
584 char delim
= type
< TOKEN_STRING
? '\'' : '"';
586 for (escape
= 0; escape
|| next
!= delim
; next
= nextchar(stream
)) {
587 if (len
< MAX_STRING
)
591 warning(stream_pos(stream
),
592 "missing terminating %c character", delim
);
593 /* assume delimiter is lost */
597 warning(stream_pos(stream
),
598 "End of file in middle of string");
602 if (want_hex
&& !(cclass
[next
+ 1] & Hex
))
603 warning(stream_pos(stream
),
604 "\\x used with no following hex digits");
606 escape
= next
== '\\';
609 want_hex
= next
== 'x';
613 warning(stream_pos(stream
),
614 "\\x used with no following hex digits");
615 if (len
> MAX_STRING
) {
616 warning(stream_pos(stream
), "string too long (%d bytes, %d bytes max)", len
, MAX_STRING
);
619 if (delim
== '\'' && len
<= 4) {
621 sparse_error(stream_pos(stream
),
622 "empty character constant");
623 return nextchar(stream
);
625 token_type(token
) = type
+ len
;
626 memset(buffer
+ len
, '\0', 4 - len
);
627 memcpy(token
->embedded
, buffer
, 4);
629 token_type(token
) = type
;
630 string
= __alloc_string(len
+1);
631 memcpy(string
->data
, buffer
, len
);
632 string
->data
[len
] = '\0';
633 string
->length
= len
+1;
634 token
->string
= string
;
638 token
= stream
->token
;
640 return nextchar(stream
);
643 static int drop_stream_eoln(stream_t
*stream
)
647 switch (nextchar(stream
)) {
651 return nextchar(stream
);
656 static int drop_stream_comment(stream_t
*stream
)
661 newline
= stream
->newline
;
663 next
= nextchar(stream
);
667 warning(stream_pos(stream
), "End of file in the middle of a comment");
670 next
= nextchar(stream
);
671 if (curr
== '*' && next
== '/')
674 stream
->newline
= newline
;
675 return nextchar(stream
);
678 unsigned char combinations
[][4] = COMBINATION_STRINGS
;
680 #define NR_COMBINATIONS (SPECIAL_ARG_SEPARATOR - SPECIAL_BASE)
/*
 * Hash function for two-character punctuators - all give unique values.
 *
 * Maps the character pair (c0, c1) to an index in 0..31.  Both parameters
 * are parenthesized so that expression arguments are hashed by value;
 * note that each argument is evaluated twice, so it must not have side
 * effects.
 */
#define special_hash(c0, c1) \
	((((c0) * 8 + (c1) * 2) + (((c0) * 8 + (c1) * 2) >> 5)) & 31)
686 * note that we won't get false positives - special_hash(0,0) is 0 and
687 * entry 0 is filled (by +=), so all the missing ones are OK.
689 static unsigned char hash_results
[32][2] = {
690 #define RES(c0, c1) [special_hash(c0, c1)] = {c0, c1}
691 RES('+', '='), /* 00 */
692 RES('/', '='), /* 01 */
693 RES('^', '='), /* 05 */
694 RES('&', '&'), /* 07 */
695 RES('#', '#'), /* 08 */
696 RES('<', '<'), /* 0a */
697 RES('<', '='), /* 0c */
698 RES('!', '='), /* 0e */
699 RES('%', '='), /* 0f */
700 RES('-', '-'), /* 10 */
701 RES('-', '='), /* 11 */
702 RES('-', '>'), /* 13 */
703 RES('=', '='), /* 15 */
704 RES('&', '='), /* 17 */
705 RES('*', '='), /* 18 */
706 RES('.', '.'), /* 1a */
707 RES('+', '+'), /* 1b */
708 RES('|', '='), /* 1c */
709 RES('>', '='), /* 1d */
710 RES('|', '|'), /* 1e */
711 RES('>', '>') /* 1f */
714 static int code
[32] = {
715 #define CODE(c0, c1, value) [special_hash(c0, c1)] = value
716 CODE('+', '=', SPECIAL_ADD_ASSIGN
), /* 00 */
717 CODE('/', '=', SPECIAL_DIV_ASSIGN
), /* 01 */
718 CODE('^', '=', SPECIAL_XOR_ASSIGN
), /* 05 */
719 CODE('&', '&', SPECIAL_LOGICAL_AND
), /* 07 */
720 CODE('#', '#', SPECIAL_HASHHASH
), /* 08 */
721 CODE('<', '<', SPECIAL_LEFTSHIFT
), /* 0a */
722 CODE('<', '=', SPECIAL_LTE
), /* 0c */
723 CODE('!', '=', SPECIAL_NOTEQUAL
), /* 0e */
724 CODE('%', '=', SPECIAL_MOD_ASSIGN
), /* 0f */
725 CODE('-', '-', SPECIAL_DECREMENT
), /* 10 */
726 CODE('-', '=', SPECIAL_SUB_ASSIGN
), /* 11 */
727 CODE('-', '>', SPECIAL_DEREFERENCE
), /* 13 */
728 CODE('=', '=', SPECIAL_EQUAL
), /* 15 */
729 CODE('&', '=', SPECIAL_AND_ASSIGN
), /* 17 */
730 CODE('*', '=', SPECIAL_MUL_ASSIGN
), /* 18 */
731 CODE('.', '.', SPECIAL_DOTDOT
), /* 1a */
732 CODE('+', '+', SPECIAL_INCREMENT
), /* 1b */
733 CODE('|', '=', SPECIAL_OR_ASSIGN
), /* 1c */
734 CODE('>', '=', SPECIAL_GTE
), /* 1d */
735 CODE('|', '|', SPECIAL_LOGICAL_OR
), /* 1e */
736 CODE('>', '>', SPECIAL_RIGHTSHIFT
) /* 1f */
740 static int get_one_special(int c
, stream_t
*stream
)
745 next
= nextchar(stream
);
748 * Check for numbers, strings, character constants, and comments
752 if (next
>= '0' && next
<= '9')
753 return get_one_number(c
, next
, stream
);
756 return eat_string(next
, stream
, TOKEN_STRING
);
758 return eat_string(next
, stream
, TOKEN_CHAR
);
761 return drop_stream_eoln(stream
);
763 return drop_stream_comment(stream
);
767 * Check for combinations
770 if (cclass
[next
+ 1] & ValidSecond
) {
771 i
= special_hash(c
, next
);
772 if (hash_results
[i
][0] == c
&& hash_results
[i
][1] == next
) {
774 next
= nextchar(stream
);
775 if (value
>= SPECIAL_LEFTSHIFT
&&
776 next
== "==."[value
- SPECIAL_LEFTSHIFT
]) {
778 next
= nextchar(stream
);
784 token
= stream
->token
;
785 token_type(token
) = TOKEN_SPECIAL
;
786 token
->special
= value
;
/* The identifier hash table has 2^IDENT_HASH_BITS buckets. */
#define IDENT_HASH_BITS		(13)
#define IDENT_HASH_SIZE		(1 << IDENT_HASH_BITS)
#define IDENT_HASH_MASK		(IDENT_HASH_SIZE - 1)

/*
 * Incremental identifier hash:
 *   ident_hash_init(c)     - seed with the first character
 *   ident_hash_add(h, c)   - fold in one more character (h * 11 + c)
 *   ident_hash_end(h)      - add the bits above IDENT_HASH_BITS back in
 *                            and mask down to a bucket index
 */
#define ident_hash_init(c)		(c)
#define ident_hash_add(oldhash, c)	((c) + (oldhash) * 11)
#define ident_hash_end(hash)		(((hash) + ((hash) >> IDENT_HASH_BITS)) & IDENT_HASH_MASK)
799 static struct ident
*hash_table
[IDENT_HASH_SIZE
];
800 static int ident_hit
, ident_miss
, idents
;
802 void show_identifier_stats(void)
805 int distribution
[100];
807 fprintf(stderr
, "identifiers: %d hits, %d misses\n",
808 ident_hit
, ident_miss
);
810 for (i
= 0; i
< 100; i
++)
813 for (i
= 0; i
< IDENT_HASH_SIZE
; i
++) {
814 struct ident
* ident
= hash_table
[i
];
823 distribution
[count
]++;
826 for (i
= 0; i
< 100; i
++) {
828 fprintf(stderr
, "%2d: %d buckets\n", i
, distribution
[i
]);
832 struct ident
*alloc_ident(const char *name
, int len
)
834 struct ident
*ident
= __alloc_ident(len
);
835 ident
->symbols
= NULL
;
838 memcpy(ident
->name
, name
, len
);
842 static struct ident
* insert_hash(struct ident
*ident
, unsigned long hash
)
844 ident
->next
= hash_table
[hash
];
845 hash_table
[hash
] = ident
;
850 static struct ident
*create_hashed_ident(const char *name
, int len
, unsigned long hash
)
855 p
= &hash_table
[hash
];
856 while ((ident
= *p
) != NULL
) {
857 if (ident
->len
== (unsigned char) len
) {
858 if (strncmp(name
, ident
->name
, len
) != 0)
868 ident
= alloc_ident(name
, len
);
876 static unsigned long hash_name(const char *name
, int len
)
879 const unsigned char *p
= (const unsigned char *)name
;
881 hash
= ident_hash_init(*p
++);
883 unsigned int i
= *p
++;
884 hash
= ident_hash_add(hash
, i
);
886 return ident_hash_end(hash
);
889 struct ident
*hash_ident(struct ident
*ident
)
891 return insert_hash(ident
, hash_name(ident
->name
, ident
->len
));
/*
 * Obtain the identifier for the NUL-terminated string @name.
 *
 * The length is taken with strlen(), the name is hashed with hash_name(),
 * and both are handed to create_hashed_ident(), whose result is returned.
 */
struct ident *built_in_ident(const char *name)
{
	const int length = strlen(name);
	const unsigned long hash = hash_name(name, length);

	return create_hashed_ident(name, length, hash);
}
900 struct token
*built_in_token(int stream
, struct ident
*ident
)
904 token
= __alloc_token(0);
905 token
->pos
.stream
= stream
;
906 token_type(token
) = TOKEN_IDENT
;
907 token
->ident
= ident
;
911 static int get_one_identifier(int c
, stream_t
*stream
)
920 hash
= ident_hash_init(c
);
923 next
= nextchar(stream
);
924 if (!(cclass
[next
+ 1] & (Letter
| Digit
)))
926 if (len
>= sizeof(buf
))
928 hash
= ident_hash_add(hash
, next
);
932 if (cclass
[next
+ 1] & Quote
) {
933 if (len
== 1 && buf
[0] == 'L') {
935 return eat_string(nextchar(stream
), stream
,
938 return eat_string(nextchar(stream
), stream
,
942 hash
= ident_hash_end(hash
);
943 ident
= create_hashed_ident(buf
, len
, hash
);
946 token
= stream
->token
;
947 token_type(token
) = TOKEN_IDENT
;
948 token
->ident
= ident
;
953 static int get_one_token(int c
, stream_t
*stream
)
955 long class = cclass
[c
+ 1];
957 return get_one_number(c
, nextchar(stream
), stream
);
959 return get_one_identifier(c
, stream
);
960 return get_one_special(c
, stream
);
963 static struct token
*setup_stream(stream_t
*stream
, int idx
, int fd
,
964 unsigned char *buf
, unsigned int buf_size
)
971 stream
->whitespace
= 0;
974 stream
->token
= NULL
;
977 stream
->size
= buf_size
;
978 stream
->buffer
= buf
;
980 begin
= alloc_token(stream
);
981 token_type(begin
) = TOKEN_STREAMBEGIN
;
982 stream
->tokenlist
= &begin
->next
;
986 static struct token
*tokenize_stream(stream_t
*stream
)
988 int c
= nextchar(stream
);
991 struct token
*token
= alloc_token(stream
);
992 stream
->token
= token
;
994 stream
->whitespace
= 0;
995 c
= get_one_token(c
, stream
);
998 stream
->whitespace
= 1;
999 c
= nextchar(stream
);
1001 return mark_eof(stream
);
1004 struct token
* tokenize_buffer(void *buffer
, unsigned long size
, struct token
**endtoken
)
1007 struct token
*begin
;
1009 begin
= setup_stream(&stream
, 0, -1, buffer
, size
);
1010 *endtoken
= tokenize_stream(&stream
);
1014 struct token
* tokenize(const char *name
, int fd
, struct token
*endtoken
, const char **next_path
)
1016 struct token
*begin
, *end
;
1018 unsigned char buffer
[BUFSIZE
];
1021 idx
= init_stream(name
, fd
, next_path
);
1023 // info(endtoken->pos, "File %s is const", name);
1027 begin
= setup_stream(&stream
, idx
, fd
, buffer
, 0);
1028 end
= tokenize_stream(&stream
);
1030 end
->next
= endtoken
;