tokenizer.c

   1 #include <stdint.h>
   2 #include <stdio.h>
   3 #include <ctype.h>
   4 #include <string.h>
   5 #include <assert.h>
   6
   7 #include "tokenizer.h"
   8
   9 void tokenizer_set_filename(struct tokenizer *t, const char* fn) {
  10         t->filename = fn;
  11 }
  12
  /* Element count of the array X. X must be an actual array, not a pointer.
   * The argument is parenthesized so that any array-valued expression
   * can be passed safely. */
  #define ARRAY_SIZE(X) (sizeof(X)/sizeof((X)[0]))
  14
  15 off_t tokenizer_ftello(struct tokenizer *t) {
  16         return ftello(t->input)-t->getc_buf.buffered;
  17 }
  18
  19 static int tokenizer_ungetc(struct tokenizer *t, int c)
  20 {
  21         ++t->getc_buf.buffered;
  22         assert(t->getc_buf.buffered<ARRAY_SIZE(t->getc_buf.buf));
  23         assert(t->getc_buf.cnt > 0);
  24         --t->getc_buf.cnt;
  25         assert(t->getc_buf.buf[t->getc_buf.cnt % ARRAY_SIZE(t->getc_buf.buf)] == c);
  26         return c;
  27 }
  28 static int tokenizer_getc(struct tokenizer *t)
  29 {
  30         int c;
  31         if(t->getc_buf.buffered) {
  32                 t->getc_buf.buffered--;
  33                 c = t->getc_buf.buf[(t->getc_buf.cnt) % ARRAY_SIZE(t->getc_buf.buf)];
  34         } else {
  35                 c = getc(t->input);
  36                 t->getc_buf.buf[t->getc_buf.cnt % ARRAY_SIZE(t->getc_buf.buf)] = c;
  37         }
  38         ++t->getc_buf.cnt;
  39         return c;
  40 }
  41
  42 int tokenizer_peek(struct tokenizer *t) {
  43         if(t->peeking) return t->peek_token.value;
  44         int ret = tokenizer_getc(t);
  45         if(ret != EOF) tokenizer_ungetc(t, ret);
  46         return ret;
  47 }
  48
  49 int tokenizer_peek_token(struct tokenizer *t, struct token *tok) {
  50         int ret = tokenizer_next(t, tok);
  51         t->peek_token = *tok;
  52         t->peeking = 1;
  53         return ret;
  54 }
  55
  56 void tokenizer_register_custom_token(struct tokenizer*t, int tokentype, const char* str) {
  57         assert(tokentype >= TT_CUSTOM && tokentype < TT_CUSTOM + MAX_CUSTOM_TOKENS);
  58         int pos = tokentype - TT_CUSTOM;
  59         t->custom_tokens[pos] = str;
  60         if(pos+1 > t->custom_count) t->custom_count = pos+1;
  61 }
  62
  63 const char* tokentype_to_str(enum tokentype tt) {
  64         switch((unsigned) tt) {
  65                 case TT_IDENTIFIER: return "iden";
  66                 case TT_WIDECHAR_LIT: return "widechar";
  67                 case TT_WIDESTRING_LIT: return "widestring";
  68                 case TT_SQSTRING_LIT: return "single-quoted string";
  69                 case TT_DQSTRING_LIT: return "double-quoted string";
  70                 case TT_ELLIPSIS: return "ellipsis";
  71                 case TT_HEX_INT_LIT: return "hexint";
  72                 case TT_OCT_INT_LIT: return "octint";
  73                 case TT_DEC_INT_LIT: return "decint";
  74                 case TT_FLOAT_LIT: return "float";
  75                 case TT_SEP: return "separator";
  76                 case TT_UNKNOWN: return "unknown";
  77                 case TT_OVERFLOW: return "overflow";
  78                 case TT_EOF: return "eof";
  79         }
  80         return "????";
  81 }
  82
  /* Check whether the string `p` consists solely of a valid integer-literal
   * suffix made of 'u' and 'l' characters (case-insensitive).
   *
   * Accepted: any single u/l, "ul", "lu", "ll", "ull", "llu".
   * Returns 1 for an accepted suffix, 0 otherwise (including the empty
   * string and anything containing other characters). */
  static int has_ul_tail(const char *p) {
          char tail[4];
          int tc = 0;
          /* collect at most 4 suffix chars; 4 is already too many to be valid */
          for(; *p && tc < 4; p++) {
                  /* <ctype.h> functions require an unsigned char value or EOF */
                  int c = tolower((unsigned char) *p);
                  if(c != 'u' && c != 'l') return 0;
                  tail[tc++] = c;
          }
          switch(tc) {
          case 1:
                  return 1;
          case 2:
                  return !memcmp(tail, "lu", 2) ||
                         !memcmp(tail, "ul", 2) ||
                         !memcmp(tail, "ll", 2);
          case 3:
                  return !memcmp(tail, "llu", 3) ||
                         !memcmp(tail, "ull", 3);
          default:
                  return 0;
          }
  }
 108
 /* Check whether `s` is a hexadecimal integer literal: an optional leading
  * '-', then "0x" or "0X", at least one hex digit, and optionally a valid
  * u/l suffix (see has_ul_tail). Returns 1 if so, 0 otherwise.
  *
  * A bare "0x" without any digit is rejected. */
 static int is_hex_int_literal(const char *s) {
         if(s[0] == '-') s++;
         if(s[0] != '0' || (s[1] != 'x' && s[1] != 'X')) return 0;
         const char *digits = s + 2;
         const char *p = digits;
         /* <ctype.h> functions require an unsigned char value or EOF */
         while(isxdigit((unsigned char) *p)) p++;
         if(p == digits) return 0;       /* no hex digit after the prefix */
         if(!*p) return 1;               /* digits up to the end */
         return has_ul_tail(p);          /* whatever follows must be a suffix */
 }
 124
 /* Returns 1 if `c` is a sign character ('+' or '-'), 0 otherwise. */
 static int is_plus_or_minus(int c) {
         switch(c) {
         case '+':
         case '-':
                 return 1;
         default:
                 return 0;
         }
 }
 128
 /* Check whether `str` is a decimal integer literal: an optional sign, at
  * least one decimal digit, and optionally a valid u/l suffix (see
  * has_ul_tail). A leading '0' followed by another digit is rejected, so
  * that such strings can be classified as octal instead.
  * Returns 1 if so, 0 otherwise.
  *
  * An empty string or a lone sign is rejected. */
 static int is_dec_int_literal(const char *str) {
         const char *digits = str;
         if(is_plus_or_minus(digits[0])) digits++;
         /* <ctype.h> functions require an unsigned char value or EOF */
         if(digits[0] == '0' && isdigit((unsigned char) digits[1])) return 0;
         const char *s = digits;
         while(isdigit((unsigned char) *s)) s++;
         if(s == digits) return 0;       /* no digit at all */
         if(!*s) return 1;               /* digits up to the end */
         return has_ul_tail(s);          /* whatever follows must be a suffix */
 }
 145
 /* Check whether `str` is a floating point literal.
  *
  * Accepted form: optional sign, decimal digits with at most one '.', an
  * optional exponent ('e'/'E', optional sign, at least one digit), and an
  * optional final 'f'/'F'. At least one digit is required, plus either a
  * dot or an exponent (otherwise the string is an integer, not a float).
  * A second exponent or a dot after the exponent is rejected.
  * Returns 1 if the string matches, 0 otherwise. */
 static int is_float_literal(const char *str) {
         const char *s = str;
         if(is_plus_or_minus(s[0])) s++;
         int got_dot = 0, got_e = 0, got_digits = 0;
         for(; *s; s++) {
                 /* <ctype.h> functions require an unsigned char value or EOF */
                 unsigned char ch = (unsigned char) *s;
                 int l = tolower(ch);
                 if(ch == '.') {
                         /* only one dot, and only in the mantissa */
                         if(got_dot || got_e) return 0;
                         got_dot = 1;
                 } else if(l == 'f') {
                         /* the suffix must be the very last character */
                         return s[1] == 0 && (got_dot || got_e) && got_digits;
                 } else if(isdigit(ch)) {
                         got_digits = 1;
                 } else if(l == 'e') {
                         /* exponent needs a mantissa and may appear only once */
                         if(!got_digits || got_e) return 0;
                         s++;
                         if(is_plus_or_minus(*s)) s++;
                         /* at least one exponent digit; the loop increment
                          * steps over it, further digits are handled above */
                         if(!isdigit((unsigned char) *s)) return 0;
                         got_e = 1;
                 } else return 0;
         }
         return got_digits && (got_e || got_dot);
 }
 172
 /* Check whether the character range [s, until) could be the mantissa of a
  * floating point number, i.e. contains only decimal digits and at most
  * one '.'.
  *
  * Returns 0 if any other character (or a second dot) is found. Otherwise
  * returns a bit mask: bit 0 is set if at least one digit was seen, bit 1
  * if a dot was seen (so 1 = digits only, 2 = dot only, 3 = both, and 0
  * also for an empty range). */
 static int is_valid_float_until(const char*s, const char* until) {
         int got_digits = 0, got_dot = 0;
         for(; s < until; ++s) {
                 /* <ctype.h> functions require an unsigned char value or EOF */
                 if(isdigit((unsigned char) *s)) got_digits = 1;
                 else if(*s == '.' && !got_dot) got_dot = 1;
                 else return 0;
         }
         return got_digits | (got_dot << 1);
 }
 185
 /* Check whether `s` is an octal integer literal: an optional leading '-',
  * a '0', any number of further octal digits, and optionally a valid u/l
  * suffix (see has_ul_tail), matching what the hex and decimal classifiers
  * accept. Returns 1 if so, 0 otherwise. */
 static int is_oct_int_literal(const char *s) {
         if(s[0] == '-') s++;
         if(s[0] != '0') return 0;
         while(*s >= '0' && *s <= '7') s++;
         if(!*s) return 1;               /* octal digits up to the end */
         return has_ul_tail(s);          /* whatever follows must be a suffix */
 }
 195
 /* Classify one character for identifier matching (ASCII only):
  * 1 = letter or underscore, 2 = decimal digit, 0 = anything else,
  * including every character with the high bit set and the NUL byte. */
 static int ident_char_class(char ch) {
         if(ch & 128) return 0;
         if(ch == '_') return 1;
         if(ch >= 'A' && ch <= 'Z') return 1;
         if(ch >= 'a' && ch <= 'z') return 1;
         if(ch >= '0' && ch <= '9') return 2;
         return 0;
 }

 /* Check whether `s` is an identifier: a letter or underscore followed by
  * any number of letters, underscores and digits. Only ASCII characters
  * are accepted. Returns 1 if so, 0 otherwise (including the empty
  * string). */
 static int is_identifier(const char *s) {
         /* the first character must not be a digit */
         if(ident_char_class(*s) != 1) return 0;
         for(++s; *s; ++s) {
                 if(ident_char_class(*s) == 0)
                         return 0;
         }
         return 1;
 }
 226
 227 static enum tokentype categorize(const char *s) {
 228         if(is_hex_int_literal(s)) return TT_HEX_INT_LIT;
 229         if(is_dec_int_literal(s)) return TT_DEC_INT_LIT;
 230         if(is_oct_int_literal(s)) return TT_OCT_INT_LIT;
 231         if(is_float_literal(s)) return TT_FLOAT_LIT;
 232         if(is_identifier(s)) return TT_IDENTIFIER;
 233         return TT_UNKNOWN;
 234 }
 235
 236
 /* Returns 1 if `c` is a separator character, i.e. one that terminates the
  * token being accumulated: tab, newline, space and the ASCII punctuation
  * characters listed in the table below. Returns 0 for everything else,
  * including EOF and any value outside the ASCII range. */
 static int is_sep(int c) {
         static const char ascmap[128] = {
                 ['\t'] = 1, ['\n'] = 1, [' '] = 1, ['!'] = 1,
                 ['\"'] = 1, ['#'] = 1, ['%'] = 1, ['&'] = 1,
                 ['\''] = 1, ['('] = 1, [')'] = 1, ['*'] = 1,
                 ['+'] = 1, [','] = 1, ['-'] = 1, ['.'] = 1,
                 ['/'] = 1, [':'] = 1, [';'] = 1, ['<'] = 1,
                 ['='] = 1, ['>'] = 1, ['?'] = 1, ['['] = 1,
                 ['\\'] = 1, [']'] = 1, ['{'] = 1, ['|'] = 1,
                 ['}'] = 1, ['~'] = 1, ['^'] = 1,
         };
         /* explicit range check: testing bit 7 alone does not keep values
          * >= 256 or arbitrary negative values out of the table */
         return c >= 0 && c < 128 && ascmap[c];
 }
 250
 251 static int apply_coords(struct tokenizer *t, struct token* out, char *end, int retval) {
 252         out->line = t->line;
 253         uintptr_t len = end - t->buf;
 254         out->column = t->column - len;
 255         if(len + 1 >= t->bufsize) {
 256                 out->type = TT_OVERFLOW;
 257                 return 0;
 258         }
 259         return retval;
 260 }
 261
 262 static inline char *assign_bufchar(struct tokenizer *t, char *s, int c) {
 263         t->column++;
 264         *s = c;
 265         return s + 1;
 266 }
 267
 268 static int get_string(struct tokenizer *t, char quote_char, struct token* out, int wide) {
 269         char *s = t->buf+1;
 270         int escaped = 0;
 271         char *end = t->buf + t->bufsize - 2;
 272         while(s < end) {
 273                 int c = tokenizer_getc(t);
 274                 if(c == EOF) {
 275                         out->type = TT_EOF;
 276                         *s = 0;
 277                         return apply_coords(t, out, s, 0);
 278                 }
 279                 if(c == '\\') {
 280                         c = tokenizer_getc(t);
 281                         if(c == '\n') continue;
 282                         tokenizer_ungetc(t, c);
 283                         c = '\\';
 284                 }
 285                 if(c == '\n') {
 286                         if(escaped) {
 287                                 escaped = 0;
 288                                 continue;
 289                         }
 290                         tokenizer_ungetc(t, c);
 291                         out->type = TT_UNKNOWN;
 292                         s = assign_bufchar(t, s, 0);
 293                         return apply_coords(t, out, s, 0);
 294                 }
 295                 if(!escaped) {
 296                         if(c == quote_char) {
 297                                 s = assign_bufchar(t, s, c);
 298                                 *s = 0;
 299                                 //s = assign_bufchar(t, s, 0);
 300                                 if(!wide)
 301                                         out->type = (quote_char == '"'? TT_DQSTRING_LIT : TT_SQSTRING_LIT);
 302                                 else
 303                                         out->type = (quote_char == '"'? TT_WIDESTRING_LIT : TT_WIDECHAR_LIT);
 304                                 return apply_coords(t, out, s, 1);
 305                         }
 306                         if(c == '\\') escaped = 1;
 307                 } else {
 308                         escaped = 0;
 309                 }
 310                 s = assign_bufchar(t, s, c);
 311         }
 312         t->buf[MAX_TOK_LEN-1] = 0;
 313         out->type = TT_OVERFLOW;
 314         return apply_coords(t, out, s, 0);
 315 }
 316
 /* Test whether the input, starting with the already-read character `c`,
  * continues with the character sequence `which`.
  *
  * Returns 1 if the whole sequence matched; the sequence is then consumed,
  * so the next read starts right after it.
  * Returns 0 if `which` is NULL or empty, or if the input differs; in that
  * case every character read here is pushed back, so that only `c` itself
  * (which the caller already holds) remains consumed. */
 static int sequence_follows(struct tokenizer *t, int c, const char *which)
 {
         size_t matched = 0;
         if(which == NULL || which[0] == '\0') return 0;
         for(;;) {
                 if(c != which[matched]) break;
                 ++matched;
                 if(which[matched] == '\0') return 1;
                 c = tokenizer_getc(t);
         }
         /* mismatch: push back the offending character followed by the
          * matched ones in reverse order, down to but excluding which[0] */
         while(matched > 0) {
                 tokenizer_ungetc(t, c);
                 --matched;
                 c = which[matched];
         }
         return 0;
 }
 333
 334 int tokenizer_skip_chars(struct tokenizer *t, const char *chars, int *count) {
 335         assert(!t->peeking);
 336         int c;
 337         *count = 0;
 338         while(1) {
 339                 c = tokenizer_getc(t);
 340                 if(c == EOF) return 0;
 341                 const char *s = chars;
 342                 int match = 0;
 343                 while(*s) {
 344                         if(c==*s) {
 345                                 ++(*count);
 346                                 match = 1;
 347                                 break;
 348                         }
 349                         ++s;
 350                 }
 351                 if(!match) {
 352                         tokenizer_ungetc(t, c);
 353                         return 1;
 354                 }
 355         }
 356
 357 }
 358
 359 int tokenizer_read_until(struct tokenizer *t, const char* marker, int stop_at_nl)
 360 {
 361         int c, marker_is_nl = !strcmp(marker, "\n");
 362         char *s = t->buf;
 363         while(1) {
 364                 c = tokenizer_getc(t);
 365                 if(c == EOF) {
 366                         *s = 0;
 367                         return 0;
 368                 }
 369                 if(c == '\n') {
 370                         t->line++;
 371                         t->column = 0;
 372                         if(stop_at_nl) {
 373                                 *s = 0;
 374                                 if(marker_is_nl) return 1;
 375                                 return 0;
 376                         }
 377                 }
 378                 if(!sequence_follows(t, c, marker))
 379                         s = assign_bufchar(t, s, c);
 380                 else
 381                         break;
 382         }
 383         *s = 0;
 384         size_t i;
 385         for(i=strlen(marker); i > 0; )
 386                 tokenizer_ungetc(t, marker[--i]);
 387         return 1;
 388 }
 389 static int ignore_until(struct tokenizer *t, const char* marker, int col_advance)
 390 {
 391         t->column += col_advance;
 392         int c;
 393         do {
 394                 c = tokenizer_getc(t);
 395                 if(c == EOF) return 0;
 396                 if(c == '\n') {
 397                         t->line++;
 398                         t->column = 0;
 399                 } else t->column++;
 400         } while(!sequence_follows(t, c, marker));
 401         t->column += strlen(marker)-1;
 402         return 1;
 403 }
 404
 /* Discard input up to and including the sequence `marker`.
  * Reaching end of input before the marker is not reported. */
 void tokenizer_skip_until(struct tokenizer *t, const char *marker)
 {
         (void) ignore_until(t, marker, 0);
 }
 409
 410 int tokenizer_next_real(struct tokenizer *t, struct token* out) {
 411         char *s = t->buf;
 412         out->value = 0;
 413         int c = 0;
 414         if(t->peeking) {
 415                 *out = t->peek_token;
 416                 t->peeking = 0;
 417                 return 1;
 418         }
 419         while(1) {
 420                 c = tokenizer_getc(t);
 421                 if(c == EOF) break;
 422
 423                 /* components of multi-line comment marker might be terminals themselves */
 424                 if(sequence_follows(t, c, t->marker[MT_MULTILINE_COMMENT_START])) {
 425                         ignore_until(t, t->marker[MT_MULTILINE_COMMENT_END], strlen(t->marker[MT_MULTILINE_COMMENT_START]));
 426                         continue;
 427                 }
 428                 if(sequence_follows(t, c, t->marker[MT_SINGLELINE_COMMENT_START])) {
 429                         ignore_until(t, "\n", strlen(t->marker[MT_SINGLELINE_COMMENT_START]));
 430                         continue;
 431                 }
 432                 if(is_sep(c)) {
 433                         if(s != t->buf && c == '\\' && !isspace(s[-1])) {
 434                                 c = tokenizer_getc(t);
 435                                 if(c == '\n') continue;
 436                                 tokenizer_ungetc(t, c);
 437                                 c = '\\';
 438                         } else if(is_plus_or_minus(c) && s > t->buf+1 &&
 439                                   (s[-1] == 'E' || s[-1] == 'e') && is_valid_float_until(t->buf, s-1)) {
 440                                 goto process_char;
 441                         } else if(c == '.' && s != t->buf && is_valid_float_until(t->buf, s) == 1) {
 442                                 goto process_char;
 443                         } else if(c == '.' && s == t->buf) {
 444                                 int jump = 0;
 445                                 c = tokenizer_getc(t);
 446                                 if(isdigit(c)) jump = 1;
 447                                 tokenizer_ungetc(t, c);
 448                                 c = '.';
 449                                 if(jump) goto process_char;
 450                         }
 451                         tokenizer_ungetc(t, c);
 452                         break;
 453                 }
 454                 if((t->flags & TF_PARSE_WIDE_STRINGS) && s == t->buf && c == 'L') {
 455                         c = tokenizer_getc(t);
 456                         tokenizer_ungetc(t, c);
 457                         tokenizer_ungetc(t, 'L');
 458                         if(c == '\'' || c == '\"') break;
 459                 }
 460
 461 process_char:;
 462                 s = assign_bufchar(t, s, c);
 463                 if(t->column + 1 >= MAX_TOK_LEN) {
 464                         out->type = TT_OVERFLOW;
 465                         return apply_coords(t, out, s, 0);
 466                 }
 467         }
 468         if(s == t->buf) {
 469                 if(c == EOF) {
 470                         out->type = TT_EOF;
 471                         return apply_coords(t, out, s, 1);
 472                 }
 473
 474                 int wide = 0;
 475                 c = tokenizer_getc(t);
 476                 if((t->flags & TF_PARSE_WIDE_STRINGS) && c == 'L') {
 477                         c = tokenizer_getc(t);
 478                         assert(c == '\'' || c == '\"');
 479                         wide = 1;
 480                         goto string_handling;
 481                 } else if (c == '.' && sequence_follows(t, c, "...")) {
 482                         strcpy(t->buf, "...");
 483                         out->type = TT_ELLIPSIS;
 484                         return apply_coords(t, out, s+3, 1);
 485                 }
 486
 487                 {
 488                         int i;
 489                         for(i = 0; i < t->custom_count; i++)
 490                                 if(sequence_follows(t, c, t->custom_tokens[i])) {
 491                                         const char *p = t->custom_tokens[i];
 492                                         while(*p) {
 493                                                 s = assign_bufchar(t, s, *p);
 494                                                 p++;
 495                                         }
 496                                         *s = 0;
 497                                         out->type = TT_CUSTOM + i;
 498                                         return apply_coords(t, out, s, 1);
 499                                 }
 500                 }
 501
 502 string_handling:
 503                 s = assign_bufchar(t, s, c);
 504                 *s = 0;
 505                 //s = assign_bufchar(t, s, 0);
 506                 if(c == '"' || c == '\'')
 507                         if(t->flags & TF_PARSE_STRINGS) return get_string(t, c, out, wide);
 508                 out->type = TT_SEP;
 509                 out->value = c;
 510                 if(c == '\n') {
 511                         apply_coords(t, out, s, 1);
 512                         t->line++;
 513                         t->column=0;
 514                         return 1;
 515                 }
 516                 return apply_coords(t, out, s, 1);
 517         }
 518         //s = assign_bufchar(t, s, 0);
 519         *s = 0;
 520         out->type = categorize(t->buf);
 521         return apply_coords(t, out, s, out->type != TT_UNKNOWN);
 522 }
 523
 /* Read the next token into `out`; thin wrapper around
  * tokenizer_next_real() that additionally dumps every token to stderr
  * when the file is compiled with TDEBUG set. Returns the wrapped
  * function's result. */
 int tokenizer_next(struct tokenizer *t, struct token* out) {
         const int status = tokenizer_next_real(t, out);
 #if TDEBUG
         fprintf(stderr, "<%s:%p> <%s>'%c' = \"%s\"\n",
                 t->filename, t->input,
                 tokentype_to_str(out->type), out->value, t->buf);
 #endif
         return status;
 }
 533
 534 void tokenizer_set_flags(struct tokenizer *t, int flags) {
 535         t->flags = flags;
 536 }
 537
 538 int tokenizer_get_flags(struct tokenizer *t) {
 539         return t->flags;
 540 }
 541
 542 void tokenizer_init(struct tokenizer *t, FILE* in, int flags) {
 543         *t = (struct tokenizer){ .input = in, .line = 1, .flags = flags, .bufsize = MAX_TOK_LEN};
 544 }
 545
 546 void tokenizer_register_marker(struct tokenizer *t, enum markertype mt, const char* marker)
 547 {
 548         t->marker[mt] = marker;
 549 }
 550
 551 int tokenizer_rewind(struct tokenizer *t) {
 552         FILE *f = t->input;
 553         int flags = t->flags;
 554         const char* fn = t->filename;
 555         tokenizer_init(t, f, flags);
 556         tokenizer_set_filename(t, fn);
 557         return fseek(f, 0, SEEK_SET) == 0;
 558 }