2 /* Tokenizer implementation */
5 #include "pgenheaders.h"
10 #include "tokenizer.h"
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
21 #define is_potential_identifier_start(c) (\
22 (c >= 'a' && c <= 'z')\
23 || (c >= 'A' && c <= 'Z')\
27 #define is_potential_identifier_char(c) (\
28 (c >= 'a' && c <= 'z')\
29 || (c >= 'A' && c <= 'Z')\
30 || (c >= '0' && c <= '9')\
34 extern char *PyOS_Readline(FILE *, FILE *, char *);
35 /* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
39 /* Don't ever change this -- it would break the portability of Python code */
43 static struct tok_state
*tok_new(void);
44 static int tok_nextc(struct tok_state
*tok
);
45 static void tok_backup(struct tok_state
*tok
, int c
);
/* Token-name lookup table indexed by token type.
 * NOTE(review): fragmentary listing — the table's entries are missing here;
 * the comment below says it must stay in sync with token.h's #defines. */
50 char *_PyParser_TokenNames
[] = {
103 /* This table must match the #defines in token.h! */
/* tok_new(): allocate a tok_state with PyMem_MALLOC and set its fields to
 * safe defaults (NULL buffers, STATE_INIT decoding state, no encoding yet).
 * NOTE(review): fragmentary listing — the NULL-allocation check, several
 * field initializations and the return statement are not visible here. */
110 /* Create and initialize a new tok_state structure */
112 static struct tok_state
*
115 struct tok_state
*tok
= (struct tok_state
*)PyMem_MALLOC(
116 sizeof(struct tok_state
));
/* All buffer pointers start NULL; tok_nextc allocates lazily. */
119 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
122 tok
->tabsize
= TABSIZE
;
124 tok
->indstack
[0] = 0;
127 tok
->prompt
= tok
->nextprompt
= NULL
;
130 tok
->filename
= NULL
;
134 tok
->altindstack
[0] = 0;
/* Encoding detection has not run yet. */
135 tok
->decoding_state
= STATE_INIT
;
136 tok
->decoding_erred
= 0;
137 tok
->read_coding_spec
= 0;
139 tok
->encoding
= NULL
;
142 tok
->decoding_readline
= NULL
;
143 tok
->decoding_buffer
= NULL
;
/* decoding_fgets() (plain variant): read a raw line with fgets from tok->fp.
 * NOTE(review): fragmentary — presumably the PGEN build's stub; confirm
 * against the surrounding #ifdef, which is not visible here. */
151 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
153 return fgets(s
, size
, tok
->fp
);
/* decoding_feof() (plain variant): end-of-file test on tok->fp.
 * NOTE(review): fragmentary listing; return type line is missing. */
157 decoding_feof(struct tok_state
*tok
)
159 return feof(tok
->fp
);
/* decode_str() (plain variant) — only the signature survives in this
 * listing; body not visible. */
163 decode_str(const char *str
, struct tok_state
*tok
)
/* error_ret(): record a decoding error on TOK and return NULL so callers
 * treat it like EOF. Frees tok->buf when reading from a file, because
 * PyTokenizer_Free only frees buf when fp is set (see that function).
 * NOTE(review): fragmentary — the line that NULLs tok->buf after the free
 * is not visible here; confirm buf is reset to avoid a dangling pointer. */
171 error_ret(struct tok_state
*tok
) /* XXX */
173 tok
->decoding_erred
= 1;
174 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
) /* see PyTokenizer_Free */
175 PyMem_FREE(tok
->buf
);
177 return NULL
; /* as if it were EOF */
/* new_string(): heap-copy LEN bytes of S via PyMem_MALLOC, allocating one
 * extra byte. NOTE(review): fragmentary — the NUL-termination of the extra
 * byte and the return statement are not visible in this listing. */
181 new_string(const char *s
, Py_ssize_t len
)
183 char* result
= (char *)PyMem_MALLOC(len
+ 1);
184 if (result
!= NULL
) {
185 memcpy(result
, s
, len
);
/* get_normal_name(): canonicalize an encoding name — anything starting
 * with "utf-8" collapses to utf-8, and the latin-1 family of aliases
 * collapses to a single name. NOTE(review): fragmentary — the local `buf`
 * (a lowercased/truncated copy built by the visible 12-char loop, per the
 * strcmp targets) and the return statements are not visible here. */
192 get_normal_name(char *s
) /* for utf-8 and latin-1 */
196 for (i
= 0; i
< 12; i
++) {
206 if (strcmp(buf
, "utf-8") == 0 ||
207 strncmp(buf
, "utf-8-", 6) == 0)
209 else if (strcmp(buf
, "latin-1") == 0 ||
210 strcmp(buf
, "iso-8859-1") == 0 ||
211 strcmp(buf
, "iso-latin-1") == 0 ||
212 strncmp(buf
, "latin-1-", 8) == 0 ||
213 strncmp(buf
, "iso-8859-1-", 11) == 0 ||
214 strncmp(buf
, "iso-latin-1-", 12) == 0)
/* get_coding_spec(): scan a source line for a PEP 263 "coding: <name>"
 * declaration inside a comment and return a heap copy of the normalized
 * encoding name (via new_string/get_normal_name), or NULL if absent.
 * NOTE(review): fragmentary — the '#'-comment check, the begin/t pointer
 * advances, and the returns are not visible in this listing. */
220 /* Return the coding spec in S, or NULL if none is found. */
223 get_coding_spec(const char *s
, Py_ssize_t size
)
226 /* Coding spec must be in a comment, and that comment must be
227 * the only statement on the source code line. */
/* Skip leading whitespace/formfeed before the comment. */
228 for (i
= 0; i
< size
- 6; i
++) {
231 if (s
[i
] != ' ' && s
[i
] != '\t' && s
[i
] != '\014')
234 for (; i
< size
- 6; i
++) { /* XXX inefficient search */
235 const char* t
= s
+ i
;
236 if (strncmp(t
, "coding", 6) == 0) {
237 const char* begin
= NULL
;
239 if (t
[0] != ':' && t
[0] != '=')
243 } while (t
[0] == '\x20' || t
[0] == '\t');
/* Encoding names are alnum plus '-', '_', '.'. */
246 while (isalnum(Py_CHARMASK(t
[0])) ||
247 t
[0] == '-' || t
[0] == '_' || t
[0] == '.')
251 char* r
= new_string(begin
, t
- begin
);
252 char* q
= get_normal_name(r
);
/* Replace with the canonical spelling when it differs. */
255 r
= new_string(q
, strlen(q
));
/* check_coding_spec(): if LINE carries a coding spec, install it — either
 * by switching the reader via set_readline() (non-utf-8) or by recording
 * it; if a BOM already fixed tok->encoding, the spec must agree with it.
 * Returns 1 on success, 0 on failure (SyntaxError set below).
 * NOTE(review): fragmentary — continuation-line early return, the assignment
 * of cs into tok->encoding, and error cleanup paths are not visible. */
264 /* Check whether the line contains a coding spec. If it does,
265 invoke the set_readline function for the new encoding.
266 This function receives the tok_state and the new encoding.
267 Return 1 on success, 0 on failure. */
270 check_coding_spec(const char* line
, Py_ssize_t size
, struct tok_state
*tok
,
271 int set_readline(struct tok_state
*, const char *))
277 /* It's a continuation line, so it can't be a coding spec. */
279 cs
= get_coding_spec(line
, size
);
281 tok
->read_coding_spec
= 1;
282 if (tok
->encoding
== NULL
) {
283 assert(tok
->decoding_state
== STATE_RAW
);
/* utf-8 needs no reader switch; anything else does. */
284 if (strcmp(cs
, "utf-8") == 0) {
287 r
= set_readline(tok
, cs
);
290 tok
->decoding_state
= STATE_NORMAL
;
295 } else { /* then, compare cs with BOM */
296 r
= (strcmp(tok
->encoding
, cs
) == 0);
304 PyErr_Format(PyExc_SyntaxError
, "encoding problem: %s", cs
);
/* check_bom(): peek at the first bytes via get_char/unget_char; on a UTF-8
 * BOM set tok->encoding to "utf-8", on UTF-16 BOMs (currently disabled by
 * the #if'd-out region noted below) switch the reader. Non-BOM bytes are
 * pushed back so tokenization sees them. Returns 1 on success, 0 on failure.
 * NOTE(review): fragmentary — the get_char() calls for ch1/ch2/ch3, the
 * 0xBB/0xBF byte comparisons and the returns are not visible here. */
309 /* See whether the file starts with a BOM. If it does,
310 invoke the set_readline function with the new encoding.
311 Return 1 on success, 0 on failure. */
314 check_bom(int get_char(struct tok_state
*),
315 void unget_char(int, struct tok_state
*),
316 int set_readline(struct tok_state
*, const char *),
317 struct tok_state
*tok
)
321 tok
->decoding_state
= STATE_RAW
;
324 } else if (ch1
== 0xEF) {
/* Not a complete UTF-8 BOM: push both bytes back. */
327 unget_char(ch2
, tok
);
328 unget_char(ch1
, tok
);
333 unget_char(ch3
, tok
);
334 unget_char(ch2
, tok
);
335 unget_char(ch1
, tok
);
339 /* Disable support for UTF-16 BOMs until a decision
340 is made whether this needs to be supported. */
341 } else if (ch1
== 0xFE) {
344 unget_char(ch2
, tok
);
345 unget_char(ch1
, tok
);
348 if (!set_readline(tok
, "utf-16-be"))
350 tok
->decoding_state
= STATE_NORMAL
;
351 } else if (ch1
== 0xFF) {
354 unget_char(ch2
, tok
);
355 unget_char(ch1
, tok
);
358 if (!set_readline(tok
, "utf-16-le"))
360 tok
->decoding_state
= STATE_NORMAL
;
363 unget_char(ch1
, tok
);
/* Replace any previously recorded encoding with "utf-8". */
366 if (tok
->encoding
!= NULL
)
367 PyMem_FREE(tok
->encoding
);
368 tok
->encoding
= new_string("utf-8", 5); /* resulting is in utf-8 */
369 /* No need to set_readline: input is already utf-8 */
/* fp_readl(): fetch one decoded line into S (at most size-1 bytes), pulling
 * either the overflow stored in tok->decoding_buffer or a fresh line from
 * tok->decoding_readline; excess bytes are stashed back into
 * tok->decoding_buffer as a bytearray for the next call.
 * NOTE(review): fragmentary — NUL-termination of s, the final return of s,
 * and several error branches are not visible; the trailing error_ret() line
 * presumably belongs to an error label. */
373 /* Read a line of text from TOK into S, using the stream in TOK.
374 Return NULL on failure, else S.
376 On entry, tok->decoding_buffer will be one of:
377 1) NULL: need to call tok->decoding_readline to get a new line
378 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
379 stored the result in tok->decoding_buffer
380 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
381 (in the s buffer) to copy entire contents of the line read
382 by tok->decoding_readline. tok->decoding_buffer has the overflow.
383 In this case, fp_readl is called in a loop (with an expanded buffer)
384 until the buffer ends with a '\n' (or until the end of the file is
385 reached): see tok_nextc and its calls to decoding_fgets.
389 fp_readl(char *s
, int size
, struct tok_state
*tok
)
395 /* Ask for one less byte so we can terminate it */
/* Prefer a pending buffered line over calling readline again. */
399 if (tok
->decoding_buffer
) {
400 bufobj
= tok
->decoding_buffer
;
405 bufobj
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
409 if (PyUnicode_CheckExact(bufobj
))
411 buf
= _PyUnicode_AsStringAndSize(bufobj
, &buflen
);
418 buf
= PyByteArray_AsString(bufobj
);
422 buflen
= PyByteArray_GET_SIZE(bufobj
);
425 Py_XDECREF(tok
->decoding_buffer
);
427 /* Too many chars, the rest goes into tok->decoding_buffer */
428 tok
->decoding_buffer
= PyByteArray_FromStringAndSize(buf
+size
,
430 if (tok
->decoding_buffer
== NULL
)
435 tok
->decoding_buffer
= NULL
;
437 memcpy(s
, buf
, buflen
);
439 if (buflen
== 0) /* EOF */
446 return error_ret(tok
);
/* fp_setreadl(): reopen the input through io.open() with encoding ENC
 * (by filename when available, else by dup of fileno) and store the
 * stream's bound readline in tok->decoding_readline. Returns nonzero iff
 * readline was obtained. NOTE(review): fragmentary — the cleanup label
 * that DECREFs io/stream and the lineno reset described by the comment
 * are not visible in this listing. */
449 /* Set the readline function for TOK to a StreamReader's
450 readline function. The StreamReader is named ENC.
452 This function is called from check_bom and check_coding_spec.
454 ENC is usually identical to the future value of tok->encoding,
455 except for the (currently unsupported) case of UTF-16.
457 Return 1 on success, 0 on failure. */
460 fp_setreadl(struct tok_state
*tok
, const char* enc
)
462 PyObject
*readline
= NULL
, *stream
= NULL
, *io
= NULL
;
464 io
= PyImport_ImportModuleNoBlock("io");
/* With a filename, let io reopen the file from scratch... */
469 stream
= PyObject_CallMethod(io
, "open", "ssis",
470 tok
->filename
, "r", -1, enc
);
/* ...otherwise wrap the existing file descriptor (closefd=False). */
472 stream
= PyObject_CallMethod(io
, "open", "isisOOO",
473 fileno(tok
->fp
), "r", -1, enc
, Py_None
, Py_None
, Py_False
);
477 Py_XDECREF(tok
->decoding_readline
);
478 readline
= PyObject_GetAttrString(stream
, "readline");
479 tok
->decoding_readline
= readline
;
481 /* The file has been reopened; parsing will restart from
482 * the beginning of the file, we have to reset the line number.
483 * But this function has been called from inside tok_nextc() which
484 * will increment lineno before it returns. So we set it -1 so that
485 * the next call to tok_nextc() will start with tok->lineno == 0.
492 return readline
!= NULL
;
/* fp_getc(): one-byte reader used as the get_char callback for check_bom
 * in file mode. */
495 /* Fetch the next byte from TOK. */
497 static int fp_getc(struct tok_state
*tok
) {
498 return getc(tok
->fp
);
/* fp_ungetc(): push-back callback paired with fp_getc.
 * NOTE(review): fragmentary — the body (presumably ungetc on tok->fp)
 * is not visible in this listing. */
501 /* Unfetch the last byte back into TOK. */
503 static void fp_ungetc(int c
, struct tok_state
*tok
) {
/* valid_utf8(): return the byte length of the UTF-8 sequence starting at s,
 * or 0 if the bytes are not well-formed. Continuation bytes must be in
 * [0x80, 0xC0). NOTE(review): fragmentary — the lead-byte classification
 * that sets `expected` and the return statements are not visible here. */
507 /* Check whether the characters at s start a valid
508 UTF-8 sequence. Return the number of characters forming
509 the sequence if yes, 0 if not. */
510 static int valid_utf8(const unsigned char* s
)
515 /* single-byte code */
528 length
= expected
+ 1;
529 for (; expected
; expected
--)
530 if (s
[expected
] < 0x80 || s
[expected
] >= 0xC0)
/* decoding_fgets(): read one line from TOK into S, dispatching on the
 * decoding state — codec-based fp_readl when NORMAL, raw
 * Py_UniversalNewlineFgets when RAW, and BOM detection first when INIT.
 * On the first two lines it also looks for a PEP 263 coding spec, and
 * with no declared encoding it validates the line as UTF-8, raising
 * SyntaxError (via PyErr_Format into buf, then error_ret) otherwise.
 * NOTE(review): fragmentary — the enclosing for(;;) retry loop, `badchar`
 * capture, and the final `return line;` are not visible in this listing. */
535 /* Read a line of input from TOK. Determine encoding
539 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
544 if (tok
->decoding_state
== STATE_NORMAL
) {
545 /* We already have a codec associated with
547 line
= fp_readl(s
, size
, tok
);
549 } else if (tok
->decoding_state
== STATE_RAW
) {
550 /* We want a 'raw' read. */
551 line
= Py_UniversalNewlineFgets(s
, size
,
555 /* We have not yet determined the encoding.
556 If an encoding is found, use the file-pointer
557 reader functions from now on. */
558 if (!check_bom(fp_getc
, fp_ungetc
, fp_setreadl
, tok
))
559 return error_ret(tok
);
560 assert(tok
->decoding_state
!= STATE_INIT
);
/* Coding spec may only appear on line 1 or 2. */
563 if (line
!= NULL
&& tok
->lineno
< 2 && !tok
->read_coding_spec
) {
564 if (!check_coding_spec(line
, strlen(line
), tok
, fp_setreadl
)) {
565 return error_ret(tok
);
569 /* The default encoding is UTF-8, so make sure we don't have any
570 non-UTF-8 sequences in it. */
571 if (line
&& !tok
->encoding
) {
574 for (c
= (unsigned char *)line
; *c
; c
+= length
)
575 if (!(length
= valid_utf8(c
))) {
582 /* Need to add 1 to the line number, since this line
583 has not been counted, yet. */
585 "Non-UTF-8 code starting with '\\x%.2x' "
586 "in file %.200s on line %i, "
587 "but no encoding declared; "
588 "see http://python.org/dev/peps/pep-0263/ for details",
589 badchar
, tok
->filename
, tok
->lineno
+ 1);
590 PyErr_SetString(PyExc_SyntaxError
, buf
);
591 return error_ret(tok
);
/* decoding_feof(): EOF test — plain feof() unless a codec reader is
 * active, in which case peek a line via decoding_readline, cache it in
 * tok->decoding_buffer, and report EOF when its length is 0.
 * NOTE(review): fragmentary — the NULL-buf branch structure and error
 * handling around the readline call are not fully visible. */
598 decoding_feof(struct tok_state
*tok
)
600 if (tok
->decoding_state
!= STATE_NORMAL
) {
601 return feof(tok
->fp
);
603 PyObject
* buf
= tok
->decoding_buffer
;
605 buf
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
/* Cache the peeked line so fp_readl can consume it later. */
610 tok
->decoding_buffer
= buf
;
613 return PyObject_Length(buf
) == 0;
/* buf_getc(): get_char callback for string-based tokenizing; advances
 * tok->str and masks to an unsigned byte. */
617 /* Fetch a byte from TOK, using the string buffer. */
620 buf_getc(struct tok_state
*tok
) {
621 return Py_CHARMASK(*tok
->str
++);
/* buf_ungetc(): push-back for the string buffer. Only asserts the byte
 * matches rather than writing it, since the buffer may be read-only.
 * NOTE(review): fragmentary — the tok->str decrement preceding the assert
 * is not visible in this listing. */
624 /* Unfetch a byte from TOK, using the string buffer. */
627 buf_ungetc(int c
, struct tok_state
*tok
) {
629 assert(Py_CHARMASK(*tok
->str
) == c
); /* tok->cur may point to read-only segment */
/* buf_setreadl(): string-mode counterpart of fp_setreadl — per the
 * comment it only records ENC. NOTE(review): body not visible here
 * (presumably stores enc into tok->enc). */
632 /* Set the readline function for TOK to ENC. For the string-based
633 tokenizer, this means to just record the encoding. */
636 buf_setreadl(struct tok_state
*tok
, const char* enc
) {
/* translate_into_utf8(): decode STR from ENC to a unicode object, then
 * re-encode as a UTF-8 bytes object. NOTE(review): fragmentary — the NULL
 * check on buf, the DECREF, and the return are not visible here. */
641 /* Return a UTF-8 encoding Python string object from the
642 C byte string STR, which is encoded with ENC. */
645 translate_into_utf8(const char* str
, const char* enc
) {
647 PyObject
* buf
= PyUnicode_Decode(str
, strlen(str
), enc
, NULL
);
650 utf8
= PyUnicode_AsUTF8String(buf
);
/* decode_str(): prepare an in-memory source string for tokenizing — strip
 * a BOM (check_bom with the buf_* callbacks), scan the first two physical
 * lines for a coding spec, and when an encoding was found translate the
 * whole buffer to UTF-8, keeping the bytes object alive in
 * tok->decoding_buffer so the returned char* stays valid.
 * NOTE(review): fragmentary — the newl[] assignments inside the scan loop,
 * lineno bookkeeping, and the final return of str are not visible. */
655 /* Decode a byte string STR for use as the buffer of TOK.
656 Look for encoding declarations inside STR, and record them
660 decode_str(const char *str
, struct tok_state
*tok
)
662 PyObject
* utf8
= NULL
;
664 const char *newl
[2] = {NULL
, NULL
};
668 if (!check_bom(buf_getc
, buf_ungetc
, buf_setreadl
, tok
))
669 return error_ret(tok
);
670 str
= tok
->str
; /* string after BOM if any */
/* A BOM fixes the encoding up front: convert immediately. */
672 if (tok
->enc
!= NULL
) {
673 utf8
= translate_into_utf8(str
, tok
->enc
);
675 return error_ret(tok
);
676 str
= PyBytes_AsString(utf8
);
/* Locate the first two line breaks for the coding-spec scan. */
678 for (s
= str
;; s
++) {
679 if (*s
== '\0') break;
680 else if (*s
== '\n') {
684 if (lineno
== 2) break;
688 /* need to check line 1 and 2 separately since check_coding_spec
689 assumes a single line as input */
691 if (!check_coding_spec(str
, newl
[0] - str
, tok
, buf_setreadl
))
692 return error_ret(tok
);
693 if (tok
->enc
== NULL
&& newl
[1]) {
694 if (!check_coding_spec(newl
[0]+1, newl
[1] - newl
[0],
696 return error_ret(tok
);
699 if (tok
->enc
!= NULL
) {
700 assert(utf8
== NULL
);
701 utf8
= translate_into_utf8(str
, tok
->enc
);
703 return error_ret(tok
);
704 str
= PyBytes_AS_STRING(utf8
);
705 assert(tok
->decoding_buffer
== NULL
);
706 assert(tok
->decoding_buffer
== NULL
);
707 tok
->decoding_buffer
= utf8
; /* CAUTION */
/* PyTokenizer_FromString(): build a tok_state over an in-memory string;
 * decode_str handles BOM/coding-spec, then all buffer pointers are aimed
 * at the (possibly re-encoded) string. Frees the tok_state when decoding
 * fails. NOTE(review): fragmentary — NULL checks and the return statement
 * are not visible in this listing. */
713 /* Set up tokenizer for string */
716 PyTokenizer_FromString(const char *str
)
718 struct tok_state
*tok
= tok_new();
721 str
= (char *)decode_str(str
, tok
);
723 PyTokenizer_Free(tok
);
727 /* XXX: constify members. */
728 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
/* PyTokenizer_FromUTF8(): like PyTokenizer_FromString but the caller
 * guarantees STR is already UTF-8 — so the state is forced to RAW, the
 * coding spec is marked as read, and tok->encoding is a heap copy of
 * "utf-8" (6 bytes incl. NUL). Frees tok on allocation failure.
 * NOTE(review): fragmentary — returns and intermediate NULL checks are
 * not visible in this listing. */
733 PyTokenizer_FromUTF8(const char *str
)
735 struct tok_state
*tok
= tok_new();
738 tok
->decoding_state
= STATE_RAW
;
739 tok
->read_coding_spec
= 1;
742 tok
->encoding
= (char *)PyMem_MALLOC(6);
743 if (!tok
->encoding
) {
744 PyTokenizer_Free(tok
);
747 strcpy(tok
->encoding
, "utf-8");
749 /* XXX: constify members. */
750 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
/* PyTokenizer_FromFile(): build a tok_state over FILE *fp with a fresh
 * BUFSIZ buffer; ps1/ps2 become the interactive prompts. When ENC is
 * given, it is heap-copied into tok->encoding (the parse tree takes the
 * copy, per the comment) and decoding jumps straight to NORMAL.
 * NOTE(review): fragmentary — tok->fp/prompt assignments and the return
 * statement are not visible in this listing. */
755 /* Set up tokenizer for file */
758 PyTokenizer_FromFile(FILE *fp
, char* enc
, char *ps1
, char *ps2
)
760 struct tok_state
*tok
= tok_new();
763 if ((tok
->buf
= (char *)PyMem_MALLOC(BUFSIZ
)) == NULL
) {
764 PyTokenizer_Free(tok
);
767 tok
->cur
= tok
->inp
= tok
->buf
;
768 tok
->end
= tok
->buf
+ BUFSIZ
;
771 tok
->nextprompt
= ps2
;
773 /* Must copy encoding declaration since it
774 gets copied into the parse tree. */
775 tok
->encoding
= PyMem_MALLOC(strlen(enc
)+1);
776 if (!tok
->encoding
) {
777 PyTokenizer_Free(tok
);
780 strcpy(tok
->encoding
, enc
);
781 tok
->decoding_state
= STATE_NORMAL
;
/* PyTokenizer_Free(): release everything a tok_state owns — the encoding
 * string, the readline/decoding-buffer references, and (file mode only)
 * the line buffer; string mode does not own tok->buf.
 * NOTE(review): fragmentary — the final PyMem_FREE(tok) is not visible
 * in this listing. */
787 /* Free a tok_state structure */
790 PyTokenizer_Free(struct tok_state
*tok
)
792 if (tok
->encoding
!= NULL
)
793 PyMem_FREE(tok
->encoding
);
795 Py_XDECREF(tok
->decoding_readline
);
796 Py_XDECREF(tok
->decoding_buffer
);
/* buf is owned only when reading from a file (see error_ret). */
798 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
799 PyMem_FREE(tok
->buf
);
/* tok_nextc(): return the next input character. Fast path serves bytes
 * already in [cur, inp). Otherwise refill: string mode advances to the
 * next '\n'/'\0'; interactive mode pulls a line via PyOS_Readline
 * (re-encoding it to UTF-8 when an encoding is set) and splices it onto
 * the existing buffer when a token is in progress; file mode reads via
 * decoding_fgets, growing the buffer by BUFSIZ until the line ends in
 * '\n' or EOF, then strips "\r\n" to "\n". Error codes land in tok->done.
 * NOTE(review): fragmentary listing — the outer for(;;) loop, many EOF/
 * error branches, and several assignments between the visible lines are
 * missing; do not infer completeness of any one path. */
803 /* Get next char, updating state; error code goes into tok->done */
806 tok_nextc(register struct tok_state
*tok
)
809 if (tok
->cur
!= tok
->inp
) {
810 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
812 if (tok
->done
!= E_OK
)
/* ---- string mode: no file pointer ---- */
814 if (tok
->fp
== NULL
) {
815 char *end
= strchr(tok
->inp
, '\n');
819 end
= strchr(tok
->inp
, '\0');
820 if (end
== tok
->inp
) {
825 if (tok
->start
== NULL
)
827 tok
->line_start
= tok
->cur
;
830 return Py_CHARMASK(*tok
->cur
++);
/* ---- interactive mode: prompt + PyOS_Readline ---- */
832 if (tok
->prompt
!= NULL
) {
833 char *newtok
= PyOS_Readline(stdin
, stdout
, tok
->prompt
);
835 if (tok
->encoding
&& newtok
&& *newtok
) {
836 /* Recode to UTF-8 */
839 PyObject
*u
= translate_into_utf8(newtok
, tok
->encoding
);
842 tok
->done
= E_DECODE
;
845 buflen
= PyBytes_GET_SIZE(u
);
846 buf
= PyBytes_AS_STRING(u
);
849 tok
->done
= E_DECODE
;
852 newtok
= PyMem_MALLOC(buflen
+1);
/* After the first line, switch to the continuation prompt. */
857 if (tok
->nextprompt
!= NULL
)
858 tok
->prompt
= tok
->nextprompt
;
861 else if (*newtok
== '\0') {
/* A token spans lines: append the new line to the old buffer. */
865 else if (tok
->start
!= NULL
) {
866 size_t start
= tok
->start
- tok
->buf
;
867 size_t oldlen
= tok
->cur
- tok
->buf
;
868 size_t newlen
= oldlen
+ strlen(newtok
);
869 char *buf
= tok
->buf
;
870 buf
= (char *)PyMem_REALLOC(buf
, newlen
+1);
873 PyMem_FREE(tok
->buf
);
880 tok
->cur
= tok
->buf
+ oldlen
;
881 tok
->line_start
= tok
->cur
;
882 strcpy(tok
->buf
+ oldlen
, newtok
);
884 tok
->inp
= tok
->buf
+ newlen
;
885 tok
->end
= tok
->inp
+ 1;
886 tok
->start
= tok
->buf
+ start
;
890 if (tok
->buf
!= NULL
)
891 PyMem_FREE(tok
->buf
);
893 tok
->line_start
= tok
->buf
;
895 tok
->line_start
= tok
->buf
;
896 tok
->inp
= strchr(tok
->buf
, '\0');
897 tok
->end
= tok
->inp
+ 1;
/* ---- file mode: decoding_fgets into a growable buffer ---- */
904 if (tok
->start
== NULL
) {
905 if (tok
->buf
== NULL
) {
907 PyMem_MALLOC(BUFSIZ
);
908 if (tok
->buf
== NULL
) {
912 tok
->end
= tok
->buf
+ BUFSIZ
;
914 if (decoding_fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
921 tok
->inp
= strchr(tok
->buf
, '\0');
922 done
= tok
->inp
[-1] == '\n';
926 cur
= tok
->cur
- tok
->buf
;
927 if (decoding_feof(tok
)) {
935 /* Read until '\n' or EOF */
937 Py_ssize_t curstart
= tok
->start
== NULL
? -1 :
938 tok
->start
- tok
->buf
;
939 Py_ssize_t curvalid
= tok
->inp
- tok
->buf
;
940 Py_ssize_t newsize
= curvalid
+ BUFSIZ
;
941 char *newbuf
= tok
->buf
;
942 newbuf
= (char *)PyMem_REALLOC(newbuf
,
944 if (newbuf
== NULL
) {
950 tok
->inp
= tok
->buf
+ curvalid
;
951 tok
->end
= tok
->buf
+ newsize
;
952 tok
->start
= curstart
< 0 ? NULL
:
954 if (decoding_fgets(tok
->inp
,
955 (int)(tok
->end
- tok
->inp
),
957 /* Break out early on decoding
958 errors, as tok->buf will be NULL
960 if (tok
->decoding_erred
)
962 /* Last line does not end in \n,
964 strcpy(tok
->inp
, "\n");
966 tok
->inp
= strchr(tok
->inp
, '\0');
967 done
= tok
->inp
[-1] == '\n';
969 if (tok
->buf
!= NULL
) {
970 tok
->cur
= tok
->buf
+ cur
;
971 tok
->line_start
= tok
->cur
;
972 /* replace "\r\n" with "\n" */
973 /* For Mac leave the \r, giving a syntax error */
975 if (pt
>= tok
->buf
&& *pt
== '\r') {
982 if (tok
->done
!= E_OK
) {
983 if (tok
->prompt
!= NULL
)
984 PySys_WriteStderr("\n");
/* tok_backup(): step tok->cur back one position; fatal if that would
 * move before the buffer start. NOTE(review): fragmentary — the EOF
 * guard and the write-back of c are not visible in this listing. */
993 /* Back-up one character */
996 tok_backup(register struct tok_state
*tok
, register int c
)
999 if (--tok
->cur
< tok
->buf
)
1000 Py_FatalError("tok_backup: beginning of buffer");
/* PyToken_OneChar(): map a single punctuation character to its token
 * constant. NOTE(review): fragmentary — the switch header and the default
 * (OP/ERRORTOKEN) case are not visible in this listing. */
1007 /* Return the token corresponding to a single character */
1010 PyToken_OneChar(int c
)
1013 case '(': return LPAR
;
1014 case ')': return RPAR
;
1015 case '[': return LSQB
;
1016 case ']': return RSQB
;
1017 case ':': return COLON
;
1018 case ',': return COMMA
;
1019 case ';': return SEMI
;
1020 case '+': return PLUS
;
1021 case '-': return MINUS
;
1022 case '*': return STAR
;
1023 case '/': return SLASH
;
1024 case '|': return VBAR
;
1025 case '&': return AMPER
;
1026 case '<': return LESS
;
1027 case '>': return GREATER
;
1028 case '=': return EQUAL
;
1029 case '.': return DOT
;
1030 case '%': return PERCENT
;
1031 case '{': return LBRACE
;
1032 case '}': return RBRACE
;
1033 case '^': return CIRCUMFLEX
;
1034 case '~': return TILDE
;
1035 case '@': return AT
;
/* PyToken_TwoChars(): map a two-character operator (c1 then c2) to its
 * token constant. NOTE(review): fragmentary — the outer switch on c1 and
 * the fallback return are not visible; each visible case is the inner
 * switch on c2 for a different c1. */
1042 PyToken_TwoChars(int c1
, int c2
)
1047 case '=': return EQEQUAL
;
1052 case '=': return NOTEQUAL
;
1057 case '>': return NOTEQUAL
;
1058 case '=': return LESSEQUAL
;
1059 case '<': return LEFTSHIFT
;
1064 case '=': return GREATEREQUAL
;
1065 case '>': return RIGHTSHIFT
;
1070 case '=': return PLUSEQUAL
;
1075 case '=': return MINEQUAL
;
1076 case '>': return RARROW
;
1081 case '*': return DOUBLESTAR
;
1082 case '=': return STAREQUAL
;
1087 case '/': return DOUBLESLASH
;
1088 case '=': return SLASHEQUAL
;
1093 case '=': return VBAREQUAL
;
1098 case '=': return PERCENTEQUAL
;
1103 case '=': return AMPEREQUAL
;
1108 case '=': return CIRCUMFLEXEQUAL
;
/* PyToken_ThreeChars(): map a three-character operator to its token
 * constant (<<=, >>=, **=, //=). NOTE(review): fragmentary — the nested
 * switch structure and the fallback return are not visible. */
1116 PyToken_ThreeChars(int c1
, int c2
, int c3
)
1124 return LEFTSHIFTEQUAL
;
1134 return RIGHTSHIFTEQUAL
;
1144 return DOUBLESTAREQUAL
;
1154 return DOUBLESLASHEQUAL
;
/* indenterror(): react to inconsistent tab/space indentation — hard error
 * (E_TABSPACE, input discarded) when alterror is set, otherwise a one-time
 * warning to stderr controlled by altwarning.
 * NOTE(review): fragmentary — the return values of the two branches are
 * not visible in this listing. */
1174 indenterror(struct tok_state
*tok
)
1176 if (tok
->alterror
) {
1177 tok
->done
= E_TABSPACE
;
1178 tok
->cur
= tok
->inp
;
1181 if (tok
->altwarning
) {
1182 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1183 "in indentation\n", tok
->filename
);
/* Warn only once per tokenizer. */
1184 tok
->altwarning
= 0;
/* verify_identifier(): validate the token text [tok->start, tok->cur) as a
 * PEP 3131 identifier — decode as UTF-8 and ask PyUnicode_IsIdentifier;
 * sets tok->done to E_IDENTIFIER (bad identifier / bad UTF-8) or E_ERROR
 * (other failure). The #define above is the trivial always-true stub used
 * when this build does not perform the check.
 * NOTE(review): fragmentary — the #ifdef/#else framing, DECREF of s, and
 * return statements are not visible in this listing. */
1190 #define verify_identifier(tok) 1
1192 /* Verify that the identifier follows PEP 3131. */
1194 verify_identifier(struct tok_state
*tok
)
1198 s
= PyUnicode_DecodeUTF8(tok
->start
, tok
->cur
- tok
->start
, NULL
);
1200 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
)) {
1202 tok
->done
= E_IDENTIFIER
;
1204 tok
->done
= E_ERROR
;
1208 result
= PyUnicode_IsIdentifier(s
);
1211 tok
->done
= E_IDENTIFIER
;
/* tok_get(): the core tokenizer loop. Computes indentation (col counts
 * tabs via tabsize, altcol via alttabsize to detect tab/space mixing),
 * emits INDENT/DEDENT via the indstack/pendin machinery, then scans the
 * next token: identifiers (with b/r prefixes and PEP 3131 verification),
 * NEWLINE, numbers (hex/octal/binary/decimal/float/imaginary), strings
 * with 1- or 3-quote delimiters, line continuations, and one/two/three-
 * character operators. *p_start/*p_end delimit the token text in tok's
 * buffer. NOTE(review): this listing is heavily fragmentary — labels
 * (again/nextline/fraction/exponent etc.), many tok_nextc calls, the
 * paren-level bookkeeping and numerous returns are missing between the
 * visible lines; do not treat any branch shown as complete. */
1216 /* Get next token, after space stripping etc. */
1219 tok_get(register struct tok_state
*tok
, char **p_start
, char **p_end
)
1222 int blankline
, nonascii
;
1224 *p_start
= *p_end
= NULL
;
1229 /* Get indentation level */
1231 register int col
= 0;
1232 register int altcol
= 0;
/* Tabs round col up to the next tab stop; altcol uses the
 * alternate tab size to catch inconsistent tab/space usage. */
1238 else if (c
== '\t') {
1239 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
1240 altcol
= (altcol
/tok
->alttabsize
+ 1)
1243 else if (c
== '\014') /* Control-L (formfeed) */
1244 col
= altcol
= 0; /* For Emacs users */
1249 if (c
== '#' || c
== '\n') {
1250 /* Lines with only whitespace and/or comments
1251 shouldn't affect the indentation and are
1252 not passed to the parser as NEWLINE tokens,
1253 except *totally* empty lines in interactive
1254 mode, which signal the end of a command group. */
1255 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
1256 blankline
= 0; /* Let it through */
1258 blankline
= 1; /* Ignore completely */
1259 /* We can't jump back right here since we still
1260 may need to skip to the end of a comment */
/* Indentation changes are suppressed inside brackets (level>0). */
1262 if (!blankline
&& tok
->level
== 0) {
1263 if (col
== tok
->indstack
[tok
->indent
]) {
1265 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1266 if (indenterror(tok
))
1270 else if (col
> tok
->indstack
[tok
->indent
]) {
1271 /* Indent -- always one */
1272 if (tok
->indent
+1 >= MAXINDENT
) {
1273 tok
->done
= E_TOODEEP
;
1274 tok
->cur
= tok
->inp
;
1277 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
1278 if (indenterror(tok
))
1282 tok
->indstack
[++tok
->indent
] = col
;
1283 tok
->altindstack
[tok
->indent
] = altcol
;
1285 else /* col < tok->indstack[tok->indent] */ {
1286 /* Dedent -- any number, must be consistent */
1287 while (tok
->indent
> 0 &&
1288 col
< tok
->indstack
[tok
->indent
]) {
1292 if (col
!= tok
->indstack
[tok
->indent
]) {
1293 tok
->done
= E_DEDENT
;
1294 tok
->cur
= tok
->inp
;
1297 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1298 if (indenterror(tok
))
1305 tok
->start
= tok
->cur
;
1307 /* Return pending indents/dedents */
1308 if (tok
->pendin
!= 0) {
1309 if (tok
->pendin
< 0) {
/* Skip spaces/tabs/formfeeds between tokens. */
1324 } while (c
== ' ' || c
== '\t' || c
== '\014');
1326 /* Set start of current token */
1327 tok
->start
= tok
->cur
- 1;
/* Comment: consume to end of line. */
1331 while (c
!= EOF
&& c
!= '\n')
1334 /* Check for EOF and errors now */
1336 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
1339 /* Identifier (most frequent token!) */
1341 if (is_potential_identifier_start(c
)) {
1342 /* Process b"", r"" and br"" */
1343 if (c
== 'b' || c
== 'B') {
1345 if (c
== '"' || c
== '\'')
1348 if (c
== 'r' || c
== 'R') {
1350 if (c
== '"' || c
== '\'')
1353 while (is_potential_identifier_char(c
)) {
1360 !verify_identifier(tok
)) {
1361 tok
->done
= E_IDENTIFIER
;
1364 *p_start
= tok
->start
;
/* Newline: swallowed when blank or inside brackets. */
1372 if (blankline
|| tok
->level
> 0)
1374 *p_start
= tok
->start
;
1375 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
1380 /* Period or number starting with period? */
1385 } else if (c
== '.') {
1388 *p_start
= tok
->start
;
1394 tok_backup(tok
, '.');
1398 *p_start
= tok
->start
;
1406 /* Hex, octal or binary -- maybe. */
1410 #ifndef WITHOUT_COMPLEX
1411 if (c
== 'j' || c
== 'J')
1414 if (c
== 'x' || c
== 'X') {
1419 tok
->done
= E_TOKEN
;
1425 } while (isxdigit(c
));
1427 else if (c
== 'o' || c
== 'O') {
1430 if (c
< '0' || c
>= '8') {
1431 tok
->done
= E_TOKEN
;
1437 } while ('0' <= c
&& c
< '8');
1439 else if (c
== 'b' || c
== 'B') {
1442 if (c
!= '0' && c
!= '1') {
1443 tok
->done
= E_TOKEN
;
1449 } while (c
== '0' || c
== '1');
1453 /* maybe old-style octal; c is first char of it */
1454 /* in any case, allow '0' as a literal */
1457 while (isdigit(c
)) {
1463 else if (c
== 'e' || c
== 'E')
1465 #ifndef WITHOUT_COMPLEX
1466 else if (c
== 'j' || c
== 'J')
1470 tok
->done
= E_TOKEN
;
1480 } while (isdigit(c
));
1482 /* Accept floating point numbers. */
1488 } while (isdigit(c
));
1490 if (c
== 'e' || c
== 'E') {
1494 if (c
== '+' || c
== '-')
1497 tok
->done
= E_TOKEN
;
1503 } while (isdigit(c
));
1505 #ifndef WITHOUT_COMPLEX
1506 if (c
== 'j' || c
== 'J')
1507 /* Imaginary part */
1514 *p_start
= tok
->start
;
/* String literal: detect ' or ", then 1- vs 3-quote form. */
1521 if (c
== '\'' || c
== '"') {
1523 int quote_size
= 1; /* 1 or 3 */
1524 int end_quote_size
= 0;
1526 /* Find the quote size and start of string */
1533 end_quote_size
= 1; /* empty string found */
1538 /* Get rest of string */
1539 while (end_quote_size
!= quote_size
) {
1542 if (quote_size
== 3)
1546 tok
->cur
= tok
->inp
;
1549 if (quote_size
== 1 && c
== '\n') {
1551 tok
->cur
= tok
->inp
;
1555 end_quote_size
+= 1;
1559 c
= tok_nextc(tok
); /* skip escaped char */
1563 *p_start
= tok
->start
;
1568 /* Line continuation */
1572 tok
->done
= E_LINECONT
;
1573 tok
->cur
= tok
->inp
;
1577 goto again
; /* Read next line */
1580 /* Check for two-character token */
1582 int c2
= tok_nextc(tok
);
1583 int token
= PyToken_TwoChars(c
, c2
);
1585 int c3
= tok_nextc(tok
);
1586 int token3
= PyToken_ThreeChars(c
, c2
, c3
);
1590 tok_backup(tok
, c3
);
1592 *p_start
= tok
->start
;
1596 tok_backup(tok
, c2
);
1599 /* Keep track of parentheses nesting level */
1613 /* Punctuation character */
1614 *p_start
= tok
->start
;
1616 return PyToken_OneChar(c
);
/* PyTokenizer_Get(): public wrapper over tok_get() that converts any
 * pending decoding error into ERRORTOKEN/E_DECODE.
 * NOTE(review): fragmentary — the return of result is not visible. */
1620 PyTokenizer_Get(struct tok_state
*tok
, char **p_start
, char **p_end
)
1622 int result
= tok_get(tok
, p_start
, p_end
);
1623 if (tok
->decoding_erred
) {
1624 result
= ERRORTOKEN
;
1625 tok
->done
= E_DECODE
;
/* PyTokenizer_FindEncoding(): tokenize the first two lines of the file on
 * FD to trigger coding-spec/BOM detection, then return a PyMem_MALLOC'ed
 * copy of tok->encoding (caller frees), or NULL when none was declared.
 * NOTE(review): fragmentary — fdopen/FromFile NULL checks, the fclose,
 * and the return statement are not visible in this listing. */
1630 /* Get -*- encoding -*- from a Python file.
1632 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1633 the first or second line of the file (in which case the encoding
1634 should be assumed to be PyUnicode_GetDefaultEncoding()).
1636 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1640 PyTokenizer_FindEncoding(int fd
)
1642 struct tok_state
*tok
;
1644 char *p_start
=NULL
, *p_end
=NULL
, *encoding
= NULL
;
1650 fp
= fdopen(fd
, "r");
1654 tok
= PyTokenizer_FromFile(fp
, NULL
, NULL
, NULL
);
/* Encoding declarations can only appear on lines 1-2. */
1659 while (tok
->lineno
< 2 && tok
->done
== E_OK
) {
1660 PyTokenizer_Get(tok
, &p_start
, &p_end
);
1663 if (tok
->encoding
) {
1664 encoding
= (char *)PyMem_MALLOC(strlen(tok
->encoding
) + 1);
1666 strcpy(encoding
, tok
->encoding
);
1668 PyTokenizer_Free(tok
);
/* tok_dump(): debug helper — print the token's name and, for NAME/NUMBER/
 * STRING/OP, the token text between START and END.
 * NOTE(review): fragmentary — presumably compiled only under a debug
 * #ifdef; the surrounding guard is not visible in this listing. */
1675 tok_dump(int type
, char *start
, char *end
)
1677 printf("%s", _PyParser_TokenNames
[type
]);
1678 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
1679 printf("(%.*s)", (int)(end
- start
), start
);