Merged revisions 73623-73624 via svnmerge from
[python/dscho.git] / Parser / tokenizer.c
blobcc142a7127278d0b57fbed164bbaa74f8b359f5d
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
/* Nonzero if C can start an identifier: ASCII letter, '_', or any byte
   >= 128 (non-ASCII bytes are validated later by verify_identifier()).
   Every use of the parameter is parenthesized so compound argument
   expressions bind correctly.  NOTE: the argument is evaluated more
   than once -- do not pass expressions with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
           || ((c) >= 'A' && (c) <= 'Z')\
           || (c) == '_'\
           || ((c) >= 128))
/* Nonzero if C can continue an identifier: ASCII letter, digit, '_',
   or any byte >= 128.  Parameter fully parenthesized (see
   is_potential_identifier_start); evaluated more than once. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
           || ((c) >= 'A' && (c) <= 'Z')\
           || ((c) >= '0' && (c) <= '9')\
           || (c) == '_'\
           || ((c) >= 128))
34 extern char *PyOS_Readline(FILE *, FILE *, char *);
35 /* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
39 /* Don't ever change this -- it would break the portability of Python code */
40 #define TABSIZE 8
42 /* Forward */
43 static struct tok_state *tok_new(void);
44 static int tok_nextc(struct tok_state *tok);
45 static void tok_backup(struct tok_state *tok, int c);
48 /* Token names */
/* Printable token names, indexed by token number.  The order of this
   table must match the #defines in token.h exactly -- do not reorder
   or insert entries independently. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
110 /* Create and initialize a new tok_state structure */
112 static struct tok_state *
113 tok_new(void)
115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->tabsize = TABSIZE;
123 tok->indent = 0;
124 tok->indstack[0] = 0;
125 tok->atbol = 1;
126 tok->pendin = 0;
127 tok->prompt = tok->nextprompt = NULL;
128 tok->lineno = 0;
129 tok->level = 0;
130 tok->filename = NULL;
131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
141 #ifndef PGEN
142 tok->decoding_readline = NULL;
143 tok->decoding_buffer = NULL;
144 #endif
145 return tok;
#ifdef PGEN

/* PGEN build: the parser generator reads plain ASCII grammar files, so
   no encoding detection is required -- these are thin pass-throughs. */

/* Read a raw line from the tokenizer's stdio stream. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}

/* EOF test on the raw stdio stream. */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}

/* No decoding: the input string is used as-is. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    return str;
}

#else /* PGEN */
/* Put the tokenizer into its decoding-error state and return NULL so
   callers can treat the failure like EOF.  For file-based input the
   line buffer is freed here and zeroed so PyTokenizer_Free does not
   free it a second time. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = NULL;
    return NULL; /* as if it were EOF */
}
180 static char *
181 new_string(const char *s, Py_ssize_t len)
183 char* result = (char *)PyMem_MALLOC(len + 1);
184 if (result != NULL) {
185 memcpy(result, s, len);
186 result[len] = '\0';
188 return result;
/* Normalize an encoding name for the two encodings the tokenizer
   treats specially (for utf-8 and latin-1).  Lower-cases up to the
   first 12 characters of S, mapping '_' to '-', and returns the static
   string "utf-8" or "iso-8859-1" for any of their aliases; any other
   name is returned as S itself, unchanged. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Read the byte as unsigned: passing a negative value (a byte
           >= 0x80 where char is signed) to tolower() is undefined
           behavior. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
214 /* Return the coding spec in S, or NULL if none is found. */
216 static char *
217 get_coding_spec(const char *s, Py_ssize_t size)
219 Py_ssize_t i;
220 /* Coding spec must be in a comment, and that comment must be
221 * the only statement on the source code line. */
222 for (i = 0; i < size - 6; i++) {
223 if (s[i] == '#')
224 break;
225 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
226 return NULL;
228 for (; i < size - 6; i++) { /* XXX inefficient search */
229 const char* t = s + i;
230 if (strncmp(t, "coding", 6) == 0) {
231 const char* begin = NULL;
232 t += 6;
233 if (t[0] != ':' && t[0] != '=')
234 continue;
235 do {
236 t++;
237 } while (t[0] == '\x20' || t[0] == '\t');
239 begin = t;
240 while (isalnum(Py_CHARMASK(t[0])) ||
241 t[0] == '-' || t[0] == '_' || t[0] == '.')
242 t++;
244 if (begin < t) {
245 char* r = new_string(begin, t - begin);
246 char* q = get_normal_name(r);
247 if (r != q) {
248 PyMem_FREE(r);
249 r = new_string(q, strlen(q));
251 return r;
255 return NULL;
/* Check whether the line contains a coding spec.  If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   utf-8 needs no re-decoding, so only the name is recorded for it.
   If an encoding was already picked up (from a BOM), the spec must
   agree with it.  Return 1 on success, 0 on failure (SyntaxError
   set). */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line)
        /* It's a continuation line, so it can't be a coding spec. */
        return 1;
    cs = get_coding_spec(line, size);
    if (cs != NULL) {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == STATE_RAW);
            if (strcmp(cs, "utf-8") == 0) {
                /* tok->encoding takes ownership of cs. */
                tok->encoding = cs;
            } else {
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;
                    tok->decoding_state = STATE_NORMAL;
                }
                else
                    /* set_readline failed; cs is not kept. */
                    PyMem_FREE(cs);
            }
        } else {                /* then, compare cs with BOM */
            r = (strcmp(tok->encoding, cs) == 0);
            PyMem_FREE(cs);
        }
    }
    if (!r) {
        cs = tok->encoding;
        if (!cs)
            cs = "with BOM";
        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
    }
    return r;
}
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Only the UTF-8 BOM (EF BB BF) is currently consumed; any other
   leading bytes are pushed back unread via unget_char (most recent
   byte first).  get_char/unget_char abstract the input source (stdio
   or in-memory string).  Return 1 on success, 0 on failure. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch = get_char(tok);
    tok->decoding_state = STATE_RAW;
    if (ch == EOF) {
        return 1;
    } else if (ch == 0xEF) {
        ch = get_char(tok);
        if (ch != 0xBB) {
            unget_char(ch, tok);
            unget_char(0xEF, tok);
            /* any token beginning with '\xEF' is a bad token */
            return 1;
        }
        ch = get_char(tok);
        if (ch != 0xBF) {
            unget_char(ch, tok);
            unget_char(0xBB, tok);
            unget_char(0xEF, tok);
            /* any token beginning with '\xEF' is a bad token */
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch == 0xFE) {
        ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
        if (!set_readline(tok, "utf-16-be")) return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch == 0xFF) {
        ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
        if (!set_readline(tok, "utf-16-le")) return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        unget_char(ch, tok);
        return 1;
    }
    /* UTF-8 BOM consumed: record the encoding name, replacing any
       previously recorded one. */
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
    PyObject* bufobj;
    const char *buf;
    Py_ssize_t buflen;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (tok->decoding_buffer) {
        /* Case 2/3 above: consume the cached object (new reference
           taken so the XDECREF below is balanced in every path). */
        bufobj = tok->decoding_buffer;
        Py_INCREF(bufobj);
    }
    else
    {
        bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
        if (bufobj == NULL)
            goto error;
    }
    if (PyUnicode_CheckExact(bufobj))
    {
        buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
        if (buf == NULL) {
            goto error;
        }
    }
    else
    {
        /* Overflow remnant stored as a bytearray (case 3). */
        buf = PyByteArray_AsString(bufobj);
        if (buf == NULL) {
            goto error;
        }
        buflen = PyByteArray_GET_SIZE(bufobj);
    }

    Py_XDECREF(tok->decoding_buffer);
    if (buflen > size) {
        /* Too many chars, the rest goes into tok->decoding_buffer */
        tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
                                                         buflen-size);
        if (tok->decoding_buffer == NULL)
            goto error;
        buflen = size;
    }
    else
        tok->decoding_buffer = NULL;

    memcpy(s, buf, buflen);
    s[buflen] = '\0';
    if (buflen == 0) /* EOF */
        s = NULL;
    Py_DECREF(bufobj);
    return s;

  error:
    Py_XDECREF(bufobj);
    return error_ret(tok);
}
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline = NULL, *stream = NULL, *io = NULL;

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        goto cleanup;

    /* Reopen the source through io.open() with the discovered
       encoding; with no filename, reuse the existing descriptor. */
    if (tok->filename)
        stream = PyObject_CallMethod(io, "open", "ssis",
                                     tok->filename, "r", -1, enc);
    else
        stream = PyObject_CallMethod(io, "open", "isisOOO",
                fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
    if (stream == NULL)
        goto cleanup;

    Py_XDECREF(tok->decoding_readline);
    readline = PyObject_GetAttrString(stream, "readline");
    tok->decoding_readline = readline;

    /* The file has been reopened; parsing will restart from
     * the beginning of the file, we have to reset the line number.
     * But this function has been called from inside tok_nextc() which
     * will increment lineno before it returns. So we set it -1 so that
     * the next call to tok_nextc() will start with tok->lineno == 0.
     */
    tok->lineno = -1;

  cleanup:
    Py_XDECREF(stream);
    Py_XDECREF(io);
    return readline != NULL;
}
/* Fetch the next byte from TOK's stdio stream (get_char callback for
   check_bom). */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
/* Unfetch the last byte back into TOK's stdio stream (unget_char
   callback for check_bom). */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
/* Check whether the bytes at S start a valid UTF-8 sequence.  Return
   the number of bytes forming the sequence (1-4) if yes, 0 if not. */
static int valid_utf8(const unsigned char* s)
{
    unsigned char lead = *s;
    int follow;                 /* continuation bytes expected */
    int i;

    if (lead < 0x80)
        return 1;               /* single-byte (ASCII) code */
    if (lead < 0xC0)
        return 0;               /* stray continuation byte */
    if (lead < 0xE0)
        follow = 1;
    else if (lead < 0xF0)
        follow = 2;
    else if (lead < 0xF8)
        follow = 3;
    else
        return 0;               /* 0xF8..0xFF: not a legal lead byte */

    /* Every continuation byte must lie in 0x80..0xBF.  A premature
       NUL terminator fails this test, so we never read past the end
       of the string. */
    for (i = 1; i <= follow; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return follow + 1;
}
/* Read a line of input from TOK into S (at most SIZE bytes).
   Determine the encoding first if necessary: a BOM switches to the
   fp_* reader callbacks, and a PEP 263 coding comment on lines 1-2 may
   install a codec-backed readline.  Undeclared input is verified to be
   valid UTF-8.  Returns S, or NULL on EOF/decoding error. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    /* Only lines 1 and 2 may carry a PEP 263 coding declaration. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
            "Non-UTF-8 code starting with '\\x%.2x' "
            "in file %.200s on line %i, "
            "but no encoding declared; "
            "see http://python.org/dev/peps/pep-0263/ for details",
            badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}
/* EOF test for TOK.  For raw/undetermined input this is stdio feof();
   for codec-driven input we must try to read the next line, caching
   the result in tok->decoding_buffer for the following fp_readl()
   call. */
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state != STATE_NORMAL) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                /* Treat a readline failure as EOF after recording
                   the decoding error. */
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}
/* Fetch a byte from TOK, using the string buffer (get_char callback
   for check_bom when tokenizing from a string). */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
/* Unfetch a byte from TOK, using the string buffer (unget_char
   callback for check_bom).  Only ever pushes back the byte just read,
   so it merely rewinds the cursor. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding; decode_str()
   performs the actual translation afterwards.  Always succeeds. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
624 /* Return a UTF-8 encoding Python string object from the
625 C byte string STR, which is encoded with ENC. */
627 static PyObject *
628 translate_into_utf8(const char* str, const char* enc) {
629 PyObject *utf8;
630 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
631 if (buf == NULL)
632 return NULL;
633 utf8 = PyUnicode_AsUTF8String(buf);
634 Py_DECREF(buf);
635 return utf8;
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR (a BOM, then PEP 263
   coding comments on the first two lines), and record them inside TOK.
   Returns STR re-encoded to UTF-8 when a translation was needed
   (backed by tok->decoding_buffer), STR itself otherwise, or NULL
   after error_ret() on failure. */

static const char *
decode_str(const char *str, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
    if (tok->enc != NULL) {
        /* A coding spec was found: re-decode the whole buffer. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    /* The returned pointer may alias utf8's internal buffer, so the
       object is parked on tok->decoding_buffer to keep it alive. */
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
694 #endif /* PGEN */
696 /* Set up tokenizer for string */
698 struct tok_state *
699 PyTokenizer_FromString(const char *str)
701 struct tok_state *tok = tok_new();
702 if (tok == NULL)
703 return NULL;
704 str = (char *)decode_str(str, tok);
705 if (str == NULL) {
706 PyTokenizer_Free(tok);
707 return NULL;
710 /* XXX: constify members. */
711 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
712 return tok;
715 struct tok_state *
716 PyTokenizer_FromUTF8(const char *str)
718 struct tok_state *tok = tok_new();
719 if (tok == NULL)
720 return NULL;
721 tok->decoding_state = STATE_RAW;
722 tok->read_coding_spec = 1;
723 tok->enc = NULL;
724 tok->str = str;
725 tok->encoding = (char *)PyMem_MALLOC(6);
726 if (!tok->encoding) {
727 PyTokenizer_Free(tok);
728 return NULL;
730 strcpy(tok->encoding, "utf-8");
732 /* XXX: constify members. */
733 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
734 return tok;
738 /* Set up tokenizer for file */
740 struct tok_state *
741 PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
743 struct tok_state *tok = tok_new();
744 if (tok == NULL)
745 return NULL;
746 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
747 PyTokenizer_Free(tok);
748 return NULL;
750 tok->cur = tok->inp = tok->buf;
751 tok->end = tok->buf + BUFSIZ;
752 tok->fp = fp;
753 tok->prompt = ps1;
754 tok->nextprompt = ps2;
755 if (enc != NULL) {
756 /* Must copy encoding declaration since it
757 gets copied into the parse tree. */
758 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
759 if (!tok->encoding) {
760 PyTokenizer_Free(tok);
761 return NULL;
763 strcpy(tok->encoding, enc);
764 tok->decoding_state = STATE_NORMAL;
766 return tok;
/* Free a tok_state structure and everything it owns: the encoding
   name, the decoding helpers, and -- for file-based input only -- the
   line buffer (for string input tok->buf aliases the caller's
   string; see error_ret). */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    PyMem_FREE(tok);
}
/* Get next char, updating state; error code goes into tok->done.
   Three input modes are handled: in-memory string (tok->fp == NULL),
   interactive (tok->prompt != NULL, reads via PyOS_Readline), and
   plain file input (reads via decoding_fgets, growing the buffer
   until a full line or EOF is seen). */

static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: advance inp to the end of the next line. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive input. */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
#ifndef PGEN
            if (tok->encoding && newtok && *newtok) {
                /* Recode to UTF-8 */
                Py_ssize_t buflen;
                const char* buf;
                PyObject *u = translate_into_utf8(newtok, tok->encoding);
                PyMem_FREE(newtok);
                if (!u) {
                    tok->done = E_DECODE;
                    return EOF;
                }
                buflen = PyBytes_GET_SIZE(u);
                buf = PyBytes_AS_STRING(u);
                if (!buf) {
                    Py_DECREF(u);
                    tok->done = E_DECODE;
                    return EOF;
                }
                /* NOTE(review): PyMem_MALLOC result is used unchecked
                   here; on allocation failure the strcpy below would
                   crash -- a NULL check (done = E_NOMEM) looks
                   warranted. */
                newtok = PyMem_MALLOC(buflen+1);
                strcpy(newtok, buf);
                Py_DECREF(u);
            }
#endif
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
            else if (tok->start != NULL) {
                /* A token is in progress: append the new line to the
                   existing buffer, preserving start/cur offsets across
                   the realloc. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the new line replaces the
                   buffer outright. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* File input. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                         tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                           (int)(tok->end - tok->inp),
                           tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                     */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}
976 /* Back-up one character */
978 static void
979 tok_backup(register struct tok_state *tok, register int c)
981 if (c != EOF) {
982 if (--tok->cur < tok->buf)
983 Py_FatalError("tok_backup: begin of buffer");
984 if (*tok->cur != c)
985 *tok->cur = c;
/* Return the token corresponding to a single character, or OP for any
   character with no dedicated token.  The cases must agree with the
   token #defines in token.h. */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(':           return LPAR;
    case ')':           return RPAR;
    case '[':           return LSQB;
    case ']':           return RSQB;
    case ':':           return COLON;
    case ',':           return COMMA;
    case ';':           return SEMI;
    case '+':           return PLUS;
    case '-':           return MINUS;
    case '*':           return STAR;
    case '/':           return SLASH;
    case '|':           return VBAR;
    case '&':           return AMPER;
    case '<':           return LESS;
    case '>':           return GREATER;
    case '=':           return EQUAL;
    case '.':           return DOT;
    case '%':           return PERCENT;
    case '{':           return LBRACE;
    case '}':           return RBRACE;
    case '^':           return CIRCUMFLEX;
    case '~':           return TILDE;
    case '@':           return AT;
    default:            return OP;
    }
}
/* Return the token for the two-character operator C1 C2, or OP when
   the pair forms no token (the caller then falls back to one-char
   handling).  Must agree with the token #defines in token.h. */
int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=':               return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=':               return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>':               return NOTEQUAL;
        case '=':               return LESSEQUAL;
        case '<':               return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=':               return GREATEREQUAL;
        case '>':               return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=':               return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=':               return MINEQUAL;
        case '>':               return RARROW;
        }
        break;
    case '*':
        switch (c2) {
        case '*':               return DOUBLESTAR;
        case '=':               return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '/':               return DOUBLESLASH;
        case '=':               return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=':               return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=':               return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=':               return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=':               return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}
/* Return the token for the three-character operator C1 C2 C3, or OP
   when the triple forms no token.  Must agree with the token #defines
   in token.h. */
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=':
                return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2) {
        case '>':
            switch (c3) {
            case '=':
                return RIGHTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '*':
        switch (c2) {
        case '*':
            switch (c3) {
            case '=':
                return DOUBLESTAREQUAL;
            }
            break;
        }
        break;
    case '/':
        switch (c2) {
        case '/':
            switch (c3) {
            case '=':
                return DOUBLESLASHEQUAL;
            }
            break;
        }
        break;
    case '.':
        switch (c2) {
        case '.':
            switch (c3) {
            case '.':
                return ELLIPSIS;
            }
            break;
        }
        break;
    }
    return OP;
}
/* Report inconsistent use of tabs and spaces in indentation.  When
   alterror is set this is fatal: tok->done becomes E_TABSPACE and 1 is
   returned so the caller emits ERRORTOKEN.  Otherwise at most one
   warning is written to stderr (altwarning gates it) and 0 is
   returned. */
static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;
    }
    return 0;
}
#ifdef PGEN
/* The parser generator only sees ASCII grammar input, so every
   candidate identifier is accepted. */
#define verify_identifier(s,e) 1
#else
/* Verify that the identifier follows PEP 3131.  START..END is the raw
   UTF-8 source text of the candidate; returns nonzero when it decodes
   cleanly and satisfies str.isidentifier(). */
static int
verify_identifier(char *start, char *end)
{
    PyObject *s;
    int result;
    s = PyUnicode_DecodeUTF8(start, end-start, NULL);
    if (s == NULL) {
        /* Not valid UTF-8: reject, clearing the decode error. */
        PyErr_Clear();
        return 0;
    }
    result = PyUnicode_IsIdentifier(s);
    Py_DECREF(s);
    return result;
}
#endif
1192 /* Get next token, after space stripping etc. */
1194 static int
1195 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1197 register int c;
1198 int blankline, nonascii;
1200 *p_start = *p_end = NULL;
1201 nextline:
1202 tok->start = NULL;
1203 blankline = 0;
1205 /* Get indentation level */
1206 if (tok->atbol) {
1207 register int col = 0;
1208 register int altcol = 0;
1209 tok->atbol = 0;
1210 for (;;) {
1211 c = tok_nextc(tok);
1212 if (c == ' ')
1213 col++, altcol++;
1214 else if (c == '\t') {
1215 col = (col/tok->tabsize + 1) * tok->tabsize;
1216 altcol = (altcol/tok->alttabsize + 1)
1217 * tok->alttabsize;
1219 else if (c == '\014') /* Control-L (formfeed) */
1220 col = altcol = 0; /* For Emacs users */
1221 else
1222 break;
1224 tok_backup(tok, c);
1225 if (c == '#' || c == '\n') {
1226 /* Lines with only whitespace and/or comments
1227 shouldn't affect the indentation and are
1228 not passed to the parser as NEWLINE tokens,
1229 except *totally* empty lines in interactive
1230 mode, which signal the end of a command group. */
1231 if (col == 0 && c == '\n' && tok->prompt != NULL)
1232 blankline = 0; /* Let it through */
1233 else
1234 blankline = 1; /* Ignore completely */
1235 /* We can't jump back right here since we still
1236 may need to skip to the end of a comment */
1238 if (!blankline && tok->level == 0) {
1239 if (col == tok->indstack[tok->indent]) {
1240 /* No change */
1241 if (altcol != tok->altindstack[tok->indent]) {
1242 if (indenterror(tok))
1243 return ERRORTOKEN;
1246 else if (col > tok->indstack[tok->indent]) {
1247 /* Indent -- always one */
1248 if (tok->indent+1 >= MAXINDENT) {
1249 tok->done = E_TOODEEP;
1250 tok->cur = tok->inp;
1251 return ERRORTOKEN;
1253 if (altcol <= tok->altindstack[tok->indent]) {
1254 if (indenterror(tok))
1255 return ERRORTOKEN;
1257 tok->pendin++;
1258 tok->indstack[++tok->indent] = col;
1259 tok->altindstack[tok->indent] = altcol;
1261 else /* col < tok->indstack[tok->indent] */ {
1262 /* Dedent -- any number, must be consistent */
1263 while (tok->indent > 0 &&
1264 col < tok->indstack[tok->indent]) {
1265 tok->pendin--;
1266 tok->indent--;
1268 if (col != tok->indstack[tok->indent]) {
1269 tok->done = E_DEDENT;
1270 tok->cur = tok->inp;
1271 return ERRORTOKEN;
1273 if (altcol != tok->altindstack[tok->indent]) {
1274 if (indenterror(tok))
1275 return ERRORTOKEN;
1281 tok->start = tok->cur;
1283 /* Return pending indents/dedents */
1284 if (tok->pendin != 0) {
1285 if (tok->pendin < 0) {
1286 tok->pendin++;
1287 return DEDENT;
1289 else {
1290 tok->pendin--;
1291 return INDENT;
1295 again:
1296 tok->start = NULL;
1297 /* Skip spaces */
1298 do {
1299 c = tok_nextc(tok);
1300 } while (c == ' ' || c == '\t' || c == '\014');
1302 /* Set start of current token */
1303 tok->start = tok->cur - 1;
1305 /* Skip comment */
1306 if (c == '#')
1307 while (c != EOF && c != '\n')
1308 c = tok_nextc(tok);
1310 /* Check for EOF and errors now */
1311 if (c == EOF) {
1312 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1315 /* Identifier (most frequent token!) */
1316 nonascii = 0;
1317 if (is_potential_identifier_start(c)) {
1318 /* Process b"", r"" and br"" */
1319 if (c == 'b' || c == 'B') {
1320 c = tok_nextc(tok);
1321 if (c == '"' || c == '\'')
1322 goto letter_quote;
1324 if (c == 'r' || c == 'R') {
1325 c = tok_nextc(tok);
1326 if (c == '"' || c == '\'')
1327 goto letter_quote;
1329 while (is_potential_identifier_char(c)) {
1330 if (c >= 128)
1331 nonascii = 1;
1332 c = tok_nextc(tok);
1334 tok_backup(tok, c);
1335 if (nonascii &&
1336 !verify_identifier(tok->start, tok->cur)) {
1337 tok->done = E_IDENTIFIER;
1338 return ERRORTOKEN;
1340 *p_start = tok->start;
1341 *p_end = tok->cur;
1342 return NAME;
1345 /* Newline */
1346 if (c == '\n') {
1347 tok->atbol = 1;
1348 if (blankline || tok->level > 0)
1349 goto nextline;
1350 *p_start = tok->start;
1351 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1352 tok->cont_line = 0;
1353 return NEWLINE;
1356 /* Period or number starting with period? */
1357 if (c == '.') {
1358 c = tok_nextc(tok);
1359 if (isdigit(c)) {
1360 goto fraction;
1361 } else if (c == '.') {
1362 c = tok_nextc(tok);
1363 if (c == '.') {
1364 *p_start = tok->start;
1365 *p_end = tok->cur;
1366 return ELLIPSIS;
1367 } else {
1368 tok_backup(tok, c);
1370 tok_backup(tok, '.');
1371 } else {
1372 tok_backup(tok, c);
1374 *p_start = tok->start;
1375 *p_end = tok->cur;
1376 return DOT;
1379 /* Number */
1380 if (isdigit(c)) {
1381 if (c == '0') {
1382 /* Hex, octal or binary -- maybe. */
1383 c = tok_nextc(tok);
1384 if (c == '.')
1385 goto fraction;
1386 #ifndef WITHOUT_COMPLEX
1387 if (c == 'j' || c == 'J')
1388 goto imaginary;
1389 #endif
1390 if (c == 'x' || c == 'X') {
1392 /* Hex */
1393 c = tok_nextc(tok);
1394 if (!isxdigit(c)) {
1395 tok->done = E_TOKEN;
1396 tok_backup(tok, c);
1397 return ERRORTOKEN;
1399 do {
1400 c = tok_nextc(tok);
1401 } while (isxdigit(c));
1403 else if (c == 'o' || c == 'O') {
1404 /* Octal */
1405 c = tok_nextc(tok);
1406 if (c < '0' || c >= '8') {
1407 tok->done = E_TOKEN;
1408 tok_backup(tok, c);
1409 return ERRORTOKEN;
1411 do {
1412 c = tok_nextc(tok);
1413 } while ('0' <= c && c < '8');
1415 else if (c == 'b' || c == 'B') {
1416 /* Binary */
1417 c = tok_nextc(tok);
1418 if (c != '0' && c != '1') {
1419 tok->done = E_TOKEN;
1420 tok_backup(tok, c);
1421 return ERRORTOKEN;
1423 do {
1424 c = tok_nextc(tok);
1425 } while (c == '0' || c == '1');
1427 else {
1428 int nonzero = 0;
1429 /* maybe old-style octal; c is first char of it */
1430 /* in any case, allow '0' as a literal */
1431 while (c == '0')
1432 c = tok_nextc(tok);
1433 while (isdigit(c)) {
1434 nonzero = 1;
1435 c = tok_nextc(tok);
1437 if (c == '.')
1438 goto fraction;
1439 else if (c == 'e' || c == 'E')
1440 goto exponent;
1441 #ifndef WITHOUT_COMPLEX
1442 else if (c == 'j' || c == 'J')
1443 goto imaginary;
1444 #endif
1445 else if (nonzero) {
1446 tok->done = E_TOKEN;
1447 tok_backup(tok, c);
1448 return ERRORTOKEN;
1452 else {
1453 /* Decimal */
1454 do {
1455 c = tok_nextc(tok);
1456 } while (isdigit(c));
1458 /* Accept floating point numbers. */
1459 if (c == '.') {
1460 fraction:
1461 /* Fraction */
1462 do {
1463 c = tok_nextc(tok);
1464 } while (isdigit(c));
1466 if (c == 'e' || c == 'E') {
1467 exponent:
1468 /* Exponent part */
1469 c = tok_nextc(tok);
1470 if (c == '+' || c == '-')
1471 c = tok_nextc(tok);
1472 if (!isdigit(c)) {
1473 tok->done = E_TOKEN;
1474 tok_backup(tok, c);
1475 return ERRORTOKEN;
1477 do {
1478 c = tok_nextc(tok);
1479 } while (isdigit(c));
1481 #ifndef WITHOUT_COMPLEX
1482 if (c == 'j' || c == 'J')
1483 /* Imaginary part */
1484 imaginary:
1485 c = tok_nextc(tok);
1486 #endif
1489 tok_backup(tok, c);
1490 *p_start = tok->start;
1491 *p_end = tok->cur;
1492 return NUMBER;
1495 letter_quote:
1496 /* String */
1497 if (c == '\'' || c == '"') {
1498 int quote = c;
1499 int quote_size = 1; /* 1 or 3 */
1500 int end_quote_size = 0;
1502 /* Find the quote size and start of string */
1503 c = tok_nextc(tok);
1504 if (c == quote) {
1505 c = tok_nextc(tok);
1506 if (c == quote)
1507 quote_size = 3;
1508 else
1509 end_quote_size = 1; /* empty string found */
1511 if (c != quote)
1512 tok_backup(tok, c);
1514 /* Get rest of string */
1515 while (end_quote_size != quote_size) {
1516 c = tok_nextc(tok);
1517 if (c == EOF) {
1518 if (quote_size == 3)
1519 tok->done = E_EOFS;
1520 else
1521 tok->done = E_EOLS;
1522 tok->cur = tok->inp;
1523 return ERRORTOKEN;
1525 if (quote_size == 1 && c == '\n') {
1526 tok->done = E_EOLS;
1527 tok->cur = tok->inp;
1528 return ERRORTOKEN;
1530 if (c == quote)
1531 end_quote_size += 1;
1532 else {
1533 end_quote_size = 0;
1534 if (c == '\\')
1535 c = tok_nextc(tok); /* skip escaped char */
1539 *p_start = tok->start;
1540 *p_end = tok->cur;
1541 return STRING;
1544 /* Line continuation */
1545 if (c == '\\') {
1546 c = tok_nextc(tok);
1547 if (c != '\n') {
1548 tok->done = E_LINECONT;
1549 tok->cur = tok->inp;
1550 return ERRORTOKEN;
1552 tok->cont_line = 1;
1553 goto again; /* Read next line */
1556 /* Check for two-character token */
1558 int c2 = tok_nextc(tok);
1559 int token = PyToken_TwoChars(c, c2);
1560 if (token != OP) {
1561 int c3 = tok_nextc(tok);
1562 int token3 = PyToken_ThreeChars(c, c2, c3);
1563 if (token3 != OP) {
1564 token = token3;
1565 } else {
1566 tok_backup(tok, c3);
1568 *p_start = tok->start;
1569 *p_end = tok->cur;
1570 return token;
1572 tok_backup(tok, c2);
1575 /* Keep track of parentheses nesting level */
1576 switch (c) {
1577 case '(':
1578 case '[':
1579 case '{':
1580 tok->level++;
1581 break;
1582 case ')':
1583 case ']':
1584 case '}':
1585 tok->level--;
1586 break;
1589 /* Punctuation character */
1590 *p_start = tok->start;
1591 *p_end = tok->cur;
1592 return PyToken_OneChar(c);
1596 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1598 int result = tok_get(tok, p_start, p_end);
1599 if (tok->decoding_erred) {
1600 result = ERRORTOKEN;
1601 tok->done = E_DECODE;
1603 return result;
1606 /* Get -*- encoding -*- from a Python file.
1608 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1609 the first or second line of the file (in which case the encoding
1610 should be assumed to be PyUnicode_GetDefaultEncoding()).
1612 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1613 by the caller.
1615 char *
1616 PyTokenizer_FindEncoding(int fd)
1618 struct tok_state *tok;
1619 FILE *fp;
1620 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1622 fd = dup(fd);
1623 if (fd < 0) {
1624 return NULL;
1626 fp = fdopen(fd, "r");
1627 if (fp == NULL) {
1628 return NULL;
1630 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1631 if (tok == NULL) {
1632 fclose(fp);
1633 return NULL;
1635 while (tok->lineno < 2 && tok->done == E_OK) {
1636 PyTokenizer_Get(tok, &p_start, &p_end);
1638 fclose(fp);
1639 if (tok->encoding) {
1640 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1641 if (encoding)
1642 strcpy(encoding, tok->encoding);
1644 PyTokenizer_Free(tok);
1645 return encoding;
1648 #ifdef Py_DEBUG
1650 void
1651 tok_dump(int type, char *start, char *end)
1653 printf("%s", _PyParser_TokenNames[type]);
1654 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1655 printf("(%.*s)", (int)(end - start), start);
1658 #endif