/* Parser/tokenizer.c — CPython tokenizer implementation.
   (python.git blob 707e76291c47983f4359813a4a7ccf13a09f9985;
   commit message: "Test wouldn't work in debug mode.") */
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
35 /* Token names */
/* Token names, indexed by token type.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
        "ENDMARKER",
        "NAME",
        "NUMBER",
        "STRING",
        "NEWLINE",
        "INDENT",
        "DEDENT",
        "LPAR",
        "RPAR",
        "LSQB",
        "RSQB",
        "COLON",
        "COMMA",
        "SEMI",
        "PLUS",
        "MINUS",
        "STAR",
        "SLASH",
        "VBAR",
        "AMPER",
        "LESS",
        "GREATER",
        "EQUAL",
        "DOT",
        "PERCENT",
        "BACKQUOTE",
        "LBRACE",
        "RBRACE",
        "EQEQUAL",
        "NOTEQUAL",
        "LESSEQUAL",
        "GREATEREQUAL",
        "TILDE",
        "CIRCUMFLEX",
        "LEFTSHIFT",
        "RIGHTSHIFT",
        "DOUBLESTAR",
        "PLUSEQUAL",
        "MINEQUAL",
        "STAREQUAL",
        "SLASHEQUAL",
        "PERCENTEQUAL",
        "AMPEREQUAL",
        "VBAREQUAL",
        "CIRCUMFLEXEQUAL",
        "LEFTSHIFTEQUAL",
        "RIGHTSHIFTEQUAL",
        "DOUBLESTAREQUAL",
        "DOUBLESLASH",
        "DOUBLESLASHEQUAL",
        "AT",
        "OP",
        "<ERRORTOKEN>",
        "<N_TOKENS>"
};
96 /* Create and initialize a new tok_state structure */
98 static struct tok_state *
99 tok_new(void)
101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
103 if (tok == NULL)
104 return NULL;
105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
126 #ifndef PGEN
127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
129 #endif
130 return tok;
133 #ifdef PGEN
135 static char *
136 decoding_fgets(char *s, int size, struct tok_state *tok)
138 return fgets(s, size, tok->fp);
141 static int
142 decoding_feof(struct tok_state *tok)
144 return feof(tok->fp);
/* PGEN build: no decoding, return the input string unchanged. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
        return str;
}
153 #else /* PGEN */
155 static char *
156 error_ret(struct tok_state *tok) /* XXX */
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
160 PyMem_FREE(tok->buf);
161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
165 static char *
166 new_string(const char *s, Py_ssize_t len)
168 char* result = (char *)PyMem_MALLOC(len + 1);
169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
173 return result;
/* Normalize an encoding name for comparison: lower-case it and map '_'
   to '-', looking at the first 12 characters only.  Spellings of utf-8
   and latin-1 are folded to their canonical names; any other name is
   returned unchanged (S itself). */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
        char norm[13];
        int n;
        for (n = 0; n < 12; n++) {
                int ch = s[n];
                if (ch == '\0')
                        break;
                norm[n] = (ch == '_') ? '-' : tolower(ch);
        }
        norm[n] = '\0';
        if (strcmp(norm, "utf-8") == 0 ||
            strncmp(norm, "utf-8-", 6) == 0)
                return "utf-8";
        if (strcmp(norm, "latin-1") == 0 ||
            strcmp(norm, "iso-8859-1") == 0 ||
            strcmp(norm, "iso-latin-1") == 0 ||
            strncmp(norm, "latin-1-", 8) == 0 ||
            strncmp(norm, "iso-8859-1-", 11) == 0 ||
            strncmp(norm, "iso-latin-1-", 12) == 0)
                return "iso-8859-1";
        return s;
}
205 /* Return the coding spec in S, or NULL if none is found. */
207 static char *
208 get_coding_spec(const char *s, Py_ssize_t size)
210 Py_ssize_t i;
211 /* Coding spec must be in a comment, and that comment must be
212 * the only statement on the source code line. */
213 for (i = 0; i < size - 6; i++) {
214 if (s[i] == '#')
215 break;
216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217 return NULL;
219 for (; i < size - 6; i++) { /* XXX inefficient search */
220 const char* t = s + i;
221 if (strncmp(t, "coding", 6) == 0) {
222 const char* begin = NULL;
223 t += 6;
224 if (t[0] != ':' && t[0] != '=')
225 continue;
226 do {
227 t++;
228 } while (t[0] == '\x20' || t[0] == '\t');
230 begin = t;
231 while (isalnum(Py_CHARMASK(t[0])) ||
232 t[0] == '-' || t[0] == '_' || t[0] == '.')
233 t++;
235 if (begin < t) {
236 char* r = new_string(begin, t - begin);
237 char* q = get_normal_name(r);
238 if (r != q) {
239 PyMem_FREE(r);
240 r = new_string(q, strlen(q));
242 return r;
246 return NULL;
249 /* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
254 static int
255 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
256 int set_readline(struct tok_state *, const char *))
258 char * cs;
259 int r = 1;
261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
264 cs = get_coding_spec(line, size);
265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
273 #ifdef Py_USING_UNICODE
274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
279 else
280 PyMem_FREE(cs);
281 #else
282 /* Without Unicode support, we cannot
283 process the coding spec. Since there
284 won't be any Unicode literals, that
285 won't matter. */
286 PyMem_FREE(cs);
287 #endif
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
291 PyMem_FREE(cs);
294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300 return r;
303 /* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
307 static int
308 check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
313 int ch = get_char(tok);
314 tok->decoding_state = 1;
315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
318 ch = get_char(tok);
319 if (ch != 0xBB)
320 goto NON_BOM;
321 ch = get_char(tok);
322 if (ch != 0xBF)
323 goto NON_BOM;
324 #if 0
325 /* Disable support for UTF-16 BOMs until a decision
326 is made whether this needs to be supported. */
327 } else if (ch == 0xFE) {
328 ch = get_char(tok);
329 if (ch != 0xFF)
330 goto NON_BOM;
331 if (!set_readline(tok, "utf-16-be"))
332 return 0;
333 tok->decoding_state = -1;
334 } else if (ch == 0xFF) {
335 ch = get_char(tok);
336 if (ch != 0xFE)
337 goto NON_BOM;
338 if (!set_readline(tok, "utf-16-le"))
339 return 0;
340 tok->decoding_state = -1;
341 #endif
342 } else {
343 unget_char(ch, tok);
344 return 1;
346 if (tok->encoding != NULL)
347 PyMem_FREE(tok->encoding);
348 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
349 return 1;
350 NON_BOM:
351 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
352 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
353 return 1;
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has pre-fetched a line and
        stored it in tok->decoding_buffer
     3) PyStringObject *: a previous call did not have enough room in S
        for the whole decoded line; the overflow is stored here and
        tok_nextc keeps calling us (via decoding_fgets) with a bigger
        buffer until the line ends in '\n' or EOF. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
        /* In a non-Unicode build, this should never be called. */
        Py_FatalError("fp_readl should not be called in this build.");
        return NULL; /* Keep compiler happy (not reachable) */
#else
        PyObject *utf8 = NULL;
        PyObject *buf = tok->decoding_buffer;
        char *str;
        Py_ssize_t utf8len;

        /* Ask for one less byte so we can terminate it */
        assert(size > 0);
        size--;

        if (buf == NULL) {
                buf = PyObject_CallObject(tok->decoding_readline, NULL);
                if (buf == NULL)
                        return error_ret(tok);
        } else {
                tok->decoding_buffer = NULL;
                if (PyString_CheckExact(buf))
                        utf8 = buf;     /* overflow bytes: already UTF-8 */
        }
        if (utf8 == NULL) {
                utf8 = PyUnicode_AsUTF8String(buf);
                Py_DECREF(buf);
                if (utf8 == NULL)
                        return error_ret(tok);
        }
        str = PyString_AsString(utf8);
        utf8len = PyString_GET_SIZE(utf8);
        if (utf8len > size) {
                /* Doesn't fit: stash the remainder for the next call. */
                tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
                if (tok->decoding_buffer == NULL) {
                        Py_DECREF(utf8);
                        return error_ret(tok);
                }
                utf8len = size;
        }
        memcpy(s, str, utf8len);
        s[utf8len] = '\0';
        Py_DECREF(utf8);
        if (utf8len == 0)
                return NULL;    /* EOF */
        return s;
#endif
}
422 /* Set the readline function for TOK to a StreamReader's
423 readline function. The StreamReader is named ENC.
425 This function is called from check_bom and check_coding_spec.
427 ENC is usually identical to the future value of tok->encoding,
428 except for the (currently unsupported) case of UTF-16.
430 Return 1 on success, 0 on failure. */
432 static int
433 fp_setreadl(struct tok_state *tok, const char* enc)
435 PyObject *reader, *stream, *readline;
437 /* XXX: constify filename argument. */
438 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
439 if (stream == NULL)
440 return 0;
442 reader = PyCodec_StreamReader(enc, stream, NULL);
443 Py_DECREF(stream);
444 if (reader == NULL)
445 return 0;
447 readline = PyObject_GetAttrString(reader, "readline");
448 Py_DECREF(reader);
449 if (readline == NULL)
450 return 0;
452 tok->decoding_readline = readline;
453 return 1;
456 /* Fetch the next byte from TOK. */
458 static int fp_getc(struct tok_state *tok) {
459 return getc(tok->fp);
462 /* Unfetch the last byte back into TOK. */
464 static void fp_ungetc(int c, struct tok_state *tok) {
465 ungetc(c, tok->fp);
468 /* Read a line of input from TOK. Determine encoding
469 if necessary. */
471 static char *
472 decoding_fgets(char *s, int size, struct tok_state *tok)
474 char *line = NULL;
475 int badchar = 0;
476 for (;;) {
477 if (tok->decoding_state < 0) {
478 /* We already have a codec associated with
479 this input. */
480 line = fp_readl(s, size, tok);
481 break;
482 } else if (tok->decoding_state > 0) {
483 /* We want a 'raw' read. */
484 line = Py_UniversalNewlineFgets(s, size,
485 tok->fp, NULL);
486 break;
487 } else {
488 /* We have not yet determined the encoding.
489 If an encoding is found, use the file-pointer
490 reader functions from now on. */
491 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
492 return error_ret(tok);
493 assert(tok->decoding_state != 0);
496 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
497 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
498 return error_ret(tok);
501 #ifndef PGEN
502 /* The default encoding is ASCII, so make sure we don't have any
503 non-ASCII bytes in it. */
504 if (line && !tok->encoding) {
505 unsigned char *c;
506 for (c = (unsigned char *)line; *c; c++)
507 if (*c > 127) {
508 badchar = *c;
509 break;
512 if (badchar) {
513 char buf[500];
514 /* Need to add 1 to the line number, since this line
515 has not been counted, yet. */
516 sprintf(buf,
517 "Non-ASCII character '\\x%.2x' "
518 "in file %.200s on line %i, "
519 "but no encoding declared; "
520 "see http://www.python.org/peps/pep-0263.html for details",
521 badchar, tok->filename, tok->lineno + 1);
522 PyErr_SetString(PyExc_SyntaxError, buf);
523 return error_ret(tok);
525 #endif
526 return line;
529 static int
530 decoding_feof(struct tok_state *tok)
532 if (tok->decoding_state >= 0) {
533 return feof(tok->fp);
534 } else {
535 PyObject* buf = tok->decoding_buffer;
536 if (buf == NULL) {
537 buf = PyObject_CallObject(tok->decoding_readline, NULL);
538 if (buf == NULL) {
539 error_ret(tok);
540 return 1;
541 } else {
542 tok->decoding_buffer = buf;
545 return PyObject_Length(buf) == 0;
549 /* Fetch a byte from TOK, using the string buffer. */
551 static int
552 buf_getc(struct tok_state *tok) {
553 return Py_CHARMASK(*tok->str++);
556 /* Unfetch a byte from TOK, using the string buffer. */
558 static void
559 buf_ungetc(int c, struct tok_state *tok) {
560 tok->str--;
561 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
564 /* Set the readline function for TOK to ENC. For the string-based
565 tokenizer, this means to just record the encoding. */
567 static int
568 buf_setreadl(struct tok_state *tok, const char* enc) {
569 tok->enc = enc;
570 return 1;
/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char *str, const char *enc)
{
        PyObject *utf8;
        PyObject *buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
        if (buf == NULL)
                return NULL;
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        return utf8;
}
#endif
589 /* Decode a byte string STR for use as the buffer of TOK.
590 Look for encoding declarations inside STR, and record them
591 inside TOK. */
593 static const char *
594 decode_str(const char *str, struct tok_state *tok)
596 PyObject* utf8 = NULL;
597 const char *s;
598 const char *newl[2] = {NULL, NULL};
599 int lineno = 0;
600 tok->enc = NULL;
601 tok->str = str;
602 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
603 return error_ret(tok);
604 str = tok->str; /* string after BOM if any */
605 assert(str);
606 #ifdef Py_USING_UNICODE
607 if (tok->enc != NULL) {
608 utf8 = translate_into_utf8(str, tok->enc);
609 if (utf8 == NULL)
610 return error_ret(tok);
611 str = PyString_AsString(utf8);
613 #endif
614 for (s = str;; s++) {
615 if (*s == '\0') break;
616 else if (*s == '\n') {
617 assert(lineno < 2);
618 newl[lineno] = s;
619 lineno++;
620 if (lineno == 2) break;
623 tok->enc = NULL;
624 /* need to check line 1 and 2 separately since check_coding_spec
625 assumes a single line as input */
626 if (newl[0]) {
627 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
628 return error_ret(tok);
629 if (tok->enc == NULL && newl[1]) {
630 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
631 tok, buf_setreadl))
632 return error_ret(tok);
635 #ifdef Py_USING_UNICODE
636 if (tok->enc != NULL) {
637 assert(utf8 == NULL);
638 utf8 = translate_into_utf8(str, tok->enc);
639 if (utf8 == NULL)
640 return error_ret(tok);
641 str = PyString_AsString(utf8);
643 #endif
644 assert(tok->decoding_buffer == NULL);
645 tok->decoding_buffer = utf8; /* CAUTION */
646 return str;
649 #endif /* PGEN */
651 /* Set up tokenizer for string */
653 struct tok_state *
654 PyTokenizer_FromString(const char *str)
656 struct tok_state *tok = tok_new();
657 if (tok == NULL)
658 return NULL;
659 str = (char *)decode_str(str, tok);
660 if (str == NULL) {
661 PyTokenizer_Free(tok);
662 return NULL;
665 /* XXX: constify members. */
666 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
667 return tok;
671 /* Set up tokenizer for file */
673 struct tok_state *
674 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
676 struct tok_state *tok = tok_new();
677 if (tok == NULL)
678 return NULL;
679 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
680 PyTokenizer_Free(tok);
681 return NULL;
683 tok->cur = tok->inp = tok->buf;
684 tok->end = tok->buf + BUFSIZ;
685 tok->fp = fp;
686 tok->prompt = ps1;
687 tok->nextprompt = ps2;
688 return tok;
692 /* Free a tok_state structure */
694 void
695 PyTokenizer_Free(struct tok_state *tok)
697 if (tok->encoding != NULL)
698 PyMem_FREE(tok->encoding);
699 #ifndef PGEN
700 Py_XDECREF(tok->decoding_readline);
701 Py_XDECREF(tok->decoding_buffer);
702 #endif
703 if (tok->fp != NULL && tok->buf != NULL)
704 PyMem_FREE(tok->buf);
705 PyMem_FREE(tok);
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactive line from sys.stdin's declared encoding to
   UTF-8, replacing *inp in place and recording the encoding in TOK.
   Returns 0 on success (or harmless fallback to the raw bytes), -1 on
   memory error with tok->done set to E_NOMEM. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
        PyObject *enc, *sysstdin, *decoded, *utf8;
        const char *encoding;
        char *converted;

        /* Only applies when we are really reading from interactive stdin. */
        if (PySys_GetFile((char *)"stdin", NULL) != stdin)
                return 0;
        sysstdin = PySys_GetObject("stdin");
        if (sysstdin == NULL || !PyFile_Check(sysstdin))
                return 0;

        enc = ((PyFileObject *)sysstdin)->f_encoding;
        if (enc == NULL || !PyString_Check(enc))
                return 0;
        Py_INCREF(enc);

        encoding = PyString_AsString(enc);
        decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
        if (decoded == NULL)
                goto error_clear;

        utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
        Py_DECREF(decoded);
        if (utf8 == NULL)
                goto error_clear;

        assert(PyString_Check(utf8));
        converted = new_string(PyString_AS_STRING(utf8),
                               PyString_GET_SIZE(utf8));
        Py_DECREF(utf8);
        if (converted == NULL)
                goto error_nomem;

        PyMem_FREE(*inp);
        *inp = converted;
        if (tok->encoding != NULL)
                PyMem_FREE(tok->encoding);
        tok->encoding = new_string(encoding, strlen(encoding));
        if (tok->encoding == NULL)
                goto error_nomem;

        Py_DECREF(enc);
        return 0;

error_nomem:
        Py_DECREF(enc);
        tok->done = E_NOMEM;
        return -1;

error_clear:
        /* Fallback to iso-8859-1: for backward compatibility */
        Py_DECREF(enc);
        PyErr_Clear();
        return 0;
}
#endif
768 /* Get next char, updating state; error code goes into tok->done */
770 static int
771 tok_nextc(register struct tok_state *tok)
773 for (;;) {
774 if (tok->cur != tok->inp) {
775 return Py_CHARMASK(*tok->cur++); /* Fast path */
777 if (tok->done != E_OK)
778 return EOF;
779 if (tok->fp == NULL) {
780 char *end = strchr(tok->inp, '\n');
781 if (end != NULL)
782 end++;
783 else {
784 end = strchr(tok->inp, '\0');
785 if (end == tok->inp) {
786 tok->done = E_EOF;
787 return EOF;
790 if (tok->start == NULL)
791 tok->buf = tok->cur;
792 tok->line_start = tok->cur;
793 tok->lineno++;
794 tok->inp = end;
795 return Py_CHARMASK(*tok->cur++);
797 if (tok->prompt != NULL) {
798 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
799 if (tok->nextprompt != NULL)
800 tok->prompt = tok->nextprompt;
801 if (newtok == NULL)
802 tok->done = E_INTR;
803 else if (*newtok == '\0') {
804 PyMem_FREE(newtok);
805 tok->done = E_EOF;
807 #if !defined(PGEN) && defined(Py_USING_UNICODE)
808 else if (tok_stdin_decode(tok, &newtok) != 0)
809 PyMem_FREE(newtok);
810 #endif
811 else if (tok->start != NULL) {
812 size_t start = tok->start - tok->buf;
813 size_t oldlen = tok->cur - tok->buf;
814 size_t newlen = oldlen + strlen(newtok);
815 char *buf = tok->buf;
816 buf = (char *)PyMem_REALLOC(buf, newlen+1);
817 tok->lineno++;
818 if (buf == NULL) {
819 PyMem_FREE(tok->buf);
820 tok->buf = NULL;
821 PyMem_FREE(newtok);
822 tok->done = E_NOMEM;
823 return EOF;
825 tok->buf = buf;
826 tok->cur = tok->buf + oldlen;
827 tok->line_start = tok->cur;
828 strcpy(tok->buf + oldlen, newtok);
829 PyMem_FREE(newtok);
830 tok->inp = tok->buf + newlen;
831 tok->end = tok->inp + 1;
832 tok->start = tok->buf + start;
834 else {
835 tok->lineno++;
836 if (tok->buf != NULL)
837 PyMem_FREE(tok->buf);
838 tok->buf = newtok;
839 tok->line_start = tok->buf;
840 tok->cur = tok->buf;
841 tok->line_start = tok->buf;
842 tok->inp = strchr(tok->buf, '\0');
843 tok->end = tok->inp + 1;
846 else {
847 int done = 0;
848 Py_ssize_t cur = 0;
849 char *pt;
850 if (tok->start == NULL) {
851 if (tok->buf == NULL) {
852 tok->buf = (char *)
853 PyMem_MALLOC(BUFSIZ);
854 if (tok->buf == NULL) {
855 tok->done = E_NOMEM;
856 return EOF;
858 tok->end = tok->buf + BUFSIZ;
860 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
861 tok) == NULL) {
862 tok->done = E_EOF;
863 done = 1;
865 else {
866 tok->done = E_OK;
867 tok->inp = strchr(tok->buf, '\0');
868 done = tok->inp[-1] == '\n';
871 else {
872 cur = tok->cur - tok->buf;
873 if (decoding_feof(tok)) {
874 tok->done = E_EOF;
875 done = 1;
877 else
878 tok->done = E_OK;
880 tok->lineno++;
881 /* Read until '\n' or EOF */
882 while (!done) {
883 Py_ssize_t curstart = tok->start == NULL ? -1 :
884 tok->start - tok->buf;
885 Py_ssize_t curvalid = tok->inp - tok->buf;
886 Py_ssize_t newsize = curvalid + BUFSIZ;
887 char *newbuf = tok->buf;
888 newbuf = (char *)PyMem_REALLOC(newbuf,
889 newsize);
890 if (newbuf == NULL) {
891 tok->done = E_NOMEM;
892 tok->cur = tok->inp;
893 return EOF;
895 tok->buf = newbuf;
896 tok->inp = tok->buf + curvalid;
897 tok->end = tok->buf + newsize;
898 tok->start = curstart < 0 ? NULL :
899 tok->buf + curstart;
900 if (decoding_fgets(tok->inp,
901 (int)(tok->end - tok->inp),
902 tok) == NULL) {
903 /* Break out early on decoding
904 errors, as tok->buf will be NULL
906 if (tok->decoding_erred)
907 return EOF;
908 /* Last line does not end in \n,
909 fake one */
910 strcpy(tok->inp, "\n");
912 tok->inp = strchr(tok->inp, '\0');
913 done = tok->inp[-1] == '\n';
915 if (tok->buf != NULL) {
916 tok->cur = tok->buf + cur;
917 tok->line_start = tok->cur;
918 /* replace "\r\n" with "\n" */
919 /* For Mac leave the \r, giving a syntax error */
920 pt = tok->inp - 2;
921 if (pt >= tok->buf && *pt == '\r') {
922 *pt++ = '\n';
923 *pt = '\0';
924 tok->inp = pt;
928 if (tok->done != E_OK) {
929 if (tok->prompt != NULL)
930 PySys_WriteStderr("\n");
931 tok->cur = tok->inp;
932 return EOF;
935 /*NOTREACHED*/
939 /* Back-up one character */
941 static void
942 tok_backup(register struct tok_state *tok, register int c)
944 if (c != EOF) {
945 if (--tok->cur < tok->buf)
946 Py_FatalError("tok_backup: begin of buffer");
947 if (*tok->cur != c)
948 *tok->cur = c;
953 /* Return the token corresponding to a single character */
956 PyToken_OneChar(int c)
958 switch (c) {
959 case '(': return LPAR;
960 case ')': return RPAR;
961 case '[': return LSQB;
962 case ']': return RSQB;
963 case ':': return COLON;
964 case ',': return COMMA;
965 case ';': return SEMI;
966 case '+': return PLUS;
967 case '-': return MINUS;
968 case '*': return STAR;
969 case '/': return SLASH;
970 case '|': return VBAR;
971 case '&': return AMPER;
972 case '<': return LESS;
973 case '>': return GREATER;
974 case '=': return EQUAL;
975 case '.': return DOT;
976 case '%': return PERCENT;
977 case '`': return BACKQUOTE;
978 case '{': return LBRACE;
979 case '}': return RBRACE;
980 case '^': return CIRCUMFLEX;
981 case '~': return TILDE;
982 case '@': return AT;
983 default: return OP;
989 PyToken_TwoChars(int c1, int c2)
991 switch (c1) {
992 case '=':
993 switch (c2) {
994 case '=': return EQEQUAL;
996 break;
997 case '!':
998 switch (c2) {
999 case '=': return NOTEQUAL;
1001 break;
1002 case '<':
1003 switch (c2) {
1004 case '>': return NOTEQUAL;
1005 case '=': return LESSEQUAL;
1006 case '<': return LEFTSHIFT;
1008 break;
1009 case '>':
1010 switch (c2) {
1011 case '=': return GREATEREQUAL;
1012 case '>': return RIGHTSHIFT;
1014 break;
1015 case '+':
1016 switch (c2) {
1017 case '=': return PLUSEQUAL;
1019 break;
1020 case '-':
1021 switch (c2) {
1022 case '=': return MINEQUAL;
1024 break;
1025 case '*':
1026 switch (c2) {
1027 case '*': return DOUBLESTAR;
1028 case '=': return STAREQUAL;
1030 break;
1031 case '/':
1032 switch (c2) {
1033 case '/': return DOUBLESLASH;
1034 case '=': return SLASHEQUAL;
1036 break;
1037 case '|':
1038 switch (c2) {
1039 case '=': return VBAREQUAL;
1041 break;
1042 case '%':
1043 switch (c2) {
1044 case '=': return PERCENTEQUAL;
1046 break;
1047 case '&':
1048 switch (c2) {
1049 case '=': return AMPEREQUAL;
1051 break;
1052 case '^':
1053 switch (c2) {
1054 case '=': return CIRCUMFLEXEQUAL;
1056 break;
1058 return OP;
1062 PyToken_ThreeChars(int c1, int c2, int c3)
1064 switch (c1) {
1065 case '<':
1066 switch (c2) {
1067 case '<':
1068 switch (c3) {
1069 case '=':
1070 return LEFTSHIFTEQUAL;
1072 break;
1074 break;
1075 case '>':
1076 switch (c2) {
1077 case '>':
1078 switch (c3) {
1079 case '=':
1080 return RIGHTSHIFTEQUAL;
1082 break;
1084 break;
1085 case '*':
1086 switch (c2) {
1087 case '*':
1088 switch (c3) {
1089 case '=':
1090 return DOUBLESTAREQUAL;
1092 break;
1094 break;
1095 case '/':
1096 switch (c2) {
1097 case '/':
1098 switch (c3) {
1099 case '=':
1100 return DOUBLESLASHEQUAL;
1102 break;
1104 break;
1106 return OP;
1109 static int
1110 indenterror(struct tok_state *tok)
1112 if (tok->alterror) {
1113 tok->done = E_TABSPACE;
1114 tok->cur = tok->inp;
1115 return 1;
1117 if (tok->altwarning) {
1118 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1119 "in indentation\n", tok->filename);
1120 tok->altwarning = 0;
1122 return 0;
1126 /* Get next token, after space stripping etc. */
1128 static int
1129 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1131 register int c;
1132 int blankline;
1134 *p_start = *p_end = NULL;
1135 nextline:
1136 tok->start = NULL;
1137 blankline = 0;
1139 /* Get indentation level */
1140 if (tok->atbol) {
1141 register int col = 0;
1142 register int altcol = 0;
1143 tok->atbol = 0;
1144 for (;;) {
1145 c = tok_nextc(tok);
1146 if (c == ' ')
1147 col++, altcol++;
1148 else if (c == '\t') {
1149 col = (col/tok->tabsize + 1) * tok->tabsize;
1150 altcol = (altcol/tok->alttabsize + 1)
1151 * tok->alttabsize;
1153 else if (c == '\014') /* Control-L (formfeed) */
1154 col = altcol = 0; /* For Emacs users */
1155 else
1156 break;
1158 tok_backup(tok, c);
1159 if (c == '#' || c == '\n') {
1160 /* Lines with only whitespace and/or comments
1161 shouldn't affect the indentation and are
1162 not passed to the parser as NEWLINE tokens,
1163 except *totally* empty lines in interactive
1164 mode, which signal the end of a command group. */
1165 if (col == 0 && c == '\n' && tok->prompt != NULL)
1166 blankline = 0; /* Let it through */
1167 else
1168 blankline = 1; /* Ignore completely */
1169 /* We can't jump back right here since we still
1170 may need to skip to the end of a comment */
1172 if (!blankline && tok->level == 0) {
1173 if (col == tok->indstack[tok->indent]) {
1174 /* No change */
1175 if (altcol != tok->altindstack[tok->indent]) {
1176 if (indenterror(tok))
1177 return ERRORTOKEN;
1180 else if (col > tok->indstack[tok->indent]) {
1181 /* Indent -- always one */
1182 if (tok->indent+1 >= MAXINDENT) {
1183 tok->done = E_TOODEEP;
1184 tok->cur = tok->inp;
1185 return ERRORTOKEN;
1187 if (altcol <= tok->altindstack[tok->indent]) {
1188 if (indenterror(tok))
1189 return ERRORTOKEN;
1191 tok->pendin++;
1192 tok->indstack[++tok->indent] = col;
1193 tok->altindstack[tok->indent] = altcol;
1195 else /* col < tok->indstack[tok->indent] */ {
1196 /* Dedent -- any number, must be consistent */
1197 while (tok->indent > 0 &&
1198 col < tok->indstack[tok->indent]) {
1199 tok->pendin--;
1200 tok->indent--;
1202 if (col != tok->indstack[tok->indent]) {
1203 tok->done = E_DEDENT;
1204 tok->cur = tok->inp;
1205 return ERRORTOKEN;
1207 if (altcol != tok->altindstack[tok->indent]) {
1208 if (indenterror(tok))
1209 return ERRORTOKEN;
1215 tok->start = tok->cur;
1217 /* Return pending indents/dedents */
1218 if (tok->pendin != 0) {
1219 if (tok->pendin < 0) {
1220 tok->pendin++;
1221 return DEDENT;
1223 else {
1224 tok->pendin--;
1225 return INDENT;
1229 again:
1230 tok->start = NULL;
1231 /* Skip spaces */
1232 do {
1233 c = tok_nextc(tok);
1234 } while (c == ' ' || c == '\t' || c == '\014');
1236 /* Set start of current token */
1237 tok->start = tok->cur - 1;
1239 /* Skip comment, while looking for tab-setting magic */
1240 if (c == '#') {
1241 static char *tabforms[] = {
1242 "tab-width:", /* Emacs */
1243 ":tabstop=", /* vim, full form */
1244 ":ts=", /* vim, abbreviated form */
1245 "set tabsize=", /* will vi never die? */
1246 /* more templates can be added here to support other editors */
1248 char cbuf[80];
1249 char *tp, **cp;
1250 tp = cbuf;
1251 do {
1252 *tp++ = c = tok_nextc(tok);
1253 } while (c != EOF && c != '\n' &&
1254 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1255 *tp = '\0';
1256 for (cp = tabforms;
1257 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1258 cp++) {
1259 if ((tp = strstr(cbuf, *cp))) {
1260 int newsize = atoi(tp + strlen(*cp));
1262 if (newsize >= 1 && newsize <= 40) {
1263 tok->tabsize = newsize;
1264 if (Py_VerboseFlag)
1265 PySys_WriteStderr(
1266 "Tab size set to %d\n",
1267 newsize);
1271 while (c != EOF && c != '\n')
1272 c = tok_nextc(tok);
1275 /* Check for EOF and errors now */
1276 if (c == EOF) {
1277 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1280 /* Identifier (most frequent token!) */
1281 if (isalpha(c) || c == '_') {
1282 /* Process r"", u"" and ur"" */
1283 switch (c) {
1284 case 'b':
1285 case 'B':
1286 c = tok_nextc(tok);
1287 if (c == 'r' || c == 'R')
1288 c = tok_nextc(tok);
1289 if (c == '"' || c == '\'')
1290 goto letter_quote;
1291 break;
1292 case 'r':
1293 case 'R':
1294 c = tok_nextc(tok);
1295 if (c == '"' || c == '\'')
1296 goto letter_quote;
1297 break;
1298 case 'u':
1299 case 'U':
1300 c = tok_nextc(tok);
1301 if (c == 'r' || c == 'R')
1302 c = tok_nextc(tok);
1303 if (c == '"' || c == '\'')
1304 goto letter_quote;
1305 break;
1307 while (isalnum(c) || c == '_') {
1308 c = tok_nextc(tok);
1310 tok_backup(tok, c);
1311 *p_start = tok->start;
1312 *p_end = tok->cur;
1313 return NAME;
1316 /* Newline */
1317 if (c == '\n') {
1318 tok->atbol = 1;
1319 if (blankline || tok->level > 0)
1320 goto nextline;
1321 *p_start = tok->start;
1322 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1323 tok->cont_line = 0;
1324 return NEWLINE;
1327 /* Period or number starting with period? */
1328 if (c == '.') {
1329 c = tok_nextc(tok);
1330 if (isdigit(c)) {
1331 goto fraction;
1333 else {
1334 tok_backup(tok, c);
1335 *p_start = tok->start;
1336 *p_end = tok->cur;
1337 return DOT;
1341 /* Number */
1342 if (isdigit(c)) {
1343 if (c == '0') {
1344 /* Hex, octal or binary -- maybe. */
1345 c = tok_nextc(tok);
1346 if (c == '.')
1347 goto fraction;
1348 #ifndef WITHOUT_COMPLEX
1349 if (c == 'j' || c == 'J')
1350 goto imaginary;
1351 #endif
1352 if (c == 'x' || c == 'X') {
1354 /* Hex */
1355 c = tok_nextc(tok);
1356 if (!isxdigit(c)) {
1357 tok->done = E_TOKEN;
1358 tok_backup(tok, c);
1359 return ERRORTOKEN;
1361 do {
1362 c = tok_nextc(tok);
1363 } while (isxdigit(c));
1365 else if (c == 'o' || c == 'O') {
1366 /* Octal */
1367 c = tok_nextc(tok);
1368 if (c < '0' || c >= '8') {
1369 tok->done = E_TOKEN;
1370 tok_backup(tok, c);
1371 return ERRORTOKEN;
1373 do {
1374 c = tok_nextc(tok);
1375 } while ('0' <= c && c < '8');
1377 else if (c == 'b' || c == 'B') {
1378 /* Binary */
1379 c = tok_nextc(tok);
1380 if (c != '0' && c != '1') {
1381 tok->done = E_TOKEN;
1382 tok_backup(tok, c);
1383 return ERRORTOKEN;
1385 do {
1386 c = tok_nextc(tok);
1387 } while (c == '0' || c == '1');
1389 else {
1390 int found_decimal = 0;
1391 /* Octal; c is first char of it */
1392 /* There's no 'isoctdigit' macro, sigh */
1393 while ('0' <= c && c < '8') {
1394 c = tok_nextc(tok);
1396 if (isdigit(c)) {
1397 found_decimal = 1;
1398 do {
1399 c = tok_nextc(tok);
1400 } while (isdigit(c));
1402 if (c == '.')
1403 goto fraction;
1404 else if (c == 'e' || c == 'E')
1405 goto exponent;
1406 #ifndef WITHOUT_COMPLEX
1407 else if (c == 'j' || c == 'J')
1408 goto imaginary;
1409 #endif
1410 else if (found_decimal) {
1411 tok->done = E_TOKEN;
1412 tok_backup(tok, c);
1413 return ERRORTOKEN;
1416 if (c == 'l' || c == 'L')
1417 c = tok_nextc(tok);
1419 else {
1420 /* Decimal */
1421 do {
1422 c = tok_nextc(tok);
1423 } while (isdigit(c));
1424 if (c == 'l' || c == 'L')
1425 c = tok_nextc(tok);
1426 else {
1427 /* Accept floating point numbers. */
1428 if (c == '.') {
1429 fraction:
1430 /* Fraction */
1431 do {
1432 c = tok_nextc(tok);
1433 } while (isdigit(c));
1435 if (c == 'e' || c == 'E') {
1436 exponent:
1437 /* Exponent part */
1438 c = tok_nextc(tok);
1439 if (c == '+' || c == '-')
1440 c = tok_nextc(tok);
1441 if (!isdigit(c)) {
1442 tok->done = E_TOKEN;
1443 tok_backup(tok, c);
1444 return ERRORTOKEN;
1446 do {
1447 c = tok_nextc(tok);
1448 } while (isdigit(c));
1450 #ifndef WITHOUT_COMPLEX
1451 if (c == 'j' || c == 'J')
1452 /* Imaginary part */
1453 imaginary:
1454 c = tok_nextc(tok);
1455 #endif
1458 tok_backup(tok, c);
1459 *p_start = tok->start;
1460 *p_end = tok->cur;
1461 return NUMBER;
1464 letter_quote:
1465 /* String */
1466 if (c == '\'' || c == '"') {
1467 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1468 int quote = c;
1469 int triple = 0;
1470 int tripcount = 0;
1471 for (;;) {
1472 c = tok_nextc(tok);
1473 if (c == '\n') {
1474 if (!triple) {
1475 tok->done = E_EOLS;
1476 tok_backup(tok, c);
1477 return ERRORTOKEN;
1479 tripcount = 0;
1480 tok->cont_line = 1; /* multiline string. */
1482 else if (c == EOF) {
1483 if (triple)
1484 tok->done = E_EOFS;
1485 else
1486 tok->done = E_EOLS;
1487 tok->cur = tok->inp;
1488 return ERRORTOKEN;
1490 else if (c == quote) {
1491 tripcount++;
1492 if (tok->cur - tok->start == quote2) {
1493 c = tok_nextc(tok);
1494 if (c == quote) {
1495 triple = 1;
1496 tripcount = 0;
1497 continue;
1499 tok_backup(tok, c);
1501 if (!triple || tripcount == 3)
1502 break;
1504 else if (c == '\\') {
1505 tripcount = 0;
1506 c = tok_nextc(tok);
1507 if (c == EOF) {
1508 tok->done = E_EOLS;
1509 tok->cur = tok->inp;
1510 return ERRORTOKEN;
1513 else
1514 tripcount = 0;
1516 *p_start = tok->start;
1517 *p_end = tok->cur;
1518 return STRING;
1521 /* Line continuation */
1522 if (c == '\\') {
1523 c = tok_nextc(tok);
1524 if (c != '\n') {
1525 tok->done = E_LINECONT;
1526 tok->cur = tok->inp;
1527 return ERRORTOKEN;
1529 tok->cont_line = 1;
1530 goto again; /* Read next line */
1533 /* Check for two-character token */
1535 int c2 = tok_nextc(tok);
1536 int token = PyToken_TwoChars(c, c2);
1537 #ifndef PGEN
1538 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1539 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1540 "<> not supported in 3.x; use !=",
1541 tok->filename, tok->lineno,
1542 NULL, NULL)) {
1543 return ERRORTOKEN;
1546 #endif
1547 if (token != OP) {
1548 int c3 = tok_nextc(tok);
1549 int token3 = PyToken_ThreeChars(c, c2, c3);
1550 if (token3 != OP) {
1551 token = token3;
1552 } else {
1553 tok_backup(tok, c3);
1555 *p_start = tok->start;
1556 *p_end = tok->cur;
1557 return token;
1559 tok_backup(tok, c2);
1562 /* Keep track of parentheses nesting level */
1563 switch (c) {
1564 case '(':
1565 case '[':
1566 case '{':
1567 tok->level++;
1568 break;
1569 case ')':
1570 case ']':
1571 case '}':
1572 tok->level--;
1573 break;
1576 /* Punctuation character */
1577 *p_start = tok->start;
1578 *p_end = tok->cur;
1579 return PyToken_OneChar(c);
1583 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1585 int result = tok_get(tok, p_start, p_end);
1586 if (tok->decoding_erred) {
1587 result = ERRORTOKEN;
1588 tok->done = E_DECODE;
1590 return result;
/* This function is only called from parsetok.  However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */
1597 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1598 char*
1599 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1601 return NULL;
1603 #else
1604 #ifdef Py_USING_UNICODE
1605 static PyObject *
1606 dec_utf8(const char *enc, const char *text, size_t len) {
1607 PyObject *ret = NULL;
1608 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1609 if (unicode_text) {
1610 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1611 Py_DECREF(unicode_text);
1613 if (!ret) {
1614 PyErr_Clear();
1616 return ret;
1618 char *
1619 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1621 char *text = NULL;
1622 if (tok->encoding) {
1623 /* convert source to original encondig */
1624 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1625 if (lineobj != NULL) {
1626 int linelen = PyString_Size(lineobj);
1627 const char *line = PyString_AsString(lineobj);
1628 text = PyObject_MALLOC(linelen + 1);
1629 if (text != NULL && line != NULL) {
1630 if (linelen)
1631 strncpy(text, line, linelen);
1632 text[linelen] = '\0';
1634 Py_DECREF(lineobj);
1636 /* adjust error offset */
1637 if (*offset > 1) {
1638 PyObject *offsetobj = dec_utf8(tok->encoding,
1639 tok->buf, *offset-1);
1640 if (offsetobj) {
1641 *offset = PyString_Size(offsetobj) + 1;
1642 Py_DECREF(offsetobj);
1648 return text;
1651 #endif /* defined(Py_USING_UNICODE) */
1652 #endif
#ifdef Py_DEBUG

/* Debug helper: print a token's symbolic name and, for tokens that carry
   text (NAME/NUMBER/STRING/OP), the token's source text between START and
   END.  No trailing newline is emitted; the caller controls layout. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif