Added LoggerAdapter class, changed copyright dates, made check for extra parameter...
[python.git] / Parser / tokenizer.c
blobee353aaebf5cec2e639bf6e4964ef763a9725583
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
30 /* Convert a possibly signed character to a nonnegative int */
31 /* XXX This assumes characters are 8 bits wide */
32 #ifdef __CHAR_UNSIGNED__
33 #define Py_CHARMASK(c) (c)
34 #else
35 #define Py_CHARMASK(c) ((c) & 0xff)
36 #endif
38 /* Forward */
39 static struct tok_state *tok_new(void);
40 static int tok_nextc(struct tok_state *tok);
41 static void tok_backup(struct tok_state *tok, int c);
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
104 /* Create and initialize a new tok_state structure */
106 static struct tok_state *
107 tok_new(void)
109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110 sizeof(struct tok_state));
111 if (tok == NULL)
112 return NULL;
113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
114 tok->done = E_OK;
115 tok->fp = NULL;
116 tok->tabsize = TABSIZE;
117 tok->indent = 0;
118 tok->indstack[0] = 0;
119 tok->atbol = 1;
120 tok->pendin = 0;
121 tok->prompt = tok->nextprompt = NULL;
122 tok->lineno = 0;
123 tok->level = 0;
124 tok->filename = NULL;
125 tok->altwarning = 0;
126 tok->alterror = 0;
127 tok->alttabsize = 1;
128 tok->altindstack[0] = 0;
129 tok->decoding_state = 0;
130 tok->decoding_erred = 0;
131 tok->read_coding_spec = 0;
132 tok->encoding = NULL;
133 tok->cont_line = 0;
134 #ifndef PGEN
135 tok->decoding_readline = NULL;
136 tok->decoding_buffer = NULL;
137 #endif
138 return tok;
141 #ifdef PGEN
/* PGEN build: read a raw line with stdio; no source decoding is done. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	return fgets(s, size, tok->fp);
}
/* PGEN build: EOF test is a plain feof() on the underlying stream. */
static int
decoding_feof(struct tok_state *tok)
{
	return feof(tok->fp);
}
/* PGEN build: strings are tokenized as-is; no encoding handling. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	return str;
}
161 #else /* PGEN */
/* Record that a decoding error occurred and free the input buffer.
   PyTokenizer_Free only frees tok->buf when tok->fp is set, so the
   buffer is released here to avoid a leak.  Always returns NULL so
   callers can propagate the failure as if EOF had been reached. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
	tok->decoding_erred = 1;
	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
		PyMem_FREE(tok->buf);
	tok->buf = NULL;
	return NULL; /* as if it were EOF */
}
173 static char *
174 new_string(const char *s, Py_ssize_t len)
176 char* result = (char *)PyMem_MALLOC(len + 1);
177 if (result != NULL) {
178 memcpy(result, s, len);
179 result[len] = '\0';
181 return result;
/* Normalize an encoding name (for utf-8 and latin-1): the first 12
   characters are lowercased with '_' mapped to '-', then the common
   spellings of utf-8 and iso-8859-1 are canonicalized.  Any other name
   is returned unchanged; callers compare the result pointer against S
   to detect whether a substitution took place. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* Cast to unsigned char: passing a negative value
			   (non-ASCII byte with signed char) to tolower()
			   is undefined behavior. */
			buf[i] = tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
/* Return the coding spec in S (a malloc'ed, normalized encoding name),
   or NULL if none is found.  S has length SIZE and is one source line. */

static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
	Py_ssize_t i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			/* "coding" must be followed by ':' or '='. */
			if (t[0] != ':' && t[0] != '=')
				continue;
			/* Skip spaces/tabs before the encoding name. */
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			begin = t;
			/* Encoding names are [A-Za-z0-9._-]+. */
			while (isalnum(Py_CHARMASK(t[0])) ||
			       t[0] == '-' || t[0] == '_' || t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q = get_normal_name(r);
				/* get_normal_name returned a literal:
				   replace r with a malloc'ed copy. */
				if (r != q) {
					PyMem_FREE(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.  On failure a SyntaxError
   describing the encoding conflict is set. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
		  int set_readline(struct tok_state *, const char *))
{
	char * cs;
	int r = 1;

	if (tok->cont_line)
		/* It's a continuation line, so it can't be a coding spec. */
		return 1;
	cs = get_coding_spec(line, size);	/* malloc'ed, we own it */
	if (cs != NULL) {
		tok->read_coding_spec = 1;
		if (tok->encoding == NULL) {
			assert(tok->decoding_state == 1); /* raw */
			if (strcmp(cs, "utf-8") == 0 ||
			    strcmp(cs, "iso-8859-1") == 0) {
				/* Ownership of cs moves to tok->encoding. */
				tok->encoding = cs;
			} else {
#ifdef Py_USING_UNICODE
				r = set_readline(tok, cs);
				if (r) {
					tok->encoding = cs;
					tok->decoding_state = -1;
				}
				else
					PyMem_FREE(cs);
#else
				/* Without Unicode support, we cannot
				   process the coding spec. Since there
				   won't be any Unicode literals, that
				   won't matter. */
				PyMem_FREE(cs);
#endif
			}
		} else {	/* then, compare cs with BOM */
			r = (strcmp(tok->encoding, cs) == 0);
			PyMem_FREE(cs);
		}
	}
	if (!r) {
		cs = tok->encoding;
		if (!cs)
			cs = "with BOM";
		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
	}
	return r;
}
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.
   Only the UTF-8 BOM (EF BB BF) is currently acted upon; UTF-16 BOM
   handling is compiled out below. */

static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	tok->decoding_state = 1;
	if (ch == EOF) {
		return 1;
	} else if (ch == 0xEF) {
		/* Possible UTF-8 BOM: need EF BB BF exactly. */
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported.  */
	} else if (ch == 0xFE) {
		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
		if (!set_readline(tok, "utf-16-be")) return 0;
		tok->decoding_state = -1;
	} else if (ch == 0xFF) {
		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
		if (!set_readline(tok, "utf-16-le")) return 0;
		tok->decoding_state = -1;
#endif
	} else {
		/* No BOM: push the byte back and read it normally. */
		unget_char(ch, tok);
		return 1;
	}
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
	return 1;
  NON_BOM:
	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
	return 1;
}
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
	stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
	(in the s buffer) to copy entire contents of the line read
	by tok->decoding_readline.  tok->decoding_buffer has the overflow.
	In this case, fp_readl is called in a loop (with an expanded buffer)
	until the buffer ends with a '\n' (or until the end of the file is
	reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8 = NULL;
	PyObject* buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		/* Case 1: fetch a fresh line from the codec reader. */
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		/* Cases 2 and 3: consume the stashed buffer; a string
		   means it is already UTF-8 overflow from a prior call. */
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;
	}
	if (utf8 == NULL) {
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		/* Does not fit: stash the tail for the next call. */
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL; /* EOF */
	return s;
#endif
}
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
	PyObject *reader, *stream, *readline;

	/* XXX: constify filename argument. */
	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
	if (stream == NULL)
		return 0;

	reader = PyCodec_StreamReader(enc, stream, NULL);
	Py_DECREF(stream);
	if (reader == NULL)
		return 0;

	readline = PyObject_GetAttrString(reader, "readline");
	Py_DECREF(reader);
	if (readline == NULL)
		return 0;

	/* tok now owns the reference; released in PyTokenizer_Free. */
	tok->decoding_readline = readline;
	return 1;
}
447 /* Fetch the next byte from TOK. */
449 static int fp_getc(struct tok_state *tok) {
450 return getc(tok->fp);
/* Unfetch the last byte back into TOK.  Note: stdio only guarantees a
   single byte of pushback, which is all check_bom relies on. */

static void fp_ungetc(int c, struct tok_state *tok) {
	ungetc(c, tok->fp);
}
/* Read a line of input from TOK. Determine encoding
   if necessary (BOM on the first read, coding spec on lines 1-2).
   Returns S on success, NULL on EOF or error (error_ret sets state). */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	char *line = NULL;
	int badchar = 0;
	for (;;) {
		if (tok->decoding_state < 0) {
			/* We already have a codec associated with
			   this input. */
			line = fp_readl(s, size, tok);
			break;
		} else if (tok->decoding_state > 0) {
			/* We want a 'raw' read. */
			line = Py_UniversalNewlineFgets(s, size,
							tok->fp, NULL);
			break;
		} else {
			/* We have not yet determined the encoding.
			   If an encoding is found, use the file-pointer
			   reader functions from now on. */
			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				return error_ret(tok);
			assert(tok->decoding_state != 0);
		}
	}
	/* A PEP 263 coding spec may only appear on the first two lines. */
	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
			return error_ret(tok);
		}
	}
#ifndef PGEN
	/* The default encoding is ASCII, so make sure we don't have any
	   non-ASCII bytes in it. */
	if (line && !tok->encoding) {
		unsigned char *c;
		for (c = (unsigned char *)line; *c; c++)
			if (*c > 127) {
				badchar = *c;
				break;
			}
	}
	if (badchar) {
		char buf[500];
		/* Need to add 1 to the line number, since this line
		   has not been counted, yet.  */
		sprintf(buf,
			"Non-ASCII character '\\x%.2x' "
			"in file %.200s on line %i, "
			"but no encoding declared; "
			"see http://www.python.org/peps/pep-0263.html for details",
			badchar, tok->filename, tok->lineno + 1);
		PyErr_SetString(PyExc_SyntaxError, buf);
		return error_ret(tok);
	}
#endif
	return line;
}
/* Return nonzero at end of input.  For raw reads this is feof(); for
   codec-driven reads we must pre-fetch a line (stashed in
   tok->decoding_buffer for fp_readl to consume) and test its length. */
static int
decoding_feof(struct tok_state *tok)
{
	if (tok->decoding_state >= 0) {
		return feof(tok->fp);
	} else {
		PyObject* buf = tok->decoding_buffer;
		if (buf == NULL) {
			buf = PyObject_CallObject(tok->decoding_readline, NULL);
			if (buf == NULL) {
				/* Treat a readline failure as EOF after
				   recording the decoding error. */
				error_ret(tok);
				return 1;
			} else {
				tok->decoding_buffer = buf;
			}
		}
		return PyObject_Length(buf) == 0;
	}
}
/* Fetch a byte from TOK, using the string buffer (tok->str). */

static int
buf_getc(struct tok_state *tok) {
	return Py_CHARMASK(*tok->str++);
}
/* Unfetch a byte from TOK, using the string buffer.  Only rewinds the
   pointer; the buffer itself is never written. */

static void
buf_ungetc(int c, struct tok_state *tok) {
	tok->str--;
	assert(Py_CHARMASK(*tok->str) == c);	/* tok->cur may point to a read-only segment */
}
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
	tok->enc = enc;
	return 1;
}
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL with an exception set. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *utf8;
	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
	if (buf == NULL)
		return NULL;
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	return utf8;
}
#endif
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.  Returns the (possibly re-encoded) buffer, or NULL on
   error.  When re-encoding happened the returned pointer aliases the
   PyString stashed in tok->decoding_buffer, which keeps it alive. */

static const char *
decode_str(const char *str, struct tok_state *tok)
{
	PyObject* utf8 = NULL;
	const char *s;
	int lineno = 0;
	tok->enc = NULL;
	tok->str = str;
	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
		return error_ret(tok);
	str = tok->str;		/* string after BOM if any */
	assert(str);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		/* A BOM was seen: convert from the BOM's encoding. */
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return error_ret(tok);
		str = PyString_AsString(utf8);
	}
#endif
	/* A coding spec may only appear on the first two lines:
	   find the end of line 2 (or of the string). */
	for (s = str;; s++) {
		if (*s == '\0') break;
		else if (*s == '\n') {
			lineno++;
			if (lineno == 2) break;
		}
	}
	tok->enc = NULL;
	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
		return error_ret(tok);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		/* A coding spec was found: re-decode the whole buffer. */
		assert(utf8 == NULL);
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL) {
			PyErr_Format(PyExc_SyntaxError,
				"unknown encoding: %s", tok->enc);
			return error_ret(tok);
		}
		str = PyString_AsString(utf8);
	}
#endif
	assert(tok->decoding_buffer == NULL);
	tok->decoding_buffer = utf8; /* CAUTION */
	return str;
}
631 #endif /* PGEN */
/* Set up tokenizer for string.  Returns a new tok_state (free with
   PyTokenizer_Free) or NULL on allocation/decoding failure. */

struct tok_state *
PyTokenizer_FromString(const char *str)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	str = (char *)decode_str(str, tok);
	if (str == NULL) {
		PyTokenizer_Free(tok);
		return NULL;
	}

	/* XXX: constify members. */
	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
	return tok;
}
/* Set up tokenizer for file.  PS1/PS2 are the interactive prompts (may
   be NULL for non-interactive input).  Returns a new tok_state (free
   with PyTokenizer_Free) or NULL on allocation failure. */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
		PyTokenizer_Free(tok);
		return NULL;
	}
	/* Empty buffer: cur == inp forces the first real read. */
	tok->cur = tok->inp = tok->buf;
	tok->end = tok->buf + BUFSIZ;
	tok->fp = fp;
	tok->prompt = ps1;
	tok->nextprompt = ps2;
	return tok;
}
/* Free a tok_state structure and everything it owns.  Note: tok->buf
   is only owned by tok when reading from a file (tok->fp set); for
   string input it aliases the caller's buffer (see error_ret). */

void
PyTokenizer_Free(struct tok_state *tok)
{
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
#ifndef PGEN
	Py_XDECREF(tok->decoding_readline);
	Py_XDECREF(tok->decoding_buffer);
#endif
	if (tok->fp != NULL && tok->buf != NULL)
		PyMem_FREE(tok->buf);
	PyMem_FREE(tok);
}
690 #if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactive input line *INP from sys.stdin's encoding to
   UTF-8, replacing *INP (the old buffer is freed) and recording the
   encoding in tok->encoding.  Returns 0 on success or when decoding is
   skipped/falls back (stdin redirected, no usable encoding, or decode
   error — then *INP is left unchanged), -1 on memory error with
   tok->done set to E_NOMEM. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
	PyObject *enc, *sysstdin, *decoded, *utf8;
	const char *encoding;
	char *converted;

	/* Only applies when reading the real interactive stdin. */
	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
		return 0;
	sysstdin = PySys_GetObject("stdin");
	if (sysstdin == NULL || !PyFile_Check(sysstdin))
		return 0;

	enc = ((PyFileObject *)sysstdin)->f_encoding;
	if (enc == NULL || !PyString_Check(enc))
		return 0;
	Py_INCREF(enc);

	encoding = PyString_AsString(enc);
	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
	if (decoded == NULL)
		goto error_clear;

	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
	Py_DECREF(decoded);
	if (utf8 == NULL)
		goto error_clear;

	assert(PyString_Check(utf8));
	converted = new_string(PyString_AS_STRING(utf8),
			       PyString_GET_SIZE(utf8));
	Py_DECREF(utf8);
	if (converted == NULL)
		goto error_nomem;

	/* Success: swap in the UTF-8 copy of the line. */
	PyMem_FREE(*inp);
	*inp = converted;
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string(encoding, strlen(encoding));
	if (tok->encoding == NULL)
		goto error_nomem;

	Py_DECREF(enc);
	return 0;

error_nomem:
	Py_DECREF(enc);
	tok->done = E_NOMEM;
	return -1;

error_clear:
	/* Fallback to iso-8859-1: for backward compatibility */
	Py_DECREF(enc);
	PyErr_Clear();
	return 0;
}
748 #endif
/* Get next char, updating state; error code goes into tok->done.
   Three input modes share this function:
     - string input  (tok->fp == NULL): advance through the buffer;
     - interactive   (tok->prompt set): PyOS_Readline one line at a time,
       appending to the buffer if a token spans lines (tok->start set);
     - file input    (otherwise): decoding_fgets into a growable buffer
       until a full '\n'-terminated line is available.
   Returns the next byte (masked to 0..255) or EOF. */

static int
tok_nextc(register struct tok_state *tok)
{
	for (;;) {
		if (tok->cur != tok->inp) {
			return Py_CHARMASK(*tok->cur++); /* Fast path */
		}
		if (tok->done != E_OK)
			return EOF;
		if (tok->fp == NULL) {
			/* String input: advance inp to the end of the
			   next line (or end of string). */
			char *end = strchr(tok->inp, '\n');
			if (end != NULL)
				end++;
			else {
				end = strchr(tok->inp, '\0');
				if (end == tok->inp) {
					tok->done = E_EOF;
					return EOF;
				}
			}
			if (tok->start == NULL)
				tok->buf = tok->cur;
			tok->line_start = tok->cur;
			tok->lineno++;
			tok->inp = end;
			return Py_CHARMASK(*tok->cur++);
		}
		if (tok->prompt != NULL) {
			/* Interactive input: read one line. */
			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
			if (tok->nextprompt != NULL)
				tok->prompt = tok->nextprompt;
			if (newtok == NULL)
				tok->done = E_INTR;
			else if (*newtok == '\0') {
				PyMem_FREE(newtok);
				tok->done = E_EOF;
			}
#if !defined(PGEN) && defined(Py_USING_UNICODE)
			else if (tok_stdin_decode(tok, &newtok) != 0)
				PyMem_FREE(newtok);
#endif
			else if (tok->start != NULL) {
				/* A token is in progress: append the new
				   line to the existing buffer, keeping
				   tok->start valid across the realloc. */
				size_t start = tok->start - tok->buf;
				size_t oldlen = tok->cur - tok->buf;
				size_t newlen = oldlen + strlen(newtok);
				char *buf = tok->buf;
				buf = (char *)PyMem_REALLOC(buf, newlen+1);
				tok->lineno++;
				if (buf == NULL) {
					PyMem_FREE(tok->buf);
					tok->buf = NULL;
					PyMem_FREE(newtok);
					tok->done = E_NOMEM;
					return EOF;
				}
				tok->buf = buf;
				tok->cur = tok->buf + oldlen;
				tok->line_start = tok->cur;
				strcpy(tok->buf + oldlen, newtok);
				PyMem_FREE(newtok);
				tok->inp = tok->buf + newlen;
				tok->end = tok->inp + 1;
				tok->start = tok->buf + start;
			}
			else {
				/* No token in progress: the new line
				   replaces the buffer outright. */
				tok->lineno++;
				if (tok->buf != NULL)
					PyMem_FREE(tok->buf);
				tok->buf = newtok;
				tok->line_start = tok->buf;
				tok->cur = tok->buf;
				tok->line_start = tok->buf;
				tok->inp = strchr(tok->buf, '\0');
				tok->end = tok->inp + 1;
			}
		}
		else {
			/* File input. */
			int done = 0;
			Py_ssize_t cur = 0;
			char *pt;
			if (tok->start == NULL) {
				if (tok->buf == NULL) {
					tok->buf = (char *)
						PyMem_MALLOC(BUFSIZ);
					if (tok->buf == NULL) {
						tok->done = E_NOMEM;
						return EOF;
					}
					tok->end = tok->buf + BUFSIZ;
				}
				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
					  tok) == NULL) {
					tok->done = E_EOF;
					done = 1;
				}
				else {
					tok->done = E_OK;
					tok->inp = strchr(tok->buf, '\0');
					done = tok->inp[-1] == '\n';
				}
			}
			else {
				/* Token in progress: keep the buffer and
				   remember cur's offset across reallocs. */
				cur = tok->cur - tok->buf;
				if (decoding_feof(tok)) {
					tok->done = E_EOF;
					done = 1;
				}
				else
					tok->done = E_OK;
			}
			tok->lineno++;
			/* Read until '\n' or EOF */
			while (!done) {
				Py_ssize_t curstart = tok->start == NULL ? -1 :
					          tok->start - tok->buf;
				Py_ssize_t curvalid = tok->inp - tok->buf;
				Py_ssize_t newsize = curvalid + BUFSIZ;
				char *newbuf = tok->buf;
				newbuf = (char *)PyMem_REALLOC(newbuf,
							       newsize);
				if (newbuf == NULL) {
					tok->done = E_NOMEM;
					tok->cur = tok->inp;
					return EOF;
				}
				tok->buf = newbuf;
				tok->inp = tok->buf + curvalid;
				tok->end = tok->buf + newsize;
				tok->start = curstart < 0 ? NULL :
					     tok->buf + curstart;
				if (decoding_fgets(tok->inp,
					       (int)(tok->end - tok->inp),
					       tok) == NULL) {
					/* Break out early on decoding
					   errors, as tok->buf will be NULL
					 */
					if (tok->decoding_erred)
						return EOF;
					/* Last line does not end in \n,
					   fake one */
					strcpy(tok->inp, "\n");
				}
				tok->inp = strchr(tok->inp, '\0');
				done = tok->inp[-1] == '\n';
			}
			if (tok->buf != NULL) {
				tok->cur = tok->buf + cur;
				tok->line_start = tok->cur;
				/* replace "\r\n" with "\n" */
				/* For Mac leave the \r, giving a syntax error */
				pt = tok->inp - 2;
				if (pt >= tok->buf && *pt == '\r') {
					*pt++ = '\n';
					*pt = '\0';
					tok->inp = pt;
				}
			}
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				PySys_WriteStderr("\n");
			tok->cur = tok->inp;
			return EOF;
		}
	}
	/*NOTREACHED*/
}
/* Back-up one character.  C must be the character just returned by
   tok_nextc (EOF is a no-op); it is written back only if the buffer
   byte differs. */

static void
tok_backup(register struct tok_state *tok, register int c)
{
	if (c != EOF) {
		if (--tok->cur < tok->buf)
			Py_FatalError("tok_backup: begin of buffer");
		if (*tok->cur != c)
			*tok->cur = c;
	}
}
/* Return the token corresponding to a single character;
   OP for any character that is not a recognized operator. */

int
PyToken_OneChar(int c)
{
	switch (c) {
	case '(':	return LPAR;
	case ')':	return RPAR;
	case '[':	return LSQB;
	case ']':	return RSQB;
	case ':':	return COLON;
	case ',':	return COMMA;
	case ';':	return SEMI;
	case '+':	return PLUS;
	case '-':	return MINUS;
	case '*':	return STAR;
	case '/':	return SLASH;
	case '|':	return VBAR;
	case '&':	return AMPER;
	case '<':	return LESS;
	case '>':	return GREATER;
	case '=':	return EQUAL;
	case '.':	return DOT;
	case '%':	return PERCENT;
	case '`':	return BACKQUOTE;
	case '{':	return LBRACE;
	case '}':	return RBRACE;
	case '^':	return CIRCUMFLEX;
	case '~':	return TILDE;
	case '@':	return AT;
	default:	return OP;
	}
}
/* Return the token for the two-character operator C1 C2, or OP when
   the pair is not an operator (the caller falls back to one-char). */

int
PyToken_TwoChars(int c1, int c2)
{
	switch (c1) {
	case '=':
		switch (c2) {
		case '=':	return EQEQUAL;
		}
		break;
	case '!':
		switch (c2) {
		case '=':	return NOTEQUAL;
		}
		break;
	case '<':
		switch (c2) {
		case '>':	return NOTEQUAL;
		case '=':	return LESSEQUAL;
		case '<':	return LEFTSHIFT;
		}
		break;
	case '>':
		switch (c2) {
		case '=':	return GREATEREQUAL;
		case '>':	return RIGHTSHIFT;
		}
		break;
	case '+':
		switch (c2) {
		case '=':	return PLUSEQUAL;
		}
		break;
	case '-':
		switch (c2) {
		case '=':	return MINEQUAL;
		}
		break;
	case '*':
		switch (c2) {
		case '*':	return DOUBLESTAR;
		case '=':	return STAREQUAL;
		}
		break;
	case '/':
		switch (c2) {
		case '/':	return DOUBLESLASH;
		case '=':	return SLASHEQUAL;
		}
		break;
	case '|':
		switch (c2) {
		case '=':	return VBAREQUAL;
		}
		break;
	case '%':
		switch (c2) {
		case '=':	return PERCENTEQUAL;
		}
		break;
	case '&':
		switch (c2) {
		case '=':	return AMPEREQUAL;
		}
		break;
	case '^':
		switch (c2) {
		case '=':	return CIRCUMFLEXEQUAL;
		}
		break;
	}
	return OP;
}
/* Return the token for the three-character operator C1 C2 C3, or OP
   when the triple is not an operator. */

int
PyToken_ThreeChars(int c1, int c2, int c3)
{
	switch (c1) {
	case '<':
		switch (c2) {
		case '<':
			switch (c3) {
			case '=':
				return LEFTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '>':
		switch (c2) {
		case '>':
			switch (c3) {
			case '=':
				return RIGHTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '*':
		switch (c2) {
		case '*':
			switch (c3) {
			case '=':
				return DOUBLESTAREQUAL;
			}
			break;
		}
		break;
	case '/':
		switch (c2) {
		case '/':
			switch (c3) {
			case '=':
				return DOUBLESLASHEQUAL;
			}
			break;
		}
		break;
	}
	return OP;
}
/* Report inconsistent tab/space indentation.  With -tt (alterror set)
   it is a hard error: set E_TABSPACE and return 1 so the caller emits
   ERRORTOKEN.  With -t (altwarning set) print a warning once per file
   and return 0 to continue tokenizing. */
static int
indenterror(struct tok_state *tok)
{
	if (tok->alterror) {
		tok->done = E_TABSPACE;
		tok->cur = tok->inp;
		return 1;
	}
	if (tok->altwarning) {
		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                                  "in indentation\n", tok->filename);
		tok->altwarning = 0;
	}
	return 0;
}
1108 /* Get next token, after space stripping etc. */
1110 static int
1111 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1113 register int c;
1114 int blankline;
1116 *p_start = *p_end = NULL;
1117 nextline:
1118 tok->start = NULL;
1119 blankline = 0;
1121 /* Get indentation level */
1122 if (tok->atbol) {
1123 register int col = 0;
1124 register int altcol = 0;
1125 tok->atbol = 0;
1126 for (;;) {
1127 c = tok_nextc(tok);
1128 if (c == ' ')
1129 col++, altcol++;
1130 else if (c == '\t') {
1131 col = (col/tok->tabsize + 1) * tok->tabsize;
1132 altcol = (altcol/tok->alttabsize + 1)
1133 * tok->alttabsize;
1135 else if (c == '\014') /* Control-L (formfeed) */
1136 col = altcol = 0; /* For Emacs users */
1137 else
1138 break;
1140 tok_backup(tok, c);
1141 if (c == '#' || c == '\n') {
1142 /* Lines with only whitespace and/or comments
1143 shouldn't affect the indentation and are
1144 not passed to the parser as NEWLINE tokens,
1145 except *totally* empty lines in interactive
1146 mode, which signal the end of a command group. */
1147 if (col == 0 && c == '\n' && tok->prompt != NULL)
1148 blankline = 0; /* Let it through */
1149 else
1150 blankline = 1; /* Ignore completely */
1151 /* We can't jump back right here since we still
1152 may need to skip to the end of a comment */
1154 if (!blankline && tok->level == 0) {
1155 if (col == tok->indstack[tok->indent]) {
1156 /* No change */
1157 if (altcol != tok->altindstack[tok->indent]) {
1158 if (indenterror(tok))
1159 return ERRORTOKEN;
1162 else if (col > tok->indstack[tok->indent]) {
1163 /* Indent -- always one */
1164 if (tok->indent+1 >= MAXINDENT) {
1165 tok->done = E_TOODEEP;
1166 tok->cur = tok->inp;
1167 return ERRORTOKEN;
1169 if (altcol <= tok->altindstack[tok->indent]) {
1170 if (indenterror(tok))
1171 return ERRORTOKEN;
1173 tok->pendin++;
1174 tok->indstack[++tok->indent] = col;
1175 tok->altindstack[tok->indent] = altcol;
1177 else /* col < tok->indstack[tok->indent] */ {
1178 /* Dedent -- any number, must be consistent */
1179 while (tok->indent > 0 &&
1180 col < tok->indstack[tok->indent]) {
1181 tok->pendin--;
1182 tok->indent--;
1184 if (col != tok->indstack[tok->indent]) {
1185 tok->done = E_DEDENT;
1186 tok->cur = tok->inp;
1187 return ERRORTOKEN;
1189 if (altcol != tok->altindstack[tok->indent]) {
1190 if (indenterror(tok))
1191 return ERRORTOKEN;
1197 tok->start = tok->cur;
1199 /* Return pending indents/dedents */
1200 if (tok->pendin != 0) {
1201 if (tok->pendin < 0) {
1202 tok->pendin++;
1203 return DEDENT;
1205 else {
1206 tok->pendin--;
1207 return INDENT;
1211 again:
1212 tok->start = NULL;
1213 /* Skip spaces */
1214 do {
1215 c = tok_nextc(tok);
1216 } while (c == ' ' || c == '\t' || c == '\014');
1218 /* Set start of current token */
1219 tok->start = tok->cur - 1;
1221 /* Skip comment, while looking for tab-setting magic */
1222 if (c == '#') {
1223 static char *tabforms[] = {
1224 "tab-width:", /* Emacs */
1225 ":tabstop=", /* vim, full form */
1226 ":ts=", /* vim, abbreviated form */
1227 "set tabsize=", /* will vi never die? */
1228 /* more templates can be added here to support other editors */
1230 char cbuf[80];
1231 char *tp, **cp;
1232 tp = cbuf;
1233 do {
1234 *tp++ = c = tok_nextc(tok);
1235 } while (c != EOF && c != '\n' &&
1236 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1237 *tp = '\0';
1238 for (cp = tabforms;
1239 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1240 cp++) {
1241 if ((tp = strstr(cbuf, *cp))) {
1242 int newsize = atoi(tp + strlen(*cp));
1244 if (newsize >= 1 && newsize <= 40) {
1245 tok->tabsize = newsize;
1246 if (Py_VerboseFlag)
1247 PySys_WriteStderr(
1248 "Tab size set to %d\n",
1249 newsize);
1253 while (c != EOF && c != '\n')
1254 c = tok_nextc(tok);
1257 /* Check for EOF and errors now */
1258 if (c == EOF) {
1259 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1262 /* Identifier (most frequent token!) */
1263 if (isalpha(c) || c == '_') {
1264 /* Process r"", u"" and ur"" */
1265 switch (c) {
1266 case 'r':
1267 case 'R':
1268 c = tok_nextc(tok);
1269 if (c == '"' || c == '\'')
1270 goto letter_quote;
1271 break;
1272 case 'u':
1273 case 'U':
1274 c = tok_nextc(tok);
1275 if (c == 'r' || c == 'R')
1276 c = tok_nextc(tok);
1277 if (c == '"' || c == '\'')
1278 goto letter_quote;
1279 break;
1281 while (isalnum(c) || c == '_') {
1282 c = tok_nextc(tok);
1284 tok_backup(tok, c);
1285 *p_start = tok->start;
1286 *p_end = tok->cur;
1287 return NAME;
1290 /* Newline */
1291 if (c == '\n') {
1292 tok->atbol = 1;
1293 if (blankline || tok->level > 0)
1294 goto nextline;
1295 *p_start = tok->start;
1296 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1297 tok->cont_line = 0;
1298 return NEWLINE;
1301 /* Period or number starting with period? */
1302 if (c == '.') {
1303 c = tok_nextc(tok);
1304 if (isdigit(c)) {
1305 goto fraction;
1307 else {
1308 tok_backup(tok, c);
1309 *p_start = tok->start;
1310 *p_end = tok->cur;
1311 return DOT;
1315 /* Number */
1316 if (isdigit(c)) {
1317 if (c == '0') {
1318 /* Hex or octal -- maybe. */
1319 c = tok_nextc(tok);
1320 if (c == '.')
1321 goto fraction;
1322 #ifndef WITHOUT_COMPLEX
1323 if (c == 'j' || c == 'J')
1324 goto imaginary;
1325 #endif
1326 if (c == 'x' || c == 'X') {
1327 /* Hex */
1328 do {
1329 c = tok_nextc(tok);
1330 } while (isxdigit(c));
1332 else {
1333 int found_decimal = 0;
1334 /* Octal; c is first char of it */
1335 /* There's no 'isoctdigit' macro, sigh */
1336 while ('0' <= c && c < '8') {
1337 c = tok_nextc(tok);
1339 if (isdigit(c)) {
1340 found_decimal = 1;
1341 do {
1342 c = tok_nextc(tok);
1343 } while (isdigit(c));
1345 if (c == '.')
1346 goto fraction;
1347 else if (c == 'e' || c == 'E')
1348 goto exponent;
1349 #ifndef WITHOUT_COMPLEX
1350 else if (c == 'j' || c == 'J')
1351 goto imaginary;
1352 #endif
1353 else if (found_decimal) {
1354 tok->done = E_TOKEN;
1355 tok_backup(tok, c);
1356 return ERRORTOKEN;
1359 if (c == 'l' || c == 'L')
1360 c = tok_nextc(tok);
1362 else {
1363 /* Decimal */
1364 do {
1365 c = tok_nextc(tok);
1366 } while (isdigit(c));
1367 if (c == 'l' || c == 'L')
1368 c = tok_nextc(tok);
1369 else {
1370 /* Accept floating point numbers. */
1371 if (c == '.') {
1372 fraction:
1373 /* Fraction */
1374 do {
1375 c = tok_nextc(tok);
1376 } while (isdigit(c));
1378 if (c == 'e' || c == 'E') {
1379 exponent:
1380 /* Exponent part */
1381 c = tok_nextc(tok);
1382 if (c == '+' || c == '-')
1383 c = tok_nextc(tok);
1384 if (!isdigit(c)) {
1385 tok->done = E_TOKEN;
1386 tok_backup(tok, c);
1387 return ERRORTOKEN;
1389 do {
1390 c = tok_nextc(tok);
1391 } while (isdigit(c));
1393 #ifndef WITHOUT_COMPLEX
1394 if (c == 'j' || c == 'J')
1395 /* Imaginary part */
1396 imaginary:
1397 c = tok_nextc(tok);
1398 #endif
1401 tok_backup(tok, c);
1402 *p_start = tok->start;
1403 *p_end = tok->cur;
1404 return NUMBER;
1407 letter_quote:
1408 /* String */
1409 if (c == '\'' || c == '"') {
1410 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1411 int quote = c;
1412 int triple = 0;
1413 int tripcount = 0;
1414 for (;;) {
1415 c = tok_nextc(tok);
1416 if (c == '\n') {
1417 if (!triple) {
1418 tok->done = E_EOLS;
1419 tok_backup(tok, c);
1420 return ERRORTOKEN;
1422 tripcount = 0;
1423 tok->cont_line = 1; /* multiline string. */
1425 else if (c == EOF) {
1426 if (triple)
1427 tok->done = E_EOFS;
1428 else
1429 tok->done = E_EOLS;
1430 tok->cur = tok->inp;
1431 return ERRORTOKEN;
1433 else if (c == quote) {
1434 tripcount++;
1435 if (tok->cur - tok->start == quote2) {
1436 c = tok_nextc(tok);
1437 if (c == quote) {
1438 triple = 1;
1439 tripcount = 0;
1440 continue;
1442 tok_backup(tok, c);
1444 if (!triple || tripcount == 3)
1445 break;
1447 else if (c == '\\') {
1448 tripcount = 0;
1449 c = tok_nextc(tok);
1450 if (c == EOF) {
1451 tok->done = E_EOLS;
1452 tok->cur = tok->inp;
1453 return ERRORTOKEN;
1456 else
1457 tripcount = 0;
1459 *p_start = tok->start;
1460 *p_end = tok->cur;
1461 return STRING;
1464 /* Line continuation */
1465 if (c == '\\') {
1466 c = tok_nextc(tok);
1467 if (c != '\n') {
1468 tok->done = E_LINECONT;
1469 tok->cur = tok->inp;
1470 return ERRORTOKEN;
1472 tok->cont_line = 1;
1473 goto again; /* Read next line */
1476 /* Check for two-character token */
1478 int c2 = tok_nextc(tok);
1479 int token = PyToken_TwoChars(c, c2);
1480 #ifndef PGEN
1481 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1482 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1483 "<> not supported in 3.x",
1484 tok->filename, tok->lineno,
1485 NULL, NULL)) {
1486 return ERRORTOKEN;
1489 #endif
1490 if (token != OP) {
1491 int c3 = tok_nextc(tok);
1492 int token3 = PyToken_ThreeChars(c, c2, c3);
1493 if (token3 != OP) {
1494 token = token3;
1495 } else {
1496 tok_backup(tok, c3);
1498 *p_start = tok->start;
1499 *p_end = tok->cur;
1500 return token;
1502 tok_backup(tok, c2);
1505 /* Keep track of parentheses nesting level */
1506 switch (c) {
1507 case '(':
1508 case '[':
1509 case '{':
1510 tok->level++;
1511 break;
1512 case ')':
1513 case ']':
1514 case '}':
1515 tok->level--;
1516 break;
1519 /* Punctuation character */
1520 *p_start = tok->start;
1521 *p_end = tok->cur;
1522 return PyToken_OneChar(c);
1526 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1528 int result = tok_get(tok, p_start, p_end);
1529 if (tok->decoding_erred) {
1530 result = ERRORTOKEN;
1531 tok->done = E_DECODE;
1533 return result;
/* This function is only called from parsetok.  However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */
#ifdef PGEN
/* PGEN builds have no codec machinery; report "no original text". */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
	return NULL;
}
#else
#ifdef Py_USING_UNICODE
/* Decode LEN bytes of TEXT as UTF-8, then re-encode in ENC.
   Returns a new str reference, or NULL (with the error cleared) on
   failure.  Both steps use "replace" so malformed input cannot raise. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
	PyObject *ret = NULL;
	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
	if (unicode_text) {
		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
		Py_DECREF(unicode_text);
	}
	if (!ret) {
		PyErr_Clear();
	}
	return ret;
}

/* Re-encode the first LEN bytes of the current line (held internally as
   UTF-8) back into the source file's declared encoding, for use in error
   messages.  *OFFSET, a 1-based column in the UTF-8 text, is adjusted
   in place to the corresponding column in the re-encoded text.
   Returns a PyObject_MALLOC'ed NUL-terminated string the caller must
   free, or NULL if no encoding is set or conversion fails. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
	char *text = NULL;
	if (tok->encoding) {
		/* convert source to original encoding */
		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
		if (lineobj != NULL) {
			/* Py_ssize_t: PyString_Size may exceed int range. */
			Py_ssize_t linelen = PyString_Size(lineobj);
			const char *line = PyString_AsString(lineobj);
			/* Allocate only once we have valid bytes to copy;
			   otherwise we would hand back uninitialized,
			   unterminated memory. */
			if (line != NULL && linelen >= 0) {
				text = PyObject_MALLOC(linelen + 1);
				if (text != NULL) {
					/* memcpy, not strncpy: length is
					   known and data may be binary. */
					memcpy(text, line, linelen);
					text[linelen] = '\0';
				}
			}
			Py_DECREF(lineobj);

			/* adjust error offset */
			if (*offset > 1) {
				PyObject *offsetobj = dec_utf8(tok->encoding,
							       tok->buf, *offset-1);
				if (offsetobj) {
					*offset = (int)PyString_Size(offsetobj) + 1;
					Py_DECREF(offsetobj);
				}
			}
		}
	}
	return text;
}
#endif /* defined(Py_USING_UNICODE) */
#endif
#ifdef Py_DEBUG

/* Debug helper: print a token's name to stdout, and for the token kinds
   that carry text (NAME, NUMBER, STRING, OP) also print the text between
   START and END in parentheses.  No trailing newline is emitted. */
void
tok_dump(int type, char *start, char *end)
{
	printf("%s", _PyParser_TokenNames[type]);
	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	default:
		break;
	}
}

#endif