Parser/tokenizer.c
/* Tokenizer implementation */

#include "Python.h"
#include "pgenheaders.h"

#include <ctype.h>
#include <assert.h>

#include "tokenizer.h"
#include "errcode.h"

#ifndef PGEN
#include "unicodeobject.h"
#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
#endif /* PGEN */

extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)	(c)
#else
#define Py_CHARMASK(c)	((c) & 0xff)
#endif

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names */

char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};


/* Create and initialize a new tok_state structure */

static struct tok_state *
tok_new(void)
{
	struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
						sizeof(struct tok_state));
	if (tok == NULL)
		return NULL;
	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
	tok->done = E_OK;
	tok->fp = NULL;
	tok->tabsize = TABSIZE;
	tok->indent = 0;
	tok->indstack[0] = 0;
	tok->atbol = 1;
	tok->pendin = 0;
	tok->prompt = tok->nextprompt = NULL;
	tok->lineno = 0;
	tok->level = 0;
	tok->filename = NULL;
	tok->altwarning = 0;
	tok->alterror = 0;
	tok->alttabsize = 1;
	tok->altindstack[0] = 0;
	tok->decoding_state = 0;
	tok->decoding_erred = 0;
	tok->read_coding_spec = 0;
	tok->encoding = NULL;
	tok->cont_line = 0;
#ifndef PGEN
	tok->decoding_readline = NULL;
	tok->decoding_buffer = NULL;
#endif
	return tok;
}

#ifdef PGEN

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	return fgets(s, size, tok->fp);
}

static int
decoding_feof(struct tok_state *tok)
{
	return feof(tok->fp);
}

static const char *
decode_str(const char *str, struct tok_state *tok)
{
	return str;
}

#else /* PGEN */

static char *
error_ret(struct tok_state *tok) /* XXX */
{
	tok->decoding_erred = 1;
	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
		PyMem_FREE(tok->buf);
	tok->buf = NULL;
	return NULL;		/* as if it were EOF */
}

static char *
new_string(const char *s, Py_ssize_t len)
{
	char* result = (char *)PyMem_MALLOC(len + 1);
	if (result != NULL) {
		memcpy(result, s, len);
		result[len] = '\0';
	}
	return result;
}

static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0') break;
		else if (c == '_') buf[i] = '-';
		else buf[i] = tolower(c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
	else return s;
}

/* Return the coding spec in S, or NULL if none is found.  */
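/* A typical declaration is the Emacs/PEP 263 form

       # -*- coding: iso-8859-1 -*-

   but any comment containing "coding" immediately followed by ':' or
   '=' and an encoding name made of alphanumerics, '-', '_' and '.'
   is accepted by the scan below. */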
static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
	Py_ssize_t i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			if (t[0] != ':' && t[0] != '=')
				continue;
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			begin = t;
			while (isalnum(Py_CHARMASK(t[0])) ||
			       t[0] == '-' || t[0] == '_' || t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q = get_normal_name(r);
				if (r != q) {
					PyMem_FREE(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}

/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.  */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
		  int set_readline(struct tok_state *, const char *))
{
	char * cs;
	int r = 1;

	if (tok->cont_line)
		/* It's a continuation line, so it can't be a coding spec. */
		return 1;
	cs = get_coding_spec(line, size);
	if (cs != NULL) {
		tok->read_coding_spec = 1;
		if (tok->encoding == NULL) {
			assert(tok->decoding_state == 1); /* raw */
			if (strcmp(cs, "utf-8") == 0 ||
			    strcmp(cs, "iso-8859-1") == 0) {
				tok->encoding = cs;
			} else {
#ifdef Py_USING_UNICODE
				r = set_readline(tok, cs);
				if (r) {
					tok->encoding = cs;
					tok->decoding_state = -1;
				}
				else
					PyMem_FREE(cs);
#else
				/* Without Unicode support, we cannot
				   process the coding spec. Since there
				   won't be any Unicode literals, that
				   won't matter. */
				PyMem_FREE(cs);
#endif
			}
		} else {	/* then, compare cs with BOM */
			r = (strcmp(tok->encoding, cs) == 0);
			PyMem_FREE(cs);
		}
	}
	if (!r) {
		cs = tok->encoding;
		if (!cs)
			cs = "with BOM";
		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
	}
	return r;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */
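/* The UTF-8 BOM is the byte sequence 0xEF 0xBB 0xBF; the UTF-16 BOMs
   (0xFE 0xFF and 0xFF 0xFE) are recognized below but their handling
   is currently compiled out with "#if 0". */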
static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	tok->decoding_state = 1;
	if (ch == EOF) {
		return 1;
	} else if (ch == 0xEF) {
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported.  */
	} else if (ch == 0xFE) {
		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
		if (!set_readline(tok, "utf-16-be")) return 0;
		tok->decoding_state = -1;
	} else if (ch == 0xFF) {
		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
		if (!set_readline(tok, "utf-16-le")) return 0;
		tok->decoding_state = -1;
#endif
	} else {
		unget_char(ch, tok);
		return 1;
	}
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
	return 1;
  NON_BOM:
	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
	return 1;
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
	stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
	(in the s buffer) to copy entire contents of the line read
	by tok->decoding_readline.  tok->decoding_buffer has the overflow.
	In this case, fp_readl is called in a loop (with an expanded buffer)
	until the buffer ends with a '\n' (or until the end of the file is
	reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8 = NULL;
	PyObject* buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;
	}
	if (utf8 == NULL) {
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL; /* EOF */
	return s;
#endif
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
	PyObject *reader, *stream, *readline;

	/* XXX: constify filename argument. */
	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
	if (stream == NULL)
		return 0;

	reader = PyCodec_StreamReader(enc, stream, NULL);
	Py_DECREF(stream);
	if (reader == NULL)
		return 0;

	readline = PyObject_GetAttrString(reader, "readline");
	Py_DECREF(reader);
	if (readline == NULL)
		return 0;

	tok->decoding_readline = readline;
	return 1;
}

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
	return getc(tok->fp);
}

/* Unfetch the last byte back into TOK. */

static void fp_ungetc(int c, struct tok_state *tok) {
	ungetc(c, tok->fp);
}

/* Read a line of input from TOK. Determine encoding
   if necessary.  */
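/* tok->decoding_state drives the choice below:
     > 0  read raw bytes (no codec needed or not yet determined),
     < 0  read through the codec's readline (fp_readl),
    == 0  the BOM check has not run yet. */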
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	char *line = NULL;
	int badchar = 0;
	for (;;) {
		if (tok->decoding_state < 0) {
			/* We already have a codec associated with
			   this input. */
			line = fp_readl(s, size, tok);
			break;
		} else if (tok->decoding_state > 0) {
			/* We want a 'raw' read. */
			line = Py_UniversalNewlineFgets(s, size,
							tok->fp, NULL);
			break;
		} else {
			/* We have not yet determined the encoding.
			   If an encoding is found, use the file-pointer
			   reader functions from now on. */
			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				return error_ret(tok);
			assert(tok->decoding_state != 0);
		}
	}
	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
			return error_ret(tok);
		}
	}
#ifndef PGEN
	/* The default encoding is ASCII, so make sure we don't have any
	   non-ASCII bytes in it. */
	if (line && !tok->encoding) {
		unsigned char *c;
		for (c = (unsigned char *)line; *c; c++)
			if (*c > 127) {
				badchar = *c;
				break;
			}
	}
	if (badchar) {
		char buf[500];
		/* Need to add 1 to the line number, since this line
		   has not been counted, yet.  */
		sprintf(buf,
			"Non-ASCII character '\\x%.2x' "
			"in file %.200s on line %i, "
			"but no encoding declared; "
			"see http://www.python.org/peps/pep-0263.html for details",
			badchar, tok->filename, tok->lineno + 1);
		PyErr_SetString(PyExc_SyntaxError, buf);
		return error_ret(tok);
	}
#endif
	return line;
}

static int
decoding_feof(struct tok_state *tok)
{
	if (tok->decoding_state >= 0) {
		return feof(tok->fp);
	} else {
		PyObject* buf = tok->decoding_buffer;
		if (buf == NULL) {
			buf = PyObject_CallObject(tok->decoding_readline, NULL);
			if (buf == NULL) {
				error_ret(tok);
				return 1;
			} else {
				tok->decoding_buffer = buf;
			}
		}
		return PyObject_Length(buf) == 0;
	}
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
	return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
	tok->str--;
	assert(Py_CHARMASK(*tok->str) == c);	/* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
	tok->enc = enc;
	return 1;
}

/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *utf8;
	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
	if (buf == NULL)
		return NULL;
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	return utf8;
}
#endif

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */
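/* decode_str() below works in three steps: check for a BOM using the
   buf_* callbacks, scan at most the first two lines for a coding
   declaration, and, if an encoding was recorded in tok->enc, recode
   the whole buffer to UTF-8 (keeping the UTF-8 object alive through
   tok->decoding_buffer). */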
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	PyObject* utf8 = NULL;
	const char *s;
	int lineno = 0;
	tok->enc = NULL;
	tok->str = str;
	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
		return error_ret(tok);
	str = tok->str;		/* string after BOM if any */
	assert(str);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return error_ret(tok);
		str = PyString_AsString(utf8);
	}
#endif
	for (s = str;; s++) {
		if (*s == '\0') break;
		else if (*s == '\n') {
			lineno++;
			if (lineno == 2) break;
		}
	}
	tok->enc = NULL;
	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
		return error_ret(tok);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		assert(utf8 == NULL);
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL) {
			PyErr_Format(PyExc_SyntaxError,
				"unknown encoding: %s", tok->enc);
			return error_ret(tok);
		}
		str = PyString_AsString(utf8);
	}
#endif
	assert(tok->decoding_buffer == NULL);
	tok->decoding_buffer = utf8; /* CAUTION */
	return str;
}

#endif /* PGEN */

/* Set up tokenizer for string */

struct tok_state *
PyTokenizer_FromString(const char *str)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	str = (char *)decode_str(str, tok);
	if (str == NULL) {
		PyTokenizer_Free(tok);
		return NULL;
	}

	/* XXX: constify members. */
	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
	return tok;
}

/* Set up tokenizer for file */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
		PyTokenizer_Free(tok);
		return NULL;
	}
	tok->cur = tok->inp = tok->buf;
	tok->end = tok->buf + BUFSIZ;
	tok->fp = fp;
	tok->prompt = ps1;
	tok->nextprompt = ps2;
	return tok;
}

/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
#ifndef PGEN
	Py_XDECREF(tok->decoding_readline);
	Py_XDECREF(tok->decoding_buffer);
#endif
	if (tok->fp != NULL && tok->buf != NULL)
		PyMem_FREE(tok->buf);
	PyMem_FREE(tok);
}
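/* A minimal sketch of how a caller such as the parser driver
   (Parser/parsetok.c) drives this API; illustrative only, the string
   literal and the loop body here are made up:

	struct tok_state *tok = PyTokenizer_FromString("x = 1\n");
	char *start, *end;
	int type;
	if (tok != NULL) {
		while ((type = PyTokenizer_Get(tok, &start, &end)) != ENDMARKER) {
			if (type == ERRORTOKEN)
				break;
			...process the token text in [start, end)...
		}
		PyTokenizer_Free(tok);
	}
*/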
#if !defined(PGEN) && defined(Py_USING_UNICODE)
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
	PyObject *enc, *sysstdin, *decoded, *utf8;
	const char *encoding;
	char *converted;

	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
		return 0;
	sysstdin = PySys_GetObject("stdin");
	if (sysstdin == NULL || !PyFile_Check(sysstdin))
		return 0;

	enc = ((PyFileObject *)sysstdin)->f_encoding;
	if (enc == NULL || !PyString_Check(enc))
		return 0;
	Py_INCREF(enc);

	encoding = PyString_AsString(enc);
	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
	if (decoded == NULL)
		goto error_clear;

	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
	Py_DECREF(decoded);
	if (utf8 == NULL)
		goto error_clear;

	assert(PyString_Check(utf8));
	converted = new_string(PyString_AS_STRING(utf8),
			       PyString_GET_SIZE(utf8));
	Py_DECREF(utf8);
	if (converted == NULL)
		goto error_nomem;

	PyMem_FREE(*inp);
	*inp = converted;
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string(encoding, strlen(encoding));
	if (tok->encoding == NULL)
		goto error_nomem;

	Py_DECREF(enc);
	return 0;

error_nomem:
	Py_DECREF(enc);
	tok->done = E_NOMEM;
	return -1;

error_clear:
	/* Fallback to iso-8859-1: for backward compatibility */
	Py_DECREF(enc);
	PyErr_Clear();
	return 0;
}
#endif

/* Get next char, updating state; error code goes into tok->done */
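/* Three input sources are handled below: an in-memory string
   (tok->fp == NULL), an interactive prompt (tok->prompt != NULL,
   read via PyOS_Readline), and an ordinary file (read via
   decoding_fgets, growing tok->buf until a whole line is in). */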
static int
tok_nextc(register struct tok_state *tok)
{
	for (;;) {
		if (tok->cur != tok->inp) {
			return Py_CHARMASK(*tok->cur++); /* Fast path */
		}
		if (tok->done != E_OK)
			return EOF;
		if (tok->fp == NULL) {
			char *end = strchr(tok->inp, '\n');
			if (end != NULL)
				end++;
			else {
				end = strchr(tok->inp, '\0');
				if (end == tok->inp) {
					tok->done = E_EOF;
					return EOF;
				}
			}
			if (tok->start == NULL)
				tok->buf = tok->cur;
			tok->line_start = tok->cur;
			tok->lineno++;
			tok->inp = end;
			return Py_CHARMASK(*tok->cur++);
		}
		if (tok->prompt != NULL) {
			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
			if (tok->nextprompt != NULL)
				tok->prompt = tok->nextprompt;
			if (newtok == NULL)
				tok->done = E_INTR;
			else if (*newtok == '\0') {
				PyMem_FREE(newtok);
				tok->done = E_EOF;
			}
#if !defined(PGEN) && defined(Py_USING_UNICODE)
			else if (tok_stdin_decode(tok, &newtok) != 0)
				PyMem_FREE(newtok);
#endif
			else if (tok->start != NULL) {
				size_t start = tok->start - tok->buf;
				size_t oldlen = tok->cur - tok->buf;
				size_t newlen = oldlen + strlen(newtok);
				char *buf = tok->buf;
				buf = (char *)PyMem_REALLOC(buf, newlen+1);
				tok->lineno++;
				if (buf == NULL) {
					PyMem_FREE(tok->buf);
					tok->buf = NULL;
					PyMem_FREE(newtok);
					tok->done = E_NOMEM;
					return EOF;
				}
				tok->buf = buf;
				tok->cur = tok->buf + oldlen;
				tok->line_start = tok->cur;
				strcpy(tok->buf + oldlen, newtok);
				PyMem_FREE(newtok);
				tok->inp = tok->buf + newlen;
				tok->end = tok->inp + 1;
				tok->start = tok->buf + start;
			}
			else {
				tok->lineno++;
				if (tok->buf != NULL)
					PyMem_FREE(tok->buf);
				tok->buf = newtok;
				tok->line_start = tok->buf;
				tok->cur = tok->buf;
				tok->line_start = tok->buf;
				tok->inp = strchr(tok->buf, '\0');
				tok->end = tok->inp + 1;
			}
		}
		else {
			int done = 0;
			Py_ssize_t cur = 0;
			char *pt;
			if (tok->start == NULL) {
				if (tok->buf == NULL) {
					tok->buf = (char *)
						PyMem_MALLOC(BUFSIZ);
					if (tok->buf == NULL) {
						tok->done = E_NOMEM;
						return EOF;
					}
					tok->end = tok->buf + BUFSIZ;
				}
				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
					  tok) == NULL) {
					tok->done = E_EOF;
					done = 1;
				}
				else {
					tok->done = E_OK;
					tok->inp = strchr(tok->buf, '\0');
					done = tok->inp[-1] == '\n';
				}
			}
			else {
				cur = tok->cur - tok->buf;
				if (decoding_feof(tok)) {
					tok->done = E_EOF;
					done = 1;
				}
				else
					tok->done = E_OK;
			}
			tok->lineno++;
			/* Read until '\n' or EOF */
			while (!done) {
				Py_ssize_t curstart = tok->start == NULL ? -1 :
					  tok->start - tok->buf;
				Py_ssize_t curvalid = tok->inp - tok->buf;
				Py_ssize_t newsize = curvalid + BUFSIZ;
				char *newbuf = tok->buf;
				newbuf = (char *)PyMem_REALLOC(newbuf,
							       newsize);
				if (newbuf == NULL) {
					tok->done = E_NOMEM;
					tok->cur = tok->inp;
					return EOF;
				}
				tok->buf = newbuf;
				tok->inp = tok->buf + curvalid;
				tok->end = tok->buf + newsize;
				tok->start = curstart < 0 ? NULL :
					     tok->buf + curstart;
				if (decoding_fgets(tok->inp,
					       (int)(tok->end - tok->inp),
					       tok) == NULL) {
					/* Break out early on decoding
					   errors, as tok->buf will be NULL. */
					if (tok->decoding_erred)
						return EOF;
					/* Last line does not end in \n,
					   fake one */
					strcpy(tok->inp, "\n");
				}
				tok->inp = strchr(tok->inp, '\0');
				done = tok->inp[-1] == '\n';
			}
			if (tok->buf != NULL) {
				tok->cur = tok->buf + cur;
				tok->line_start = tok->cur;
				/* replace "\r\n" with "\n" */
				/* For Mac leave the \r, giving a syntax error */
				pt = tok->inp - 2;
				if (pt >= tok->buf && *pt == '\r') {
					*pt++ = '\n';
					*pt = '\0';
					tok->inp = pt;
				}
			}
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				PySys_WriteStderr("\n");
			tok->cur = tok->inp;
			return EOF;
		}
	}
	/*NOTREACHED*/
}


/* Back-up one character */

static void
tok_backup(register struct tok_state *tok, register int c)
{
	if (c != EOF) {
		if (--tok->cur < tok->buf)
			Py_FatalError("tok_backup: begin of buffer");
		if (*tok->cur != c)
			*tok->cur = c;
	}
}


/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c)
{
	switch (c) {
	case '(':	return LPAR;
	case ')':	return RPAR;
	case '[':	return LSQB;
	case ']':	return RSQB;
	case ':':	return COLON;
	case ',':	return COMMA;
	case ';':	return SEMI;
	case '+':	return PLUS;
	case '-':	return MINUS;
	case '*':	return STAR;
	case '/':	return SLASH;
	case '|':	return VBAR;
	case '&':	return AMPER;
	case '<':	return LESS;
	case '>':	return GREATER;
	case '=':	return EQUAL;
	case '.':	return DOT;
	case '%':	return PERCENT;
	case '`':	return BACKQUOTE;
	case '{':	return LBRACE;
	case '}':	return RBRACE;
	case '^':	return CIRCUMFLEX;
	case '~':	return TILDE;
	case '@':	return AT;
	default:	return OP;
	}
}


int
PyToken_TwoChars(int c1, int c2)
{
	switch (c1) {
	case '=':
		switch (c2) {
		case '=':	return EQEQUAL;
		}
		break;
	case '!':
		switch (c2) {
		case '=':	return NOTEQUAL;
		}
		break;
	case '<':
		switch (c2) {
		case '>':	return NOTEQUAL;
		case '=':	return LESSEQUAL;
		case '<':	return LEFTSHIFT;
		}
		break;
	case '>':
		switch (c2) {
		case '=':	return GREATEREQUAL;
		case '>':	return RIGHTSHIFT;
		}
		break;
	case '+':
		switch (c2) {
		case '=':	return PLUSEQUAL;
		}
		break;
	case '-':
		switch (c2) {
		case '=':	return MINEQUAL;
		}
		break;
	case '*':
		switch (c2) {
		case '*':	return DOUBLESTAR;
		case '=':	return STAREQUAL;
		}
		break;
	case '/':
		switch (c2) {
		case '/':	return DOUBLESLASH;
		case '=':	return SLASHEQUAL;
		}
		break;
	case '|':
		switch (c2) {
		case '=':	return VBAREQUAL;
		}
		break;
	case '%':
		switch (c2) {
		case '=':	return PERCENTEQUAL;
		}
		break;
	case '&':
		switch (c2) {
		case '=':	return AMPEREQUAL;
		}
		break;
	case '^':
		switch (c2) {
		case '=':	return CIRCUMFLEXEQUAL;
		}
		break;
	}
	return OP;
}


int
PyToken_ThreeChars(int c1, int c2, int c3)
{
	switch (c1) {
	case '<':
		switch (c2) {
		case '<':
			switch (c3) {
			case '=':
				return LEFTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '>':
		switch (c2) {
		case '>':
			switch (c3) {
			case '=':
				return RIGHTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '*':
		switch (c2) {
		case '*':
			switch (c3) {
			case '=':
				return DOUBLESTAREQUAL;
			}
			break;
		}
		break;
	case '/':
		switch (c2) {
		case '/':
			switch (c3) {
			case '=':
				return DOUBLESLASHEQUAL;
			}
			break;
		}
		break;
	}
	return OP;
}


static int
indenterror(struct tok_state *tok)
{
	if (tok->alterror) {
		tok->done = E_TABSPACE;
		tok->cur = tok->inp;
		return 1;
	}
	if (tok->altwarning) {
		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
				  "in indentation\n", tok->filename);
		tok->altwarning = 0;
	}
	return 0;
}


/* Get next token, after space stripping etc. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
	register int c;
	int blankline;

	*p_start = *p_end = NULL;
  nextline:
	tok->start = NULL;
	blankline = 0;

	/* Get indentation level */
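	/* 'col' measures the indentation with the configurable tab size
	   (tok->tabsize, default 8); 'altcol' measures it again with
	   tok->alttabsize (1).  If the two measurements disagree with the
	   recorded indent stacks, the indentation depends on the tab size
	   and indenterror() warns or raises E_TABSPACE. */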
	if (tok->atbol) {
		register int col = 0;
		register int altcol = 0;
		tok->atbol = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == ' ')
				col++, altcol++;
			else if (c == '\t') {
				col = (col/tok->tabsize + 1) * tok->tabsize;
				altcol = (altcol/tok->alttabsize + 1)
					* tok->alttabsize;
			}
			else if (c == '\014') /* Control-L (formfeed) */
				col = altcol = 0; /* For Emacs users */
			else
				break;
		}
		tok_backup(tok, c);
		if (c == '#' || c == '\n') {
			/* Lines with only whitespace and/or comments
			   shouldn't affect the indentation and are
			   not passed to the parser as NEWLINE tokens,
			   except *totally* empty lines in interactive
			   mode, which signal the end of a command group. */
			if (col == 0 && c == '\n' && tok->prompt != NULL)
				blankline = 0; /* Let it through */
			else
				blankline = 1; /* Ignore completely */
			/* We can't jump back right here since we still
			   may need to skip to the end of a comment */
		}
		if (!blankline && tok->level == 0) {
			if (col == tok->indstack[tok->indent]) {
				/* No change */
				if (altcol != tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
			}
			else if (col > tok->indstack[tok->indent]) {
				/* Indent -- always one */
				if (tok->indent+1 >= MAXINDENT) {
					tok->done = E_TOODEEP;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
				if (altcol <= tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
				tok->pendin++;
				tok->indstack[++tok->indent] = col;
				tok->altindstack[tok->indent] = altcol;
			}
			else /* col < tok->indstack[tok->indent] */ {
				/* Dedent -- any number, must be consistent */
				while (tok->indent > 0 &&
					col < tok->indstack[tok->indent]) {
					tok->pendin--;
					tok->indent--;
				}
				if (col != tok->indstack[tok->indent]) {
					tok->done = E_DEDENT;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
				if (altcol != tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
			}
		}
	}

	tok->start = tok->cur;

	/* Return pending indents/dedents */
	if (tok->pendin != 0) {
		if (tok->pendin < 0) {
			tok->pendin++;
			return DEDENT;
		}
		else {
			tok->pendin--;
			return INDENT;
		}
	}

 again:
	tok->start = NULL;
	/* Skip spaces */
	do {
		c = tok_nextc(tok);
	} while (c == ' ' || c == '\t' || c == '\014');

	/* Set start of current token */
	tok->start = tok->cur - 1;

	/* Skip comment, while looking for tab-setting magic */
	if (c == '#') {
		static char *tabforms[] = {
			"tab-width:",		/* Emacs */
			":tabstop=",		/* vim, full form */
			":ts=",			/* vim, abbreviated form */
			"set tabsize=",		/* will vi never die? */
		/* more templates can be added here to support other editors */
		};
		char cbuf[80];
		char *tp, **cp;
		tp = cbuf;
		do {
			*tp++ = c = tok_nextc(tok);
		} while (c != EOF && c != '\n' &&
			 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
		*tp = '\0';
		for (cp = tabforms;
		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
		     cp++) {
			if ((tp = strstr(cbuf, *cp))) {
				int newsize = atoi(tp + strlen(*cp));

				if (newsize >= 1 && newsize <= 40) {
					tok->tabsize = newsize;
					if (Py_VerboseFlag)
					    PySys_WriteStderr(
						"Tab size set to %d\n",
						newsize);
				}
			}
		}
		while (c != EOF && c != '\n')
			c = tok_nextc(tok);
	}

	/* Check for EOF and errors now */
	if (c == EOF) {
		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
	}

	/* Identifier (most frequent token!) */
	if (isalpha(c) || c == '_') {
		/* Process r"", u"" and ur"" */
		switch (c) {
		case 'r':
		case 'R':
			c = tok_nextc(tok);
			if (c == '"' || c == '\'')
				goto letter_quote;
			break;
		case 'u':
		case 'U':
			c = tok_nextc(tok);
			if (c == 'r' || c == 'R')
				c = tok_nextc(tok);
			if (c == '"' || c == '\'')
				goto letter_quote;
			break;
		}
		while (isalnum(c) || c == '_') {
			c = tok_nextc(tok);
		}
		tok_backup(tok, c);
		*p_start = tok->start;
		*p_end = tok->cur;
		return NAME;
	}

	/* Newline */
	if (c == '\n') {
		tok->atbol = 1;
		if (blankline || tok->level > 0)
			goto nextline;
		*p_start = tok->start;
		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
		tok->cont_line = 0;
		return NEWLINE;
	}

	/* Period or number starting with period? */
	if (c == '.') {
		c = tok_nextc(tok);
		if (isdigit(c)) {
			goto fraction;
		}
		else {
			tok_backup(tok, c);
			*p_start = tok->start;
			*p_end = tok->cur;
			return DOT;
		}
	}

	/* Number */
	if (isdigit(c)) {
		if (c == '0') {
			/* Hex or octal -- maybe. */
			c = tok_nextc(tok);
			if (c == '.')
				goto fraction;
#ifndef WITHOUT_COMPLEX
			if (c == 'j' || c == 'J')
				goto imaginary;
#endif
			if (c == 'x' || c == 'X') {
				/* Hex */
				do {
					c = tok_nextc(tok);
				} while (isxdigit(c));
			}
			else {
				int found_decimal = 0;
				/* Octal; c is first char of it */
				/* There's no 'isoctdigit' macro, sigh */
				while ('0' <= c && c < '8') {
					c = tok_nextc(tok);
				}
				if (isdigit(c)) {
					found_decimal = 1;
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
				if (c == '.')
					goto fraction;
				else if (c == 'e' || c == 'E')
					goto exponent;
#ifndef WITHOUT_COMPLEX
				else if (c == 'j' || c == 'J')
					goto imaginary;
#endif
				else if (found_decimal) {
					tok->done = E_TOKEN;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
			}
			if (c == 'l' || c == 'L')
				c = tok_nextc(tok);
		}
		else {
			/* Decimal */
			do {
				c = tok_nextc(tok);
			} while (isdigit(c));
			if (c == 'l' || c == 'L')
				c = tok_nextc(tok);
			else {
				/* Accept floating point numbers. */
				if (c == '.') {
		fraction:
					/* Fraction */
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
				if (c == 'e' || c == 'E') {
		exponent:
					/* Exponent part */
					c = tok_nextc(tok);
					if (c == '+' || c == '-')
						c = tok_nextc(tok);
					if (!isdigit(c)) {
						tok->done = E_TOKEN;
						tok_backup(tok, c);
						return ERRORTOKEN;
					}
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
#ifndef WITHOUT_COMPLEX
				if (c == 'j' || c == 'J')
					/* Imaginary part */
		imaginary:
					c = tok_nextc(tok);
#endif
			}
		}
		tok_backup(tok, c);
		*p_start = tok->start;
		*p_end = tok->cur;
		return NUMBER;
	}

  letter_quote:
	/* String */
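	/* 'quote2' is the offset (from the start of the token) of the
	   character just past the opening quote; it is used to detect an
	   immediately repeated quote, which may start a triple-quoted
	   string.  'tripcount' counts consecutive quote characters so the
	   end of a triple-quoted string can be recognized. */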
	if (c == '\'' || c == '"') {
		Py_ssize_t quote2 = tok->cur - tok->start + 1;
		int quote = c;
		int triple = 0;
		int tripcount = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == '\n') {
				if (!triple) {
					tok->done = E_EOLS;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
				tripcount = 0;
				tok->cont_line = 1; /* multiline string. */
			}
			else if (c == EOF) {
				if (triple)
					tok->done = E_EOFS;
				else
					tok->done = E_EOLS;
				tok->cur = tok->inp;
				return ERRORTOKEN;
			}
			else if (c == quote) {
				tripcount++;
				if (tok->cur - tok->start == quote2) {
					c = tok_nextc(tok);
					if (c == quote) {
						triple = 1;
						tripcount = 0;
						continue;
					}
					tok_backup(tok, c);
				}
				if (!triple || tripcount == 3)
					break;
			}
			else if (c == '\\') {
				tripcount = 0;
				c = tok_nextc(tok);
				if (c == EOF) {
					tok->done = E_EOLS;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
			}
			else
				tripcount = 0;
		}
		*p_start = tok->start;
		*p_end = tok->cur;
		return STRING;
	}

	/* Line continuation */
	if (c == '\\') {
		c = tok_nextc(tok);
		if (c != '\n') {
			tok->done = E_LINECONT;
			tok->cur = tok->inp;
			return ERRORTOKEN;
		}
		tok->cont_line = 1;
		goto again; /* Read next line */
	}

	/* Check for two-character token */
	{
		int c2 = tok_nextc(tok);
		int token = PyToken_TwoChars(c, c2);
		if (token != OP) {
			int c3 = tok_nextc(tok);
			int token3 = PyToken_ThreeChars(c, c2, c3);
			if (token3 != OP) {
				token = token3;
			} else {
				tok_backup(tok, c3);
			}
			*p_start = tok->start;
			*p_end = tok->cur;
			return token;
		}
		tok_backup(tok, c2);
	}

	/* Keep track of parentheses nesting level */
	switch (c) {
	case '(':
	case '[':
	case '{':
		tok->level++;
		break;
	case ')':
	case ']':
	case '}':
		tok->level--;
		break;
	}

	/* Punctuation character */
	*p_start = tok->start;
	*p_end = tok->cur;
	return PyToken_OneChar(c);
}

int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
	int result = tok_get(tok, p_start, p_end);
	if (tok->decoding_erred) {
		result = ERRORTOKEN;
		tok->done = E_DECODE;
	}
	return result;
}

#ifdef Py_DEBUG

void
tok_dump(int type, char *start, char *end)
{
	printf("%s", _PyParser_TokenNames[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}

#endif