Parser/tokenizer.c

   1
   2 /* Tokenizer implementation */
   3
   4 #include "Python.h"
   5 #include "pgenheaders.h"
   6
   7 #include <ctype.h>
   8 #include <assert.h>
   9
  10 #include "tokenizer.h"
  11 #include "errcode.h"
  12
  13 #ifndef PGEN
  14 #include "unicodeobject.h"
  15 #include "stringobject.h"
  16 #include "fileobject.h"
  17 #include "codecs.h"
  18 #include "abstract.h"
  19 #endif /* PGEN */
  20
  21 extern char *PyOS_Readline(FILE *, FILE *, char *);
  22 /* Return malloc'ed string including trailing \n;
  23    empty malloc'ed string for EOF;
  24    NULL if interrupted */
  25
  26 /* Don't ever change this -- it would break the portability of Python code */
  27 #define TABSIZE 8
  28
  29 /* Convert a possibly signed character to a nonnegative int */
  30 /* XXX This assumes characters are 8 bits wide */
  31 #ifdef __CHAR_UNSIGNED__
  32 #define Py_CHARMASK(c)          (c)
  33 #else
  34 #define Py_CHARMASK(c)          ((c) & 0xff)
  35 #endif
  36
  37 /* Forward */
  38 static struct tok_state *tok_new(void);
  39 static int tok_nextc(struct tok_state *tok);
  40 static void tok_backup(struct tok_state *tok, int c);
  41
  42 /* Token names */
  43
  44 char *_PyParser_TokenNames[] = {
  45         "ENDMARKER",
  46         "NAME",
  47         "NUMBER",
  48         "STRING",
  49         "NEWLINE",
  50         "INDENT",
  51         "DEDENT",
  52         "LPAR",
  53         "RPAR",
  54         "LSQB",
  55         "RSQB",
  56         "COLON",
  57         "COMMA",
  58         "SEMI",
  59         "PLUS",
  60         "MINUS",
  61         "STAR",
  62         "SLASH",
  63         "VBAR",
  64         "AMPER",
  65         "LESS",
  66         "GREATER",
  67         "EQUAL",
  68         "DOT",
  69         "PERCENT",
  70         "BACKQUOTE",
  71         "LBRACE",
  72         "RBRACE",
  73         "EQEQUAL",
  74         "NOTEQUAL",
  75         "LESSEQUAL",
  76         "GREATEREQUAL",
  77         "TILDE",
  78         "CIRCUMFLEX",
  79         "LEFTSHIFT",
  80         "RIGHTSHIFT",
  81         "DOUBLESTAR",
  82         "PLUSEQUAL",
  83         "MINEQUAL",
  84         "STAREQUAL",
  85         "SLASHEQUAL",
  86         "PERCENTEQUAL",
  87         "AMPEREQUAL",
  88         "VBAREQUAL",
  89         "CIRCUMFLEXEQUAL",
  90         "LEFTSHIFTEQUAL",
  91         "RIGHTSHIFTEQUAL",
  92         "DOUBLESTAREQUAL",
  93         "DOUBLESLASH",
  94         "DOUBLESLASHEQUAL",
  95         "AT",
  96         /* This table must match the #defines in token.h! */
  97         "OP",
  98         "<ERRORTOKEN>",
  99         "<N_TOKENS>"
 100 };
 101
 102
 103 /* Create and initialize a new tok_state structure */
 104
 105 static struct tok_state *
 106 tok_new(void)
 107 {
 108         struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
 109                                                 sizeof(struct tok_state));
 110         if (tok == NULL)
 111                 return NULL;
 112         tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
 113         tok->done = E_OK;
 114         tok->fp = NULL;
 115         tok->tabsize = TABSIZE;
 116         tok->indent = 0;
 117         tok->indstack[0] = 0;
 118         tok->atbol = 1;
 119         tok->pendin = 0;
 120         tok->prompt = tok->nextprompt = NULL;
 121         tok->lineno = 0;
 122         tok->level = 0;
 123         tok->filename = NULL;
 124         tok->altwarning = 0;
 125         tok->alterror = 0;
 126         tok->alttabsize = 1;
 127         tok->altindstack[0] = 0;
 128         tok->decoding_state = 0;
 129         tok->decoding_erred = 0;
 130         tok->read_coding_spec = 0;
 131         tok->encoding = NULL;
 132         tok->cont_line = 0;
 133 #ifndef PGEN
 134         tok->decoding_readline = NULL;
 135         tok->decoding_buffer = NULL;
 136 #endif
 137         return tok;
 138 }
 139
 140 #ifdef PGEN
 141
 142 static char *
 143 decoding_fgets(char *s, int size, struct tok_state *tok)
 144 {
 145         return fgets(s, size, tok->fp);
 146 }
 147
 148 static int
 149 decoding_feof(struct tok_state *tok)
 150 {
 151         return feof(tok->fp);
 152 }
 153
 154 static const char *
 155 decode_str(const char *str, struct tok_state *tok)
 156 {
 157         return str;
 158 }
 159
 160 #else /* PGEN */
 161
 162 static char *
 163 error_ret(struct tok_state *tok) /* XXX */
 164 {
 165         tok->decoding_erred = 1;
 166         if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
 167                 PyMem_FREE(tok->buf);
 168         tok->buf = NULL;
 169         return NULL;            /* as if it were EOF */
 170 }
 171
 172 static char *
 173 new_string(const char *s, Py_ssize_t len)
 174 {
 175         char* result = (char *)PyMem_MALLOC(len + 1);
 176         if (result != NULL) {
 177                 memcpy(result, s, len);
 178                 result[len] = '\0';
 179         }
 180         return result;
 181 }
 182
 183 static char *
 184 get_normal_name(char *s)        /* for utf-8 and latin-1 */
 185 {
 186         char buf[13];
 187         int i;
 188         for (i = 0; i < 12; i++) {
 189                 int c = s[i];
 190                 if (c == '\0') break;
 191                 else if (c == '_') buf[i] = '-';
 192                 else buf[i] = tolower(c);
 193         }
 194         buf[i] = '\0';
 195         if (strcmp(buf, "utf-8") == 0 ||
 196             strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
 197         else if (strcmp(buf, "latin-1") == 0 ||
 198                  strcmp(buf, "iso-8859-1") == 0 ||
 199                  strcmp(buf, "iso-latin-1") == 0 ||
 200                  strncmp(buf, "latin-1-", 8) == 0 ||
 201                  strncmp(buf, "iso-8859-1-", 11) == 0 ||
 202                  strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
 203         else return s;
 204 }
 205
 206 /* Return the coding spec in S, or NULL if none is found.  */
 207
 208 static char *
 209 get_coding_spec(const char *s, Py_ssize_t size)
 210 {
 211         Py_ssize_t i;
 212         /* Coding spec must be in a comment, and that comment must be
 213          * the only statement on the source code line. */
 214         for (i = 0; i < size - 6; i++) {
 215                 if (s[i] == '#')
 216                         break;
 217                 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
 218                         return NULL;
 219         }
 220         for (; i < size - 6; i++) { /* XXX inefficient search */
 221                 const char* t = s + i;
 222                 if (strncmp(t, "coding", 6) == 0) {
 223                         const char* begin = NULL;
 224                         t += 6;
 225                         if (t[0] != ':' && t[0] != '=')
 226                                 continue;
 227                         do {
 228                                 t++;
 229                         } while (t[0] == '\x20' || t[0] == '\t');
 230
 231                         begin = t;
 232                         while (isalnum(Py_CHARMASK(t[0])) ||
 233                                t[0] == '-' || t[0] == '_' || t[0] == '.')
 234                                 t++;
 235
 236                         if (begin < t) {
 237                                 char* r = new_string(begin, t - begin);
 238                                 char* q = get_normal_name(r);
 239                                 if (r != q) {
 240                                         PyMem_FREE(r);
 241                                         r = new_string(q, strlen(q));
 242                                 }
 243                                 return r;
 244                         }
 245                 }
 246         }
 247         return NULL;
 248 }
 249
 250 /* Check whether the line contains a coding spec. If it does,
 251    invoke the set_readline function for the new encoding.
 252    This function receives the tok_state and the new encoding.
 253    Return 1 on success, 0 on failure.  */
 254
 255 static int
 256 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 257                   int set_readline(struct tok_state *, const char *))
 258 {
 259         char * cs;
 260         int r = 1;
 261
 262         if (tok->cont_line)
 263                 /* It's a continuation line, so it can't be a coding spec. */
 264                 return 1;
 265         cs = get_coding_spec(line, size);
 266         if (cs != NULL) {
 267                 tok->read_coding_spec = 1;
 268                 if (tok->encoding == NULL) {
 269                         assert(tok->decoding_state == 1); /* raw */
 270                         if (strcmp(cs, "utf-8") == 0 ||
 271                             strcmp(cs, "iso-8859-1") == 0) {
 272                                 tok->encoding = cs;
 273                         } else {
 274 #ifdef Py_USING_UNICODE
 275                                 r = set_readline(tok, cs);
 276                                 if (r) {
 277                                         tok->encoding = cs;
 278                                         tok->decoding_state = -1;
 279                                 }
 280                                 else
 281                                         PyMem_FREE(cs);
 282 #else
 283                                 /* Without Unicode support, we cannot
 284                                    process the coding spec. Since there
 285                                    won't be any Unicode literals, that
 286                                    won't matter. */
 287                                 PyMem_FREE(cs);
 288 #endif
 289                         }
 290                 } else {        /* then, compare cs with BOM */
 291                         r = (strcmp(tok->encoding, cs) == 0);
 292                         PyMem_FREE(cs);
 293                 }
 294         }
 295         if (!r) {
 296                 cs = tok->encoding;
 297                 if (!cs)
 298                         cs = "with BOM";
 299                 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
 300         }
 301         return r;
 302 }
 303
 304 /* See whether the file starts with a BOM. If it does,
 305    invoke the set_readline function with the new encoding.
 306    Return 1 on success, 0 on failure.  */
 307
 308 static int
 309 check_bom(int get_char(struct tok_state *),
 310           void unget_char(int, struct tok_state *),
 311           int set_readline(struct tok_state *, const char *),
 312           struct tok_state *tok)
 313 {
 314         int ch = get_char(tok);
 315         tok->decoding_state = 1;
 316         if (ch == EOF) {
 317                 return 1;
 318         } else if (ch == 0xEF) {
 319                 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
 320                 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
 321 #if 0
 322         /* Disable support for UTF-16 BOMs until a decision
 323            is made whether this needs to be supported.  */
 324         } else if (ch == 0xFE) {
 325                 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
 326                 if (!set_readline(tok, "utf-16-be")) return 0;
 327                 tok->decoding_state = -1;
 328         } else if (ch == 0xFF) {
 329                 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
 330                 if (!set_readline(tok, "utf-16-le")) return 0;
 331                 tok->decoding_state = -1;
 332 #endif
 333         } else {
 334                 unget_char(ch, tok);
 335                 return 1;
 336         }
 337         if (tok->encoding != NULL)
 338                 PyMem_FREE(tok->encoding);
 339         tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
 340         return 1;
 341   NON_BOM:
 342         /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
 343         unget_char(0xFF, tok);  /* XXX this will cause a syntax error */
 344         return 1;
 345 }
 346
 347 /* Read a line of text from TOK into S, using the stream in TOK.
 348    Return NULL on failure, else S.
 349
 350    On entry, tok->decoding_buffer will be one of:
 351      1) NULL: need to call tok->decoding_readline to get a new line
 352      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
 353            stored the result in tok->decoding_buffer
 354      3) PyStringObject *: previous call to fp_readl did not have enough room
 355            (in the s buffer) to copy entire contents of the line read
 356            by tok->decoding_readline.  tok->decoding_buffer has the overflow.
 357            In this case, fp_readl is called in a loop (with an expanded buffer)
 358            until the buffer ends with a '\n' (or until the end of the file is
 359            reached): see tok_nextc and its calls to decoding_fgets.
 360 */
 361
 362 static char *
 363 fp_readl(char *s, int size, struct tok_state *tok)
 364 {
 365 #ifndef Py_USING_UNICODE
 366         /* In a non-Unicode built, this should never be called. */
 367         Py_FatalError("fp_readl should not be called in this build.");
 368         return NULL; /* Keep compiler happy (not reachable) */
 369 #else
 370         PyObject* utf8 = NULL;
 371         PyObject* buf = tok->decoding_buffer;
 372         char *str;
 373         Py_ssize_t utf8len;
 374
 375         /* Ask for one less byte so we can terminate it */
 376         assert(size > 0);
 377         size--;
 378
 379         if (buf == NULL) {
 380                 buf = PyObject_CallObject(tok->decoding_readline, NULL);
 381                 if (buf == NULL)
 382                         return error_ret(tok);
 383         } else {
 384                 tok->decoding_buffer = NULL;
 385                 if (PyString_CheckExact(buf))
 386                         utf8 = buf;
 387         }
 388         if (utf8 == NULL) {
 389                 utf8 = PyUnicode_AsUTF8String(buf);
 390                 Py_DECREF(buf);
 391                 if (utf8 == NULL)
 392                         return error_ret(tok);
 393         }
 394         str = PyString_AsString(utf8);
 395         utf8len = PyString_GET_SIZE(utf8);
 396         if (utf8len > size) {
 397                 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
 398                 if (tok->decoding_buffer == NULL) {
 399                         Py_DECREF(utf8);
 400                         return error_ret(tok);
 401                 }
 402                 utf8len = size;
 403         }
 404         memcpy(s, str, utf8len);
 405         s[utf8len] = '\0';
 406         Py_DECREF(utf8);
 407         if (utf8len == 0) return NULL; /* EOF */
 408         return s;
 409 #endif
 410 }
 411
 412 /* Set the readline function for TOK to a StreamReader's
 413    readline function. The StreamReader is named ENC.
 414
 415    This function is called from check_bom and check_coding_spec.
 416
 417    ENC is usually identical to the future value of tok->encoding,
 418    except for the (currently unsupported) case of UTF-16.
 419
 420    Return 1 on success, 0 on failure. */
 421
 422 static int
 423 fp_setreadl(struct tok_state *tok, const char* enc)
 424 {
 425         PyObject *reader, *stream, *readline;
 426
 427         /* XXX: constify filename argument. */
 428         stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
 429         if (stream == NULL)
 430                 return 0;
 431
 432         reader = PyCodec_StreamReader(enc, stream, NULL);
 433         Py_DECREF(stream);
 434         if (reader == NULL)
 435                 return 0;
 436
 437         readline = PyObject_GetAttrString(reader, "readline");
 438         Py_DECREF(reader);
 439         if (readline == NULL)
 440                 return 0;
 441
 442         tok->decoding_readline = readline;
 443         return 1;
 444 }
 445
 446 /* Fetch the next byte from TOK. */
 447
 448 static int fp_getc(struct tok_state *tok) {
 449         return getc(tok->fp);
 450 }
 451
 452 /* Unfetch the last byte back into TOK.  */
 453
 454 static void fp_ungetc(int c, struct tok_state *tok) {
 455         ungetc(c, tok->fp);
 456 }
 457
 458 /* Read a line of input from TOK. Determine encoding
 459    if necessary.  */
 460
 461 static char *
 462 decoding_fgets(char *s, int size, struct tok_state *tok)
 463 {
 464         char *line = NULL;
 465         int badchar = 0;
 466         for (;;) {
 467                 if (tok->decoding_state < 0) {
 468                         /* We already have a codec associated with
 469                            this input. */
 470                         line = fp_readl(s, size, tok);
 471                         break;
 472                 } else if (tok->decoding_state > 0) {
 473                         /* We want a 'raw' read. */
 474                         line = Py_UniversalNewlineFgets(s, size,
 475                                                         tok->fp, NULL);
 476                         break;
 477                 } else {
 478                         /* We have not yet determined the encoding.
 479                            If an encoding is found, use the file-pointer
 480                            reader functions from now on. */
 481                         if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 482                                 return error_ret(tok);
 483                         assert(tok->decoding_state != 0);
 484                 }
 485         }
 486         if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 487                 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 488                         return error_ret(tok);
 489                 }
 490         }
 491 #ifndef PGEN
 492         /* The default encoding is ASCII, so make sure we don't have any
 493            non-ASCII bytes in it. */
 494         if (line && !tok->encoding) {
 495                 unsigned char *c;
 496                 for (c = (unsigned char *)line; *c; c++)
 497                         if (*c > 127) {
 498                                 badchar = *c;
 499                                 break;
 500                         }
 501         }
 502         if (badchar) {
 503                 char buf[500];
 504                 /* Need to add 1 to the line number, since this line
 505                    has not been counted, yet.  */
 506                 sprintf(buf,
 507                         "Non-ASCII character '\\x%.2x' "
 508                         "in file %.200s on line %i, "
 509                         "but no encoding declared; "
 510                         "see http://www.python.org/peps/pep-0263.html for details",
 511                         badchar, tok->filename, tok->lineno + 1);
 512                 PyErr_SetString(PyExc_SyntaxError, buf);
 513                 return error_ret(tok);
 514         }
 515 #endif
 516         return line;
 517 }
 518
 519 static int
 520 decoding_feof(struct tok_state *tok)
 521 {
 522         if (tok->decoding_state >= 0) {
 523                 return feof(tok->fp);
 524         } else {
 525                 PyObject* buf = tok->decoding_buffer;
 526                 if (buf == NULL) {
 527                         buf = PyObject_CallObject(tok->decoding_readline, NULL);
 528                         if (buf == NULL) {
 529                                 error_ret(tok);
 530                                 return 1;
 531                         } else {
 532                                 tok->decoding_buffer = buf;
 533                         }
 534                 }
 535                 return PyObject_Length(buf) == 0;
 536         }
 537 }
 538
 539 /* Fetch a byte from TOK, using the string buffer. */
 540
 541 static int
 542 buf_getc(struct tok_state *tok) {
 543         return Py_CHARMASK(*tok->str++);
 544 }
 545
 546 /* Unfetch a byte from TOK, using the string buffer. */
 547
 548 static void
 549 buf_ungetc(int c, struct tok_state *tok) {
 550         tok->str--;
 551         assert(Py_CHARMASK(*tok->str) == c);    /* tok->cur may point to read-only segment */
 552 }
 553
 554 /* Set the readline function for TOK to ENC. For the string-based
 555    tokenizer, this means to just record the encoding. */
 556
 557 static int
 558 buf_setreadl(struct tok_state *tok, const char* enc) {
 559         tok->enc = enc;
 560         return 1;
 561 }
 562
 563 /* Return a UTF-8 encoding Python string object from the
 564    C byte string STR, which is encoded with ENC. */
 565
 566 #ifdef Py_USING_UNICODE
 567 static PyObject *
 568 translate_into_utf8(const char* str, const char* enc) {
 569         PyObject *utf8;
 570         PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
 571         if (buf == NULL)
 572                 return NULL;
 573         utf8 = PyUnicode_AsUTF8String(buf);
 574         Py_DECREF(buf);
 575         return utf8;
 576 }
 577 #endif
 578
 579 /* Decode a byte string STR for use as the buffer of TOK.
 580    Look for encoding declarations inside STR, and record them
 581    inside TOK.  */
 582
 583 static const char *
 584 decode_str(const char *str, struct tok_state *tok)
 585 {
 586         PyObject* utf8 = NULL;
 587         const char *s;
 588         int lineno = 0;
 589         tok->enc = NULL;
 590         tok->str = str;
 591         if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 592                 return error_ret(tok);
 593         str = tok->str;         /* string after BOM if any */
 594         assert(str);
 595 #ifdef Py_USING_UNICODE
 596         if (tok->enc != NULL) {
 597                 utf8 = translate_into_utf8(str, tok->enc);
 598                 if (utf8 == NULL)
 599                         return error_ret(tok);
 600                 str = PyString_AsString(utf8);
 601         }
 602 #endif
 603         for (s = str;; s++) {
 604                 if (*s == '\0') break;
 605                 else if (*s == '\n') {
 606                         lineno++;
 607                         if (lineno == 2) break;
 608                 }
 609         }
 610         tok->enc = NULL;
 611         if (!check_coding_spec(str, s - str, tok, buf_setreadl))
 612                 return error_ret(tok);
 613 #ifdef Py_USING_UNICODE
 614         if (tok->enc != NULL) {
 615                 assert(utf8 == NULL);
 616                 utf8 = translate_into_utf8(str, tok->enc);
 617                 if (utf8 == NULL) {
 618                         PyErr_Format(PyExc_SyntaxError,
 619                                 "unknown encoding: %s", tok->enc);
 620                         return error_ret(tok);
 621                 }
 622                 str = PyString_AsString(utf8);
 623         }
 624 #endif
 625         assert(tok->decoding_buffer == NULL);
 626         tok->decoding_buffer = utf8; /* CAUTION */
 627         return str;
 628 }
 629
 630 #endif /* PGEN */
 631
 632 /* Set up tokenizer for string */
 633
 634 struct tok_state *
 635 PyTokenizer_FromString(const char *str)
 636 {
 637         struct tok_state *tok = tok_new();
 638         if (tok == NULL)
 639                 return NULL;
 640         str = (char *)decode_str(str, tok);
 641         if (str == NULL) {
 642                 PyTokenizer_Free(tok);
 643                 return NULL;
 644         }
 645
 646         /* XXX: constify members. */
 647         tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 648         return tok;
 649 }
 650
 651
 652 /* Set up tokenizer for file */
 653
 654 struct tok_state *
 655 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 656 {
 657         struct tok_state *tok = tok_new();
 658         if (tok == NULL)
 659                 return NULL;
 660         if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
 661                 PyTokenizer_Free(tok);
 662                 return NULL;
 663         }
 664         tok->cur = tok->inp = tok->buf;
 665         tok->end = tok->buf + BUFSIZ;
 666         tok->fp = fp;
 667         tok->prompt = ps1;
 668         tok->nextprompt = ps2;
 669         return tok;
 670 }
 671
 672
 673 /* Free a tok_state structure */
 674
 675 void
 676 PyTokenizer_Free(struct tok_state *tok)
 677 {
 678         if (tok->encoding != NULL)
 679                 PyMem_FREE(tok->encoding);
 680 #ifndef PGEN
 681         Py_XDECREF(tok->decoding_readline);
 682         Py_XDECREF(tok->decoding_buffer);
 683 #endif
 684         if (tok->fp != NULL && tok->buf != NULL)
 685                 PyMem_FREE(tok->buf);
 686         PyMem_FREE(tok);
 687 }
 688
 689 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 690 static int
 691 tok_stdin_decode(struct tok_state *tok, char **inp)
 692 {
 693         PyObject *enc, *sysstdin, *decoded, *utf8;
 694         const char *encoding;
 695         char *converted;
 696
 697         if (PySys_GetFile((char *)"stdin", NULL) != stdin)
 698                 return 0;
 699         sysstdin = PySys_GetObject("stdin");
 700         if (sysstdin == NULL || !PyFile_Check(sysstdin))
 701                 return 0;
 702
 703         enc = ((PyFileObject *)sysstdin)->f_encoding;
 704         if (enc == NULL || !PyString_Check(enc))
 705                 return 0;
 706         Py_INCREF(enc);
 707
 708         encoding = PyString_AsString(enc);
 709         decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
 710         if (decoded == NULL)
 711                 goto error_clear;
 712
 713         utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
 714         Py_DECREF(decoded);
 715         if (utf8 == NULL)
 716                 goto error_clear;
 717
 718         assert(PyString_Check(utf8));
 719         converted = new_string(PyString_AS_STRING(utf8),
 720                                PyString_GET_SIZE(utf8));
 721         Py_DECREF(utf8);
 722         if (converted == NULL)
 723                 goto error_nomem;
 724
 725         PyMem_FREE(*inp);
 726         *inp = converted;
 727         if (tok->encoding != NULL)
 728                 PyMem_FREE(tok->encoding);
 729         tok->encoding = new_string(encoding, strlen(encoding));
 730         if (tok->encoding == NULL)
 731                 goto error_nomem;
 732
 733         Py_DECREF(enc);
 734         return 0;
 735
 736 error_nomem:
 737         Py_DECREF(enc);
 738         tok->done = E_NOMEM;
 739         return -1;
 740
 741 error_clear:
 742         /* Fallback to iso-8859-1: for backward compatibility */
 743         Py_DECREF(enc);
 744         PyErr_Clear();
 745         return 0;
 746 }
 747 #endif
 748
 749 /* Get next char, updating state; error code goes into tok->done */
 750
 751 static int
 752 tok_nextc(register struct tok_state *tok)
 753 {
 754         for (;;) {
 755                 if (tok->cur != tok->inp) {
 756                         return Py_CHARMASK(*tok->cur++); /* Fast path */
 757                 }
 758                 if (tok->done != E_OK)
 759                         return EOF;
 760                 if (tok->fp == NULL) {
 761                         char *end = strchr(tok->inp, '\n');
 762                         if (end != NULL)
 763                                 end++;
 764                         else {
 765                                 end = strchr(tok->inp, '\0');
 766                                 if (end == tok->inp) {
 767                                         tok->done = E_EOF;
 768                                         return EOF;
 769                                 }
 770                         }
 771                         if (tok->start == NULL)
 772                                 tok->buf = tok->cur;
 773                         tok->line_start = tok->cur;
 774                         tok->lineno++;
 775                         tok->inp = end;
 776                         return Py_CHARMASK(*tok->cur++);
 777                 }
 778                 if (tok->prompt != NULL) {
 779                         char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 780                         if (tok->nextprompt != NULL)
 781                                 tok->prompt = tok->nextprompt;
 782                         if (newtok == NULL)
 783                                 tok->done = E_INTR;
 784                         else if (*newtok == '\0') {
 785                                 PyMem_FREE(newtok);
 786                                 tok->done = E_EOF;
 787                         }
 788 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 789                         else if (tok_stdin_decode(tok, &newtok) != 0)
 790                                 PyMem_FREE(newtok);
 791 #endif
 792                         else if (tok->start != NULL) {
 793                                 size_t start = tok->start - tok->buf;
 794                                 size_t oldlen = tok->cur - tok->buf;
 795                                 size_t newlen = oldlen + strlen(newtok);
 796                                 char *buf = tok->buf;
 797                                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
 798                                 tok->lineno++;
 799                                 if (buf == NULL) {
 800                                         PyMem_FREE(tok->buf);
 801                                         tok->buf = NULL;
 802                                         PyMem_FREE(newtok);
 803                                         tok->done = E_NOMEM;
 804                                         return EOF;
 805                                 }
 806                                 tok->buf = buf;
 807                                 tok->cur = tok->buf + oldlen;
 808                                 tok->line_start = tok->cur;
 809                                 strcpy(tok->buf + oldlen, newtok);
 810                                 PyMem_FREE(newtok);
 811                                 tok->inp = tok->buf + newlen;
 812                                 tok->end = tok->inp + 1;
 813                                 tok->start = tok->buf + start;
 814                         }
 815                         else {
 816                                 tok->lineno++;
 817                                 if (tok->buf != NULL)
 818                                         PyMem_FREE(tok->buf);
 819                                 tok->buf = newtok;
 820                                 tok->line_start = tok->buf;
 821                                 tok->cur = tok->buf;
 822                                 tok->line_start = tok->buf;
 823                                 tok->inp = strchr(tok->buf, '\0');
 824                                 tok->end = tok->inp + 1;
 825                         }
 826                 }
 827                 else {
 828                         int done = 0;
 829                         Py_ssize_t cur = 0;
 830                         char *pt;
 831                         if (tok->start == NULL) {
 832                                 if (tok->buf == NULL) {
 833                                         tok->buf = (char *)
 834                                                 PyMem_MALLOC(BUFSIZ);
 835                                         if (tok->buf == NULL) {
 836                                                 tok->done = E_NOMEM;
 837                                                 return EOF;
 838                                         }
 839                                         tok->end = tok->buf + BUFSIZ;
 840                                 }
 841                                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
 842                                           tok) == NULL) {
 843                                         tok->done = E_EOF;
 844                                         done = 1;
 845                                 }
 846                                 else {
 847                                         tok->done = E_OK;
 848                                         tok->inp = strchr(tok->buf, '\0');
 849                                         done = tok->inp[-1] == '\n';
 850                                 }
 851                         }
 852                         else {
 853                                 cur = tok->cur - tok->buf;
 854                                 if (decoding_feof(tok)) {
 855                                         tok->done = E_EOF;
 856                                         done = 1;
 857                                 }
 858                                 else
 859                                         tok->done = E_OK;
 860                         }
 861                         tok->lineno++;
 862                         /* Read until '\n' or EOF */
 863                         while (!done) {
 864                                 Py_ssize_t curstart = tok->start == NULL ? -1 :
 865                                                   tok->start - tok->buf;
 866                                 Py_ssize_t curvalid = tok->inp - tok->buf;
 867                                 Py_ssize_t newsize = curvalid + BUFSIZ;
 868                                 char *newbuf = tok->buf;
 869                                 newbuf = (char *)PyMem_REALLOC(newbuf,
 870                                                                newsize);
 871                                 if (newbuf == NULL) {
 872                                         tok->done = E_NOMEM;
 873                                         tok->cur = tok->inp;
 874                                         return EOF;
 875                                 }
 876                                 tok->buf = newbuf;
 877                                 tok->inp = tok->buf + curvalid;
 878                                 tok->end = tok->buf + newsize;
 879                                 tok->start = curstart < 0 ? NULL :
 880                                              tok->buf + curstart;
 881                                 if (decoding_fgets(tok->inp,
 882                                                (int)(tok->end - tok->inp),
 883                                                tok) == NULL) {
 884                                         /* Break out early on decoding
 885                                            errors, as tok->buf will be NULL
 886                                          */
 887                                         if (tok->decoding_erred)
 888                                                 return EOF;
 889                                         /* Last line does not end in \n,
 890                                            fake one */
 891                                         strcpy(tok->inp, "\n");
 892                                 }
 893                                 tok->inp = strchr(tok->inp, '\0');
 894                                 done = tok->inp[-1] == '\n';
 895                         }
 896                         if (tok->buf != NULL) {
 897                                 tok->cur = tok->buf + cur;
 898                                 tok->line_start = tok->cur;
 899                                 /* replace "\r\n" with "\n" */
 900                                 /* For Mac leave the \r, giving a syntax error */
 901                                 pt = tok->inp - 2;
 902                                 if (pt >= tok->buf && *pt == '\r') {
 903                                         *pt++ = '\n';
 904                                         *pt = '\0';
 905                                         tok->inp = pt;
 906                                 }
 907                         }
 908                 }
 909                 if (tok->done != E_OK) {
 910                         if (tok->prompt != NULL)
 911                                 PySys_WriteStderr("\n");
 912                         tok->cur = tok->inp;
 913                         return EOF;
 914                 }
 915         }
 916         /*NOTREACHED*/
 917 }
 918
 919
 920 /* Back-up one character */
 921
 922 static void
 923 tok_backup(register struct tok_state *tok, register int c)
 924 {
 925         if (c != EOF) {
 926                 if (--tok->cur < tok->buf)
 927                         Py_FatalError("tok_backup: begin of buffer");
 928                 if (*tok->cur != c)
 929                         *tok->cur = c;
 930         }
 931 }
 932
 933
 934 /* Return the token corresponding to a single character */
 935
 936 int
 937 PyToken_OneChar(int c)
 938 {
 939         switch (c) {
 940         case '(':       return LPAR;
 941         case ')':       return RPAR;
 942         case '[':       return LSQB;
 943         case ']':       return RSQB;
 944         case ':':       return COLON;
 945         case ',':       return COMMA;
 946         case ';':       return SEMI;
 947         case '+':       return PLUS;
 948         case '-':       return MINUS;
 949         case '*':       return STAR;
 950         case '/':       return SLASH;
 951         case '|':       return VBAR;
 952         case '&':       return AMPER;
 953         case '<':       return LESS;
 954         case '>':       return GREATER;
 955         case '=':       return EQUAL;
 956         case '.':       return DOT;
 957         case '%':       return PERCENT;
 958         case '`':       return BACKQUOTE;
 959         case '{':       return LBRACE;
 960         case '}':       return RBRACE;
 961         case '^':       return CIRCUMFLEX;
 962         case '~':       return TILDE;
 963         case '@':       return AT;
 964         default:        return OP;
 965         }
 966 }
 967
 968
 969 int
 970 PyToken_TwoChars(int c1, int c2)
 971 {
 972         switch (c1) {
 973         case '=':
 974                 switch (c2) {
 975                 case '=':       return EQEQUAL;
 976                 }
 977                 break;
 978         case '!':
 979                 switch (c2) {
 980                 case '=':       return NOTEQUAL;
 981                 }
 982                 break;
 983         case '<':
 984                 switch (c2) {
 985                 case '>':       return NOTEQUAL;
 986                 case '=':       return LESSEQUAL;
 987                 case '<':       return LEFTSHIFT;
 988                 }
 989                 break;
 990         case '>':
 991                 switch (c2) {
 992                 case '=':       return GREATEREQUAL;
 993                 case '>':       return RIGHTSHIFT;
 994                 }
 995                 break;
 996         case '+':
 997                 switch (c2) {
 998                 case '=':       return PLUSEQUAL;
 999                 }
1000                 break;
1001         case '-':
1002                 switch (c2) {
1003                 case '=':       return MINEQUAL;
1004                 }
1005                 break;
1006         case '*':
1007                 switch (c2) {
1008                 case '*':       return DOUBLESTAR;
1009                 case '=':       return STAREQUAL;
1010                 }
1011                 break;
1012         case '/':
1013                 switch (c2) {
1014                 case '/':       return DOUBLESLASH;
1015                 case '=':       return SLASHEQUAL;
1016                 }
1017                 break;
1018         case '|':
1019                 switch (c2) {
1020                 case '=':       return VBAREQUAL;
1021                 }
1022                 break;
1023         case '%':
1024                 switch (c2) {
1025                 case '=':       return PERCENTEQUAL;
1026                 }
1027                 break;
1028         case '&':
1029                 switch (c2) {
1030                 case '=':       return AMPEREQUAL;
1031                 }
1032                 break;
1033         case '^':
1034                 switch (c2) {
1035                 case '=':       return CIRCUMFLEXEQUAL;
1036                 }
1037                 break;
1038         }
1039         return OP;
1040 }
1041
1042 int
1043 PyToken_ThreeChars(int c1, int c2, int c3)
1044 {
1045         switch (c1) {
1046         case '<':
1047                 switch (c2) {
1048                 case '<':
1049                         switch (c3) {
1050                         case '=':
1051                                 return LEFTSHIFTEQUAL;
1052                         }
1053                         break;
1054                 }
1055                 break;
1056         case '>':
1057                 switch (c2) {
1058                 case '>':
1059                         switch (c3) {
1060                         case '=':
1061                                 return RIGHTSHIFTEQUAL;
1062                         }
1063                         break;
1064                 }
1065                 break;
1066         case '*':
1067                 switch (c2) {
1068                 case '*':
1069                         switch (c3) {
1070                         case '=':
1071                                 return DOUBLESTAREQUAL;
1072                         }
1073                         break;
1074                 }
1075                 break;
1076         case '/':
1077                 switch (c2) {
1078                 case '/':
1079                         switch (c3) {
1080                         case '=':
1081                                 return DOUBLESLASHEQUAL;
1082                         }
1083                         break;
1084                 }
1085                 break;
1086         }
1087         return OP;
1088 }
1089
1090 static int
1091 indenterror(struct tok_state *tok)
1092 {
1093         if (tok->alterror) {
1094                 tok->done = E_TABSPACE;
1095                 tok->cur = tok->inp;
1096                 return 1;
1097         }
1098         if (tok->altwarning) {
1099                 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1100                                   "in indentation\n", tok->filename);
1101                 tok->altwarning = 0;
1102         }
1103         return 0;
1104 }
1105
1106
1107 /* Get next token, after space stripping etc. */
1108
1109 static int
1110 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1111 {
1112         register int c;
1113         int blankline;
1114
1115         *p_start = *p_end = NULL;
1116   nextline:
1117         tok->start = NULL;
1118         blankline = 0;
1119
1120         /* Get indentation level */
1121         if (tok->atbol) {
1122                 register int col = 0;
1123                 register int altcol = 0;
1124                 tok->atbol = 0;
1125                 for (;;) {
1126                         c = tok_nextc(tok);
1127                         if (c == ' ')
1128                                 col++, altcol++;
1129                         else if (c == '\t') {
1130                                 col = (col/tok->tabsize + 1) * tok->tabsize;
1131                                 altcol = (altcol/tok->alttabsize + 1)
1132                                         * tok->alttabsize;
1133                         }
1134                         else if (c == '\014') /* Control-L (formfeed) */
1135                                 col = altcol = 0; /* For Emacs users */
1136                         else
1137                                 break;
1138                 }
1139                 tok_backup(tok, c);
1140                 if (c == '#' || c == '\n') {
1141                         /* Lines with only whitespace and/or comments
1142                            shouldn't affect the indentation and are
1143                            not passed to the parser as NEWLINE tokens,
1144                            except *totally* empty lines in interactive
1145                            mode, which signal the end of a command group. */
1146                         if (col == 0 && c == '\n' && tok->prompt != NULL)
1147                                 blankline = 0; /* Let it through */
1148                         else
1149                                 blankline = 1; /* Ignore completely */
1150                         /* We can't jump back right here since we still
1151                            may need to skip to the end of a comment */
1152                 }
1153                 if (!blankline && tok->level == 0) {
1154                         if (col == tok->indstack[tok->indent]) {
1155                                 /* No change */
1156                                 if (altcol != tok->altindstack[tok->indent]) {
1157                                         if (indenterror(tok))
1158                                                 return ERRORTOKEN;
1159                                 }
1160                         }
1161                         else if (col > tok->indstack[tok->indent]) {
1162                                 /* Indent -- always one */
1163                                 if (tok->indent+1 >= MAXINDENT) {
1164                                         tok->done = E_TOODEEP;
1165                                         tok->cur = tok->inp;
1166                                         return ERRORTOKEN;
1167                                 }
1168                                 if (altcol <= tok->altindstack[tok->indent]) {
1169                                         if (indenterror(tok))
1170                                                 return ERRORTOKEN;
1171                                 }
1172                                 tok->pendin++;
1173                                 tok->indstack[++tok->indent] = col;
1174                                 tok->altindstack[tok->indent] = altcol;
1175                         }
1176                         else /* col < tok->indstack[tok->indent] */ {
1177                                 /* Dedent -- any number, must be consistent */
1178                                 while (tok->indent > 0 &&
1179                                         col < tok->indstack[tok->indent]) {
1180                                         tok->pendin--;
1181                                         tok->indent--;
1182                                 }
1183                                 if (col != tok->indstack[tok->indent]) {
1184                                         tok->done = E_DEDENT;
1185                                         tok->cur = tok->inp;
1186                                         return ERRORTOKEN;
1187                                 }
1188                                 if (altcol != tok->altindstack[tok->indent]) {
1189                                         if (indenterror(tok))
1190                                                 return ERRORTOKEN;
1191                                 }
1192                         }
1193                 }
1194         }
1195
1196         tok->start = tok->cur;
1197
1198         /* Return pending indents/dedents */
1199         if (tok->pendin != 0) {
1200                 if (tok->pendin < 0) {
1201                         tok->pendin++;
1202                         return DEDENT;
1203                 }
1204                 else {
1205                         tok->pendin--;
1206                         return INDENT;
1207                 }
1208         }
1209
1210  again:
1211         tok->start = NULL;
1212         /* Skip spaces */
1213         do {
1214                 c = tok_nextc(tok);
1215         } while (c == ' ' || c == '\t' || c == '\014');
1216
1217         /* Set start of current token */
1218         tok->start = tok->cur - 1;
1219
1220         /* Skip comment, while looking for tab-setting magic */
1221         if (c == '#') {
1222                 static char *tabforms[] = {
1223                         "tab-width:",           /* Emacs */
1224                         ":tabstop=",            /* vim, full form */
1225                         ":ts=",                 /* vim, abbreviated form */
1226                         "set tabsize=",         /* will vi never die? */
1227                 /* more templates can be added here to support other editors */
1228                 };
1229                 char cbuf[80];
1230                 char *tp, **cp;
1231                 tp = cbuf;
1232                 do {
1233                         *tp++ = c = tok_nextc(tok);
1234                 } while (c != EOF && c != '\n' &&
1235                          (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1236                 *tp = '\0';
1237                 for (cp = tabforms;
1238                      cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1239                      cp++) {
1240                         if ((tp = strstr(cbuf, *cp))) {
1241                                 int newsize = atoi(tp + strlen(*cp));
1242
1243                                 if (newsize >= 1 && newsize <= 40) {
1244                                         tok->tabsize = newsize;
1245                                         if (Py_VerboseFlag)
1246                                             PySys_WriteStderr(
1247                                                 "Tab size set to %d\n",
1248                                                 newsize);
1249                                 }
1250                         }
1251                 }
1252                 while (c != EOF && c != '\n')
1253                         c = tok_nextc(tok);
1254         }
1255
1256         /* Check for EOF and errors now */
1257         if (c == EOF) {
1258                 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1259         }
1260
1261         /* Identifier (most frequent token!) */
1262         if (isalpha(c) || c == '_') {
1263                 /* Process r"", u"" and ur"" */
1264                 switch (c) {
1265                 case 'r':
1266                 case 'R':
1267                         c = tok_nextc(tok);
1268                         if (c == '"' || c == '\'')
1269                                 goto letter_quote;
1270                         break;
1271                 case 'u':
1272                 case 'U':
1273                         c = tok_nextc(tok);
1274                         if (c == 'r' || c == 'R')
1275                                 c = tok_nextc(tok);
1276                         if (c == '"' || c == '\'')
1277                                 goto letter_quote;
1278                         break;
1279                 }
1280                 while (isalnum(c) || c == '_') {
1281                         c = tok_nextc(tok);
1282                 }
1283                 tok_backup(tok, c);
1284                 *p_start = tok->start;
1285                 *p_end = tok->cur;
1286                 return NAME;
1287         }
1288
1289         /* Newline */
1290         if (c == '\n') {
1291                 tok->atbol = 1;
1292                 if (blankline || tok->level > 0)
1293                         goto nextline;
1294                 *p_start = tok->start;
1295                 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1296                 tok->cont_line = 0;
1297                 return NEWLINE;
1298         }
1299
1300         /* Period or number starting with period? */
1301         if (c == '.') {
1302                 c = tok_nextc(tok);
1303                 if (isdigit(c)) {
1304                         goto fraction;
1305                 }
1306                 else {
1307                         tok_backup(tok, c);
1308                         *p_start = tok->start;
1309                         *p_end = tok->cur;
1310                         return DOT;
1311                 }
1312         }
1313
1314         /* Number */
1315         if (isdigit(c)) {
1316                 if (c == '0') {
1317                         /* Hex or octal -- maybe. */
1318                         c = tok_nextc(tok);
1319                         if (c == '.')
1320                                 goto fraction;
1321 #ifndef WITHOUT_COMPLEX
1322                         if (c == 'j' || c == 'J')
1323                                 goto imaginary;
1324 #endif
1325                         if (c == 'x' || c == 'X') {
1326                                 /* Hex */
1327                                 do {
1328                                         c = tok_nextc(tok);
1329                                 } while (isxdigit(c));
1330                         }
1331                         else {
1332                                 int found_decimal = 0;
1333                                 /* Octal; c is first char of it */
1334                                 /* There's no 'isoctdigit' macro, sigh */
1335                                 while ('0' <= c && c < '8') {
1336                                         c = tok_nextc(tok);
1337                                 }
1338                                 if (isdigit(c)) {
1339                                         found_decimal = 1;
1340                                         do {
1341                                                 c = tok_nextc(tok);
1342                                         } while (isdigit(c));
1343                                 }
1344                                 if (c == '.')
1345                                         goto fraction;
1346                                 else if (c == 'e' || c == 'E')
1347                                         goto exponent;
1348 #ifndef WITHOUT_COMPLEX
1349                                 else if (c == 'j' || c == 'J')
1350                                         goto imaginary;
1351 #endif
1352                                 else if (found_decimal) {
1353                                         tok->done = E_TOKEN;
1354                                         tok_backup(tok, c);
1355                                         return ERRORTOKEN;
1356                                 }
1357                         }
1358                         if (c == 'l' || c == 'L')
1359                                 c = tok_nextc(tok);
1360                 }
1361                 else {
1362                         /* Decimal */
1363                         do {
1364                                 c = tok_nextc(tok);
1365                         } while (isdigit(c));
1366                         if (c == 'l' || c == 'L')
1367                                 c = tok_nextc(tok);
1368                         else {
1369                                 /* Accept floating point numbers. */
1370                                 if (c == '.') {
1371                 fraction:
1372                                         /* Fraction */
1373                                         do {
1374                                                 c = tok_nextc(tok);
1375                                         } while (isdigit(c));
1376                                 }
1377                                 if (c == 'e' || c == 'E') {
1378                 exponent:
1379                                         /* Exponent part */
1380                                         c = tok_nextc(tok);
1381                                         if (c == '+' || c == '-')
1382                                                 c = tok_nextc(tok);
1383                                         if (!isdigit(c)) {
1384                                                 tok->done = E_TOKEN;
1385                                                 tok_backup(tok, c);
1386                                                 return ERRORTOKEN;
1387                                         }
1388                                         do {
1389                                                 c = tok_nextc(tok);
1390                                         } while (isdigit(c));
1391                                 }
1392 #ifndef WITHOUT_COMPLEX
1393                                 if (c == 'j' || c == 'J')
1394                                         /* Imaginary part */
1395                 imaginary:
1396                                         c = tok_nextc(tok);
1397 #endif
1398                         }
1399                 }
1400                 tok_backup(tok, c);
1401                 *p_start = tok->start;
1402                 *p_end = tok->cur;
1403                 return NUMBER;
1404         }
1405
1406   letter_quote:
1407         /* String */
1408         if (c == '\'' || c == '"') {
1409                 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1410                 int quote = c;
1411                 int triple = 0;
1412                 int tripcount = 0;
1413                 for (;;) {
1414                         c = tok_nextc(tok);
1415                         if (c == '\n') {
1416                                 if (!triple) {
1417                                         tok->done = E_EOLS;
1418                                         tok_backup(tok, c);
1419                                         return ERRORTOKEN;
1420                                 }
1421                                 tripcount = 0;
1422                                 tok->cont_line = 1; /* multiline string. */
1423                         }
1424                         else if (c == EOF) {
1425                                 if (triple)
1426                                         tok->done = E_EOFS;
1427                                 else
1428                                         tok->done = E_EOLS;
1429                                 tok->cur = tok->inp;
1430                                 return ERRORTOKEN;
1431                         }
1432                         else if (c == quote) {
1433                                 tripcount++;
1434                                 if (tok->cur - tok->start == quote2) {
1435                                         c = tok_nextc(tok);
1436                                         if (c == quote) {
1437                                                 triple = 1;
1438                                                 tripcount = 0;
1439                                                 continue;
1440                                         }
1441                                         tok_backup(tok, c);
1442                                 }
1443                                 if (!triple || tripcount == 3)
1444                                         break;
1445                         }
1446                         else if (c == '\\') {
1447                                 tripcount = 0;
1448                                 c = tok_nextc(tok);
1449                                 if (c == EOF) {
1450                                         tok->done = E_EOLS;
1451                                         tok->cur = tok->inp;
1452                                         return ERRORTOKEN;
1453                                 }
1454                         }
1455                         else
1456                                 tripcount = 0;
1457                 }
1458                 *p_start = tok->start;
1459                 *p_end = tok->cur;
1460                 return STRING;
1461         }
1462
1463         /* Line continuation */
1464         if (c == '\\') {
1465                 c = tok_nextc(tok);
1466                 if (c != '\n') {
1467                         tok->done = E_LINECONT;
1468                         tok->cur = tok->inp;
1469                         return ERRORTOKEN;
1470                 }
1471                 tok->cont_line = 1;
1472                 goto again; /* Read next line */
1473         }
1474
1475         /* Check for two-character token */
1476         {
1477                 int c2 = tok_nextc(tok);
1478                 int token = PyToken_TwoChars(c, c2);
1479                 if (token != OP) {
1480                         int c3 = tok_nextc(tok);
1481                         int token3 = PyToken_ThreeChars(c, c2, c3);
1482                         if (token3 != OP) {
1483                                 token = token3;
1484                         } else {
1485                                 tok_backup(tok, c3);
1486                         }
1487                         *p_start = tok->start;
1488                         *p_end = tok->cur;
1489                         return token;
1490                 }
1491                 tok_backup(tok, c2);
1492         }
1493
1494         /* Keep track of parentheses nesting level */
1495         switch (c) {
1496         case '(':
1497         case '[':
1498         case '{':
1499                 tok->level++;
1500                 break;
1501         case ')':
1502         case ']':
1503         case '}':
1504                 tok->level--;
1505                 break;
1506         }
1507
1508         /* Punctuation character */
1509         *p_start = tok->start;
1510         *p_end = tok->cur;
1511         return PyToken_OneChar(c);
1512 }
1513
1514 int
1515 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1516 {
1517         int result = tok_get(tok, p_start, p_end);
1518         if (tok->decoding_erred) {
1519                 result = ERRORTOKEN;
1520                 tok->done = E_DECODE;
1521         }
1522         return result;
1523 }
1524
1525 /* This function is only called from parsetok. However, it cannot live
1526    there, as it must be empty for PGEN, and we can check for PGEN only
1527    in this file. */
1528
1529 #ifdef PGEN
1530 char*
1531 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1532 {
1533         return NULL;
1534 }
1535 #else
1536 static PyObject *
1537 dec_utf8(const char *enc, const char *text, size_t len) {
1538         PyObject *ret = NULL;
1539         PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1540         if (unicode_text) {
1541                 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1542                 Py_DECREF(unicode_text);
1543         }
1544         if (!ret) {
1545                 PyErr_Print();
1546         }
1547         return ret;
1548 }
1549
1550 char *
1551 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1552 {
1553         char *text = NULL;
1554         if (tok->encoding) {
1555                 /* convert source to original encondig */
1556                 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1557                 if (lineobj != NULL) {
1558                         int linelen = PyString_Size(lineobj);
1559                         const char *line = PyString_AsString(lineobj);
1560                         text = PyObject_MALLOC(linelen + 1);
1561                         if (text != NULL && line != NULL) {
1562                                 if (linelen)
1563                                         strncpy(text, line, linelen);
1564                                 text[linelen] = '\0';
1565                         }
1566                         Py_DECREF(lineobj);
1567
1568                         /* adjust error offset */
1569                         if (*offset > 1) {
1570                                 PyObject *offsetobj = dec_utf8(tok->encoding,
1571                                                                tok->buf, *offset-1);
1572                                 if (offsetobj) {
1573                                         *offset = PyString_Size(offsetobj) + 1;
1574                                         Py_DECREF(offsetobj);
1575                                 }
1576                         }
1577
1578                 }
1579         }
1580         return text;
1581
1582 }
1583 #endif
1584
1585
1586
1587 #ifdef Py_DEBUG
1588
1589 void
1590 tok_dump(int type, char *start, char *end)
1591 {
1592         printf("%s", _PyParser_TokenNames[type]);
1593         if (type == NAME || type == NUMBER || type == STRING || type == OP)
1594                 printf("(%.*s)", (int)(end - start), start);
1595 }
1596
1597 #endif