/* Source: python.git, Parser/tokenizer.c
   (blob 0f6705de0b5559f57d13fce6b6a8d9fc10000655; tree browsed at the
   checkin for issue #3366, "Add gamma function to math module") */
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
35 /* Token names */
/* Human-readable token names, indexed by token number.
   This table must stay in one-to-one correspondence with the
   #defines in token.h! */
char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
96 /* Create and initialize a new tok_state structure */
98 static struct tok_state *
99 tok_new(void)
101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
103 if (tok == NULL)
104 return NULL;
105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
126 #ifndef PGEN
127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
129 #endif
130 return tok;
133 #ifdef PGEN
135 static char *
136 decoding_fgets(char *s, int size, struct tok_state *tok)
138 return fgets(s, size, tok->fp);
141 static int
142 decoding_feof(struct tok_state *tok)
144 return feof(tok->fp);
147 static const char *
148 decode_str(const char *str, struct tok_state *tok)
150 return str;
153 #else /* PGEN */
155 static char *
156 error_ret(struct tok_state *tok) /* XXX */
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
160 PyMem_FREE(tok->buf);
161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
165 static char *
166 new_string(const char *s, Py_ssize_t len)
168 char* result = (char *)PyMem_MALLOC(len + 1);
169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
173 return result;
/* Normalize an encoding name: lowercase it, map '_' to '-', and fold
   the common aliases of utf-8 and iso-8859-1 onto their canonical
   spellings.  Returns a string literal for a recognized alias, or S
   itself (unmodified) when the name is not recognized.  Only the
   first 12 characters are examined, which suffices for every alias
   tested below. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* Cast to unsigned char: passing a negative value
			   (a high-bit byte where char is signed) to
			   tolower() is undefined behavior (CERT STR37-C). */
			buf[i] = tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
199 /* Return the coding spec in S, or NULL if none is found. */
201 static char *
202 get_coding_spec(const char *s, Py_ssize_t size)
204 Py_ssize_t i;
205 /* Coding spec must be in a comment, and that comment must be
206 * the only statement on the source code line. */
207 for (i = 0; i < size - 6; i++) {
208 if (s[i] == '#')
209 break;
210 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211 return NULL;
213 for (; i < size - 6; i++) { /* XXX inefficient search */
214 const char* t = s + i;
215 if (strncmp(t, "coding", 6) == 0) {
216 const char* begin = NULL;
217 t += 6;
218 if (t[0] != ':' && t[0] != '=')
219 continue;
220 do {
221 t++;
222 } while (t[0] == '\x20' || t[0] == '\t');
224 begin = t;
225 while (isalnum(Py_CHARMASK(t[0])) ||
226 t[0] == '-' || t[0] == '_' || t[0] == '.')
227 t++;
229 if (begin < t) {
230 char* r = new_string(begin, t - begin);
231 char* q = get_normal_name(r);
232 if (r != q) {
233 PyMem_FREE(r);
234 r = new_string(q, strlen(q));
236 return r;
240 return NULL;
243 /* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
248 static int
249 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
250 int set_readline(struct tok_state *, const char *))
252 char * cs;
253 int r = 1;
255 if (tok->cont_line)
256 /* It's a continuation line, so it can't be a coding spec. */
257 return 1;
258 cs = get_coding_spec(line, size);
259 if (cs != NULL) {
260 tok->read_coding_spec = 1;
261 if (tok->encoding == NULL) {
262 assert(tok->decoding_state == 1); /* raw */
263 if (strcmp(cs, "utf-8") == 0 ||
264 strcmp(cs, "iso-8859-1") == 0) {
265 tok->encoding = cs;
266 } else {
267 #ifdef Py_USING_UNICODE
268 r = set_readline(tok, cs);
269 if (r) {
270 tok->encoding = cs;
271 tok->decoding_state = -1;
273 else
274 PyMem_FREE(cs);
275 #else
276 /* Without Unicode support, we cannot
277 process the coding spec. Since there
278 won't be any Unicode literals, that
279 won't matter. */
280 PyMem_FREE(cs);
281 #endif
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
285 PyMem_FREE(cs);
288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
294 return r;
297 /* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
301 static int
302 check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
307 int ch = get_char(tok);
308 tok->decoding_state = 1;
309 if (ch == EOF) {
310 return 1;
311 } else if (ch == 0xEF) {
312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314 #if 0
315 /* Disable support for UTF-16 BOMs until a decision
316 is made whether this needs to be supported. */
317 } else if (ch == 0xFE) {
318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319 if (!set_readline(tok, "utf-16-be")) return 0;
320 tok->decoding_state = -1;
321 } else if (ch == 0xFF) {
322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323 if (!set_readline(tok, "utf-16-le")) return 0;
324 tok->decoding_state = -1;
325 #endif
326 } else {
327 unget_char(ch, tok);
328 return 1;
330 if (tok->encoding != NULL)
331 PyMem_FREE(tok->encoding);
332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333 return 1;
334 NON_BOM:
335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337 return 1;
340 /* Read a line of text from TOK into S, using the stream in TOK.
341 Return NULL on failure, else S.
343 On entry, tok->decoding_buffer will be one of:
344 1) NULL: need to call tok->decoding_readline to get a new line
345 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
346 stored the result in tok->decoding_buffer
347 3) PyStringObject *: previous call to fp_readl did not have enough room
348 (in the s buffer) to copy entire contents of the line read
349 by tok->decoding_readline. tok->decoding_buffer has the overflow.
350 In this case, fp_readl is called in a loop (with an expanded buffer)
351 until the buffer ends with a '\n' (or until the end of the file is
352 reached): see tok_nextc and its calls to decoding_fgets.
/* Read one decoded line into S (SIZE bytes, including room for the
   terminating NUL), using tok->decoding_readline.  If the decoded line
   does not fit, the overflow is parked in tok->decoding_buffer as a
   string object so the next call can continue where this one stopped.
   Returns S, or NULL on EOF/error (error_ret sets the error state). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL;	/* Keep compiler happy (not reachable) */
#else
	PyObject *utf8 = NULL;
	PyObject *buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		/* No leftover data: fetch a new line from the codec. */
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		/* Consume the parked overflow; if it is already a plain
		   string it is already utf-8 encoded. */
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;
	}
	if (utf8 == NULL) {
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		/* Line too long for S: stash the remainder for the next
		   call and copy only what fits. */
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL;	/* EOF */
	return s;
#endif
}
405 /* Set the readline function for TOK to a StreamReader's
406 readline function. The StreamReader is named ENC.
408 This function is called from check_bom and check_coding_spec.
410 ENC is usually identical to the future value of tok->encoding,
411 except for the (currently unsupported) case of UTF-16.
413 Return 1 on success, 0 on failure. */
415 static int
416 fp_setreadl(struct tok_state *tok, const char* enc)
418 PyObject *reader, *stream, *readline;
420 /* XXX: constify filename argument. */
421 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
422 if (stream == NULL)
423 return 0;
425 reader = PyCodec_StreamReader(enc, stream, NULL);
426 Py_DECREF(stream);
427 if (reader == NULL)
428 return 0;
430 readline = PyObject_GetAttrString(reader, "readline");
431 Py_DECREF(reader);
432 if (readline == NULL)
433 return 0;
435 tok->decoding_readline = readline;
436 return 1;
439 /* Fetch the next byte from TOK. */
441 static int fp_getc(struct tok_state *tok) {
442 return getc(tok->fp);
445 /* Unfetch the last byte back into TOK. */
447 static void fp_ungetc(int c, struct tok_state *tok) {
448 ungetc(c, tok->fp);
451 /* Read a line of input from TOK. Determine encoding
452 if necessary. */
454 static char *
455 decoding_fgets(char *s, int size, struct tok_state *tok)
457 char *line = NULL;
458 int badchar = 0;
459 for (;;) {
460 if (tok->decoding_state < 0) {
461 /* We already have a codec associated with
462 this input. */
463 line = fp_readl(s, size, tok);
464 break;
465 } else if (tok->decoding_state > 0) {
466 /* We want a 'raw' read. */
467 line = Py_UniversalNewlineFgets(s, size,
468 tok->fp, NULL);
469 break;
470 } else {
471 /* We have not yet determined the encoding.
472 If an encoding is found, use the file-pointer
473 reader functions from now on. */
474 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
475 return error_ret(tok);
476 assert(tok->decoding_state != 0);
479 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
480 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
481 return error_ret(tok);
484 #ifndef PGEN
485 /* The default encoding is ASCII, so make sure we don't have any
486 non-ASCII bytes in it. */
487 if (line && !tok->encoding) {
488 unsigned char *c;
489 for (c = (unsigned char *)line; *c; c++)
490 if (*c > 127) {
491 badchar = *c;
492 break;
495 if (badchar) {
496 char buf[500];
497 /* Need to add 1 to the line number, since this line
498 has not been counted, yet. */
499 sprintf(buf,
500 "Non-ASCII character '\\x%.2x' "
501 "in file %.200s on line %i, "
502 "but no encoding declared; "
503 "see http://www.python.org/peps/pep-0263.html for details",
504 badchar, tok->filename, tok->lineno + 1);
505 PyErr_SetString(PyExc_SyntaxError, buf);
506 return error_ret(tok);
508 #endif
509 return line;
512 static int
513 decoding_feof(struct tok_state *tok)
515 if (tok->decoding_state >= 0) {
516 return feof(tok->fp);
517 } else {
518 PyObject* buf = tok->decoding_buffer;
519 if (buf == NULL) {
520 buf = PyObject_CallObject(tok->decoding_readline, NULL);
521 if (buf == NULL) {
522 error_ret(tok);
523 return 1;
524 } else {
525 tok->decoding_buffer = buf;
528 return PyObject_Length(buf) == 0;
532 /* Fetch a byte from TOK, using the string buffer. */
534 static int
535 buf_getc(struct tok_state *tok) {
536 return Py_CHARMASK(*tok->str++);
539 /* Unfetch a byte from TOK, using the string buffer. */
541 static void
542 buf_ungetc(int c, struct tok_state *tok) {
543 tok->str--;
544 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
547 /* Set the readline function for TOK to ENC. For the string-based
548 tokenizer, this means to just record the encoding. */
550 static int
551 buf_setreadl(struct tok_state *tok, const char* enc) {
552 tok->enc = enc;
553 return 1;
556 /* Return a UTF-8 encoding Python string object from the
557 C byte string STR, which is encoded with ENC. */
559 #ifdef Py_USING_UNICODE
560 static PyObject *
561 translate_into_utf8(const char* str, const char* enc) {
562 PyObject *utf8;
563 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
564 if (buf == NULL)
565 return NULL;
566 utf8 = PyUnicode_AsUTF8String(buf);
567 Py_DECREF(buf);
568 return utf8;
570 #endif
572 /* Decode a byte string STR for use as the buffer of TOK.
573 Look for encoding declarations inside STR, and record them
574 inside TOK. */
576 static const char *
577 decode_str(const char *str, struct tok_state *tok)
579 PyObject* utf8 = NULL;
580 const char *s;
581 const char *newl[2] = {NULL, NULL};
582 int lineno = 0;
583 tok->enc = NULL;
584 tok->str = str;
585 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
586 return error_ret(tok);
587 str = tok->str; /* string after BOM if any */
588 assert(str);
589 #ifdef Py_USING_UNICODE
590 if (tok->enc != NULL) {
591 utf8 = translate_into_utf8(str, tok->enc);
592 if (utf8 == NULL)
593 return error_ret(tok);
594 str = PyString_AsString(utf8);
596 #endif
597 for (s = str;; s++) {
598 if (*s == '\0') break;
599 else if (*s == '\n') {
600 assert(lineno < 2);
601 newl[lineno] = s;
602 lineno++;
603 if (lineno == 2) break;
606 tok->enc = NULL;
607 /* need to check line 1 and 2 separately since check_coding_spec
608 assumes a single line as input */
609 if (newl[0]) {
610 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
611 return error_ret(tok);
612 if (tok->enc == NULL && newl[1]) {
613 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
614 tok, buf_setreadl))
615 return error_ret(tok);
618 #ifdef Py_USING_UNICODE
619 if (tok->enc != NULL) {
620 assert(utf8 == NULL);
621 utf8 = translate_into_utf8(str, tok->enc);
622 if (utf8 == NULL)
623 return error_ret(tok);
624 str = PyString_AsString(utf8);
626 #endif
627 assert(tok->decoding_buffer == NULL);
628 tok->decoding_buffer = utf8; /* CAUTION */
629 return str;
632 #endif /* PGEN */
634 /* Set up tokenizer for string */
636 struct tok_state *
637 PyTokenizer_FromString(const char *str)
639 struct tok_state *tok = tok_new();
640 if (tok == NULL)
641 return NULL;
642 str = (char *)decode_str(str, tok);
643 if (str == NULL) {
644 PyTokenizer_Free(tok);
645 return NULL;
648 /* XXX: constify members. */
649 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
650 return tok;
654 /* Set up tokenizer for file */
656 struct tok_state *
657 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
659 struct tok_state *tok = tok_new();
660 if (tok == NULL)
661 return NULL;
662 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
663 PyTokenizer_Free(tok);
664 return NULL;
666 tok->cur = tok->inp = tok->buf;
667 tok->end = tok->buf + BUFSIZ;
668 tok->fp = fp;
669 tok->prompt = ps1;
670 tok->nextprompt = ps2;
671 return tok;
675 /* Free a tok_state structure */
677 void
678 PyTokenizer_Free(struct tok_state *tok)
680 if (tok->encoding != NULL)
681 PyMem_FREE(tok->encoding);
682 #ifndef PGEN
683 Py_XDECREF(tok->decoding_readline);
684 Py_XDECREF(tok->decoding_buffer);
685 #endif
686 if (tok->fp != NULL && tok->buf != NULL)
687 PyMem_FREE(tok->buf);
688 PyMem_FREE(tok);
691 #if !defined(PGEN) && defined(Py_USING_UNICODE)
692 static int
693 tok_stdin_decode(struct tok_state *tok, char **inp)
695 PyObject *enc, *sysstdin, *decoded, *utf8;
696 const char *encoding;
697 char *converted;
699 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
700 return 0;
701 sysstdin = PySys_GetObject("stdin");
702 if (sysstdin == NULL || !PyFile_Check(sysstdin))
703 return 0;
705 enc = ((PyFileObject *)sysstdin)->f_encoding;
706 if (enc == NULL || !PyString_Check(enc))
707 return 0;
708 Py_INCREF(enc);
710 encoding = PyString_AsString(enc);
711 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
712 if (decoded == NULL)
713 goto error_clear;
715 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
716 Py_DECREF(decoded);
717 if (utf8 == NULL)
718 goto error_clear;
720 assert(PyString_Check(utf8));
721 converted = new_string(PyString_AS_STRING(utf8),
722 PyString_GET_SIZE(utf8));
723 Py_DECREF(utf8);
724 if (converted == NULL)
725 goto error_nomem;
727 PyMem_FREE(*inp);
728 *inp = converted;
729 if (tok->encoding != NULL)
730 PyMem_FREE(tok->encoding);
731 tok->encoding = new_string(encoding, strlen(encoding));
732 if (tok->encoding == NULL)
733 goto error_nomem;
735 Py_DECREF(enc);
736 return 0;
738 error_nomem:
739 Py_DECREF(enc);
740 tok->done = E_NOMEM;
741 return -1;
743 error_clear:
744 /* Fallback to iso-8859-1: for backward compatibility */
745 Py_DECREF(enc);
746 PyErr_Clear();
747 return 0;
749 #endif
751 /* Get next char, updating state; error code goes into tok->done */
753 static int
754 tok_nextc(register struct tok_state *tok)
756 for (;;) {
757 if (tok->cur != tok->inp) {
758 return Py_CHARMASK(*tok->cur++); /* Fast path */
760 if (tok->done != E_OK)
761 return EOF;
762 if (tok->fp == NULL) {
763 char *end = strchr(tok->inp, '\n');
764 if (end != NULL)
765 end++;
766 else {
767 end = strchr(tok->inp, '\0');
768 if (end == tok->inp) {
769 tok->done = E_EOF;
770 return EOF;
773 if (tok->start == NULL)
774 tok->buf = tok->cur;
775 tok->line_start = tok->cur;
776 tok->lineno++;
777 tok->inp = end;
778 return Py_CHARMASK(*tok->cur++);
780 if (tok->prompt != NULL) {
781 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
782 if (tok->nextprompt != NULL)
783 tok->prompt = tok->nextprompt;
784 if (newtok == NULL)
785 tok->done = E_INTR;
786 else if (*newtok == '\0') {
787 PyMem_FREE(newtok);
788 tok->done = E_EOF;
790 #if !defined(PGEN) && defined(Py_USING_UNICODE)
791 else if (tok_stdin_decode(tok, &newtok) != 0)
792 PyMem_FREE(newtok);
793 #endif
794 else if (tok->start != NULL) {
795 size_t start = tok->start - tok->buf;
796 size_t oldlen = tok->cur - tok->buf;
797 size_t newlen = oldlen + strlen(newtok);
798 char *buf = tok->buf;
799 buf = (char *)PyMem_REALLOC(buf, newlen+1);
800 tok->lineno++;
801 if (buf == NULL) {
802 PyMem_FREE(tok->buf);
803 tok->buf = NULL;
804 PyMem_FREE(newtok);
805 tok->done = E_NOMEM;
806 return EOF;
808 tok->buf = buf;
809 tok->cur = tok->buf + oldlen;
810 tok->line_start = tok->cur;
811 strcpy(tok->buf + oldlen, newtok);
812 PyMem_FREE(newtok);
813 tok->inp = tok->buf + newlen;
814 tok->end = tok->inp + 1;
815 tok->start = tok->buf + start;
817 else {
818 tok->lineno++;
819 if (tok->buf != NULL)
820 PyMem_FREE(tok->buf);
821 tok->buf = newtok;
822 tok->line_start = tok->buf;
823 tok->cur = tok->buf;
824 tok->line_start = tok->buf;
825 tok->inp = strchr(tok->buf, '\0');
826 tok->end = tok->inp + 1;
829 else {
830 int done = 0;
831 Py_ssize_t cur = 0;
832 char *pt;
833 if (tok->start == NULL) {
834 if (tok->buf == NULL) {
835 tok->buf = (char *)
836 PyMem_MALLOC(BUFSIZ);
837 if (tok->buf == NULL) {
838 tok->done = E_NOMEM;
839 return EOF;
841 tok->end = tok->buf + BUFSIZ;
843 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
844 tok) == NULL) {
845 tok->done = E_EOF;
846 done = 1;
848 else {
849 tok->done = E_OK;
850 tok->inp = strchr(tok->buf, '\0');
851 done = tok->inp[-1] == '\n';
854 else {
855 cur = tok->cur - tok->buf;
856 if (decoding_feof(tok)) {
857 tok->done = E_EOF;
858 done = 1;
860 else
861 tok->done = E_OK;
863 tok->lineno++;
864 /* Read until '\n' or EOF */
865 while (!done) {
866 Py_ssize_t curstart = tok->start == NULL ? -1 :
867 tok->start - tok->buf;
868 Py_ssize_t curvalid = tok->inp - tok->buf;
869 Py_ssize_t newsize = curvalid + BUFSIZ;
870 char *newbuf = tok->buf;
871 newbuf = (char *)PyMem_REALLOC(newbuf,
872 newsize);
873 if (newbuf == NULL) {
874 tok->done = E_NOMEM;
875 tok->cur = tok->inp;
876 return EOF;
878 tok->buf = newbuf;
879 tok->inp = tok->buf + curvalid;
880 tok->end = tok->buf + newsize;
881 tok->start = curstart < 0 ? NULL :
882 tok->buf + curstart;
883 if (decoding_fgets(tok->inp,
884 (int)(tok->end - tok->inp),
885 tok) == NULL) {
886 /* Break out early on decoding
887 errors, as tok->buf will be NULL
889 if (tok->decoding_erred)
890 return EOF;
891 /* Last line does not end in \n,
892 fake one */
893 strcpy(tok->inp, "\n");
895 tok->inp = strchr(tok->inp, '\0');
896 done = tok->inp[-1] == '\n';
898 if (tok->buf != NULL) {
899 tok->cur = tok->buf + cur;
900 tok->line_start = tok->cur;
901 /* replace "\r\n" with "\n" */
902 /* For Mac leave the \r, giving a syntax error */
903 pt = tok->inp - 2;
904 if (pt >= tok->buf && *pt == '\r') {
905 *pt++ = '\n';
906 *pt = '\0';
907 tok->inp = pt;
911 if (tok->done != E_OK) {
912 if (tok->prompt != NULL)
913 PySys_WriteStderr("\n");
914 tok->cur = tok->inp;
915 return EOF;
918 /*NOTREACHED*/
922 /* Back-up one character */
924 static void
925 tok_backup(register struct tok_state *tok, register int c)
927 if (c != EOF) {
928 if (--tok->cur < tok->buf)
929 Py_FatalError("tok_backup: begin of buffer");
930 if (*tok->cur != c)
931 *tok->cur = c;
936 /* Return the token corresponding to a single character */
939 PyToken_OneChar(int c)
941 switch (c) {
942 case '(': return LPAR;
943 case ')': return RPAR;
944 case '[': return LSQB;
945 case ']': return RSQB;
946 case ':': return COLON;
947 case ',': return COMMA;
948 case ';': return SEMI;
949 case '+': return PLUS;
950 case '-': return MINUS;
951 case '*': return STAR;
952 case '/': return SLASH;
953 case '|': return VBAR;
954 case '&': return AMPER;
955 case '<': return LESS;
956 case '>': return GREATER;
957 case '=': return EQUAL;
958 case '.': return DOT;
959 case '%': return PERCENT;
960 case '`': return BACKQUOTE;
961 case '{': return LBRACE;
962 case '}': return RBRACE;
963 case '^': return CIRCUMFLEX;
964 case '~': return TILDE;
965 case '@': return AT;
966 default: return OP;
972 PyToken_TwoChars(int c1, int c2)
974 switch (c1) {
975 case '=':
976 switch (c2) {
977 case '=': return EQEQUAL;
979 break;
980 case '!':
981 switch (c2) {
982 case '=': return NOTEQUAL;
984 break;
985 case '<':
986 switch (c2) {
987 case '>': return NOTEQUAL;
988 case '=': return LESSEQUAL;
989 case '<': return LEFTSHIFT;
991 break;
992 case '>':
993 switch (c2) {
994 case '=': return GREATEREQUAL;
995 case '>': return RIGHTSHIFT;
997 break;
998 case '+':
999 switch (c2) {
1000 case '=': return PLUSEQUAL;
1002 break;
1003 case '-':
1004 switch (c2) {
1005 case '=': return MINEQUAL;
1007 break;
1008 case '*':
1009 switch (c2) {
1010 case '*': return DOUBLESTAR;
1011 case '=': return STAREQUAL;
1013 break;
1014 case '/':
1015 switch (c2) {
1016 case '/': return DOUBLESLASH;
1017 case '=': return SLASHEQUAL;
1019 break;
1020 case '|':
1021 switch (c2) {
1022 case '=': return VBAREQUAL;
1024 break;
1025 case '%':
1026 switch (c2) {
1027 case '=': return PERCENTEQUAL;
1029 break;
1030 case '&':
1031 switch (c2) {
1032 case '=': return AMPEREQUAL;
1034 break;
1035 case '^':
1036 switch (c2) {
1037 case '=': return CIRCUMFLEXEQUAL;
1039 break;
1041 return OP;
1045 PyToken_ThreeChars(int c1, int c2, int c3)
1047 switch (c1) {
1048 case '<':
1049 switch (c2) {
1050 case '<':
1051 switch (c3) {
1052 case '=':
1053 return LEFTSHIFTEQUAL;
1055 break;
1057 break;
1058 case '>':
1059 switch (c2) {
1060 case '>':
1061 switch (c3) {
1062 case '=':
1063 return RIGHTSHIFTEQUAL;
1065 break;
1067 break;
1068 case '*':
1069 switch (c2) {
1070 case '*':
1071 switch (c3) {
1072 case '=':
1073 return DOUBLESTAREQUAL;
1075 break;
1077 break;
1078 case '/':
1079 switch (c2) {
1080 case '/':
1081 switch (c3) {
1082 case '=':
1083 return DOUBLESLASHEQUAL;
1085 break;
1087 break;
1089 return OP;
1092 static int
1093 indenterror(struct tok_state *tok)
1095 if (tok->alterror) {
1096 tok->done = E_TABSPACE;
1097 tok->cur = tok->inp;
1098 return 1;
1100 if (tok->altwarning) {
1101 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1102 "in indentation\n", tok->filename);
1103 tok->altwarning = 0;
1105 return 0;
1109 /* Get next token, after space stripping etc. */
1111 static int
1112 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1114 register int c;
1115 int blankline;
1117 *p_start = *p_end = NULL;
1118 nextline:
1119 tok->start = NULL;
1120 blankline = 0;
1122 /* Get indentation level */
1123 if (tok->atbol) {
1124 register int col = 0;
1125 register int altcol = 0;
1126 tok->atbol = 0;
1127 for (;;) {
1128 c = tok_nextc(tok);
1129 if (c == ' ')
1130 col++, altcol++;
1131 else if (c == '\t') {
1132 col = (col/tok->tabsize + 1) * tok->tabsize;
1133 altcol = (altcol/tok->alttabsize + 1)
1134 * tok->alttabsize;
1136 else if (c == '\014') /* Control-L (formfeed) */
1137 col = altcol = 0; /* For Emacs users */
1138 else
1139 break;
1141 tok_backup(tok, c);
1142 if (c == '#' || c == '\n') {
1143 /* Lines with only whitespace and/or comments
1144 shouldn't affect the indentation and are
1145 not passed to the parser as NEWLINE tokens,
1146 except *totally* empty lines in interactive
1147 mode, which signal the end of a command group. */
1148 if (col == 0 && c == '\n' && tok->prompt != NULL)
1149 blankline = 0; /* Let it through */
1150 else
1151 blankline = 1; /* Ignore completely */
1152 /* We can't jump back right here since we still
1153 may need to skip to the end of a comment */
1155 if (!blankline && tok->level == 0) {
1156 if (col == tok->indstack[tok->indent]) {
1157 /* No change */
1158 if (altcol != tok->altindstack[tok->indent]) {
1159 if (indenterror(tok))
1160 return ERRORTOKEN;
1163 else if (col > tok->indstack[tok->indent]) {
1164 /* Indent -- always one */
1165 if (tok->indent+1 >= MAXINDENT) {
1166 tok->done = E_TOODEEP;
1167 tok->cur = tok->inp;
1168 return ERRORTOKEN;
1170 if (altcol <= tok->altindstack[tok->indent]) {
1171 if (indenterror(tok))
1172 return ERRORTOKEN;
1174 tok->pendin++;
1175 tok->indstack[++tok->indent] = col;
1176 tok->altindstack[tok->indent] = altcol;
1178 else /* col < tok->indstack[tok->indent] */ {
1179 /* Dedent -- any number, must be consistent */
1180 while (tok->indent > 0 &&
1181 col < tok->indstack[tok->indent]) {
1182 tok->pendin--;
1183 tok->indent--;
1185 if (col != tok->indstack[tok->indent]) {
1186 tok->done = E_DEDENT;
1187 tok->cur = tok->inp;
1188 return ERRORTOKEN;
1190 if (altcol != tok->altindstack[tok->indent]) {
1191 if (indenterror(tok))
1192 return ERRORTOKEN;
1198 tok->start = tok->cur;
1200 /* Return pending indents/dedents */
1201 if (tok->pendin != 0) {
1202 if (tok->pendin < 0) {
1203 tok->pendin++;
1204 return DEDENT;
1206 else {
1207 tok->pendin--;
1208 return INDENT;
1212 again:
1213 tok->start = NULL;
1214 /* Skip spaces */
1215 do {
1216 c = tok_nextc(tok);
1217 } while (c == ' ' || c == '\t' || c == '\014');
1219 /* Set start of current token */
1220 tok->start = tok->cur - 1;
1222 /* Skip comment, while looking for tab-setting magic */
1223 if (c == '#') {
1224 static char *tabforms[] = {
1225 "tab-width:", /* Emacs */
1226 ":tabstop=", /* vim, full form */
1227 ":ts=", /* vim, abbreviated form */
1228 "set tabsize=", /* will vi never die? */
1229 /* more templates can be added here to support other editors */
1231 char cbuf[80];
1232 char *tp, **cp;
1233 tp = cbuf;
1234 do {
1235 *tp++ = c = tok_nextc(tok);
1236 } while (c != EOF && c != '\n' &&
1237 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1238 *tp = '\0';
1239 for (cp = tabforms;
1240 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1241 cp++) {
1242 if ((tp = strstr(cbuf, *cp))) {
1243 int newsize = atoi(tp + strlen(*cp));
1245 if (newsize >= 1 && newsize <= 40) {
1246 tok->tabsize = newsize;
1247 if (Py_VerboseFlag)
1248 PySys_WriteStderr(
1249 "Tab size set to %d\n",
1250 newsize);
1254 while (c != EOF && c != '\n')
1255 c = tok_nextc(tok);
1258 /* Check for EOF and errors now */
1259 if (c == EOF) {
1260 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1263 /* Identifier (most frequent token!) */
1264 if (isalpha(c) || c == '_') {
1265 /* Process r"", u"" and ur"" */
1266 switch (c) {
1267 case 'b':
1268 case 'B':
1269 c = tok_nextc(tok);
1270 if (c == 'r' || c == 'R')
1271 c = tok_nextc(tok);
1272 if (c == '"' || c == '\'')
1273 goto letter_quote;
1274 break;
1275 case 'r':
1276 case 'R':
1277 c = tok_nextc(tok);
1278 if (c == '"' || c == '\'')
1279 goto letter_quote;
1280 break;
1281 case 'u':
1282 case 'U':
1283 c = tok_nextc(tok);
1284 if (c == 'r' || c == 'R')
1285 c = tok_nextc(tok);
1286 if (c == '"' || c == '\'')
1287 goto letter_quote;
1288 break;
1290 while (isalnum(c) || c == '_') {
1291 c = tok_nextc(tok);
1293 tok_backup(tok, c);
1294 *p_start = tok->start;
1295 *p_end = tok->cur;
1296 return NAME;
1299 /* Newline */
1300 if (c == '\n') {
1301 tok->atbol = 1;
1302 if (blankline || tok->level > 0)
1303 goto nextline;
1304 *p_start = tok->start;
1305 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1306 tok->cont_line = 0;
1307 return NEWLINE;
1310 /* Period or number starting with period? */
1311 if (c == '.') {
1312 c = tok_nextc(tok);
1313 if (isdigit(c)) {
1314 goto fraction;
1316 else {
1317 tok_backup(tok, c);
1318 *p_start = tok->start;
1319 *p_end = tok->cur;
1320 return DOT;
1324 /* Number */
1325 if (isdigit(c)) {
1326 if (c == '0') {
1327 /* Hex, octal or binary -- maybe. */
1328 c = tok_nextc(tok);
1329 if (c == '.')
1330 goto fraction;
1331 #ifndef WITHOUT_COMPLEX
1332 if (c == 'j' || c == 'J')
1333 goto imaginary;
1334 #endif
1335 if (c == 'x' || c == 'X') {
1337 /* Hex */
1338 c = tok_nextc(tok);
1339 if (!isxdigit(c)) {
1340 tok->done = E_TOKEN;
1341 tok_backup(tok, c);
1342 return ERRORTOKEN;
1344 do {
1345 c = tok_nextc(tok);
1346 } while (isxdigit(c));
1348 else if (c == 'o' || c == 'O') {
1349 /* Octal */
1350 c = tok_nextc(tok);
1351 if (c < '0' || c >= '8') {
1352 tok->done = E_TOKEN;
1353 tok_backup(tok, c);
1354 return ERRORTOKEN;
1356 do {
1357 c = tok_nextc(tok);
1358 } while ('0' <= c && c < '8');
1360 else if (c == 'b' || c == 'B') {
1361 /* Binary */
1362 c = tok_nextc(tok);
1363 if (c != '0' && c != '1') {
1364 tok->done = E_TOKEN;
1365 tok_backup(tok, c);
1366 return ERRORTOKEN;
1368 do {
1369 c = tok_nextc(tok);
1370 } while (c == '0' || c == '1');
1372 else {
1373 int found_decimal = 0;
1374 /* Octal; c is first char of it */
1375 /* There's no 'isoctdigit' macro, sigh */
1376 while ('0' <= c && c < '8') {
1377 c = tok_nextc(tok);
1379 if (isdigit(c)) {
1380 found_decimal = 1;
1381 do {
1382 c = tok_nextc(tok);
1383 } while (isdigit(c));
1385 if (c == '.')
1386 goto fraction;
1387 else if (c == 'e' || c == 'E')
1388 goto exponent;
1389 #ifndef WITHOUT_COMPLEX
1390 else if (c == 'j' || c == 'J')
1391 goto imaginary;
1392 #endif
1393 else if (found_decimal) {
1394 tok->done = E_TOKEN;
1395 tok_backup(tok, c);
1396 return ERRORTOKEN;
1399 if (c == 'l' || c == 'L')
1400 c = tok_nextc(tok);
1402 else {
1403 /* Decimal */
1404 do {
1405 c = tok_nextc(tok);
1406 } while (isdigit(c));
1407 if (c == 'l' || c == 'L')
1408 c = tok_nextc(tok);
1409 else {
1410 /* Accept floating point numbers. */
1411 if (c == '.') {
1412 fraction:
1413 /* Fraction */
1414 do {
1415 c = tok_nextc(tok);
1416 } while (isdigit(c));
1418 if (c == 'e' || c == 'E') {
1419 exponent:
1420 /* Exponent part */
1421 c = tok_nextc(tok);
1422 if (c == '+' || c == '-')
1423 c = tok_nextc(tok);
1424 if (!isdigit(c)) {
1425 tok->done = E_TOKEN;
1426 tok_backup(tok, c);
1427 return ERRORTOKEN;
1429 do {
1430 c = tok_nextc(tok);
1431 } while (isdigit(c));
1433 #ifndef WITHOUT_COMPLEX
1434 if (c == 'j' || c == 'J')
1435 /* Imaginary part */
1436 imaginary:
1437 c = tok_nextc(tok);
1438 #endif
1441 tok_backup(tok, c);
1442 *p_start = tok->start;
1443 *p_end = tok->cur;
1444 return NUMBER;
1447 letter_quote:
1448 /* String */
1449 if (c == '\'' || c == '"') {
1450 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1451 int quote = c;
1452 int triple = 0;
1453 int tripcount = 0;
1454 for (;;) {
1455 c = tok_nextc(tok);
1456 if (c == '\n') {
1457 if (!triple) {
1458 tok->done = E_EOLS;
1459 tok_backup(tok, c);
1460 return ERRORTOKEN;
1462 tripcount = 0;
1463 tok->cont_line = 1; /* multiline string. */
1465 else if (c == EOF) {
1466 if (triple)
1467 tok->done = E_EOFS;
1468 else
1469 tok->done = E_EOLS;
1470 tok->cur = tok->inp;
1471 return ERRORTOKEN;
1473 else if (c == quote) {
1474 tripcount++;
1475 if (tok->cur - tok->start == quote2) {
1476 c = tok_nextc(tok);
1477 if (c == quote) {
1478 triple = 1;
1479 tripcount = 0;
1480 continue;
1482 tok_backup(tok, c);
1484 if (!triple || tripcount == 3)
1485 break;
1487 else if (c == '\\') {
1488 tripcount = 0;
1489 c = tok_nextc(tok);
1490 if (c == EOF) {
1491 tok->done = E_EOLS;
1492 tok->cur = tok->inp;
1493 return ERRORTOKEN;
1496 else
1497 tripcount = 0;
1499 *p_start = tok->start;
1500 *p_end = tok->cur;
1501 return STRING;
1504 /* Line continuation */
1505 if (c == '\\') {
1506 c = tok_nextc(tok);
1507 if (c != '\n') {
1508 tok->done = E_LINECONT;
1509 tok->cur = tok->inp;
1510 return ERRORTOKEN;
1512 tok->cont_line = 1;
1513 goto again; /* Read next line */
1516 /* Check for two-character token */
1518 int c2 = tok_nextc(tok);
1519 int token = PyToken_TwoChars(c, c2);
1520 #ifndef PGEN
1521 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1522 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1523 "<> not supported in 3.x; use !=",
1524 tok->filename, tok->lineno,
1525 NULL, NULL)) {
1526 return ERRORTOKEN;
1529 #endif
1530 if (token != OP) {
1531 int c3 = tok_nextc(tok);
1532 int token3 = PyToken_ThreeChars(c, c2, c3);
1533 if (token3 != OP) {
1534 token = token3;
1535 } else {
1536 tok_backup(tok, c3);
1538 *p_start = tok->start;
1539 *p_end = tok->cur;
1540 return token;
1542 tok_backup(tok, c2);
1545 /* Keep track of parentheses nesting level */
1546 switch (c) {
1547 case '(':
1548 case '[':
1549 case '{':
1550 tok->level++;
1551 break;
1552 case ')':
1553 case ']':
1554 case '}':
1555 tok->level--;
1556 break;
1559 /* Punctuation character */
1560 *p_start = tok->start;
1561 *p_end = tok->cur;
1562 return PyToken_OneChar(c);
1566 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1568 int result = tok_get(tok, p_start, p_end);
1569 if (tok->decoding_erred) {
1570 result = ERRORTOKEN;
1571 tok->done = E_DECODE;
1573 return result;
1576 /* This function is only called from parsetok. However, it cannot live
1577 there, as it must be empty for PGEN, and we can check for PGEN only
1578 in this file. */
1580 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1581 char*
1582 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1584 return NULL;
1586 #else
1587 #ifdef Py_USING_UNICODE
1588 static PyObject *
1589 dec_utf8(const char *enc, const char *text, size_t len) {
1590 PyObject *ret = NULL;
1591 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1592 if (unicode_text) {
1593 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1594 Py_DECREF(unicode_text);
1596 if (!ret) {
1597 PyErr_Clear();
1599 return ret;
/* Re-encode the first LEN bytes of tok->buf (which the tokenizer keeps
   as UTF-8 -- see the dec_utf8 calls below) back into the source
   file's declared encoding, so error messages can show the line as the
   user wrote it.  Returns a PyObject_MALLOC'ed, NUL-terminated copy of
   the re-encoded line (caller owns it), or NULL if no encoding is set
   or conversion fails.  *offset, a 1-based column within the UTF-8
   buffer, is rewritten in place to the matching column in the
   re-encoded text. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
	char *text = NULL;
	if (tok->encoding) {
		/* convert source to original encoding */
		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
		if (lineobj != NULL) {
			int linelen = PyString_Size(lineobj);
			const char *line = PyString_AsString(lineobj);
			text = PyObject_MALLOC(linelen + 1);
			if (text != NULL && line != NULL) {
				if (linelen)
					strncpy(text, line, linelen);
				/* strncpy does not guarantee termination */
				text[linelen] = '\0';
			}
			Py_DECREF(lineobj);
		}
		/* adjust error offset: re-encode just the prefix before the
		   error column and measure its length in the original
		   encoding */
		if (*offset > 1) {
			PyObject *offsetobj = dec_utf8(tok->encoding,
						       tok->buf, *offset-1);
			if (offsetobj) {
				*offset = PyString_Size(offsetobj) + 1;
				Py_DECREF(offsetobj);
			}
		}
	}
	return text;
1634 #endif /* defined(Py_USING_UNICODE) */
1635 #endif
1638 #ifdef Py_DEBUG
1640 void
1641 tok_dump(int type, char *start, char *end)
1643 printf("%s", _PyParser_TokenNames[type]);
1644 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1645 printf("(%.*s)", (int)(end - start), start);
1648 #endif