Parser/tokenizer.c

   1
   2 /* Tokenizer implementation */
   3
   4 #include "Python.h"
   5 #include "pgenheaders.h"
   6
   7 #include <ctype.h>
   8 #include <assert.h>
   9
  10 #include "tokenizer.h"
  11 #include "errcode.h"
  12
  13 #ifndef PGEN
  14 #include "unicodeobject.h"
  15 #include "stringobject.h"
  16 #include "fileobject.h"
  17 #include "codecs.h"
  18 #include "abstract.h"
  19 #include "pydebug.h"
  20 #endif /* PGEN */
  21
  22 extern char *PyOS_Readline(FILE *, FILE *, char *);
  23 /* Return malloc'ed string including trailing \n;
  24    empty malloc'ed string for EOF;
  25    NULL if interrupted */
  26
  27 /* Don't ever change this -- it would break the portability of Python code */
  28 #define TABSIZE 8
  29
  30 /* Forward */
  31 static struct tok_state *tok_new(void);
  32 static int tok_nextc(struct tok_state *tok);
  33 static void tok_backup(struct tok_state *tok, int c);
  34
  35 /* Token names */
  36
  37 char *_PyParser_TokenNames[] = {
  38         "ENDMARKER",
  39         "NAME",
  40         "NUMBER",
  41         "STRING",
  42         "NEWLINE",
  43         "INDENT",
  44         "DEDENT",
  45         "LPAR",
  46         "RPAR",
  47         "LSQB",
  48         "RSQB",
  49         "COLON",
  50         "COMMA",
  51         "SEMI",
  52         "PLUS",
  53         "MINUS",
  54         "STAR",
  55         "SLASH",
  56         "VBAR",
  57         "AMPER",
  58         "LESS",
  59         "GREATER",
  60         "EQUAL",
  61         "DOT",
  62         "PERCENT",
  63         "BACKQUOTE",
  64         "LBRACE",
  65         "RBRACE",
  66         "EQEQUAL",
  67         "NOTEQUAL",
  68         "LESSEQUAL",
  69         "GREATEREQUAL",
  70         "TILDE",
  71         "CIRCUMFLEX",
  72         "LEFTSHIFT",
  73         "RIGHTSHIFT",
  74         "DOUBLESTAR",
  75         "PLUSEQUAL",
  76         "MINEQUAL",
  77         "STAREQUAL",
  78         "SLASHEQUAL",
  79         "PERCENTEQUAL",
  80         "AMPEREQUAL",
  81         "VBAREQUAL",
  82         "CIRCUMFLEXEQUAL",
  83         "LEFTSHIFTEQUAL",
  84         "RIGHTSHIFTEQUAL",
  85         "DOUBLESTAREQUAL",
  86         "DOUBLESLASH",
  87         "DOUBLESLASHEQUAL",
  88         "AT",
  89         /* This table must match the #defines in token.h! */
  90         "OP",
  91         "<ERRORTOKEN>",
  92         "<N_TOKENS>"
  93 };
  94
  95
  96 /* Create and initialize a new tok_state structure */
  97
  98 static struct tok_state *
  99 tok_new(void)
 100 {
 101         struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
 102                                                 sizeof(struct tok_state));
 103         if (tok == NULL)
 104                 return NULL;
 105         tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
 106         tok->done = E_OK;
 107         tok->fp = NULL;
 108         tok->tabsize = TABSIZE;
 109         tok->indent = 0;
 110         tok->indstack[0] = 0;
 111         tok->atbol = 1;
 112         tok->pendin = 0;
 113         tok->prompt = tok->nextprompt = NULL;
 114         tok->lineno = 0;
 115         tok->level = 0;
 116         tok->filename = NULL;
 117         tok->altwarning = 0;
 118         tok->alterror = 0;
 119         tok->alttabsize = 1;
 120         tok->altindstack[0] = 0;
 121         tok->decoding_state = 0;
 122         tok->decoding_erred = 0;
 123         tok->read_coding_spec = 0;
 124         tok->encoding = NULL;
 125         tok->cont_line = 0;
 126 #ifndef PGEN
 127         tok->decoding_readline = NULL;
 128         tok->decoding_buffer = NULL;
 129 #endif
 130         return tok;
 131 }
 132
 133 #ifdef PGEN
 134
 135 static char *
 136 decoding_fgets(char *s, int size, struct tok_state *tok)
 137 {
 138         return fgets(s, size, tok->fp);
 139 }
 140
 141 static int
 142 decoding_feof(struct tok_state *tok)
 143 {
 144         return feof(tok->fp);
 145 }
 146
 147 static const char *
 148 decode_str(const char *str, struct tok_state *tok)
 149 {
 150         return str;
 151 }
 152
 153 #else /* PGEN */
 154
 155 static char *
 156 error_ret(struct tok_state *tok) /* XXX */
 157 {
 158         tok->decoding_erred = 1;
 159         if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
 160                 PyMem_FREE(tok->buf);
 161         tok->buf = NULL;
 162         return NULL;            /* as if it were EOF */
 163 }
 164
 165 static char *
 166 new_string(const char *s, Py_ssize_t len)
 167 {
 168         char* result = (char *)PyMem_MALLOC(len + 1);
 169         if (result != NULL) {
 170                 memcpy(result, s, len);
 171                 result[len] = '\0';
 172         }
 173         return result;
 174 }
 175
 176 static char *
 177 get_normal_name(char *s)        /* for utf-8 and latin-1 */
 178 {
 179         char buf[13];
 180         int i;
 181         for (i = 0; i < 12; i++) {
 182                 int c = s[i];
 183                 if (c == '\0') break;
 184                 else if (c == '_') buf[i] = '-';
 185                 else buf[i] = tolower(c);
 186         }
 187         buf[i] = '\0';
 188         if (strcmp(buf, "utf-8") == 0 ||
 189             strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
 190         else if (strcmp(buf, "latin-1") == 0 ||
 191                  strcmp(buf, "iso-8859-1") == 0 ||
 192                  strcmp(buf, "iso-latin-1") == 0 ||
 193                  strncmp(buf, "latin-1-", 8) == 0 ||
 194                  strncmp(buf, "iso-8859-1-", 11) == 0 ||
 195                  strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
 196         else return s;
 197 }
 198
 199 /* Return the coding spec in S, or NULL if none is found.  */
 200
 201 static char *
 202 get_coding_spec(const char *s, Py_ssize_t size)
 203 {
 204         Py_ssize_t i;
 205         /* Coding spec must be in a comment, and that comment must be
 206          * the only statement on the source code line. */
 207         for (i = 0; i < size - 6; i++) {
 208                 if (s[i] == '#')
 209                         break;
 210                 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
 211                         return NULL;
 212         }
 213         for (; i < size - 6; i++) { /* XXX inefficient search */
 214                 const char* t = s + i;
 215                 if (strncmp(t, "coding", 6) == 0) {
 216                         const char* begin = NULL;
 217                         t += 6;
 218                         if (t[0] != ':' && t[0] != '=')
 219                                 continue;
 220                         do {
 221                                 t++;
 222                         } while (t[0] == '\x20' || t[0] == '\t');
 223
 224                         begin = t;
 225                         while (isalnum(Py_CHARMASK(t[0])) ||
 226                                t[0] == '-' || t[0] == '_' || t[0] == '.')
 227                                 t++;
 228
 229                         if (begin < t) {
 230                                 char* r = new_string(begin, t - begin);
 231                                 char* q = get_normal_name(r);
 232                                 if (r != q) {
 233                                         PyMem_FREE(r);
 234                                         r = new_string(q, strlen(q));
 235                                 }
 236                                 return r;
 237                         }
 238                 }
 239         }
 240         return NULL;
 241 }
 242
 243 /* Check whether the line contains a coding spec. If it does,
 244    invoke the set_readline function for the new encoding.
 245    This function receives the tok_state and the new encoding.
 246    Return 1 on success, 0 on failure.  */
 247
 248 static int
 249 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 250                   int set_readline(struct tok_state *, const char *))
 251 {
 252         char * cs;
 253         int r = 1;
 254
 255         if (tok->cont_line)
 256                 /* It's a continuation line, so it can't be a coding spec. */
 257                 return 1;
 258         cs = get_coding_spec(line, size);
 259         if (cs != NULL) {
 260                 tok->read_coding_spec = 1;
 261                 if (tok->encoding == NULL) {
 262                         assert(tok->decoding_state == 1); /* raw */
 263                         if (strcmp(cs, "utf-8") == 0 ||
 264                             strcmp(cs, "iso-8859-1") == 0) {
 265                                 tok->encoding = cs;
 266                         } else {
 267 #ifdef Py_USING_UNICODE
 268                                 r = set_readline(tok, cs);
 269                                 if (r) {
 270                                         tok->encoding = cs;
 271                                         tok->decoding_state = -1;
 272                                 }
 273                                 else
 274                                         PyMem_FREE(cs);
 275 #else
 276                                 /* Without Unicode support, we cannot
 277                                    process the coding spec. Since there
 278                                    won't be any Unicode literals, that
 279                                    won't matter. */
 280                                 PyMem_FREE(cs);
 281 #endif
 282                         }
 283                 } else {        /* then, compare cs with BOM */
 284                         r = (strcmp(tok->encoding, cs) == 0);
 285                         PyMem_FREE(cs);
 286                 }
 287         }
 288         if (!r) {
 289                 cs = tok->encoding;
 290                 if (!cs)
 291                         cs = "with BOM";
 292                 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
 293         }
 294         return r;
 295 }
 296
 297 /* See whether the file starts with a BOM. If it does,
 298    invoke the set_readline function with the new encoding.
 299    Return 1 on success, 0 on failure.  */
 300
 301 static int
 302 check_bom(int get_char(struct tok_state *),
 303           void unget_char(int, struct tok_state *),
 304           int set_readline(struct tok_state *, const char *),
 305           struct tok_state *tok)
 306 {
 307         int ch = get_char(tok);
 308         tok->decoding_state = 1;
 309         if (ch == EOF) {
 310                 return 1;
 311         } else if (ch == 0xEF) {
 312                 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
 313                 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
 314 #if 0
 315         /* Disable support for UTF-16 BOMs until a decision
 316            is made whether this needs to be supported.  */
 317         } else if (ch == 0xFE) {
 318                 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
 319                 if (!set_readline(tok, "utf-16-be")) return 0;
 320                 tok->decoding_state = -1;
 321         } else if (ch == 0xFF) {
 322                 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
 323                 if (!set_readline(tok, "utf-16-le")) return 0;
 324                 tok->decoding_state = -1;
 325 #endif
 326         } else {
 327                 unget_char(ch, tok);
 328                 return 1;
 329         }
 330         if (tok->encoding != NULL)
 331                 PyMem_FREE(tok->encoding);
 332         tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
 333         return 1;
 334   NON_BOM:
 335         /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
 336         unget_char(0xFF, tok);  /* XXX this will cause a syntax error */
 337         return 1;
 338 }
 339
 340 /* Read a line of text from TOK into S, using the stream in TOK.
 341    Return NULL on failure, else S.
 342
 343    On entry, tok->decoding_buffer will be one of:
 344      1) NULL: need to call tok->decoding_readline to get a new line
 345      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
 346            stored the result in tok->decoding_buffer
 347      3) PyStringObject *: previous call to fp_readl did not have enough room
 348            (in the s buffer) to copy entire contents of the line read
 349            by tok->decoding_readline.  tok->decoding_buffer has the overflow.
 350            In this case, fp_readl is called in a loop (with an expanded buffer)
 351            until the buffer ends with a '\n' (or until the end of the file is
 352            reached): see tok_nextc and its calls to decoding_fgets.
 353 */
 354
 355 static char *
 356 fp_readl(char *s, int size, struct tok_state *tok)
 357 {
 358 #ifndef Py_USING_UNICODE
 359         /* In a non-Unicode built, this should never be called. */
 360         Py_FatalError("fp_readl should not be called in this build.");
 361         return NULL; /* Keep compiler happy (not reachable) */
 362 #else
 363         PyObject* utf8 = NULL;
 364         PyObject* buf = tok->decoding_buffer;
 365         char *str;
 366         Py_ssize_t utf8len;
 367
 368         /* Ask for one less byte so we can terminate it */
 369         assert(size > 0);
 370         size--;
 371
 372         if (buf == NULL) {
 373                 buf = PyObject_CallObject(tok->decoding_readline, NULL);
 374                 if (buf == NULL)
 375                         return error_ret(tok);
 376         } else {
 377                 tok->decoding_buffer = NULL;
 378                 if (PyString_CheckExact(buf))
 379                         utf8 = buf;
 380         }
 381         if (utf8 == NULL) {
 382                 utf8 = PyUnicode_AsUTF8String(buf);
 383                 Py_DECREF(buf);
 384                 if (utf8 == NULL)
 385                         return error_ret(tok);
 386         }
 387         str = PyString_AsString(utf8);
 388         utf8len = PyString_GET_SIZE(utf8);
 389         if (utf8len > size) {
 390                 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
 391                 if (tok->decoding_buffer == NULL) {
 392                         Py_DECREF(utf8);
 393                         return error_ret(tok);
 394                 }
 395                 utf8len = size;
 396         }
 397         memcpy(s, str, utf8len);
 398         s[utf8len] = '\0';
 399         Py_DECREF(utf8);
 400         if (utf8len == 0) return NULL; /* EOF */
 401         return s;
 402 #endif
 403 }
 404
 405 /* Set the readline function for TOK to a StreamReader's
 406    readline function. The StreamReader is named ENC.
 407
 408    This function is called from check_bom and check_coding_spec.
 409
 410    ENC is usually identical to the future value of tok->encoding,
 411    except for the (currently unsupported) case of UTF-16.
 412
 413    Return 1 on success, 0 on failure. */
 414
 415 static int
 416 fp_setreadl(struct tok_state *tok, const char* enc)
 417 {
 418         PyObject *reader, *stream, *readline;
 419
 420         /* XXX: constify filename argument. */
 421         stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
 422         if (stream == NULL)
 423                 return 0;
 424
 425         reader = PyCodec_StreamReader(enc, stream, NULL);
 426         Py_DECREF(stream);
 427         if (reader == NULL)
 428                 return 0;
 429
 430         readline = PyObject_GetAttrString(reader, "readline");
 431         Py_DECREF(reader);
 432         if (readline == NULL)
 433                 return 0;
 434
 435         tok->decoding_readline = readline;
 436         return 1;
 437 }
 438
 439 /* Fetch the next byte from TOK. */
 440
 441 static int fp_getc(struct tok_state *tok) {
 442         return getc(tok->fp);
 443 }
 444
 445 /* Unfetch the last byte back into TOK.  */
 446
 447 static void fp_ungetc(int c, struct tok_state *tok) {
 448         ungetc(c, tok->fp);
 449 }
 450
 451 /* Read a line of input from TOK. Determine encoding
 452    if necessary.  */
 453
 454 static char *
 455 decoding_fgets(char *s, int size, struct tok_state *tok)
 456 {
 457         char *line = NULL;
 458         int badchar = 0;
 459         for (;;) {
 460                 if (tok->decoding_state < 0) {
 461                         /* We already have a codec associated with
 462                            this input. */
 463                         line = fp_readl(s, size, tok);
 464                         break;
 465                 } else if (tok->decoding_state > 0) {
 466                         /* We want a 'raw' read. */
 467                         line = Py_UniversalNewlineFgets(s, size,
 468                                                         tok->fp, NULL);
 469                         break;
 470                 } else {
 471                         /* We have not yet determined the encoding.
 472                            If an encoding is found, use the file-pointer
 473                            reader functions from now on. */
 474                         if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 475                                 return error_ret(tok);
 476                         assert(tok->decoding_state != 0);
 477                 }
 478         }
 479         if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 480                 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 481                         return error_ret(tok);
 482                 }
 483         }
 484 #ifndef PGEN
 485         /* The default encoding is ASCII, so make sure we don't have any
 486            non-ASCII bytes in it. */
 487         if (line && !tok->encoding) {
 488                 unsigned char *c;
 489                 for (c = (unsigned char *)line; *c; c++)
 490                         if (*c > 127) {
 491                                 badchar = *c;
 492                                 break;
 493                         }
 494         }
 495         if (badchar) {
 496                 char buf[500];
 497                 /* Need to add 1 to the line number, since this line
 498                    has not been counted, yet.  */
 499                 sprintf(buf,
 500                         "Non-ASCII character '\\x%.2x' "
 501                         "in file %.200s on line %i, "
 502                         "but no encoding declared; "
 503                         "see http://www.python.org/peps/pep-0263.html for details",
 504                         badchar, tok->filename, tok->lineno + 1);
 505                 PyErr_SetString(PyExc_SyntaxError, buf);
 506                 return error_ret(tok);
 507         }
 508 #endif
 509         return line;
 510 }
 511
 512 static int
 513 decoding_feof(struct tok_state *tok)
 514 {
 515         if (tok->decoding_state >= 0) {
 516                 return feof(tok->fp);
 517         } else {
 518                 PyObject* buf = tok->decoding_buffer;
 519                 if (buf == NULL) {
 520                         buf = PyObject_CallObject(tok->decoding_readline, NULL);
 521                         if (buf == NULL) {
 522                                 error_ret(tok);
 523                                 return 1;
 524                         } else {
 525                                 tok->decoding_buffer = buf;
 526                         }
 527                 }
 528                 return PyObject_Length(buf) == 0;
 529         }
 530 }
 531
 532 /* Fetch a byte from TOK, using the string buffer. */
 533
 534 static int
 535 buf_getc(struct tok_state *tok) {
 536         return Py_CHARMASK(*tok->str++);
 537 }
 538
 539 /* Unfetch a byte from TOK, using the string buffer. */
 540
 541 static void
 542 buf_ungetc(int c, struct tok_state *tok) {
 543         tok->str--;
 544         assert(Py_CHARMASK(*tok->str) == c);    /* tok->cur may point to read-only segment */
 545 }
 546
 547 /* Set the readline function for TOK to ENC. For the string-based
 548    tokenizer, this means to just record the encoding. */
 549
 550 static int
 551 buf_setreadl(struct tok_state *tok, const char* enc) {
 552         tok->enc = enc;
 553         return 1;
 554 }
 555
 556 /* Return a UTF-8 encoding Python string object from the
 557    C byte string STR, which is encoded with ENC. */
 558
 559 #ifdef Py_USING_UNICODE
 560 static PyObject *
 561 translate_into_utf8(const char* str, const char* enc) {
 562         PyObject *utf8;
 563         PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
 564         if (buf == NULL)
 565                 return NULL;
 566         utf8 = PyUnicode_AsUTF8String(buf);
 567         Py_DECREF(buf);
 568         return utf8;
 569 }
 570 #endif
 571
 572 /* Decode a byte string STR for use as the buffer of TOK.
 573    Look for encoding declarations inside STR, and record them
 574    inside TOK.  */
 575
 576 static const char *
 577 decode_str(const char *str, struct tok_state *tok)
 578 {
 579         PyObject* utf8 = NULL;
 580         const char *s;
 581         const char *newl[2] = {NULL, NULL};
 582         int lineno = 0;
 583         tok->enc = NULL;
 584         tok->str = str;
 585         if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 586                 return error_ret(tok);
 587         str = tok->str;         /* string after BOM if any */
 588         assert(str);
 589 #ifdef Py_USING_UNICODE
 590         if (tok->enc != NULL) {
 591                 utf8 = translate_into_utf8(str, tok->enc);
 592                 if (utf8 == NULL)
 593                         return error_ret(tok);
 594                 str = PyString_AsString(utf8);
 595         }
 596 #endif
 597         for (s = str;; s++) {
 598                 if (*s == '\0') break;
 599                 else if (*s == '\n') {
 600                         assert(lineno < 2);
 601                         newl[lineno] = s;
 602                         lineno++;
 603                         if (lineno == 2) break;
 604                 }
 605         }
 606         tok->enc = NULL;
 607         /* need to check line 1 and 2 separately since check_coding_spec
 608            assumes a single line as input */
 609         if (newl[0]) {
 610                 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
 611                         return error_ret(tok);
 612                 if (tok->enc == NULL && newl[1]) {
 613                         if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
 614                                                tok, buf_setreadl))
 615                                 return error_ret(tok);
 616                 }
 617         }
 618 #ifdef Py_USING_UNICODE
 619         if (tok->enc != NULL) {
 620                 assert(utf8 == NULL);
 621                 utf8 = translate_into_utf8(str, tok->enc);
 622                 if (utf8 == NULL) {
 623                         PyErr_Format(PyExc_SyntaxError,
 624                                 "unknown encoding: %s", tok->enc);
 625                         return error_ret(tok);
 626                 }
 627                 str = PyString_AsString(utf8);
 628         }
 629 #endif
 630         assert(tok->decoding_buffer == NULL);
 631         tok->decoding_buffer = utf8; /* CAUTION */
 632         return str;
 633 }
 634
 635 #endif /* PGEN */
 636
 637 /* Set up tokenizer for string */
 638
 639 struct tok_state *
 640 PyTokenizer_FromString(const char *str)
 641 {
 642         struct tok_state *tok = tok_new();
 643         if (tok == NULL)
 644                 return NULL;
 645         str = (char *)decode_str(str, tok);
 646         if (str == NULL) {
 647                 PyTokenizer_Free(tok);
 648                 return NULL;
 649         }
 650
 651         /* XXX: constify members. */
 652         tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 653         return tok;
 654 }
 655
 656
 657 /* Set up tokenizer for file */
 658
 659 struct tok_state *
 660 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 661 {
 662         struct tok_state *tok = tok_new();
 663         if (tok == NULL)
 664                 return NULL;
 665         if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
 666                 PyTokenizer_Free(tok);
 667                 return NULL;
 668         }
 669         tok->cur = tok->inp = tok->buf;
 670         tok->end = tok->buf + BUFSIZ;
 671         tok->fp = fp;
 672         tok->prompt = ps1;
 673         tok->nextprompt = ps2;
 674         return tok;
 675 }
 676
 677
 678 /* Free a tok_state structure */
 679
 680 void
 681 PyTokenizer_Free(struct tok_state *tok)
 682 {
 683         if (tok->encoding != NULL)
 684                 PyMem_FREE(tok->encoding);
 685 #ifndef PGEN
 686         Py_XDECREF(tok->decoding_readline);
 687         Py_XDECREF(tok->decoding_buffer);
 688 #endif
 689         if (tok->fp != NULL && tok->buf != NULL)
 690                 PyMem_FREE(tok->buf);
 691         PyMem_FREE(tok);
 692 }
 693
 694 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 695 static int
 696 tok_stdin_decode(struct tok_state *tok, char **inp)
 697 {
 698         PyObject *enc, *sysstdin, *decoded, *utf8;
 699         const char *encoding;
 700         char *converted;
 701
 702         if (PySys_GetFile((char *)"stdin", NULL) != stdin)
 703                 return 0;
 704         sysstdin = PySys_GetObject("stdin");
 705         if (sysstdin == NULL || !PyFile_Check(sysstdin))
 706                 return 0;
 707
 708         enc = ((PyFileObject *)sysstdin)->f_encoding;
 709         if (enc == NULL || !PyString_Check(enc))
 710                 return 0;
 711         Py_INCREF(enc);
 712
 713         encoding = PyString_AsString(enc);
 714         decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
 715         if (decoded == NULL)
 716                 goto error_clear;
 717
 718         utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
 719         Py_DECREF(decoded);
 720         if (utf8 == NULL)
 721                 goto error_clear;
 722
 723         assert(PyString_Check(utf8));
 724         converted = new_string(PyString_AS_STRING(utf8),
 725                                PyString_GET_SIZE(utf8));
 726         Py_DECREF(utf8);
 727         if (converted == NULL)
 728                 goto error_nomem;
 729
 730         PyMem_FREE(*inp);
 731         *inp = converted;
 732         if (tok->encoding != NULL)
 733                 PyMem_FREE(tok->encoding);
 734         tok->encoding = new_string(encoding, strlen(encoding));
 735         if (tok->encoding == NULL)
 736                 goto error_nomem;
 737
 738         Py_DECREF(enc);
 739         return 0;
 740
 741 error_nomem:
 742         Py_DECREF(enc);
 743         tok->done = E_NOMEM;
 744         return -1;
 745
 746 error_clear:
 747         /* Fallback to iso-8859-1: for backward compatibility */
 748         Py_DECREF(enc);
 749         PyErr_Clear();
 750         return 0;
 751 }
 752 #endif
 753
 754 /* Get next char, updating state; error code goes into tok->done */
 755
 756 static int
 757 tok_nextc(register struct tok_state *tok)
 758 {
 759         for (;;) {
 760                 if (tok->cur != tok->inp) {
 761                         return Py_CHARMASK(*tok->cur++); /* Fast path */
 762                 }
 763                 if (tok->done != E_OK)
 764                         return EOF;
 765                 if (tok->fp == NULL) {
 766                         char *end = strchr(tok->inp, '\n');
 767                         if (end != NULL)
 768                                 end++;
 769                         else {
 770                                 end = strchr(tok->inp, '\0');
 771                                 if (end == tok->inp) {
 772                                         tok->done = E_EOF;
 773                                         return EOF;
 774                                 }
 775                         }
 776                         if (tok->start == NULL)
 777                                 tok->buf = tok->cur;
 778                         tok->line_start = tok->cur;
 779                         tok->lineno++;
 780                         tok->inp = end;
 781                         return Py_CHARMASK(*tok->cur++);
 782                 }
 783                 if (tok->prompt != NULL) {
 784                         char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 785                         if (tok->nextprompt != NULL)
 786                                 tok->prompt = tok->nextprompt;
 787                         if (newtok == NULL)
 788                                 tok->done = E_INTR;
 789                         else if (*newtok == '\0') {
 790                                 PyMem_FREE(newtok);
 791                                 tok->done = E_EOF;
 792                         }
 793 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 794                         else if (tok_stdin_decode(tok, &newtok) != 0)
 795                                 PyMem_FREE(newtok);
 796 #endif
 797                         else if (tok->start != NULL) {
 798                                 size_t start = tok->start - tok->buf;
 799                                 size_t oldlen = tok->cur - tok->buf;
 800                                 size_t newlen = oldlen + strlen(newtok);
 801                                 char *buf = tok->buf;
 802                                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
 803                                 tok->lineno++;
 804                                 if (buf == NULL) {
 805                                         PyMem_FREE(tok->buf);
 806                                         tok->buf = NULL;
 807                                         PyMem_FREE(newtok);
 808                                         tok->done = E_NOMEM;
 809                                         return EOF;
 810                                 }
 811                                 tok->buf = buf;
 812                                 tok->cur = tok->buf + oldlen;
 813                                 tok->line_start = tok->cur;
 814                                 strcpy(tok->buf + oldlen, newtok);
 815                                 PyMem_FREE(newtok);
 816                                 tok->inp = tok->buf + newlen;
 817                                 tok->end = tok->inp + 1;
 818                                 tok->start = tok->buf + start;
 819                         }
 820                         else {
 821                                 tok->lineno++;
 822                                 if (tok->buf != NULL)
 823                                         PyMem_FREE(tok->buf);
 824                                 tok->buf = newtok;
 825                                 tok->line_start = tok->buf;
 826                                 tok->cur = tok->buf;
 827                                 tok->line_start = tok->buf;
 828                                 tok->inp = strchr(tok->buf, '\0');
 829                                 tok->end = tok->inp + 1;
 830                         }
 831                 }
 832                 else {
 833                         int done = 0;
 834                         Py_ssize_t cur = 0;
 835                         char *pt;
 836                         if (tok->start == NULL) {
 837                                 if (tok->buf == NULL) {
 838                                         tok->buf = (char *)
 839                                                 PyMem_MALLOC(BUFSIZ);
 840                                         if (tok->buf == NULL) {
 841                                                 tok->done = E_NOMEM;
 842                                                 return EOF;
 843                                         }
 844                                         tok->end = tok->buf + BUFSIZ;
 845                                 }
 846                                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
 847                                           tok) == NULL) {
 848                                         tok->done = E_EOF;
 849                                         done = 1;
 850                                 }
 851                                 else {
 852                                         tok->done = E_OK;
 853                                         tok->inp = strchr(tok->buf, '\0');
 854                                         done = tok->inp[-1] == '\n';
 855                                 }
 856                         }
 857                         else {
 858                                 cur = tok->cur - tok->buf;
 859                                 if (decoding_feof(tok)) {
 860                                         tok->done = E_EOF;
 861                                         done = 1;
 862                                 }
 863                                 else
 864                                         tok->done = E_OK;
 865                         }
 866                         tok->lineno++;
 867                         /* Read until '\n' or EOF */
 868                         while (!done) {
 869                                 Py_ssize_t curstart = tok->start == NULL ? -1 :
 870                                                   tok->start - tok->buf;
 871                                 Py_ssize_t curvalid = tok->inp - tok->buf;
 872                                 Py_ssize_t newsize = curvalid + BUFSIZ;
 873                                 char *newbuf = tok->buf;
 874                                 newbuf = (char *)PyMem_REALLOC(newbuf,
 875                                                                newsize);
 876                                 if (newbuf == NULL) {
 877                                         tok->done = E_NOMEM;
 878                                         tok->cur = tok->inp;
 879                                         return EOF;
 880                                 }
 881                                 tok->buf = newbuf;
 882                                 tok->inp = tok->buf + curvalid;
 883                                 tok->end = tok->buf + newsize;
 884                                 tok->start = curstart < 0 ? NULL :
 885                                              tok->buf + curstart;
 886                                 if (decoding_fgets(tok->inp,
 887                                                (int)(tok->end - tok->inp),
 888                                                tok) == NULL) {
 889                                         /* Break out early on decoding
 890                                            errors, as tok->buf will be NULL
 891                                          */
 892                                         if (tok->decoding_erred)
 893                                                 return EOF;
 894                                         /* Last line does not end in \n,
 895                                            fake one */
 896                                         strcpy(tok->inp, "\n");
 897                                 }
 898                                 tok->inp = strchr(tok->inp, '\0');
 899                                 done = tok->inp[-1] == '\n';
 900                         }
 901                         if (tok->buf != NULL) {
 902                                 tok->cur = tok->buf + cur;
 903                                 tok->line_start = tok->cur;
 904                                 /* replace "\r\n" with "\n" */
 905                                 /* For Mac leave the \r, giving a syntax error */
 906                                 pt = tok->inp - 2;
 907                                 if (pt >= tok->buf && *pt == '\r') {
 908                                         *pt++ = '\n';
 909                                         *pt = '\0';
 910                                         tok->inp = pt;
 911                                 }
 912                         }
 913                 }
 914                 if (tok->done != E_OK) {
 915                         if (tok->prompt != NULL)
 916                                 PySys_WriteStderr("\n");
 917                         tok->cur = tok->inp;
 918                         return EOF;
 919                 }
 920         }
 921         /*NOTREACHED*/
 922 }
 923
 924
 925 /* Back-up one character */
 926
 927 static void
 928 tok_backup(register struct tok_state *tok, register int c)
 929 {
 930         if (c != EOF) {
 931                 if (--tok->cur < tok->buf)
 932                         Py_FatalError("tok_backup: begin of buffer");
 933                 if (*tok->cur != c)
 934                         *tok->cur = c;
 935         }
 936 }
 937
 938
 939 /* Return the token corresponding to a single character */
 940
 941 int
 942 PyToken_OneChar(int c)
 943 {
 944         switch (c) {
 945         case '(':       return LPAR;
 946         case ')':       return RPAR;
 947         case '[':       return LSQB;
 948         case ']':       return RSQB;
 949         case ':':       return COLON;
 950         case ',':       return COMMA;
 951         case ';':       return SEMI;
 952         case '+':       return PLUS;
 953         case '-':       return MINUS;
 954         case '*':       return STAR;
 955         case '/':       return SLASH;
 956         case '|':       return VBAR;
 957         case '&':       return AMPER;
 958         case '<':       return LESS;
 959         case '>':       return GREATER;
 960         case '=':       return EQUAL;
 961         case '.':       return DOT;
 962         case '%':       return PERCENT;
 963         case '`':       return BACKQUOTE;
 964         case '{':       return LBRACE;
 965         case '}':       return RBRACE;
 966         case '^':       return CIRCUMFLEX;
 967         case '~':       return TILDE;
 968         case '@':       return AT;
 969         default:        return OP;
 970         }
 971 }
 972
 973
 974 int
 975 PyToken_TwoChars(int c1, int c2)
 976 {
 977         switch (c1) {
 978         case '=':
 979                 switch (c2) {
 980                 case '=':       return EQEQUAL;
 981                 }
 982                 break;
 983         case '!':
 984                 switch (c2) {
 985                 case '=':       return NOTEQUAL;
 986                 }
 987                 break;
 988         case '<':
 989                 switch (c2) {
 990                 case '>':       return NOTEQUAL;
 991                 case '=':       return LESSEQUAL;
 992                 case '<':       return LEFTSHIFT;
 993                 }
 994                 break;
 995         case '>':
 996                 switch (c2) {
 997                 case '=':       return GREATEREQUAL;
 998                 case '>':       return RIGHTSHIFT;
 999                 }
1000                 break;
1001         case '+':
1002                 switch (c2) {
1003                 case '=':       return PLUSEQUAL;
1004                 }
1005                 break;
1006         case '-':
1007                 switch (c2) {
1008                 case '=':       return MINEQUAL;
1009                 }
1010                 break;
1011         case '*':
1012                 switch (c2) {
1013                 case '*':       return DOUBLESTAR;
1014                 case '=':       return STAREQUAL;
1015                 }
1016                 break;
1017         case '/':
1018                 switch (c2) {
1019                 case '/':       return DOUBLESLASH;
1020                 case '=':       return SLASHEQUAL;
1021                 }
1022                 break;
1023         case '|':
1024                 switch (c2) {
1025                 case '=':       return VBAREQUAL;
1026                 }
1027                 break;
1028         case '%':
1029                 switch (c2) {
1030                 case '=':       return PERCENTEQUAL;
1031                 }
1032                 break;
1033         case '&':
1034                 switch (c2) {
1035                 case '=':       return AMPEREQUAL;
1036                 }
1037                 break;
1038         case '^':
1039                 switch (c2) {
1040                 case '=':       return CIRCUMFLEXEQUAL;
1041                 }
1042                 break;
1043         }
1044         return OP;
1045 }
1046
1047 int
1048 PyToken_ThreeChars(int c1, int c2, int c3)
1049 {
1050         switch (c1) {
1051         case '<':
1052                 switch (c2) {
1053                 case '<':
1054                         switch (c3) {
1055                         case '=':
1056                                 return LEFTSHIFTEQUAL;
1057                         }
1058                         break;
1059                 }
1060                 break;
1061         case '>':
1062                 switch (c2) {
1063                 case '>':
1064                         switch (c3) {
1065                         case '=':
1066                                 return RIGHTSHIFTEQUAL;
1067                         }
1068                         break;
1069                 }
1070                 break;
1071         case '*':
1072                 switch (c2) {
1073                 case '*':
1074                         switch (c3) {
1075                         case '=':
1076                                 return DOUBLESTAREQUAL;
1077                         }
1078                         break;
1079                 }
1080                 break;
1081         case '/':
1082                 switch (c2) {
1083                 case '/':
1084                         switch (c3) {
1085                         case '=':
1086                                 return DOUBLESLASHEQUAL;
1087                         }
1088                         break;
1089                 }
1090                 break;
1091         }
1092         return OP;
1093 }
1094
1095 static int
1096 indenterror(struct tok_state *tok)
1097 {
1098         if (tok->alterror) {
1099                 tok->done = E_TABSPACE;
1100                 tok->cur = tok->inp;
1101                 return 1;
1102         }
1103         if (tok->altwarning) {
1104                 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1105                                   "in indentation\n", tok->filename);
1106                 tok->altwarning = 0;
1107         }
1108         return 0;
1109 }
1110
1111
1112 /* Get next token, after space stripping etc. */
1113
1114 static int
1115 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1116 {
1117         register int c;
1118         int blankline;
1119
1120         *p_start = *p_end = NULL;
1121   nextline:
1122         tok->start = NULL;
1123         blankline = 0;
1124
1125         /* Get indentation level */
1126         if (tok->atbol) {
1127                 register int col = 0;
1128                 register int altcol = 0;
1129                 tok->atbol = 0;
1130                 for (;;) {
1131                         c = tok_nextc(tok);
1132                         if (c == ' ')
1133                                 col++, altcol++;
1134                         else if (c == '\t') {
1135                                 col = (col/tok->tabsize + 1) * tok->tabsize;
1136                                 altcol = (altcol/tok->alttabsize + 1)
1137                                         * tok->alttabsize;
1138                         }
1139                         else if (c == '\014') /* Control-L (formfeed) */
1140                                 col = altcol = 0; /* For Emacs users */
1141                         else
1142                                 break;
1143                 }
1144                 tok_backup(tok, c);
1145                 if (c == '#' || c == '\n') {
1146                         /* Lines with only whitespace and/or comments
1147                            shouldn't affect the indentation and are
1148                            not passed to the parser as NEWLINE tokens,
1149                            except *totally* empty lines in interactive
1150                            mode, which signal the end of a command group. */
1151                         if (col == 0 && c == '\n' && tok->prompt != NULL)
1152                                 blankline = 0; /* Let it through */
1153                         else
1154                                 blankline = 1; /* Ignore completely */
1155                         /* We can't jump back right here since we still
1156                            may need to skip to the end of a comment */
1157                 }
1158                 if (!blankline && tok->level == 0) {
1159                         if (col == tok->indstack[tok->indent]) {
1160                                 /* No change */
1161                                 if (altcol != tok->altindstack[tok->indent]) {
1162                                         if (indenterror(tok))
1163                                                 return ERRORTOKEN;
1164                                 }
1165                         }
1166                         else if (col > tok->indstack[tok->indent]) {
1167                                 /* Indent -- always one */
1168                                 if (tok->indent+1 >= MAXINDENT) {
1169                                         tok->done = E_TOODEEP;
1170                                         tok->cur = tok->inp;
1171                                         return ERRORTOKEN;
1172                                 }
1173                                 if (altcol <= tok->altindstack[tok->indent]) {
1174                                         if (indenterror(tok))
1175                                                 return ERRORTOKEN;
1176                                 }
1177                                 tok->pendin++;
1178                                 tok->indstack[++tok->indent] = col;
1179                                 tok->altindstack[tok->indent] = altcol;
1180                         }
1181                         else /* col < tok->indstack[tok->indent] */ {
1182                                 /* Dedent -- any number, must be consistent */
1183                                 while (tok->indent > 0 &&
1184                                         col < tok->indstack[tok->indent]) {
1185                                         tok->pendin--;
1186                                         tok->indent--;
1187                                 }
1188                                 if (col != tok->indstack[tok->indent]) {
1189                                         tok->done = E_DEDENT;
1190                                         tok->cur = tok->inp;
1191                                         return ERRORTOKEN;
1192                                 }
1193                                 if (altcol != tok->altindstack[tok->indent]) {
1194                                         if (indenterror(tok))
1195                                                 return ERRORTOKEN;
1196                                 }
1197                         }
1198                 }
1199         }
1200
1201         tok->start = tok->cur;
1202
1203         /* Return pending indents/dedents */
1204         if (tok->pendin != 0) {
1205                 if (tok->pendin < 0) {
1206                         tok->pendin++;
1207                         return DEDENT;
1208                 }
1209                 else {
1210                         tok->pendin--;
1211                         return INDENT;
1212                 }
1213         }
1214
1215  again:
1216         tok->start = NULL;
1217         /* Skip spaces */
1218         do {
1219                 c = tok_nextc(tok);
1220         } while (c == ' ' || c == '\t' || c == '\014');
1221
1222         /* Set start of current token */
1223         tok->start = tok->cur - 1;
1224
1225         /* Skip comment, while looking for tab-setting magic */
1226         if (c == '#') {
1227                 static char *tabforms[] = {
1228                         "tab-width:",           /* Emacs */
1229                         ":tabstop=",            /* vim, full form */
1230                         ":ts=",                 /* vim, abbreviated form */
1231                         "set tabsize=",         /* will vi never die? */
1232                 /* more templates can be added here to support other editors */
1233                 };
1234                 char cbuf[80];
1235                 char *tp, **cp;
1236                 tp = cbuf;
1237                 do {
1238                         *tp++ = c = tok_nextc(tok);
1239                 } while (c != EOF && c != '\n' &&
1240                          (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1241                 *tp = '\0';
1242                 for (cp = tabforms;
1243                      cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1244                      cp++) {
1245                         if ((tp = strstr(cbuf, *cp))) {
1246                                 int newsize = atoi(tp + strlen(*cp));
1247
1248                                 if (newsize >= 1 && newsize <= 40) {
1249                                         tok->tabsize = newsize;
1250                                         if (Py_VerboseFlag)
1251                                             PySys_WriteStderr(
1252                                                 "Tab size set to %d\n",
1253                                                 newsize);
1254                                 }
1255                         }
1256                 }
1257                 while (c != EOF && c != '\n')
1258                         c = tok_nextc(tok);
1259         }
1260
1261         /* Check for EOF and errors now */
1262         if (c == EOF) {
1263                 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1264         }
1265
1266         /* Identifier (most frequent token!) */
1267         if (isalpha(c) || c == '_') {
1268                 /* Process r"", u"" and ur"" */
1269                 switch (c) {
1270                 case 'b':
1271                 case 'B':
1272                         c = tok_nextc(tok);
1273                         if (c == 'r' || c == 'R')
1274                                 c = tok_nextc(tok);
1275                         if (c == '"' || c == '\'')
1276                                 goto letter_quote;
1277                         break;
1278                 case 'r':
1279                 case 'R':
1280                         c = tok_nextc(tok);
1281                         if (c == '"' || c == '\'')
1282                                 goto letter_quote;
1283                         break;
1284                 case 'u':
1285                 case 'U':
1286                         c = tok_nextc(tok);
1287                         if (c == 'r' || c == 'R')
1288                                 c = tok_nextc(tok);
1289                         if (c == '"' || c == '\'')
1290                                 goto letter_quote;
1291                         break;
1292                 }
1293                 while (isalnum(c) || c == '_') {
1294                         c = tok_nextc(tok);
1295                 }
1296                 tok_backup(tok, c);
1297                 *p_start = tok->start;
1298                 *p_end = tok->cur;
1299                 return NAME;
1300         }
1301
1302         /* Newline */
1303         if (c == '\n') {
1304                 tok->atbol = 1;
1305                 if (blankline || tok->level > 0)
1306                         goto nextline;
1307                 *p_start = tok->start;
1308                 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1309                 tok->cont_line = 0;
1310                 return NEWLINE;
1311         }
1312
1313         /* Period or number starting with period? */
1314         if (c == '.') {
1315                 c = tok_nextc(tok);
1316                 if (isdigit(c)) {
1317                         goto fraction;
1318                 }
1319                 else {
1320                         tok_backup(tok, c);
1321                         *p_start = tok->start;
1322                         *p_end = tok->cur;
1323                         return DOT;
1324                 }
1325         }
1326
1327         /* Number */
1328         if (isdigit(c)) {
1329                 if (c == '0') {
1330                         /* Hex, octal or binary -- maybe. */
1331                         c = tok_nextc(tok);
1332                         if (c == '.')
1333                                 goto fraction;
1334 #ifndef WITHOUT_COMPLEX
1335                         if (c == 'j' || c == 'J')
1336                                 goto imaginary;
1337 #endif
1338                         if (c == 'x' || c == 'X') {
1339
1340                                 /* Hex */
1341                                 c = tok_nextc(tok);
1342                                 if (!isxdigit(c)) {
1343                                         tok->done = E_TOKEN;
1344                                         tok_backup(tok, c);
1345                                         return ERRORTOKEN;
1346                                 }
1347                                 do {
1348                                         c = tok_nextc(tok);
1349                                 } while (isxdigit(c));
1350                         }
1351                         else if (c == 'o' || c == 'O') {
1352                                 /* Octal */
1353                                 c = tok_nextc(tok);
1354                                 if (c < '0' || c > '8') {
1355                                         tok->done = E_TOKEN;
1356                                         tok_backup(tok, c);
1357                                         return ERRORTOKEN;
1358                                 }
1359                                 do {
1360                                         c = tok_nextc(tok);
1361                                 } while ('0' <= c && c < '8');
1362                         }
1363                         else if (c == 'b' || c == 'B') {
1364                                 /* Binary */
1365                                 c = tok_nextc(tok);
1366                                 if (c != '0' && c != '1') {
1367                                         tok->done = E_TOKEN;
1368                                         tok_backup(tok, c);
1369                                         return ERRORTOKEN;
1370                                 }
1371                                 do {
1372                                         c = tok_nextc(tok);
1373                                 } while (c == '0' || c == '1');
1374                         }
1375                         else {
1376                                 int found_decimal = 0;
1377                                 /* Octal; c is first char of it */
1378                                 /* There's no 'isoctdigit' macro, sigh */
1379                                 while ('0' <= c && c < '8') {
1380                                         c = tok_nextc(tok);
1381                                 }
1382                                 if (isdigit(c)) {
1383                                         found_decimal = 1;
1384                                         do {
1385                                                 c = tok_nextc(tok);
1386                                         } while (isdigit(c));
1387                                 }
1388                                 if (c == '.')
1389                                         goto fraction;
1390                                 else if (c == 'e' || c == 'E')
1391                                         goto exponent;
1392 #ifndef WITHOUT_COMPLEX
1393                                 else if (c == 'j' || c == 'J')
1394                                         goto imaginary;
1395 #endif
1396                                 else if (found_decimal) {
1397                                         tok->done = E_TOKEN;
1398                                         tok_backup(tok, c);
1399                                         return ERRORTOKEN;
1400                                 }
1401                         }
1402                         if (c == 'l' || c == 'L')
1403                                 c = tok_nextc(tok);
1404                 }
1405                 else {
1406                         /* Decimal */
1407                         do {
1408                                 c = tok_nextc(tok);
1409                         } while (isdigit(c));
1410                         if (c == 'l' || c == 'L')
1411                                 c = tok_nextc(tok);
1412                         else {
1413                                 /* Accept floating point numbers. */
1414                                 if (c == '.') {
1415                 fraction:
1416                                         /* Fraction */
1417                                         do {
1418                                                 c = tok_nextc(tok);
1419                                         } while (isdigit(c));
1420                                 }
1421                                 if (c == 'e' || c == 'E') {
1422                 exponent:
1423                                         /* Exponent part */
1424                                         c = tok_nextc(tok);
1425                                         if (c == '+' || c == '-')
1426                                                 c = tok_nextc(tok);
1427                                         if (!isdigit(c)) {
1428                                                 tok->done = E_TOKEN;
1429                                                 tok_backup(tok, c);
1430                                                 return ERRORTOKEN;
1431                                         }
1432                                         do {
1433                                                 c = tok_nextc(tok);
1434                                         } while (isdigit(c));
1435                                 }
1436 #ifndef WITHOUT_COMPLEX
1437                                 if (c == 'j' || c == 'J')
1438                                         /* Imaginary part */
1439                 imaginary:
1440                                         c = tok_nextc(tok);
1441 #endif
1442                         }
1443                 }
1444                 tok_backup(tok, c);
1445                 *p_start = tok->start;
1446                 *p_end = tok->cur;
1447                 return NUMBER;
1448         }
1449
1450   letter_quote:
1451         /* String */
1452         if (c == '\'' || c == '"') {
1453                 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1454                 int quote = c;
1455                 int triple = 0;
1456                 int tripcount = 0;
1457                 for (;;) {
1458                         c = tok_nextc(tok);
1459                         if (c == '\n') {
1460                                 if (!triple) {
1461                                         tok->done = E_EOLS;
1462                                         tok_backup(tok, c);
1463                                         return ERRORTOKEN;
1464                                 }
1465                                 tripcount = 0;
1466                                 tok->cont_line = 1; /* multiline string. */
1467                         }
1468                         else if (c == EOF) {
1469                                 if (triple)
1470                                         tok->done = E_EOFS;
1471                                 else
1472                                         tok->done = E_EOLS;
1473                                 tok->cur = tok->inp;
1474                                 return ERRORTOKEN;
1475                         }
1476                         else if (c == quote) {
1477                                 tripcount++;
1478                                 if (tok->cur - tok->start == quote2) {
1479                                         c = tok_nextc(tok);
1480                                         if (c == quote) {
1481                                                 triple = 1;
1482                                                 tripcount = 0;
1483                                                 continue;
1484                                         }
1485                                         tok_backup(tok, c);
1486                                 }
1487                                 if (!triple || tripcount == 3)
1488                                         break;
1489                         }
1490                         else if (c == '\\') {
1491                                 tripcount = 0;
1492                                 c = tok_nextc(tok);
1493                                 if (c == EOF) {
1494                                         tok->done = E_EOLS;
1495                                         tok->cur = tok->inp;
1496                                         return ERRORTOKEN;
1497                                 }
1498                         }
1499                         else
1500                                 tripcount = 0;
1501                 }
1502                 *p_start = tok->start;
1503                 *p_end = tok->cur;
1504                 return STRING;
1505         }
1506
1507         /* Line continuation */
1508         if (c == '\\') {
1509                 c = tok_nextc(tok);
1510                 if (c != '\n') {
1511                         tok->done = E_LINECONT;
1512                         tok->cur = tok->inp;
1513                         return ERRORTOKEN;
1514                 }
1515                 tok->cont_line = 1;
1516                 goto again; /* Read next line */
1517         }
1518
1519         /* Check for two-character token */
1520         {
1521                 int c2 = tok_nextc(tok);
1522                 int token = PyToken_TwoChars(c, c2);
1523 #ifndef PGEN
1524                 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1525                         if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1526                                                "<> not supported in 3.x; use !=",
1527                                                tok->filename, tok->lineno,
1528                                                NULL, NULL)) {
1529                                 return ERRORTOKEN;
1530                         }
1531                 }
1532 #endif
1533                 if (token != OP) {
1534                         int c3 = tok_nextc(tok);
1535                         int token3 = PyToken_ThreeChars(c, c2, c3);
1536                         if (token3 != OP) {
1537                                 token = token3;
1538                         } else {
1539                                 tok_backup(tok, c3);
1540                         }
1541                         *p_start = tok->start;
1542                         *p_end = tok->cur;
1543                         return token;
1544                 }
1545                 tok_backup(tok, c2);
1546         }
1547
1548         /* Keep track of parentheses nesting level */
1549         switch (c) {
1550         case '(':
1551         case '[':
1552         case '{':
1553                 tok->level++;
1554                 break;
1555         case ')':
1556         case ']':
1557         case '}':
1558                 tok->level--;
1559                 break;
1560         }
1561
1562         /* Punctuation character */
1563         *p_start = tok->start;
1564         *p_end = tok->cur;
1565         return PyToken_OneChar(c);
1566 }
1567
1568 int
1569 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1570 {
1571         int result = tok_get(tok, p_start, p_end);
1572         if (tok->decoding_erred) {
1573                 result = ERRORTOKEN;
1574                 tok->done = E_DECODE;
1575         }
1576         return result;
1577 }
1578
1579 /* This function is only called from parsetok. However, it cannot live
1580    there, as it must be empty for PGEN, and we can check for PGEN only
1581    in this file. */
1582
1583 #if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when building for PGEN or without Unicode support: no
   re-encoding is performed, none of the arguments are touched, and the
   result is always NULL. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
        /* The parameters exist only to match the full implementation. */
        (void)tok;
        (void)len;
        (void)offset;
        return NULL;
}
1589 #else
1590 #ifdef Py_USING_UNICODE
1591 static PyObject *
1592 dec_utf8(const char *enc, const char *text, size_t len) {
1593         PyObject *ret = NULL;
1594         PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1595         if (unicode_text) {
1596                 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1597                 Py_DECREF(unicode_text);
1598         }
1599         if (!ret) {
1600                 PyErr_Clear();
1601         }
1602         return ret;
1603 }
1604 char *
1605 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1606 {
1607         char *text = NULL;
1608         if (tok->encoding) {
1609                 /* convert source to original encondig */
1610                 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1611                 if (lineobj != NULL) {
1612                         int linelen = PyString_Size(lineobj);
1613                         const char *line = PyString_AsString(lineobj);
1614                         text = PyObject_MALLOC(linelen + 1);
1615                         if (text != NULL && line != NULL) {
1616                                 if (linelen)
1617                                         strncpy(text, line, linelen);
1618                                 text[linelen] = '\0';
1619                         }
1620                         Py_DECREF(lineobj);
1621
1622                         /* adjust error offset */
1623                         if (*offset > 1) {
1624                                 PyObject *offsetobj = dec_utf8(tok->encoding,
1625                                                                tok->buf, *offset-1);
1626                                 if (offsetobj) {
1627                                         *offset = PyString_Size(offsetobj) + 1;
1628                                         Py_DECREF(offsetobj);
1629                                 }
1630                         }
1631
1632                 }
1633         }
1634         return text;
1635
1636 }
1637 #endif /* defined(Py_USING_UNICODE) */
1638 #endif
1639
1640
1641 #ifdef Py_DEBUG
1642
1643 void
1644 tok_dump(int type, char *start, char *end)
1645 {
1646         printf("%s", _PyParser_TokenNames[type]);
1647         if (type == NAME || type == NUMBER || type == STRING || type == OP)
1648                 printf("(%.*s)", (int)(end - start), start);
1649 }
1650
1651 #endif