Parser/tokenizer.c

   1
   2 /* Tokenizer implementation */
   3
   4 #include "Python.h"
   5 #include "pgenheaders.h"
   6
   7 #include <ctype.h>
   8 #include <assert.h>
   9
  10 #include "tokenizer.h"
  11 #include "errcode.h"
  12
  13 #ifndef PGEN
  14 #include "unicodeobject.h"
  15 #include "stringobject.h"
  16 #include "fileobject.h"
  17 #include "codecs.h"
  18 #include "abstract.h"
  19 #include "pydebug.h"
  20 #endif /* PGEN */
  21
  22 extern char *PyOS_Readline(FILE *, FILE *, char *);
  23 /* Return malloc'ed string including trailing \n;
  24    empty malloc'ed string for EOF;
  25    NULL if interrupted */
  26
  27 /* Don't ever change this -- it would break the portability of Python code */
  28 #define TABSIZE 8
  29
  30 /* Forward */
  31 static struct tok_state *tok_new(void);
  32 static int tok_nextc(struct tok_state *tok);
  33 static void tok_backup(struct tok_state *tok, int c);
  34
  35 /* Token names */
  36
  37 char *_PyParser_TokenNames[] = {
  38         "ENDMARKER",
  39         "NAME",
  40         "NUMBER",
  41         "STRING",
  42         "NEWLINE",
  43         "INDENT",
  44         "DEDENT",
  45         "LPAR",
  46         "RPAR",
  47         "LSQB",
  48         "RSQB",
  49         "COLON",
  50         "COMMA",
  51         "SEMI",
  52         "PLUS",
  53         "MINUS",
  54         "STAR",
  55         "SLASH",
  56         "VBAR",
  57         "AMPER",
  58         "LESS",
  59         "GREATER",
  60         "EQUAL",
  61         "DOT",
  62         "PERCENT",
  63         "BACKQUOTE",
  64         "LBRACE",
  65         "RBRACE",
  66         "EQEQUAL",
  67         "NOTEQUAL",
  68         "LESSEQUAL",
  69         "GREATEREQUAL",
  70         "TILDE",
  71         "CIRCUMFLEX",
  72         "LEFTSHIFT",
  73         "RIGHTSHIFT",
  74         "DOUBLESTAR",
  75         "PLUSEQUAL",
  76         "MINEQUAL",
  77         "STAREQUAL",
  78         "SLASHEQUAL",
  79         "PERCENTEQUAL",
  80         "AMPEREQUAL",
  81         "VBAREQUAL",
  82         "CIRCUMFLEXEQUAL",
  83         "LEFTSHIFTEQUAL",
  84         "RIGHTSHIFTEQUAL",
  85         "DOUBLESTAREQUAL",
  86         "DOUBLESLASH",
  87         "DOUBLESLASHEQUAL",
  88         "AT",
  89         /* This table must match the #defines in token.h! */
  90         "OP",
  91         "<ERRORTOKEN>",
  92         "<N_TOKENS>"
  93 };
  94
  95
  96 /* Create and initialize a new tok_state structure */
  97
  98 static struct tok_state *
  99 tok_new(void)
 100 {
 101         struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
 102                                                 sizeof(struct tok_state));
 103         if (tok == NULL)
 104                 return NULL;
 105         tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
 106         tok->done = E_OK;
 107         tok->fp = NULL;
 108         tok->input = NULL;
 109         tok->tabsize = TABSIZE;
 110         tok->indent = 0;
 111         tok->indstack[0] = 0;
 112         tok->atbol = 1;
 113         tok->pendin = 0;
 114         tok->prompt = tok->nextprompt = NULL;
 115         tok->lineno = 0;
 116         tok->level = 0;
 117         tok->filename = NULL;
 118         tok->altwarning = 0;
 119         tok->alterror = 0;
 120         tok->alttabsize = 1;
 121         tok->altindstack[0] = 0;
 122         tok->decoding_state = 0;
 123         tok->decoding_erred = 0;
 124         tok->read_coding_spec = 0;
 125         tok->encoding = NULL;
 126         tok->cont_line = 0;
 127 #ifndef PGEN
 128         tok->decoding_readline = NULL;
 129         tok->decoding_buffer = NULL;
 130 #endif
 131         return tok;
 132 }
 133
 134 static char *
 135 new_string(const char *s, Py_ssize_t len)
 136 {
 137         char* result = (char *)PyMem_MALLOC(len + 1);
 138         if (result != NULL) {
 139                 memcpy(result, s, len);
 140                 result[len] = '\0';
 141         }
 142         return result;
 143 }
 144
 145 #ifdef PGEN
 146
 147 static char *
 148 decoding_fgets(char *s, int size, struct tok_state *tok)
 149 {
 150         return fgets(s, size, tok->fp);
 151 }
 152
 153 static int
 154 decoding_feof(struct tok_state *tok)
 155 {
 156         return feof(tok->fp);
 157 }
 158
 159 static char *
 160 decode_str(const char *str, int exec_input, struct tok_state *tok)
 161 {
 162         return new_string(str, strlen(str));
 163 }
 164
 165 #else /* PGEN */
 166
 167 static char *
 168 error_ret(struct tok_state *tok) /* XXX */
 169 {
 170         tok->decoding_erred = 1;
 171         if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
 172                 PyMem_FREE(tok->buf);
 173         tok->buf = NULL;
 174         return NULL;            /* as if it were EOF */
 175 }
 176
 177
 178 static char *
 179 get_normal_name(char *s)        /* for utf-8 and latin-1 */
 180 {
 181         char buf[13];
 182         int i;
 183         for (i = 0; i < 12; i++) {
 184                 int c = s[i];
 185                 if (c == '\0')
 186                         break;
 187                 else if (c == '_')
 188                         buf[i] = '-';
 189                 else
 190                         buf[i] = tolower(c);
 191         }
 192         buf[i] = '\0';
 193         if (strcmp(buf, "utf-8") == 0 ||
 194             strncmp(buf, "utf-8-", 6) == 0)
 195                 return "utf-8";
 196         else if (strcmp(buf, "latin-1") == 0 ||
 197                  strcmp(buf, "iso-8859-1") == 0 ||
 198                  strcmp(buf, "iso-latin-1") == 0 ||
 199                  strncmp(buf, "latin-1-", 8) == 0 ||
 200                  strncmp(buf, "iso-8859-1-", 11) == 0 ||
 201                  strncmp(buf, "iso-latin-1-", 12) == 0)
 202                 return "iso-8859-1";
 203         else
 204                 return s;
 205 }
 206
 207 /* Return the coding spec in S, or NULL if none is found.  */
 208
 209 static char *
 210 get_coding_spec(const char *s, Py_ssize_t size)
 211 {
 212         Py_ssize_t i;
 213         /* Coding spec must be in a comment, and that comment must be
 214          * the only statement on the source code line. */
 215         for (i = 0; i < size - 6; i++) {
 216                 if (s[i] == '#')
 217                         break;
 218                 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
 219                         return NULL;
 220         }
 221         for (; i < size - 6; i++) { /* XXX inefficient search */
 222                 const char* t = s + i;
 223                 if (strncmp(t, "coding", 6) == 0) {
 224                         const char* begin = NULL;
 225                         t += 6;
 226                         if (t[0] != ':' && t[0] != '=')
 227                                 continue;
 228                         do {
 229                                 t++;
 230                         } while (t[0] == '\x20' || t[0] == '\t');
 231
 232                         begin = t;
 233                         while (isalnum(Py_CHARMASK(t[0])) ||
 234                                t[0] == '-' || t[0] == '_' || t[0] == '.')
 235                                 t++;
 236
 237                         if (begin < t) {
 238                                 char* r = new_string(begin, t - begin);
 239                                 char* q = get_normal_name(r);
 240                                 if (r != q) {
 241                                         PyMem_FREE(r);
 242                                         r = new_string(q, strlen(q));
 243                                 }
 244                                 return r;
 245                         }
 246                 }
 247         }
 248         return NULL;
 249 }
 250
 251 /* Check whether the line contains a coding spec. If it does,
 252    invoke the set_readline function for the new encoding.
 253    This function receives the tok_state and the new encoding.
 254    Return 1 on success, 0 on failure.  */
 255
 256 static int
 257 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 258                   int set_readline(struct tok_state *, const char *))
 259 {
 260         char * cs;
 261         int r = 1;
 262
 263         if (tok->cont_line)
 264                 /* It's a continuation line, so it can't be a coding spec. */
 265                 return 1;
 266         cs = get_coding_spec(line, size);
 267         if (cs != NULL) {
 268                 tok->read_coding_spec = 1;
 269                 if (tok->encoding == NULL) {
 270                         assert(tok->decoding_state == 1); /* raw */
 271                         if (strcmp(cs, "utf-8") == 0 ||
 272                             strcmp(cs, "iso-8859-1") == 0) {
 273                                 tok->encoding = cs;
 274                         } else {
 275 #ifdef Py_USING_UNICODE
 276                                 r = set_readline(tok, cs);
 277                                 if (r) {
 278                                         tok->encoding = cs;
 279                                         tok->decoding_state = -1;
 280                                 }
 281                                 else
 282                                         PyMem_FREE(cs);
 283 #else
 284                                 /* Without Unicode support, we cannot
 285                                    process the coding spec. Since there
 286                                    won't be any Unicode literals, that
 287                                    won't matter. */
 288                                 PyMem_FREE(cs);
 289 #endif
 290                         }
 291                 } else {        /* then, compare cs with BOM */
 292                         r = (strcmp(tok->encoding, cs) == 0);
 293                         PyMem_FREE(cs);
 294                 }
 295         }
 296         if (!r) {
 297                 cs = tok->encoding;
 298                 if (!cs)
 299                         cs = "with BOM";
 300                 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
 301         }
 302         return r;
 303 }
 304
 305 /* See whether the file starts with a BOM. If it does,
 306    invoke the set_readline function with the new encoding.
 307    Return 1 on success, 0 on failure.  */
 308
 309 static int
 310 check_bom(int get_char(struct tok_state *),
 311           void unget_char(int, struct tok_state *),
 312           int set_readline(struct tok_state *, const char *),
 313           struct tok_state *tok)
 314 {
 315         int ch = get_char(tok);
 316         tok->decoding_state = 1;
 317         if (ch == EOF) {
 318                 return 1;
 319         } else if (ch == 0xEF) {
 320                 ch = get_char(tok);
 321                 if (ch != 0xBB)
 322                         goto NON_BOM;
 323                 ch = get_char(tok);
 324                 if (ch != 0xBF)
 325                         goto NON_BOM;
 326 #if 0
 327         /* Disable support for UTF-16 BOMs until a decision
 328            is made whether this needs to be supported.  */
 329         } else if (ch == 0xFE) {
 330                 ch = get_char(tok);
 331                 if (ch != 0xFF)
 332                         goto NON_BOM;
 333                 if (!set_readline(tok, "utf-16-be"))
 334                         return 0;
 335                 tok->decoding_state = -1;
 336         } else if (ch == 0xFF) {
 337                 ch = get_char(tok);
 338                 if (ch != 0xFE)
 339                         goto NON_BOM;
 340                 if (!set_readline(tok, "utf-16-le"))
 341                         return 0;
 342                 tok->decoding_state = -1;
 343 #endif
 344         } else {
 345                 unget_char(ch, tok);
 346                 return 1;
 347         }
 348         if (tok->encoding != NULL)
 349                 PyMem_FREE(tok->encoding);
 350         tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
 351         return 1;
 352   NON_BOM:
 353         /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
 354         unget_char(0xFF, tok);  /* XXX this will cause a syntax error */
 355         return 1;
 356 }
 357
 358 /* Read a line of text from TOK into S, using the stream in TOK.
 359    Return NULL on failure, else S.
 360
 361    On entry, tok->decoding_buffer will be one of:
 362      1) NULL: need to call tok->decoding_readline to get a new line
 363      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
 364            stored the result in tok->decoding_buffer
 365      3) PyStringObject *: previous call to fp_readl did not have enough room
 366            (in the s buffer) to copy entire contents of the line read
 367            by tok->decoding_readline.  tok->decoding_buffer has the overflow.
 368            In this case, fp_readl is called in a loop (with an expanded buffer)
 369            until the buffer ends with a '\n' (or until the end of the file is
 370            reached): see tok_nextc and its calls to decoding_fgets.
 371 */
 372
 373 static char *
 374 fp_readl(char *s, int size, struct tok_state *tok)
 375 {
 376 #ifndef Py_USING_UNICODE
 377         /* In a non-Unicode built, this should never be called. */
 378         Py_FatalError("fp_readl should not be called in this build.");
 379         return NULL; /* Keep compiler happy (not reachable) */
 380 #else
 381         PyObject* utf8 = NULL;
 382         PyObject* buf = tok->decoding_buffer;
 383         char *str;
 384         Py_ssize_t utf8len;
 385
 386         /* Ask for one less byte so we can terminate it */
 387         assert(size > 0);
 388         size--;
 389
 390         if (buf == NULL) {
 391                 buf = PyObject_CallObject(tok->decoding_readline, NULL);
 392                 if (buf == NULL)
 393                         return error_ret(tok);
 394         } else {
 395                 tok->decoding_buffer = NULL;
 396                 if (PyString_CheckExact(buf))
 397                         utf8 = buf;
 398         }
 399         if (utf8 == NULL) {
 400                 utf8 = PyUnicode_AsUTF8String(buf);
 401                 Py_DECREF(buf);
 402                 if (utf8 == NULL)
 403                         return error_ret(tok);
 404         }
 405         str = PyString_AsString(utf8);
 406         utf8len = PyString_GET_SIZE(utf8);
 407         if (utf8len > size) {
 408                 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
 409                 if (tok->decoding_buffer == NULL) {
 410                         Py_DECREF(utf8);
 411                         return error_ret(tok);
 412                 }
 413                 utf8len = size;
 414         }
 415         memcpy(s, str, utf8len);
 416         s[utf8len] = '\0';
 417         Py_DECREF(utf8);
 418         if (utf8len == 0)
 419                 return NULL; /* EOF */
 420         return s;
 421 #endif
 422 }
 423
 424 /* Set the readline function for TOK to a StreamReader's
 425    readline function. The StreamReader is named ENC.
 426
 427    This function is called from check_bom and check_coding_spec.
 428
 429    ENC is usually identical to the future value of tok->encoding,
 430    except for the (currently unsupported) case of UTF-16.
 431
 432    Return 1 on success, 0 on failure. */
 433
 434 static int
 435 fp_setreadl(struct tok_state *tok, const char* enc)
 436 {
 437         PyObject *reader, *stream, *readline;
 438
 439         /* XXX: constify filename argument. */
 440         stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
 441         if (stream == NULL)
 442                 return 0;
 443
 444         reader = PyCodec_StreamReader(enc, stream, NULL);
 445         Py_DECREF(stream);
 446         if (reader == NULL)
 447                 return 0;
 448
 449         readline = PyObject_GetAttrString(reader, "readline");
 450         Py_DECREF(reader);
 451         if (readline == NULL)
 452                 return 0;
 453
 454         tok->decoding_readline = readline;
 455         return 1;
 456 }
 457
 458 /* Fetch the next byte from TOK. */
 459
 460 static int fp_getc(struct tok_state *tok) {
 461         return getc(tok->fp);
 462 }
 463
 464 /* Unfetch the last byte back into TOK.  */
 465
 466 static void fp_ungetc(int c, struct tok_state *tok) {
 467         ungetc(c, tok->fp);
 468 }
 469
 470 /* Read a line of input from TOK. Determine encoding
 471    if necessary.  */
 472
 473 static char *
 474 decoding_fgets(char *s, int size, struct tok_state *tok)
 475 {
 476         char *line = NULL;
 477         int badchar = 0;
 478         for (;;) {
 479                 if (tok->decoding_state < 0) {
 480                         /* We already have a codec associated with
 481                            this input. */
 482                         line = fp_readl(s, size, tok);
 483                         break;
 484                 } else if (tok->decoding_state > 0) {
 485                         /* We want a 'raw' read. */
 486                         line = Py_UniversalNewlineFgets(s, size,
 487                                                         tok->fp, NULL);
 488                         break;
 489                 } else {
 490                         /* We have not yet determined the encoding.
 491                            If an encoding is found, use the file-pointer
 492                            reader functions from now on. */
 493                         if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 494                                 return error_ret(tok);
 495                         assert(tok->decoding_state != 0);
 496                 }
 497         }
 498         if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 499                 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 500                         return error_ret(tok);
 501                 }
 502         }
 503 #ifndef PGEN
 504         /* The default encoding is ASCII, so make sure we don't have any
 505            non-ASCII bytes in it. */
 506         if (line && !tok->encoding) {
 507                 unsigned char *c;
 508                 for (c = (unsigned char *)line; *c; c++)
 509                         if (*c > 127) {
 510                                 badchar = *c;
 511                                 break;
 512                         }
 513         }
 514         if (badchar) {
 515                 char buf[500];
 516                 /* Need to add 1 to the line number, since this line
 517                    has not been counted, yet.  */
 518                 sprintf(buf,
 519                         "Non-ASCII character '\\x%.2x' "
 520                         "in file %.200s on line %i, "
 521                         "but no encoding declared; "
 522                         "see http://www.python.org/peps/pep-0263.html for details",
 523                         badchar, tok->filename, tok->lineno + 1);
 524                 PyErr_SetString(PyExc_SyntaxError, buf);
 525                 return error_ret(tok);
 526         }
 527 #endif
 528         return line;
 529 }
 530
 531 static int
 532 decoding_feof(struct tok_state *tok)
 533 {
 534         if (tok->decoding_state >= 0) {
 535                 return feof(tok->fp);
 536         } else {
 537                 PyObject* buf = tok->decoding_buffer;
 538                 if (buf == NULL) {
 539                         buf = PyObject_CallObject(tok->decoding_readline, NULL);
 540                         if (buf == NULL) {
 541                                 error_ret(tok);
 542                                 return 1;
 543                         } else {
 544                                 tok->decoding_buffer = buf;
 545                         }
 546                 }
 547                 return PyObject_Length(buf) == 0;
 548         }
 549 }
 550
 551 /* Fetch a byte from TOK, using the string buffer. */
 552
 553 static int
 554 buf_getc(struct tok_state *tok) {
 555         return Py_CHARMASK(*tok->str++);
 556 }
 557
 558 /* Unfetch a byte from TOK, using the string buffer. */
 559
 560 static void
 561 buf_ungetc(int c, struct tok_state *tok) {
 562         tok->str--;
 563         assert(Py_CHARMASK(*tok->str) == c);    /* tok->cur may point to read-only segment */
 564 }
 565
 566 /* Set the readline function for TOK to ENC. For the string-based
 567    tokenizer, this means to just record the encoding. */
 568
 569 static int
 570 buf_setreadl(struct tok_state *tok, const char* enc) {
 571         tok->enc = enc;
 572         return 1;
 573 }
 574
 575 /* Return a UTF-8 encoding Python string object from the
 576    C byte string STR, which is encoded with ENC. */
 577
 578 #ifdef Py_USING_UNICODE
 579 static PyObject *
 580 translate_into_utf8(const char* str, const char* enc) {
 581         PyObject *utf8;
 582         PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
 583         if (buf == NULL)
 584                 return NULL;
 585         utf8 = PyUnicode_AsUTF8String(buf);
 586         Py_DECREF(buf);
 587         return utf8;
 588 }
 589 #endif
 590
 591
 592 static char *
 593 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
 594         int skip_next_lf = 0, length = strlen(s), final_length;
 595         char *buf, *current;
 596         char c;
 597         buf = PyMem_MALLOC(length + 2);
 598         if (buf == NULL) {
 599                 tok->done = E_NOMEM;
 600                 return NULL;
 601         }
 602         for (current = buf; (c = *s++);) {
 603                 if (skip_next_lf) {
 604                         skip_next_lf = 0;
 605                         if (c == '\n') {
 606                                 c = *s;
 607                                 s++;
 608                                 if (!c)
 609                                         break;
 610                         }
 611                 }
 612                 if (c == '\r') {
 613                         skip_next_lf = 1;
 614                         c = '\n';
 615                 }
 616                 *current = c;
 617                 current++;
 618         }
 619         /* If this is exec input, add a newline to the end of the file if
 620            there isn't one already. */
 621         if (exec_input && *current != '\n') {
 622                 *current = '\n';
 623                 current++;
 624         }
 625         *current = '\0';
 626         final_length = current - buf;
 627         if (final_length < length && final_length)
 628                 /* should never fail */
 629                 buf = PyMem_REALLOC(buf, final_length + 1);
 630         return buf;
 631 }
 632
 633 /* Decode a byte string STR for use as the buffer of TOK.
 634    Look for encoding declarations inside STR, and record them
 635    inside TOK.  */
 636
 637 static const char *
 638 decode_str(const char *input, int single, struct tok_state *tok)
 639 {
 640         PyObject* utf8 = NULL;
 641         const char *str;
 642         const char *s;
 643         const char *newl[2] = {NULL, NULL};
 644         int lineno = 0;
 645         tok->input = str = translate_newlines(input, single, tok);
 646         if (str == NULL)
 647                 return NULL;
 648         tok->enc = NULL;
 649         tok->str = str;
 650         if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 651                 return error_ret(tok);
 652         str = tok->str;         /* string after BOM if any */
 653         assert(str);
 654 #ifdef Py_USING_UNICODE
 655         if (tok->enc != NULL) {
 656                 utf8 = translate_into_utf8(str, tok->enc);
 657                 if (utf8 == NULL)
 658                         return error_ret(tok);
 659                 str = PyString_AsString(utf8);
 660         }
 661 #endif
 662         for (s = str;; s++) {
 663                 if (*s == '\0') break;
 664                 else if (*s == '\n') {
 665                         assert(lineno < 2);
 666                         newl[lineno] = s;
 667                         lineno++;
 668                         if (lineno == 2) break;
 669                 }
 670         }
 671         tok->enc = NULL;
 672         /* need to check line 1 and 2 separately since check_coding_spec
 673            assumes a single line as input */
 674         if (newl[0]) {
 675                 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
 676                         return error_ret(tok);
 677                 if (tok->enc == NULL && newl[1]) {
 678                         if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
 679                                                tok, buf_setreadl))
 680                                 return error_ret(tok);
 681                 }
 682         }
 683 #ifdef Py_USING_UNICODE
 684         if (tok->enc != NULL) {
 685                 assert(utf8 == NULL);
 686                 utf8 = translate_into_utf8(str, tok->enc);
 687                 if (utf8 == NULL)
 688                         return error_ret(tok);
 689                 str = PyString_AsString(utf8);
 690         }
 691 #endif
 692         assert(tok->decoding_buffer == NULL);
 693         tok->decoding_buffer = utf8; /* CAUTION */
 694         return str;
 695 }
 696
 697 #endif /* PGEN */
 698
 699 /* Set up tokenizer for string */
 700
 701 struct tok_state *
 702 PyTokenizer_FromString(const char *str, int exec_input)
 703 {
 704         struct tok_state *tok = tok_new();
 705         if (tok == NULL)
 706                 return NULL;
 707         str = (char *)decode_str(str, exec_input, tok);
 708         if (str == NULL) {
 709                 PyTokenizer_Free(tok);
 710                 return NULL;
 711         }
 712
 713         /* XXX: constify members. */
 714         tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 715         return tok;
 716 }
 717
 718
 719 /* Set up tokenizer for file */
 720
 721 struct tok_state *
 722 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 723 {
 724         struct tok_state *tok = tok_new();
 725         if (tok == NULL)
 726                 return NULL;
 727         if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
 728                 PyTokenizer_Free(tok);
 729                 return NULL;
 730         }
 731         tok->cur = tok->inp = tok->buf;
 732         tok->end = tok->buf + BUFSIZ;
 733         tok->fp = fp;
 734         tok->prompt = ps1;
 735         tok->nextprompt = ps2;
 736         return tok;
 737 }
 738
 739
 740 /* Free a tok_state structure */
 741
 742 void
 743 PyTokenizer_Free(struct tok_state *tok)
 744 {
 745         if (tok->encoding != NULL)
 746                 PyMem_FREE(tok->encoding);
 747 #ifndef PGEN
 748         Py_XDECREF(tok->decoding_readline);
 749         Py_XDECREF(tok->decoding_buffer);
 750 #endif
 751         if (tok->fp != NULL && tok->buf != NULL)
 752                 PyMem_FREE(tok->buf);
 753         if (tok->input)
 754                 PyMem_FREE((char *)tok->input);
 755         PyMem_FREE(tok);
 756 }
 757
 758 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 759 static int
 760 tok_stdin_decode(struct tok_state *tok, char **inp)
 761 {
 762         PyObject *enc, *sysstdin, *decoded, *utf8;
 763         const char *encoding;
 764         char *converted;
 765
 766         if (PySys_GetFile((char *)"stdin", NULL) != stdin)
 767                 return 0;
 768         sysstdin = PySys_GetObject("stdin");
 769         if (sysstdin == NULL || !PyFile_Check(sysstdin))
 770                 return 0;
 771
 772         enc = ((PyFileObject *)sysstdin)->f_encoding;
 773         if (enc == NULL || !PyString_Check(enc))
 774                 return 0;
 775         Py_INCREF(enc);
 776
 777         encoding = PyString_AsString(enc);
 778         decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
 779         if (decoded == NULL)
 780                 goto error_clear;
 781
 782         utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
 783         Py_DECREF(decoded);
 784         if (utf8 == NULL)
 785                 goto error_clear;
 786
 787         assert(PyString_Check(utf8));
 788         converted = new_string(PyString_AS_STRING(utf8),
 789                                PyString_GET_SIZE(utf8));
 790         Py_DECREF(utf8);
 791         if (converted == NULL)
 792                 goto error_nomem;
 793
 794         PyMem_FREE(*inp);
 795         *inp = converted;
 796         if (tok->encoding != NULL)
 797                 PyMem_FREE(tok->encoding);
 798         tok->encoding = new_string(encoding, strlen(encoding));
 799         if (tok->encoding == NULL)
 800                 goto error_nomem;
 801
 802         Py_DECREF(enc);
 803         return 0;
 804
 805 error_nomem:
 806         Py_DECREF(enc);
 807         tok->done = E_NOMEM;
 808         return -1;
 809
 810 error_clear:
 811         /* Fallback to iso-8859-1: for backward compatibility */
 812         Py_DECREF(enc);
 813         PyErr_Clear();
 814         return 0;
 815 }
 816 #endif
 817
 818 /* Get next char, updating state; error code goes into tok->done */
 819
 820 static int
 821 tok_nextc(register struct tok_state *tok)
 822 {
 823         for (;;) {
 824                 if (tok->cur != tok->inp) {
 825                         return Py_CHARMASK(*tok->cur++); /* Fast path */
 826                 }
 827                 if (tok->done != E_OK)
 828                         return EOF;
 829                 if (tok->fp == NULL) {
 830                         char *end = strchr(tok->inp, '\n');
 831                         if (end != NULL)
 832                                 end++;
 833                         else {
 834                                 end = strchr(tok->inp, '\0');
 835                                 if (end == tok->inp) {
 836                                         tok->done = E_EOF;
 837                                         return EOF;
 838                                 }
 839                         }
 840                         if (tok->start == NULL)
 841                                 tok->buf = tok->cur;
 842                         tok->line_start = tok->cur;
 843                         tok->lineno++;
 844                         tok->inp = end;
 845                         return Py_CHARMASK(*tok->cur++);
 846                 }
 847                 if (tok->prompt != NULL) {
 848                         char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 849                         if (tok->nextprompt != NULL)
 850                                 tok->prompt = tok->nextprompt;
 851                         if (newtok == NULL)
 852                                 tok->done = E_INTR;
 853                         else if (*newtok == '\0') {
 854                                 PyMem_FREE(newtok);
 855                                 tok->done = E_EOF;
 856                         }
 857 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 858                         else if (tok_stdin_decode(tok, &newtok) != 0)
 859                                 PyMem_FREE(newtok);
 860 #endif
 861                         else if (tok->start != NULL) {
 862                                 size_t start = tok->start - tok->buf;
 863                                 size_t oldlen = tok->cur - tok->buf;
 864                                 size_t newlen = oldlen + strlen(newtok);
 865                                 char *buf = tok->buf;
 866                                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
 867                                 tok->lineno++;
 868                                 if (buf == NULL) {
 869                                         PyMem_FREE(tok->buf);
 870                                         tok->buf = NULL;
 871                                         PyMem_FREE(newtok);
 872                                         tok->done = E_NOMEM;
 873                                         return EOF;
 874                                 }
 875                                 tok->buf = buf;
 876                                 tok->cur = tok->buf + oldlen;
 877                                 tok->line_start = tok->cur;
 878                                 strcpy(tok->buf + oldlen, newtok);
 879                                 PyMem_FREE(newtok);
 880                                 tok->inp = tok->buf + newlen;
 881                                 tok->end = tok->inp + 1;
 882                                 tok->start = tok->buf + start;
 883                         }
 884                         else {
 885                                 tok->lineno++;
 886                                 if (tok->buf != NULL)
 887                                         PyMem_FREE(tok->buf);
 888                                 tok->buf = newtok;
 889                                 tok->line_start = tok->buf;
 890                                 tok->cur = tok->buf;
 891                                 tok->line_start = tok->buf;
 892                                 tok->inp = strchr(tok->buf, '\0');
 893                                 tok->end = tok->inp + 1;
 894                         }
 895                 }
 896                 else {
 897                         int done = 0;
 898                         Py_ssize_t cur = 0;
 899                         char *pt;
 900                         if (tok->start == NULL) {
 901                                 if (tok->buf == NULL) {
 902                                         tok->buf = (char *)
 903                                                 PyMem_MALLOC(BUFSIZ);
 904                                         if (tok->buf == NULL) {
 905                                                 tok->done = E_NOMEM;
 906                                                 return EOF;
 907                                         }
 908                                         tok->end = tok->buf + BUFSIZ;
 909                                 }
 910                                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
 911                                           tok) == NULL) {
 912                                         tok->done = E_EOF;
 913                                         done = 1;
 914                                 }
 915                                 else {
 916                                         tok->done = E_OK;
 917                                         tok->inp = strchr(tok->buf, '\0');
 918                                         done = tok->inp[-1] == '\n';
 919                                 }
 920                         }
 921                         else {
 922                                 cur = tok->cur - tok->buf;
 923                                 if (decoding_feof(tok)) {
 924                                         tok->done = E_EOF;
 925                                         done = 1;
 926                                 }
 927                                 else
 928                                         tok->done = E_OK;
 929                         }
 930                         tok->lineno++;
 931                         /* Read until '\n' or EOF */
 932                         while (!done) {
 933                                 Py_ssize_t curstart = tok->start == NULL ? -1 :
 934                                                   tok->start - tok->buf;
 935                                 Py_ssize_t curvalid = tok->inp - tok->buf;
 936                                 Py_ssize_t newsize = curvalid + BUFSIZ;
 937                                 char *newbuf = tok->buf;
 938                                 newbuf = (char *)PyMem_REALLOC(newbuf,
 939                                                                newsize);
 940                                 if (newbuf == NULL) {
 941                                         tok->done = E_NOMEM;
 942                                         tok->cur = tok->inp;
 943                                         return EOF;
 944                                 }
 945                                 tok->buf = newbuf;
 946                                 tok->inp = tok->buf + curvalid;
 947                                 tok->end = tok->buf + newsize;
 948                                 tok->start = curstart < 0 ? NULL :
 949                                              tok->buf + curstart;
 950                                 if (decoding_fgets(tok->inp,
 951                                                (int)(tok->end - tok->inp),
 952                                                tok) == NULL) {
 953                                         /* Break out early on decoding
 954                                            errors, as tok->buf will be NULL
 955                                          */
 956                                         if (tok->decoding_erred)
 957                                                 return EOF;
 958                                         /* Last line does not end in \n,
 959                                            fake one */
 960                                         strcpy(tok->inp, "\n");
 961                                 }
 962                                 tok->inp = strchr(tok->inp, '\0');
 963                                 done = tok->inp[-1] == '\n';
 964                         }
 965                         if (tok->buf != NULL) {
 966                                 tok->cur = tok->buf + cur;
 967                                 tok->line_start = tok->cur;
 968                                 /* replace "\r\n" with "\n" */
 969                                 /* For Mac leave the \r, giving a syntax error */
 970                                 pt = tok->inp - 2;
 971                                 if (pt >= tok->buf && *pt == '\r') {
 972                                         *pt++ = '\n';
 973                                         *pt = '\0';
 974                                         tok->inp = pt;
 975                                 }
 976                         }
 977                 }
 978                 if (tok->done != E_OK) {
 979                         if (tok->prompt != NULL)
 980                                 PySys_WriteStderr("\n");
 981                         tok->cur = tok->inp;
 982                         return EOF;
 983                 }
 984         }
 985         /*NOTREACHED*/
 986 }
 987
 988
 989 /* Back-up one character */
 990
 991 static void
 992 tok_backup(register struct tok_state *tok, register int c)
 993 {
 994         if (c != EOF) {
 995                 if (--tok->cur < tok->buf)
 996                         Py_FatalError("tok_backup: beginning of buffer");
 997                 if (*tok->cur != c)
 998                         *tok->cur = c;
 999         }
1000 }
1001
1002
1003 /* Return the token corresponding to a single character */
1004
1005 int
1006 PyToken_OneChar(int c)
1007 {
1008         switch (c) {
1009         case '(':       return LPAR;
1010         case ')':       return RPAR;
1011         case '[':       return LSQB;
1012         case ']':       return RSQB;
1013         case ':':       return COLON;
1014         case ',':       return COMMA;
1015         case ';':       return SEMI;
1016         case '+':       return PLUS;
1017         case '-':       return MINUS;
1018         case '*':       return STAR;
1019         case '/':       return SLASH;
1020         case '|':       return VBAR;
1021         case '&':       return AMPER;
1022         case '<':       return LESS;
1023         case '>':       return GREATER;
1024         case '=':       return EQUAL;
1025         case '.':       return DOT;
1026         case '%':       return PERCENT;
1027         case '`':       return BACKQUOTE;
1028         case '{':       return LBRACE;
1029         case '}':       return RBRACE;
1030         case '^':       return CIRCUMFLEX;
1031         case '~':       return TILDE;
1032         case '@':       return AT;
1033         default:        return OP;
1034         }
1035 }
1036
1037
1038 int
1039 PyToken_TwoChars(int c1, int c2)
1040 {
1041         switch (c1) {
1042         case '=':
1043                 switch (c2) {
1044                 case '=':       return EQEQUAL;
1045                 }
1046                 break;
1047         case '!':
1048                 switch (c2) {
1049                 case '=':       return NOTEQUAL;
1050                 }
1051                 break;
1052         case '<':
1053                 switch (c2) {
1054                 case '>':       return NOTEQUAL;
1055                 case '=':       return LESSEQUAL;
1056                 case '<':       return LEFTSHIFT;
1057                 }
1058                 break;
1059         case '>':
1060                 switch (c2) {
1061                 case '=':       return GREATEREQUAL;
1062                 case '>':       return RIGHTSHIFT;
1063                 }
1064                 break;
1065         case '+':
1066                 switch (c2) {
1067                 case '=':       return PLUSEQUAL;
1068                 }
1069                 break;
1070         case '-':
1071                 switch (c2) {
1072                 case '=':       return MINEQUAL;
1073                 }
1074                 break;
1075         case '*':
1076                 switch (c2) {
1077                 case '*':       return DOUBLESTAR;
1078                 case '=':       return STAREQUAL;
1079                 }
1080                 break;
1081         case '/':
1082                 switch (c2) {
1083                 case '/':       return DOUBLESLASH;
1084                 case '=':       return SLASHEQUAL;
1085                 }
1086                 break;
1087         case '|':
1088                 switch (c2) {
1089                 case '=':       return VBAREQUAL;
1090                 }
1091                 break;
1092         case '%':
1093                 switch (c2) {
1094                 case '=':       return PERCENTEQUAL;
1095                 }
1096                 break;
1097         case '&':
1098                 switch (c2) {
1099                 case '=':       return AMPEREQUAL;
1100                 }
1101                 break;
1102         case '^':
1103                 switch (c2) {
1104                 case '=':       return CIRCUMFLEXEQUAL;
1105                 }
1106                 break;
1107         }
1108         return OP;
1109 }
1110
1111 int
1112 PyToken_ThreeChars(int c1, int c2, int c3)
1113 {
1114         switch (c1) {
1115         case '<':
1116                 switch (c2) {
1117                 case '<':
1118                         switch (c3) {
1119                         case '=':
1120                                 return LEFTSHIFTEQUAL;
1121                         }
1122                         break;
1123                 }
1124                 break;
1125         case '>':
1126                 switch (c2) {
1127                 case '>':
1128                         switch (c3) {
1129                         case '=':
1130                                 return RIGHTSHIFTEQUAL;
1131                         }
1132                         break;
1133                 }
1134                 break;
1135         case '*':
1136                 switch (c2) {
1137                 case '*':
1138                         switch (c3) {
1139                         case '=':
1140                                 return DOUBLESTAREQUAL;
1141                         }
1142                         break;
1143                 }
1144                 break;
1145         case '/':
1146                 switch (c2) {
1147                 case '/':
1148                         switch (c3) {
1149                         case '=':
1150                                 return DOUBLESLASHEQUAL;
1151                         }
1152                         break;
1153                 }
1154                 break;
1155         }
1156         return OP;
1157 }
1158
1159 static int
1160 indenterror(struct tok_state *tok)
1161 {
1162         if (tok->alterror) {
1163                 tok->done = E_TABSPACE;
1164                 tok->cur = tok->inp;
1165                 return 1;
1166         }
1167         if (tok->altwarning) {
1168                 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1169                                   "in indentation\n", tok->filename);
1170                 tok->altwarning = 0;
1171         }
1172         return 0;
1173 }
1174
1175
1176 /* Get next token, after space stripping etc. */
1177
1178 static int
1179 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1180 {
1181         register int c;
1182         int blankline;
1183
1184         *p_start = *p_end = NULL;
1185   nextline:
1186         tok->start = NULL;
1187         blankline = 0;
1188
1189         /* Get indentation level */
1190         if (tok->atbol) {
1191                 register int col = 0;
1192                 register int altcol = 0;
1193                 tok->atbol = 0;
1194                 for (;;) {
1195                         c = tok_nextc(tok);
1196                         if (c == ' ')
1197                                 col++, altcol++;
1198                         else if (c == '\t') {
1199                                 col = (col/tok->tabsize + 1) * tok->tabsize;
1200                                 altcol = (altcol/tok->alttabsize + 1)
1201                                         * tok->alttabsize;
1202                         }
1203                         else if (c == '\014') /* Control-L (formfeed) */
1204                                 col = altcol = 0; /* For Emacs users */
1205                         else
1206                                 break;
1207                 }
1208                 tok_backup(tok, c);
1209                 if (c == '#' || c == '\n') {
1210                         /* Lines with only whitespace and/or comments
1211                            shouldn't affect the indentation and are
1212                            not passed to the parser as NEWLINE tokens,
1213                            except *totally* empty lines in interactive
1214                            mode, which signal the end of a command group. */
1215                         if (col == 0 && c == '\n' && tok->prompt != NULL)
1216                                 blankline = 0; /* Let it through */
1217                         else
1218                                 blankline = 1; /* Ignore completely */
1219                         /* We can't jump back right here since we still
1220                            may need to skip to the end of a comment */
1221                 }
1222                 if (!blankline && tok->level == 0) {
1223                         if (col == tok->indstack[tok->indent]) {
1224                                 /* No change */
1225                                 if (altcol != tok->altindstack[tok->indent]) {
1226                                         if (indenterror(tok))
1227                                                 return ERRORTOKEN;
1228                                 }
1229                         }
1230                         else if (col > tok->indstack[tok->indent]) {
1231                                 /* Indent -- always one */
1232                                 if (tok->indent+1 >= MAXINDENT) {
1233                                         tok->done = E_TOODEEP;
1234                                         tok->cur = tok->inp;
1235                                         return ERRORTOKEN;
1236                                 }
1237                                 if (altcol <= tok->altindstack[tok->indent]) {
1238                                         if (indenterror(tok))
1239                                                 return ERRORTOKEN;
1240                                 }
1241                                 tok->pendin++;
1242                                 tok->indstack[++tok->indent] = col;
1243                                 tok->altindstack[tok->indent] = altcol;
1244                         }
1245                         else /* col < tok->indstack[tok->indent] */ {
1246                                 /* Dedent -- any number, must be consistent */
1247                                 while (tok->indent > 0 &&
1248                                         col < tok->indstack[tok->indent]) {
1249                                         tok->pendin--;
1250                                         tok->indent--;
1251                                 }
1252                                 if (col != tok->indstack[tok->indent]) {
1253                                         tok->done = E_DEDENT;
1254                                         tok->cur = tok->inp;
1255                                         return ERRORTOKEN;
1256                                 }
1257                                 if (altcol != tok->altindstack[tok->indent]) {
1258                                         if (indenterror(tok))
1259                                                 return ERRORTOKEN;
1260                                 }
1261                         }
1262                 }
1263         }
1264
1265         tok->start = tok->cur;
1266
1267         /* Return pending indents/dedents */
1268         if (tok->pendin != 0) {
1269                 if (tok->pendin < 0) {
1270                         tok->pendin++;
1271                         return DEDENT;
1272                 }
1273                 else {
1274                         tok->pendin--;
1275                         return INDENT;
1276                 }
1277         }
1278
1279  again:
1280         tok->start = NULL;
1281         /* Skip spaces */
1282         do {
1283                 c = tok_nextc(tok);
1284         } while (c == ' ' || c == '\t' || c == '\014');
1285
1286         /* Set start of current token */
1287         tok->start = tok->cur - 1;
1288
1289         /* Skip comment, while looking for tab-setting magic */
1290         if (c == '#') {
1291                 static char *tabforms[] = {
1292                         "tab-width:",           /* Emacs */
1293                         ":tabstop=",            /* vim, full form */
1294                         ":ts=",                 /* vim, abbreviated form */
1295                         "set tabsize=",         /* will vi never die? */
1296                 /* more templates can be added here to support other editors */
1297                 };
1298                 char cbuf[80];
1299                 char *tp, **cp;
1300                 tp = cbuf;
1301                 do {
1302                         *tp++ = c = tok_nextc(tok);
1303                 } while (c != EOF && c != '\n' &&
1304                          (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1305                 *tp = '\0';
1306                 for (cp = tabforms;
1307                      cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1308                      cp++) {
1309                         if ((tp = strstr(cbuf, *cp))) {
1310                                 int newsize = atoi(tp + strlen(*cp));
1311
1312                                 if (newsize >= 1 && newsize <= 40) {
1313                                         tok->tabsize = newsize;
1314                                         if (Py_VerboseFlag)
1315                                             PySys_WriteStderr(
1316                                                 "Tab size set to %d\n",
1317                                                 newsize);
1318                                 }
1319                         }
1320                 }
1321                 while (c != EOF && c != '\n')
1322                         c = tok_nextc(tok);
1323         }
1324
1325         /* Check for EOF and errors now */
1326         if (c == EOF) {
1327                 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1328         }
1329
1330         /* Identifier (most frequent token!) */
1331         if (isalpha(c) || c == '_') {
1332                 /* Process r"", u"" and ur"" */
1333                 switch (c) {
1334                 case 'b':
1335                 case 'B':
1336                         c = tok_nextc(tok);
1337                         if (c == 'r' || c == 'R')
1338                                 c = tok_nextc(tok);
1339                         if (c == '"' || c == '\'')
1340                                 goto letter_quote;
1341                         break;
1342                 case 'r':
1343                 case 'R':
1344                         c = tok_nextc(tok);
1345                         if (c == '"' || c == '\'')
1346                                 goto letter_quote;
1347                         break;
1348                 case 'u':
1349                 case 'U':
1350                         c = tok_nextc(tok);
1351                         if (c == 'r' || c == 'R')
1352                                 c = tok_nextc(tok);
1353                         if (c == '"' || c == '\'')
1354                                 goto letter_quote;
1355                         break;
1356                 }
1357                 while (isalnum(c) || c == '_') {
1358                         c = tok_nextc(tok);
1359                 }
1360                 tok_backup(tok, c);
1361                 *p_start = tok->start;
1362                 *p_end = tok->cur;
1363                 return NAME;
1364         }
1365
1366         /* Newline */
1367         if (c == '\n') {
1368                 tok->atbol = 1;
1369                 if (blankline || tok->level > 0)
1370                         goto nextline;
1371                 *p_start = tok->start;
1372                 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1373                 tok->cont_line = 0;
1374                 return NEWLINE;
1375         }
1376
1377         /* Period or number starting with period? */
1378         if (c == '.') {
1379                 c = tok_nextc(tok);
1380                 if (isdigit(c)) {
1381                         goto fraction;
1382                 }
1383                 else {
1384                         tok_backup(tok, c);
1385                         *p_start = tok->start;
1386                         *p_end = tok->cur;
1387                         return DOT;
1388                 }
1389         }
1390
1391         /* Number */
1392         if (isdigit(c)) {
1393                 if (c == '0') {
1394                         /* Hex, octal or binary -- maybe. */
1395                         c = tok_nextc(tok);
1396                         if (c == '.')
1397                                 goto fraction;
1398 #ifndef WITHOUT_COMPLEX
1399                         if (c == 'j' || c == 'J')
1400                                 goto imaginary;
1401 #endif
1402                         if (c == 'x' || c == 'X') {
1403
1404                                 /* Hex */
1405                                 c = tok_nextc(tok);
1406                                 if (!isxdigit(c)) {
1407                                         tok->done = E_TOKEN;
1408                                         tok_backup(tok, c);
1409                                         return ERRORTOKEN;
1410                                 }
1411                                 do {
1412                                         c = tok_nextc(tok);
1413                                 } while (isxdigit(c));
1414                         }
1415                         else if (c == 'o' || c == 'O') {
1416                                 /* Octal */
1417                                 c = tok_nextc(tok);
1418                                 if (c < '0' || c >= '8') {
1419                                         tok->done = E_TOKEN;
1420                                         tok_backup(tok, c);
1421                                         return ERRORTOKEN;
1422                                 }
1423                                 do {
1424                                         c = tok_nextc(tok);
1425                                 } while ('0' <= c && c < '8');
1426                         }
1427                         else if (c == 'b' || c == 'B') {
1428                                 /* Binary */
1429                                 c = tok_nextc(tok);
1430                                 if (c != '0' && c != '1') {
1431                                         tok->done = E_TOKEN;
1432                                         tok_backup(tok, c);
1433                                         return ERRORTOKEN;
1434                                 }
1435                                 do {
1436                                         c = tok_nextc(tok);
1437                                 } while (c == '0' || c == '1');
1438                         }
1439                         else {
1440                                 int found_decimal = 0;
1441                                 /* Octal; c is first char of it */
1442                                 /* There's no 'isoctdigit' macro, sigh */
1443                                 while ('0' <= c && c < '8') {
1444                                         c = tok_nextc(tok);
1445                                 }
1446                                 if (isdigit(c)) {
1447                                         found_decimal = 1;
1448                                         do {
1449                                                 c = tok_nextc(tok);
1450                                         } while (isdigit(c));
1451                                 }
1452                                 if (c == '.')
1453                                         goto fraction;
1454                                 else if (c == 'e' || c == 'E')
1455                                         goto exponent;
1456 #ifndef WITHOUT_COMPLEX
1457                                 else if (c == 'j' || c == 'J')
1458                                         goto imaginary;
1459 #endif
1460                                 else if (found_decimal) {
1461                                         tok->done = E_TOKEN;
1462                                         tok_backup(tok, c);
1463                                         return ERRORTOKEN;
1464                                 }
1465                         }
1466                         if (c == 'l' || c == 'L')
1467                                 c = tok_nextc(tok);
1468                 }
1469                 else {
1470                         /* Decimal */
1471                         do {
1472                                 c = tok_nextc(tok);
1473                         } while (isdigit(c));
1474                         if (c == 'l' || c == 'L')
1475                                 c = tok_nextc(tok);
1476                         else {
1477                                 /* Accept floating point numbers. */
1478                                 if (c == '.') {
1479                 fraction:
1480                                         /* Fraction */
1481                                         do {
1482                                                 c = tok_nextc(tok);
1483                                         } while (isdigit(c));
1484                                 }
1485                                 if (c == 'e' || c == 'E') {
1486                 exponent:
1487                                         /* Exponent part */
1488                                         c = tok_nextc(tok);
1489                                         if (c == '+' || c == '-')
1490                                                 c = tok_nextc(tok);
1491                                         if (!isdigit(c)) {
1492                                                 tok->done = E_TOKEN;
1493                                                 tok_backup(tok, c);
1494                                                 return ERRORTOKEN;
1495                                         }
1496                                         do {
1497                                                 c = tok_nextc(tok);
1498                                         } while (isdigit(c));
1499                                 }
1500 #ifndef WITHOUT_COMPLEX
1501                                 if (c == 'j' || c == 'J')
1502                                         /* Imaginary part */
1503                 imaginary:
1504                                         c = tok_nextc(tok);
1505 #endif
1506                         }
1507                 }
1508                 tok_backup(tok, c);
1509                 *p_start = tok->start;
1510                 *p_end = tok->cur;
1511                 return NUMBER;
1512         }
1513
1514   letter_quote:
1515         /* String */
1516         if (c == '\'' || c == '"') {
1517                 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1518                 int quote = c;
1519                 int triple = 0;
1520                 int tripcount = 0;
1521                 for (;;) {
1522                         c = tok_nextc(tok);
1523                         if (c == '\n') {
1524                                 if (!triple) {
1525                                         tok->done = E_EOLS;
1526                                         tok_backup(tok, c);
1527                                         return ERRORTOKEN;
1528                                 }
1529                                 tripcount = 0;
1530                                 tok->cont_line = 1; /* multiline string. */
1531                         }
1532                         else if (c == EOF) {
1533                                 if (triple)
1534                                         tok->done = E_EOFS;
1535                                 else
1536                                         tok->done = E_EOLS;
1537                                 tok->cur = tok->inp;
1538                                 return ERRORTOKEN;
1539                         }
1540                         else if (c == quote) {
1541                                 tripcount++;
1542                                 if (tok->cur - tok->start == quote2) {
1543                                         c = tok_nextc(tok);
1544                                         if (c == quote) {
1545                                                 triple = 1;
1546                                                 tripcount = 0;
1547                                                 continue;
1548                                         }
1549                                         tok_backup(tok, c);
1550                                 }
1551                                 if (!triple || tripcount == 3)
1552                                         break;
1553                         }
1554                         else if (c == '\\') {
1555                                 tripcount = 0;
1556                                 c = tok_nextc(tok);
1557                                 if (c == EOF) {
1558                                         tok->done = E_EOLS;
1559                                         tok->cur = tok->inp;
1560                                         return ERRORTOKEN;
1561                                 }
1562                         }
1563                         else
1564                                 tripcount = 0;
1565                 }
1566                 *p_start = tok->start;
1567                 *p_end = tok->cur;
1568                 return STRING;
1569         }
1570
1571         /* Line continuation */
1572         if (c == '\\') {
1573                 c = tok_nextc(tok);
1574                 if (c != '\n') {
1575                         tok->done = E_LINECONT;
1576                         tok->cur = tok->inp;
1577                         return ERRORTOKEN;
1578                 }
1579                 tok->cont_line = 1;
1580                 goto again; /* Read next line */
1581         }
1582
1583         /* Check for two-character token */
1584         {
1585                 int c2 = tok_nextc(tok);
1586                 int token = PyToken_TwoChars(c, c2);
1587 #ifndef PGEN
1588                 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1589                         if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1590                                                "<> not supported in 3.x; use !=",
1591                                                tok->filename, tok->lineno,
1592                                                NULL, NULL)) {
1593                                 return ERRORTOKEN;
1594                         }
1595                 }
1596 #endif
1597                 if (token != OP) {
1598                         int c3 = tok_nextc(tok);
1599                         int token3 = PyToken_ThreeChars(c, c2, c3);
1600                         if (token3 != OP) {
1601                                 token = token3;
1602                         } else {
1603                                 tok_backup(tok, c3);
1604                         }
1605                         *p_start = tok->start;
1606                         *p_end = tok->cur;
1607                         return token;
1608                 }
1609                 tok_backup(tok, c2);
1610         }
1611
1612         /* Keep track of parentheses nesting level */
1613         switch (c) {
1614         case '(':
1615         case '[':
1616         case '{':
1617                 tok->level++;
1618                 break;
1619         case ')':
1620         case ']':
1621         case '}':
1622                 tok->level--;
1623                 break;
1624         }
1625
1626         /* Punctuation character */
1627         *p_start = tok->start;
1628         *p_end = tok->cur;
1629         return PyToken_OneChar(c);
1630 }
1631
1632 int
1633 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1634 {
1635         int result = tok_get(tok, p_start, p_end);
1636         if (tok->decoding_erred) {
1637                 result = ERRORTOKEN;
1638                 tok->done = E_DECODE;
1639         }
1640         return result;
1641 }
1642
1643 /* This function is only called from parsetok. However, it cannot live
1644    there, as it must be empty for PGEN, and we can check for PGEN only
1645    in this file. */
1646
1647 #if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub used when PGEN is defined or Py_USING_UNICODE is not: no
   re-encoding is performed.  None of the arguments are read or
   modified, and the result is always NULL. */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
        char *restored = NULL;

        return restored;
}
1653 #else
1654 #ifdef Py_USING_UNICODE
1655 static PyObject *
1656 dec_utf8(const char *enc, const char *text, size_t len) {
1657         PyObject *ret = NULL;
1658         PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1659         if (unicode_text) {
1660                 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1661                 Py_DECREF(unicode_text);
1662         }
1663         if (!ret) {
1664                 PyErr_Clear();
1665         }
1666         return ret;
1667 }
1668 char *
1669 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1670 {
1671         char *text = NULL;
1672         if (tok->encoding) {
1673                 /* convert source to original encondig */
1674                 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1675                 if (lineobj != NULL) {
1676                         int linelen = PyString_Size(lineobj);
1677                         const char *line = PyString_AsString(lineobj);
1678                         text = PyObject_MALLOC(linelen + 1);
1679                         if (text != NULL && line != NULL) {
1680                                 if (linelen)
1681                                         strncpy(text, line, linelen);
1682                                 text[linelen] = '\0';
1683                         }
1684                         Py_DECREF(lineobj);
1685
1686                         /* adjust error offset */
1687                         if (*offset > 1) {
1688                                 PyObject *offsetobj = dec_utf8(tok->encoding,
1689                                                                tok->buf, *offset-1);
1690                                 if (offsetobj) {
1691                                         *offset = PyString_Size(offsetobj) + 1;
1692                                         Py_DECREF(offsetobj);
1693                                 }
1694                         }
1695
1696                 }
1697         }
1698         return text;
1699
1700 }
1701 #endif /* defined(Py_USING_UNICODE) */
1702 #endif
1703
1704
1705 #ifdef Py_DEBUG
1706
1707 void
1708 tok_dump(int type, char *start, char *end)
1709 {
1710         printf("%s", _PyParser_TokenNames[type]);
1711         if (type == NAME || type == NUMBER || type == STRING || type == OP)
1712                 printf("(%.*s)", (int)(end - start), start);
1713 }
1714
1715 #endif