Parser/tokenizer.c

   1
   2 /* Tokenizer implementation */
   3
   4 #include "Python.h"
   5 #include "pgenheaders.h"
   6
   7 #include <ctype.h>
   8 #include <assert.h>
   9
  10 #include "tokenizer.h"
  11 #include "errcode.h"
  12
  13 #ifndef PGEN
  14 #include "unicodeobject.h"
  15 #include "stringobject.h"
  16 #include "fileobject.h"
  17 #include "codecs.h"
  18 #include "abstract.h"
  19 #include "pydebug.h"
  20 #endif /* PGEN */
  21
  22 extern char *PyOS_Readline(FILE *, FILE *, char *);
  23 /* Return malloc'ed string including trailing \n;
  24    empty malloc'ed string for EOF;
  25    NULL if interrupted */
  26
  27 /* Don't ever change this -- it would break the portability of Python code */
  28 #define TABSIZE 8
  29
  30 /* Forward */
  31 static struct tok_state *tok_new(void);
  32 static int tok_nextc(struct tok_state *tok);
  33 static void tok_backup(struct tok_state *tok, int c);
  34
  35 /* Token names */
  36
  37 char *_PyParser_TokenNames[] = {
  38         "ENDMARKER",
  39         "NAME",
  40         "NUMBER",
  41         "STRING",
  42         "NEWLINE",
  43         "INDENT",
  44         "DEDENT",
  45         "LPAR",
  46         "RPAR",
  47         "LSQB",
  48         "RSQB",
  49         "COLON",
  50         "COMMA",
  51         "SEMI",
  52         "PLUS",
  53         "MINUS",
  54         "STAR",
  55         "SLASH",
  56         "VBAR",
  57         "AMPER",
  58         "LESS",
  59         "GREATER",
  60         "EQUAL",
  61         "DOT",
  62         "PERCENT",
  63         "BACKQUOTE",
  64         "LBRACE",
  65         "RBRACE",
  66         "EQEQUAL",
  67         "NOTEQUAL",
  68         "LESSEQUAL",
  69         "GREATEREQUAL",
  70         "TILDE",
  71         "CIRCUMFLEX",
  72         "LEFTSHIFT",
  73         "RIGHTSHIFT",
  74         "DOUBLESTAR",
  75         "PLUSEQUAL",
  76         "MINEQUAL",
  77         "STAREQUAL",
  78         "SLASHEQUAL",
  79         "PERCENTEQUAL",
  80         "AMPEREQUAL",
  81         "VBAREQUAL",
  82         "CIRCUMFLEXEQUAL",
  83         "LEFTSHIFTEQUAL",
  84         "RIGHTSHIFTEQUAL",
  85         "DOUBLESTAREQUAL",
  86         "DOUBLESLASH",
  87         "DOUBLESLASHEQUAL",
  88         "AT",
  89         /* This table must match the #defines in token.h! */
  90         "OP",
  91         "<ERRORTOKEN>",
  92         "<N_TOKENS>"
  93 };
  94
  95
  96 /* Create and initialize a new tok_state structure */
  97
  98 static struct tok_state *
  99 tok_new(void)
 100 {
 101         struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
 102                                                 sizeof(struct tok_state));
 103         if (tok == NULL)
 104                 return NULL;
 105         tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
 106         tok->done = E_OK;
 107         tok->fp = NULL;
 108         tok->input = NULL;
 109         tok->tabsize = TABSIZE;
 110         tok->indent = 0;
 111         tok->indstack[0] = 0;
 112         tok->atbol = 1;
 113         tok->pendin = 0;
 114         tok->prompt = tok->nextprompt = NULL;
 115         tok->lineno = 0;
 116         tok->level = 0;
 117         tok->filename = NULL;
 118         tok->altwarning = 0;
 119         tok->alterror = 0;
 120         tok->alttabsize = 1;
 121         tok->altindstack[0] = 0;
 122         tok->decoding_state = 0;
 123         tok->decoding_erred = 0;
 124         tok->read_coding_spec = 0;
 125         tok->encoding = NULL;
 126         tok->cont_line = 0;
 127 #ifndef PGEN
 128         tok->decoding_readline = NULL;
 129         tok->decoding_buffer = NULL;
 130 #endif
 131         return tok;
 132 }
 133
 134 static char *
 135 new_string(const char *s, Py_ssize_t len)
 136 {
 137         char* result = (char *)PyMem_MALLOC(len + 1);
 138         if (result != NULL) {
 139                 memcpy(result, s, len);
 140                 result[len] = '\0';
 141         }
 142         return result;
 143 }
 144
 145 #ifdef PGEN
 146
 147 static char *
 148 decoding_fgets(char *s, int size, struct tok_state *tok)
 149 {
 150         return fgets(s, size, tok->fp);
 151 }
 152
 153 static int
 154 decoding_feof(struct tok_state *tok)
 155 {
 156         return feof(tok->fp);
 157 }
 158
 159 static char *
 160 decode_str(const char *str, int exec_input, struct tok_state *tok)
 161 {
 162         return new_string(str, strlen(str));
 163 }
 164
 165 #else /* PGEN */
 166
 167 static char *
 168 error_ret(struct tok_state *tok) /* XXX */
 169 {
 170         tok->decoding_erred = 1;
 171         if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
 172                 PyMem_FREE(tok->buf);
 173         tok->buf = NULL;
 174         return NULL;            /* as if it were EOF */
 175 }
 176
 177
 178 static char *
 179 get_normal_name(char *s)        /* for utf-8 and latin-1 */
 180 {
 181         char buf[13];
 182         int i;
 183         for (i = 0; i < 12; i++) {
 184                 int c = s[i];
 185                 if (c == '\0')
 186                         break;
 187                 else if (c == '_')
 188                         buf[i] = '-';
 189                 else
 190                         buf[i] = tolower(c);
 191         }
 192         buf[i] = '\0';
 193         if (strcmp(buf, "utf-8") == 0 ||
 194             strncmp(buf, "utf-8-", 6) == 0)
 195                 return "utf-8";
 196         else if (strcmp(buf, "latin-1") == 0 ||
 197                  strcmp(buf, "iso-8859-1") == 0 ||
 198                  strcmp(buf, "iso-latin-1") == 0 ||
 199                  strncmp(buf, "latin-1-", 8) == 0 ||
 200                  strncmp(buf, "iso-8859-1-", 11) == 0 ||
 201                  strncmp(buf, "iso-latin-1-", 12) == 0)
 202                 return "iso-8859-1";
 203         else
 204                 return s;
 205 }
 206
 207 /* Return the coding spec in S, or NULL if none is found.  */
 208
 209 static char *
 210 get_coding_spec(const char *s, Py_ssize_t size)
 211 {
 212         Py_ssize_t i;
 213         /* Coding spec must be in a comment, and that comment must be
 214          * the only statement on the source code line. */
 215         for (i = 0; i < size - 6; i++) {
 216                 if (s[i] == '#')
 217                         break;
 218                 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
 219                         return NULL;
 220         }
 221         for (; i < size - 6; i++) { /* XXX inefficient search */
 222                 const char* t = s + i;
 223                 if (strncmp(t, "coding", 6) == 0) {
 224                         const char* begin = NULL;
 225                         t += 6;
 226                         if (t[0] != ':' && t[0] != '=')
 227                                 continue;
 228                         do {
 229                                 t++;
 230                         } while (t[0] == '\x20' || t[0] == '\t');
 231
 232                         begin = t;
 233                         while (isalnum(Py_CHARMASK(t[0])) ||
 234                                t[0] == '-' || t[0] == '_' || t[0] == '.')
 235                                 t++;
 236
 237                         if (begin < t) {
 238                                 char* r = new_string(begin, t - begin);
 239                                 char* q = get_normal_name(r);
 240                                 if (r != q) {
 241                                         PyMem_FREE(r);
 242                                         r = new_string(q, strlen(q));
 243                                 }
 244                                 return r;
 245                         }
 246                 }
 247         }
 248         return NULL;
 249 }
 250
 251 /* Check whether the line contains a coding spec. If it does,
 252    invoke the set_readline function for the new encoding.
 253    This function receives the tok_state and the new encoding.
 254    Return 1 on success, 0 on failure.  */
 255
 256 static int
 257 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
 258                   int set_readline(struct tok_state *, const char *))
 259 {
 260         char * cs;
 261         int r = 1;
 262
 263         if (tok->cont_line)
 264                 /* It's a continuation line, so it can't be a coding spec. */
 265                 return 1;
 266         cs = get_coding_spec(line, size);
 267         if (cs != NULL) {
 268                 tok->read_coding_spec = 1;
 269                 if (tok->encoding == NULL) {
 270                         assert(tok->decoding_state == 1); /* raw */
 271                         if (strcmp(cs, "utf-8") == 0 ||
 272                             strcmp(cs, "iso-8859-1") == 0) {
 273                                 tok->encoding = cs;
 274                         } else {
 275 #ifdef Py_USING_UNICODE
 276                                 r = set_readline(tok, cs);
 277                                 if (r) {
 278                                         tok->encoding = cs;
 279                                         tok->decoding_state = -1;
 280                                 }
 281                                 else
 282                                         PyMem_FREE(cs);
 283 #else
 284                                 /* Without Unicode support, we cannot
 285                                    process the coding spec. Since there
 286                                    won't be any Unicode literals, that
 287                                    won't matter. */
 288                                 PyMem_FREE(cs);
 289 #endif
 290                         }
 291                 } else {        /* then, compare cs with BOM */
 292                         r = (strcmp(tok->encoding, cs) == 0);
 293                         PyMem_FREE(cs);
 294                 }
 295         }
 296         if (!r) {
 297                 cs = tok->encoding;
 298                 if (!cs)
 299                         cs = "with BOM";
 300                 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
 301         }
 302         return r;
 303 }
 304
 305 /* See whether the file starts with a BOM. If it does,
 306    invoke the set_readline function with the new encoding.
 307    Return 1 on success, 0 on failure.  */
 308
 309 static int
 310 check_bom(int get_char(struct tok_state *),
 311           void unget_char(int, struct tok_state *),
 312           int set_readline(struct tok_state *, const char *),
 313           struct tok_state *tok)
 314 {
 315         int ch = get_char(tok);
 316         tok->decoding_state = 1;
 317         if (ch == EOF) {
 318                 return 1;
 319         } else if (ch == 0xEF) {
 320                 ch = get_char(tok);
 321                 if (ch != 0xBB)
 322                         goto NON_BOM;
 323                 ch = get_char(tok);
 324                 if (ch != 0xBF)
 325                         goto NON_BOM;
 326 #if 0
 327         /* Disable support for UTF-16 BOMs until a decision
 328            is made whether this needs to be supported.  */
 329         } else if (ch == 0xFE) {
 330                 ch = get_char(tok);
 331                 if (ch != 0xFF)
 332                         goto NON_BOM;
 333                 if (!set_readline(tok, "utf-16-be"))
 334                         return 0;
 335                 tok->decoding_state = -1;
 336         } else if (ch == 0xFF) {
 337                 ch = get_char(tok);
 338                 if (ch != 0xFE)
 339                         goto NON_BOM;
 340                 if (!set_readline(tok, "utf-16-le"))
 341                         return 0;
 342                 tok->decoding_state = -1;
 343 #endif
 344         } else {
 345                 unget_char(ch, tok);
 346                 return 1;
 347         }
 348         if (tok->encoding != NULL)
 349                 PyMem_FREE(tok->encoding);
 350         tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
 351         return 1;
 352   NON_BOM:
 353         /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
 354         unget_char(0xFF, tok);  /* XXX this will cause a syntax error */
 355         return 1;
 356 }
 357
 358 /* Read a line of text from TOK into S, using the stream in TOK.
 359    Return NULL on failure, else S.
 360
 361    On entry, tok->decoding_buffer will be one of:
 362      1) NULL: need to call tok->decoding_readline to get a new line
 363      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
 364            stored the result in tok->decoding_buffer
 365      3) PyStringObject *: previous call to fp_readl did not have enough room
 366            (in the s buffer) to copy entire contents of the line read
 367            by tok->decoding_readline.  tok->decoding_buffer has the overflow.
 368            In this case, fp_readl is called in a loop (with an expanded buffer)
 369            until the buffer ends with a '\n' (or until the end of the file is
 370            reached): see tok_nextc and its calls to decoding_fgets.
 371 */
 372
 373 static char *
 374 fp_readl(char *s, int size, struct tok_state *tok)
 375 {
 376 #ifndef Py_USING_UNICODE
 377         /* In a non-Unicode built, this should never be called. */
 378         Py_FatalError("fp_readl should not be called in this build.");
 379         return NULL; /* Keep compiler happy (not reachable) */
 380 #else
 381         PyObject* utf8 = NULL;
 382         PyObject* buf = tok->decoding_buffer;
 383         char *str;
 384         Py_ssize_t utf8len;
 385
 386         /* Ask for one less byte so we can terminate it */
 387         assert(size > 0);
 388         size--;
 389
 390         if (buf == NULL) {
 391                 buf = PyObject_CallObject(tok->decoding_readline, NULL);
 392                 if (buf == NULL)
 393                         return error_ret(tok);
 394         } else {
 395                 tok->decoding_buffer = NULL;
 396                 if (PyString_CheckExact(buf))
 397                         utf8 = buf;
 398         }
 399         if (utf8 == NULL) {
 400                 utf8 = PyUnicode_AsUTF8String(buf);
 401                 Py_DECREF(buf);
 402                 if (utf8 == NULL)
 403                         return error_ret(tok);
 404         }
 405         str = PyString_AsString(utf8);
 406         utf8len = PyString_GET_SIZE(utf8);
 407         if (utf8len > size) {
 408                 tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
 409                 if (tok->decoding_buffer == NULL) {
 410                         Py_DECREF(utf8);
 411                         return error_ret(tok);
 412                 }
 413                 utf8len = size;
 414         }
 415         memcpy(s, str, utf8len);
 416         s[utf8len] = '\0';
 417         Py_DECREF(utf8);
 418         if (utf8len == 0)
 419                 return NULL; /* EOF */
 420         return s;
 421 #endif
 422 }
 423
 424 /* Set the readline function for TOK to a StreamReader's
 425    readline function. The StreamReader is named ENC.
 426
 427    This function is called from check_bom and check_coding_spec.
 428
 429    ENC is usually identical to the future value of tok->encoding,
 430    except for the (currently unsupported) case of UTF-16.
 431
 432    Return 1 on success, 0 on failure. */
 433
 434 static int
 435 fp_setreadl(struct tok_state *tok, const char* enc)
 436 {
 437         PyObject *reader, *stream, *readline;
 438
 439         /* XXX: constify filename argument. */
 440         stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
 441         if (stream == NULL)
 442                 return 0;
 443
 444         reader = PyCodec_StreamReader(enc, stream, NULL);
 445         Py_DECREF(stream);
 446         if (reader == NULL)
 447                 return 0;
 448
 449         readline = PyObject_GetAttrString(reader, "readline");
 450         Py_DECREF(reader);
 451         if (readline == NULL)
 452                 return 0;
 453
 454         tok->decoding_readline = readline;
 455         return 1;
 456 }
 457
 458 /* Fetch the next byte from TOK. */
 459
 460 static int fp_getc(struct tok_state *tok) {
 461         return getc(tok->fp);
 462 }
 463
 464 /* Unfetch the last byte back into TOK.  */
 465
 466 static void fp_ungetc(int c, struct tok_state *tok) {
 467         ungetc(c, tok->fp);
 468 }
 469
 470 /* Read a line of input from TOK. Determine encoding
 471    if necessary.  */
 472
 473 static char *
 474 decoding_fgets(char *s, int size, struct tok_state *tok)
 475 {
 476         char *line = NULL;
 477         int badchar = 0;
 478         for (;;) {
 479                 if (tok->decoding_state < 0) {
 480                         /* We already have a codec associated with
 481                            this input. */
 482                         line = fp_readl(s, size, tok);
 483                         break;
 484                 } else if (tok->decoding_state > 0) {
 485                         /* We want a 'raw' read. */
 486                         line = Py_UniversalNewlineFgets(s, size,
 487                                                         tok->fp, NULL);
 488                         break;
 489                 } else {
 490                         /* We have not yet determined the encoding.
 491                            If an encoding is found, use the file-pointer
 492                            reader functions from now on. */
 493                         if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
 494                                 return error_ret(tok);
 495                         assert(tok->decoding_state != 0);
 496                 }
 497         }
 498         if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
 499                 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
 500                         return error_ret(tok);
 501                 }
 502         }
 503 #ifndef PGEN
 504         /* The default encoding is ASCII, so make sure we don't have any
 505            non-ASCII bytes in it. */
 506         if (line && !tok->encoding) {
 507                 unsigned char *c;
 508                 for (c = (unsigned char *)line; *c; c++)
 509                         if (*c > 127) {
 510                                 badchar = *c;
 511                                 break;
 512                         }
 513         }
 514         if (badchar) {
 515                 char buf[500];
 516                 /* Need to add 1 to the line number, since this line
 517                    has not been counted, yet.  */
 518                 sprintf(buf,
 519                         "Non-ASCII character '\\x%.2x' "
 520                         "in file %.200s on line %i, "
 521                         "but no encoding declared; "
 522                         "see http://www.python.org/peps/pep-0263.html for details",
 523                         badchar, tok->filename, tok->lineno + 1);
 524                 PyErr_SetString(PyExc_SyntaxError, buf);
 525                 return error_ret(tok);
 526         }
 527 #endif
 528         return line;
 529 }
 530
 531 static int
 532 decoding_feof(struct tok_state *tok)
 533 {
 534         if (tok->decoding_state >= 0) {
 535                 return feof(tok->fp);
 536         } else {
 537                 PyObject* buf = tok->decoding_buffer;
 538                 if (buf == NULL) {
 539                         buf = PyObject_CallObject(tok->decoding_readline, NULL);
 540                         if (buf == NULL) {
 541                                 error_ret(tok);
 542                                 return 1;
 543                         } else {
 544                                 tok->decoding_buffer = buf;
 545                         }
 546                 }
 547                 return PyObject_Length(buf) == 0;
 548         }
 549 }
 550
 551 /* Fetch a byte from TOK, using the string buffer. */
 552
 553 static int
 554 buf_getc(struct tok_state *tok) {
 555         return Py_CHARMASK(*tok->str++);
 556 }
 557
 558 /* Unfetch a byte from TOK, using the string buffer. */
 559
 560 static void
 561 buf_ungetc(int c, struct tok_state *tok) {
 562         tok->str--;
 563         assert(Py_CHARMASK(*tok->str) == c);    /* tok->cur may point to read-only segment */
 564 }
 565
 566 /* Set the readline function for TOK to ENC. For the string-based
 567    tokenizer, this means to just record the encoding. */
 568
 569 static int
 570 buf_setreadl(struct tok_state *tok, const char* enc) {
 571         tok->enc = enc;
 572         return 1;
 573 }
 574
 575 /* Return a UTF-8 encoding Python string object from the
 576    C byte string STR, which is encoded with ENC. */
 577
 578 #ifdef Py_USING_UNICODE
 579 static PyObject *
 580 translate_into_utf8(const char* str, const char* enc) {
 581         PyObject *utf8;
 582         PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
 583         if (buf == NULL)
 584                 return NULL;
 585         utf8 = PyUnicode_AsUTF8String(buf);
 586         Py_DECREF(buf);
 587         return utf8;
 588 }
 589 #endif
 590
 591
 592 static char *
 593 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
 594         int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;
 595         char *buf, *current;
 596         char c = '\0';
 597         buf = PyMem_MALLOC(needed_length);
 598         if (buf == NULL) {
 599                 tok->done = E_NOMEM;
 600                 return NULL;
 601         }
 602         for (current = buf; *s; s++, current++) {
 603                 c = *s;
 604                 if (skip_next_lf) {
 605                         skip_next_lf = 0;
 606                         if (c == '\n') {
 607                                 c = *++s;
 608                                 if (!c)
 609                                         break;
 610                         }
 611                 }
 612                 if (c == '\r') {
 613                         skip_next_lf = 1;
 614                         c = '\n';
 615                 }
 616                 *current = c;
 617         }
 618         /* If this is exec input, add a newline to the end of the string if
 619            there isn't one already. */
 620         if (exec_input && c != '\n') {
 621                 *current = '\n';
 622                 current++;
 623         }
 624         *current = '\0';
 625         final_length = current - buf + 1;
 626         if (final_length < needed_length && final_length)
 627                 /* should never fail */
 628                 buf = PyMem_REALLOC(buf, final_length);
 629         return buf;
 630 }
 631
 632 /* Decode a byte string STR for use as the buffer of TOK.
 633    Look for encoding declarations inside STR, and record them
 634    inside TOK.  */
 635
 636 static const char *
 637 decode_str(const char *input, int single, struct tok_state *tok)
 638 {
 639         PyObject* utf8 = NULL;
 640         const char *str;
 641         const char *s;
 642         const char *newl[2] = {NULL, NULL};
 643         int lineno = 0;
 644         tok->input = str = translate_newlines(input, single, tok);
 645         if (str == NULL)
 646                 return NULL;
 647         tok->enc = NULL;
 648         tok->str = str;
 649         if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
 650                 return error_ret(tok);
 651         str = tok->str;         /* string after BOM if any */
 652         assert(str);
 653 #ifdef Py_USING_UNICODE
 654         if (tok->enc != NULL) {
 655                 utf8 = translate_into_utf8(str, tok->enc);
 656                 if (utf8 == NULL)
 657                         return error_ret(tok);
 658                 str = PyString_AsString(utf8);
 659         }
 660 #endif
 661         for (s = str;; s++) {
 662                 if (*s == '\0') break;
 663                 else if (*s == '\n') {
 664                         assert(lineno < 2);
 665                         newl[lineno] = s;
 666                         lineno++;
 667                         if (lineno == 2) break;
 668                 }
 669         }
 670         tok->enc = NULL;
 671         /* need to check line 1 and 2 separately since check_coding_spec
 672            assumes a single line as input */
 673         if (newl[0]) {
 674                 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
 675                         return error_ret(tok);
 676                 if (tok->enc == NULL && newl[1]) {
 677                         if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
 678                                                tok, buf_setreadl))
 679                                 return error_ret(tok);
 680                 }
 681         }
 682 #ifdef Py_USING_UNICODE
 683         if (tok->enc != NULL) {
 684                 assert(utf8 == NULL);
 685                 utf8 = translate_into_utf8(str, tok->enc);
 686                 if (utf8 == NULL)
 687                         return error_ret(tok);
 688                 str = PyString_AsString(utf8);
 689         }
 690 #endif
 691         assert(tok->decoding_buffer == NULL);
 692         tok->decoding_buffer = utf8; /* CAUTION */
 693         return str;
 694 }
 695
 696 #endif /* PGEN */
 697
 698 /* Set up tokenizer for string */
 699
 700 struct tok_state *
 701 PyTokenizer_FromString(const char *str, int exec_input)
 702 {
 703         struct tok_state *tok = tok_new();
 704         if (tok == NULL)
 705                 return NULL;
 706         str = (char *)decode_str(str, exec_input, tok);
 707         if (str == NULL) {
 708                 PyTokenizer_Free(tok);
 709                 return NULL;
 710         }
 711
 712         /* XXX: constify members. */
 713         tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
 714         return tok;
 715 }
 716
 717
 718 /* Set up tokenizer for file */
 719
 720 struct tok_state *
 721 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
 722 {
 723         struct tok_state *tok = tok_new();
 724         if (tok == NULL)
 725                 return NULL;
 726         if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
 727                 PyTokenizer_Free(tok);
 728                 return NULL;
 729         }
 730         tok->cur = tok->inp = tok->buf;
 731         tok->end = tok->buf + BUFSIZ;
 732         tok->fp = fp;
 733         tok->prompt = ps1;
 734         tok->nextprompt = ps2;
 735         return tok;
 736 }
 737
 738
 739 /* Free a tok_state structure */
 740
 741 void
 742 PyTokenizer_Free(struct tok_state *tok)
 743 {
 744         if (tok->encoding != NULL)
 745                 PyMem_FREE(tok->encoding);
 746 #ifndef PGEN
 747         Py_XDECREF(tok->decoding_readline);
 748         Py_XDECREF(tok->decoding_buffer);
 749 #endif
 750         if (tok->fp != NULL && tok->buf != NULL)
 751                 PyMem_FREE(tok->buf);
 752         if (tok->input)
 753                 PyMem_FREE((char *)tok->input);
 754         PyMem_FREE(tok);
 755 }
 756
 757 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 758 static int
 759 tok_stdin_decode(struct tok_state *tok, char **inp)
 760 {
 761         PyObject *enc, *sysstdin, *decoded, *utf8;
 762         const char *encoding;
 763         char *converted;
 764
 765         if (PySys_GetFile((char *)"stdin", NULL) != stdin)
 766                 return 0;
 767         sysstdin = PySys_GetObject("stdin");
 768         if (sysstdin == NULL || !PyFile_Check(sysstdin))
 769                 return 0;
 770
 771         enc = ((PyFileObject *)sysstdin)->f_encoding;
 772         if (enc == NULL || !PyString_Check(enc))
 773                 return 0;
 774         Py_INCREF(enc);
 775
 776         encoding = PyString_AsString(enc);
 777         decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
 778         if (decoded == NULL)
 779                 goto error_clear;
 780
 781         utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
 782         Py_DECREF(decoded);
 783         if (utf8 == NULL)
 784                 goto error_clear;
 785
 786         assert(PyString_Check(utf8));
 787         converted = new_string(PyString_AS_STRING(utf8),
 788                                PyString_GET_SIZE(utf8));
 789         Py_DECREF(utf8);
 790         if (converted == NULL)
 791                 goto error_nomem;
 792
 793         PyMem_FREE(*inp);
 794         *inp = converted;
 795         if (tok->encoding != NULL)
 796                 PyMem_FREE(tok->encoding);
 797         tok->encoding = new_string(encoding, strlen(encoding));
 798         if (tok->encoding == NULL)
 799                 goto error_nomem;
 800
 801         Py_DECREF(enc);
 802         return 0;
 803
 804 error_nomem:
 805         Py_DECREF(enc);
 806         tok->done = E_NOMEM;
 807         return -1;
 808
 809 error_clear:
 810         /* Fallback to iso-8859-1: for backward compatibility */
 811         Py_DECREF(enc);
 812         PyErr_Clear();
 813         return 0;
 814 }
 815 #endif
 816
 817 /* Get next char, updating state; error code goes into tok->done */
 818
 819 static int
 820 tok_nextc(register struct tok_state *tok)
 821 {
 822         for (;;) {
 823                 if (tok->cur != tok->inp) {
 824                         return Py_CHARMASK(*tok->cur++); /* Fast path */
 825                 }
 826                 if (tok->done != E_OK)
 827                         return EOF;
 828                 if (tok->fp == NULL) {
 829                         char *end = strchr(tok->inp, '\n');
 830                         if (end != NULL)
 831                                 end++;
 832                         else {
 833                                 end = strchr(tok->inp, '\0');
 834                                 if (end == tok->inp) {
 835                                         tok->done = E_EOF;
 836                                         return EOF;
 837                                 }
 838                         }
 839                         if (tok->start == NULL)
 840                                 tok->buf = tok->cur;
 841                         tok->line_start = tok->cur;
 842                         tok->lineno++;
 843                         tok->inp = end;
 844                         return Py_CHARMASK(*tok->cur++);
 845                 }
 846                 if (tok->prompt != NULL) {
 847                         char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
 848                         if (tok->nextprompt != NULL)
 849                                 tok->prompt = tok->nextprompt;
 850                         if (newtok == NULL)
 851                                 tok->done = E_INTR;
 852                         else if (*newtok == '\0') {
 853                                 PyMem_FREE(newtok);
 854                                 tok->done = E_EOF;
 855                         }
 856 #if !defined(PGEN) && defined(Py_USING_UNICODE)
 857                         else if (tok_stdin_decode(tok, &newtok) != 0)
 858                                 PyMem_FREE(newtok);
 859 #endif
 860                         else if (tok->start != NULL) {
 861                                 size_t start = tok->start - tok->buf;
 862                                 size_t oldlen = tok->cur - tok->buf;
 863                                 size_t newlen = oldlen + strlen(newtok);
 864                                 char *buf = tok->buf;
 865                                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
 866                                 tok->lineno++;
 867                                 if (buf == NULL) {
 868                                         PyMem_FREE(tok->buf);
 869                                         tok->buf = NULL;
 870                                         PyMem_FREE(newtok);
 871                                         tok->done = E_NOMEM;
 872                                         return EOF;
 873                                 }
 874                                 tok->buf = buf;
 875                                 tok->cur = tok->buf + oldlen;
 876                                 tok->line_start = tok->cur;
 877                                 strcpy(tok->buf + oldlen, newtok);
 878                                 PyMem_FREE(newtok);
 879                                 tok->inp = tok->buf + newlen;
 880                                 tok->end = tok->inp + 1;
 881                                 tok->start = tok->buf + start;
 882                         }
 883                         else {
 884                                 tok->lineno++;
 885                                 if (tok->buf != NULL)
 886                                         PyMem_FREE(tok->buf);
 887                                 tok->buf = newtok;
 888                                 tok->line_start = tok->buf;
 889                                 tok->cur = tok->buf;
 890                                 tok->line_start = tok->buf;
 891                                 tok->inp = strchr(tok->buf, '\0');
 892                                 tok->end = tok->inp + 1;
 893                         }
 894                 }
 895                 else {
 896                         int done = 0;
 897                         Py_ssize_t cur = 0;
 898                         char *pt;
 899                         if (tok->start == NULL) {
 900                                 if (tok->buf == NULL) {
 901                                         tok->buf = (char *)
 902                                                 PyMem_MALLOC(BUFSIZ);
 903                                         if (tok->buf == NULL) {
 904                                                 tok->done = E_NOMEM;
 905                                                 return EOF;
 906                                         }
 907                                         tok->end = tok->buf + BUFSIZ;
 908                                 }
 909                                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
 910                                           tok) == NULL) {
 911                                         tok->done = E_EOF;
 912                                         done = 1;
 913                                 }
 914                                 else {
 915                                         tok->done = E_OK;
 916                                         tok->inp = strchr(tok->buf, '\0');
 917                                         done = tok->inp[-1] == '\n';
 918                                 }
 919                         }
 920                         else {
 921                                 cur = tok->cur - tok->buf;
 922                                 if (decoding_feof(tok)) {
 923                                         tok->done = E_EOF;
 924                                         done = 1;
 925                                 }
 926                                 else
 927                                         tok->done = E_OK;
 928                         }
 929                         tok->lineno++;
 930                         /* Read until '\n' or EOF */
 931                         while (!done) {
 932                                 Py_ssize_t curstart = tok->start == NULL ? -1 :
 933                                                   tok->start - tok->buf;
 934                                 Py_ssize_t curvalid = tok->inp - tok->buf;
 935                                 Py_ssize_t newsize = curvalid + BUFSIZ;
 936                                 char *newbuf = tok->buf;
 937                                 newbuf = (char *)PyMem_REALLOC(newbuf,
 938                                                                newsize);
 939                                 if (newbuf == NULL) {
 940                                         tok->done = E_NOMEM;
 941                                         tok->cur = tok->inp;
 942                                         return EOF;
 943                                 }
 944                                 tok->buf = newbuf;
 945                                 tok->inp = tok->buf + curvalid;
 946                                 tok->end = tok->buf + newsize;
 947                                 tok->start = curstart < 0 ? NULL :
 948                                              tok->buf + curstart;
 949                                 if (decoding_fgets(tok->inp,
 950                                                (int)(tok->end - tok->inp),
 951                                                tok) == NULL) {
 952                                         /* Break out early on decoding
 953                                            errors, as tok->buf will be NULL
 954                                          */
 955                                         if (tok->decoding_erred)
 956                                                 return EOF;
 957                                         /* Last line does not end in \n,
 958                                            fake one */
 959                                         strcpy(tok->inp, "\n");
 960                                 }
 961                                 tok->inp = strchr(tok->inp, '\0');
 962                                 done = tok->inp[-1] == '\n';
 963                         }
 964                         if (tok->buf != NULL) {
 965                                 tok->cur = tok->buf + cur;
 966                                 tok->line_start = tok->cur;
 967                                 /* replace "\r\n" with "\n" */
 968                                 /* For Mac leave the \r, giving a syntax error */
 969                                 pt = tok->inp - 2;
 970                                 if (pt >= tok->buf && *pt == '\r') {
 971                                         *pt++ = '\n';
 972                                         *pt = '\0';
 973                                         tok->inp = pt;
 974                                 }
 975                         }
 976                 }
 977                 if (tok->done != E_OK) {
 978                         if (tok->prompt != NULL)
 979                                 PySys_WriteStderr("\n");
 980                         tok->cur = tok->inp;
 981                         return EOF;
 982                 }
 983         }
 984         /*NOTREACHED*/
 985 }
 986
 987
 988 /* Back-up one character */
 989
 990 static void
 991 tok_backup(register struct tok_state *tok, register int c)
 992 {
 993         if (c != EOF) {
 994                 if (--tok->cur < tok->buf)
 995                         Py_FatalError("tok_backup: beginning of buffer");
 996                 if (*tok->cur != c)
 997                         *tok->cur = c;
 998         }
 999 }
1000
1001
1002 /* Return the token corresponding to a single character */
1003
1004 int
1005 PyToken_OneChar(int c)
1006 {
1007         switch (c) {
1008         case '(':       return LPAR;
1009         case ')':       return RPAR;
1010         case '[':       return LSQB;
1011         case ']':       return RSQB;
1012         case ':':       return COLON;
1013         case ',':       return COMMA;
1014         case ';':       return SEMI;
1015         case '+':       return PLUS;
1016         case '-':       return MINUS;
1017         case '*':       return STAR;
1018         case '/':       return SLASH;
1019         case '|':       return VBAR;
1020         case '&':       return AMPER;
1021         case '<':       return LESS;
1022         case '>':       return GREATER;
1023         case '=':       return EQUAL;
1024         case '.':       return DOT;
1025         case '%':       return PERCENT;
1026         case '`':       return BACKQUOTE;
1027         case '{':       return LBRACE;
1028         case '}':       return RBRACE;
1029         case '^':       return CIRCUMFLEX;
1030         case '~':       return TILDE;
1031         case '@':       return AT;
1032         default:        return OP;
1033         }
1034 }
1035
1036
1037 int
1038 PyToken_TwoChars(int c1, int c2)
1039 {
1040         switch (c1) {
1041         case '=':
1042                 switch (c2) {
1043                 case '=':       return EQEQUAL;
1044                 }
1045                 break;
1046         case '!':
1047                 switch (c2) {
1048                 case '=':       return NOTEQUAL;
1049                 }
1050                 break;
1051         case '<':
1052                 switch (c2) {
1053                 case '>':       return NOTEQUAL;
1054                 case '=':       return LESSEQUAL;
1055                 case '<':       return LEFTSHIFT;
1056                 }
1057                 break;
1058         case '>':
1059                 switch (c2) {
1060                 case '=':       return GREATEREQUAL;
1061                 case '>':       return RIGHTSHIFT;
1062                 }
1063                 break;
1064         case '+':
1065                 switch (c2) {
1066                 case '=':       return PLUSEQUAL;
1067                 }
1068                 break;
1069         case '-':
1070                 switch (c2) {
1071                 case '=':       return MINEQUAL;
1072                 }
1073                 break;
1074         case '*':
1075                 switch (c2) {
1076                 case '*':       return DOUBLESTAR;
1077                 case '=':       return STAREQUAL;
1078                 }
1079                 break;
1080         case '/':
1081                 switch (c2) {
1082                 case '/':       return DOUBLESLASH;
1083                 case '=':       return SLASHEQUAL;
1084                 }
1085                 break;
1086         case '|':
1087                 switch (c2) {
1088                 case '=':       return VBAREQUAL;
1089                 }
1090                 break;
1091         case '%':
1092                 switch (c2) {
1093                 case '=':       return PERCENTEQUAL;
1094                 }
1095                 break;
1096         case '&':
1097                 switch (c2) {
1098                 case '=':       return AMPEREQUAL;
1099                 }
1100                 break;
1101         case '^':
1102                 switch (c2) {
1103                 case '=':       return CIRCUMFLEXEQUAL;
1104                 }
1105                 break;
1106         }
1107         return OP;
1108 }
1109
1110 int
1111 PyToken_ThreeChars(int c1, int c2, int c3)
1112 {
1113         switch (c1) {
1114         case '<':
1115                 switch (c2) {
1116                 case '<':
1117                         switch (c3) {
1118                         case '=':
1119                                 return LEFTSHIFTEQUAL;
1120                         }
1121                         break;
1122                 }
1123                 break;
1124         case '>':
1125                 switch (c2) {
1126                 case '>':
1127                         switch (c3) {
1128                         case '=':
1129                                 return RIGHTSHIFTEQUAL;
1130                         }
1131                         break;
1132                 }
1133                 break;
1134         case '*':
1135                 switch (c2) {
1136                 case '*':
1137                         switch (c3) {
1138                         case '=':
1139                                 return DOUBLESTAREQUAL;
1140                         }
1141                         break;
1142                 }
1143                 break;
1144         case '/':
1145                 switch (c2) {
1146                 case '/':
1147                         switch (c3) {
1148                         case '=':
1149                                 return DOUBLESLASHEQUAL;
1150                         }
1151                         break;
1152                 }
1153                 break;
1154         }
1155         return OP;
1156 }
1157
1158 static int
1159 indenterror(struct tok_state *tok)
1160 {
1161         if (tok->alterror) {
1162                 tok->done = E_TABSPACE;
1163                 tok->cur = tok->inp;
1164                 return 1;
1165         }
1166         if (tok->altwarning) {
1167                 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1168                                   "in indentation\n", tok->filename);
1169                 tok->altwarning = 0;
1170         }
1171         return 0;
1172 }
1173
1174
1175 /* Get next token, after space stripping etc. */
1176
1177 static int
1178 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1179 {
1180         register int c;
1181         int blankline;
1182
1183         *p_start = *p_end = NULL;
1184   nextline:
1185         tok->start = NULL;
1186         blankline = 0;
1187
1188         /* Get indentation level */
1189         if (tok->atbol) {
1190                 register int col = 0;
1191                 register int altcol = 0;
1192                 tok->atbol = 0;
1193                 for (;;) {
1194                         c = tok_nextc(tok);
1195                         if (c == ' ')
1196                                 col++, altcol++;
1197                         else if (c == '\t') {
1198                                 col = (col/tok->tabsize + 1) * tok->tabsize;
1199                                 altcol = (altcol/tok->alttabsize + 1)
1200                                         * tok->alttabsize;
1201                         }
1202                         else if (c == '\014') /* Control-L (formfeed) */
1203                                 col = altcol = 0; /* For Emacs users */
1204                         else
1205                                 break;
1206                 }
1207                 tok_backup(tok, c);
1208                 if (c == '#' || c == '\n') {
1209                         /* Lines with only whitespace and/or comments
1210                            shouldn't affect the indentation and are
1211                            not passed to the parser as NEWLINE tokens,
1212                            except *totally* empty lines in interactive
1213                            mode, which signal the end of a command group. */
1214                         if (col == 0 && c == '\n' && tok->prompt != NULL)
1215                                 blankline = 0; /* Let it through */
1216                         else
1217                                 blankline = 1; /* Ignore completely */
1218                         /* We can't jump back right here since we still
1219                            may need to skip to the end of a comment */
1220                 }
1221                 if (!blankline && tok->level == 0) {
1222                         if (col == tok->indstack[tok->indent]) {
1223                                 /* No change */
1224                                 if (altcol != tok->altindstack[tok->indent]) {
1225                                         if (indenterror(tok))
1226                                                 return ERRORTOKEN;
1227                                 }
1228                         }
1229                         else if (col > tok->indstack[tok->indent]) {
1230                                 /* Indent -- always one */
1231                                 if (tok->indent+1 >= MAXINDENT) {
1232                                         tok->done = E_TOODEEP;
1233                                         tok->cur = tok->inp;
1234                                         return ERRORTOKEN;
1235                                 }
1236                                 if (altcol <= tok->altindstack[tok->indent]) {
1237                                         if (indenterror(tok))
1238                                                 return ERRORTOKEN;
1239                                 }
1240                                 tok->pendin++;
1241                                 tok->indstack[++tok->indent] = col;
1242                                 tok->altindstack[tok->indent] = altcol;
1243                         }
1244                         else /* col < tok->indstack[tok->indent] */ {
1245                                 /* Dedent -- any number, must be consistent */
1246                                 while (tok->indent > 0 &&
1247                                         col < tok->indstack[tok->indent]) {
1248                                         tok->pendin--;
1249                                         tok->indent--;
1250                                 }
1251                                 if (col != tok->indstack[tok->indent]) {
1252                                         tok->done = E_DEDENT;
1253                                         tok->cur = tok->inp;
1254                                         return ERRORTOKEN;
1255                                 }
1256                                 if (altcol != tok->altindstack[tok->indent]) {
1257                                         if (indenterror(tok))
1258                                                 return ERRORTOKEN;
1259                                 }
1260                         }
1261                 }
1262         }
1263
1264         tok->start = tok->cur;
1265
1266         /* Return pending indents/dedents */
1267         if (tok->pendin != 0) {
1268                 if (tok->pendin < 0) {
1269                         tok->pendin++;
1270                         return DEDENT;
1271                 }
1272                 else {
1273                         tok->pendin--;
1274                         return INDENT;
1275                 }
1276         }
1277
1278  again:
1279         tok->start = NULL;
1280         /* Skip spaces */
1281         do {
1282                 c = tok_nextc(tok);
1283         } while (c == ' ' || c == '\t' || c == '\014');
1284
1285         /* Set start of current token */
1286         tok->start = tok->cur - 1;
1287
1288         /* Skip comment, while looking for tab-setting magic */
1289         if (c == '#') {
1290                 static char *tabforms[] = {
1291                         "tab-width:",           /* Emacs */
1292                         ":tabstop=",            /* vim, full form */
1293                         ":ts=",                 /* vim, abbreviated form */
1294                         "set tabsize=",         /* will vi never die? */
1295                 /* more templates can be added here to support other editors */
1296                 };
1297                 char cbuf[80];
1298                 char *tp, **cp;
1299                 tp = cbuf;
1300                 do {
1301                         *tp++ = c = tok_nextc(tok);
1302                 } while (c != EOF && c != '\n' &&
1303                          (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1304                 *tp = '\0';
1305                 for (cp = tabforms;
1306                      cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1307                      cp++) {
1308                         if ((tp = strstr(cbuf, *cp))) {
1309                                 int newsize = atoi(tp + strlen(*cp));
1310
1311                                 if (newsize >= 1 && newsize <= 40) {
1312                                         tok->tabsize = newsize;
1313                                         if (Py_VerboseFlag)
1314                                             PySys_WriteStderr(
1315                                                 "Tab size set to %d\n",
1316                                                 newsize);
1317                                 }
1318                         }
1319                 }
1320                 while (c != EOF && c != '\n')
1321                         c = tok_nextc(tok);
1322         }
1323
1324         /* Check for EOF and errors now */
1325         if (c == EOF) {
1326                 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1327         }
1328
1329         /* Identifier (most frequent token!) */
1330         if (isalpha(c) || c == '_') {
1331                 /* Process r"", u"" and ur"" */
1332                 switch (c) {
1333                 case 'b':
1334                 case 'B':
1335                         c = tok_nextc(tok);
1336                         if (c == 'r' || c == 'R')
1337                                 c = tok_nextc(tok);
1338                         if (c == '"' || c == '\'')
1339                                 goto letter_quote;
1340                         break;
1341                 case 'r':
1342                 case 'R':
1343                         c = tok_nextc(tok);
1344                         if (c == '"' || c == '\'')
1345                                 goto letter_quote;
1346                         break;
1347                 case 'u':
1348                 case 'U':
1349                         c = tok_nextc(tok);
1350                         if (c == 'r' || c == 'R')
1351                                 c = tok_nextc(tok);
1352                         if (c == '"' || c == '\'')
1353                                 goto letter_quote;
1354                         break;
1355                 }
1356                 while (isalnum(c) || c == '_') {
1357                         c = tok_nextc(tok);
1358                 }
1359                 tok_backup(tok, c);
1360                 *p_start = tok->start;
1361                 *p_end = tok->cur;
1362                 return NAME;
1363         }
1364
1365         /* Newline */
1366         if (c == '\n') {
1367                 tok->atbol = 1;
1368                 if (blankline || tok->level > 0)
1369                         goto nextline;
1370                 *p_start = tok->start;
1371                 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1372                 tok->cont_line = 0;
1373                 return NEWLINE;
1374         }
1375
1376         /* Period or number starting with period? */
1377         if (c == '.') {
1378                 c = tok_nextc(tok);
1379                 if (isdigit(c)) {
1380                         goto fraction;
1381                 }
1382                 else {
1383                         tok_backup(tok, c);
1384                         *p_start = tok->start;
1385                         *p_end = tok->cur;
1386                         return DOT;
1387                 }
1388         }
1389
1390         /* Number */
1391         if (isdigit(c)) {
1392                 if (c == '0') {
1393                         /* Hex, octal or binary -- maybe. */
1394                         c = tok_nextc(tok);
1395                         if (c == '.')
1396                                 goto fraction;
1397 #ifndef WITHOUT_COMPLEX
1398                         if (c == 'j' || c == 'J')
1399                                 goto imaginary;
1400 #endif
1401                         if (c == 'x' || c == 'X') {
1402
1403                                 /* Hex */
1404                                 c = tok_nextc(tok);
1405                                 if (!isxdigit(c)) {
1406                                         tok->done = E_TOKEN;
1407                                         tok_backup(tok, c);
1408                                         return ERRORTOKEN;
1409                                 }
1410                                 do {
1411                                         c = tok_nextc(tok);
1412                                 } while (isxdigit(c));
1413                         }
1414                         else if (c == 'o' || c == 'O') {
1415                                 /* Octal */
1416                                 c = tok_nextc(tok);
1417                                 if (c < '0' || c >= '8') {
1418                                         tok->done = E_TOKEN;
1419                                         tok_backup(tok, c);
1420                                         return ERRORTOKEN;
1421                                 }
1422                                 do {
1423                                         c = tok_nextc(tok);
1424                                 } while ('0' <= c && c < '8');
1425                         }
1426                         else if (c == 'b' || c == 'B') {
1427                                 /* Binary */
1428                                 c = tok_nextc(tok);
1429                                 if (c != '0' && c != '1') {
1430                                         tok->done = E_TOKEN;
1431                                         tok_backup(tok, c);
1432                                         return ERRORTOKEN;
1433                                 }
1434                                 do {
1435                                         c = tok_nextc(tok);
1436                                 } while (c == '0' || c == '1');
1437                         }
1438                         else {
1439                                 int found_decimal = 0;
1440                                 /* Octal; c is first char of it */
1441                                 /* There's no 'isoctdigit' macro, sigh */
1442                                 while ('0' <= c && c < '8') {
1443                                         c = tok_nextc(tok);
1444                                 }
1445                                 if (isdigit(c)) {
1446                                         found_decimal = 1;
1447                                         do {
1448                                                 c = tok_nextc(tok);
1449                                         } while (isdigit(c));
1450                                 }
1451                                 if (c == '.')
1452                                         goto fraction;
1453                                 else if (c == 'e' || c == 'E')
1454                                         goto exponent;
1455 #ifndef WITHOUT_COMPLEX
1456                                 else if (c == 'j' || c == 'J')
1457                                         goto imaginary;
1458 #endif
1459                                 else if (found_decimal) {
1460                                         tok->done = E_TOKEN;
1461                                         tok_backup(tok, c);
1462                                         return ERRORTOKEN;
1463                                 }
1464                         }
1465                         if (c == 'l' || c == 'L')
1466                                 c = tok_nextc(tok);
1467                 }
1468                 else {
1469                         /* Decimal */
1470                         do {
1471                                 c = tok_nextc(tok);
1472                         } while (isdigit(c));
1473                         if (c == 'l' || c == 'L')
1474                                 c = tok_nextc(tok);
1475                         else {
1476                                 /* Accept floating point numbers. */
1477                                 if (c == '.') {
1478                 fraction:
1479                                         /* Fraction */
1480                                         do {
1481                                                 c = tok_nextc(tok);
1482                                         } while (isdigit(c));
1483                                 }
1484                                 if (c == 'e' || c == 'E') {
1485                 exponent:
1486                                         /* Exponent part */
1487                                         c = tok_nextc(tok);
1488                                         if (c == '+' || c == '-')
1489                                                 c = tok_nextc(tok);
1490                                         if (!isdigit(c)) {
1491                                                 tok->done = E_TOKEN;
1492                                                 tok_backup(tok, c);
1493                                                 return ERRORTOKEN;
1494                                         }
1495                                         do {
1496                                                 c = tok_nextc(tok);
1497                                         } while (isdigit(c));
1498                                 }
1499 #ifndef WITHOUT_COMPLEX
1500                                 if (c == 'j' || c == 'J')
1501                                         /* Imaginary part */
1502                 imaginary:
1503                                         c = tok_nextc(tok);
1504 #endif
1505                         }
1506                 }
1507                 tok_backup(tok, c);
1508                 *p_start = tok->start;
1509                 *p_end = tok->cur;
1510                 return NUMBER;
1511         }
1512
1513   letter_quote:
1514         /* String */
1515         if (c == '\'' || c == '"') {
1516                 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1517                 int quote = c;
1518                 int triple = 0;
1519                 int tripcount = 0;
1520                 for (;;) {
1521                         c = tok_nextc(tok);
1522                         if (c == '\n') {
1523                                 if (!triple) {
1524                                         tok->done = E_EOLS;
1525                                         tok_backup(tok, c);
1526                                         return ERRORTOKEN;
1527                                 }
1528                                 tripcount = 0;
1529                                 tok->cont_line = 1; /* multiline string. */
1530                         }
1531                         else if (c == EOF) {
1532                                 if (triple)
1533                                         tok->done = E_EOFS;
1534                                 else
1535                                         tok->done = E_EOLS;
1536                                 tok->cur = tok->inp;
1537                                 return ERRORTOKEN;
1538                         }
1539                         else if (c == quote) {
1540                                 tripcount++;
1541                                 if (tok->cur - tok->start == quote2) {
1542                                         c = tok_nextc(tok);
1543                                         if (c == quote) {
1544                                                 triple = 1;
1545                                                 tripcount = 0;
1546                                                 continue;
1547                                         }
1548                                         tok_backup(tok, c);
1549                                 }
1550                                 if (!triple || tripcount == 3)
1551                                         break;
1552                         }
1553                         else if (c == '\\') {
1554                                 tripcount = 0;
1555                                 c = tok_nextc(tok);
1556                                 if (c == EOF) {
1557                                         tok->done = E_EOLS;
1558                                         tok->cur = tok->inp;
1559                                         return ERRORTOKEN;
1560                                 }
1561                         }
1562                         else
1563                                 tripcount = 0;
1564                 }
1565                 *p_start = tok->start;
1566                 *p_end = tok->cur;
1567                 return STRING;
1568         }
1569
1570         /* Line continuation */
1571         if (c == '\\') {
1572                 c = tok_nextc(tok);
1573                 if (c != '\n') {
1574                         tok->done = E_LINECONT;
1575                         tok->cur = tok->inp;
1576                         return ERRORTOKEN;
1577                 }
1578                 tok->cont_line = 1;
1579                 goto again; /* Read next line */
1580         }
1581
1582         /* Check for two-character token */
1583         {
1584                 int c2 = tok_nextc(tok);
1585                 int token = PyToken_TwoChars(c, c2);
1586 #ifndef PGEN
1587                 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1588                         if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1589                                                "<> not supported in 3.x; use !=",
1590                                                tok->filename, tok->lineno,
1591                                                NULL, NULL)) {
1592                                 return ERRORTOKEN;
1593                         }
1594                 }
1595 #endif
1596                 if (token != OP) {
1597                         int c3 = tok_nextc(tok);
1598                         int token3 = PyToken_ThreeChars(c, c2, c3);
1599                         if (token3 != OP) {
1600                                 token = token3;
1601                         } else {
1602                                 tok_backup(tok, c3);
1603                         }
1604                         *p_start = tok->start;
1605                         *p_end = tok->cur;
1606                         return token;
1607                 }
1608                 tok_backup(tok, c2);
1609         }
1610
1611         /* Keep track of parentheses nesting level */
1612         switch (c) {
1613         case '(':
1614         case '[':
1615         case '{':
1616                 tok->level++;
1617                 break;
1618         case ')':
1619         case ']':
1620         case '}':
1621                 tok->level--;
1622                 break;
1623         }
1624
1625         /* Punctuation character */
1626         *p_start = tok->start;
1627         *p_end = tok->cur;
1628         return PyToken_OneChar(c);
1629 }
1630
1631 int
1632 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1633 {
1634         int result = tok_get(tok, p_start, p_end);
1635         if (tok->decoding_erred) {
1636                 result = ERRORTOKEN;
1637                 tok->done = E_DECODE;
1638         }
1639         return result;
1640 }
1641
1642 /* This function is only called from parsetok. However, it cannot live
1643    there, as it must be empty for PGEN, and we can check for PGEN only
1644    in this file. */
1645
1646 #if defined(PGEN) || !defined(Py_USING_UNICODE)
/* Stub compiled when PGEN is defined or Py_USING_UNICODE is not: there
   is no encoding conversion to undo, so every argument is ignored and
   NULL is always returned. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
        (void)tok;
        (void)len;
        (void)offset;
        return NULL;
}
1652 #else
1653 #ifdef Py_USING_UNICODE
1654 static PyObject *
1655 dec_utf8(const char *enc, const char *text, size_t len) {
1656         PyObject *ret = NULL;
1657         PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1658         if (unicode_text) {
1659                 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1660                 Py_DECREF(unicode_text);
1661         }
1662         if (!ret) {
1663                 PyErr_Clear();
1664         }
1665         return ret;
1666 }
1667 char *
1668 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1669 {
1670         char *text = NULL;
1671         if (tok->encoding) {
1672                 /* convert source to original encondig */
1673                 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1674                 if (lineobj != NULL) {
1675                         int linelen = PyString_Size(lineobj);
1676                         const char *line = PyString_AsString(lineobj);
1677                         text = PyObject_MALLOC(linelen + 1);
1678                         if (text != NULL && line != NULL) {
1679                                 if (linelen)
1680                                         strncpy(text, line, linelen);
1681                                 text[linelen] = '\0';
1682                         }
1683                         Py_DECREF(lineobj);
1684
1685                         /* adjust error offset */
1686                         if (*offset > 1) {
1687                                 PyObject *offsetobj = dec_utf8(tok->encoding,
1688                                                                tok->buf, *offset-1);
1689                                 if (offsetobj) {
1690                                         *offset = PyString_Size(offsetobj) + 1;
1691                                         Py_DECREF(offsetobj);
1692                                 }
1693                         }
1694
1695                 }
1696         }
1697         return text;
1698
1699 }
1700 #endif /* defined(Py_USING_UNICODE) */
1701 #endif
1702
1703
1704 #ifdef Py_DEBUG
1705
1706 void
1707 tok_dump(int type, char *start, char *end)
1708 {
1709         printf("%s", _PyParser_TokenNames[type]);
1710         if (type == NAME || type == NUMBER || type == STRING || type == OP)
1711                 printf("(%.*s)", (int)(end - start), start);
1712 }
1713
1714 #endif