/* Parser/tokenizer.c — Tokenizer implementation.
   Extracted from python.git (blob 1d0a4aa3f23b438dc5c17185bbd973130582f436);
   commit context: Issue #4677: add two list comprehension tests to pybench. */
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
35 /* Token names */
/* Human-readable token names, indexed by token number.
   The order of entries here is significant: it must match the
   #define values in token.h exactly. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "BACKQUOTE",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
96 /* Create and initialize a new tok_state structure */
98 static struct tok_state *
99 tok_new(void)
101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
103 if (tok == NULL)
104 return NULL;
105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
126 #ifndef PGEN
127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
129 #endif
130 return tok;
133 #ifdef PGEN
135 static char *
136 decoding_fgets(char *s, int size, struct tok_state *tok)
138 return fgets(s, size, tok->fp);
141 static int
142 decoding_feof(struct tok_state *tok)
144 return feof(tok->fp);
147 static const char *
148 decode_str(const char *str, struct tok_state *tok)
150 return str;
153 #else /* PGEN */
155 static char *
156 error_ret(struct tok_state *tok) /* XXX */
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
160 PyMem_FREE(tok->buf);
161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
165 static char *
166 new_string(const char *s, Py_ssize_t len)
168 char* result = (char *)PyMem_MALLOC(len + 1);
169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
173 return result;
/* Normalize the spelling of an encoding name for the two encodings
   the tokenizer handles natively: return "utf-8" or "iso-8859-1" for
   any of their common aliases, and S itself for anything else.
   Only the first 12 characters of S are considered, compared
   case-insensitively with '_' treated as '-'. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* Cast to unsigned char before tolower(): if plain char is
               signed, a non-ASCII byte yields a negative value, and
               passing a negative value other than EOF to tolower() is
               undefined behavior (CERT STR37-C). */
            buf[i] = tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
199 /* Return the coding spec in S, or NULL if none is found. */
201 static char *
202 get_coding_spec(const char *s, Py_ssize_t size)
204 Py_ssize_t i;
205 /* Coding spec must be in a comment, and that comment must be
206 * the only statement on the source code line. */
207 for (i = 0; i < size - 6; i++) {
208 if (s[i] == '#')
209 break;
210 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211 return NULL;
213 for (; i < size - 6; i++) { /* XXX inefficient search */
214 const char* t = s + i;
215 if (strncmp(t, "coding", 6) == 0) {
216 const char* begin = NULL;
217 t += 6;
218 if (t[0] != ':' && t[0] != '=')
219 continue;
220 do {
221 t++;
222 } while (t[0] == '\x20' || t[0] == '\t');
224 begin = t;
225 while (isalnum(Py_CHARMASK(t[0])) ||
226 t[0] == '-' || t[0] == '_' || t[0] == '.')
227 t++;
229 if (begin < t) {
230 char* r = new_string(begin, t - begin);
231 char* q = get_normal_name(r);
232 if (r != q) {
233 PyMem_FREE(r);
234 r = new_string(q, strlen(q));
236 return r;
240 return NULL;
243 /* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
248 static int
249 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
250 int set_readline(struct tok_state *, const char *))
252 char * cs;
253 int r = 1;
255 if (tok->cont_line)
256 /* It's a continuation line, so it can't be a coding spec. */
257 return 1;
258 cs = get_coding_spec(line, size);
259 if (cs != NULL) {
260 tok->read_coding_spec = 1;
261 if (tok->encoding == NULL) {
262 assert(tok->decoding_state == 1); /* raw */
263 if (strcmp(cs, "utf-8") == 0 ||
264 strcmp(cs, "iso-8859-1") == 0) {
265 tok->encoding = cs;
266 } else {
267 #ifdef Py_USING_UNICODE
268 r = set_readline(tok, cs);
269 if (r) {
270 tok->encoding = cs;
271 tok->decoding_state = -1;
273 else
274 PyMem_FREE(cs);
275 #else
276 /* Without Unicode support, we cannot
277 process the coding spec. Since there
278 won't be any Unicode literals, that
279 won't matter. */
280 PyMem_FREE(cs);
281 #endif
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
285 PyMem_FREE(cs);
288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
294 return r;
297 /* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
301 static int
302 check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
307 int ch = get_char(tok);
308 tok->decoding_state = 1;
309 if (ch == EOF) {
310 return 1;
311 } else if (ch == 0xEF) {
312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314 #if 0
315 /* Disable support for UTF-16 BOMs until a decision
316 is made whether this needs to be supported. */
317 } else if (ch == 0xFE) {
318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319 if (!set_readline(tok, "utf-16-be")) return 0;
320 tok->decoding_state = -1;
321 } else if (ch == 0xFF) {
322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323 if (!set_readline(tok, "utf-16-le")) return 0;
324 tok->decoding_state = -1;
325 #endif
326 } else {
327 unget_char(ch, tok);
328 return 1;
330 if (tok->encoding != NULL)
331 PyMem_FREE(tok->encoding);
332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333 return 1;
334 NON_BOM:
335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337 return 1;
340 /* Read a line of text from TOK into S, using the stream in TOK.
341 Return NULL on failure, else S.
343 On entry, tok->decoding_buffer will be one of:
344 1) NULL: need to call tok->decoding_readline to get a new line
345 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
346 stored the result in tok->decoding_buffer
347 3) PyStringObject *: previous call to fp_readl did not have enough room
348 (in the s buffer) to copy entire contents of the line read
349 by tok->decoding_readline. tok->decoding_buffer has the overflow.
350 In this case, fp_readl is called in a loop (with an expanded buffer)
351 until the buffer ends with a '\n' (or until the end of the file is
352 reached): see tok_nextc and its calls to decoding_fgets.
/* Read one decoded line into S (at most SIZE-1 bytes plus a NUL),
   driving tok->decoding_readline and spilling any overflow into
   tok->decoding_buffer for the next call.  Returns S, or NULL on
   EOF/error (see the state description in the comment above). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode built, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *utf8 = NULL;
    PyObject *buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        /* Nothing pending: pull a new line from the codec. */
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        /* Consume the pending buffer; a str means it is already
           UTF-8 overflow from a previous call. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* Too long for the caller's buffer: stash the tail. */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL;            /* EOF */
    return s;
#endif
}
405 /* Set the readline function for TOK to a StreamReader's
406 readline function. The StreamReader is named ENC.
408 This function is called from check_bom and check_coding_spec.
410 ENC is usually identical to the future value of tok->encoding,
411 except for the (currently unsupported) case of UTF-16.
413 Return 1 on success, 0 on failure. */
415 static int
416 fp_setreadl(struct tok_state *tok, const char* enc)
418 PyObject *reader, *stream, *readline;
420 /* XXX: constify filename argument. */
421 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
422 if (stream == NULL)
423 return 0;
425 reader = PyCodec_StreamReader(enc, stream, NULL);
426 Py_DECREF(stream);
427 if (reader == NULL)
428 return 0;
430 readline = PyObject_GetAttrString(reader, "readline");
431 Py_DECREF(reader);
432 if (readline == NULL)
433 return 0;
435 tok->decoding_readline = readline;
436 return 1;
439 /* Fetch the next byte from TOK. */
441 static int fp_getc(struct tok_state *tok) {
442 return getc(tok->fp);
445 /* Unfetch the last byte back into TOK. */
447 static void fp_ungetc(int c, struct tok_state *tok) {
448 ungetc(c, tok->fp);
451 /* Read a line of input from TOK. Determine encoding
452 if necessary. */
454 static char *
455 decoding_fgets(char *s, int size, struct tok_state *tok)
457 char *line = NULL;
458 int badchar = 0;
459 for (;;) {
460 if (tok->decoding_state < 0) {
461 /* We already have a codec associated with
462 this input. */
463 line = fp_readl(s, size, tok);
464 break;
465 } else if (tok->decoding_state > 0) {
466 /* We want a 'raw' read. */
467 line = Py_UniversalNewlineFgets(s, size,
468 tok->fp, NULL);
469 break;
470 } else {
471 /* We have not yet determined the encoding.
472 If an encoding is found, use the file-pointer
473 reader functions from now on. */
474 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
475 return error_ret(tok);
476 assert(tok->decoding_state != 0);
479 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
480 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
481 return error_ret(tok);
484 #ifndef PGEN
485 /* The default encoding is ASCII, so make sure we don't have any
486 non-ASCII bytes in it. */
487 if (line && !tok->encoding) {
488 unsigned char *c;
489 for (c = (unsigned char *)line; *c; c++)
490 if (*c > 127) {
491 badchar = *c;
492 break;
495 if (badchar) {
496 char buf[500];
497 /* Need to add 1 to the line number, since this line
498 has not been counted, yet. */
499 sprintf(buf,
500 "Non-ASCII character '\\x%.2x' "
501 "in file %.200s on line %i, "
502 "but no encoding declared; "
503 "see http://www.python.org/peps/pep-0263.html for details",
504 badchar, tok->filename, tok->lineno + 1);
505 PyErr_SetString(PyExc_SyntaxError, buf);
506 return error_ret(tok);
508 #endif
509 return line;
512 static int
513 decoding_feof(struct tok_state *tok)
515 if (tok->decoding_state >= 0) {
516 return feof(tok->fp);
517 } else {
518 PyObject* buf = tok->decoding_buffer;
519 if (buf == NULL) {
520 buf = PyObject_CallObject(tok->decoding_readline, NULL);
521 if (buf == NULL) {
522 error_ret(tok);
523 return 1;
524 } else {
525 tok->decoding_buffer = buf;
528 return PyObject_Length(buf) == 0;
532 /* Fetch a byte from TOK, using the string buffer. */
534 static int
535 buf_getc(struct tok_state *tok) {
536 return Py_CHARMASK(*tok->str++);
539 /* Unfetch a byte from TOK, using the string buffer. */
541 static void
542 buf_ungetc(int c, struct tok_state *tok) {
543 tok->str--;
544 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
547 /* Set the readline function for TOK to ENC. For the string-based
548 tokenizer, this means to just record the encoding. */
550 static int
551 buf_setreadl(struct tok_state *tok, const char* enc) {
552 tok->enc = enc;
553 return 1;
556 /* Return a UTF-8 encoding Python string object from the
557 C byte string STR, which is encoded with ENC. */
#ifdef Py_USING_UNICODE
/* Return a UTF-8 encoded Python string object built from the C byte
   string STR, which is encoded with ENC.  NULL on decode failure. */
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *utf8;
    PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return utf8;
}
#endif
572 /* Decode a byte string STR for use as the buffer of TOK.
573 Look for encoding declarations inside STR, and record them
574 inside TOK. */
576 static const char *
577 decode_str(const char *str, struct tok_state *tok)
579 PyObject* utf8 = NULL;
580 const char *s;
581 const char *newl[2] = {NULL, NULL};
582 int lineno = 0;
583 tok->enc = NULL;
584 tok->str = str;
585 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
586 return error_ret(tok);
587 str = tok->str; /* string after BOM if any */
588 assert(str);
589 #ifdef Py_USING_UNICODE
590 if (tok->enc != NULL) {
591 utf8 = translate_into_utf8(str, tok->enc);
592 if (utf8 == NULL)
593 return error_ret(tok);
594 str = PyString_AsString(utf8);
596 #endif
597 for (s = str;; s++) {
598 if (*s == '\0') break;
599 else if (*s == '\n') {
600 assert(lineno < 2);
601 newl[lineno] = s;
602 lineno++;
603 if (lineno == 2) break;
606 tok->enc = NULL;
607 /* need to check line 1 and 2 separately since check_coding_spec
608 assumes a single line as input */
609 if (newl[0]) {
610 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
611 return error_ret(tok);
612 if (tok->enc == NULL && newl[1]) {
613 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
614 tok, buf_setreadl))
615 return error_ret(tok);
618 #ifdef Py_USING_UNICODE
619 if (tok->enc != NULL) {
620 assert(utf8 == NULL);
621 utf8 = translate_into_utf8(str, tok->enc);
622 if (utf8 == NULL) {
623 PyErr_Format(PyExc_SyntaxError,
624 "unknown encoding: %s", tok->enc);
625 return error_ret(tok);
627 str = PyString_AsString(utf8);
629 #endif
630 assert(tok->decoding_buffer == NULL);
631 tok->decoding_buffer = utf8; /* CAUTION */
632 return str;
635 #endif /* PGEN */
637 /* Set up tokenizer for string */
639 struct tok_state *
640 PyTokenizer_FromString(const char *str)
642 struct tok_state *tok = tok_new();
643 if (tok == NULL)
644 return NULL;
645 str = (char *)decode_str(str, tok);
646 if (str == NULL) {
647 PyTokenizer_Free(tok);
648 return NULL;
651 /* XXX: constify members. */
652 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
653 return tok;
657 /* Set up tokenizer for file */
659 struct tok_state *
660 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
662 struct tok_state *tok = tok_new();
663 if (tok == NULL)
664 return NULL;
665 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
666 PyTokenizer_Free(tok);
667 return NULL;
669 tok->cur = tok->inp = tok->buf;
670 tok->end = tok->buf + BUFSIZ;
671 tok->fp = fp;
672 tok->prompt = ps1;
673 tok->nextprompt = ps2;
674 return tok;
678 /* Free a tok_state structure */
680 void
681 PyTokenizer_Free(struct tok_state *tok)
683 if (tok->encoding != NULL)
684 PyMem_FREE(tok->encoding);
685 #ifndef PGEN
686 Py_XDECREF(tok->decoding_readline);
687 Py_XDECREF(tok->decoding_buffer);
688 #endif
689 if (tok->fp != NULL && tok->buf != NULL)
690 PyMem_FREE(tok->buf);
691 PyMem_FREE(tok);
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* If sys.stdin carries an encoding, re-encode the interactive input
   line *INP from that encoding to UTF-8, replacing *INP in place and
   recording the encoding in TOK.  Returns 0 on success or harmless
   fallback (original *INP kept), -1 on memory exhaustion. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    /* Only applies when sys.stdin really is this process's stdin. */
    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    PyMem_FREE(*inp);
    *inp = converted;
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    /* Fallback to iso-8859-1: for backward compatibility */
    Py_DECREF(enc);
    PyErr_Clear();
    return 0;
}
#endif
754 /* Get next char, updating state; error code goes into tok->done */
756 static int
757 tok_nextc(register struct tok_state *tok)
759 for (;;) {
760 if (tok->cur != tok->inp) {
761 return Py_CHARMASK(*tok->cur++); /* Fast path */
763 if (tok->done != E_OK)
764 return EOF;
765 if (tok->fp == NULL) {
766 char *end = strchr(tok->inp, '\n');
767 if (end != NULL)
768 end++;
769 else {
770 end = strchr(tok->inp, '\0');
771 if (end == tok->inp) {
772 tok->done = E_EOF;
773 return EOF;
776 if (tok->start == NULL)
777 tok->buf = tok->cur;
778 tok->line_start = tok->cur;
779 tok->lineno++;
780 tok->inp = end;
781 return Py_CHARMASK(*tok->cur++);
783 if (tok->prompt != NULL) {
784 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
785 if (tok->nextprompt != NULL)
786 tok->prompt = tok->nextprompt;
787 if (newtok == NULL)
788 tok->done = E_INTR;
789 else if (*newtok == '\0') {
790 PyMem_FREE(newtok);
791 tok->done = E_EOF;
793 #if !defined(PGEN) && defined(Py_USING_UNICODE)
794 else if (tok_stdin_decode(tok, &newtok) != 0)
795 PyMem_FREE(newtok);
796 #endif
797 else if (tok->start != NULL) {
798 size_t start = tok->start - tok->buf;
799 size_t oldlen = tok->cur - tok->buf;
800 size_t newlen = oldlen + strlen(newtok);
801 char *buf = tok->buf;
802 buf = (char *)PyMem_REALLOC(buf, newlen+1);
803 tok->lineno++;
804 if (buf == NULL) {
805 PyMem_FREE(tok->buf);
806 tok->buf = NULL;
807 PyMem_FREE(newtok);
808 tok->done = E_NOMEM;
809 return EOF;
811 tok->buf = buf;
812 tok->cur = tok->buf + oldlen;
813 tok->line_start = tok->cur;
814 strcpy(tok->buf + oldlen, newtok);
815 PyMem_FREE(newtok);
816 tok->inp = tok->buf + newlen;
817 tok->end = tok->inp + 1;
818 tok->start = tok->buf + start;
820 else {
821 tok->lineno++;
822 if (tok->buf != NULL)
823 PyMem_FREE(tok->buf);
824 tok->buf = newtok;
825 tok->line_start = tok->buf;
826 tok->cur = tok->buf;
827 tok->line_start = tok->buf;
828 tok->inp = strchr(tok->buf, '\0');
829 tok->end = tok->inp + 1;
832 else {
833 int done = 0;
834 Py_ssize_t cur = 0;
835 char *pt;
836 if (tok->start == NULL) {
837 if (tok->buf == NULL) {
838 tok->buf = (char *)
839 PyMem_MALLOC(BUFSIZ);
840 if (tok->buf == NULL) {
841 tok->done = E_NOMEM;
842 return EOF;
844 tok->end = tok->buf + BUFSIZ;
846 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
847 tok) == NULL) {
848 tok->done = E_EOF;
849 done = 1;
851 else {
852 tok->done = E_OK;
853 tok->inp = strchr(tok->buf, '\0');
854 done = tok->inp[-1] == '\n';
857 else {
858 cur = tok->cur - tok->buf;
859 if (decoding_feof(tok)) {
860 tok->done = E_EOF;
861 done = 1;
863 else
864 tok->done = E_OK;
866 tok->lineno++;
867 /* Read until '\n' or EOF */
868 while (!done) {
869 Py_ssize_t curstart = tok->start == NULL ? -1 :
870 tok->start - tok->buf;
871 Py_ssize_t curvalid = tok->inp - tok->buf;
872 Py_ssize_t newsize = curvalid + BUFSIZ;
873 char *newbuf = tok->buf;
874 newbuf = (char *)PyMem_REALLOC(newbuf,
875 newsize);
876 if (newbuf == NULL) {
877 tok->done = E_NOMEM;
878 tok->cur = tok->inp;
879 return EOF;
881 tok->buf = newbuf;
882 tok->inp = tok->buf + curvalid;
883 tok->end = tok->buf + newsize;
884 tok->start = curstart < 0 ? NULL :
885 tok->buf + curstart;
886 if (decoding_fgets(tok->inp,
887 (int)(tok->end - tok->inp),
888 tok) == NULL) {
889 /* Break out early on decoding
890 errors, as tok->buf will be NULL
892 if (tok->decoding_erred)
893 return EOF;
894 /* Last line does not end in \n,
895 fake one */
896 strcpy(tok->inp, "\n");
898 tok->inp = strchr(tok->inp, '\0');
899 done = tok->inp[-1] == '\n';
901 if (tok->buf != NULL) {
902 tok->cur = tok->buf + cur;
903 tok->line_start = tok->cur;
904 /* replace "\r\n" with "\n" */
905 /* For Mac leave the \r, giving a syntax error */
906 pt = tok->inp - 2;
907 if (pt >= tok->buf && *pt == '\r') {
908 *pt++ = '\n';
909 *pt = '\0';
910 tok->inp = pt;
914 if (tok->done != E_OK) {
915 if (tok->prompt != NULL)
916 PySys_WriteStderr("\n");
917 tok->cur = tok->inp;
918 return EOF;
921 /*NOTREACHED*/
925 /* Back-up one character */
927 static void
928 tok_backup(register struct tok_state *tok, register int c)
930 if (c != EOF) {
931 if (--tok->cur < tok->buf)
932 Py_FatalError("tok_backup: begin of buffer");
933 if (*tok->cur != c)
934 *tok->cur = c;
939 /* Return the token corresponding to a single character */
942 PyToken_OneChar(int c)
944 switch (c) {
945 case '(': return LPAR;
946 case ')': return RPAR;
947 case '[': return LSQB;
948 case ']': return RSQB;
949 case ':': return COLON;
950 case ',': return COMMA;
951 case ';': return SEMI;
952 case '+': return PLUS;
953 case '-': return MINUS;
954 case '*': return STAR;
955 case '/': return SLASH;
956 case '|': return VBAR;
957 case '&': return AMPER;
958 case '<': return LESS;
959 case '>': return GREATER;
960 case '=': return EQUAL;
961 case '.': return DOT;
962 case '%': return PERCENT;
963 case '`': return BACKQUOTE;
964 case '{': return LBRACE;
965 case '}': return RBRACE;
966 case '^': return CIRCUMFLEX;
967 case '~': return TILDE;
968 case '@': return AT;
969 default: return OP;
975 PyToken_TwoChars(int c1, int c2)
977 switch (c1) {
978 case '=':
979 switch (c2) {
980 case '=': return EQEQUAL;
982 break;
983 case '!':
984 switch (c2) {
985 case '=': return NOTEQUAL;
987 break;
988 case '<':
989 switch (c2) {
990 case '>': return NOTEQUAL;
991 case '=': return LESSEQUAL;
992 case '<': return LEFTSHIFT;
994 break;
995 case '>':
996 switch (c2) {
997 case '=': return GREATEREQUAL;
998 case '>': return RIGHTSHIFT;
1000 break;
1001 case '+':
1002 switch (c2) {
1003 case '=': return PLUSEQUAL;
1005 break;
1006 case '-':
1007 switch (c2) {
1008 case '=': return MINEQUAL;
1010 break;
1011 case '*':
1012 switch (c2) {
1013 case '*': return DOUBLESTAR;
1014 case '=': return STAREQUAL;
1016 break;
1017 case '/':
1018 switch (c2) {
1019 case '/': return DOUBLESLASH;
1020 case '=': return SLASHEQUAL;
1022 break;
1023 case '|':
1024 switch (c2) {
1025 case '=': return VBAREQUAL;
1027 break;
1028 case '%':
1029 switch (c2) {
1030 case '=': return PERCENTEQUAL;
1032 break;
1033 case '&':
1034 switch (c2) {
1035 case '=': return AMPEREQUAL;
1037 break;
1038 case '^':
1039 switch (c2) {
1040 case '=': return CIRCUMFLEXEQUAL;
1042 break;
1044 return OP;
1048 PyToken_ThreeChars(int c1, int c2, int c3)
1050 switch (c1) {
1051 case '<':
1052 switch (c2) {
1053 case '<':
1054 switch (c3) {
1055 case '=':
1056 return LEFTSHIFTEQUAL;
1058 break;
1060 break;
1061 case '>':
1062 switch (c2) {
1063 case '>':
1064 switch (c3) {
1065 case '=':
1066 return RIGHTSHIFTEQUAL;
1068 break;
1070 break;
1071 case '*':
1072 switch (c2) {
1073 case '*':
1074 switch (c3) {
1075 case '=':
1076 return DOUBLESTAREQUAL;
1078 break;
1080 break;
1081 case '/':
1082 switch (c2) {
1083 case '/':
1084 switch (c3) {
1085 case '=':
1086 return DOUBLESLASHEQUAL;
1088 break;
1090 break;
1092 return OP;
1095 static int
1096 indenterror(struct tok_state *tok)
1098 if (tok->alterror) {
1099 tok->done = E_TABSPACE;
1100 tok->cur = tok->inp;
1101 return 1;
1103 if (tok->altwarning) {
1104 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1105 "in indentation\n", tok->filename);
1106 tok->altwarning = 0;
1108 return 0;
1112 /* Get next token, after space stripping etc. */
1114 static int
1115 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1117 register int c;
1118 int blankline;
1120 *p_start = *p_end = NULL;
1121 nextline:
1122 tok->start = NULL;
1123 blankline = 0;
1125 /* Get indentation level */
1126 if (tok->atbol) {
1127 register int col = 0;
1128 register int altcol = 0;
1129 tok->atbol = 0;
1130 for (;;) {
1131 c = tok_nextc(tok);
1132 if (c == ' ')
1133 col++, altcol++;
1134 else if (c == '\t') {
1135 col = (col/tok->tabsize + 1) * tok->tabsize;
1136 altcol = (altcol/tok->alttabsize + 1)
1137 * tok->alttabsize;
1139 else if (c == '\014') /* Control-L (formfeed) */
1140 col = altcol = 0; /* For Emacs users */
1141 else
1142 break;
1144 tok_backup(tok, c);
1145 if (c == '#' || c == '\n') {
1146 /* Lines with only whitespace and/or comments
1147 shouldn't affect the indentation and are
1148 not passed to the parser as NEWLINE tokens,
1149 except *totally* empty lines in interactive
1150 mode, which signal the end of a command group. */
1151 if (col == 0 && c == '\n' && tok->prompt != NULL)
1152 blankline = 0; /* Let it through */
1153 else
1154 blankline = 1; /* Ignore completely */
1155 /* We can't jump back right here since we still
1156 may need to skip to the end of a comment */
1158 if (!blankline && tok->level == 0) {
1159 if (col == tok->indstack[tok->indent]) {
1160 /* No change */
1161 if (altcol != tok->altindstack[tok->indent]) {
1162 if (indenterror(tok))
1163 return ERRORTOKEN;
1166 else if (col > tok->indstack[tok->indent]) {
1167 /* Indent -- always one */
1168 if (tok->indent+1 >= MAXINDENT) {
1169 tok->done = E_TOODEEP;
1170 tok->cur = tok->inp;
1171 return ERRORTOKEN;
1173 if (altcol <= tok->altindstack[tok->indent]) {
1174 if (indenterror(tok))
1175 return ERRORTOKEN;
1177 tok->pendin++;
1178 tok->indstack[++tok->indent] = col;
1179 tok->altindstack[tok->indent] = altcol;
1181 else /* col < tok->indstack[tok->indent] */ {
1182 /* Dedent -- any number, must be consistent */
1183 while (tok->indent > 0 &&
1184 col < tok->indstack[tok->indent]) {
1185 tok->pendin--;
1186 tok->indent--;
1188 if (col != tok->indstack[tok->indent]) {
1189 tok->done = E_DEDENT;
1190 tok->cur = tok->inp;
1191 return ERRORTOKEN;
1193 if (altcol != tok->altindstack[tok->indent]) {
1194 if (indenterror(tok))
1195 return ERRORTOKEN;
1201 tok->start = tok->cur;
1203 /* Return pending indents/dedents */
1204 if (tok->pendin != 0) {
1205 if (tok->pendin < 0) {
1206 tok->pendin++;
1207 return DEDENT;
1209 else {
1210 tok->pendin--;
1211 return INDENT;
1215 again:
1216 tok->start = NULL;
1217 /* Skip spaces */
1218 do {
1219 c = tok_nextc(tok);
1220 } while (c == ' ' || c == '\t' || c == '\014');
1222 /* Set start of current token */
1223 tok->start = tok->cur - 1;
1225 /* Skip comment, while looking for tab-setting magic */
1226 if (c == '#') {
1227 static char *tabforms[] = {
1228 "tab-width:", /* Emacs */
1229 ":tabstop=", /* vim, full form */
1230 ":ts=", /* vim, abbreviated form */
1231 "set tabsize=", /* will vi never die? */
1232 /* more templates can be added here to support other editors */
1234 char cbuf[80];
1235 char *tp, **cp;
1236 tp = cbuf;
1237 do {
1238 *tp++ = c = tok_nextc(tok);
1239 } while (c != EOF && c != '\n' &&
1240 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1241 *tp = '\0';
1242 for (cp = tabforms;
1243 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1244 cp++) {
1245 if ((tp = strstr(cbuf, *cp))) {
1246 int newsize = atoi(tp + strlen(*cp));
1248 if (newsize >= 1 && newsize <= 40) {
1249 tok->tabsize = newsize;
1250 if (Py_VerboseFlag)
1251 PySys_WriteStderr(
1252 "Tab size set to %d\n",
1253 newsize);
1257 while (c != EOF && c != '\n')
1258 c = tok_nextc(tok);
1261 /* Check for EOF and errors now */
1262 if (c == EOF) {
1263 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1266 /* Identifier (most frequent token!) */
1267 if (isalpha(c) || c == '_') {
1268 /* Process r"", u"" and ur"" */
1269 switch (c) {
1270 case 'b':
1271 case 'B':
1272 c = tok_nextc(tok);
1273 if (c == 'r' || c == 'R')
1274 c = tok_nextc(tok);
1275 if (c == '"' || c == '\'')
1276 goto letter_quote;
1277 break;
1278 case 'r':
1279 case 'R':
1280 c = tok_nextc(tok);
1281 if (c == '"' || c == '\'')
1282 goto letter_quote;
1283 break;
1284 case 'u':
1285 case 'U':
1286 c = tok_nextc(tok);
1287 if (c == 'r' || c == 'R')
1288 c = tok_nextc(tok);
1289 if (c == '"' || c == '\'')
1290 goto letter_quote;
1291 break;
1293 while (isalnum(c) || c == '_') {
1294 c = tok_nextc(tok);
1296 tok_backup(tok, c);
1297 *p_start = tok->start;
1298 *p_end = tok->cur;
1299 return NAME;
1302 /* Newline */
1303 if (c == '\n') {
1304 tok->atbol = 1;
1305 if (blankline || tok->level > 0)
1306 goto nextline;
1307 *p_start = tok->start;
1308 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1309 tok->cont_line = 0;
1310 return NEWLINE;
1313 /* Period or number starting with period? */
1314 if (c == '.') {
1315 c = tok_nextc(tok);
1316 if (isdigit(c)) {
1317 goto fraction;
1319 else {
1320 tok_backup(tok, c);
1321 *p_start = tok->start;
1322 *p_end = tok->cur;
1323 return DOT;
1327 /* Number */
1328 if (isdigit(c)) {
1329 if (c == '0') {
1330 /* Hex, octal or binary -- maybe. */
1331 c = tok_nextc(tok);
1332 if (c == '.')
1333 goto fraction;
1334 #ifndef WITHOUT_COMPLEX
1335 if (c == 'j' || c == 'J')
1336 goto imaginary;
1337 #endif
1338 if (c == 'x' || c == 'X') {
1340 /* Hex */
1341 c = tok_nextc(tok);
1342 if (!isxdigit(c)) {
1343 tok->done = E_TOKEN;
1344 tok_backup(tok, c);
1345 return ERRORTOKEN;
1347 do {
1348 c = tok_nextc(tok);
1349 } while (isxdigit(c));
1351 else if (c == 'o' || c == 'O') {
1352 /* Octal */
1353 c = tok_nextc(tok);
1354 if (c < '0' || c >= '8') {
1355 tok->done = E_TOKEN;
1356 tok_backup(tok, c);
1357 return ERRORTOKEN;
1359 do {
1360 c = tok_nextc(tok);
1361 } while ('0' <= c && c < '8');
1363 else if (c == 'b' || c == 'B') {
1364 /* Binary */
1365 c = tok_nextc(tok);
1366 if (c != '0' && c != '1') {
1367 tok->done = E_TOKEN;
1368 tok_backup(tok, c);
1369 return ERRORTOKEN;
1371 do {
1372 c = tok_nextc(tok);
1373 } while (c == '0' || c == '1');
1375 else {
1376 int found_decimal = 0;
1377 /* Octal; c is first char of it */
1378 /* There's no 'isoctdigit' macro, sigh */
1379 while ('0' <= c && c < '8') {
1380 c = tok_nextc(tok);
1382 if (isdigit(c)) {
1383 found_decimal = 1;
1384 do {
1385 c = tok_nextc(tok);
1386 } while (isdigit(c));
1388 if (c == '.')
1389 goto fraction;
1390 else if (c == 'e' || c == 'E')
1391 goto exponent;
1392 #ifndef WITHOUT_COMPLEX
1393 else if (c == 'j' || c == 'J')
1394 goto imaginary;
1395 #endif
1396 else if (found_decimal) {
1397 tok->done = E_TOKEN;
1398 tok_backup(tok, c);
1399 return ERRORTOKEN;
1402 if (c == 'l' || c == 'L')
1403 c = tok_nextc(tok);
1405 else {
1406 /* Decimal */
1407 do {
1408 c = tok_nextc(tok);
1409 } while (isdigit(c));
1410 if (c == 'l' || c == 'L')
1411 c = tok_nextc(tok);
1412 else {
1413 /* Accept floating point numbers. */
1414 if (c == '.') {
1415 fraction:
1416 /* Fraction */
1417 do {
1418 c = tok_nextc(tok);
1419 } while (isdigit(c));
1421 if (c == 'e' || c == 'E') {
1422 exponent:
1423 /* Exponent part */
1424 c = tok_nextc(tok);
1425 if (c == '+' || c == '-')
1426 c = tok_nextc(tok);
1427 if (!isdigit(c)) {
1428 tok->done = E_TOKEN;
1429 tok_backup(tok, c);
1430 return ERRORTOKEN;
1432 do {
1433 c = tok_nextc(tok);
1434 } while (isdigit(c));
1436 #ifndef WITHOUT_COMPLEX
1437 if (c == 'j' || c == 'J')
1438 /* Imaginary part */
1439 imaginary:
1440 c = tok_nextc(tok);
1441 #endif
1444 tok_backup(tok, c);
1445 *p_start = tok->start;
1446 *p_end = tok->cur;
1447 return NUMBER;
1450 letter_quote:
1451 /* String */
1452 if (c == '\'' || c == '"') {
1453 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1454 int quote = c;
1455 int triple = 0;
1456 int tripcount = 0;
1457 for (;;) {
1458 c = tok_nextc(tok);
1459 if (c == '\n') {
1460 if (!triple) {
1461 tok->done = E_EOLS;
1462 tok_backup(tok, c);
1463 return ERRORTOKEN;
1465 tripcount = 0;
1466 tok->cont_line = 1; /* multiline string. */
1468 else if (c == EOF) {
1469 if (triple)
1470 tok->done = E_EOFS;
1471 else
1472 tok->done = E_EOLS;
1473 tok->cur = tok->inp;
1474 return ERRORTOKEN;
1476 else if (c == quote) {
1477 tripcount++;
1478 if (tok->cur - tok->start == quote2) {
1479 c = tok_nextc(tok);
1480 if (c == quote) {
1481 triple = 1;
1482 tripcount = 0;
1483 continue;
1485 tok_backup(tok, c);
1487 if (!triple || tripcount == 3)
1488 break;
1490 else if (c == '\\') {
1491 tripcount = 0;
1492 c = tok_nextc(tok);
1493 if (c == EOF) {
1494 tok->done = E_EOLS;
1495 tok->cur = tok->inp;
1496 return ERRORTOKEN;
1499 else
1500 tripcount = 0;
1502 *p_start = tok->start;
1503 *p_end = tok->cur;
1504 return STRING;
1507 /* Line continuation */
1508 if (c == '\\') {
1509 c = tok_nextc(tok);
1510 if (c != '\n') {
1511 tok->done = E_LINECONT;
1512 tok->cur = tok->inp;
1513 return ERRORTOKEN;
1515 tok->cont_line = 1;
1516 goto again; /* Read next line */
1519 /* Check for two-character token */
1521 int c2 = tok_nextc(tok);
1522 int token = PyToken_TwoChars(c, c2);
1523 #ifndef PGEN
1524 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1525 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1526 "<> not supported in 3.x; use !=",
1527 tok->filename, tok->lineno,
1528 NULL, NULL)) {
1529 return ERRORTOKEN;
1532 #endif
1533 if (token != OP) {
1534 int c3 = tok_nextc(tok);
1535 int token3 = PyToken_ThreeChars(c, c2, c3);
1536 if (token3 != OP) {
1537 token = token3;
1538 } else {
1539 tok_backup(tok, c3);
1541 *p_start = tok->start;
1542 *p_end = tok->cur;
1543 return token;
1545 tok_backup(tok, c2);
1548 /* Keep track of parentheses nesting level */
1549 switch (c) {
1550 case '(':
1551 case '[':
1552 case '{':
1553 tok->level++;
1554 break;
1555 case ')':
1556 case ']':
1557 case '}':
1558 tok->level--;
1559 break;
1562 /* Punctuation character */
1563 *p_start = tok->start;
1564 *p_end = tok->cur;
1565 return PyToken_OneChar(c);
1569 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1571 int result = tok_get(tok, p_start, p_end);
1572 if (tok->decoding_erred) {
1573 result = ERRORTOKEN;
1574 tok->done = E_DECODE;
1576 return result;
1579 /* This function is only called from parsetok. However, it cannot live
1580 there, as it must be empty for PGEN, and we can check for PGEN only
1581 in this file. */
1583 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1584 char*
1585 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1587 return NULL;
1589 #else
1590 #ifdef Py_USING_UNICODE
1591 static PyObject *
1592 dec_utf8(const char *enc, const char *text, size_t len) {
1593 PyObject *ret = NULL;
1594 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1595 if (unicode_text) {
1596 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1597 Py_DECREF(unicode_text);
1599 if (!ret) {
1600 PyErr_Clear();
1602 return ret;
1604 char *
1605 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1607 char *text = NULL;
1608 if (tok->encoding) {
1609 /* convert source to original encondig */
1610 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1611 if (lineobj != NULL) {
1612 int linelen = PyString_Size(lineobj);
1613 const char *line = PyString_AsString(lineobj);
1614 text = PyObject_MALLOC(linelen + 1);
1615 if (text != NULL && line != NULL) {
1616 if (linelen)
1617 strncpy(text, line, linelen);
1618 text[linelen] = '\0';
1620 Py_DECREF(lineobj);
1622 /* adjust error offset */
1623 if (*offset > 1) {
1624 PyObject *offsetobj = dec_utf8(tok->encoding,
1625 tok->buf, *offset-1);
1626 if (offsetobj) {
1627 *offset = PyString_Size(offsetobj) + 1;
1628 Py_DECREF(offsetobj);
1634 return text;
1637 #endif /* defined(Py_USING_UNICODE) */
1638 #endif
1641 #ifdef Py_DEBUG
1643 void
1644 tok_dump(int type, char *start, char *end)
1646 printf("%s", _PyParser_TokenNames[type]);
1647 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1648 printf("(%.*s)", (int)(end - start), start);
1651 #endif