/* Parser/tokenizer.c — CPython tokenizer implementation.
   (python.git blob 707e76291c47983f4359813a4a7ccf13a09f9985;
   commit message: "Test wouldn't work in debug mode.") */
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
35 /* Token names */
/* Token names, indexed by token type.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
        "ENDMARKER",
        "NAME",
        "NUMBER",
        "STRING",
        "NEWLINE",
        "INDENT",
        "DEDENT",
        "LPAR",
        "RPAR",
        "LSQB",
        "RSQB",
        "COLON",
        "COMMA",
        "SEMI",
        "PLUS",
        "MINUS",
        "STAR",
        "SLASH",
        "VBAR",
        "AMPER",
        "LESS",
        "GREATER",
        "EQUAL",
        "DOT",
        "PERCENT",
        "BACKQUOTE",
        "LBRACE",
        "RBRACE",
        "EQEQUAL",
        "NOTEQUAL",
        "LESSEQUAL",
        "GREATEREQUAL",
        "TILDE",
        "CIRCUMFLEX",
        "LEFTSHIFT",
        "RIGHTSHIFT",
        "DOUBLESTAR",
        "PLUSEQUAL",
        "MINEQUAL",
        "STAREQUAL",
        "SLASHEQUAL",
        "PERCENTEQUAL",
        "AMPEREQUAL",
        "VBAREQUAL",
        "CIRCUMFLEXEQUAL",
        "LEFTSHIFTEQUAL",
        "RIGHTSHIFTEQUAL",
        "DOUBLESTAREQUAL",
        "DOUBLESLASH",
        "DOUBLESLASHEQUAL",
        "AT",
        "OP",
        "<ERRORTOKEN>",
        "<N_TOKENS>"
};
96 /* Create and initialize a new tok_state structure */
98 static struct tok_state *
99 tok_new(void)
101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
103 if (tok == NULL)
104 return NULL;
105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
126 #ifndef PGEN
127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
129 #endif
130 return tok;
133 #ifdef PGEN
135 static char *
136 decoding_fgets(char *s, int size, struct tok_state *tok)
138 return fgets(s, size, tok->fp);
141 static int
142 decoding_feof(struct tok_state *tok)
144 return feof(tok->fp);
/* PGEN build: no decoding, return the input string unchanged. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
        return str;
}
153 #else /* PGEN */
155 static char *
156 error_ret(struct tok_state *tok) /* XXX */
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
160 PyMem_FREE(tok->buf);
161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
165 static char *
166 new_string(const char *s, Py_ssize_t len)
168 char* result = (char *)PyMem_MALLOC(len + 1);
169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
173 return result;
/* Normalize an encoding name for comparison: lower-case it and map '_'
   to '-', looking at the first 12 characters only.  Spellings of utf-8
   and latin-1 are folded to their canonical names; any other name is
   returned unchanged (S itself). */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
        char norm[13];
        int n;
        for (n = 0; n < 12; n++) {
                int ch = s[n];
                if (ch == '\0')
                        break;
                norm[n] = (ch == '_') ? '-' : tolower(ch);
        }
        norm[n] = '\0';
        if (strcmp(norm, "utf-8") == 0 ||
            strncmp(norm, "utf-8-", 6) == 0)
                return "utf-8";
        if (strcmp(norm, "latin-1") == 0 ||
            strcmp(norm, "iso-8859-1") == 0 ||
            strcmp(norm, "iso-latin-1") == 0 ||
            strncmp(norm, "latin-1-", 8) == 0 ||
            strncmp(norm, "iso-8859-1-", 11) == 0 ||
            strncmp(norm, "iso-latin-1-", 12) == 0)
                return "iso-8859-1";
        return s;
}
205 /* Return the coding spec in S, or NULL if none is found. */
207 static char *
208 get_coding_spec(const char *s, Py_ssize_t size)
210 Py_ssize_t i;
211 /* Coding spec must be in a comment, and that comment must be
212 * the only statement on the source code line. */
213 for (i = 0; i < size - 6; i++) {
214 if (s[i] == '#')
215 break;
216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217 return NULL;
219 for (; i < size - 6; i++) { /* XXX inefficient search */
220 const char* t = s + i;
221 if (strncmp(t, "coding", 6) == 0) {
222 const char* begin = NULL;
223 t += 6;
224 if (t[0] != ':' && t[0] != '=')
225 continue;
226 do {
227 t++;
228 } while (t[0] == '\x20' || t[0] == '\t');
230 begin = t;
231 while (isalnum(Py_CHARMASK(t[0])) ||
232 t[0] == '-' || t[0] == '_' || t[0] == '.')
233 t++;
235 if (begin < t) {
236 char* r = new_string(begin, t - begin);
237 char* q = get_normal_name(r);
238 if (r != q) {
239 PyMem_FREE(r);
240 r = new_string(q, strlen(q));
242 return r;
246 return NULL;
249 /* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
254 static int
255 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
256 int set_readline(struct tok_state *, const char *))
258 char * cs;
259 int r = 1;
261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
264 cs = get_coding_spec(line, size);
265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
273 #ifdef Py_USING_UNICODE
274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
279 else
280 PyMem_FREE(cs);
281 #else
282 /* Without Unicode support, we cannot
283 process the coding spec. Since there
284 won't be any Unicode literals, that
285 won't matter. */
286 PyMem_FREE(cs);
287 #endif
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
291 PyMem_FREE(cs);
294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300 return r;
303 /* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
307 static int
308 check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
313 int ch = get_char(tok);
314 tok->decoding_state = 1;
315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
318 ch = get_char(tok);
319 if (ch != 0xBB)
320 goto NON_BOM;
321 ch = get_char(tok);
322 if (ch != 0xBF)
323 goto NON_BOM;
324 #if 0
325 /* Disable support for UTF-16 BOMs until a decision
326 is made whether this needs to be supported. */
327 } else if (ch == 0xFE) {
328 ch = get_char(tok);
329 if (ch != 0xFF)
330 goto NON_BOM;
331 if (!set_readline(tok, "utf-16-be"))
332 return 0;
333 tok->decoding_state = -1;
334 } else if (ch == 0xFF) {
335 ch = get_char(tok);
336 if (ch != 0xFE)
337 goto NON_BOM;
338 if (!set_readline(tok, "utf-16-le"))
339 return 0;
340 tok->decoding_state = -1;
341 #endif
342 } else {
343 unget_char(ch, tok);
344 return 1;
346 if (tok->encoding != NULL)
347 PyMem_FREE(tok->encoding);
348 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
349 return 1;
350 NON_BOM:
351 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
352 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
353 return 1;
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has pre-fetched a line and
        stored it in tok->decoding_buffer
     3) PyStringObject *: a previous call did not have enough room in S
        for the whole decoded line; the overflow is stored here and
        tok_nextc keeps calling us (via decoding_fgets) with a bigger
        buffer until the line ends in '\n' or EOF. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
        /* In a non-Unicode build, this should never be called. */
        Py_FatalError("fp_readl should not be called in this build.");
        return NULL; /* Keep compiler happy (not reachable) */
#else
        PyObject *utf8 = NULL;
        PyObject *buf = tok->decoding_buffer;
        char *str;
        Py_ssize_t utf8len;

        /* Ask for one less byte so we can terminate it */
        assert(size > 0);
        size--;

        if (buf == NULL) {
                buf = PyObject_CallObject(tok->decoding_readline, NULL);
                if (buf == NULL)
                        return error_ret(tok);
        } else {
                tok->decoding_buffer = NULL;
                if (PyString_CheckExact(buf))
                        utf8 = buf;     /* overflow bytes: already UTF-8 */
        }
        if (utf8 == NULL) {
                utf8 = PyUnicode_AsUTF8String(buf);
                Py_DECREF(buf);
                if (utf8 == NULL)
                        return error_ret(tok);
        }
        str = PyString_AsString(utf8);
        utf8len = PyString_GET_SIZE(utf8);
        if (utf8len > size) {
                /* Doesn't fit: stash the remainder for the next call. */
                tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
                if (tok->decoding_buffer == NULL) {
                        Py_DECREF(utf8);
                        return error_ret(tok);
                }
                utf8len = size;
        }
        memcpy(s, str, utf8len);
        s[utf8len] = '\0';
        Py_DECREF(utf8);
        if (utf8len == 0)
                return NULL;    /* EOF */
        return s;
#endif
}
422 /* Set the readline function for TOK to a StreamReader's
423 readline function. The StreamReader is named ENC.
425 This function is called from check_bom and check_coding_spec.
427 ENC is usually identical to the future value of tok->encoding,
428 except for the (currently unsupported) case of UTF-16.
430 Return 1 on success, 0 on failure. */
432 static int
433 fp_setreadl(struct tok_state *tok, const char* enc)
435 PyObject *reader, *stream, *readline;
437 /* XXX: constify filename argument. */
438 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
439 if (stream == NULL)
440 return 0;
442 reader = PyCodec_StreamReader(enc, stream, NULL);
443 Py_DECREF(stream);
444 if (reader == NULL)
445 return 0;
447 readline = PyObject_GetAttrString(reader, "readline");
448 Py_DECREF(reader);
449 if (readline == NULL)
450 return 0;
452 tok->decoding_readline = readline;
453 return 1;
456 /* Fetch the next byte from TOK. */
458 static int fp_getc(struct tok_state *tok) {
459 return getc(tok->fp);
462 /* Unfetch the last byte back into TOK. */
464 static void fp_ungetc(int c, struct tok_state *tok) {
465 ungetc(c, tok->fp);
468 /* Read a line of input from TOK. Determine encoding
469 if necessary. */
471 static char *
472 decoding_fgets(char *s, int size, struct tok_state *tok)
474 char *line = NULL;
475 int badchar = 0;
476 for (;;) {
477 if (tok->decoding_state < 0) {
478 /* We already have a codec associated with
479 this input. */
480 line = fp_readl(s, size, tok);
481 break;
482 } else if (tok->decoding_state > 0) {
483 /* We want a 'raw' read. */
484 line = Py_UniversalNewlineFgets(s, size,
485 tok->fp, NULL);
486 break;
487 } else {
488 /* We have not yet determined the encoding.
489 If an encoding is found, use the file-pointer
490 reader functions from now on. */
491 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
492 return error_ret(tok);
493 assert(tok->decoding_state != 0);
496 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
497 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
498 return error_ret(tok);
501 #ifndef PGEN
502 /* The default encoding is ASCII, so make sure we don't have any
503 non-ASCII bytes in it. */
504 if (line && !tok->encoding) {
505 unsigned char *c;
506 for (c = (unsigned char *)line; *c; c++)
507 if (*c > 127) {
508 badchar = *c;
509 break;
512 if (badchar) {
513 char buf[500];
514 /* Need to add 1 to the line number, since this line
515 has not been counted, yet. */
516 sprintf(buf,
517 "Non-ASCII character '\\x%.2x' "
518 "in file %.200s on line %i, "
519 "but no encoding declared; "
520 "see http://www.python.org/peps/pep-0263.html for details",
521 badchar, tok->filename, tok->lineno + 1);
522 PyErr_SetString(PyExc_SyntaxError, buf);
523 return error_ret(tok);
525 #endif
526 return line;
529 static int
530 decoding_feof(struct tok_state *tok)
532 if (tok->decoding_state >= 0) {
533 return feof(tok->fp);
534 } else {
535 PyObject* buf = tok->decoding_buffer;
536 if (buf == NULL) {
537 buf = PyObject_CallObject(tok->decoding_readline, NULL);
538 if (buf == NULL) {
539 error_ret(tok);
540 return 1;
541 } else {
542 tok->decoding_buffer = buf;
545 return PyObject_Length(buf) == 0;
549 /* Fetch a byte from TOK, using the string buffer. */
551 static int
552 buf_getc(struct tok_state *tok) {
553 return Py_CHARMASK(*tok->str++);
556 /* Unfetch a byte from TOK, using the string buffer. */
558 static void
559 buf_ungetc(int c, struct tok_state *tok) {
560 tok->str--;
561 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
564 /* Set the readline function for TOK to ENC. For the string-based
565 tokenizer, this means to just record the encoding. */
567 static int
568 buf_setreadl(struct tok_state *tok, const char* enc) {
569 tok->enc = enc;
570 return 1;
/* Return a UTF-8 encoded Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char *str, const char *enc)
{
        PyObject *utf8;
        PyObject *buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
        if (buf == NULL)
                return NULL;
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        return utf8;
}
#endif
589 /* Decode a byte string STR for use as the buffer of TOK.
590 Look for encoding declarations inside STR, and record them
591 inside TOK. */
593 static const char *
594 decode_str(const char *str, struct tok_state *tok)
596 PyObject* utf8 = NULL;
597 const char *s;
598 const char *newl[2] = {NULL, NULL};
599 int lineno = 0;
600 tok->enc = NULL;
601 tok->str = str;
602 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
603 return error_ret(tok);
604 str = tok->str; /* string after BOM if any */
605 assert(str);
606 #ifdef Py_USING_UNICODE
607 if (tok->enc != NULL) {
608 utf8 = translate_into_utf8(str, tok->enc);
609 if (utf8 == NULL)
610 return error_ret(tok);
611 str = PyString_AsString(utf8);
613 #endif
614 for (s = str;; s++) {
615 if (*s == '\0') break;
616 else if (*s == '\n') {
617 assert(lineno < 2);
618 newl[lineno] = s;
619 lineno++;
620 if (lineno == 2) break;
623 tok->enc = NULL;
624 /* need to check line 1 and 2 separately since check_coding_spec
625 assumes a single line as input */
626 if (newl[0]) {
627 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
628 return error_ret(tok);
629 if (tok->enc == NULL && newl[1]) {
630 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
631 tok, buf_setreadl))
632 return error_ret(tok);
635 #ifdef Py_USING_UNICODE
636 if (tok->enc != NULL) {
637 assert(utf8 == NULL);
638 utf8 = translate_into_utf8(str, tok->enc);
639 if (utf8 == NULL)
640 return error_ret(tok);
641 str = PyString_AsString(utf8);
643 #endif
644 assert(tok->decoding_buffer == NULL);
645 tok->decoding_buffer = utf8; /* CAUTION */
646 return str;
649 #endif /* PGEN */
651 /* Set up tokenizer for string */
653 struct tok_state *
654 PyTokenizer_FromString(const char *str)
656 struct tok_state *tok = tok_new();
657 if (tok == NULL)
658 return NULL;
659 str = (char *)decode_str(str, tok);
660 if (str == NULL) {
661 PyTokenizer_Free(tok);
662 return NULL;
665 /* XXX: constify members. */
666 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
667 return tok;
671 /* Set up tokenizer for file */
673 struct tok_state *
674 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
676 struct tok_state *tok = tok_new();
677 if (tok == NULL)
678 return NULL;
679 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
680 PyTokenizer_Free(tok);
681 return NULL;
683 tok->cur = tok->inp = tok->buf;
684 tok->end = tok->buf + BUFSIZ;
685 tok->fp = fp;
686 tok->prompt = ps1;
687 tok->nextprompt = ps2;
688 return tok;
692 /* Free a tok_state structure */
694 void
695 PyTokenizer_Free(struct tok_state *tok)
697 if (tok->encoding != NULL)
698 PyMem_FREE(tok->encoding);
699 #ifndef PGEN
700 Py_XDECREF(tok->decoding_readline);
701 Py_XDECREF(tok->decoding_buffer);
702 #endif
703 if (tok->fp != NULL && tok->buf != NULL)
704 PyMem_FREE(tok->buf);
705 PyMem_FREE(tok);
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactive line from sys.stdin's declared encoding to
   UTF-8, replacing *inp in place and recording the encoding in TOK.
   Returns 0 on success (or harmless fallback to the raw bytes), -1 on
   memory error with tok->done set to E_NOMEM. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
        PyObject *enc, *sysstdin, *decoded, *utf8;
        const char *encoding;
        char *converted;

        /* Only applies when we are really reading from interactive stdin. */
        if (PySys_GetFile((char *)"stdin", NULL) != stdin)
                return 0;
        sysstdin = PySys_GetObject("stdin");
        if (sysstdin == NULL || !PyFile_Check(sysstdin))
                return 0;

        enc = ((PyFileObject *)sysstdin)->f_encoding;
        if (enc == NULL || !PyString_Check(enc))
                return 0;
        Py_INCREF(enc);

        encoding = PyString_AsString(enc);
        decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
        if (decoded == NULL)
                goto error_clear;

        utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
        Py_DECREF(decoded);
        if (utf8 == NULL)
                goto error_clear;

        assert(PyString_Check(utf8));
        converted = new_string(PyString_AS_STRING(utf8),
                               PyString_GET_SIZE(utf8));
        Py_DECREF(utf8);
        if (converted == NULL)
                goto error_nomem;

        PyMem_FREE(*inp);
        *inp = converted;
        if (tok->encoding != NULL)
                PyMem_FREE(tok->encoding);
        tok->encoding = new_string(encoding, strlen(encoding));
        if (tok->encoding == NULL)
                goto error_nomem;

        Py_DECREF(enc);
        return 0;

error_nomem:
        Py_DECREF(enc);
        tok->done = E_NOMEM;
        return -1;

error_clear:
        /* Fallback to iso-8859-1: for backward compatibility */
        Py_DECREF(enc);
        PyErr_Clear();
        return 0;
}
#endif
768 /* Get next char, updating state; error code goes into tok->done */
770 static int
771 tok_nextc(register struct tok_state *tok)
773 for (;;) {
774 if (tok->cur != tok->inp) {
775 return Py_CHARMASK(*tok->cur++); /* Fast path */
777 if (tok->done != E_OK)
778 return EOF;
779 if (tok->fp == NULL) {
780 char *end = strchr(tok->inp, '\n');
781 if (end != NULL)
782 end++;
783 else {
784 end = strchr(tok->inp, '\0');
785 if (end == tok->inp) {
786 tok->done = E_EOF;
787 return EOF;
790 if (tok->start == NULL)
791 tok->buf = tok->cur;
792 tok->line_start = tok->cur;
793 tok->lineno++;
794 tok->inp = end;
795 return Py_CHARMASK(*tok->cur++);
797 if (tok->prompt != NULL) {
798 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
799 if (tok->nextprompt != NULL)
800 tok->prompt = tok->nextprompt;
801 if (newtok == NULL)
802 tok->done = E_INTR;
803 else if (*newtok == '\0') {
804 PyMem_FREE(newtok);
805 tok->done = E_EOF;
807 #if !defined(PGEN) && defined(Py_USING_UNICODE)
808 else if (tok_stdin_decode(tok, &newtok) != 0)
809 PyMem_FREE(newtok);
810 #endif
811 else if (tok->start != NULL) {
812 size_t start = tok->start - tok->buf;
813 size_t oldlen = tok->cur - tok->buf;
814 size_t newlen = oldlen + strlen(newtok);
815 char *buf = tok->buf;
816 buf = (char *)PyMem_REALLOC(buf, newlen+1);
817 tok->lineno++;
818 if (buf == NULL) {
819 PyMem_FREE(tok->buf);
820 tok->buf = NULL;
821 PyMem_FREE(newtok);
822 tok->done = E_NOMEM;
823 return EOF;
825 tok->buf = buf;
826 tok->cur = tok->buf + oldlen;
827 tok->line_start = tok->cur;
828 strcpy(tok->buf + oldlen, newtok);
829 PyMem_FREE(newtok);
830 tok->inp = tok->buf + newlen;
831 tok->end = tok->inp + 1;
832 tok->start = tok->buf + start;
834 else {
835 tok->lineno++;
836 if (tok->buf != NULL)
837 PyMem_FREE(tok->buf);
838 tok->buf = newtok;
839 tok->line_start = tok->buf;
840 tok->cur = tok->buf;
841 tok->line_start = tok->buf;
842 tok->inp = strchr(tok->buf, '\0');
843 tok->end = tok->inp + 1;
846 else {
847 int done = 0;
848 Py_ssize_t cur = 0;
849 char *pt;
850 if (tok->start == NULL) {
851 if (tok->buf == NULL) {
852 tok->buf = (char *)
853 PyMem_MALLOC(BUFSIZ);
854 if (tok->buf == NULL) {
855 tok->done = E_NOMEM;
856 return EOF;
858 tok->end = tok->buf + BUFSIZ;
860 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
861 tok) == NULL) {
862 tok->done = E_EOF;
863 done = 1;
865 else {
866 tok->done = E_OK;
867 tok->inp = strchr(tok->buf, '\0');
868 done = tok->inp[-1] == '\n';
871 else {
872 cur = tok->cur - tok->buf;
873 if (decoding_feof(tok)) {
874 tok->done = E_EOF;
875 done = 1;
877 else
878 tok->done = E_OK;
880 tok->lineno++;
881 /* Read until '\n' or EOF */
882 while (!done) {
883 Py_ssize_t curstart = tok->start == NULL ? -1 :
884 tok->start - tok->buf;
885 Py_ssize_t curvalid = tok->inp - tok->buf;
886 Py_ssize_t newsize = curvalid + BUFSIZ;
887 char *newbuf = tok->buf;
888 newbuf = (char *)PyMem_REALLOC(newbuf,
889 newsize);
890 if (newbuf == NULL) {
891 tok->done = E_NOMEM;
892 tok->cur = tok->inp;
893 return EOF;
895 tok->buf = newbuf;
896 tok->inp = tok->buf + curvalid;
897 tok->end = tok->buf + newsize;
898 tok->start = curstart < 0 ? NULL :
899 tok->buf + curstart;
900 if (decoding_fgets(tok->inp,
901 (int)(tok->end - tok->inp),
902 tok) == NULL) {
903 /* Break out early on decoding
904 errors, as tok->buf will be NULL
906 if (tok->decoding_erred)
907 return EOF;
908 /* Last line does not end in \n,
909 fake one */
910 strcpy(tok->inp, "\n");
912 tok->inp = strchr(tok->inp, '\0');
913 done = tok->inp[-1] == '\n';
915 if (tok->buf != NULL) {
916 tok->cur = tok->buf + cur;
917 tok->line_start = tok->cur;
918 /* replace "\r\n" with "\n" */
919 /* For Mac leave the \r, giving a syntax error */
920 pt = tok->inp - 2;
921 if (pt >= tok->buf && *pt == '\r') {
922 *pt++ = '\n';
923 *pt = '\0';
924 tok->inp = pt;
928 if (tok->done != E_OK) {
929 if (tok->prompt != NULL)
930 PySys_WriteStderr("\n");
931 tok->cur = tok->inp;
932 return EOF;
935 /*NOTREACHED*/
939 /* Back-up one character */
941 static void
942 tok_backup(register struct tok_state *tok, register int c)
944 if (c != EOF) {
945 if (--tok->cur < tok->buf)
946 Py_FatalError("tok_backup: begin of buffer");
947 if (*tok->cur != c)
948 *tok->cur = c;
953 /* Return the token corresponding to a single character */
956 PyToken_OneChar(int c)
958 switch (c) {
959 case '(': return LPAR;
960 case ')': return RPAR;
961 case '[': return LSQB;
962 case ']': return RSQB;
963 case ':': return COLON;
964 case ',': return COMMA;
965 case ';': return SEMI;
966 case '+': return PLUS;
967 case '-': return MINUS;
968 case '*': return STAR;
969 case '/': return SLASH;
970 case '|': return VBAR;
971 case '&': return AMPER;
972 case '<': return LESS;
973 case '>': return GREATER;
974 case '=': return EQUAL;
975 case '.': return DOT;
976 case '%': return PERCENT;
977 case '`': return BACKQUOTE;
978 case '{': return LBRACE;
979 case '}': return RBRACE;
980 case '^': return CIRCUMFLEX;
981 case '~': return TILDE;
982 case '@': return AT;
983 default: return OP;
989 PyToken_TwoChars(int c1, int c2)
991 switch (c1) {
992 case '=':
993 switch (c2) {
994 case '=': return EQEQUAL;
996 break;
997 case '!':
998 switch (c2) {
999 case '=': return NOTEQUAL;
1001 break;
1002 case '<':
1003 switch (c2) {
1004 case '>': return NOTEQUAL;
1005 case '=': return LESSEQUAL;
1006 case '<': return LEFTSHIFT;
1008 break;
1009 case '>':
1010 switch (c2) {
1011 case '=': return GREATEREQUAL;
1012 case '>': return RIGHTSHIFT;
1014 break;
1015 case '+':
1016 switch (c2) {
1017 case '=': return PLUSEQUAL;
1019 break;
1020 case '-':
1021 switch (c2) {
1022 case '=': return MINEQUAL;
1024 break;
1025 case '*':
1026 switch (c2) {
1027 case '*': return DOUBLESTAR;
1028 case '=': return STAREQUAL;
1030 break;
1031 case '/':
1032 switch (c2) {
1033 case '/': return DOUBLESLASH;
1034 case '=': return SLASHEQUAL;
1036 break;
1037 case '|':
1038 switch (c2) {
1039 case '=': return VBAREQUAL;
1041 break;
1042 case '%':
1043 switch (c2) {
1044 case '=': return PERCENTEQUAL;
1046 break;
1047 case '&':
1048 switch (c2) {
1049 case '=': return AMPEREQUAL;
1051 break;
1052 case '^':
1053 switch (c2) {
1054 case '=': return CIRCUMFLEXEQUAL;
1056 break;
1058 return OP;
1062 PyToken_ThreeChars(int c1, int c2, int c3)
1064 switch (c1) {
1065 case '<':
1066 switch (c2) {
1067 case '<':
1068 switch (c3) {
1069 case '=':
1070 return LEFTSHIFTEQUAL;
1072 break;
1074 break;
1075 case '>':
1076 switch (c2) {
1077 case '>':
1078 switch (c3) {
1079 case '=':
1080 return RIGHTSHIFTEQUAL;
1082 break;
1084 break;
1085 case '*':
1086 switch (c2) {
1087 case '*':
1088 switch (c3) {
1089 case '=':
1090 return DOUBLESTAREQUAL;
1092 break;
1094 break;
1095 case '/':
1096 switch (c2) {
1097 case '/':
1098 switch (c3) {
1099 case '=':
1100 return DOUBLESLASHEQUAL;
1102 break;
1104 break;
1106 return OP;
1109 static int
1110 indenterror(struct tok_state *tok)
1112 if (tok->alterror) {
1113 tok->done = E_TABSPACE;
1114 tok->cur = tok->inp;
1115 return 1;
1117 if (tok->altwarning) {
1118 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1119 "in indentation\n", tok->filename);
1120 tok->altwarning = 0;
1122 return 0;
1126 /* Get next token, after space stripping etc. */
1128 static int
1129 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1131 register int c;
1132 int blankline;
1134 *p_start = *p_end = NULL;
1135 nextline:
1136 tok->start = NULL;
1137 blankline = 0;
1139 /* Get indentation level */
1140 if (tok->atbol) {
1141 register int col = 0;
1142 register int altcol = 0;
1143 tok->atbol = 0;
1144 for (;;) {
1145 c = tok_nextc(tok);
1146 if (c == ' ')
1147 col++, altcol++;
1148 else if (c == '\t') {
1149 col = (col/tok->tabsize + 1) * tok->tabsize;
1150 altcol = (altcol/tok->alttabsize + 1)
1151 * tok->alttabsize;
1153 else if (c == '\014') /* Control-L (formfeed) */
1154 col = altcol = 0; /* For Emacs users */
1155 else
1156 break;
1158 tok_backup(tok, c);
1159 if (c == '#' || c == '\n') {
1160 /* Lines with only whitespace and/or comments
1161 shouldn't affect the indentation and are
1162 not passed to the parser as NEWLINE tokens,
1163 except *totally* empty lines in interactive
1164 mode, which signal the end of a command group. */
1165 if (col == 0 && c == '\n' && tok->prompt != NULL)
1166 blankline = 0; /* Let it through */
1167 else
1168 blankline = 1; /* Ignore completely */
1169 /* We can't jump back right here since we still
1170 may need to skip to the end of a comment */
1172 if (!blankline && tok->level == 0) {
1173 if (col == tok->indstack[tok->indent]) {
1174 /* No change */
1175 if (altcol != tok->altindstack[tok->indent]) {
1176 if (indenterror(tok))
1177 return ERRORTOKEN;
1180 else if (col > tok->indstack[tok->indent]) {
1181 /* Indent -- always one */
1182 if (tok->indent+1 >= MAXINDENT) {
1183 tok->done = E_TOODEEP;
1184 tok->cur = tok->inp;
1185 return ERRORTOKEN;
1187 if (altcol <= tok->altindstack[tok->indent]) {
1188 if (indenterror(tok))
1189 return ERRORTOKEN;
1191 tok->pendin++;
1192 tok->indstack[++tok->indent] = col;
1193 tok->altindstack[tok->indent] = altcol;
1195 else /* col < tok->indstack[tok->indent] */ {
1196 /* Dedent -- any number, must be consistent */
1197 while (tok->indent > 0 &&
1198 col < tok->indstack[tok->indent]) {
1199 tok->pendin--;
1200 tok->indent--;
1202 if (col != tok->indstack[tok->indent]) {
1203 tok->done = E_DEDENT;
1204 tok->cur = tok->inp;
1205 return ERRORTOKEN;
1207 if (altcol != tok->altindstack[tok->indent]) {
1208 if (indenterror(tok))
1209 return ERRORTOKEN;
1215 tok->start = tok->cur;
1217 /* Return pending indents/dedents */
1218 if (tok->pendin != 0) {
1219 if (tok->pendin < 0) {
1220 tok->pendin++;
1221 return DEDENT;
1223 else {
1224 tok->pendin--;
1225 return INDENT;
1229 again:
1230 tok->start = NULL;
1231 /* Skip spaces */
1232 do {
1233 c = tok_nextc(tok);
1234 } while (c == ' ' || c == '\t' || c == '\014');
1236 /* Set start of current token */
1237 tok->start = tok->cur - 1;
1239 /* Skip comment, while looking for tab-setting magic */
1240 if (c == '#') {
1241 static char *tabforms[] = {
1242 "tab-width:", /* Emacs */
1243 ":tabstop=", /* vim, full form */
1244 ":ts=", /* vim, abbreviated form */
1245 "set tabsize=", /* will vi never die? */
1246 /* more templates can be added here to support other editors */
1248 char cbuf[80];
1249 char *tp, **cp;
1250 tp = cbuf;
1251 do {
1252 *tp++ = c = tok_nextc(tok);
1253 } while (c != EOF && c != '\n' &&
1254 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1255 *tp = '\0';
1256 for (cp = tabforms;
1257 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1258 cp++) {
1259 if ((tp = strstr(cbuf, *cp))) {
1260 int newsize = atoi(tp + strlen(*cp));
1262 if (newsize >= 1 && newsize <= 40) {
1263 tok->tabsize = newsize;
1264 if (Py_VerboseFlag)
1265 PySys_WriteStderr(
1266 "Tab size set to %d\n",
1267 newsize);
1271 while (c != EOF && c != '\n')
1272 c = tok_nextc(tok);
1275 /* Check for EOF and errors now */
1276 if (c == EOF) {
1277 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1280 /* Identifier (most frequent token!) */
1281 if (isalpha(c) || c == '_') {
1282 /* Process r"", u"" and ur"" */
1283 switch (c) {
1284 case 'b':
1285 case 'B':
1286 c = tok_nextc(tok);
1287 if (c == 'r' || c == 'R')
1288 c = tok_nextc(tok);
1289 if (c == '"' || c == '\'')
1290 goto letter_quote;
1291 break;
1292 case 'r':
1293 case 'R':
1294 c = tok_nextc(tok);
1295 if (c == '"' || c == '\'')
1296 goto letter_quote;
1297 break;
1298 case 'u':
1299 case 'U':
1300 c = tok_nextc(tok);
1301 if (c == 'r' || c == 'R')
1302 c = tok_nextc(tok);
1303 if (c == '"' || c == '\'')
1304 goto letter_quote;
1305 break;
1307 while (isalnum(c) || c == '_') {
1308 c = tok_nextc(tok);
1310 tok_backup(tok, c);
1311 *p_start = tok->start;
1312 *p_end = tok->cur;
1313 return NAME;
1316 /* Newline */
1317 if (c == '\n') {
1318 tok->atbol = 1;
1319 if (blankline || tok->level > 0)
1320 goto nextline;
1321 *p_start = tok->start;
1322 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1323 tok->cont_line = 0;
1324 return NEWLINE;
1327 /* Period or number starting with period? */
1328 if (c == '.') {
1329 c = tok_nextc(tok);
1330 if (isdigit(c)) {
1331 goto fraction;
1333 else {
1334 tok_backup(tok, c);
1335 *p_start = tok->start;
1336 *p_end = tok->cur;
1337 return DOT;
1341 /* Number */
1342 if (isdigit(c)) {
1343 if (c == '0') {
1344 /* Hex, octal or binary -- maybe. */
1345 c = tok_nextc(tok);
1346 if (c == '.')
1347 goto fraction;
1348 #ifndef WITHOUT_COMPLEX
1349 if (c == 'j' || c == 'J')
1350 goto imaginary;
1351 #endif
1352 if (c == 'x' || c == 'X') {
1354 /* Hex */
1355 c = tok_nextc(tok);
1356 if (!isxdigit(c)) {
1357 tok->done = E_TOKEN;
1358 tok_backup(tok, c);
1359 return ERRORTOKEN;
1361 do {
1362 c = tok_nextc(tok);
1363 } while (isxdigit(c));
1365 else if (c == 'o' || c == 'O') {
1366 /* Octal */
1367 c = tok_nextc(tok);
1368 if (c < '0' || c >= '8') {
1369 tok->done = E_TOKEN;
1370 tok_backup(tok, c);
1371 return ERRORTOKEN;
1373 do {
1374 c = tok_nextc(tok);
1375 } while ('0' <= c && c < '8');
1377 else if (c == 'b' || c == 'B') {
1378 /* Binary */
1379 c = tok_nextc(tok);
1380 if (c != '0' && c != '1') {
1381 tok->done = E_TOKEN;
1382 tok_backup(tok, c);
1383 return ERRORTOKEN;
1385 do {
1386 c = tok_nextc(tok);
1387 } while (c == '0' || c == '1');
1389 else {
1390 int found_decimal = 0;
1391 /* Octal; c is first char of it */
1392 /* There's no 'isoctdigit' macro, sigh */
1393 while ('0' <= c && c < '8') {
1394 c = tok_nextc(tok);
1396 if (isdigit(c)) {
1397 found_decimal = 1;
1398 do {
1399 c = tok_nextc(tok);
1400 } while (isdigit(c));
1402 if (c == '.')
1403 goto fraction;
1404 else if (c == 'e' || c == 'E')
1405 goto exponent;
1406 #ifndef WITHOUT_COMPLEX
1407 else if (c == 'j' || c == 'J')
1408 goto imaginary;
1409 #endif
1410 else if (found_decimal) {
1411 tok->done = E_TOKEN;
1412 tok_backup(tok, c);
1413 return ERRORTOKEN;
1416 if (c == 'l' || c == 'L')
1417 c = tok_nextc(tok);
1419 else {
1420 /* Decimal */
1421 do {
1422 c = tok_nextc(tok);
1423 } while (isdigit(c));
1424 if (c == 'l' || c == 'L')
1425 c = tok_nextc(tok);
1426 else {
1427 /* Accept floating point numbers. */
1428 if (c == '.') {
1429 fraction:
1430 /* Fraction */
1431 do {
1432 c = tok_nextc(tok);
1433 } while (isdigit(c));
1435 if (c == 'e' || c == 'E') {
1436 exponent:
1437 /* Exponent part */
1438 c = tok_nextc(tok);
1439 if (c == '+' || c == '-')
1440 c = tok_nextc(tok);
1441 if (!isdigit(c)) {
1442 tok->done = E_TOKEN;
1443 tok_backup(tok, c);
1444 return ERRORTOKEN;
1446 do {
1447 c = tok_nextc(tok);
1448 } while (isdigit(c));
1450 #ifndef WITHOUT_COMPLEX
1451 if (c == 'j' || c == 'J')
1452 /* Imaginary part */
1453 imaginary:
1454 c = tok_nextc(tok);
1455 #endif
1458 tok_backup(tok, c);
1459 *p_start = tok->start;
1460 *p_end = tok->cur;
1461 return NUMBER;
1464 letter_quote:
1465 /* String */
1466 if (c == '\'' || c == '"') {
1467 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1468 int quote = c;
1469 int triple = 0;
1470 int tripcount = 0;
1471 for (;;) {
1472 c = tok_nextc(tok);
1473 if (c == '\n') {
1474 if (!triple) {
1475 tok->done = E_EOLS;
1476 tok_backup(tok, c);
1477 return ERRORTOKEN;
1479 tripcount = 0;
1480 tok->cont_line = 1; /* multiline string. */
1482 else if (c == EOF) {
1483 if (triple)
1484 tok->done = E_EOFS;
1485 else
1486 tok->done = E_EOLS;
1487 tok->cur = tok->inp;
1488 return ERRORTOKEN;
1490 else if (c == quote) {
1491 tripcount++;
1492 if (tok->cur - tok->start == quote2) {
1493 c = tok_nextc(tok);
1494 if (c == quote) {
1495 triple = 1;
1496 tripcount = 0;
1497 continue;
1499 tok_backup(tok, c);
1501 if (!triple || tripcount == 3)
1502 break;
1504 else if (c == '\\') {
1505 tripcount = 0;
1506 c = tok_nextc(tok);
1507 if (c == EOF) {
1508 tok->done = E_EOLS;
1509 tok->cur = tok->inp;
1510 return ERRORTOKEN;
1513 else
1514 tripcount = 0;
1516 *p_start = tok->start;
1517 *p_end = tok->cur;
1518 return STRING;
1521 /* Line continuation */
1522 if (c == '\\') {
1523 c = tok_nextc(tok);
1524 if (c != '\n') {
1525 tok->done = E_LINECONT;
1526 tok->cur = tok->inp;
1527 return ERRORTOKEN;
1529 tok->cont_line = 1;
1530 goto again; /* Read next line */
1533 /* Check for two-character token */
1535 int c2 = tok_nextc(tok);
1536 int token = PyToken_TwoChars(c, c2);
1537 #ifndef PGEN
1538 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1539 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1540 "<> not supported in 3.x; use !=",
1541 tok->filename, tok->lineno,
1542 NULL, NULL)) {
1543 return ERRORTOKEN;
1546 #endif
1547 if (token != OP) {
1548 int c3 = tok_nextc(tok);
1549 int token3 = PyToken_ThreeChars(c, c2, c3);
1550 if (token3 != OP) {
1551 token = token3;
1552 } else {
1553 tok_backup(tok, c3);
1555 *p_start = tok->start;
1556 *p_end = tok->cur;
1557 return token;
1559 tok_backup(tok, c2);
1562 /* Keep track of parentheses nesting level */
1563 switch (c) {
1564 case '(':
1565 case '[':
1566 case '{':
1567 tok->level++;
1568 break;
1569 case ')':
1570 case ']':
1571 case '}':
1572 tok->level--;
1573 break;
1576 /* Punctuation character */
1577 *p_start = tok->start;
1578 *p_end = tok->cur;
1579 return PyToken_OneChar(c);
1583 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1585 int result = tok_get(tok, p_start, p_end);
1586 if (tok->decoding_erred) {
1587 result = ERRORTOKEN;
1588 tok->done = E_DECODE;
1590 return result;
/* This function is only called from parsetok.  However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */
1597 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1598 char*
1599 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1601 return NULL;
1603 #else
1604 #ifdef Py_USING_UNICODE
1605 static PyObject *
1606 dec_utf8(const char *enc, const char *text, size_t len) {
1607 PyObject *ret = NULL;
1608 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1609 if (unicode_text) {
1610 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1611 Py_DECREF(unicode_text);
1613 if (!ret) {
1614 PyErr_Clear();
1616 return ret;
1618 char *
1619 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
1621 char *text = NULL;
1622 if (tok->encoding) {
1623 /* convert source to original encondig */
1624 PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
1625 if (lineobj != NULL) {
1626 int linelen = PyString_Size(lineobj);
1627 const char *line = PyString_AsString(lineobj);
1628 text = PyObject_MALLOC(linelen + 1);
1629 if (text != NULL && line != NULL) {
1630 if (linelen)
1631 strncpy(text, line, linelen);
1632 text[linelen] = '\0';
1634 Py_DECREF(lineobj);
1636 /* adjust error offset */
1637 if (*offset > 1) {
1638 PyObject *offsetobj = dec_utf8(tok->encoding,
1639 tok->buf, *offset-1);
1640 if (offsetobj) {
1641 *offset = PyString_Size(offsetobj) + 1;
1642 Py_DECREF(offsetobj);
1648 return text;
1651 #endif /* defined(Py_USING_UNICODE) */
1652 #endif
#ifdef Py_DEBUG

/* Debug helper: print a token's symbolic name and, for tokens that carry
   text (NAME/NUMBER/STRING/OP), the token's source text between START and
   END.  No trailing newline is emitted; the caller controls layout. */
void
tok_dump(int type, char *start, char *end)
{
    printf("%s", _PyParser_TokenNames[type]);
    if (type == NAME || type == NUMBER || type == STRING || type == OP)
        printf("(%.*s)", (int)(end - start), start);
}

#endif