/* Source: python.git, Parser/tokenizer.c
   (blob 0f6705de0b5559f57d13fce6b6a8d9fc10000655; tree browsed at the
   checkin for issue #3366, "Add gamma function to math module") */
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
30 /* Forward */
31 static struct tok_state *tok_new(void);
32 static int tok_nextc(struct tok_state *tok);
33 static void tok_backup(struct tok_state *tok, int c);
35 /* Token names */
/* Human-readable token names, indexed by token number.
   This table must stay in one-to-one correspondence with the
   #defines in token.h! */
char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
96 /* Create and initialize a new tok_state structure */
98 static struct tok_state *
99 tok_new(void)
101 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
102 sizeof(struct tok_state));
103 if (tok == NULL)
104 return NULL;
105 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
106 tok->done = E_OK;
107 tok->fp = NULL;
108 tok->tabsize = TABSIZE;
109 tok->indent = 0;
110 tok->indstack[0] = 0;
111 tok->atbol = 1;
112 tok->pendin = 0;
113 tok->prompt = tok->nextprompt = NULL;
114 tok->lineno = 0;
115 tok->level = 0;
116 tok->filename = NULL;
117 tok->altwarning = 0;
118 tok->alterror = 0;
119 tok->alttabsize = 1;
120 tok->altindstack[0] = 0;
121 tok->decoding_state = 0;
122 tok->decoding_erred = 0;
123 tok->read_coding_spec = 0;
124 tok->encoding = NULL;
125 tok->cont_line = 0;
126 #ifndef PGEN
127 tok->decoding_readline = NULL;
128 tok->decoding_buffer = NULL;
129 #endif
130 return tok;
133 #ifdef PGEN
135 static char *
136 decoding_fgets(char *s, int size, struct tok_state *tok)
138 return fgets(s, size, tok->fp);
141 static int
142 decoding_feof(struct tok_state *tok)
144 return feof(tok->fp);
147 static const char *
148 decode_str(const char *str, struct tok_state *tok)
150 return str;
153 #else /* PGEN */
155 static char *
156 error_ret(struct tok_state *tok) /* XXX */
158 tok->decoding_erred = 1;
159 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
160 PyMem_FREE(tok->buf);
161 tok->buf = NULL;
162 return NULL; /* as if it were EOF */
165 static char *
166 new_string(const char *s, Py_ssize_t len)
168 char* result = (char *)PyMem_MALLOC(len + 1);
169 if (result != NULL) {
170 memcpy(result, s, len);
171 result[len] = '\0';
173 return result;
/* Normalize an encoding name: lowercase it, map '_' to '-', and fold
   the common aliases of utf-8 and iso-8859-1 onto their canonical
   spellings.  Returns a string literal for a recognized alias, or S
   itself (unmodified) when the name is not recognized.  Only the
   first 12 characters are examined, which suffices for every alias
   tested below. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* Cast to unsigned char: passing a negative value
			   (a high-bit byte where char is signed) to
			   tolower() is undefined behavior (CERT STR37-C). */
			buf[i] = tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
199 /* Return the coding spec in S, or NULL if none is found. */
201 static char *
202 get_coding_spec(const char *s, Py_ssize_t size)
204 Py_ssize_t i;
205 /* Coding spec must be in a comment, and that comment must be
206 * the only statement on the source code line. */
207 for (i = 0; i < size - 6; i++) {
208 if (s[i] == '#')
209 break;
210 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
211 return NULL;
213 for (; i < size - 6; i++) { /* XXX inefficient search */
214 const char* t = s + i;
215 if (strncmp(t, "coding", 6) == 0) {
216 const char* begin = NULL;
217 t += 6;
218 if (t[0] != ':' && t[0] != '=')
219 continue;
220 do {
221 t++;
222 } while (t[0] == '\x20' || t[0] == '\t');
224 begin = t;
225 while (isalnum(Py_CHARMASK(t[0])) ||
226 t[0] == '-' || t[0] == '_' || t[0] == '.')
227 t++;
229 if (begin < t) {
230 char* r = new_string(begin, t - begin);
231 char* q = get_normal_name(r);
232 if (r != q) {
233 PyMem_FREE(r);
234 r = new_string(q, strlen(q));
236 return r;
240 return NULL;
243 /* Check whether the line contains a coding spec. If it does,
244 invoke the set_readline function for the new encoding.
245 This function receives the tok_state and the new encoding.
246 Return 1 on success, 0 on failure. */
248 static int
249 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
250 int set_readline(struct tok_state *, const char *))
252 char * cs;
253 int r = 1;
255 if (tok->cont_line)
256 /* It's a continuation line, so it can't be a coding spec. */
257 return 1;
258 cs = get_coding_spec(line, size);
259 if (cs != NULL) {
260 tok->read_coding_spec = 1;
261 if (tok->encoding == NULL) {
262 assert(tok->decoding_state == 1); /* raw */
263 if (strcmp(cs, "utf-8") == 0 ||
264 strcmp(cs, "iso-8859-1") == 0) {
265 tok->encoding = cs;
266 } else {
267 #ifdef Py_USING_UNICODE
268 r = set_readline(tok, cs);
269 if (r) {
270 tok->encoding = cs;
271 tok->decoding_state = -1;
273 else
274 PyMem_FREE(cs);
275 #else
276 /* Without Unicode support, we cannot
277 process the coding spec. Since there
278 won't be any Unicode literals, that
279 won't matter. */
280 PyMem_FREE(cs);
281 #endif
283 } else { /* then, compare cs with BOM */
284 r = (strcmp(tok->encoding, cs) == 0);
285 PyMem_FREE(cs);
288 if (!r) {
289 cs = tok->encoding;
290 if (!cs)
291 cs = "with BOM";
292 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
294 return r;
297 /* See whether the file starts with a BOM. If it does,
298 invoke the set_readline function with the new encoding.
299 Return 1 on success, 0 on failure. */
301 static int
302 check_bom(int get_char(struct tok_state *),
303 void unget_char(int, struct tok_state *),
304 int set_readline(struct tok_state *, const char *),
305 struct tok_state *tok)
307 int ch = get_char(tok);
308 tok->decoding_state = 1;
309 if (ch == EOF) {
310 return 1;
311 } else if (ch == 0xEF) {
312 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
313 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
314 #if 0
315 /* Disable support for UTF-16 BOMs until a decision
316 is made whether this needs to be supported. */
317 } else if (ch == 0xFE) {
318 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
319 if (!set_readline(tok, "utf-16-be")) return 0;
320 tok->decoding_state = -1;
321 } else if (ch == 0xFF) {
322 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
323 if (!set_readline(tok, "utf-16-le")) return 0;
324 tok->decoding_state = -1;
325 #endif
326 } else {
327 unget_char(ch, tok);
328 return 1;
330 if (tok->encoding != NULL)
331 PyMem_FREE(tok->encoding);
332 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
333 return 1;
334 NON_BOM:
335 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
336 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
337 return 1;
340 /* Read a line of text from TOK into S, using the stream in TOK.
341 Return NULL on failure, else S.
343 On entry, tok->decoding_buffer will be one of:
344 1) NULL: need to call tok->decoding_readline to get a new line
345 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
346 stored the result in tok->decoding_buffer
347 3) PyStringObject *: previous call to fp_readl did not have enough room
348 (in the s buffer) to copy entire contents of the line read
349 by tok->decoding_readline. tok->decoding_buffer has the overflow.
350 In this case, fp_readl is called in a loop (with an expanded buffer)
351 until the buffer ends with a '\n' (or until the end of the file is
352 reached): see tok_nextc and its calls to decoding_fgets.
/* Read one decoded line into S (SIZE bytes, including room for the
   terminating NUL), using tok->decoding_readline.  If the decoded line
   does not fit, the overflow is parked in tok->decoding_buffer as a
   string object so the next call can continue where this one stopped.
   Returns S, or NULL on EOF/error (error_ret sets the error state). */
static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL;	/* Keep compiler happy (not reachable) */
#else
	PyObject *utf8 = NULL;
	PyObject *buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		/* No leftover data: fetch a new line from the codec. */
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		/* Consume the parked overflow; if it is already a plain
		   string it is already utf-8 encoded. */
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;
	}
	if (utf8 == NULL) {
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		/* Line too long for S: stash the remainder for the next
		   call and copy only what fits. */
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL;	/* EOF */
	return s;
#endif
}
405 /* Set the readline function for TOK to a StreamReader's
406 readline function. The StreamReader is named ENC.
408 This function is called from check_bom and check_coding_spec.
410 ENC is usually identical to the future value of tok->encoding,
411 except for the (currently unsupported) case of UTF-16.
413 Return 1 on success, 0 on failure. */
415 static int
416 fp_setreadl(struct tok_state *tok, const char* enc)
418 PyObject *reader, *stream, *readline;
420 /* XXX: constify filename argument. */
421 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
422 if (stream == NULL)
423 return 0;
425 reader = PyCodec_StreamReader(enc, stream, NULL);
426 Py_DECREF(stream);
427 if (reader == NULL)
428 return 0;
430 readline = PyObject_GetAttrString(reader, "readline");
431 Py_DECREF(reader);
432 if (readline == NULL)
433 return 0;
435 tok->decoding_readline = readline;
436 return 1;
439 /* Fetch the next byte from TOK. */
441 static int fp_getc(struct tok_state *tok) {
442 return getc(tok->fp);
445 /* Unfetch the last byte back into TOK. */
447 static void fp_ungetc(int c, struct tok_state *tok) {
448 ungetc(c, tok->fp);
451 /* Read a line of input from TOK. Determine encoding
452 if necessary. */
454 static char *
455 decoding_fgets(char *s, int size, struct tok_state *tok)
457 char *line = NULL;
458 int badchar = 0;
459 for (;;) {
460 if (tok->decoding_state < 0) {
461 /* We already have a codec associated with
462 this input. */
463 line = fp_readl(s, size, tok);
464 break;
465 } else if (tok->decoding_state > 0) {
466 /* We want a 'raw' read. */
467 line = Py_UniversalNewlineFgets(s, size,
468 tok->fp, NULL);
469 break;
470 } else {
471 /* We have not yet determined the encoding.
472 If an encoding is found, use the file-pointer
473 reader functions from now on. */
474 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
475 return error_ret(tok);
476 assert(tok->decoding_state != 0);
479 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
480 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
481 return error_ret(tok);
484 #ifndef PGEN
485 /* The default encoding is ASCII, so make sure we don't have any
486 non-ASCII bytes in it. */
487 if (line && !tok->encoding) {
488 unsigned char *c;
489 for (c = (unsigned char *)line; *c; c++)
490 if (*c > 127) {
491 badchar = *c;
492 break;
495 if (badchar) {
496 char buf[500];
497 /* Need to add 1 to the line number, since this line
498 has not been counted, yet. */
499 sprintf(buf,
500 "Non-ASCII character '\\x%.2x' "
501 "in file %.200s on line %i, "
502 "but no encoding declared; "
503 "see http://www.python.org/peps/pep-0263.html for details",
504 badchar, tok->filename, tok->lineno + 1);
505 PyErr_SetString(PyExc_SyntaxError, buf);
506 return error_ret(tok);
508 #endif
509 return line;
512 static int
513 decoding_feof(struct tok_state *tok)
515 if (tok->decoding_state >= 0) {
516 return feof(tok->fp);
517 } else {
518 PyObject* buf = tok->decoding_buffer;
519 if (buf == NULL) {
520 buf = PyObject_CallObject(tok->decoding_readline, NULL);
521 if (buf == NULL) {
522 error_ret(tok);
523 return 1;
524 } else {
525 tok->decoding_buffer = buf;
528 return PyObject_Length(buf) == 0;
532 /* Fetch a byte from TOK, using the string buffer. */
534 static int
535 buf_getc(struct tok_state *tok) {
536 return Py_CHARMASK(*tok->str++);
539 /* Unfetch a byte from TOK, using the string buffer. */
541 static void
542 buf_ungetc(int c, struct tok_state *tok) {
543 tok->str--;
544 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
547 /* Set the readline function for TOK to ENC. For the string-based
548 tokenizer, this means to just record the encoding. */
550 static int
551 buf_setreadl(struct tok_state *tok, const char* enc) {
552 tok->enc = enc;
553 return 1;
556 /* Return a UTF-8 encoding Python string object from the
557 C byte string STR, which is encoded with ENC. */
559 #ifdef Py_USING_UNICODE
560 static PyObject *
561 translate_into_utf8(const char* str, const char* enc) {
562 PyObject *utf8;
563 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
564 if (buf == NULL)
565 return NULL;
566 utf8 = PyUnicode_AsUTF8String(buf);
567 Py_DECREF(buf);
568 return utf8;
570 #endif
572 /* Decode a byte string STR for use as the buffer of TOK.
573 Look for encoding declarations inside STR, and record them
574 inside TOK. */
576 static const char *
577 decode_str(const char *str, struct tok_state *tok)
579 PyObject* utf8 = NULL;
580 const char *s;
581 const char *newl[2] = {NULL, NULL};
582 int lineno = 0;
583 tok->enc = NULL;
584 tok->str = str;
585 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
586 return error_ret(tok);
587 str = tok->str; /* string after BOM if any */
588 assert(str);
589 #ifdef Py_USING_UNICODE
590 if (tok->enc != NULL) {
591 utf8 = translate_into_utf8(str, tok->enc);
592 if (utf8 == NULL)
593 return error_ret(tok);
594 str = PyString_AsString(utf8);
596 #endif
597 for (s = str;; s++) {
598 if (*s == '\0') break;
599 else if (*s == '\n') {
600 assert(lineno < 2);
601 newl[lineno] = s;
602 lineno++;
603 if (lineno == 2) break;
606 tok->enc = NULL;
607 /* need to check line 1 and 2 separately since check_coding_spec
608 assumes a single line as input */
609 if (newl[0]) {
610 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
611 return error_ret(tok);
612 if (tok->enc == NULL && newl[1]) {
613 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
614 tok, buf_setreadl))
615 return error_ret(tok);
618 #ifdef Py_USING_UNICODE
619 if (tok->enc != NULL) {
620 assert(utf8 == NULL);
621 utf8 = translate_into_utf8(str, tok->enc);
622 if (utf8 == NULL)
623 return error_ret(tok);
624 str = PyString_AsString(utf8);
626 #endif
627 assert(tok->decoding_buffer == NULL);
628 tok->decoding_buffer = utf8; /* CAUTION */
629 return str;
632 #endif /* PGEN */
634 /* Set up tokenizer for string */
636 struct tok_state *
637 PyTokenizer_FromString(const char *str)
639 struct tok_state *tok = tok_new();
640 if (tok == NULL)
641 return NULL;
642 str = (char *)decode_str(str, tok);
643 if (str == NULL) {
644 PyTokenizer_Free(tok);
645 return NULL;
648 /* XXX: constify members. */
649 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
650 return tok;
654 /* Set up tokenizer for file */
656 struct tok_state *
657 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
659 struct tok_state *tok = tok_new();
660 if (tok == NULL)
661 return NULL;
662 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
663 PyTokenizer_Free(tok);
664 return NULL;
666 tok->cur = tok->inp = tok->buf;
667 tok->end = tok->buf + BUFSIZ;
668 tok->fp = fp;
669 tok->prompt = ps1;
670 tok->nextprompt = ps2;
671 return tok;
675 /* Free a tok_state structure */
677 void
678 PyTokenizer_Free(struct tok_state *tok)
680 if (tok->encoding != NULL)
681 PyMem_FREE(tok->encoding);
682 #ifndef PGEN
683 Py_XDECREF(tok->decoding_readline);
684 Py_XDECREF(tok->decoding_buffer);
685 #endif
686 if (tok->fp != NULL && tok->buf != NULL)
687 PyMem_FREE(tok->buf);
688 PyMem_FREE(tok);
691 #if !defined(PGEN) && defined(Py_USING_UNICODE)
692 static int
693 tok_stdin_decode(struct tok_state *tok, char **inp)
695 PyObject *enc, *sysstdin, *decoded, *utf8;
696 const char *encoding;
697 char *converted;
699 if (PySys_GetFile((char *)"stdin", NULL) != stdin)
700 return 0;
701 sysstdin = PySys_GetObject("stdin");
702 if (sysstdin == NULL || !PyFile_Check(sysstdin))
703 return 0;
705 enc = ((PyFileObject *)sysstdin)->f_encoding;
706 if (enc == NULL || !PyString_Check(enc))
707 return 0;
708 Py_INCREF(enc);
710 encoding = PyString_AsString(enc);
711 decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
712 if (decoded == NULL)
713 goto error_clear;
715 utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
716 Py_DECREF(decoded);
717 if (utf8 == NULL)
718 goto error_clear;
720 assert(PyString_Check(utf8));
721 converted = new_string(PyString_AS_STRING(utf8),
722 PyString_GET_SIZE(utf8));
723 Py_DECREF(utf8);
724 if (converted == NULL)
725 goto error_nomem;
727 PyMem_FREE(*inp);
728 *inp = converted;
729 if (tok->encoding != NULL)
730 PyMem_FREE(tok->encoding);
731 tok->encoding = new_string(encoding, strlen(encoding));
732 if (tok->encoding == NULL)
733 goto error_nomem;
735 Py_DECREF(enc);
736 return 0;
738 error_nomem:
739 Py_DECREF(enc);
740 tok->done = E_NOMEM;
741 return -1;
743 error_clear:
744 /* Fallback to iso-8859-1: for backward compatibility */
745 Py_DECREF(enc);
746 PyErr_Clear();
747 return 0;
749 #endif
751 /* Get next char, updating state; error code goes into tok->done */
753 static int
754 tok_nextc(register struct tok_state *tok)
756 for (;;) {
757 if (tok->cur != tok->inp) {
758 return Py_CHARMASK(*tok->cur++); /* Fast path */
760 if (tok->done != E_OK)
761 return EOF;
762 if (tok->fp == NULL) {
763 char *end = strchr(tok->inp, '\n');
764 if (end != NULL)
765 end++;
766 else {
767 end = strchr(tok->inp, '\0');
768 if (end == tok->inp) {
769 tok->done = E_EOF;
770 return EOF;
773 if (tok->start == NULL)
774 tok->buf = tok->cur;
775 tok->line_start = tok->cur;
776 tok->lineno++;
777 tok->inp = end;
778 return Py_CHARMASK(*tok->cur++);
780 if (tok->prompt != NULL) {
781 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
782 if (tok->nextprompt != NULL)
783 tok->prompt = tok->nextprompt;
784 if (newtok == NULL)
785 tok->done = E_INTR;
786 else if (*newtok == '\0') {
787 PyMem_FREE(newtok);
788 tok->done = E_EOF;
790 #if !defined(PGEN) && defined(Py_USING_UNICODE)
791 else if (tok_stdin_decode(tok, &newtok) != 0)
792 PyMem_FREE(newtok);
793 #endif
794 else if (tok->start != NULL) {
795 size_t start = tok->start - tok->buf;
796 size_t oldlen = tok->cur - tok->buf;
797 size_t newlen = oldlen + strlen(newtok);
798 char *buf = tok->buf;
799 buf = (char *)PyMem_REALLOC(buf, newlen+1);
800 tok->lineno++;
801 if (buf == NULL) {
802 PyMem_FREE(tok->buf);
803 tok->buf = NULL;
804 PyMem_FREE(newtok);
805 tok->done = E_NOMEM;
806 return EOF;
808 tok->buf = buf;
809 tok->cur = tok->buf + oldlen;
810 tok->line_start = tok->cur;
811 strcpy(tok->buf + oldlen, newtok);
812 PyMem_FREE(newtok);
813 tok->inp = tok->buf + newlen;
814 tok->end = tok->inp + 1;
815 tok->start = tok->buf + start;
817 else {
818 tok->lineno++;
819 if (tok->buf != NULL)
820 PyMem_FREE(tok->buf);
821 tok->buf = newtok;
822 tok->line_start = tok->buf;
823 tok->cur = tok->buf;
824 tok->line_start = tok->buf;
825 tok->inp = strchr(tok->buf, '\0');
826 tok->end = tok->inp + 1;
829 else {
830 int done = 0;
831 Py_ssize_t cur = 0;
832 char *pt;
833 if (tok->start == NULL) {
834 if (tok->buf == NULL) {
835 tok->buf = (char *)
836 PyMem_MALLOC(BUFSIZ);
837 if (tok->buf == NULL) {
838 tok->done = E_NOMEM;
839 return EOF;
841 tok->end = tok->buf + BUFSIZ;
843 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
844 tok) == NULL) {
845 tok->done = E_EOF;
846 done = 1;
848 else {
849 tok->done = E_OK;
850 tok->inp = strchr(tok->buf, '\0');
851 done = tok->inp[-1] == '\n';
854 else {
855 cur = tok->cur - tok->buf;
856 if (decoding_feof(tok)) {
857 tok->done = E_EOF;
858 done = 1;
860 else
861 tok->done = E_OK;
863 tok->lineno++;
864 /* Read until '\n' or EOF */
865 while (!done) {
866 Py_ssize_t curstart = tok->start == NULL ? -1 :
867 tok->start - tok->buf;
868 Py_ssize_t curvalid = tok->inp - tok->buf;
869 Py_ssize_t newsize = curvalid + BUFSIZ;
870 char *newbuf = tok->buf;
871 newbuf = (char *)PyMem_REALLOC(newbuf,
872 newsize);
873 if (newbuf == NULL) {
874 tok->done = E_NOMEM;
875 tok->cur = tok->inp;
876 return EOF;
878 tok->buf = newbuf;
879 tok->inp = tok->buf + curvalid;
880 tok->end = tok->buf + newsize;
881 tok->start = curstart < 0 ? NULL :
882 tok->buf + curstart;
883 if (decoding_fgets(tok->inp,
884 (int)(tok->end - tok->inp),
885 tok) == NULL) {
886 /* Break out early on decoding
887 errors, as tok->buf will be NULL
889 if (tok->decoding_erred)
890 return EOF;
891 /* Last line does not end in \n,
892 fake one */
893 strcpy(tok->inp, "\n");
895 tok->inp = strchr(tok->inp, '\0');
896 done = tok->inp[-1] == '\n';
898 if (tok->buf != NULL) {
899 tok->cur = tok->buf + cur;
900 tok->line_start = tok->cur;
901 /* replace "\r\n" with "\n" */
902 /* For Mac leave the \r, giving a syntax error */
903 pt = tok->inp - 2;
904 if (pt >= tok->buf && *pt == '\r') {
905 *pt++ = '\n';
906 *pt = '\0';
907 tok->inp = pt;
911 if (tok->done != E_OK) {
912 if (tok->prompt != NULL)
913 PySys_WriteStderr("\n");
914 tok->cur = tok->inp;
915 return EOF;
918 /*NOTREACHED*/
922 /* Back-up one character */
924 static void
925 tok_backup(register struct tok_state *tok, register int c)
927 if (c != EOF) {
928 if (--tok->cur < tok->buf)
929 Py_FatalError("tok_backup: begin of buffer");
930 if (*tok->cur != c)
931 *tok->cur = c;
936 /* Return the token corresponding to a single character */
939 PyToken_OneChar(int c)
941 switch (c) {
942 case '(': return LPAR;
943 case ')': return RPAR;
944 case '[': return LSQB;
945 case ']': return RSQB;
946 case ':': return COLON;
947 case ',': return COMMA;
948 case ';': return SEMI;
949 case '+': return PLUS;
950 case '-': return MINUS;
951 case '*': return STAR;
952 case '/': return SLASH;
953 case '|': return VBAR;
954 case '&': return AMPER;
955 case '<': return LESS;
956 case '>': return GREATER;
957 case '=': return EQUAL;
958 case '.': return DOT;
959 case '%': return PERCENT;
960 case '`': return BACKQUOTE;
961 case '{': return LBRACE;
962 case '}': return RBRACE;
963 case '^': return CIRCUMFLEX;
964 case '~': return TILDE;
965 case '@': return AT;
966 default: return OP;
972 PyToken_TwoChars(int c1, int c2)
974 switch (c1) {
975 case '=':
976 switch (c2) {
977 case '=': return EQEQUAL;
979 break;
980 case '!':
981 switch (c2) {
982 case '=': return NOTEQUAL;
984 break;
985 case '<':
986 switch (c2) {
987 case '>': return NOTEQUAL;
988 case '=': return LESSEQUAL;
989 case '<': return LEFTSHIFT;
991 break;
992 case '>':
993 switch (c2) {
994 case '=': return GREATEREQUAL;
995 case '>': return RIGHTSHIFT;
997 break;
998 case '+':
999 switch (c2) {
1000 case '=': return PLUSEQUAL;
1002 break;
1003 case '-':
1004 switch (c2) {
1005 case '=': return MINEQUAL;
1007 break;
1008 case '*':
1009 switch (c2) {
1010 case '*': return DOUBLESTAR;
1011 case '=': return STAREQUAL;
1013 break;
1014 case '/':
1015 switch (c2) {
1016 case '/': return DOUBLESLASH;
1017 case '=': return SLASHEQUAL;
1019 break;
1020 case '|':
1021 switch (c2) {
1022 case '=': return VBAREQUAL;
1024 break;
1025 case '%':
1026 switch (c2) {
1027 case '=': return PERCENTEQUAL;
1029 break;
1030 case '&':
1031 switch (c2) {
1032 case '=': return AMPEREQUAL;
1034 break;
1035 case '^':
1036 switch (c2) {
1037 case '=': return CIRCUMFLEXEQUAL;
1039 break;
1041 return OP;
1045 PyToken_ThreeChars(int c1, int c2, int c3)
1047 switch (c1) {
1048 case '<':
1049 switch (c2) {
1050 case '<':
1051 switch (c3) {
1052 case '=':
1053 return LEFTSHIFTEQUAL;
1055 break;
1057 break;
1058 case '>':
1059 switch (c2) {
1060 case '>':
1061 switch (c3) {
1062 case '=':
1063 return RIGHTSHIFTEQUAL;
1065 break;
1067 break;
1068 case '*':
1069 switch (c2) {
1070 case '*':
1071 switch (c3) {
1072 case '=':
1073 return DOUBLESTAREQUAL;
1075 break;
1077 break;
1078 case '/':
1079 switch (c2) {
1080 case '/':
1081 switch (c3) {
1082 case '=':
1083 return DOUBLESLASHEQUAL;
1085 break;
1087 break;
1089 return OP;
1092 static int
1093 indenterror(struct tok_state *tok)
1095 if (tok->alterror) {
1096 tok->done = E_TABSPACE;
1097 tok->cur = tok->inp;
1098 return 1;
1100 if (tok->altwarning) {
1101 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1102 "in indentation\n", tok->filename);
1103 tok->altwarning = 0;
1105 return 0;
1109 /* Get next token, after space stripping etc. */
1111 static int
1112 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1114 register int c;
1115 int blankline;
1117 *p_start = *p_end = NULL;
1118 nextline:
1119 tok->start = NULL;
1120 blankline = 0;
1122 /* Get indentation level */
1123 if (tok->atbol) {
1124 register int col = 0;
1125 register int altcol = 0;
1126 tok->atbol = 0;
1127 for (;;) {
1128 c = tok_nextc(tok);
1129 if (c == ' ')
1130 col++, altcol++;
1131 else if (c == '\t') {
1132 col = (col/tok->tabsize + 1) * tok->tabsize;
1133 altcol = (altcol/tok->alttabsize + 1)
1134 * tok->alttabsize;
1136 else if (c == '\014') /* Control-L (formfeed) */
1137 col = altcol = 0; /* For Emacs users */
1138 else
1139 break;
1141 tok_backup(tok, c);
1142 if (c == '#' || c == '\n') {
1143 /* Lines with only whitespace and/or comments
1144 shouldn't affect the indentation and are
1145 not passed to the parser as NEWLINE tokens,
1146 except *totally* empty lines in interactive
1147 mode, which signal the end of a command group. */
1148 if (col == 0 && c == '\n' && tok->prompt != NULL)
1149 blankline = 0; /* Let it through */
1150 else
1151 blankline = 1; /* Ignore completely */
1152 /* We can't jump back right here since we still
1153 may need to skip to the end of a comment */
1155 if (!blankline && tok->level == 0) {
1156 if (col == tok->indstack[tok->indent]) {
1157 /* No change */
1158 if (altcol != tok->altindstack[tok->indent]) {
1159 if (indenterror(tok))
1160 return ERRORTOKEN;
1163 else if (col > tok->indstack[tok->indent]) {
1164 /* Indent -- always one */
1165 if (tok->indent+1 >= MAXINDENT) {
1166 tok->done = E_TOODEEP;
1167 tok->cur = tok->inp;
1168 return ERRORTOKEN;
1170 if (altcol <= tok->altindstack[tok->indent]) {
1171 if (indenterror(tok))
1172 return ERRORTOKEN;
1174 tok->pendin++;
1175 tok->indstack[++tok->indent] = col;
1176 tok->altindstack[tok->indent] = altcol;
1178 else /* col < tok->indstack[tok->indent] */ {
1179 /* Dedent -- any number, must be consistent */
1180 while (tok->indent > 0 &&
1181 col < tok->indstack[tok->indent]) {
1182 tok->pendin--;
1183 tok->indent--;
1185 if (col != tok->indstack[tok->indent]) {
1186 tok->done = E_DEDENT;
1187 tok->cur = tok->inp;
1188 return ERRORTOKEN;
1190 if (altcol != tok->altindstack[tok->indent]) {
1191 if (indenterror(tok))
1192 return ERRORTOKEN;
1198 tok->start = tok->cur;
1200 /* Return pending indents/dedents */
1201 if (tok->pendin != 0) {
1202 if (tok->pendin < 0) {
1203 tok->pendin++;
1204 return DEDENT;
1206 else {
1207 tok->pendin--;
1208 return INDENT;
1212 again:
1213 tok->start = NULL;
1214 /* Skip spaces */
1215 do {
1216 c = tok_nextc(tok);
1217 } while (c == ' ' || c == '\t' || c == '\014');
1219 /* Set start of current token */
1220 tok->start = tok->cur - 1;
1222 /* Skip comment, while looking for tab-setting magic */
1223 if (c == '#') {
1224 static char *tabforms[] = {
1225 "tab-width:", /* Emacs */
1226 ":tabstop=", /* vim, full form */
1227 ":ts=", /* vim, abbreviated form */
1228 "set tabsize=", /* will vi never die? */
1229 /* more templates can be added here to support other editors */
1231 char cbuf[80];
1232 char *tp, **cp;
1233 tp = cbuf;
1234 do {
1235 *tp++ = c = tok_nextc(tok);
1236 } while (c != EOF && c != '\n' &&
1237 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1238 *tp = '\0';
1239 for (cp = tabforms;
1240 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1241 cp++) {
1242 if ((tp = strstr(cbuf, *cp))) {
1243 int newsize = atoi(tp + strlen(*cp));
1245 if (newsize >= 1 && newsize <= 40) {
1246 tok->tabsize = newsize;
1247 if (Py_VerboseFlag)
1248 PySys_WriteStderr(
1249 "Tab size set to %d\n",
1250 newsize);
1254 while (c != EOF && c != '\n')
1255 c = tok_nextc(tok);
1258 /* Check for EOF and errors now */
1259 if (c == EOF) {
1260 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1263 /* Identifier (most frequent token!) */
1264 if (isalpha(c) || c == '_') {
1265 /* Process r"", u"" and ur"" */
1266 switch (c) {
1267 case 'b':
1268 case 'B':
1269 c = tok_nextc(tok);
1270 if (c == 'r' || c == 'R')
1271 c = tok_nextc(tok);
1272 if (c == '"' || c == '\'')
1273 goto letter_quote;
1274 break;
1275 case 'r':
1276 case 'R':
1277 c = tok_nextc(tok);
1278 if (c == '"' || c == '\'')
1279 goto letter_quote;
1280 break;
1281 case 'u':
1282 case 'U':
1283 c = tok_nextc(tok);
1284 if (c == 'r' || c == 'R')
1285 c = tok_nextc(tok);
1286 if (c == '"' || c == '\'')
1287 goto letter_quote;
1288 break;
1290 while (isalnum(c) || c == '_') {
1291 c = tok_nextc(tok);
1293 tok_backup(tok, c);
1294 *p_start = tok->start;
1295 *p_end = tok->cur;
1296 return NAME;
1299 /* Newline */
1300 if (c == '\n') {
1301 tok->atbol = 1;
1302 if (blankline || tok->level > 0)
1303 goto nextline;
1304 *p_start = tok->start;
1305 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1306 tok->cont_line = 0;
1307 return NEWLINE;
1310 /* Period or number starting with period? */
1311 if (c == '.') {
1312 c = tok_nextc(tok);
1313 if (isdigit(c)) {
1314 goto fraction;
1316 else {
1317 tok_backup(tok, c);
1318 *p_start = tok->start;
1319 *p_end = tok->cur;
1320 return DOT;
1324 /* Number */
1325 if (isdigit(c)) {
1326 if (c == '0') {
1327 /* Hex, octal or binary -- maybe. */
1328 c = tok_nextc(tok);
1329 if (c == '.')
1330 goto fraction;
1331 #ifndef WITHOUT_COMPLEX
1332 if (c == 'j' || c == 'J')
1333 goto imaginary;
1334 #endif
1335 if (c == 'x' || c == 'X') {
1337 /* Hex */
1338 c = tok_nextc(tok);
1339 if (!isxdigit(c)) {
1340 tok->done = E_TOKEN;
1341 tok_backup(tok, c);
1342 return ERRORTOKEN;
1344 do {
1345 c = tok_nextc(tok);
1346 } while (isxdigit(c));
1348 else if (c == 'o' || c == 'O') {
1349 /* Octal */
1350 c = tok_nextc(tok);
1351 if (c < '0' || c >= '8') {
1352 tok->done = E_TOKEN;
1353 tok_backup(tok, c);
1354 return ERRORTOKEN;
1356 do {
1357 c = tok_nextc(tok);
1358 } while ('0' <= c && c < '8');
1360 else if (c == 'b' || c == 'B') {
1361 /* Binary */
1362 c = tok_nextc(tok);
1363 if (c != '0' && c != '1') {
1364 tok->done = E_TOKEN;
1365 tok_backup(tok, c);
1366 return ERRORTOKEN;
1368 do {
1369 c = tok_nextc(tok);
1370 } while (c == '0' || c == '1');
1372 else {
1373 int found_decimal = 0;
1374 /* Octal; c is first char of it */
1375 /* There's no 'isoctdigit' macro, sigh */
1376 while ('0' <= c && c < '8') {
1377 c = tok_nextc(tok);
1379 if (isdigit(c)) {
1380 found_decimal = 1;
1381 do {
1382 c = tok_nextc(tok);
1383 } while (isdigit(c));
1385 if (c == '.')
1386 goto fraction;
1387 else if (c == 'e' || c == 'E')
1388 goto exponent;
1389 #ifndef WITHOUT_COMPLEX
1390 else if (c == 'j' || c == 'J')
1391 goto imaginary;
1392 #endif
1393 else if (found_decimal) {
1394 tok->done = E_TOKEN;
1395 tok_backup(tok, c);
1396 return ERRORTOKEN;
1399 if (c == 'l' || c == 'L')
1400 c = tok_nextc(tok);
1402 else {
1403 /* Decimal */
1404 do {
1405 c = tok_nextc(tok);
1406 } while (isdigit(c));
1407 if (c == 'l' || c == 'L')
1408 c = tok_nextc(tok);
1409 else {
1410 /* Accept floating point numbers. */
1411 if (c == '.') {
1412 fraction:
1413 /* Fraction */
1414 do {
1415 c = tok_nextc(tok);
1416 } while (isdigit(c));
1418 if (c == 'e' || c == 'E') {
1419 exponent:
1420 /* Exponent part */
1421 c = tok_nextc(tok);
1422 if (c == '+' || c == '-')
1423 c = tok_nextc(tok);
1424 if (!isdigit(c)) {
1425 tok->done = E_TOKEN;
1426 tok_backup(tok, c);
1427 return ERRORTOKEN;
1429 do {
1430 c = tok_nextc(tok);
1431 } while (isdigit(c));
1433 #ifndef WITHOUT_COMPLEX
1434 if (c == 'j' || c == 'J')
1435 /* Imaginary part */
1436 imaginary:
1437 c = tok_nextc(tok);
1438 #endif
1441 tok_backup(tok, c);
1442 *p_start = tok->start;
1443 *p_end = tok->cur;
1444 return NUMBER;
1447 letter_quote:
1448 /* String */
1449 if (c == '\'' || c == '"') {
1450 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1451 int quote = c;
1452 int triple = 0;
1453 int tripcount = 0;
1454 for (;;) {
1455 c = tok_nextc(tok);
1456 if (c == '\n') {
1457 if (!triple) {
1458 tok->done = E_EOLS;
1459 tok_backup(tok, c);
1460 return ERRORTOKEN;
1462 tripcount = 0;
1463 tok->cont_line = 1; /* multiline string. */
1465 else if (c == EOF) {
1466 if (triple)
1467 tok->done = E_EOFS;
1468 else
1469 tok->done = E_EOLS;
1470 tok->cur = tok->inp;
1471 return ERRORTOKEN;
1473 else if (c == quote) {
1474 tripcount++;
1475 if (tok->cur - tok->start == quote2) {
1476 c = tok_nextc(tok);
1477 if (c == quote) {
1478 triple = 1;
1479 tripcount = 0;
1480 continue;
1482 tok_backup(tok, c);
1484 if (!triple || tripcount == 3)
1485 break;
1487 else if (c == '\\') {
1488 tripcount = 0;
1489 c = tok_nextc(tok);
1490 if (c == EOF) {
1491 tok->done = E_EOLS;
1492 tok->cur = tok->inp;
1493 return ERRORTOKEN;
1496 else
1497 tripcount = 0;
1499 *p_start = tok->start;
1500 *p_end = tok->cur;
1501 return STRING;
1504 /* Line continuation */
1505 if (c == '\\') {
1506 c = tok_nextc(tok);
1507 if (c != '\n') {
1508 tok->done = E_LINECONT;
1509 tok->cur = tok->inp;
1510 return ERRORTOKEN;
1512 tok->cont_line = 1;
1513 goto again; /* Read next line */
1516 /* Check for two-character token */
1518 int c2 = tok_nextc(tok);
1519 int token = PyToken_TwoChars(c, c2);
1520 #ifndef PGEN
1521 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1522 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1523 "<> not supported in 3.x; use !=",
1524 tok->filename, tok->lineno,
1525 NULL, NULL)) {
1526 return ERRORTOKEN;
1529 #endif
1530 if (token != OP) {
1531 int c3 = tok_nextc(tok);
1532 int token3 = PyToken_ThreeChars(c, c2, c3);
1533 if (token3 != OP) {
1534 token = token3;
1535 } else {
1536 tok_backup(tok, c3);
1538 *p_start = tok->start;
1539 *p_end = tok->cur;
1540 return token;
1542 tok_backup(tok, c2);
1545 /* Keep track of parentheses nesting level */
1546 switch (c) {
1547 case '(':
1548 case '[':
1549 case '{':
1550 tok->level++;
1551 break;
1552 case ')':
1553 case ']':
1554 case '}':
1555 tok->level--;
1556 break;
1559 /* Punctuation character */
1560 *p_start = tok->start;
1561 *p_end = tok->cur;
1562 return PyToken_OneChar(c);
1566 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1568 int result = tok_get(tok, p_start, p_end);
1569 if (tok->decoding_erred) {
1570 result = ERRORTOKEN;
1571 tok->done = E_DECODE;
1573 return result;
1576 /* This function is only called from parsetok. However, it cannot live
1577 there, as it must be empty for PGEN, and we can check for PGEN only
1578 in this file. */
1580 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1581 char*
1582 PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
1584 return NULL;
1586 #else
1587 #ifdef Py_USING_UNICODE
1588 static PyObject *
1589 dec_utf8(const char *enc, const char *text, size_t len) {
1590 PyObject *ret = NULL;
1591 PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
1592 if (unicode_text) {
1593 ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
1594 Py_DECREF(unicode_text);
1596 if (!ret) {
1597 PyErr_Clear();
1599 return ret;
/* Re-encode the first LEN bytes of tok->buf (which the tokenizer keeps
   as UTF-8 -- see the dec_utf8 calls below) back into the source
   file's declared encoding, so error messages can show the line as the
   user wrote it.  Returns a PyObject_MALLOC'ed, NUL-terminated copy of
   the re-encoded line (caller owns it), or NULL if no encoding is set
   or conversion fails.  *offset, a 1-based column within the UTF-8
   buffer, is rewritten in place to the matching column in the
   re-encoded text. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
	char *text = NULL;
	if (tok->encoding) {
		/* convert source to original encoding */
		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
		if (lineobj != NULL) {
			int linelen = PyString_Size(lineobj);
			const char *line = PyString_AsString(lineobj);
			text = PyObject_MALLOC(linelen + 1);
			if (text != NULL && line != NULL) {
				if (linelen)
					strncpy(text, line, linelen);
				/* strncpy does not guarantee termination */
				text[linelen] = '\0';
			}
			Py_DECREF(lineobj);
		}
		/* adjust error offset: re-encode just the prefix before the
		   error column and measure its length in the original
		   encoding */
		if (*offset > 1) {
			PyObject *offsetobj = dec_utf8(tok->encoding,
						       tok->buf, *offset-1);
			if (offsetobj) {
				*offset = PyString_Size(offsetobj) + 1;
				Py_DECREF(offsetobj);
			}
		}
	}
	return text;
1634 #endif /* defined(Py_USING_UNICODE) */
1635 #endif
1638 #ifdef Py_DEBUG
1640 void
1641 tok_dump(int type, char *start, char *end)
1643 printf("%s", _PyParser_TokenNames[type]);
1644 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1645 printf("(%.*s)", (int)(end - start), start);
1648 #endif