Merged revisions 73623-73624 via svnmerge from
[python/dscho.git] / Parser / tokenizer.c
blobcc142a7127278d0b57fbed164bbaa74f8b359f5d
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
/* Nonzero if C can start an identifier: ASCII letter, '_', or any byte
   >= 128 (non-ASCII bytes are validated later by verify_identifier()).
   Every use of the parameter is parenthesized so compound argument
   expressions bind correctly.  NOTE: the argument is evaluated more
   than once -- do not pass expressions with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
           || ((c) >= 'A' && (c) <= 'Z')\
           || (c) == '_'\
           || ((c) >= 128))
/* Nonzero if C can continue an identifier: ASCII letter, digit, '_',
   or any byte >= 128.  Parameter fully parenthesized (see
   is_potential_identifier_start); evaluated more than once. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
           || ((c) >= 'A' && (c) <= 'Z')\
           || ((c) >= '0' && (c) <= '9')\
           || (c) == '_'\
           || ((c) >= 128))
34 extern char *PyOS_Readline(FILE *, FILE *, char *);
35 /* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
39 /* Don't ever change this -- it would break the portability of Python code */
40 #define TABSIZE 8
42 /* Forward */
43 static struct tok_state *tok_new(void);
44 static int tok_nextc(struct tok_state *tok);
45 static void tok_backup(struct tok_state *tok, int c);
48 /* Token names */
/* Printable token names, indexed by token number.  The order of this
   table must match the #defines in token.h exactly -- do not reorder
   or insert entries independently. */
char *_PyParser_TokenNames[] = {
    "ENDMARKER",
    "NAME",
    "NUMBER",
    "STRING",
    "NEWLINE",
    "INDENT",
    "DEDENT",
    "LPAR",
    "RPAR",
    "LSQB",
    "RSQB",
    "COLON",
    "COMMA",
    "SEMI",
    "PLUS",
    "MINUS",
    "STAR",
    "SLASH",
    "VBAR",
    "AMPER",
    "LESS",
    "GREATER",
    "EQUAL",
    "DOT",
    "PERCENT",
    "LBRACE",
    "RBRACE",
    "EQEQUAL",
    "NOTEQUAL",
    "LESSEQUAL",
    "GREATEREQUAL",
    "TILDE",
    "CIRCUMFLEX",
    "LEFTSHIFT",
    "RIGHTSHIFT",
    "DOUBLESTAR",
    "PLUSEQUAL",
    "MINEQUAL",
    "STAREQUAL",
    "SLASHEQUAL",
    "PERCENTEQUAL",
    "AMPEREQUAL",
    "VBAREQUAL",
    "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL",
    "RIGHTSHIFTEQUAL",
    "DOUBLESTAREQUAL",
    "DOUBLESLASH",
    "DOUBLESLASHEQUAL",
    "AT",
    "RARROW",
    "ELLIPSIS",
    /* This table must match the #defines in token.h! */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
110 /* Create and initialize a new tok_state structure */
112 static struct tok_state *
113 tok_new(void)
115 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
116 sizeof(struct tok_state));
117 if (tok == NULL)
118 return NULL;
119 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
120 tok->done = E_OK;
121 tok->fp = NULL;
122 tok->tabsize = TABSIZE;
123 tok->indent = 0;
124 tok->indstack[0] = 0;
125 tok->atbol = 1;
126 tok->pendin = 0;
127 tok->prompt = tok->nextprompt = NULL;
128 tok->lineno = 0;
129 tok->level = 0;
130 tok->filename = NULL;
131 tok->altwarning = 1;
132 tok->alterror = 1;
133 tok->alttabsize = 1;
134 tok->altindstack[0] = 0;
135 tok->decoding_state = STATE_INIT;
136 tok->decoding_erred = 0;
137 tok->read_coding_spec = 0;
138 tok->enc = NULL;
139 tok->encoding = NULL;
140 tok->cont_line = 0;
141 #ifndef PGEN
142 tok->decoding_readline = NULL;
143 tok->decoding_buffer = NULL;
144 #endif
145 return tok;
#ifdef PGEN

/* PGEN build: the parser generator reads plain ASCII grammar files, so
   no encoding detection is required -- these are thin pass-throughs. */

/* Read a raw line from the tokenizer's stdio stream. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    return fgets(s, size, tok->fp);
}

/* EOF test on the raw stdio stream. */
static int
decoding_feof(struct tok_state *tok)
{
    return feof(tok->fp);
}

/* No decoding: the input string is used as-is. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    return str;
}

#else /* PGEN */
/* Put the tokenizer into its decoding-error state and return NULL so
   callers can treat the failure like EOF.  For file-based input the
   line buffer is freed here and zeroed so PyTokenizer_Free does not
   free it a second time. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_FREE(tok->buf);
    tok->buf = NULL;
    return NULL; /* as if it were EOF */
}
180 static char *
181 new_string(const char *s, Py_ssize_t len)
183 char* result = (char *)PyMem_MALLOC(len + 1);
184 if (result != NULL) {
185 memcpy(result, s, len);
186 result[len] = '\0';
188 return result;
/* Normalize an encoding name for the two encodings the tokenizer
   treats specially (for utf-8 and latin-1).  Lower-cases up to the
   first 12 characters of S, mapping '_' to '-', and returns the static
   string "utf-8" or "iso-8859-1" for any of their aliases; any other
   name is returned as S itself, unchanged. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Read the byte as unsigned: passing a negative value (a byte
           >= 0x80 where char is signed) to tolower() is undefined
           behavior. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
214 /* Return the coding spec in S, or NULL if none is found. */
216 static char *
217 get_coding_spec(const char *s, Py_ssize_t size)
219 Py_ssize_t i;
220 /* Coding spec must be in a comment, and that comment must be
221 * the only statement on the source code line. */
222 for (i = 0; i < size - 6; i++) {
223 if (s[i] == '#')
224 break;
225 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
226 return NULL;
228 for (; i < size - 6; i++) { /* XXX inefficient search */
229 const char* t = s + i;
230 if (strncmp(t, "coding", 6) == 0) {
231 const char* begin = NULL;
232 t += 6;
233 if (t[0] != ':' && t[0] != '=')
234 continue;
235 do {
236 t++;
237 } while (t[0] == '\x20' || t[0] == '\t');
239 begin = t;
240 while (isalnum(Py_CHARMASK(t[0])) ||
241 t[0] == '-' || t[0] == '_' || t[0] == '.')
242 t++;
244 if (begin < t) {
245 char* r = new_string(begin, t - begin);
246 char* q = get_normal_name(r);
247 if (r != q) {
248 PyMem_FREE(r);
249 r = new_string(q, strlen(q));
251 return r;
255 return NULL;
/* Check whether the line contains a coding spec.  If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   utf-8 needs no re-decoding, so only the name is recorded for it.
   If an encoding was already picked up (from a BOM), the spec must
   agree with it.  Return 1 on success, 0 on failure (SyntaxError
   set). */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char * cs;
    int r = 1;

    if (tok->cont_line)
        /* It's a continuation line, so it can't be a coding spec. */
        return 1;
    cs = get_coding_spec(line, size);
    if (cs != NULL) {
        tok->read_coding_spec = 1;
        if (tok->encoding == NULL) {
            assert(tok->decoding_state == STATE_RAW);
            if (strcmp(cs, "utf-8") == 0) {
                /* tok->encoding takes ownership of cs. */
                tok->encoding = cs;
            } else {
                r = set_readline(tok, cs);
                if (r) {
                    tok->encoding = cs;
                    tok->decoding_state = STATE_NORMAL;
                }
                else
                    /* set_readline failed; cs is not kept. */
                    PyMem_FREE(cs);
            }
        } else {                /* then, compare cs with BOM */
            r = (strcmp(tok->encoding, cs) == 0);
            PyMem_FREE(cs);
        }
    }
    if (!r) {
        cs = tok->encoding;
        if (!cs)
            cs = "with BOM";
        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
    }
    return r;
}
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Only the UTF-8 BOM (EF BB BF) is currently consumed; any other
   leading bytes are pushed back unread via unget_char (most recent
   byte first).  get_char/unget_char abstract the input source (stdio
   or in-memory string).  Return 1 on success, 0 on failure. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch = get_char(tok);
    tok->decoding_state = STATE_RAW;
    if (ch == EOF) {
        return 1;
    } else if (ch == 0xEF) {
        ch = get_char(tok);
        if (ch != 0xBB) {
            unget_char(ch, tok);
            unget_char(0xEF, tok);
            /* any token beginning with '\xEF' is a bad token */
            return 1;
        }
        ch = get_char(tok);
        if (ch != 0xBF) {
            unget_char(ch, tok);
            unget_char(0xBB, tok);
            unget_char(0xEF, tok);
            /* any token beginning with '\xEF' is a bad token */
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch == 0xFE) {
        ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
        if (!set_readline(tok, "utf-16-be")) return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch == 0xFF) {
        ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
        if (!set_readline(tok, "utf-16-le")) return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        unget_char(ch, tok);
        return 1;
    }
    /* UTF-8 BOM consumed: record the encoding name, replacing any
       previously recorded one. */
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
    PyObject* bufobj;
    const char *buf;
    Py_ssize_t buflen;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (tok->decoding_buffer) {
        /* Case 2/3 above: consume the cached object (new reference
           taken so the XDECREF below is balanced in every path). */
        bufobj = tok->decoding_buffer;
        Py_INCREF(bufobj);
    }
    else
    {
        bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
        if (bufobj == NULL)
            goto error;
    }
    if (PyUnicode_CheckExact(bufobj))
    {
        buf = _PyUnicode_AsStringAndSize(bufobj, &buflen);
        if (buf == NULL) {
            goto error;
        }
    }
    else
    {
        /* Overflow remnant stored as a bytearray (case 3). */
        buf = PyByteArray_AsString(bufobj);
        if (buf == NULL) {
            goto error;
        }
        buflen = PyByteArray_GET_SIZE(bufobj);
    }

    Py_XDECREF(tok->decoding_buffer);
    if (buflen > size) {
        /* Too many chars, the rest goes into tok->decoding_buffer */
        tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
                                                         buflen-size);
        if (tok->decoding_buffer == NULL)
            goto error;
        buflen = size;
    }
    else
        tok->decoding_buffer = NULL;

    memcpy(s, buf, buflen);
    s[buflen] = '\0';
    if (buflen == 0) /* EOF */
        s = NULL;
    Py_DECREF(bufobj);
    return s;

  error:
    Py_XDECREF(bufobj);
    return error_ret(tok);
}
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline = NULL, *stream = NULL, *io = NULL;

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        goto cleanup;

    /* Reopen the source through io.open() with the discovered
       encoding; with no filename, reuse the existing descriptor. */
    if (tok->filename)
        stream = PyObject_CallMethod(io, "open", "ssis",
                                     tok->filename, "r", -1, enc);
    else
        stream = PyObject_CallMethod(io, "open", "isisOOO",
                fileno(tok->fp), "r", -1, enc, Py_None, Py_None, Py_False);
    if (stream == NULL)
        goto cleanup;

    Py_XDECREF(tok->decoding_readline);
    readline = PyObject_GetAttrString(stream, "readline");
    tok->decoding_readline = readline;

    /* The file has been reopened; parsing will restart from
     * the beginning of the file, we have to reset the line number.
     * But this function has been called from inside tok_nextc() which
     * will increment lineno before it returns. So we set it -1 so that
     * the next call to tok_nextc() will start with tok->lineno == 0.
     */
    tok->lineno = -1;

  cleanup:
    Py_XDECREF(stream);
    Py_XDECREF(io);
    return readline != NULL;
}
/* Fetch the next byte from TOK's stdio stream (get_char callback for
   check_bom). */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}
/* Unfetch the last byte back into TOK's stdio stream (unget_char
   callback for check_bom). */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}
/* Check whether the bytes at S start a valid UTF-8 sequence.  Return
   the number of bytes forming the sequence (1-4) if yes, 0 if not. */
static int valid_utf8(const unsigned char* s)
{
    unsigned char lead = *s;
    int follow;                 /* continuation bytes expected */
    int i;

    if (lead < 0x80)
        return 1;               /* single-byte (ASCII) code */
    if (lead < 0xC0)
        return 0;               /* stray continuation byte */
    if (lead < 0xE0)
        follow = 1;
    else if (lead < 0xF0)
        follow = 2;
    else if (lead < 0xF8)
        follow = 3;
    else
        return 0;               /* 0xF8..0xFF: not a legal lead byte */

    /* Every continuation byte must lie in 0x80..0xBF.  A premature
       NUL terminator fails this test, so we never read past the end
       of the string. */
    for (i = 1; i <= follow; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return follow + 1;
}
/* Read a line of input from TOK into S (at most SIZE bytes).
   Determine the encoding first if necessary: a BOM switches to the
   fp_* reader callbacks, and a PEP 263 coding comment on lines 1-2 may
   install a codec-backed readline.  Undeclared input is verified to be
   valid UTF-8.  Returns S, or NULL on EOF/decoding error. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    /* Only lines 1 and 2 may carry a PEP 263 coding declaration. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
#ifndef PGEN
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        char buf[500];
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        sprintf(buf,
            "Non-UTF-8 code starting with '\\x%.2x' "
            "in file %.200s on line %i, "
            "but no encoding declared; "
            "see http://python.org/dev/peps/pep-0263/ for details",
            badchar, tok->filename, tok->lineno + 1);
        PyErr_SetString(PyExc_SyntaxError, buf);
        return error_ret(tok);
    }
#endif
    return line;
}
/* EOF test for TOK.  For raw/undetermined input this is stdio feof();
   for codec-driven input we must try to read the next line, caching
   the result in tok->decoding_buffer for the following fp_readl()
   call. */
static int
decoding_feof(struct tok_state *tok)
{
    if (tok->decoding_state != STATE_NORMAL) {
        return feof(tok->fp);
    } else {
        PyObject* buf = tok->decoding_buffer;
        if (buf == NULL) {
            buf = PyObject_CallObject(tok->decoding_readline, NULL);
            if (buf == NULL) {
                /* Treat a readline failure as EOF after recording
                   the decoding error. */
                error_ret(tok);
                return 1;
            } else {
                tok->decoding_buffer = buf;
            }
        }
        return PyObject_Length(buf) == 0;
    }
}
/* Fetch a byte from TOK, using the string buffer (get_char callback
   for check_bom when tokenizing from a string). */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}
/* Unfetch a byte from TOK, using the string buffer (unget_char
   callback for check_bom).  Only ever pushes back the byte just read,
   so it merely rewinds the cursor. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
}
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding; decode_str()
   performs the actual translation afterwards.  Always succeeds. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}
624 /* Return a UTF-8 encoding Python string object from the
625 C byte string STR, which is encoded with ENC. */
627 static PyObject *
628 translate_into_utf8(const char* str, const char* enc) {
629 PyObject *utf8;
630 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
631 if (buf == NULL)
632 return NULL;
633 utf8 = PyUnicode_AsUTF8String(buf);
634 Py_DECREF(buf);
635 return utf8;
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR (a BOM, then PEP 263
   coding comments on the first two lines), and record them inside TOK.
   Returns STR re-encoded to UTF-8 when a translation was needed
   (backed by tok->decoding_buffer), STR itself otherwise, or NULL
   after error_ret() on failure. */

static const char *
decode_str(const char *str, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
    if (tok->enc != NULL) {
        /* A coding spec was found: re-decode the whole buffer. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    /* The returned pointer may alias utf8's internal buffer, so the
       object is parked on tok->decoding_buffer to keep it alive. */
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
694 #endif /* PGEN */
696 /* Set up tokenizer for string */
698 struct tok_state *
699 PyTokenizer_FromString(const char *str)
701 struct tok_state *tok = tok_new();
702 if (tok == NULL)
703 return NULL;
704 str = (char *)decode_str(str, tok);
705 if (str == NULL) {
706 PyTokenizer_Free(tok);
707 return NULL;
710 /* XXX: constify members. */
711 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
712 return tok;
715 struct tok_state *
716 PyTokenizer_FromUTF8(const char *str)
718 struct tok_state *tok = tok_new();
719 if (tok == NULL)
720 return NULL;
721 tok->decoding_state = STATE_RAW;
722 tok->read_coding_spec = 1;
723 tok->enc = NULL;
724 tok->str = str;
725 tok->encoding = (char *)PyMem_MALLOC(6);
726 if (!tok->encoding) {
727 PyTokenizer_Free(tok);
728 return NULL;
730 strcpy(tok->encoding, "utf-8");
732 /* XXX: constify members. */
733 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
734 return tok;
738 /* Set up tokenizer for file */
740 struct tok_state *
741 PyTokenizer_FromFile(FILE *fp, char* enc, char *ps1, char *ps2)
743 struct tok_state *tok = tok_new();
744 if (tok == NULL)
745 return NULL;
746 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
747 PyTokenizer_Free(tok);
748 return NULL;
750 tok->cur = tok->inp = tok->buf;
751 tok->end = tok->buf + BUFSIZ;
752 tok->fp = fp;
753 tok->prompt = ps1;
754 tok->nextprompt = ps2;
755 if (enc != NULL) {
756 /* Must copy encoding declaration since it
757 gets copied into the parse tree. */
758 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
759 if (!tok->encoding) {
760 PyTokenizer_Free(tok);
761 return NULL;
763 strcpy(tok->encoding, enc);
764 tok->decoding_state = STATE_NORMAL;
766 return tok;
/* Free a tok_state structure and everything it owns: the encoding
   name, the decoding helpers, and -- for file-based input only -- the
   line buffer (for string input tok->buf aliases the caller's
   string; see error_ret). */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
#ifndef PGEN
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
#endif
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    PyMem_FREE(tok);
}
/* Get next char, updating state; error code goes into tok->done.
   Three input modes are handled: in-memory string (tok->fp == NULL),
   interactive (tok->prompt != NULL, reads via PyOS_Readline), and
   plain file input (reads via decoding_fgets, growing the buffer
   until a full line or EOF is seen). */

static int
tok_nextc(register struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: advance inp to the end of the next line. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive input. */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
#ifndef PGEN
            if (tok->encoding && newtok && *newtok) {
                /* Recode to UTF-8 */
                Py_ssize_t buflen;
                const char* buf;
                PyObject *u = translate_into_utf8(newtok, tok->encoding);
                PyMem_FREE(newtok);
                if (!u) {
                    tok->done = E_DECODE;
                    return EOF;
                }
                buflen = PyBytes_GET_SIZE(u);
                buf = PyBytes_AS_STRING(u);
                if (!buf) {
                    Py_DECREF(u);
                    tok->done = E_DECODE;
                    return EOF;
                }
                /* NOTE(review): PyMem_MALLOC result is used unchecked
                   here; on allocation failure the strcpy below would
                   crash -- a NULL check (done = E_NOMEM) looks
                   warranted. */
                newtok = PyMem_MALLOC(buflen+1);
                strcpy(newtok, buf);
                Py_DECREF(u);
            }
#endif
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;
            }
            else if (tok->start != NULL) {
                /* A token is in progress: append the new line to the
                   existing buffer, preserving start/cur offsets across
                   the realloc. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the new line replaces the
                   buffer outright. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->line_start = tok->buf;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* File input. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok) == NULL) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp[-1] == '\n';
                }
            }
            else {
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                         tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                           (int)(tok->end - tok->inp),
                           tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                     */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}
976 /* Back-up one character */
978 static void
979 tok_backup(register struct tok_state *tok, register int c)
981 if (c != EOF) {
982 if (--tok->cur < tok->buf)
983 Py_FatalError("tok_backup: begin of buffer");
984 if (*tok->cur != c)
985 *tok->cur = c;
/* Return the token corresponding to a single character, or OP for any
   character with no dedicated token.  The cases must agree with the
   token #defines in token.h. */

int
PyToken_OneChar(int c)
{
    switch (c) {
    case '(':           return LPAR;
    case ')':           return RPAR;
    case '[':           return LSQB;
    case ']':           return RSQB;
    case ':':           return COLON;
    case ',':           return COMMA;
    case ';':           return SEMI;
    case '+':           return PLUS;
    case '-':           return MINUS;
    case '*':           return STAR;
    case '/':           return SLASH;
    case '|':           return VBAR;
    case '&':           return AMPER;
    case '<':           return LESS;
    case '>':           return GREATER;
    case '=':           return EQUAL;
    case '.':           return DOT;
    case '%':           return PERCENT;
    case '{':           return LBRACE;
    case '}':           return RBRACE;
    case '^':           return CIRCUMFLEX;
    case '~':           return TILDE;
    case '@':           return AT;
    default:            return OP;
    }
}
/* Return the token for the two-character operator C1 C2, or OP when
   the pair forms no token (the caller then falls back to one-char
   handling).  Must agree with the token #defines in token.h. */
int
PyToken_TwoChars(int c1, int c2)
{
    switch (c1) {
    case '=':
        switch (c2) {
        case '=':               return EQEQUAL;
        }
        break;
    case '!':
        switch (c2) {
        case '=':               return NOTEQUAL;
        }
        break;
    case '<':
        switch (c2) {
        case '>':               return NOTEQUAL;
        case '=':               return LESSEQUAL;
        case '<':               return LEFTSHIFT;
        }
        break;
    case '>':
        switch (c2) {
        case '=':               return GREATEREQUAL;
        case '>':               return RIGHTSHIFT;
        }
        break;
    case '+':
        switch (c2) {
        case '=':               return PLUSEQUAL;
        }
        break;
    case '-':
        switch (c2) {
        case '=':               return MINEQUAL;
        case '>':               return RARROW;
        }
        break;
    case '*':
        switch (c2) {
        case '*':               return DOUBLESTAR;
        case '=':               return STAREQUAL;
        }
        break;
    case '/':
        switch (c2) {
        case '/':               return DOUBLESLASH;
        case '=':               return SLASHEQUAL;
        }
        break;
    case '|':
        switch (c2) {
        case '=':               return VBAREQUAL;
        }
        break;
    case '%':
        switch (c2) {
        case '=':               return PERCENTEQUAL;
        }
        break;
    case '&':
        switch (c2) {
        case '=':               return AMPEREQUAL;
        }
        break;
    case '^':
        switch (c2) {
        case '=':               return CIRCUMFLEXEQUAL;
        }
        break;
    }
    return OP;
}
/* Return the token for the three-character operator C1 C2 C3, or OP
   when the triple forms no token.  Must agree with the token #defines
   in token.h. */
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
    switch (c1) {
    case '<':
        switch (c2) {
        case '<':
            switch (c3) {
            case '=':
                return LEFTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '>':
        switch (c2) {
        case '>':
            switch (c3) {
            case '=':
                return RIGHTSHIFTEQUAL;
            }
            break;
        }
        break;
    case '*':
        switch (c2) {
        case '*':
            switch (c3) {
            case '=':
                return DOUBLESTAREQUAL;
            }
            break;
        }
        break;
    case '/':
        switch (c2) {
        case '/':
            switch (c3) {
            case '=':
                return DOUBLESLASHEQUAL;
            }
            break;
        }
        break;
    case '.':
        switch (c2) {
        case '.':
            switch (c3) {
            case '.':
                return ELLIPSIS;
            }
            break;
        }
        break;
    }
    return OP;
}
/* Report inconsistent use of tabs and spaces in indentation.  When
   alterror is set this is fatal: tok->done becomes E_TABSPACE and 1 is
   returned so the caller emits ERRORTOKEN.  Otherwise at most one
   warning is written to stderr (altwarning gates it) and 0 is
   returned. */
static int
indenterror(struct tok_state *tok)
{
    if (tok->alterror) {
        tok->done = E_TABSPACE;
        tok->cur = tok->inp;
        return 1;
    }
    if (tok->altwarning) {
        PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                          "in indentation\n", tok->filename);
        tok->altwarning = 0;
    }
    return 0;
}
#ifdef PGEN
/* The parser generator only sees ASCII grammar input, so every
   candidate identifier is accepted. */
#define verify_identifier(s,e) 1
#else
/* Verify that the identifier follows PEP 3131.  START..END is the raw
   UTF-8 source text of the candidate; returns nonzero when it decodes
   cleanly and satisfies str.isidentifier(). */
static int
verify_identifier(char *start, char *end)
{
    PyObject *s;
    int result;
    s = PyUnicode_DecodeUTF8(start, end-start, NULL);
    if (s == NULL) {
        /* Not valid UTF-8: reject, clearing the decode error. */
        PyErr_Clear();
        return 0;
    }
    result = PyUnicode_IsIdentifier(s);
    Py_DECREF(s);
    return result;
}
#endif
1192 /* Get next token, after space stripping etc. */
1194 static int
1195 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1197 register int c;
1198 int blankline, nonascii;
1200 *p_start = *p_end = NULL;
1201 nextline:
1202 tok->start = NULL;
1203 blankline = 0;
1205 /* Get indentation level */
1206 if (tok->atbol) {
1207 register int col = 0;
1208 register int altcol = 0;
1209 tok->atbol = 0;
1210 for (;;) {
1211 c = tok_nextc(tok);
1212 if (c == ' ')
1213 col++, altcol++;
1214 else if (c == '\t') {
1215 col = (col/tok->tabsize + 1) * tok->tabsize;
1216 altcol = (altcol/tok->alttabsize + 1)
1217 * tok->alttabsize;
1219 else if (c == '\014') /* Control-L (formfeed) */
1220 col = altcol = 0; /* For Emacs users */
1221 else
1222 break;
1224 tok_backup(tok, c);
1225 if (c == '#' || c == '\n') {
1226 /* Lines with only whitespace and/or comments
1227 shouldn't affect the indentation and are
1228 not passed to the parser as NEWLINE tokens,
1229 except *totally* empty lines in interactive
1230 mode, which signal the end of a command group. */
1231 if (col == 0 && c == '\n' && tok->prompt != NULL)
1232 blankline = 0; /* Let it through */
1233 else
1234 blankline = 1; /* Ignore completely */
1235 /* We can't jump back right here since we still
1236 may need to skip to the end of a comment */
1238 if (!blankline && tok->level == 0) {
1239 if (col == tok->indstack[tok->indent]) {
1240 /* No change */
1241 if (altcol != tok->altindstack[tok->indent]) {
1242 if (indenterror(tok))
1243 return ERRORTOKEN;
1246 else if (col > tok->indstack[tok->indent]) {
1247 /* Indent -- always one */
1248 if (tok->indent+1 >= MAXINDENT) {
1249 tok->done = E_TOODEEP;
1250 tok->cur = tok->inp;
1251 return ERRORTOKEN;
1253 if (altcol <= tok->altindstack[tok->indent]) {
1254 if (indenterror(tok))
1255 return ERRORTOKEN;
1257 tok->pendin++;
1258 tok->indstack[++tok->indent] = col;
1259 tok->altindstack[tok->indent] = altcol;
1261 else /* col < tok->indstack[tok->indent] */ {
1262 /* Dedent -- any number, must be consistent */
1263 while (tok->indent > 0 &&
1264 col < tok->indstack[tok->indent]) {
1265 tok->pendin--;
1266 tok->indent--;
1268 if (col != tok->indstack[tok->indent]) {
1269 tok->done = E_DEDENT;
1270 tok->cur = tok->inp;
1271 return ERRORTOKEN;
1273 if (altcol != tok->altindstack[tok->indent]) {
1274 if (indenterror(tok))
1275 return ERRORTOKEN;
1281 tok->start = tok->cur;
1283 /* Return pending indents/dedents */
1284 if (tok->pendin != 0) {
1285 if (tok->pendin < 0) {
1286 tok->pendin++;
1287 return DEDENT;
1289 else {
1290 tok->pendin--;
1291 return INDENT;
1295 again:
1296 tok->start = NULL;
1297 /* Skip spaces */
1298 do {
1299 c = tok_nextc(tok);
1300 } while (c == ' ' || c == '\t' || c == '\014');
1302 /* Set start of current token */
1303 tok->start = tok->cur - 1;
1305 /* Skip comment */
1306 if (c == '#')
1307 while (c != EOF && c != '\n')
1308 c = tok_nextc(tok);
1310 /* Check for EOF and errors now */
1311 if (c == EOF) {
1312 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1315 /* Identifier (most frequent token!) */
1316 nonascii = 0;
1317 if (is_potential_identifier_start(c)) {
1318 /* Process b"", r"" and br"" */
1319 if (c == 'b' || c == 'B') {
1320 c = tok_nextc(tok);
1321 if (c == '"' || c == '\'')
1322 goto letter_quote;
1324 if (c == 'r' || c == 'R') {
1325 c = tok_nextc(tok);
1326 if (c == '"' || c == '\'')
1327 goto letter_quote;
1329 while (is_potential_identifier_char(c)) {
1330 if (c >= 128)
1331 nonascii = 1;
1332 c = tok_nextc(tok);
1334 tok_backup(tok, c);
1335 if (nonascii &&
1336 !verify_identifier(tok->start, tok->cur)) {
1337 tok->done = E_IDENTIFIER;
1338 return ERRORTOKEN;
1340 *p_start = tok->start;
1341 *p_end = tok->cur;
1342 return NAME;
1345 /* Newline */
1346 if (c == '\n') {
1347 tok->atbol = 1;
1348 if (blankline || tok->level > 0)
1349 goto nextline;
1350 *p_start = tok->start;
1351 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1352 tok->cont_line = 0;
1353 return NEWLINE;
1356 /* Period or number starting with period? */
1357 if (c == '.') {
1358 c = tok_nextc(tok);
1359 if (isdigit(c)) {
1360 goto fraction;
1361 } else if (c == '.') {
1362 c = tok_nextc(tok);
1363 if (c == '.') {
1364 *p_start = tok->start;
1365 *p_end = tok->cur;
1366 return ELLIPSIS;
1367 } else {
1368 tok_backup(tok, c);
1370 tok_backup(tok, '.');
1371 } else {
1372 tok_backup(tok, c);
1374 *p_start = tok->start;
1375 *p_end = tok->cur;
1376 return DOT;
1379 /* Number */
1380 if (isdigit(c)) {
1381 if (c == '0') {
1382 /* Hex, octal or binary -- maybe. */
1383 c = tok_nextc(tok);
1384 if (c == '.')
1385 goto fraction;
1386 #ifndef WITHOUT_COMPLEX
1387 if (c == 'j' || c == 'J')
1388 goto imaginary;
1389 #endif
1390 if (c == 'x' || c == 'X') {
1392 /* Hex */
1393 c = tok_nextc(tok);
1394 if (!isxdigit(c)) {
1395 tok->done = E_TOKEN;
1396 tok_backup(tok, c);
1397 return ERRORTOKEN;
1399 do {
1400 c = tok_nextc(tok);
1401 } while (isxdigit(c));
1403 else if (c == 'o' || c == 'O') {
1404 /* Octal */
1405 c = tok_nextc(tok);
1406 if (c < '0' || c >= '8') {
1407 tok->done = E_TOKEN;
1408 tok_backup(tok, c);
1409 return ERRORTOKEN;
1411 do {
1412 c = tok_nextc(tok);
1413 } while ('0' <= c && c < '8');
1415 else if (c == 'b' || c == 'B') {
1416 /* Binary */
1417 c = tok_nextc(tok);
1418 if (c != '0' && c != '1') {
1419 tok->done = E_TOKEN;
1420 tok_backup(tok, c);
1421 return ERRORTOKEN;
1423 do {
1424 c = tok_nextc(tok);
1425 } while (c == '0' || c == '1');
1427 else {
1428 int nonzero = 0;
1429 /* maybe old-style octal; c is first char of it */
1430 /* in any case, allow '0' as a literal */
1431 while (c == '0')
1432 c = tok_nextc(tok);
1433 while (isdigit(c)) {
1434 nonzero = 1;
1435 c = tok_nextc(tok);
1437 if (c == '.')
1438 goto fraction;
1439 else if (c == 'e' || c == 'E')
1440 goto exponent;
1441 #ifndef WITHOUT_COMPLEX
1442 else if (c == 'j' || c == 'J')
1443 goto imaginary;
1444 #endif
1445 else if (nonzero) {
1446 tok->done = E_TOKEN;
1447 tok_backup(tok, c);
1448 return ERRORTOKEN;
1452 else {
1453 /* Decimal */
1454 do {
1455 c = tok_nextc(tok);
1456 } while (isdigit(c));
1458 /* Accept floating point numbers. */
1459 if (c == '.') {
1460 fraction:
1461 /* Fraction */
1462 do {
1463 c = tok_nextc(tok);
1464 } while (isdigit(c));
1466 if (c == 'e' || c == 'E') {
1467 exponent:
1468 /* Exponent part */
1469 c = tok_nextc(tok);
1470 if (c == '+' || c == '-')
1471 c = tok_nextc(tok);
1472 if (!isdigit(c)) {
1473 tok->done = E_TOKEN;
1474 tok_backup(tok, c);
1475 return ERRORTOKEN;
1477 do {
1478 c = tok_nextc(tok);
1479 } while (isdigit(c));
1481 #ifndef WITHOUT_COMPLEX
1482 if (c == 'j' || c == 'J')
1483 /* Imaginary part */
1484 imaginary:
1485 c = tok_nextc(tok);
1486 #endif
1489 tok_backup(tok, c);
1490 *p_start = tok->start;
1491 *p_end = tok->cur;
1492 return NUMBER;
1495 letter_quote:
1496 /* String */
1497 if (c == '\'' || c == '"') {
1498 int quote = c;
1499 int quote_size = 1; /* 1 or 3 */
1500 int end_quote_size = 0;
1502 /* Find the quote size and start of string */
1503 c = tok_nextc(tok);
1504 if (c == quote) {
1505 c = tok_nextc(tok);
1506 if (c == quote)
1507 quote_size = 3;
1508 else
1509 end_quote_size = 1; /* empty string found */
1511 if (c != quote)
1512 tok_backup(tok, c);
1514 /* Get rest of string */
1515 while (end_quote_size != quote_size) {
1516 c = tok_nextc(tok);
1517 if (c == EOF) {
1518 if (quote_size == 3)
1519 tok->done = E_EOFS;
1520 else
1521 tok->done = E_EOLS;
1522 tok->cur = tok->inp;
1523 return ERRORTOKEN;
1525 if (quote_size == 1 && c == '\n') {
1526 tok->done = E_EOLS;
1527 tok->cur = tok->inp;
1528 return ERRORTOKEN;
1530 if (c == quote)
1531 end_quote_size += 1;
1532 else {
1533 end_quote_size = 0;
1534 if (c == '\\')
1535 c = tok_nextc(tok); /* skip escaped char */
1539 *p_start = tok->start;
1540 *p_end = tok->cur;
1541 return STRING;
1544 /* Line continuation */
1545 if (c == '\\') {
1546 c = tok_nextc(tok);
1547 if (c != '\n') {
1548 tok->done = E_LINECONT;
1549 tok->cur = tok->inp;
1550 return ERRORTOKEN;
1552 tok->cont_line = 1;
1553 goto again; /* Read next line */
1556 /* Check for two-character token */
1558 int c2 = tok_nextc(tok);
1559 int token = PyToken_TwoChars(c, c2);
1560 if (token != OP) {
1561 int c3 = tok_nextc(tok);
1562 int token3 = PyToken_ThreeChars(c, c2, c3);
1563 if (token3 != OP) {
1564 token = token3;
1565 } else {
1566 tok_backup(tok, c3);
1568 *p_start = tok->start;
1569 *p_end = tok->cur;
1570 return token;
1572 tok_backup(tok, c2);
1575 /* Keep track of parentheses nesting level */
1576 switch (c) {
1577 case '(':
1578 case '[':
1579 case '{':
1580 tok->level++;
1581 break;
1582 case ')':
1583 case ']':
1584 case '}':
1585 tok->level--;
1586 break;
1589 /* Punctuation character */
1590 *p_start = tok->start;
1591 *p_end = tok->cur;
1592 return PyToken_OneChar(c);
1596 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1598 int result = tok_get(tok, p_start, p_end);
1599 if (tok->decoding_erred) {
1600 result = ERRORTOKEN;
1601 tok->done = E_DECODE;
1603 return result;
1606 /* Get -*- encoding -*- from a Python file.
1608 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1609 the first or second line of the file (in which case the encoding
1610 should be assumed to be PyUnicode_GetDefaultEncoding()).
1612 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1613 by the caller.
1615 char *
1616 PyTokenizer_FindEncoding(int fd)
1618 struct tok_state *tok;
1619 FILE *fp;
1620 char *p_start =NULL , *p_end =NULL , *encoding = NULL;
1622 fd = dup(fd);
1623 if (fd < 0) {
1624 return NULL;
1626 fp = fdopen(fd, "r");
1627 if (fp == NULL) {
1628 return NULL;
1630 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1631 if (tok == NULL) {
1632 fclose(fp);
1633 return NULL;
1635 while (tok->lineno < 2 && tok->done == E_OK) {
1636 PyTokenizer_Get(tok, &p_start, &p_end);
1638 fclose(fp);
1639 if (tok->encoding) {
1640 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1641 if (encoding)
1642 strcpy(encoding, tok->encoding);
1644 PyTokenizer_Free(tok);
1645 return encoding;
1648 #ifdef Py_DEBUG
1650 void
1651 tok_dump(int type, char *start, char *end)
1653 printf("%s", _PyParser_TokenNames[type]);
1654 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1655 printf("(%.*s)", (int)(end - start), start);
1658 #endif