Added LoggerAdapter class, changed copyright dates, made check for extra parameter...
[python.git] / Parser / tokenizer.c
blobee353aaebf5cec2e639bf6e4964ef763a9725583
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #include "pydebug.h"
20 #endif /* PGEN */
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
27 /* Don't ever change this -- it would break the portability of Python code */
28 #define TABSIZE 8
30 /* Convert a possibly signed character to a nonnegative int */
31 /* XXX This assumes characters are 8 bits wide */
32 #ifdef __CHAR_UNSIGNED__
33 #define Py_CHARMASK(c) (c)
34 #else
35 #define Py_CHARMASK(c) ((c) & 0xff)
36 #endif
38 /* Forward */
39 static struct tok_state *tok_new(void);
40 static int tok_nextc(struct tok_state *tok);
41 static void tok_backup(struct tok_state *tok, int c);
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
104 /* Create and initialize a new tok_state structure */
106 static struct tok_state *
107 tok_new(void)
109 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
110 sizeof(struct tok_state));
111 if (tok == NULL)
112 return NULL;
113 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
114 tok->done = E_OK;
115 tok->fp = NULL;
116 tok->tabsize = TABSIZE;
117 tok->indent = 0;
118 tok->indstack[0] = 0;
119 tok->atbol = 1;
120 tok->pendin = 0;
121 tok->prompt = tok->nextprompt = NULL;
122 tok->lineno = 0;
123 tok->level = 0;
124 tok->filename = NULL;
125 tok->altwarning = 0;
126 tok->alterror = 0;
127 tok->alttabsize = 1;
128 tok->altindstack[0] = 0;
129 tok->decoding_state = 0;
130 tok->decoding_erred = 0;
131 tok->read_coding_spec = 0;
132 tok->encoding = NULL;
133 tok->cont_line = 0;
134 #ifndef PGEN
135 tok->decoding_readline = NULL;
136 tok->decoding_buffer = NULL;
137 #endif
138 return tok;
141 #ifdef PGEN
/* PGEN build: read a raw line with stdio; no source decoding is done. */
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	return fgets(s, size, tok->fp);
}
/* PGEN build: EOF test is a plain feof() on the underlying stream. */
static int
decoding_feof(struct tok_state *tok)
{
	return feof(tok->fp);
}
/* PGEN build: strings are tokenized as-is; no encoding handling. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	return str;
}
161 #else /* PGEN */
/* Record that a decoding error occurred and free the input buffer.
   PyTokenizer_Free only frees tok->buf when tok->fp is set, so the
   buffer is released here to avoid a leak.  Always returns NULL so
   callers can propagate the failure as if EOF had been reached. */
static char *
error_ret(struct tok_state *tok) /* XXX */
{
	tok->decoding_erred = 1;
	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
		PyMem_FREE(tok->buf);
	tok->buf = NULL;
	return NULL; /* as if it were EOF */
}
173 static char *
174 new_string(const char *s, Py_ssize_t len)
176 char* result = (char *)PyMem_MALLOC(len + 1);
177 if (result != NULL) {
178 memcpy(result, s, len);
179 result[len] = '\0';
181 return result;
/* Normalize an encoding name (for utf-8 and latin-1): the first 12
   characters are lowercased with '_' mapped to '-', then the common
   spellings of utf-8 and iso-8859-1 are canonicalized.  Any other name
   is returned unchanged; callers compare the result pointer against S
   to detect whether a substitution took place. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* Cast to unsigned char: passing a negative value
			   (non-ASCII byte with signed char) to tolower()
			   is undefined behavior. */
			buf[i] = tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
/* Return the coding spec in S (a malloc'ed, normalized encoding name),
   or NULL if none is found.  S has length SIZE and is one source line. */

static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
	Py_ssize_t i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			/* "coding" must be followed by ':' or '='. */
			if (t[0] != ':' && t[0] != '=')
				continue;
			/* Skip spaces/tabs before the encoding name. */
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			begin = t;
			/* Encoding names are [A-Za-z0-9._-]+. */
			while (isalnum(Py_CHARMASK(t[0])) ||
			       t[0] == '-' || t[0] == '_' || t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q = get_normal_name(r);
				/* get_normal_name returned a literal:
				   replace r with a malloc'ed copy. */
				if (r != q) {
					PyMem_FREE(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.  On failure a SyntaxError
   describing the encoding conflict is set. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
		  int set_readline(struct tok_state *, const char *))
{
	char * cs;
	int r = 1;

	if (tok->cont_line)
		/* It's a continuation line, so it can't be a coding spec. */
		return 1;
	cs = get_coding_spec(line, size);	/* malloc'ed, we own it */
	if (cs != NULL) {
		tok->read_coding_spec = 1;
		if (tok->encoding == NULL) {
			assert(tok->decoding_state == 1); /* raw */
			if (strcmp(cs, "utf-8") == 0 ||
			    strcmp(cs, "iso-8859-1") == 0) {
				/* Ownership of cs moves to tok->encoding. */
				tok->encoding = cs;
			} else {
#ifdef Py_USING_UNICODE
				r = set_readline(tok, cs);
				if (r) {
					tok->encoding = cs;
					tok->decoding_state = -1;
				}
				else
					PyMem_FREE(cs);
#else
				/* Without Unicode support, we cannot
				   process the coding spec. Since there
				   won't be any Unicode literals, that
				   won't matter. */
				PyMem_FREE(cs);
#endif
			}
		} else {	/* then, compare cs with BOM */
			r = (strcmp(tok->encoding, cs) == 0);
			PyMem_FREE(cs);
		}
	}
	if (!r) {
		cs = tok->encoding;
		if (!cs)
			cs = "with BOM";
		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
	}
	return r;
}
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.
   Only the UTF-8 BOM (EF BB BF) is currently acted upon; UTF-16 BOM
   handling is compiled out below. */

static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	tok->decoding_state = 1;
	if (ch == EOF) {
		return 1;
	} else if (ch == 0xEF) {
		/* Possible UTF-8 BOM: need EF BB BF exactly. */
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported.  */
	} else if (ch == 0xFE) {
		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
		if (!set_readline(tok, "utf-16-be")) return 0;
		tok->decoding_state = -1;
	} else if (ch == 0xFF) {
		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
		if (!set_readline(tok, "utf-16-le")) return 0;
		tok->decoding_state = -1;
#endif
	} else {
		/* No BOM: push the byte back and read it normally. */
		unget_char(ch, tok);
		return 1;
	}
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
	return 1;
  NON_BOM:
	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
	return 1;
}
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
	stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
	(in the s buffer) to copy entire contents of the line read
	by tok->decoding_readline.  tok->decoding_buffer has the overflow.
	In this case, fp_readl is called in a loop (with an expanded buffer)
	until the buffer ends with a '\n' (or until the end of the file is
	reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8 = NULL;
	PyObject* buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		/* Case 1: fetch a fresh line from the codec reader. */
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		/* Cases 2 and 3: consume the stashed buffer; a string
		   means it is already UTF-8 overflow from a prior call. */
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;
	}
	if (utf8 == NULL) {
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		/* Does not fit: stash the tail for the next call. */
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL; /* EOF */
	return s;
#endif
}
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
	PyObject *reader, *stream, *readline;

	/* XXX: constify filename argument. */
	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
	if (stream == NULL)
		return 0;

	reader = PyCodec_StreamReader(enc, stream, NULL);
	Py_DECREF(stream);
	if (reader == NULL)
		return 0;

	readline = PyObject_GetAttrString(reader, "readline");
	Py_DECREF(reader);
	if (readline == NULL)
		return 0;

	/* tok now owns the reference; released in PyTokenizer_Free. */
	tok->decoding_readline = readline;
	return 1;
}
447 /* Fetch the next byte from TOK. */
449 static int fp_getc(struct tok_state *tok) {
450 return getc(tok->fp);
/* Unfetch the last byte back into TOK.  Note: stdio only guarantees a
   single byte of pushback, which is all check_bom relies on. */

static void fp_ungetc(int c, struct tok_state *tok) {
	ungetc(c, tok->fp);
}
/* Read a line of input from TOK. Determine encoding
   if necessary (BOM on the first read, coding spec on lines 1-2).
   Returns S on success, NULL on EOF or error (error_ret sets state). */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	char *line = NULL;
	int badchar = 0;
	for (;;) {
		if (tok->decoding_state < 0) {
			/* We already have a codec associated with
			   this input. */
			line = fp_readl(s, size, tok);
			break;
		} else if (tok->decoding_state > 0) {
			/* We want a 'raw' read. */
			line = Py_UniversalNewlineFgets(s, size,
							tok->fp, NULL);
			break;
		} else {
			/* We have not yet determined the encoding.
			   If an encoding is found, use the file-pointer
			   reader functions from now on. */
			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				return error_ret(tok);
			assert(tok->decoding_state != 0);
		}
	}
	/* A PEP 263 coding spec may only appear on the first two lines. */
	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
			return error_ret(tok);
		}
	}
#ifndef PGEN
	/* The default encoding is ASCII, so make sure we don't have any
	   non-ASCII bytes in it. */
	if (line && !tok->encoding) {
		unsigned char *c;
		for (c = (unsigned char *)line; *c; c++)
			if (*c > 127) {
				badchar = *c;
				break;
			}
	}
	if (badchar) {
		char buf[500];
		/* Need to add 1 to the line number, since this line
		   has not been counted, yet.  */
		sprintf(buf,
			"Non-ASCII character '\\x%.2x' "
			"in file %.200s on line %i, "
			"but no encoding declared; "
			"see http://www.python.org/peps/pep-0263.html for details",
			badchar, tok->filename, tok->lineno + 1);
		PyErr_SetString(PyExc_SyntaxError, buf);
		return error_ret(tok);
	}
#endif
	return line;
}
/* Return nonzero at end of input.  For raw reads this is feof(); for
   codec-driven reads we must pre-fetch a line (stashed in
   tok->decoding_buffer for fp_readl to consume) and test its length. */
static int
decoding_feof(struct tok_state *tok)
{
	if (tok->decoding_state >= 0) {
		return feof(tok->fp);
	} else {
		PyObject* buf = tok->decoding_buffer;
		if (buf == NULL) {
			buf = PyObject_CallObject(tok->decoding_readline, NULL);
			if (buf == NULL) {
				/* Treat a readline failure as EOF after
				   recording the decoding error. */
				error_ret(tok);
				return 1;
			} else {
				tok->decoding_buffer = buf;
			}
		}
		return PyObject_Length(buf) == 0;
	}
}
/* Fetch a byte from TOK, using the string buffer (tok->str). */

static int
buf_getc(struct tok_state *tok) {
	return Py_CHARMASK(*tok->str++);
}
/* Unfetch a byte from TOK, using the string buffer.  Only rewinds the
   pointer; the buffer itself is never written. */

static void
buf_ungetc(int c, struct tok_state *tok) {
	tok->str--;
	assert(Py_CHARMASK(*tok->str) == c);	/* tok->cur may point to a read-only segment */
}
/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
	tok->enc = enc;
	return 1;
}
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL with an exception set. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *utf8;
	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
	if (buf == NULL)
		return NULL;
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	return utf8;
}
#endif
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.  Returns the (possibly re-encoded) buffer, or NULL on
   error.  When re-encoding happened the returned pointer aliases the
   PyString stashed in tok->decoding_buffer, which keeps it alive. */

static const char *
decode_str(const char *str, struct tok_state *tok)
{
	PyObject* utf8 = NULL;
	const char *s;
	int lineno = 0;
	tok->enc = NULL;
	tok->str = str;
	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
		return error_ret(tok);
	str = tok->str;		/* string after BOM if any */
	assert(str);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		/* A BOM was seen: convert from the BOM's encoding. */
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return error_ret(tok);
		str = PyString_AsString(utf8);
	}
#endif
	/* A coding spec may only appear on the first two lines:
	   find the end of line 2 (or of the string). */
	for (s = str;; s++) {
		if (*s == '\0') break;
		else if (*s == '\n') {
			lineno++;
			if (lineno == 2) break;
		}
	}
	tok->enc = NULL;
	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
		return error_ret(tok);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		/* A coding spec was found: re-decode the whole buffer. */
		assert(utf8 == NULL);
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL) {
			PyErr_Format(PyExc_SyntaxError,
				"unknown encoding: %s", tok->enc);
			return error_ret(tok);
		}
		str = PyString_AsString(utf8);
	}
#endif
	assert(tok->decoding_buffer == NULL);
	tok->decoding_buffer = utf8; /* CAUTION */
	return str;
}
631 #endif /* PGEN */
/* Set up tokenizer for string.  Returns a new tok_state (free with
   PyTokenizer_Free) or NULL on allocation/decoding failure. */

struct tok_state *
PyTokenizer_FromString(const char *str)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	str = (char *)decode_str(str, tok);
	if (str == NULL) {
		PyTokenizer_Free(tok);
		return NULL;
	}

	/* XXX: constify members. */
	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
	return tok;
}
/* Set up tokenizer for file.  PS1/PS2 are the interactive prompts (may
   be NULL for non-interactive input).  Returns a new tok_state (free
   with PyTokenizer_Free) or NULL on allocation failure. */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
		PyTokenizer_Free(tok);
		return NULL;
	}
	/* Empty buffer: cur == inp forces the first real read. */
	tok->cur = tok->inp = tok->buf;
	tok->end = tok->buf + BUFSIZ;
	tok->fp = fp;
	tok->prompt = ps1;
	tok->nextprompt = ps2;
	return tok;
}
/* Free a tok_state structure and everything it owns.  Note: tok->buf
   is only owned by tok when reading from a file (tok->fp set); for
   string input it aliases the caller's buffer (see error_ret). */

void
PyTokenizer_Free(struct tok_state *tok)
{
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
#ifndef PGEN
	Py_XDECREF(tok->decoding_readline);
	Py_XDECREF(tok->decoding_buffer);
#endif
	if (tok->fp != NULL && tok->buf != NULL)
		PyMem_FREE(tok->buf);
	PyMem_FREE(tok);
}
690 #if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactive input line *INP from sys.stdin's encoding to
   UTF-8, replacing *INP (the old buffer is freed) and recording the
   encoding in tok->encoding.  Returns 0 on success or when decoding is
   skipped/falls back (stdin redirected, no usable encoding, or decode
   error — then *INP is left unchanged), -1 on memory error with
   tok->done set to E_NOMEM. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
	PyObject *enc, *sysstdin, *decoded, *utf8;
	const char *encoding;
	char *converted;

	/* Only applies when reading the real interactive stdin. */
	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
		return 0;
	sysstdin = PySys_GetObject("stdin");
	if (sysstdin == NULL || !PyFile_Check(sysstdin))
		return 0;

	enc = ((PyFileObject *)sysstdin)->f_encoding;
	if (enc == NULL || !PyString_Check(enc))
		return 0;
	Py_INCREF(enc);

	encoding = PyString_AsString(enc);
	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
	if (decoded == NULL)
		goto error_clear;

	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
	Py_DECREF(decoded);
	if (utf8 == NULL)
		goto error_clear;

	assert(PyString_Check(utf8));
	converted = new_string(PyString_AS_STRING(utf8),
			       PyString_GET_SIZE(utf8));
	Py_DECREF(utf8);
	if (converted == NULL)
		goto error_nomem;

	/* Success: swap in the UTF-8 copy of the line. */
	PyMem_FREE(*inp);
	*inp = converted;
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string(encoding, strlen(encoding));
	if (tok->encoding == NULL)
		goto error_nomem;

	Py_DECREF(enc);
	return 0;

error_nomem:
	Py_DECREF(enc);
	tok->done = E_NOMEM;
	return -1;

error_clear:
	/* Fallback to iso-8859-1: for backward compatibility */
	Py_DECREF(enc);
	PyErr_Clear();
	return 0;
}
748 #endif
/* Get next char, updating state; error code goes into tok->done.
   Three input modes share this function:
     - string input  (tok->fp == NULL): advance through the buffer;
     - interactive   (tok->prompt set): PyOS_Readline one line at a time,
       appending to the buffer if a token spans lines (tok->start set);
     - file input    (otherwise): decoding_fgets into a growable buffer
       until a full '\n'-terminated line is available.
   Returns the next byte (masked to 0..255) or EOF. */

static int
tok_nextc(register struct tok_state *tok)
{
	for (;;) {
		if (tok->cur != tok->inp) {
			return Py_CHARMASK(*tok->cur++); /* Fast path */
		}
		if (tok->done != E_OK)
			return EOF;
		if (tok->fp == NULL) {
			/* String input: advance inp to the end of the
			   next line (or end of string). */
			char *end = strchr(tok->inp, '\n');
			if (end != NULL)
				end++;
			else {
				end = strchr(tok->inp, '\0');
				if (end == tok->inp) {
					tok->done = E_EOF;
					return EOF;
				}
			}
			if (tok->start == NULL)
				tok->buf = tok->cur;
			tok->line_start = tok->cur;
			tok->lineno++;
			tok->inp = end;
			return Py_CHARMASK(*tok->cur++);
		}
		if (tok->prompt != NULL) {
			/* Interactive input: read one line. */
			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
			if (tok->nextprompt != NULL)
				tok->prompt = tok->nextprompt;
			if (newtok == NULL)
				tok->done = E_INTR;
			else if (*newtok == '\0') {
				PyMem_FREE(newtok);
				tok->done = E_EOF;
			}
#if !defined(PGEN) && defined(Py_USING_UNICODE)
			else if (tok_stdin_decode(tok, &newtok) != 0)
				PyMem_FREE(newtok);
#endif
			else if (tok->start != NULL) {
				/* A token is in progress: append the new
				   line to the existing buffer, keeping
				   tok->start valid across the realloc. */
				size_t start = tok->start - tok->buf;
				size_t oldlen = tok->cur - tok->buf;
				size_t newlen = oldlen + strlen(newtok);
				char *buf = tok->buf;
				buf = (char *)PyMem_REALLOC(buf, newlen+1);
				tok->lineno++;
				if (buf == NULL) {
					PyMem_FREE(tok->buf);
					tok->buf = NULL;
					PyMem_FREE(newtok);
					tok->done = E_NOMEM;
					return EOF;
				}
				tok->buf = buf;
				tok->cur = tok->buf + oldlen;
				tok->line_start = tok->cur;
				strcpy(tok->buf + oldlen, newtok);
				PyMem_FREE(newtok);
				tok->inp = tok->buf + newlen;
				tok->end = tok->inp + 1;
				tok->start = tok->buf + start;
			}
			else {
				/* No token in progress: the new line
				   replaces the buffer outright. */
				tok->lineno++;
				if (tok->buf != NULL)
					PyMem_FREE(tok->buf);
				tok->buf = newtok;
				tok->line_start = tok->buf;
				tok->cur = tok->buf;
				tok->line_start = tok->buf;
				tok->inp = strchr(tok->buf, '\0');
				tok->end = tok->inp + 1;
			}
		}
		else {
			/* File input. */
			int done = 0;
			Py_ssize_t cur = 0;
			char *pt;
			if (tok->start == NULL) {
				if (tok->buf == NULL) {
					tok->buf = (char *)
						PyMem_MALLOC(BUFSIZ);
					if (tok->buf == NULL) {
						tok->done = E_NOMEM;
						return EOF;
					}
					tok->end = tok->buf + BUFSIZ;
				}
				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
					  tok) == NULL) {
					tok->done = E_EOF;
					done = 1;
				}
				else {
					tok->done = E_OK;
					tok->inp = strchr(tok->buf, '\0');
					done = tok->inp[-1] == '\n';
				}
			}
			else {
				/* Token in progress: keep the buffer and
				   remember cur's offset across reallocs. */
				cur = tok->cur - tok->buf;
				if (decoding_feof(tok)) {
					tok->done = E_EOF;
					done = 1;
				}
				else
					tok->done = E_OK;
			}
			tok->lineno++;
			/* Read until '\n' or EOF */
			while (!done) {
				Py_ssize_t curstart = tok->start == NULL ? -1 :
					          tok->start - tok->buf;
				Py_ssize_t curvalid = tok->inp - tok->buf;
				Py_ssize_t newsize = curvalid + BUFSIZ;
				char *newbuf = tok->buf;
				newbuf = (char *)PyMem_REALLOC(newbuf,
							       newsize);
				if (newbuf == NULL) {
					tok->done = E_NOMEM;
					tok->cur = tok->inp;
					return EOF;
				}
				tok->buf = newbuf;
				tok->inp = tok->buf + curvalid;
				tok->end = tok->buf + newsize;
				tok->start = curstart < 0 ? NULL :
					     tok->buf + curstart;
				if (decoding_fgets(tok->inp,
					       (int)(tok->end - tok->inp),
					       tok) == NULL) {
					/* Break out early on decoding
					   errors, as tok->buf will be NULL
					 */
					if (tok->decoding_erred)
						return EOF;
					/* Last line does not end in \n,
					   fake one */
					strcpy(tok->inp, "\n");
				}
				tok->inp = strchr(tok->inp, '\0');
				done = tok->inp[-1] == '\n';
			}
			if (tok->buf != NULL) {
				tok->cur = tok->buf + cur;
				tok->line_start = tok->cur;
				/* replace "\r\n" with "\n" */
				/* For Mac leave the \r, giving a syntax error */
				pt = tok->inp - 2;
				if (pt >= tok->buf && *pt == '\r') {
					*pt++ = '\n';
					*pt = '\0';
					tok->inp = pt;
				}
			}
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				PySys_WriteStderr("\n");
			tok->cur = tok->inp;
			return EOF;
		}
	}
	/*NOTREACHED*/
}
/* Back-up one character.  C must be the character just returned by
   tok_nextc (EOF is a no-op); it is written back only if the buffer
   byte differs. */

static void
tok_backup(register struct tok_state *tok, register int c)
{
	if (c != EOF) {
		if (--tok->cur < tok->buf)
			Py_FatalError("tok_backup: begin of buffer");
		if (*tok->cur != c)
			*tok->cur = c;
	}
}
/* Return the token corresponding to a single character;
   OP for any character that is not a recognized operator. */

int
PyToken_OneChar(int c)
{
	switch (c) {
	case '(':	return LPAR;
	case ')':	return RPAR;
	case '[':	return LSQB;
	case ']':	return RSQB;
	case ':':	return COLON;
	case ',':	return COMMA;
	case ';':	return SEMI;
	case '+':	return PLUS;
	case '-':	return MINUS;
	case '*':	return STAR;
	case '/':	return SLASH;
	case '|':	return VBAR;
	case '&':	return AMPER;
	case '<':	return LESS;
	case '>':	return GREATER;
	case '=':	return EQUAL;
	case '.':	return DOT;
	case '%':	return PERCENT;
	case '`':	return BACKQUOTE;
	case '{':	return LBRACE;
	case '}':	return RBRACE;
	case '^':	return CIRCUMFLEX;
	case '~':	return TILDE;
	case '@':	return AT;
	default:	return OP;
	}
}
/* Return the token for the two-character operator C1 C2, or OP when
   the pair is not an operator (the caller falls back to one-char). */

int
PyToken_TwoChars(int c1, int c2)
{
	switch (c1) {
	case '=':
		switch (c2) {
		case '=':	return EQEQUAL;
		}
		break;
	case '!':
		switch (c2) {
		case '=':	return NOTEQUAL;
		}
		break;
	case '<':
		switch (c2) {
		case '>':	return NOTEQUAL;
		case '=':	return LESSEQUAL;
		case '<':	return LEFTSHIFT;
		}
		break;
	case '>':
		switch (c2) {
		case '=':	return GREATEREQUAL;
		case '>':	return RIGHTSHIFT;
		}
		break;
	case '+':
		switch (c2) {
		case '=':	return PLUSEQUAL;
		}
		break;
	case '-':
		switch (c2) {
		case '=':	return MINEQUAL;
		}
		break;
	case '*':
		switch (c2) {
		case '*':	return DOUBLESTAR;
		case '=':	return STAREQUAL;
		}
		break;
	case '/':
		switch (c2) {
		case '/':	return DOUBLESLASH;
		case '=':	return SLASHEQUAL;
		}
		break;
	case '|':
		switch (c2) {
		case '=':	return VBAREQUAL;
		}
		break;
	case '%':
		switch (c2) {
		case '=':	return PERCENTEQUAL;
		}
		break;
	case '&':
		switch (c2) {
		case '=':	return AMPEREQUAL;
		}
		break;
	case '^':
		switch (c2) {
		case '=':	return CIRCUMFLEXEQUAL;
		}
		break;
	}
	return OP;
}
/* Return the token for the three-character operator C1 C2 C3, or OP
   when the triple is not an operator. */

int
PyToken_ThreeChars(int c1, int c2, int c3)
{
	switch (c1) {
	case '<':
		switch (c2) {
		case '<':
			switch (c3) {
			case '=':
				return LEFTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '>':
		switch (c2) {
		case '>':
			switch (c3) {
			case '=':
				return RIGHTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '*':
		switch (c2) {
		case '*':
			switch (c3) {
			case '=':
				return DOUBLESTAREQUAL;
			}
			break;
		}
		break;
	case '/':
		switch (c2) {
		case '/':
			switch (c3) {
			case '=':
				return DOUBLESLASHEQUAL;
			}
			break;
		}
		break;
	}
	return OP;
}
/* Report inconsistent tab/space indentation.  With -tt (alterror set)
   it is a hard error: set E_TABSPACE and return 1 so the caller emits
   ERRORTOKEN.  With -t (altwarning set) print a warning once per file
   and return 0 to continue tokenizing. */
static int
indenterror(struct tok_state *tok)
{
	if (tok->alterror) {
		tok->done = E_TABSPACE;
		tok->cur = tok->inp;
		return 1;
	}
	if (tok->altwarning) {
		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
                                  "in indentation\n", tok->filename);
		tok->altwarning = 0;
	}
	return 0;
}
1108 /* Get next token, after space stripping etc. */
1110 static int
1111 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1113 register int c;
1114 int blankline;
1116 *p_start = *p_end = NULL;
1117 nextline:
1118 tok->start = NULL;
1119 blankline = 0;
1121 /* Get indentation level */
1122 if (tok->atbol) {
1123 register int col = 0;
1124 register int altcol = 0;
1125 tok->atbol = 0;
1126 for (;;) {
1127 c = tok_nextc(tok);
1128 if (c == ' ')
1129 col++, altcol++;
1130 else if (c == '\t') {
1131 col = (col/tok->tabsize + 1) * tok->tabsize;
1132 altcol = (altcol/tok->alttabsize + 1)
1133 * tok->alttabsize;
1135 else if (c == '\014') /* Control-L (formfeed) */
1136 col = altcol = 0; /* For Emacs users */
1137 else
1138 break;
1140 tok_backup(tok, c);
1141 if (c == '#' || c == '\n') {
1142 /* Lines with only whitespace and/or comments
1143 shouldn't affect the indentation and are
1144 not passed to the parser as NEWLINE tokens,
1145 except *totally* empty lines in interactive
1146 mode, which signal the end of a command group. */
1147 if (col == 0 && c == '\n' && tok->prompt != NULL)
1148 blankline = 0; /* Let it through */
1149 else
1150 blankline = 1; /* Ignore completely */
1151 /* We can't jump back right here since we still
1152 may need to skip to the end of a comment */
1154 if (!blankline && tok->level == 0) {
1155 if (col == tok->indstack[tok->indent]) {
1156 /* No change */
1157 if (altcol != tok->altindstack[tok->indent]) {
1158 if (indenterror(tok))
1159 return ERRORTOKEN;
1162 else if (col > tok->indstack[tok->indent]) {
1163 /* Indent -- always one */
1164 if (tok->indent+1 >= MAXINDENT) {
1165 tok->done = E_TOODEEP;
1166 tok->cur = tok->inp;
1167 return ERRORTOKEN;
1169 if (altcol <= tok->altindstack[tok->indent]) {
1170 if (indenterror(tok))
1171 return ERRORTOKEN;
1173 tok->pendin++;
1174 tok->indstack[++tok->indent] = col;
1175 tok->altindstack[tok->indent] = altcol;
1177 else /* col < tok->indstack[tok->indent] */ {
1178 /* Dedent -- any number, must be consistent */
1179 while (tok->indent > 0 &&
1180 col < tok->indstack[tok->indent]) {
1181 tok->pendin--;
1182 tok->indent--;
1184 if (col != tok->indstack[tok->indent]) {
1185 tok->done = E_DEDENT;
1186 tok->cur = tok->inp;
1187 return ERRORTOKEN;
1189 if (altcol != tok->altindstack[tok->indent]) {
1190 if (indenterror(tok))
1191 return ERRORTOKEN;
1197 tok->start = tok->cur;
1199 /* Return pending indents/dedents */
1200 if (tok->pendin != 0) {
1201 if (tok->pendin < 0) {
1202 tok->pendin++;
1203 return DEDENT;
1205 else {
1206 tok->pendin--;
1207 return INDENT;
1211 again:
1212 tok->start = NULL;
1213 /* Skip spaces */
1214 do {
1215 c = tok_nextc(tok);
1216 } while (c == ' ' || c == '\t' || c == '\014');
1218 /* Set start of current token */
1219 tok->start = tok->cur - 1;
1221 /* Skip comment, while looking for tab-setting magic */
1222 if (c == '#') {
1223 static char *tabforms[] = {
1224 "tab-width:", /* Emacs */
1225 ":tabstop=", /* vim, full form */
1226 ":ts=", /* vim, abbreviated form */
1227 "set tabsize=", /* will vi never die? */
1228 /* more templates can be added here to support other editors */
1230 char cbuf[80];
1231 char *tp, **cp;
1232 tp = cbuf;
1233 do {
1234 *tp++ = c = tok_nextc(tok);
1235 } while (c != EOF && c != '\n' &&
1236 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
1237 *tp = '\0';
1238 for (cp = tabforms;
1239 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1240 cp++) {
1241 if ((tp = strstr(cbuf, *cp))) {
1242 int newsize = atoi(tp + strlen(*cp));
1244 if (newsize >= 1 && newsize <= 40) {
1245 tok->tabsize = newsize;
1246 if (Py_VerboseFlag)
1247 PySys_WriteStderr(
1248 "Tab size set to %d\n",
1249 newsize);
1253 while (c != EOF && c != '\n')
1254 c = tok_nextc(tok);
1257 /* Check for EOF and errors now */
1258 if (c == EOF) {
1259 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1262 /* Identifier (most frequent token!) */
1263 if (isalpha(c) || c == '_') {
1264 /* Process r"", u"" and ur"" */
1265 switch (c) {
1266 case 'r':
1267 case 'R':
1268 c = tok_nextc(tok);
1269 if (c == '"' || c == '\'')
1270 goto letter_quote;
1271 break;
1272 case 'u':
1273 case 'U':
1274 c = tok_nextc(tok);
1275 if (c == 'r' || c == 'R')
1276 c = tok_nextc(tok);
1277 if (c == '"' || c == '\'')
1278 goto letter_quote;
1279 break;
1281 while (isalnum(c) || c == '_') {
1282 c = tok_nextc(tok);
1284 tok_backup(tok, c);
1285 *p_start = tok->start;
1286 *p_end = tok->cur;
1287 return NAME;
1290 /* Newline */
1291 if (c == '\n') {
1292 tok->atbol = 1;
1293 if (blankline || tok->level > 0)
1294 goto nextline;
1295 *p_start = tok->start;
1296 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1297 tok->cont_line = 0;
1298 return NEWLINE;
1301 /* Period or number starting with period? */
1302 if (c == '.') {
1303 c = tok_nextc(tok);
1304 if (isdigit(c)) {
1305 goto fraction;
1307 else {
1308 tok_backup(tok, c);
1309 *p_start = tok->start;
1310 *p_end = tok->cur;
1311 return DOT;
1315 /* Number */
1316 if (isdigit(c)) {
1317 if (c == '0') {
1318 /* Hex or octal -- maybe. */
1319 c = tok_nextc(tok);
1320 if (c == '.')
1321 goto fraction;
1322 #ifndef WITHOUT_COMPLEX
1323 if (c == 'j' || c == 'J')
1324 goto imaginary;
1325 #endif
1326 if (c == 'x' || c == 'X') {
1327 /* Hex */
1328 do {
1329 c = tok_nextc(tok);
1330 } while (isxdigit(c));
1332 else {
1333 int found_decimal = 0;
1334 /* Octal; c is first char of it */
1335 /* There's no 'isoctdigit' macro, sigh */
1336 while ('0' <= c && c < '8') {
1337 c = tok_nextc(tok);
1339 if (isdigit(c)) {
1340 found_decimal = 1;
1341 do {
1342 c = tok_nextc(tok);
1343 } while (isdigit(c));
1345 if (c == '.')
1346 goto fraction;
1347 else if (c == 'e' || c == 'E')
1348 goto exponent;
1349 #ifndef WITHOUT_COMPLEX
1350 else if (c == 'j' || c == 'J')
1351 goto imaginary;
1352 #endif
1353 else if (found_decimal) {
1354 tok->done = E_TOKEN;
1355 tok_backup(tok, c);
1356 return ERRORTOKEN;
1359 if (c == 'l' || c == 'L')
1360 c = tok_nextc(tok);
1362 else {
1363 /* Decimal */
1364 do {
1365 c = tok_nextc(tok);
1366 } while (isdigit(c));
1367 if (c == 'l' || c == 'L')
1368 c = tok_nextc(tok);
1369 else {
1370 /* Accept floating point numbers. */
1371 if (c == '.') {
1372 fraction:
1373 /* Fraction */
1374 do {
1375 c = tok_nextc(tok);
1376 } while (isdigit(c));
1378 if (c == 'e' || c == 'E') {
1379 exponent:
1380 /* Exponent part */
1381 c = tok_nextc(tok);
1382 if (c == '+' || c == '-')
1383 c = tok_nextc(tok);
1384 if (!isdigit(c)) {
1385 tok->done = E_TOKEN;
1386 tok_backup(tok, c);
1387 return ERRORTOKEN;
1389 do {
1390 c = tok_nextc(tok);
1391 } while (isdigit(c));
1393 #ifndef WITHOUT_COMPLEX
1394 if (c == 'j' || c == 'J')
1395 /* Imaginary part */
1396 imaginary:
1397 c = tok_nextc(tok);
1398 #endif
1401 tok_backup(tok, c);
1402 *p_start = tok->start;
1403 *p_end = tok->cur;
1404 return NUMBER;
1407 letter_quote:
1408 /* String */
1409 if (c == '\'' || c == '"') {
1410 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1411 int quote = c;
1412 int triple = 0;
1413 int tripcount = 0;
1414 for (;;) {
1415 c = tok_nextc(tok);
1416 if (c == '\n') {
1417 if (!triple) {
1418 tok->done = E_EOLS;
1419 tok_backup(tok, c);
1420 return ERRORTOKEN;
1422 tripcount = 0;
1423 tok->cont_line = 1; /* multiline string. */
1425 else if (c == EOF) {
1426 if (triple)
1427 tok->done = E_EOFS;
1428 else
1429 tok->done = E_EOLS;
1430 tok->cur = tok->inp;
1431 return ERRORTOKEN;
1433 else if (c == quote) {
1434 tripcount++;
1435 if (tok->cur - tok->start == quote2) {
1436 c = tok_nextc(tok);
1437 if (c == quote) {
1438 triple = 1;
1439 tripcount = 0;
1440 continue;
1442 tok_backup(tok, c);
1444 if (!triple || tripcount == 3)
1445 break;
1447 else if (c == '\\') {
1448 tripcount = 0;
1449 c = tok_nextc(tok);
1450 if (c == EOF) {
1451 tok->done = E_EOLS;
1452 tok->cur = tok->inp;
1453 return ERRORTOKEN;
1456 else
1457 tripcount = 0;
1459 *p_start = tok->start;
1460 *p_end = tok->cur;
1461 return STRING;
1464 /* Line continuation */
1465 if (c == '\\') {
1466 c = tok_nextc(tok);
1467 if (c != '\n') {
1468 tok->done = E_LINECONT;
1469 tok->cur = tok->inp;
1470 return ERRORTOKEN;
1472 tok->cont_line = 1;
1473 goto again; /* Read next line */
1476 /* Check for two-character token */
1478 int c2 = tok_nextc(tok);
1479 int token = PyToken_TwoChars(c, c2);
1480 #ifndef PGEN
1481 if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {
1482 if (PyErr_WarnExplicit(PyExc_DeprecationWarning,
1483 "<> not supported in 3.x",
1484 tok->filename, tok->lineno,
1485 NULL, NULL)) {
1486 return ERRORTOKEN;
1489 #endif
1490 if (token != OP) {
1491 int c3 = tok_nextc(tok);
1492 int token3 = PyToken_ThreeChars(c, c2, c3);
1493 if (token3 != OP) {
1494 token = token3;
1495 } else {
1496 tok_backup(tok, c3);
1498 *p_start = tok->start;
1499 *p_end = tok->cur;
1500 return token;
1502 tok_backup(tok, c2);
1505 /* Keep track of parentheses nesting level */
1506 switch (c) {
1507 case '(':
1508 case '[':
1509 case '{':
1510 tok->level++;
1511 break;
1512 case ')':
1513 case ']':
1514 case '}':
1515 tok->level--;
1516 break;
1519 /* Punctuation character */
1520 *p_start = tok->start;
1521 *p_end = tok->cur;
1522 return PyToken_OneChar(c);
1526 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1528 int result = tok_get(tok, p_start, p_end);
1529 if (tok->decoding_erred) {
1530 result = ERRORTOKEN;
1531 tok->done = E_DECODE;
1533 return result;
/* This function is only called from parsetok.  However, it cannot live
   there, as it must be empty for PGEN, and we can check for PGEN only
   in this file. */
#ifdef PGEN
/* PGEN builds have no codec machinery; report "no original text". */
char*
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int* offset)
{
	return NULL;
}
#else
#ifdef Py_USING_UNICODE
/* Decode LEN bytes of TEXT as UTF-8, then re-encode in ENC.
   Returns a new str reference, or NULL (with the error cleared) on
   failure.  Both steps use "replace" so malformed input cannot raise. */
static PyObject *
dec_utf8(const char *enc, const char *text, size_t len) {
	PyObject *ret = NULL;
	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");
	if (unicode_text) {
		ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");
		Py_DECREF(unicode_text);
	}
	if (!ret) {
		PyErr_Clear();
	}
	return ret;
}

/* Re-encode the first LEN bytes of the current line (held internally as
   UTF-8) back into the source file's declared encoding, for use in error
   messages.  *OFFSET, a 1-based column in the UTF-8 text, is adjusted
   in place to the corresponding column in the re-encoded text.
   Returns a PyObject_MALLOC'ed NUL-terminated string the caller must
   free, or NULL if no encoding is set or conversion fails. */
char *
PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)
{
	char *text = NULL;
	if (tok->encoding) {
		/* convert source to original encoding */
		PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
		if (lineobj != NULL) {
			/* Py_ssize_t: PyString_Size may exceed int range. */
			Py_ssize_t linelen = PyString_Size(lineobj);
			const char *line = PyString_AsString(lineobj);
			/* Allocate only once we have valid bytes to copy;
			   otherwise we would hand back uninitialized,
			   unterminated memory. */
			if (line != NULL && linelen >= 0) {
				text = PyObject_MALLOC(linelen + 1);
				if (text != NULL) {
					/* memcpy, not strncpy: length is
					   known and data may be binary. */
					memcpy(text, line, linelen);
					text[linelen] = '\0';
				}
			}
			Py_DECREF(lineobj);

			/* adjust error offset */
			if (*offset > 1) {
				PyObject *offsetobj = dec_utf8(tok->encoding,
							       tok->buf, *offset-1);
				if (offsetobj) {
					*offset = (int)PyString_Size(offsetobj) + 1;
					Py_DECREF(offsetobj);
				}
			}
		}
	}
	return text;
}
#endif /* defined(Py_USING_UNICODE) */
#endif
#ifdef Py_DEBUG

/* Debug helper: print a token's name to stdout, and for the token kinds
   that carry text (NAME, NUMBER, STRING, OP) also print the text between
   START and END in parentheses.  No trailing newline is emitted. */
void
tok_dump(int type, char *start, char *end)
{
	printf("%s", _PyParser_TokenNames[type]);
	switch (type) {
	case NAME:
	case NUMBER:
	case STRING:
	case OP:
		printf("(%.*s)", (int)(end - start), start);
		break;
	default:
		break;
	}
}

#endif