Comment out the prints. These appear to be only for debugging purposes.
[python.git] / Parser / tokenizer.c
blob001d31a1077940d852543e0e58aac2c0171f0507
2 /* Tokenizer implementation */
4 #include "Python.h"
5 #include "pgenheaders.h"
7 #include <ctype.h>
8 #include <assert.h>
10 #include "tokenizer.h"
11 #include "errcode.h"
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
21 extern char *PyOS_Readline(FILE *, FILE *, char *);
22 /* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
26 /* Don't ever change this -- it would break the portability of Python code */
27 #define TABSIZE 8
29 /* Convert a possibly signed character to a nonnegative int */
30 /* XXX This assumes characters are 8 bits wide */
31 #ifdef __CHAR_UNSIGNED__
32 #define Py_CHARMASK(c) (c)
33 #else
34 #define Py_CHARMASK(c) ((c) & 0xff)
35 #endif
37 /* Forward */
38 static struct tok_state *tok_new(void);
39 static int tok_nextc(struct tok_state *tok);
40 static void tok_backup(struct tok_state *tok, int c);
42 /* Token names */
/* Token names, indexed by the token codes defined in token.h.
   The order here must match the #defines in token.h exactly. */
char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};
103 /* Create and initialize a new tok_state structure */
105 static struct tok_state *
106 tok_new(void)
108 struct tok_state *tok = PyMem_NEW(struct tok_state, 1);
109 if (tok == NULL)
110 return NULL;
111 tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
112 tok->done = E_OK;
113 tok->fp = NULL;
114 tok->tabsize = TABSIZE;
115 tok->indent = 0;
116 tok->indstack[0] = 0;
117 tok->atbol = 1;
118 tok->pendin = 0;
119 tok->prompt = tok->nextprompt = NULL;
120 tok->lineno = 0;
121 tok->level = 0;
122 tok->filename = NULL;
123 tok->altwarning = 0;
124 tok->alterror = 0;
125 tok->alttabsize = 1;
126 tok->altindstack[0] = 0;
127 tok->decoding_state = 0;
128 tok->decoding_erred = 0;
129 tok->read_coding_spec = 0;
130 tok->encoding = NULL;
131 tok->cont_line = 0;
132 #ifndef PGEN
133 tok->decoding_readline = NULL;
134 tok->decoding_buffer = NULL;
135 #endif
136 return tok;
139 #ifdef PGEN
141 static char *
142 decoding_fgets(char *s, int size, struct tok_state *tok)
144 return fgets(s, size, tok->fp);
147 static int
148 decoding_feof(struct tok_state *tok)
150 return feof(tok->fp);
153 static const char *
154 decode_str(const char *str, struct tok_state *tok)
156 return str;
159 #else /* PGEN */
161 static char *
162 error_ret(struct tok_state *tok) /* XXX */
164 tok->decoding_erred = 1;
165 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
166 PyObject_FREE(tok->buf);
167 tok->buf = NULL;
168 return NULL; /* as if it were EOF */
171 static char *
172 new_string(const char *s, Py_ssize_t len)
174 char* result = (char *)PyObject_MALLOC(len + 1);
175 if (result != NULL) {
176 memcpy(result, s, len);
177 result[len] = '\0';
179 return result;
/* Normalize an encoding name for the two encodings the tokenizer can
   read natively.  Lower-cases up to the first 12 characters of S and
   maps '_' to '-'; returns "utf-8" or "iso-8859-1" for any of their
   recognized aliases, otherwise S itself, unchanged. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0')
			break;
		else if (c == '_')
			buf[i] = '-';
		else
			/* Cast: passing a negative (plain, signed) char
			   value to tolower() is undefined behavior. */
			buf[i] = tolower((unsigned char)c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0)
		return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0)
		return "iso-8859-1";
	else
		return s;
}
205 /* Return the coding spec in S, or NULL if none is found. */
207 static char *
208 get_coding_spec(const char *s, Py_ssize_t size)
210 Py_ssize_t i;
211 /* Coding spec must be in a comment, and that comment must be
212 * the only statement on the source code line. */
213 for (i = 0; i < size - 6; i++) {
214 if (s[i] == '#')
215 break;
216 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
217 return NULL;
219 for (; i < size - 6; i++) { /* XXX inefficient search */
220 const char* t = s + i;
221 if (strncmp(t, "coding", 6) == 0) {
222 const char* begin = NULL;
223 t += 6;
224 if (t[0] != ':' && t[0] != '=')
225 continue;
226 do {
227 t++;
228 } while (t[0] == '\x20' || t[0] == '\t');
230 begin = t;
231 while (isalnum(Py_CHARMASK(t[0])) ||
232 t[0] == '-' || t[0] == '_' || t[0] == '.')
233 t++;
235 if (begin < t) {
236 char* r = new_string(begin, t - begin);
237 char* q = get_normal_name(r);
238 if (r != q) {
239 PyObject_FREE(r);
240 r = new_string(q, strlen(q));
242 return r;
246 return NULL;
249 /* Check whether the line contains a coding spec. If it does,
250 invoke the set_readline function for the new encoding.
251 This function receives the tok_state and the new encoding.
252 Return 1 on success, 0 on failure. */
254 static int
255 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
256 int set_readline(struct tok_state *, const char *))
258 char * cs;
259 int r = 1;
261 if (tok->cont_line)
262 /* It's a continuation line, so it can't be a coding spec. */
263 return 1;
264 cs = get_coding_spec(line, size);
265 if (cs != NULL) {
266 tok->read_coding_spec = 1;
267 if (tok->encoding == NULL) {
268 assert(tok->decoding_state == 1); /* raw */
269 if (strcmp(cs, "utf-8") == 0 ||
270 strcmp(cs, "iso-8859-1") == 0) {
271 tok->encoding = cs;
272 } else {
273 #ifdef Py_USING_UNICODE
274 r = set_readline(tok, cs);
275 if (r) {
276 tok->encoding = cs;
277 tok->decoding_state = -1;
279 else
280 PyObject_FREE(cs);
281 #else
282 /* Without Unicode support, we cannot
283 process the coding spec. Since there
284 won't be any Unicode literals, that
285 won't matter. */
286 PyObject_FREE(cs);
287 #endif
289 } else { /* then, compare cs with BOM */
290 r = (strcmp(tok->encoding, cs) == 0);
291 PyObject_FREE(cs);
294 if (!r) {
295 cs = tok->encoding;
296 if (!cs)
297 cs = "with BOM";
298 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300 return r;
303 /* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
307 static int
308 check_bom(int get_char(struct tok_state *),
309 void unget_char(int, struct tok_state *),
310 int set_readline(struct tok_state *, const char *),
311 struct tok_state *tok)
313 int ch = get_char(tok);
314 tok->decoding_state = 1;
315 if (ch == EOF) {
316 return 1;
317 } else if (ch == 0xEF) {
318 ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
319 ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
320 #if 0
321 /* Disable support for UTF-16 BOMs until a decision
322 is made whether this needs to be supported. */
323 } else if (ch == 0xFE) {
324 ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
325 if (!set_readline(tok, "utf-16-be")) return 0;
326 tok->decoding_state = -1;
327 } else if (ch == 0xFF) {
328 ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
329 if (!set_readline(tok, "utf-16-le")) return 0;
330 tok->decoding_state = -1;
331 #endif
332 } else {
333 unget_char(ch, tok);
334 return 1;
336 if (tok->encoding != NULL)
337 PyObject_FREE(tok->encoding);
338 tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
339 return 1;
340 NON_BOM:
341 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
342 unget_char(0xFF, tok); /* XXX this will cause a syntax error */
343 return 1;
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
	stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
	(in the s buffer) to copy entire contents of the line read
	by tok->decoding_readline.  tok->decoding_buffer has the overflow.
	In this case, fp_readl is called in a loop (with an expanded buffer)
	until the buffer ends with a '\n' (or until the end of the file is
	reached): see tok_nextc and its calls to decoding_fgets. */

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8 = NULL;
	PyObject* buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		/* Case 1: pull a fresh line from the codec reader. */
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		/* Cases 2 and 3: consume the stashed object. */
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;	/* overflow bytes, already utf-8 */
	}
	if (utf8 == NULL) {
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		/* Doesn't fit: stash the tail for the next call (case 3). */
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL; /* EOF */
	return s;
#endif
}
411 /* Set the readline function for TOK to a StreamReader's
412 readline function. The StreamReader is named ENC.
414 This function is called from check_bom and check_coding_spec.
416 ENC is usually identical to the future value of tok->encoding,
417 except for the (currently unsupported) case of UTF-16.
419 Return 1 on success, 0 on failure. */
421 static int
422 fp_setreadl(struct tok_state *tok, const char* enc)
424 PyObject *reader, *stream, *readline;
426 /* XXX: constify filename argument. */
427 stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
428 if (stream == NULL)
429 return 0;
431 reader = PyCodec_StreamReader(enc, stream, NULL);
432 Py_DECREF(stream);
433 if (reader == NULL)
434 return 0;
436 readline = PyObject_GetAttrString(reader, "readline");
437 Py_DECREF(reader);
438 if (readline == NULL)
439 return 0;
441 tok->decoding_readline = readline;
442 return 1;
445 /* Fetch the next byte from TOK. */
447 static int fp_getc(struct tok_state *tok) {
448 return getc(tok->fp);
451 /* Unfetch the last byte back into TOK. */
453 static void fp_ungetc(int c, struct tok_state *tok) {
454 ungetc(c, tok->fp);
457 /* Read a line of input from TOK. Determine encoding
458 if necessary. */
460 static char *
461 decoding_fgets(char *s, int size, struct tok_state *tok)
463 char *line = NULL;
464 int badchar = 0;
465 for (;;) {
466 if (tok->decoding_state < 0) {
467 /* We already have a codec associated with
468 this input. */
469 line = fp_readl(s, size, tok);
470 break;
471 } else if (tok->decoding_state > 0) {
472 /* We want a 'raw' read. */
473 line = Py_UniversalNewlineFgets(s, size,
474 tok->fp, NULL);
475 break;
476 } else {
477 /* We have not yet determined the encoding.
478 If an encoding is found, use the file-pointer
479 reader functions from now on. */
480 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
481 return error_ret(tok);
482 assert(tok->decoding_state != 0);
485 if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
486 if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
487 return error_ret(tok);
490 #ifndef PGEN
491 /* The default encoding is ASCII, so make sure we don't have any
492 non-ASCII bytes in it. */
493 if (line && !tok->encoding) {
494 unsigned char *c;
495 for (c = (unsigned char *)line; *c; c++)
496 if (*c > 127) {
497 badchar = *c;
498 break;
501 if (badchar) {
502 char buf[500];
503 /* Need to add 1 to the line number, since this line
504 has not been counted, yet. */
505 sprintf(buf,
506 "Non-ASCII character '\\x%.2x' "
507 "in file %.200s on line %i, "
508 "but no encoding declared; "
509 "see http://www.python.org/peps/pep-0263.html for details",
510 badchar, tok->filename, tok->lineno + 1);
511 PyErr_SetString(PyExc_SyntaxError, buf);
512 return error_ret(tok);
514 #endif
515 return line;
518 static int
519 decoding_feof(struct tok_state *tok)
521 if (tok->decoding_state >= 0) {
522 return feof(tok->fp);
523 } else {
524 PyObject* buf = tok->decoding_buffer;
525 if (buf == NULL) {
526 buf = PyObject_CallObject(tok->decoding_readline, NULL);
527 if (buf == NULL) {
528 error_ret(tok);
529 return 1;
530 } else {
531 tok->decoding_buffer = buf;
534 return PyObject_Length(buf) == 0;
538 /* Fetch a byte from TOK, using the string buffer. */
540 static int
541 buf_getc(struct tok_state *tok) {
542 return Py_CHARMASK(*tok->str++);
545 /* Unfetch a byte from TOK, using the string buffer. */
547 static void
548 buf_ungetc(int c, struct tok_state *tok) {
549 tok->str--;
550 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
553 /* Set the readline function for TOK to ENC. For the string-based
554 tokenizer, this means to just record the encoding. */
556 static int
557 buf_setreadl(struct tok_state *tok, const char* enc) {
558 tok->enc = enc;
559 return 1;
/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *utf8;
	PyObject *decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
	if (decoded == NULL)
		return NULL;
	utf8 = PyUnicode_AsUTF8String(decoded);
	Py_DECREF(decoded);
	return utf8;	/* NULL on re-encoding failure */
}
#endif
578 /* Decode a byte string STR for use as the buffer of TOK.
579 Look for encoding declarations inside STR, and record them
580 inside TOK. */
582 static const char *
583 decode_str(const char *str, struct tok_state *tok)
585 PyObject* utf8 = NULL;
586 const char *s;
587 int lineno = 0;
588 tok->enc = NULL;
589 tok->str = str;
590 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
591 return error_ret(tok);
592 str = tok->str; /* string after BOM if any */
593 assert(str);
594 #ifdef Py_USING_UNICODE
595 if (tok->enc != NULL) {
596 utf8 = translate_into_utf8(str, tok->enc);
597 if (utf8 == NULL)
598 return error_ret(tok);
599 str = PyString_AsString(utf8);
601 #endif
602 for (s = str;; s++) {
603 if (*s == '\0') break;
604 else if (*s == '\n') {
605 lineno++;
606 if (lineno == 2) break;
609 tok->enc = NULL;
610 if (!check_coding_spec(str, s - str, tok, buf_setreadl))
611 return error_ret(tok);
612 #ifdef Py_USING_UNICODE
613 if (tok->enc != NULL) {
614 assert(utf8 == NULL);
615 utf8 = translate_into_utf8(str, tok->enc);
616 if (utf8 == NULL) {
617 PyErr_Format(PyExc_SyntaxError,
618 "unknown encoding: %s", tok->enc);
619 return error_ret(tok);
621 str = PyString_AsString(utf8);
623 #endif
624 assert(tok->decoding_buffer == NULL);
625 tok->decoding_buffer = utf8; /* CAUTION */
626 return str;
629 #endif /* PGEN */
631 /* Set up tokenizer for string */
633 struct tok_state *
634 PyTokenizer_FromString(const char *str)
636 struct tok_state *tok = tok_new();
637 if (tok == NULL)
638 return NULL;
639 str = (char *)decode_str(str, tok);
640 if (str == NULL) {
641 PyTokenizer_Free(tok);
642 return NULL;
645 /* XXX: constify members. */
646 tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
647 return tok;
651 /* Set up tokenizer for file */
653 struct tok_state *
654 PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
656 struct tok_state *tok = tok_new();
657 if (tok == NULL)
658 return NULL;
659 if ((tok->buf = (char *)PyObject_MALLOC(BUFSIZ)) == NULL) {
660 PyTokenizer_Free(tok);
661 return NULL;
663 tok->cur = tok->inp = tok->buf;
664 tok->end = tok->buf + BUFSIZ;
665 tok->fp = fp;
666 tok->prompt = ps1;
667 tok->nextprompt = ps2;
668 return tok;
672 /* Free a tok_state structure */
674 void
675 PyTokenizer_Free(struct tok_state *tok)
677 if (tok->encoding != NULL)
678 PyObject_FREE(tok->encoding);
679 #ifndef PGEN
680 Py_XDECREF(tok->decoding_readline);
681 Py_XDECREF(tok->decoding_buffer);
682 #endif
683 if (tok->fp != NULL && tok->buf != NULL)
684 PyObject_FREE(tok->buf);
685 PyMem_FREE(tok);
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Recode the interactive line *INP from sys.stdin's encoding to utf-8,
   replacing *INP in place and recording the encoding on TOK.
   Returns 0 on success or benign fallback, -1 on memory error
   (tok->done set to E_NOMEM). */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
	PyObject *enc, *sysstdin, *decoded, *utf8;
	const char *encoding;
	char *converted;

	/* Only applies when we are really reading from interactive stdin. */
	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
		return 0;
	sysstdin = PySys_GetObject("stdin");
	if (sysstdin == NULL || !PyFile_Check(sysstdin))
		return 0;

	enc = ((PyFileObject *)sysstdin)->f_encoding;
	if (enc == NULL || !PyString_Check(enc))
		return 0;
	Py_INCREF(enc);

	encoding = PyString_AsString(enc);
	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
	if (decoded == NULL)
		goto error_clear;

	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
	Py_DECREF(decoded);
	if (utf8 == NULL)
		goto error_clear;

	assert(PyString_Check(utf8));
	converted = new_string(PyString_AS_STRING(utf8),
			       PyString_GET_SIZE(utf8));
	Py_DECREF(utf8);
	if (converted == NULL)
		goto error_nomem;

	PyMem_FREE(*inp);
	*inp = converted;
	if (tok->encoding != NULL)
		PyObject_FREE(tok->encoding);
	tok->encoding = new_string(encoding, strlen(encoding));
	if (tok->encoding == NULL)
		goto error_nomem;

	Py_DECREF(enc);
	return 0;

error_nomem:
	Py_DECREF(enc);
	tok->done = E_NOMEM;
	return -1;

error_clear:
	/* Fallback to iso-8859-1: for backward compatibility */
	Py_DECREF(enc);
	PyErr_Clear();
	return 0;
}
#endif
748 /* Get next char, updating state; error code goes into tok->done */
750 static int
751 tok_nextc(register struct tok_state *tok)
753 for (;;) {
754 if (tok->cur != tok->inp) {
755 return Py_CHARMASK(*tok->cur++); /* Fast path */
757 if (tok->done != E_OK)
758 return EOF;
759 if (tok->fp == NULL) {
760 char *end = strchr(tok->inp, '\n');
761 if (end != NULL)
762 end++;
763 else {
764 end = strchr(tok->inp, '\0');
765 if (end == tok->inp) {
766 tok->done = E_EOF;
767 return EOF;
770 if (tok->start == NULL)
771 tok->buf = tok->cur;
772 tok->line_start = tok->cur;
773 tok->lineno++;
774 tok->inp = end;
775 return Py_CHARMASK(*tok->cur++);
777 if (tok->prompt != NULL) {
778 char *new = PyOS_Readline(stdin, stdout, tok->prompt);
779 if (tok->nextprompt != NULL)
780 tok->prompt = tok->nextprompt;
781 if (new == NULL)
782 tok->done = E_INTR;
783 else if (*new == '\0') {
784 PyMem_FREE(new);
785 tok->done = E_EOF;
787 #if !defined(PGEN) && defined(Py_USING_UNICODE)
788 else if (tok_stdin_decode(tok, &new) != 0)
789 PyMem_FREE(new);
790 #endif
791 else if (tok->start != NULL) {
792 size_t start = tok->start - tok->buf;
793 size_t oldlen = tok->cur - tok->buf;
794 size_t newlen = oldlen + strlen(new);
795 char *buf = tok->buf;
796 buf = (char *)PyObject_REALLOC(buf, newlen+1);
797 tok->lineno++;
798 if (buf == NULL) {
799 PyObject_FREE(tok->buf);
800 tok->buf = NULL;
801 PyMem_FREE(new);
802 tok->done = E_NOMEM;
803 return EOF;
805 tok->buf = buf;
806 tok->cur = tok->buf + oldlen;
807 tok->line_start = tok->cur;
808 strcpy(tok->buf + oldlen, new);
809 PyMem_FREE(new);
810 tok->inp = tok->buf + newlen;
811 tok->end = tok->inp + 1;
812 tok->start = tok->buf + start;
814 else {
815 tok->lineno++;
816 if (tok->buf != NULL)
817 PyObject_FREE(tok->buf);
818 tok->buf = new;
819 tok->line_start = tok->buf;
820 tok->cur = tok->buf;
821 tok->line_start = tok->buf;
822 tok->inp = strchr(tok->buf, '\0');
823 tok->end = tok->inp + 1;
826 else {
827 int done = 0;
828 Py_ssize_t cur = 0;
829 char *pt;
830 if (tok->start == NULL) {
831 if (tok->buf == NULL) {
832 tok->buf = (char *)
833 PyObject_MALLOC(BUFSIZ);
834 if (tok->buf == NULL) {
835 tok->done = E_NOMEM;
836 return EOF;
838 tok->end = tok->buf + BUFSIZ;
840 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
841 tok) == NULL) {
842 tok->done = E_EOF;
843 done = 1;
845 else {
846 tok->done = E_OK;
847 tok->inp = strchr(tok->buf, '\0');
848 done = tok->inp[-1] == '\n';
851 else {
852 cur = tok->cur - tok->buf;
853 if (decoding_feof(tok)) {
854 tok->done = E_EOF;
855 done = 1;
857 else
858 tok->done = E_OK;
860 tok->lineno++;
861 /* Read until '\n' or EOF */
862 while (!done) {
863 Py_ssize_t curstart = tok->start == NULL ? -1 :
864 tok->start - tok->buf;
865 Py_ssize_t curvalid = tok->inp - tok->buf;
866 Py_ssize_t newsize = curvalid + BUFSIZ;
867 char *newbuf = tok->buf;
868 newbuf = (char *)PyObject_REALLOC(newbuf,
869 newsize);
870 if (newbuf == NULL) {
871 tok->done = E_NOMEM;
872 tok->cur = tok->inp;
873 return EOF;
875 tok->buf = newbuf;
876 tok->inp = tok->buf + curvalid;
877 tok->end = tok->buf + newsize;
878 tok->start = curstart < 0 ? NULL :
879 tok->buf + curstart;
880 if (decoding_fgets(tok->inp,
881 (int)(tok->end - tok->inp),
882 tok) == NULL) {
883 /* Break out early on decoding
884 errors, as tok->buf will be NULL
886 if (tok->decoding_erred)
887 return EOF;
888 /* Last line does not end in \n,
889 fake one */
890 strcpy(tok->inp, "\n");
892 tok->inp = strchr(tok->inp, '\0');
893 done = tok->inp[-1] == '\n';
895 tok->cur = tok->buf + cur;
896 tok->line_start = tok->cur;
897 /* replace "\r\n" with "\n" */
898 /* For Mac we leave the \r, giving a syntax error */
899 pt = tok->inp - 2;
900 if (pt >= tok->buf && *pt == '\r') {
901 *pt++ = '\n';
902 *pt = '\0';
903 tok->inp = pt;
906 if (tok->done != E_OK) {
907 if (tok->prompt != NULL)
908 PySys_WriteStderr("\n");
909 tok->cur = tok->inp;
910 return EOF;
913 /*NOTREACHED*/
917 /* Back-up one character */
919 static void
920 tok_backup(register struct tok_state *tok, register int c)
922 if (c != EOF) {
923 if (--tok->cur < tok->buf)
924 Py_FatalError("tok_backup: begin of buffer");
925 if (*tok->cur != c)
926 *tok->cur = c;
931 /* Return the token corresponding to a single character */
934 PyToken_OneChar(int c)
936 switch (c) {
937 case '(': return LPAR;
938 case ')': return RPAR;
939 case '[': return LSQB;
940 case ']': return RSQB;
941 case ':': return COLON;
942 case ',': return COMMA;
943 case ';': return SEMI;
944 case '+': return PLUS;
945 case '-': return MINUS;
946 case '*': return STAR;
947 case '/': return SLASH;
948 case '|': return VBAR;
949 case '&': return AMPER;
950 case '<': return LESS;
951 case '>': return GREATER;
952 case '=': return EQUAL;
953 case '.': return DOT;
954 case '%': return PERCENT;
955 case '`': return BACKQUOTE;
956 case '{': return LBRACE;
957 case '}': return RBRACE;
958 case '^': return CIRCUMFLEX;
959 case '~': return TILDE;
960 case '@': return AT;
961 default: return OP;
967 PyToken_TwoChars(int c1, int c2)
969 switch (c1) {
970 case '=':
971 switch (c2) {
972 case '=': return EQEQUAL;
974 break;
975 case '!':
976 switch (c2) {
977 case '=': return NOTEQUAL;
979 break;
980 case '<':
981 switch (c2) {
982 case '>': return NOTEQUAL;
983 case '=': return LESSEQUAL;
984 case '<': return LEFTSHIFT;
986 break;
987 case '>':
988 switch (c2) {
989 case '=': return GREATEREQUAL;
990 case '>': return RIGHTSHIFT;
992 break;
993 case '+':
994 switch (c2) {
995 case '=': return PLUSEQUAL;
997 break;
998 case '-':
999 switch (c2) {
1000 case '=': return MINEQUAL;
1002 break;
1003 case '*':
1004 switch (c2) {
1005 case '*': return DOUBLESTAR;
1006 case '=': return STAREQUAL;
1008 break;
1009 case '/':
1010 switch (c2) {
1011 case '/': return DOUBLESLASH;
1012 case '=': return SLASHEQUAL;
1014 break;
1015 case '|':
1016 switch (c2) {
1017 case '=': return VBAREQUAL;
1019 break;
1020 case '%':
1021 switch (c2) {
1022 case '=': return PERCENTEQUAL;
1024 break;
1025 case '&':
1026 switch (c2) {
1027 case '=': return AMPEREQUAL;
1029 break;
1030 case '^':
1031 switch (c2) {
1032 case '=': return CIRCUMFLEXEQUAL;
1034 break;
1036 return OP;
1040 PyToken_ThreeChars(int c1, int c2, int c3)
1042 switch (c1) {
1043 case '<':
1044 switch (c2) {
1045 case '<':
1046 switch (c3) {
1047 case '=':
1048 return LEFTSHIFTEQUAL;
1050 break;
1052 break;
1053 case '>':
1054 switch (c2) {
1055 case '>':
1056 switch (c3) {
1057 case '=':
1058 return RIGHTSHIFTEQUAL;
1060 break;
1062 break;
1063 case '*':
1064 switch (c2) {
1065 case '*':
1066 switch (c3) {
1067 case '=':
1068 return DOUBLESTAREQUAL;
1070 break;
1072 break;
1073 case '/':
1074 switch (c2) {
1075 case '/':
1076 switch (c3) {
1077 case '=':
1078 return DOUBLESLASHEQUAL;
1080 break;
1082 break;
1084 return OP;
1087 static int
1088 indenterror(struct tok_state *tok)
1090 if (tok->alterror) {
1091 tok->done = E_TABSPACE;
1092 tok->cur = tok->inp;
1093 return 1;
1095 if (tok->altwarning) {
1096 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1097 "in indentation\n", tok->filename);
1098 tok->altwarning = 0;
1100 return 0;
1104 /* Get next token, after space stripping etc. */
1106 static int
1107 tok_get(register struct tok_state *tok, char **p_start, char **p_end)
1109 register int c;
1110 int blankline;
1112 *p_start = *p_end = NULL;
1113 nextline:
1114 tok->start = NULL;
1115 blankline = 0;
1117 /* Get indentation level */
1118 if (tok->atbol) {
1119 register int col = 0;
1120 register int altcol = 0;
1121 tok->atbol = 0;
1122 for (;;) {
1123 c = tok_nextc(tok);
1124 if (c == ' ')
1125 col++, altcol++;
1126 else if (c == '\t') {
1127 col = (col/tok->tabsize + 1) * tok->tabsize;
1128 altcol = (altcol/tok->alttabsize + 1)
1129 * tok->alttabsize;
1131 else if (c == '\014') /* Control-L (formfeed) */
1132 col = altcol = 0; /* For Emacs users */
1133 else
1134 break;
1136 tok_backup(tok, c);
1137 if (c == '#' || c == '\n') {
1138 /* Lines with only whitespace and/or comments
1139 shouldn't affect the indentation and are
1140 not passed to the parser as NEWLINE tokens,
1141 except *totally* empty lines in interactive
1142 mode, which signal the end of a command group. */
1143 if (col == 0 && c == '\n' && tok->prompt != NULL)
1144 blankline = 0; /* Let it through */
1145 else
1146 blankline = 1; /* Ignore completely */
1147 /* We can't jump back right here since we still
1148 may need to skip to the end of a comment */
1150 if (!blankline && tok->level == 0) {
1151 if (col == tok->indstack[tok->indent]) {
1152 /* No change */
1153 if (altcol != tok->altindstack[tok->indent]) {
1154 if (indenterror(tok))
1155 return ERRORTOKEN;
1158 else if (col > tok->indstack[tok->indent]) {
1159 /* Indent -- always one */
1160 if (tok->indent+1 >= MAXINDENT) {
1161 tok->done = E_TOODEEP;
1162 tok->cur = tok->inp;
1163 return ERRORTOKEN;
1165 if (altcol <= tok->altindstack[tok->indent]) {
1166 if (indenterror(tok))
1167 return ERRORTOKEN;
1169 tok->pendin++;
1170 tok->indstack[++tok->indent] = col;
1171 tok->altindstack[tok->indent] = altcol;
1173 else /* col < tok->indstack[tok->indent] */ {
1174 /* Dedent -- any number, must be consistent */
1175 while (tok->indent > 0 &&
1176 col < tok->indstack[tok->indent]) {
1177 tok->pendin--;
1178 tok->indent--;
1180 if (col != tok->indstack[tok->indent]) {
1181 tok->done = E_DEDENT;
1182 tok->cur = tok->inp;
1183 return ERRORTOKEN;
1185 if (altcol != tok->altindstack[tok->indent]) {
1186 if (indenterror(tok))
1187 return ERRORTOKEN;
1193 tok->start = tok->cur;
1195 /* Return pending indents/dedents */
1196 if (tok->pendin != 0) {
1197 if (tok->pendin < 0) {
1198 tok->pendin++;
1199 return DEDENT;
1201 else {
1202 tok->pendin--;
1203 return INDENT;
1207 again:
1208 tok->start = NULL;
1209 /* Skip spaces */
1210 do {
1211 c = tok_nextc(tok);
1212 } while (c == ' ' || c == '\t' || c == '\014');
1214 /* Set start of current token */
1215 tok->start = tok->cur - 1;
1217 /* Skip comment, while looking for tab-setting magic */
1218 if (c == '#') {
1219 static char *tabforms[] = {
1220 "tab-width:", /* Emacs */
1221 ":tabstop=", /* vim, full form */
1222 ":ts=", /* vim, abbreviated form */
1223 "set tabsize=", /* will vi never die? */
1224 /* more templates can be added here to support other editors */
1226 char cbuf[80];
1227 char *tp, **cp;
1228 tp = cbuf;
1229 do {
1230 *tp++ = c = tok_nextc(tok);
1231 } while (c != EOF && c != '\n' &&
1232 tp - cbuf + 1 < sizeof(cbuf));
1233 *tp = '\0';
1234 for (cp = tabforms;
1235 cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1236 cp++) {
1237 if ((tp = strstr(cbuf, *cp))) {
1238 int newsize = atoi(tp + strlen(*cp));
1240 if (newsize >= 1 && newsize <= 40) {
1241 tok->tabsize = newsize;
1242 if (Py_VerboseFlag)
1243 PySys_WriteStderr(
1244 "Tab size set to %d\n",
1245 newsize);
1249 while (c != EOF && c != '\n')
1250 c = tok_nextc(tok);
1253 /* Check for EOF and errors now */
1254 if (c == EOF) {
1255 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1258 /* Identifier (most frequent token!) */
1259 if (isalpha(c) || c == '_') {
1260 /* Process r"", u"" and ur"" */
1261 switch (c) {
1262 case 'r':
1263 case 'R':
1264 c = tok_nextc(tok);
1265 if (c == '"' || c == '\'')
1266 goto letter_quote;
1267 break;
1268 case 'u':
1269 case 'U':
1270 c = tok_nextc(tok);
1271 if (c == 'r' || c == 'R')
1272 c = tok_nextc(tok);
1273 if (c == '"' || c == '\'')
1274 goto letter_quote;
1275 break;
1277 while (isalnum(c) || c == '_') {
1278 c = tok_nextc(tok);
1280 tok_backup(tok, c);
1281 *p_start = tok->start;
1282 *p_end = tok->cur;
1283 return NAME;
1286 /* Newline */
1287 if (c == '\n') {
1288 tok->atbol = 1;
1289 if (blankline || tok->level > 0)
1290 goto nextline;
1291 *p_start = tok->start;
1292 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1293 tok->cont_line = 0;
1294 return NEWLINE;
1297 /* Period or number starting with period? */
1298 if (c == '.') {
1299 c = tok_nextc(tok);
1300 if (isdigit(c)) {
1301 goto fraction;
1303 else {
1304 tok_backup(tok, c);
1305 *p_start = tok->start;
1306 *p_end = tok->cur;
1307 return DOT;
1311 /* Number */
1312 if (isdigit(c)) {
1313 if (c == '0') {
1314 /* Hex or octal -- maybe. */
1315 c = tok_nextc(tok);
1316 if (c == '.')
1317 goto fraction;
1318 #ifndef WITHOUT_COMPLEX
1319 if (c == 'j' || c == 'J')
1320 goto imaginary;
1321 #endif
1322 if (c == 'x' || c == 'X') {
1323 /* Hex */
1324 do {
1325 c = tok_nextc(tok);
1326 } while (isxdigit(c));
1328 else {
1329 int found_decimal = 0;
1330 /* Octal; c is first char of it */
1331 /* There's no 'isoctdigit' macro, sigh */
1332 while ('0' <= c && c < '8') {
1333 c = tok_nextc(tok);
1335 if (isdigit(c)) {
1336 found_decimal = 1;
1337 do {
1338 c = tok_nextc(tok);
1339 } while (isdigit(c));
1341 if (c == '.')
1342 goto fraction;
1343 else if (c == 'e' || c == 'E')
1344 goto exponent;
1345 #ifndef WITHOUT_COMPLEX
1346 else if (c == 'j' || c == 'J')
1347 goto imaginary;
1348 #endif
1349 else if (found_decimal) {
1350 tok->done = E_TOKEN;
1351 tok_backup(tok, c);
1352 return ERRORTOKEN;
1355 if (c == 'l' || c == 'L')
1356 c = tok_nextc(tok);
1358 else {
1359 /* Decimal */
1360 do {
1361 c = tok_nextc(tok);
1362 } while (isdigit(c));
1363 if (c == 'l' || c == 'L')
1364 c = tok_nextc(tok);
1365 else {
1366 /* Accept floating point numbers. */
1367 if (c == '.') {
1368 fraction:
1369 /* Fraction */
1370 do {
1371 c = tok_nextc(tok);
1372 } while (isdigit(c));
1374 if (c == 'e' || c == 'E') {
1375 exponent:
1376 /* Exponent part */
1377 c = tok_nextc(tok);
1378 if (c == '+' || c == '-')
1379 c = tok_nextc(tok);
1380 if (!isdigit(c)) {
1381 tok->done = E_TOKEN;
1382 tok_backup(tok, c);
1383 return ERRORTOKEN;
1385 do {
1386 c = tok_nextc(tok);
1387 } while (isdigit(c));
1389 #ifndef WITHOUT_COMPLEX
1390 if (c == 'j' || c == 'J')
1391 /* Imaginary part */
1392 imaginary:
1393 c = tok_nextc(tok);
1394 #endif
1397 tok_backup(tok, c);
1398 *p_start = tok->start;
1399 *p_end = tok->cur;
1400 return NUMBER;
1403 letter_quote:
1404 /* String */
1405 if (c == '\'' || c == '"') {
1406 Py_ssize_t quote2 = tok->cur - tok->start + 1;
1407 int quote = c;
1408 int triple = 0;
1409 int tripcount = 0;
1410 for (;;) {
1411 c = tok_nextc(tok);
1412 if (c == '\n') {
1413 if (!triple) {
1414 tok->done = E_EOLS;
1415 tok_backup(tok, c);
1416 return ERRORTOKEN;
1418 tripcount = 0;
1419 tok->cont_line = 1; /* multiline string. */
1421 else if (c == EOF) {
1422 if (triple)
1423 tok->done = E_EOFS;
1424 else
1425 tok->done = E_EOLS;
1426 tok->cur = tok->inp;
1427 return ERRORTOKEN;
1429 else if (c == quote) {
1430 tripcount++;
1431 if (tok->cur - tok->start == quote2) {
1432 c = tok_nextc(tok);
1433 if (c == quote) {
1434 triple = 1;
1435 tripcount = 0;
1436 continue;
1438 tok_backup(tok, c);
1440 if (!triple || tripcount == 3)
1441 break;
1443 else if (c == '\\') {
1444 tripcount = 0;
1445 c = tok_nextc(tok);
1446 if (c == EOF) {
1447 tok->done = E_EOLS;
1448 tok->cur = tok->inp;
1449 return ERRORTOKEN;
1452 else
1453 tripcount = 0;
1455 *p_start = tok->start;
1456 *p_end = tok->cur;
1457 return STRING;
1460 /* Line continuation */
1461 if (c == '\\') {
1462 c = tok_nextc(tok);
1463 if (c != '\n') {
1464 tok->done = E_LINECONT;
1465 tok->cur = tok->inp;
1466 return ERRORTOKEN;
1468 tok->cont_line = 1;
1469 goto again; /* Read next line */
1472 /* Check for two-character token */
1474 int c2 = tok_nextc(tok);
1475 int token = PyToken_TwoChars(c, c2);
1476 if (token != OP) {
1477 int c3 = tok_nextc(tok);
1478 int token3 = PyToken_ThreeChars(c, c2, c3);
1479 if (token3 != OP) {
1480 token = token3;
1481 } else {
1482 tok_backup(tok, c3);
1484 *p_start = tok->start;
1485 *p_end = tok->cur;
1486 return token;
1488 tok_backup(tok, c2);
1491 /* Keep track of parentheses nesting level */
1492 switch (c) {
1493 case '(':
1494 case '[':
1495 case '{':
1496 tok->level++;
1497 break;
1498 case ')':
1499 case ']':
1500 case '}':
1501 tok->level--;
1502 break;
1505 /* Punctuation character */
1506 *p_start = tok->start;
1507 *p_end = tok->cur;
1508 return PyToken_OneChar(c);
1512 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1514 int result = tok_get(tok, p_start, p_end);
1515 if (tok->decoding_erred) {
1516 result = ERRORTOKEN;
1517 tok->done = E_DECODE;
1519 return result;
#ifdef Py_DEBUG

/* Debug helper (Py_DEBUG builds only): print a token's name and,
   for tokens that carry text, its source slice.  The prints are the
   function's entire purpose, so they stay. */
void
tok_dump(int type, char *start, char *end)
{
	printf("%s", _PyParser_TokenNames[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}

#endif