Parser/tokenizer.c
/* Tokenizer implementation */

#include "Python.h"
#include "pgenheaders.h"

#include <ctype.h>
#include <assert.h>

#include "tokenizer.h"
#include "errcode.h"

#ifndef PGEN
#include "unicodeobject.h"
#include "stringobject.h"
#include "fileobject.h"
#include "codecs.h"
#include "abstract.h"
#endif /* PGEN */

extern char *PyOS_Readline(FILE *, FILE *, char *);
/* Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)	(c)
#else
#define Py_CHARMASK(c)	((c) & 0xff)
#endif

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);

/* Token names */

char *_PyParser_TokenNames[] = {
	"ENDMARKER",
	"NAME",
	"NUMBER",
	"STRING",
	"NEWLINE",
	"INDENT",
	"DEDENT",
	"LPAR",
	"RPAR",
	"LSQB",
	"RSQB",
	"COLON",
	"COMMA",
	"SEMI",
	"PLUS",
	"MINUS",
	"STAR",
	"SLASH",
	"VBAR",
	"AMPER",
	"LESS",
	"GREATER",
	"EQUAL",
	"DOT",
	"PERCENT",
	"BACKQUOTE",
	"LBRACE",
	"RBRACE",
	"EQEQUAL",
	"NOTEQUAL",
	"LESSEQUAL",
	"GREATEREQUAL",
	"TILDE",
	"CIRCUMFLEX",
	"LEFTSHIFT",
	"RIGHTSHIFT",
	"DOUBLESTAR",
	"PLUSEQUAL",
	"MINEQUAL",
	"STAREQUAL",
	"SLASHEQUAL",
	"PERCENTEQUAL",
	"AMPEREQUAL",
	"VBAREQUAL",
	"CIRCUMFLEXEQUAL",
	"LEFTSHIFTEQUAL",
	"RIGHTSHIFTEQUAL",
	"DOUBLESTAREQUAL",
	"DOUBLESLASH",
	"DOUBLESLASHEQUAL",
	"AT",
	/* This table must match the #defines in token.h! */
	"OP",
	"<ERRORTOKEN>",
	"<N_TOKENS>"
};


/* Create and initialize a new tok_state structure */

static struct tok_state *
tok_new(void)
{
	struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
						sizeof(struct tok_state));
	if (tok == NULL)
		return NULL;
	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
	tok->done = E_OK;
	tok->fp = NULL;
	tok->tabsize = TABSIZE;
	tok->indent = 0;
	tok->indstack[0] = 0;
	tok->atbol = 1;
	tok->pendin = 0;
	tok->prompt = tok->nextprompt = NULL;
	tok->lineno = 0;
	tok->level = 0;
	tok->filename = NULL;
	tok->altwarning = 0;
	tok->alterror = 0;
	tok->alttabsize = 1;
	tok->altindstack[0] = 0;
	tok->decoding_state = 0;
	tok->decoding_erred = 0;
	tok->read_coding_spec = 0;
	tok->encoding = NULL;
	tok->cont_line = 0;
#ifndef PGEN
	tok->decoding_readline = NULL;
	tok->decoding_buffer = NULL;
#endif
	return tok;
}

#ifdef PGEN

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	return fgets(s, size, tok->fp);
}

static int
decoding_feof(struct tok_state *tok)
{
	return feof(tok->fp);
}

static const char *
decode_str(const char *str, struct tok_state *tok)
{
	return str;
}

#else /* PGEN */

static char *
error_ret(struct tok_state *tok) /* XXX */
{
	tok->decoding_erred = 1;
	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
		PyMem_FREE(tok->buf);
	tok->buf = NULL;
	return NULL;		/* as if it were EOF */
}

static char *
new_string(const char *s, Py_ssize_t len)
{
	char* result = (char *)PyMem_MALLOC(len + 1);
	if (result != NULL) {
		memcpy(result, s, len);
		result[len] = '\0';
	}
	return result;
}

static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
	char buf[13];
	int i;
	for (i = 0; i < 12; i++) {
		int c = s[i];
		if (c == '\0') break;
		else if (c == '_') buf[i] = '-';
		else buf[i] = tolower(c);
	}
	buf[i] = '\0';
	if (strcmp(buf, "utf-8") == 0 ||
	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
	else if (strcmp(buf, "latin-1") == 0 ||
		 strcmp(buf, "iso-8859-1") == 0 ||
		 strcmp(buf, "iso-latin-1") == 0 ||
		 strncmp(buf, "latin-1-", 8) == 0 ||
		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
	else return s;
}

/* Return the coding spec in S, or NULL if none is found.  */
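/* A typical declaration is the Emacs/PEP 263 form

       # -*- coding: iso-8859-1 -*-

   but any comment containing "coding" immediately followed by ':' or
   '=' and an encoding name made of alphanumerics, '-', '_' and '.'
   is accepted by the scan below. */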
static char *
get_coding_spec(const char *s, Py_ssize_t size)
{
	Py_ssize_t i;
	/* Coding spec must be in a comment, and that comment must be
	 * the only statement on the source code line. */
	for (i = 0; i < size - 6; i++) {
		if (s[i] == '#')
			break;
		if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
			return NULL;
	}
	for (; i < size - 6; i++) { /* XXX inefficient search */
		const char* t = s + i;
		if (strncmp(t, "coding", 6) == 0) {
			const char* begin = NULL;
			t += 6;
			if (t[0] != ':' && t[0] != '=')
				continue;
			do {
				t++;
			} while (t[0] == '\x20' || t[0] == '\t');

			begin = t;
			while (isalnum(Py_CHARMASK(t[0])) ||
			       t[0] == '-' || t[0] == '_' || t[0] == '.')
				t++;

			if (begin < t) {
				char* r = new_string(begin, t - begin);
				char* q = get_normal_name(r);
				if (r != q) {
					PyMem_FREE(r);
					r = new_string(q, strlen(q));
				}
				return r;
			}
		}
	}
	return NULL;
}

/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.  */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
		  int set_readline(struct tok_state *, const char *))
{
	char * cs;
	int r = 1;

	if (tok->cont_line)
		/* It's a continuation line, so it can't be a coding spec. */
		return 1;
	cs = get_coding_spec(line, size);
	if (cs != NULL) {
		tok->read_coding_spec = 1;
		if (tok->encoding == NULL) {
			assert(tok->decoding_state == 1); /* raw */
			if (strcmp(cs, "utf-8") == 0 ||
			    strcmp(cs, "iso-8859-1") == 0) {
				tok->encoding = cs;
			} else {
#ifdef Py_USING_UNICODE
				r = set_readline(tok, cs);
				if (r) {
					tok->encoding = cs;
					tok->decoding_state = -1;
				}
				else
					PyMem_FREE(cs);
#else
				/* Without Unicode support, we cannot
				   process the coding spec. Since there
				   won't be any Unicode literals, that
				   won't matter. */
				PyMem_FREE(cs);
#endif
			}
		} else {	/* then, compare cs with BOM */
			r = (strcmp(tok->encoding, cs) == 0);
			PyMem_FREE(cs);
		}
	}
	if (!r) {
		cs = tok->encoding;
		if (!cs)
			cs = "with BOM";
		PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
	}
	return r;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */
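/* The UTF-8 BOM is the byte sequence 0xEF 0xBB 0xBF; the UTF-16 BOMs
   (0xFE 0xFF and 0xFF 0xFE) are recognized below but their handling
   is currently compiled out with "#if 0". */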
static int
check_bom(int get_char(struct tok_state *),
	  void unget_char(int, struct tok_state *),
	  int set_readline(struct tok_state *, const char *),
	  struct tok_state *tok)
{
	int ch = get_char(tok);
	tok->decoding_state = 1;
	if (ch == EOF) {
		return 1;
	} else if (ch == 0xEF) {
		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
#if 0
	/* Disable support for UTF-16 BOMs until a decision
	   is made whether this needs to be supported.  */
	} else if (ch == 0xFE) {
		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
		if (!set_readline(tok, "utf-16-be")) return 0;
		tok->decoding_state = -1;
	} else if (ch == 0xFF) {
		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
		if (!set_readline(tok, "utf-16-le")) return 0;
		tok->decoding_state = -1;
#endif
	} else {
		unget_char(ch, tok);
		return 1;
	}
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string("utf-8", 5);	/* resulting is in utf-8 */
	return 1;
  NON_BOM:
	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
	return 1;
}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
	stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
	(in the s buffer) to copy entire contents of the line read
	by tok->decoding_readline.  tok->decoding_buffer has the overflow.
	In this case, fp_readl is called in a loop (with an expanded buffer)
	until the buffer ends with a '\n' (or until the end of the file is
	reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
	/* In a non-Unicode built, this should never be called. */
	Py_FatalError("fp_readl should not be called in this build.");
	return NULL; /* Keep compiler happy (not reachable) */
#else
	PyObject* utf8 = NULL;
	PyObject* buf = tok->decoding_buffer;
	char *str;
	Py_ssize_t utf8len;

	/* Ask for one less byte so we can terminate it */
	assert(size > 0);
	size--;

	if (buf == NULL) {
		buf = PyObject_CallObject(tok->decoding_readline, NULL);
		if (buf == NULL)
			return error_ret(tok);
	} else {
		tok->decoding_buffer = NULL;
		if (PyString_CheckExact(buf))
			utf8 = buf;
	}
	if (utf8 == NULL) {
		utf8 = PyUnicode_AsUTF8String(buf);
		Py_DECREF(buf);
		if (utf8 == NULL)
			return error_ret(tok);
	}
	str = PyString_AsString(utf8);
	utf8len = PyString_GET_SIZE(utf8);
	if (utf8len > size) {
		tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
		if (tok->decoding_buffer == NULL) {
			Py_DECREF(utf8);
			return error_ret(tok);
		}
		utf8len = size;
	}
	memcpy(s, str, utf8len);
	s[utf8len] = '\0';
	Py_DECREF(utf8);
	if (utf8len == 0) return NULL; /* EOF */
	return s;
#endif
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
	PyObject *reader, *stream, *readline;

	/* XXX: constify filename argument. */
	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
	if (stream == NULL)
		return 0;

	reader = PyCodec_StreamReader(enc, stream, NULL);
	Py_DECREF(stream);
	if (reader == NULL)
		return 0;

	readline = PyObject_GetAttrString(reader, "readline");
	Py_DECREF(reader);
	if (readline == NULL)
		return 0;

	tok->decoding_readline = readline;
	return 1;
}

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
	return getc(tok->fp);
}

/* Unfetch the last byte back into TOK. */

static void fp_ungetc(int c, struct tok_state *tok) {
	ungetc(c, tok->fp);
}

/* Read a line of input from TOK. Determine encoding
   if necessary.  */
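/* tok->decoding_state drives the choice below:
     > 0  read raw bytes (no codec needed or not yet determined),
     < 0  read through the codec's readline (fp_readl),
    == 0  the BOM check has not run yet. */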
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
	char *line = NULL;
	int badchar = 0;
	for (;;) {
		if (tok->decoding_state < 0) {
			/* We already have a codec associated with
			   this input. */
			line = fp_readl(s, size, tok);
			break;
		} else if (tok->decoding_state > 0) {
			/* We want a 'raw' read. */
			line = Py_UniversalNewlineFgets(s, size,
							tok->fp, NULL);
			break;
		} else {
			/* We have not yet determined the encoding.
			   If an encoding is found, use the file-pointer
			   reader functions from now on. */
			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
				return error_ret(tok);
			assert(tok->decoding_state != 0);
		}
	}
	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
			return error_ret(tok);
		}
	}
#ifndef PGEN
	/* The default encoding is ASCII, so make sure we don't have any
	   non-ASCII bytes in it. */
	if (line && !tok->encoding) {
		unsigned char *c;
		for (c = (unsigned char *)line; *c; c++)
			if (*c > 127) {
				badchar = *c;
				break;
			}
	}
	if (badchar) {
		char buf[500];
		/* Need to add 1 to the line number, since this line
		   has not been counted, yet.  */
		sprintf(buf,
			"Non-ASCII character '\\x%.2x' "
			"in file %.200s on line %i, "
			"but no encoding declared; "
			"see http://www.python.org/peps/pep-0263.html for details",
			badchar, tok->filename, tok->lineno + 1);
		PyErr_SetString(PyExc_SyntaxError, buf);
		return error_ret(tok);
	}
#endif
	return line;
}

static int
decoding_feof(struct tok_state *tok)
{
	if (tok->decoding_state >= 0) {
		return feof(tok->fp);
	} else {
		PyObject* buf = tok->decoding_buffer;
		if (buf == NULL) {
			buf = PyObject_CallObject(tok->decoding_readline, NULL);
			if (buf == NULL) {
				error_ret(tok);
				return 1;
			} else {
				tok->decoding_buffer = buf;
			}
		}
		return PyObject_Length(buf) == 0;
	}
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
	return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
	tok->str--;
	assert(Py_CHARMASK(*tok->str) == c);	/* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
	tok->enc = enc;
	return 1;
}

/* Return a UTF-8 encoding Python string object from the
   C byte string STR, which is encoded with ENC. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc) {
	PyObject *utf8;
	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
	if (buf == NULL)
		return NULL;
	utf8 = PyUnicode_AsUTF8String(buf);
	Py_DECREF(buf);
	return utf8;
}
#endif

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */
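/* decode_str() below works in three steps: check for a BOM using the
   buf_* callbacks, scan at most the first two lines for a coding
   declaration, and, if an encoding was recorded in tok->enc, recode
   the whole buffer to UTF-8 (keeping the UTF-8 object alive through
   tok->decoding_buffer). */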
static const char *
decode_str(const char *str, struct tok_state *tok)
{
	PyObject* utf8 = NULL;
	const char *s;
	int lineno = 0;
	tok->enc = NULL;
	tok->str = str;
	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
		return error_ret(tok);
	str = tok->str;		/* string after BOM if any */
	assert(str);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL)
			return error_ret(tok);
		str = PyString_AsString(utf8);
	}
#endif
	for (s = str;; s++) {
		if (*s == '\0') break;
		else if (*s == '\n') {
			lineno++;
			if (lineno == 2) break;
		}
	}
	tok->enc = NULL;
	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
		return error_ret(tok);
#ifdef Py_USING_UNICODE
	if (tok->enc != NULL) {
		assert(utf8 == NULL);
		utf8 = translate_into_utf8(str, tok->enc);
		if (utf8 == NULL) {
			PyErr_Format(PyExc_SyntaxError,
				"unknown encoding: %s", tok->enc);
			return error_ret(tok);
		}
		str = PyString_AsString(utf8);
	}
#endif
	assert(tok->decoding_buffer == NULL);
	tok->decoding_buffer = utf8; /* CAUTION */
	return str;
}

#endif /* PGEN */

/* Set up tokenizer for string */

struct tok_state *
PyTokenizer_FromString(const char *str)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	str = (char *)decode_str(str, tok);
	if (str == NULL) {
		PyTokenizer_Free(tok);
		return NULL;
	}

	/* XXX: constify members. */
	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
	return tok;
}

/* Set up tokenizer for file */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)
{
	struct tok_state *tok = tok_new();
	if (tok == NULL)
		return NULL;
	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
		PyTokenizer_Free(tok);
		return NULL;
	}
	tok->cur = tok->inp = tok->buf;
	tok->end = tok->buf + BUFSIZ;
	tok->fp = fp;
	tok->prompt = ps1;
	tok->nextprompt = ps2;
	return tok;
}

/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
#ifndef PGEN
	Py_XDECREF(tok->decoding_readline);
	Py_XDECREF(tok->decoding_buffer);
#endif
	if (tok->fp != NULL && tok->buf != NULL)
		PyMem_FREE(tok->buf);
	PyMem_FREE(tok);
}
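/* A minimal sketch of how a caller such as the parser driver
   (Parser/parsetok.c) drives this API; illustrative only, the string
   literal and the loop body here are made up:

	struct tok_state *tok = PyTokenizer_FromString("x = 1\n");
	char *start, *end;
	int type;
	if (tok != NULL) {
		while ((type = PyTokenizer_Get(tok, &start, &end)) != ENDMARKER) {
			if (type == ERRORTOKEN)
				break;
			...process the token text in [start, end)...
		}
		PyTokenizer_Free(tok);
	}
*/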
#if !defined(PGEN) && defined(Py_USING_UNICODE)
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
	PyObject *enc, *sysstdin, *decoded, *utf8;
	const char *encoding;
	char *converted;

	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
		return 0;
	sysstdin = PySys_GetObject("stdin");
	if (sysstdin == NULL || !PyFile_Check(sysstdin))
		return 0;

	enc = ((PyFileObject *)sysstdin)->f_encoding;
	if (enc == NULL || !PyString_Check(enc))
		return 0;
	Py_INCREF(enc);

	encoding = PyString_AsString(enc);
	decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
	if (decoded == NULL)
		goto error_clear;

	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
	Py_DECREF(decoded);
	if (utf8 == NULL)
		goto error_clear;

	assert(PyString_Check(utf8));
	converted = new_string(PyString_AS_STRING(utf8),
			       PyString_GET_SIZE(utf8));
	Py_DECREF(utf8);
	if (converted == NULL)
		goto error_nomem;

	PyMem_FREE(*inp);
	*inp = converted;
	if (tok->encoding != NULL)
		PyMem_FREE(tok->encoding);
	tok->encoding = new_string(encoding, strlen(encoding));
	if (tok->encoding == NULL)
		goto error_nomem;

	Py_DECREF(enc);
	return 0;

error_nomem:
	Py_DECREF(enc);
	tok->done = E_NOMEM;
	return -1;

error_clear:
	/* Fallback to iso-8859-1: for backward compatibility */
	Py_DECREF(enc);
	PyErr_Clear();
	return 0;
}
#endif

/* Get next char, updating state; error code goes into tok->done */
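/* Three input sources are handled below: an in-memory string
   (tok->fp == NULL), an interactive prompt (tok->prompt != NULL,
   read via PyOS_Readline), and an ordinary file (read via
   decoding_fgets, growing tok->buf until a whole line is in). */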
static int
tok_nextc(register struct tok_state *tok)
{
	for (;;) {
		if (tok->cur != tok->inp) {
			return Py_CHARMASK(*tok->cur++); /* Fast path */
		}
		if (tok->done != E_OK)
			return EOF;
		if (tok->fp == NULL) {
			char *end = strchr(tok->inp, '\n');
			if (end != NULL)
				end++;
			else {
				end = strchr(tok->inp, '\0');
				if (end == tok->inp) {
					tok->done = E_EOF;
					return EOF;
				}
			}
			if (tok->start == NULL)
				tok->buf = tok->cur;
			tok->line_start = tok->cur;
			tok->lineno++;
			tok->inp = end;
			return Py_CHARMASK(*tok->cur++);
		}
		if (tok->prompt != NULL) {
			char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
			if (tok->nextprompt != NULL)
				tok->prompt = tok->nextprompt;
			if (newtok == NULL)
				tok->done = E_INTR;
			else if (*newtok == '\0') {
				PyMem_FREE(newtok);
				tok->done = E_EOF;
			}
#if !defined(PGEN) && defined(Py_USING_UNICODE)
			else if (tok_stdin_decode(tok, &newtok) != 0)
				PyMem_FREE(newtok);
#endif
			else if (tok->start != NULL) {
				size_t start = tok->start - tok->buf;
				size_t oldlen = tok->cur - tok->buf;
				size_t newlen = oldlen + strlen(newtok);
				char *buf = tok->buf;
				buf = (char *)PyMem_REALLOC(buf, newlen+1);
				tok->lineno++;
				if (buf == NULL) {
					PyMem_FREE(tok->buf);
					tok->buf = NULL;
					PyMem_FREE(newtok);
					tok->done = E_NOMEM;
					return EOF;
				}
				tok->buf = buf;
				tok->cur = tok->buf + oldlen;
				tok->line_start = tok->cur;
				strcpy(tok->buf + oldlen, newtok);
				PyMem_FREE(newtok);
				tok->inp = tok->buf + newlen;
				tok->end = tok->inp + 1;
				tok->start = tok->buf + start;
			}
			else {
				tok->lineno++;
				if (tok->buf != NULL)
					PyMem_FREE(tok->buf);
				tok->buf = newtok;
				tok->line_start = tok->buf;
				tok->cur = tok->buf;
				tok->line_start = tok->buf;
				tok->inp = strchr(tok->buf, '\0');
				tok->end = tok->inp + 1;
			}
		}
		else {
			int done = 0;
			Py_ssize_t cur = 0;
			char *pt;
			if (tok->start == NULL) {
				if (tok->buf == NULL) {
					tok->buf = (char *)
						PyMem_MALLOC(BUFSIZ);
					if (tok->buf == NULL) {
						tok->done = E_NOMEM;
						return EOF;
					}
					tok->end = tok->buf + BUFSIZ;
				}
				if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
					  tok) == NULL) {
					tok->done = E_EOF;
					done = 1;
				}
				else {
					tok->done = E_OK;
					tok->inp = strchr(tok->buf, '\0');
					done = tok->inp[-1] == '\n';
				}
			}
			else {
				cur = tok->cur - tok->buf;
				if (decoding_feof(tok)) {
					tok->done = E_EOF;
					done = 1;
				}
				else
					tok->done = E_OK;
			}
			tok->lineno++;
			/* Read until '\n' or EOF */
			while (!done) {
				Py_ssize_t curstart = tok->start == NULL ? -1 :
					  tok->start - tok->buf;
				Py_ssize_t curvalid = tok->inp - tok->buf;
				Py_ssize_t newsize = curvalid + BUFSIZ;
				char *newbuf = tok->buf;
				newbuf = (char *)PyMem_REALLOC(newbuf,
							       newsize);
				if (newbuf == NULL) {
					tok->done = E_NOMEM;
					tok->cur = tok->inp;
					return EOF;
				}
				tok->buf = newbuf;
				tok->inp = tok->buf + curvalid;
				tok->end = tok->buf + newsize;
				tok->start = curstart < 0 ? NULL :
					     tok->buf + curstart;
				if (decoding_fgets(tok->inp,
					       (int)(tok->end - tok->inp),
					       tok) == NULL) {
					/* Break out early on decoding
					   errors, as tok->buf will be NULL. */
					if (tok->decoding_erred)
						return EOF;
					/* Last line does not end in \n,
					   fake one */
					strcpy(tok->inp, "\n");
				}
				tok->inp = strchr(tok->inp, '\0');
				done = tok->inp[-1] == '\n';
			}
			if (tok->buf != NULL) {
				tok->cur = tok->buf + cur;
				tok->line_start = tok->cur;
				/* replace "\r\n" with "\n" */
				/* For Mac leave the \r, giving a syntax error */
				pt = tok->inp - 2;
				if (pt >= tok->buf && *pt == '\r') {
					*pt++ = '\n';
					*pt = '\0';
					tok->inp = pt;
				}
			}
		}
		if (tok->done != E_OK) {
			if (tok->prompt != NULL)
				PySys_WriteStderr("\n");
			tok->cur = tok->inp;
			return EOF;
		}
	}
	/*NOTREACHED*/
}


/* Back-up one character */

static void
tok_backup(register struct tok_state *tok, register int c)
{
	if (c != EOF) {
		if (--tok->cur < tok->buf)
			Py_FatalError("tok_backup: begin of buffer");
		if (*tok->cur != c)
			*tok->cur = c;
	}
}


/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c)
{
	switch (c) {
	case '(':	return LPAR;
	case ')':	return RPAR;
	case '[':	return LSQB;
	case ']':	return RSQB;
	case ':':	return COLON;
	case ',':	return COMMA;
	case ';':	return SEMI;
	case '+':	return PLUS;
	case '-':	return MINUS;
	case '*':	return STAR;
	case '/':	return SLASH;
	case '|':	return VBAR;
	case '&':	return AMPER;
	case '<':	return LESS;
	case '>':	return GREATER;
	case '=':	return EQUAL;
	case '.':	return DOT;
	case '%':	return PERCENT;
	case '`':	return BACKQUOTE;
	case '{':	return LBRACE;
	case '}':	return RBRACE;
	case '^':	return CIRCUMFLEX;
	case '~':	return TILDE;
	case '@':	return AT;
	default:	return OP;
	}
}


int
PyToken_TwoChars(int c1, int c2)
{
	switch (c1) {
	case '=':
		switch (c2) {
		case '=':	return EQEQUAL;
		}
		break;
	case '!':
		switch (c2) {
		case '=':	return NOTEQUAL;
		}
		break;
	case '<':
		switch (c2) {
		case '>':	return NOTEQUAL;
		case '=':	return LESSEQUAL;
		case '<':	return LEFTSHIFT;
		}
		break;
	case '>':
		switch (c2) {
		case '=':	return GREATEREQUAL;
		case '>':	return RIGHTSHIFT;
		}
		break;
	case '+':
		switch (c2) {
		case '=':	return PLUSEQUAL;
		}
		break;
	case '-':
		switch (c2) {
		case '=':	return MINEQUAL;
		}
		break;
	case '*':
		switch (c2) {
		case '*':	return DOUBLESTAR;
		case '=':	return STAREQUAL;
		}
		break;
	case '/':
		switch (c2) {
		case '/':	return DOUBLESLASH;
		case '=':	return SLASHEQUAL;
		}
		break;
	case '|':
		switch (c2) {
		case '=':	return VBAREQUAL;
		}
		break;
	case '%':
		switch (c2) {
		case '=':	return PERCENTEQUAL;
		}
		break;
	case '&':
		switch (c2) {
		case '=':	return AMPEREQUAL;
		}
		break;
	case '^':
		switch (c2) {
		case '=':	return CIRCUMFLEXEQUAL;
		}
		break;
	}
	return OP;
}


int
PyToken_ThreeChars(int c1, int c2, int c3)
{
	switch (c1) {
	case '<':
		switch (c2) {
		case '<':
			switch (c3) {
			case '=':
				return LEFTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '>':
		switch (c2) {
		case '>':
			switch (c3) {
			case '=':
				return RIGHTSHIFTEQUAL;
			}
			break;
		}
		break;
	case '*':
		switch (c2) {
		case '*':
			switch (c3) {
			case '=':
				return DOUBLESTAREQUAL;
			}
			break;
		}
		break;
	case '/':
		switch (c2) {
		case '/':
			switch (c3) {
			case '=':
				return DOUBLESLASHEQUAL;
			}
			break;
		}
		break;
	}
	return OP;
}


static int
indenterror(struct tok_state *tok)
{
	if (tok->alterror) {
		tok->done = E_TABSPACE;
		tok->cur = tok->inp;
		return 1;
	}
	if (tok->altwarning) {
		PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
				  "in indentation\n", tok->filename);
		tok->altwarning = 0;
	}
	return 0;
}


/* Get next token, after space stripping etc. */

static int
tok_get(register struct tok_state *tok, char **p_start, char **p_end)
{
	register int c;
	int blankline;

	*p_start = *p_end = NULL;
  nextline:
	tok->start = NULL;
	blankline = 0;

	/* Get indentation level */
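	/* 'col' measures the indentation with the configurable tab size
	   (tok->tabsize, default 8); 'altcol' measures it again with
	   tok->alttabsize (1).  If the two measurements disagree with the
	   recorded indent stacks, the indentation depends on the tab size
	   and indenterror() warns or raises E_TABSPACE. */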
	if (tok->atbol) {
		register int col = 0;
		register int altcol = 0;
		tok->atbol = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == ' ')
				col++, altcol++;
			else if (c == '\t') {
				col = (col/tok->tabsize + 1) * tok->tabsize;
				altcol = (altcol/tok->alttabsize + 1)
					* tok->alttabsize;
			}
			else if (c == '\014') /* Control-L (formfeed) */
				col = altcol = 0; /* For Emacs users */
			else
				break;
		}
		tok_backup(tok, c);
		if (c == '#' || c == '\n') {
			/* Lines with only whitespace and/or comments
			   shouldn't affect the indentation and are
			   not passed to the parser as NEWLINE tokens,
			   except *totally* empty lines in interactive
			   mode, which signal the end of a command group. */
			if (col == 0 && c == '\n' && tok->prompt != NULL)
				blankline = 0; /* Let it through */
			else
				blankline = 1; /* Ignore completely */
			/* We can't jump back right here since we still
			   may need to skip to the end of a comment */
		}
		if (!blankline && tok->level == 0) {
			if (col == tok->indstack[tok->indent]) {
				/* No change */
				if (altcol != tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
			}
			else if (col > tok->indstack[tok->indent]) {
				/* Indent -- always one */
				if (tok->indent+1 >= MAXINDENT) {
					tok->done = E_TOODEEP;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
				if (altcol <= tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
				tok->pendin++;
				tok->indstack[++tok->indent] = col;
				tok->altindstack[tok->indent] = altcol;
			}
			else /* col < tok->indstack[tok->indent] */ {
				/* Dedent -- any number, must be consistent */
				while (tok->indent > 0 &&
					col < tok->indstack[tok->indent]) {
					tok->pendin--;
					tok->indent--;
				}
				if (col != tok->indstack[tok->indent]) {
					tok->done = E_DEDENT;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
				if (altcol != tok->altindstack[tok->indent]) {
					if (indenterror(tok))
						return ERRORTOKEN;
				}
			}
		}
	}

	tok->start = tok->cur;

	/* Return pending indents/dedents */
	if (tok->pendin != 0) {
		if (tok->pendin < 0) {
			tok->pendin++;
			return DEDENT;
		}
		else {
			tok->pendin--;
			return INDENT;
		}
	}

 again:
	tok->start = NULL;
	/* Skip spaces */
	do {
		c = tok_nextc(tok);
	} while (c == ' ' || c == '\t' || c == '\014');

	/* Set start of current token */
	tok->start = tok->cur - 1;

	/* Skip comment, while looking for tab-setting magic */
	if (c == '#') {
		static char *tabforms[] = {
			"tab-width:",		/* Emacs */
			":tabstop=",		/* vim, full form */
			":ts=",			/* vim, abbreviated form */
			"set tabsize=",		/* will vi never die? */
		/* more templates can be added here to support other editors */
		};
		char cbuf[80];
		char *tp, **cp;
		tp = cbuf;
		do {
			*tp++ = c = tok_nextc(tok);
		} while (c != EOF && c != '\n' &&
			 (size_t)(tp - cbuf + 1) < sizeof(cbuf));
		*tp = '\0';
		for (cp = tabforms;
		     cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
		     cp++) {
			if ((tp = strstr(cbuf, *cp))) {
				int newsize = atoi(tp + strlen(*cp));

				if (newsize >= 1 && newsize <= 40) {
					tok->tabsize = newsize;
					if (Py_VerboseFlag)
					    PySys_WriteStderr(
						"Tab size set to %d\n",
						newsize);
				}
			}
		}
		while (c != EOF && c != '\n')
			c = tok_nextc(tok);
	}

	/* Check for EOF and errors now */
	if (c == EOF) {
		return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
	}

	/* Identifier (most frequent token!) */
	if (isalpha(c) || c == '_') {
		/* Process r"", u"" and ur"" */
		switch (c) {
		case 'r':
		case 'R':
			c = tok_nextc(tok);
			if (c == '"' || c == '\'')
				goto letter_quote;
			break;
		case 'u':
		case 'U':
			c = tok_nextc(tok);
			if (c == 'r' || c == 'R')
				c = tok_nextc(tok);
			if (c == '"' || c == '\'')
				goto letter_quote;
			break;
		}
		while (isalnum(c) || c == '_') {
			c = tok_nextc(tok);
		}
		tok_backup(tok, c);
		*p_start = tok->start;
		*p_end = tok->cur;
		return NAME;
	}

	/* Newline */
	if (c == '\n') {
		tok->atbol = 1;
		if (blankline || tok->level > 0)
			goto nextline;
		*p_start = tok->start;
		*p_end = tok->cur - 1; /* Leave '\n' out of the string */
		tok->cont_line = 0;
		return NEWLINE;
	}

	/* Period or number starting with period? */
	if (c == '.') {
		c = tok_nextc(tok);
		if (isdigit(c)) {
			goto fraction;
		}
		else {
			tok_backup(tok, c);
			*p_start = tok->start;
			*p_end = tok->cur;
			return DOT;
		}
	}

	/* Number */
	if (isdigit(c)) {
		if (c == '0') {
			/* Hex or octal -- maybe. */
			c = tok_nextc(tok);
			if (c == '.')
				goto fraction;
#ifndef WITHOUT_COMPLEX
			if (c == 'j' || c == 'J')
				goto imaginary;
#endif
			if (c == 'x' || c == 'X') {
				/* Hex */
				do {
					c = tok_nextc(tok);
				} while (isxdigit(c));
			}
			else {
				int found_decimal = 0;
				/* Octal; c is first char of it */
				/* There's no 'isoctdigit' macro, sigh */
				while ('0' <= c && c < '8') {
					c = tok_nextc(tok);
				}
				if (isdigit(c)) {
					found_decimal = 1;
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
				if (c == '.')
					goto fraction;
				else if (c == 'e' || c == 'E')
					goto exponent;
#ifndef WITHOUT_COMPLEX
				else if (c == 'j' || c == 'J')
					goto imaginary;
#endif
				else if (found_decimal) {
					tok->done = E_TOKEN;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
			}
			if (c == 'l' || c == 'L')
				c = tok_nextc(tok);
		}
		else {
			/* Decimal */
			do {
				c = tok_nextc(tok);
			} while (isdigit(c));
			if (c == 'l' || c == 'L')
				c = tok_nextc(tok);
			else {
				/* Accept floating point numbers. */
				if (c == '.') {
		fraction:
					/* Fraction */
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
				if (c == 'e' || c == 'E') {
		exponent:
					/* Exponent part */
					c = tok_nextc(tok);
					if (c == '+' || c == '-')
						c = tok_nextc(tok);
					if (!isdigit(c)) {
						tok->done = E_TOKEN;
						tok_backup(tok, c);
						return ERRORTOKEN;
					}
					do {
						c = tok_nextc(tok);
					} while (isdigit(c));
				}
#ifndef WITHOUT_COMPLEX
				if (c == 'j' || c == 'J')
					/* Imaginary part */
		imaginary:
					c = tok_nextc(tok);
#endif
			}
		}
		tok_backup(tok, c);
		*p_start = tok->start;
		*p_end = tok->cur;
		return NUMBER;
	}

  letter_quote:
	/* String */
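	/* 'quote2' is the offset (from the start of the token) of the
	   character just past the opening quote; it is used to detect an
	   immediately repeated quote, which may start a triple-quoted
	   string.  'tripcount' counts consecutive quote characters so the
	   end of a triple-quoted string can be recognized. */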
	if (c == '\'' || c == '"') {
		Py_ssize_t quote2 = tok->cur - tok->start + 1;
		int quote = c;
		int triple = 0;
		int tripcount = 0;
		for (;;) {
			c = tok_nextc(tok);
			if (c == '\n') {
				if (!triple) {
					tok->done = E_EOLS;
					tok_backup(tok, c);
					return ERRORTOKEN;
				}
				tripcount = 0;
				tok->cont_line = 1; /* multiline string. */
			}
			else if (c == EOF) {
				if (triple)
					tok->done = E_EOFS;
				else
					tok->done = E_EOLS;
				tok->cur = tok->inp;
				return ERRORTOKEN;
			}
			else if (c == quote) {
				tripcount++;
				if (tok->cur - tok->start == quote2) {
					c = tok_nextc(tok);
					if (c == quote) {
						triple = 1;
						tripcount = 0;
						continue;
					}
					tok_backup(tok, c);
				}
				if (!triple || tripcount == 3)
					break;
			}
			else if (c == '\\') {
				tripcount = 0;
				c = tok_nextc(tok);
				if (c == EOF) {
					tok->done = E_EOLS;
					tok->cur = tok->inp;
					return ERRORTOKEN;
				}
			}
			else
				tripcount = 0;
		}
		*p_start = tok->start;
		*p_end = tok->cur;
		return STRING;
	}

	/* Line continuation */
	if (c == '\\') {
		c = tok_nextc(tok);
		if (c != '\n') {
			tok->done = E_LINECONT;
			tok->cur = tok->inp;
			return ERRORTOKEN;
		}
		tok->cont_line = 1;
		goto again; /* Read next line */
	}

	/* Check for two-character token */
	{
		int c2 = tok_nextc(tok);
		int token = PyToken_TwoChars(c, c2);
		if (token != OP) {
			int c3 = tok_nextc(tok);
			int token3 = PyToken_ThreeChars(c, c2, c3);
			if (token3 != OP) {
				token = token3;
			} else {
				tok_backup(tok, c3);
			}
			*p_start = tok->start;
			*p_end = tok->cur;
			return token;
		}
		tok_backup(tok, c2);
	}

	/* Keep track of parentheses nesting level */
	switch (c) {
	case '(':
	case '[':
	case '{':
		tok->level++;
		break;
	case ')':
	case ']':
	case '}':
		tok->level--;
		break;
	}

	/* Punctuation character */
	*p_start = tok->start;
	*p_end = tok->cur;
	return PyToken_OneChar(c);
}

int
PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
{
	int result = tok_get(tok, p_start, p_end);
	if (tok->decoding_erred) {
		result = ERRORTOKEN;
		tok->done = E_DECODE;
	}
	return result;
}

#ifdef Py_DEBUG

void
tok_dump(int type, char *start, char *end)
{
	printf("%s", _PyParser_TokenNames[type]);
	if (type == NAME || type == NUMBER || type == STRING || type == OP)
		printf("(%.*s)", (int)(end - start), start);
}

#endif