2 /* Tokenizer implementation */
5 #include "pgenheaders.h"
10 #include "tokenizer.h"
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
21 extern char *PyOS_Readline(FILE *, FILE *, char *);
22 /* Return malloc'ed string including trailing \n;
23 empty malloc'ed string for EOF;
24 NULL if interrupted */
26 /* Don't ever change this -- it would break the portability of Python code */
29 /* Convert a possibly signed character to a nonnegative int */
30 /* XXX This assumes characters are 8 bits wide */
31 #ifdef __CHAR_UNSIGNED__
32 #define Py_CHARMASK(c) (c)
34 #define Py_CHARMASK(c) ((c) & 0xff)
38 static struct tok_state
*tok_new(void);
39 static int tok_nextc(struct tok_state
*tok
);
40 static void tok_backup(struct tok_state
*tok
, int c
);
44 char *_PyParser_TokenNames
[] = {
96 /* This table must match the #defines in token.h! */
103 /* Create and initialize a new tok_state structure */
/* tok_new: allocate a tok_state with PyMem_MALLOC and initialize its
   fields (buffer pointers NULL, tabsize TABSIZE, indent stacks zeroed,
   decoding state cleared).
   NOTE(review): garbled fragment -- many original lines are missing
   between the numbered statements; the leading numbers are residue of
   the original file's line numbering, not code. */
105 static struct tok_state
*
108 struct tok_state
*tok
= (struct tok_state
*)PyMem_MALLOC(
109 sizeof(struct tok_state
));
/* All buffer-management pointers start out NULL. */
112 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
115 tok
->tabsize
= TABSIZE
;
117 tok
->indstack
[0] = 0;
120 tok
->prompt
= tok
->nextprompt
= NULL
;
123 tok
->filename
= NULL
;
127 tok
->altindstack
[0] = 0;
/* Encoding/decoding machinery starts in the "undetermined" state. */
128 tok
->decoding_state
= 0;
129 tok
->decoding_erred
= 0;
130 tok
->read_coding_spec
= 0;
131 tok
->encoding
= NULL
;
134 tok
->decoding_readline
= NULL
;
135 tok
->decoding_buffer
= NULL
;
/* decoding_fgets (simple variant): plain fgets from tok->fp, no
   decoding.  NOTE(review): fragment -- the return-type line and
   surrounding #ifdef (presumably PGEN) are missing from this extract. */
143 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
145 return fgets(s
, size
, tok
->fp
);
/* decoding_feof (simple variant): plain feof on tok->fp.
   NOTE(review): fragment -- return type and enclosing guard missing. */
149 decoding_feof(struct tok_state
*tok
)
151 return feof(tok
->fp
);
/* decode_str (stub variant): signature only survives in this extract;
   the body (original lines after 155) is missing -- presumably the
   pass-through used when full decoding is compiled out. TODO confirm. */
155 decode_str(const char *str
, struct tok_state
*tok
)
/* error_ret: record a decoding error on TOK, free tok->buf when it was
   heap-owned (fp != NULL case; see PyTokenizer_Free), and return NULL
   so callers treat the failure as EOF.
   NOTE(review): fragment -- line 168 (between 167 and 169) is missing;
   presumably it reset tok->buf after the free. TODO confirm. */
163 error_ret(struct tok_state
*tok
) /* XXX */
165 tok
->decoding_erred
= 1;
166 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
) /* see PyTokenizer_Free */
167 PyMem_FREE(tok
->buf
);
169 return NULL
; /* as if it were EOF */
/* new_string: heap-copy LEN bytes of S into a fresh PyMem_MALLOC'ed
   buffer of LEN+1 (room for a terminator).  Caller owns the result.
   NOTE(review): fragment -- the NUL-termination and return statements
   (original lines after 177) are missing from this extract. */
173 new_string(const char *s
, Py_ssize_t len
)
175 char* result
= (char *)PyMem_MALLOC(len
+ 1);
176 if (result
!= NULL
) {
177 memcpy(result
, s
, len
);
/* get_normal_name: normalize a codec name for comparison -- lowercase
   up to 12 chars, map '_' to '-', then collapse the utf-8 and latin-1
   spelling families onto the canonical "utf-8" / "iso-8859-1".
   NOTE(review): fragment -- buf declaration, the c = s[i] read, and
   the fall-through return of the original name are missing here. */
184 get_normal_name(char *s
) /* for utf-8 and latin-1 */
188 for (i
= 0; i
< 12; i
++) {
190 if (c
== '\0') break;
191 else if (c
== '_') buf
[i
] = '-';
192 else buf
[i
] = tolower(c
);
195 if (strcmp(buf
, "utf-8") == 0 ||
196 strncmp(buf
, "utf-8-", 6) == 0) return "utf-8";
197 else if (strcmp(buf
, "latin-1") == 0 ||
198 strcmp(buf
, "iso-8859-1") == 0 ||
199 strcmp(buf
, "iso-latin-1") == 0 ||
200 strncmp(buf
, "latin-1-", 8) == 0 ||
201 strncmp(buf
, "iso-8859-1-", 11) == 0 ||
202 strncmp(buf
, "iso-latin-1-", 12) == 0) return "iso-8859-1";
206 /* Return the coding spec in S, or NULL if none is found. */
/* get_coding_spec: scan one source line for a PEP 263 "coding[:=]"
   declaration; on a hit, copy the codec token via new_string and
   canonicalize it with get_normal_name.  Returns a malloc'ed string,
   or (per the comment at the top of the file region) NULL when absent.
   NOTE(review): garbled fragment -- loop-variable declarations, the
   comment-detection branch and the returns are missing between the
   numbered statements. */
209 get_coding_spec(const char *s
, Py_ssize_t size
)
212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
/* Skip leading whitespace (space, tab, formfeed). */
214 for (i
= 0; i
< size
- 6; i
++) {
217 if (s
[i
] != ' ' && s
[i
] != '\t' && s
[i
] != '\014')
220 for (; i
< size
- 6; i
++) { /* XXX inefficient search */
221 const char* t
= s
+ i
;
222 if (strncmp(t
, "coding", 6) == 0) {
223 const char* begin
= NULL
;
225 if (t
[0] != ':' && t
[0] != '=')
229 } while (t
[0] == '\x20' || t
[0] == '\t');
/* Codec names are [A-Za-z0-9._-]+ */
232 while (isalnum(Py_CHARMASK(t
[0])) ||
233 t
[0] == '-' || t
[0] == '_' || t
[0] == '.')
237 char* r
= new_string(begin
, t
- begin
);
238 char* q
= get_normal_name(r
);
241 r
= new_string(q
, strlen(q
));
250 /* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
/* check_coding_spec: if LINE carries a coding spec, record it on TOK
   and (when no BOM-derived encoding exists yet) install the decoding
   readline via SET_READLINE; when an encoding already exists, require
   the spec to agree with it.  Returns 1 on success, 0 on failure with
   a SyntaxError set.
   NOTE(review): garbled fragment -- the continuation-line early
   return, cs NULL check, frees and final returns are missing. */
256 check_coding_spec(const char* line
, Py_ssize_t size
, struct tok_state
*tok
,
257 int set_readline(struct tok_state
*, const char *))
263 /* It's a continuation line, so it can't be a coding spec. */
265 cs
= get_coding_spec(line
, size
);
267 tok
->read_coding_spec
= 1;
268 if (tok
->encoding
== NULL
) {
269 assert(tok
->decoding_state
== 1); /* raw */
/* utf-8 / iso-8859-1 can be read raw; anything else needs a codec. */
270 if (strcmp(cs
, "utf-8") == 0 ||
271 strcmp(cs
, "iso-8859-1") == 0) {
274 #ifdef Py_USING_UNICODE
275 r
= set_readline(tok
, cs
);
278 tok
->decoding_state
= -1;
283 /* Without Unicode support, we cannot
284 process the coding spec. Since there
285 won't be any Unicode literals, that
290 } else { /* then, compare cs with BOM */
291 r
= (strcmp(tok
->encoding
, cs
) == 0);
299 PyErr_Format(PyExc_SyntaxError
, "encoding problem: %s", cs
);
304 /* See whether the file starts with a BOM. If it does,
305 invoke the set_readline function with the new encoding.
306 Return 1 on success, 0 on failure. */
/* check_bom: peek at the first bytes of the input through GET_CHAR /
   UNGET_CHAR.  A UTF-8 BOM (EF BB BF) sets tok->encoding to "utf-8";
   UTF-16 BOMs (FE FF / FF FE) would select a codec via SET_READLINE
   but that path is #if'd out per the comment below.  Returns 1 on
   success, 0 on failure.
   NOTE(review): garbled fragment -- the QUIT/NON_BOM labels, the
   plain-ASCII unget path and the final returns are missing. */
309 check_bom(int get_char(struct tok_state
*),
310 void unget_char(int, struct tok_state
*),
311 int set_readline(struct tok_state
*, const char *),
312 struct tok_state
*tok
)
314 int ch
= get_char(tok
);
315 tok
->decoding_state
= 1;
318 } else if (ch
== 0xEF) {
319 ch
= get_char(tok
); if (ch
!= 0xBB) goto NON_BOM
;
320 ch
= get_char(tok
); if (ch
!= 0xBF) goto NON_BOM
;
322 /* Disable support for UTF-16 BOMs until a decision
323 is made whether this needs to be supported. */
324 } else if (ch
== 0xFE) {
325 ch
= get_char(tok
); if (ch
!= 0xFF) goto NON_BOM
;
326 if (!set_readline(tok
, "utf-16-be")) return 0;
327 tok
->decoding_state
= -1;
328 } else if (ch
== 0xFF) {
329 ch
= get_char(tok
); if (ch
!= 0xFE) goto NON_BOM
;
330 if (!set_readline(tok
, "utf-16-le")) return 0;
331 tok
->decoding_state
= -1;
/* Replace any earlier encoding (e.g. a stale one) with utf-8. */
337 if (tok
->encoding
!= NULL
)
338 PyMem_FREE(tok
->encoding
);
339 tok
->encoding
= new_string("utf-8", 5); /* resulting is in utf-8 */
342 /* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
343 unget_char(0xFF, tok
); /* XXX this will cause a syntax error */
347 /* Read a line of text from TOK into S, using the stream in TOK.
348 Return NULL on failure, else S.
350 On entry, tok->decoding_buffer will be one of:
351 1) NULL: need to call tok->decoding_readline to get a new line
352 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
353 stored the result in tok->decoding_buffer
354 3) PyStringObject *: previous call to fp_readl did not have enough room
355 (in the s buffer) to copy entire contents of the line read
356 by tok->decoding_readline. tok->decoding_buffer has the overflow.
357 In this case, fp_readl is called in a loop (with an expanded buffer)
358 until the buffer ends with a '\n' (or until the end of the file is
359 reached): see tok_nextc and its calls to decoding_fgets.
/* fp_readl: read one decoded line into S (at most SIZE bytes) using
   tok->decoding_readline, re-encoding the unicode result to UTF-8.
   Overflow beyond SIZE is parked in tok->decoding_buffer for the next
   call (see the block comment above this function in the original).
   Returns S, or NULL on error/EOF (via error_ret).
   NOTE(review): garbled fragment -- str/utf8len declarations, the
   buf==NULL branch bodies, terminator write and final return/DECREFs
   are missing between the numbered statements. */
363 fp_readl(char *s
, int size
, struct tok_state
*tok
)
365 #ifndef Py_USING_UNICODE
366 /* In a non-Unicode built, this should never be called. */
367 Py_FatalError("fp_readl should not be called in this build.");
368 return NULL
; /* Keep compiler happy (not reachable) */
370 PyObject
* utf8
= NULL
;
371 PyObject
* buf
= tok
->decoding_buffer
;
375 /* Ask for one less byte so we can terminate it */
380 buf
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
382 return error_ret(tok
);
/* Consume any buffered overflow exactly once. */
384 tok
->decoding_buffer
= NULL
;
385 if (PyString_CheckExact(buf
))
389 utf8
= PyUnicode_AsUTF8String(buf
);
392 return error_ret(tok
);
394 str
= PyString_AsString(utf8
);
395 utf8len
= PyString_GET_SIZE(utf8
);
396 if (utf8len
> size
) {
/* Line longer than the caller's buffer: stash the tail. */
397 tok
->decoding_buffer
= PyString_FromStringAndSize(str
+size
, utf8len
-size
);
398 if (tok
->decoding_buffer
== NULL
) {
400 return error_ret(tok
);
404 memcpy(s
, str
, utf8len
);
407 if (utf8len
== 0) return NULL
; /* EOF */
412 /* Set the readline function for TOK to a StreamReader's
413 readline function. The StreamReader is named ENC.
415 This function is called from check_bom and check_coding_spec.
417 ENC is usually identical to the future value of tok->encoding,
418 except for the (currently unsupported) case of UTF-16.
420 Return 1 on success, 0 on failure. */
/* fp_setreadl: wrap tok->fp in a Python file object, obtain a codec
   StreamReader for ENC, and store its bound "readline" method in
   tok->decoding_readline.  Returns 1 on success, 0 on failure (per the
   contract stated in the comment block above the function).
   NOTE(review): garbled fragment -- NULL checks after each call, the
   DECREFs and the return statements are missing. */
423 fp_setreadl(struct tok_state
*tok
, const char* enc
)
425 PyObject
*reader
, *stream
, *readline
;
427 /* XXX: constify filename argument. */
428 stream
= PyFile_FromFile(tok
->fp
, (char*)tok
->filename
, "rb", NULL
);
432 reader
= PyCodec_StreamReader(enc
, stream
, NULL
);
437 readline
= PyObject_GetAttrString(reader
, "readline");
439 if (readline
== NULL
)
442 tok
->decoding_readline
= readline
;
446 /* Fetch the next byte from TOK. */
/* fp_getc: fetch the next raw byte from tok->fp (check_bom callback). */
448 static int fp_getc(struct tok_state
*tok
) {
449 return getc(tok
->fp
);
452 /* Unfetch the last byte back into TOK. */
/* fp_ungetc: push byte C back onto tok->fp (check_bom callback).
   NOTE(review): body line missing from this extract -- presumably a
   single ungetc(c, tok->fp) call. TODO confirm. */
454 static void fp_ungetc(int c
, struct tok_state
*tok
) {
458 /* Read a line of input from TOK. Determine encoding
/* decoding_fgets: read one line into S, dispatching on
   tok->decoding_state: <0 = decode via fp_readl, >0 = raw
   Py_UniversalNewlineFgets, 0 = encoding still undetermined, so run
   check_bom first.  On the first two lines also run check_coding_spec,
   and with no declared encoding reject non-ASCII bytes with the
   PEP 263 SyntaxError below.
   NOTE(review): garbled fragment -- `line`/`badchar`/`buf`
   declarations, the retry goto, and the snprintf/sprintf call that the
   format-string pieces (507-511) belong to are missing. */
462 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
467 if (tok
->decoding_state
< 0) {
468 /* We already have a codec associated with
470 line
= fp_readl(s
, size
, tok
);
472 } else if (tok
->decoding_state
> 0) {
473 /* We want a 'raw' read. */
474 line
= Py_UniversalNewlineFgets(s
, size
,
478 /* We have not yet determined the encoding.
479 If an encoding is found, use the file-pointer
480 reader functions from now on. */
481 if (!check_bom(fp_getc
, fp_ungetc
, fp_setreadl
, tok
))
482 return error_ret(tok
);
483 assert(tok
->decoding_state
!= 0);
486 if (line
!= NULL
&& tok
->lineno
< 2 && !tok
->read_coding_spec
) {
487 if (!check_coding_spec(line
, strlen(line
), tok
, fp_setreadl
)) {
488 return error_ret(tok
);
492 /* The default encoding is ASCII, so make sure we don't have any
493 non-ASCII bytes in it. */
494 if (line
&& !tok
->encoding
) {
496 for (c
= (unsigned char *)line
; *c
; c
++)
504 /* Need to add 1 to the line number, since this line
505 has not been counted, yet. */
507 "Non-ASCII character '\\x%.2x' "
508 "in file %.200s on line %i, "
509 "but no encoding declared; "
510 "see http://www.python.org/peps/pep-0263.html for details",
511 badchar
, tok
->filename
, tok
->lineno
+ 1);
512 PyErr_SetString(PyExc_SyntaxError
, buf
);
513 return error_ret(tok
);
/* decoding_feof: EOF test that matches decoding_fgets -- plain feof
   for raw/undetermined state, otherwise peek a line through
   decoding_readline into decoding_buffer and report EOF when it is
   empty (this is case 2 of the decoding_buffer states documented
   above fp_readl).
   NOTE(review): garbled fragment -- the buf==NULL guard and error
   branch are missing. */
520 decoding_feof(struct tok_state
*tok
)
522 if (tok
->decoding_state
>= 0) {
523 return feof(tok
->fp
);
525 PyObject
* buf
= tok
->decoding_buffer
;
527 buf
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
532 tok
->decoding_buffer
= buf
;
535 return PyObject_Length(buf
) == 0;
539 /* Fetch a byte from TOK, using the string buffer. */
/* buf_getc: fetch the next byte from the in-memory string buffer
   (string-tokenizer counterpart of fp_getc). */
542 buf_getc(struct tok_state
*tok
) {
543 return Py_CHARMASK(*tok
->str
++);
546 /* Unfetch a byte from TOK, using the string buffer. */
/* buf_ungetc: step tok->str back one byte; asserts the caller ungets
   the byte actually there, since the string may be read-only and can't
   be rewritten.  NOTE(review): the decrement line itself is missing
   from this extract. */
549 buf_ungetc(int c
, struct tok_state
*tok
) {
551 assert(Py_CHARMASK(*tok
->str
) == c
); /* tok->cur may point to read-only segment */
554 /* Set the readline function for TOK to ENC. For the string-based
555 tokenizer, this means to just record the encoding. */
/* buf_setreadl: per the comment above, the string tokenizer just
   records the encoding.  NOTE(review): body missing from this
   extract -- presumably stores ENC (e.g. in tok->enc) and returns 1.
   TODO confirm. */
558 buf_setreadl(struct tok_state
*tok
, const char* enc
) {
563 /* Return a UTF-8 encoding Python string object from the
564 C byte string STR, which is encoded with ENC. */
566 #ifdef Py_USING_UNICODE
/* translate_into_utf8: decode the NUL-terminated byte string STR with
   codec ENC, then re-encode to a UTF-8 PyString (see header comment
   above).  NOTE(review): fragment -- the buf NULL check, DECREF and
   return are missing. */
568 translate_into_utf8(const char* str
, const char* enc
) {
570 PyObject
* buf
= PyUnicode_Decode(str
, strlen(str
), enc
, NULL
);
573 utf8
= PyUnicode_AsUTF8String(buf
);
579 /* Decode a byte string STR for use as the buffer of TOK.
580 Look for encoding declarations inside STR, and record them
/* decode_str: prepare a C string for tokenization -- strip/interpret a
   BOM via check_bom (string-buffer callbacks), look at the first two
   lines for a coding spec, and when an encoding was found translate
   the whole buffer into UTF-8.  Returns the (possibly re-encoded)
   string, or NULL via error_ret on failure.
   NOTE(review): garbled fragment -- `s`/`lineno` declarations, the
   utf8==NULL checks and the final return are missing; the surviving
   "CAUTION" marker suggests decoding_buffer keeps utf8 alive for the
   returned char* -- TODO confirm ownership. */
584 decode_str(const char *str
, struct tok_state
*tok
)
586 PyObject
* utf8
= NULL
;
591 if (!check_bom(buf_getc
, buf_ungetc
, buf_setreadl
, tok
))
592 return error_ret(tok
);
593 str
= tok
->str
; /* string after BOM if any */
595 #ifdef Py_USING_UNICODE
596 if (tok
->enc
!= NULL
) {
/* BOM-selected encoding: convert the whole buffer up front. */
597 utf8
= translate_into_utf8(str
, tok
->enc
);
599 return error_ret(tok
);
600 str
= PyString_AsString(utf8
);
/* Find the end of the first two lines (coding spec must be there). */
603 for (s
= str
;; s
++) {
604 if (*s
== '\0') break;
605 else if (*s
== '\n') {
607 if (lineno
== 2) break;
611 if (!check_coding_spec(str
, s
- str
, tok
, buf_setreadl
))
612 return error_ret(tok
);
613 #ifdef Py_USING_UNICODE
614 if (tok
->enc
!= NULL
) {
615 assert(utf8
== NULL
);
616 utf8
= translate_into_utf8(str
, tok
->enc
);
618 PyErr_Format(PyExc_SyntaxError
,
619 "unknown encoding: %s", tok
->enc
);
620 return error_ret(tok
);
622 str
= PyString_AsString(utf8
);
625 assert(tok
->decoding_buffer
== NULL
);
626 tok
->decoding_buffer
= utf8
; /* CAUTION */
632 /* Set up tokenizer for string */
/* PyTokenizer_FromString: public constructor for tokenizing an
   in-memory string.  Decodes STR (BOM/coding spec) and points all
   buffer cursors at the decoded text; frees the tok_state and returns
   NULL (per the visible cleanup path) when decoding fails.
   NOTE(review): fragment -- the NULL checks around tok_new/decode_str
   and the final return are missing. */
635 PyTokenizer_FromString(const char *str
)
637 struct tok_state
*tok
= tok_new();
640 str
= (char *)decode_str(str
, tok
);
642 PyTokenizer_Free(tok
);
646 /* XXX: constify members. */
647 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
652 /* Set up tokenizer for file */
/* PyTokenizer_FromFile: public constructor for tokenizing a stream.
   Allocates a BUFSIZ read buffer (owned by the tok_state -- see the
   matching free in PyTokenizer_Free), and records PS1/PS2 as the
   interactive prompts.
   NOTE(review): fragment -- tok_new NULL check, tok->fp/prompt(ps1)
   assignments and the return are missing. */
655 PyTokenizer_FromFile(FILE *fp
, char *ps1
, char *ps2
)
657 struct tok_state
*tok
= tok_new();
660 if ((tok
->buf
= (char *)PyMem_MALLOC(BUFSIZ
)) == NULL
) {
661 PyTokenizer_Free(tok
);
664 tok
->cur
= tok
->inp
= tok
->buf
;
665 tok
->end
= tok
->buf
+ BUFSIZ
;
668 tok
->nextprompt
= ps2
;
673 /* Free a tok_state structure */
/* PyTokenizer_Free: release everything a tok_state owns -- the
   encoding string, the decoding readline/buffer references, and the
   read buffer (only when fp != NULL: in the string case tok->buf
   aliases caller-owned decoded text, cf. PyTokenizer_FromString).
   NOTE(review): fragment -- the PyMem_FREE(tok) of the struct itself
   is missing from this extract. */
676 PyTokenizer_Free(struct tok_state
*tok
)
678 if (tok
->encoding
!= NULL
)
679 PyMem_FREE(tok
->encoding
);
681 Py_XDECREF(tok
->decoding_readline
);
682 Py_XDECREF(tok
->decoding_buffer
);
684 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
685 PyMem_FREE(tok
->buf
);
689 #if !defined(PGEN) && defined(Py_USING_UNICODE)
/* tok_stdin_decode: when reading from the real interactive stdin,
   re-decode the line *INP from sys.stdin's f_encoding into UTF-8,
   replace *INP with the converted copy, and record the encoding on
   TOK.  Early-exits (leaving *INP alone) when stdin is redirected or
   no usable encoding is attached.
   NOTE(review): garbled fragment -- `converted` declaration, the
   NULL-check/error branches after each CPython call, the *inp swap,
   DECREFs and return codes are all missing between the numbered
   statements. */
691 tok_stdin_decode(struct tok_state
*tok
, char **inp
)
693 PyObject
*enc
, *sysstdin
, *decoded
, *utf8
;
694 const char *encoding
;
697 if (PySys_GetFile((char *)"stdin", NULL
) != stdin
)
699 sysstdin
= PySys_GetObject("stdin");
700 if (sysstdin
== NULL
|| !PyFile_Check(sysstdin
))
703 enc
= ((PyFileObject
*)sysstdin
)->f_encoding
;
704 if (enc
== NULL
|| !PyString_Check(enc
))
708 encoding
= PyString_AsString(enc
);
709 decoded
= PyUnicode_Decode(*inp
, strlen(*inp
), encoding
, NULL
);
713 utf8
= PyUnicode_AsEncodedString(decoded
, "utf-8", NULL
);
718 assert(PyString_Check(utf8
));
719 converted
= new_string(PyString_AS_STRING(utf8
),
720 PyString_GET_SIZE(utf8
));
722 if (converted
== NULL
)
/* Remember the real stdin encoding for error reporting. */
727 if (tok
->encoding
!= NULL
)
728 PyMem_FREE(tok
->encoding
);
729 tok
->encoding
= new_string(encoding
, strlen(encoding
));
730 if (tok
->encoding
== NULL
)
742 /* Fallback to iso-8859-1: for backward compatibility */
749 /* Get next char, updating state; error code goes into tok->done */
/* tok_nextc: return the next input character, refilling the buffer as
   needed.  Three sources: (1) in-memory string (tok->fp == NULL),
   (2) interactive prompt via PyOS_Readline, (3) file via
   decoding_fgets, growing tok->buf with PyMem_REALLOC until a full
   '\n'-terminated line is in.  Error codes go into tok->done (per the
   header comment above the function).
   NOTE(review): heavily garbled fragment -- EOF returns, error
   branches, the pt pointer setup for \r\n handling, and many
   intervening lines are missing; the numeric prefixes are residue of
   the original line numbering. */
752 tok_nextc(register struct tok_state
*tok
)
755 if (tok
->cur
!= tok
->inp
) {
756 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
758 if (tok
->done
!= E_OK
)
/* ---- Source 1: in-memory string ---- */
760 if (tok
->fp
== NULL
) {
761 char *end
= strchr(tok
->inp
, '\n');
765 end
= strchr(tok
->inp
, '\0');
766 if (end
== tok
->inp
) {
771 if (tok
->start
== NULL
)
773 tok
->line_start
= tok
->cur
;
776 return Py_CHARMASK(*tok
->cur
++);
/* ---- Source 2: interactive prompt ---- */
778 if (tok
->prompt
!= NULL
) {
779 char *newtok
= PyOS_Readline(stdin
, stdout
, tok
->prompt
);
780 if (tok
->nextprompt
!= NULL
)
781 tok
->prompt
= tok
->nextprompt
;
784 else if (*newtok
== '\0') {
788 #if !defined(PGEN) && defined(Py_USING_UNICODE)
789 else if (tok_stdin_decode(tok
, &newtok
) != 0)
792 else if (tok
->start
!= NULL
) {
/* Append the new line to the existing buffer, preserving the
   current token start across the realloc. */
793 size_t start
= tok
->start
- tok
->buf
;
794 size_t oldlen
= tok
->cur
- tok
->buf
;
795 size_t newlen
= oldlen
+ strlen(newtok
);
796 char *buf
= tok
->buf
;
797 buf
= (char *)PyMem_REALLOC(buf
, newlen
+1);
800 PyMem_FREE(tok
->buf
);
807 tok
->cur
= tok
->buf
+ oldlen
;
808 tok
->line_start
= tok
->cur
;
809 strcpy(tok
->buf
+ oldlen
, newtok
);
811 tok
->inp
= tok
->buf
+ newlen
;
812 tok
->end
= tok
->inp
+ 1;
813 tok
->start
= tok
->buf
+ start
;
817 if (tok
->buf
!= NULL
)
818 PyMem_FREE(tok
->buf
);
820 tok
->line_start
= tok
->buf
;
822 tok
->line_start
= tok
->buf
;
823 tok
->inp
= strchr(tok
->buf
, '\0');
824 tok
->end
= tok
->inp
+ 1;
/* ---- Source 3: file (possibly decoded) ---- */
831 if (tok
->start
== NULL
) {
832 if (tok
->buf
== NULL
) {
834 PyMem_MALLOC(BUFSIZ
);
835 if (tok
->buf
== NULL
) {
839 tok
->end
= tok
->buf
+ BUFSIZ
;
841 if (decoding_fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
848 tok
->inp
= strchr(tok
->buf
, '\0');
849 done
= tok
->inp
[-1] == '\n';
853 cur
= tok
->cur
- tok
->buf
;
854 if (decoding_feof(tok
)) {
862 /* Read until '\n' or EOF */
864 Py_ssize_t curstart
= tok
->start
== NULL
? -1 :
865 tok
->start
- tok
->buf
;
866 Py_ssize_t curvalid
= tok
->inp
- tok
->buf
;
867 Py_ssize_t newsize
= curvalid
+ BUFSIZ
;
868 char *newbuf
= tok
->buf
;
869 newbuf
= (char *)PyMem_REALLOC(newbuf
,
871 if (newbuf
== NULL
) {
877 tok
->inp
= tok
->buf
+ curvalid
;
878 tok
->end
= tok
->buf
+ newsize
;
879 tok
->start
= curstart
< 0 ? NULL
:
881 if (decoding_fgets(tok
->inp
,
882 (int)(tok
->end
- tok
->inp
),
884 /* Break out early on decoding
885 errors, as tok->buf will be NULL
887 if (tok
->decoding_erred
)
889 /* Last line does not end in \n,
891 strcpy(tok
->inp
, "\n");
893 tok
->inp
= strchr(tok
->inp
, '\0');
894 done
= tok
->inp
[-1] == '\n';
896 if (tok
->buf
!= NULL
) {
897 tok
->cur
= tok
->buf
+ cur
;
898 tok
->line_start
= tok
->cur
;
899 /* replace "\r\n" with "\n" */
900 /* For Mac leave the \r, giving syntax error */
902 if (pt
>= tok
->buf
&& *pt
== '\r') {
909 if (tok
->done
!= E_OK
) {
910 if (tok
->prompt
!= NULL
)
911 PySys_WriteStderr("\n");
920 /* Back-up one character */
/* tok_backup: push character C back by rewinding tok->cur one byte;
   fatal if that would move before the start of the buffer.
   NOTE(review): fragment -- the c != EOF guard and the assert that
   *tok->cur == c are missing from this extract. */
923 tok_backup(register struct tok_state
*tok
, register int c
)
926 if (--tok
->cur
< tok
->buf
)
927 Py_FatalError("tok_backup: begin of buffer");
934 /* Return the token corresponding to a single character */
/* PyToken_OneChar: map a single punctuation character to its token
   code (see the header comment above).
   NOTE(review): fragment -- the switch(c) opener and the default
   (presumably OP) case are missing from this extract. */
937 PyToken_OneChar(int c
)
940 case '(': return LPAR
;
941 case ')': return RPAR
;
942 case '[': return LSQB
;
943 case ']': return RSQB
;
944 case ':': return COLON
;
945 case ',': return COMMA
;
946 case ';': return SEMI
;
947 case '+': return PLUS
;
948 case '-': return MINUS
;
949 case '*': return STAR
;
950 case '/': return SLASH
;
951 case '|': return VBAR
;
952 case '&': return AMPER
;
953 case '<': return LESS
;
954 case '>': return GREATER
;
955 case '=': return EQUAL
;
956 case '.': return DOT
;
957 case '%': return PERCENT
;
958 case '`': return BACKQUOTE
;
959 case '{': return LBRACE
;
960 case '}': return RBRACE
;
961 case '^': return CIRCUMFLEX
;
962 case '~': return TILDE
;
/* PyToken_TwoChars: map a two-character operator (C1 then C2) to its
   token code.  NOTE(review): fragment -- the nested switch(c1)/
   switch(c2) structure and fall-through returns are missing; only the
   inner case lines survive, grouped by leading operator in the
   original. */
970 PyToken_TwoChars(int c1
, int c2
)
975 case '=': return EQEQUAL
;
980 case '=': return NOTEQUAL
;
985 case '>': return NOTEQUAL
;
986 case '=': return LESSEQUAL
;
987 case '<': return LEFTSHIFT
;
992 case '=': return GREATEREQUAL
;
993 case '>': return RIGHTSHIFT
;
998 case '=': return PLUSEQUAL
;
1003 case '=': return MINEQUAL
;
1008 case '*': return DOUBLESTAR
;
1009 case '=': return STAREQUAL
;
1014 case '/': return DOUBLESLASH
;
1015 case '=': return SLASHEQUAL
;
1020 case '=': return VBAREQUAL
;
1025 case '=': return PERCENTEQUAL
;
1030 case '=': return AMPEREQUAL
;
1035 case '=': return CIRCUMFLEXEQUAL
;
/* PyToken_ThreeChars: map a three-character operator to its token
   code (augmented shifts, **=, //=).  NOTE(review): fragment -- the
   switch structure around these returns is missing. */
1043 PyToken_ThreeChars(int c1
, int c2
, int c3
)
1051 return LEFTSHIFTEQUAL
;
1061 return RIGHTSHIFTEQUAL
;
1071 return DOUBLESTAREQUAL
;
1081 return DOUBLESLASHEQUAL
;
/* indenterror: handle inconsistent tab/space indentation.  With
   tok->alterror set it is a hard error (E_TABSPACE, input consumed);
   otherwise with tok->altwarning set it prints a one-shot warning to
   stderr.  NOTE(review): fragment -- the return statements of both
   branches are missing from this extract. */
1091 indenterror(struct tok_state
*tok
)
1093 if (tok
->alterror
) {
1094 tok
->done
= E_TABSPACE
;
1095 tok
->cur
= tok
->inp
;
1098 if (tok
->altwarning
) {
1099 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1100 "in indentation\n", tok
->filename
);
1101 tok
->altwarning
= 0;
1107 /* Get next token, after space stripping etc. */
/* tok_get: the tokenizer core -- produce the next token, handling
   indentation (INDENT/DEDENT via indstack/altindstack), blank lines,
   comments (including editor tab-setting magic), identifiers, NEWLINE,
   numbers (hex/octal/decimal/float/imaginary/long), strings (with
   triple-quote and continuation), line continuations, and one/two/
   three-character operator tokens.  *p_start/*p_end are set to the
   token's extent in the buffer.
   NOTE(review): heavily garbled fragment -- most control-flow
   connectives (braces, else arms, c = tok_nextc(tok) calls, returns)
   are missing between the numbered statements; the numeric prefixes
   are residue of the original line numbering. */
1110 tok_get(register struct tok_state
*tok
, char **p_start
, char **p_end
)
1115 *p_start
= *p_end
= NULL
;
1120 /* Get indentation level */
1122 register int col
= 0;
1123 register int altcol
= 0;
1129 else if (c
== '\t') {
1130 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
1131 altcol
= (altcol
/tok
->alttabsize
+ 1)
1134 else if (c
== '\014') /* Control-L (formfeed) */
1135 col
= altcol
= 0; /* For Emacs users */
1140 if (c
== '#' || c
== '\n') {
1141 /* Lines with only whitespace and/or comments
1142 shouldn't affect the indentation and are
1143 not passed to the parser as NEWLINE tokens,
1144 except *totally* empty lines in interactive
1145 mode, which signal the end of a command group. */
1146 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
1147 blankline
= 0; /* Let it through */
1149 blankline
= 1; /* Ignore completely */
1150 /* We can't jump back right here since we still
1151 may need to skip to the end of a comment */
1153 if (!blankline
&& tok
->level
== 0) {
1154 if (col
== tok
->indstack
[tok
->indent
]) {
1156 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1157 if (indenterror(tok
))
1161 else if (col
> tok
->indstack
[tok
->indent
]) {
1162 /* Indent -- always one */
1163 if (tok
->indent
+1 >= MAXINDENT
) {
1164 tok
->done
= E_TOODEEP
;
1165 tok
->cur
= tok
->inp
;
1168 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
1169 if (indenterror(tok
))
1173 tok
->indstack
[++tok
->indent
] = col
;
1174 tok
->altindstack
[tok
->indent
] = altcol
;
1176 else /* col < tok->indstack[tok->indent] */ {
1177 /* Dedent -- any number, must be consistent */
1178 while (tok
->indent
> 0 &&
1179 col
< tok
->indstack
[tok
->indent
]) {
1183 if (col
!= tok
->indstack
[tok
->indent
]) {
1184 tok
->done
= E_DEDENT
;
1185 tok
->cur
= tok
->inp
;
1188 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1189 if (indenterror(tok
))
1196 tok
->start
= tok
->cur
;
1198 /* Return pending indents/dedents */
1199 if (tok
->pendin
!= 0) {
1200 if (tok
->pendin
< 0) {
1215 } while (c
== ' ' || c
== '\t' || c
== '\014');
1217 /* Set start of current token */
1218 tok
->start
= tok
->cur
- 1;
1220 /* Skip comment, while looking for tab-setting magic */
1222 static char *tabforms
[] = {
1223 "tab-width:", /* Emacs */
1224 ":tabstop=", /* vim, full form */
1225 ":ts=", /* vim, abbreviated form */
1226 "set tabsize=", /* will vi never die? */
1227 /* more templates can be added here to support other editors */
1233 *tp
++ = c
= tok_nextc(tok
);
1234 } while (c
!= EOF
&& c
!= '\n' &&
1235 (size_t)(tp
- cbuf
+ 1) < sizeof(cbuf
));
1238 cp
< tabforms
+ sizeof(tabforms
)/sizeof(tabforms
[0]);
1240 if ((tp
= strstr(cbuf
, *cp
))) {
1241 int newsize
= atoi(tp
+ strlen(*cp
));
1243 if (newsize
>= 1 && newsize
<= 40) {
1244 tok
->tabsize
= newsize
;
1247 "Tab size set to %d\n",
1252 while (c
!= EOF
&& c
!= '\n')
1256 /* Check for EOF and errors now */
1258 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
1261 /* Identifier (most frequent token!) */
1262 if (isalpha(c
) || c
== '_') {
1263 /* Process r"", u"" and ur"" */
1268 if (c
== '"' || c
== '\'')
1274 if (c
== 'r' || c
== 'R')
1276 if (c
== '"' || c
== '\'')
1280 while (isalnum(c
) || c
== '_') {
1284 *p_start
= tok
->start
;
1292 if (blankline
|| tok
->level
> 0)
1294 *p_start
= tok
->start
;
1295 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
1300 /* Period or number starting with period? */
1308 *p_start
= tok
->start
;
1317 /* Hex or octal -- maybe. */
1321 #ifndef WITHOUT_COMPLEX
1322 if (c
== 'j' || c
== 'J')
1325 if (c
== 'x' || c
== 'X') {
1329 } while (isxdigit(c
));
1332 int found_decimal
= 0;
1333 /* Octal; c is first char of it */
1334 /* There's no 'isoctdigit' macro, sigh */
1335 while ('0' <= c
&& c
< '8') {
1342 } while (isdigit(c
));
1346 else if (c
== 'e' || c
== 'E')
1348 #ifndef WITHOUT_COMPLEX
1349 else if (c
== 'j' || c
== 'J')
1352 else if (found_decimal
) {
1353 tok
->done
= E_TOKEN
;
1358 if (c
== 'l' || c
== 'L')
1365 } while (isdigit(c
));
1366 if (c
== 'l' || c
== 'L')
1369 /* Accept floating point numbers. */
1375 } while (isdigit(c
));
1377 if (c
== 'e' || c
== 'E') {
1381 if (c
== '+' || c
== '-')
1384 tok
->done
= E_TOKEN
;
1390 } while (isdigit(c
));
1392 #ifndef WITHOUT_COMPLEX
1393 if (c
== 'j' || c
== 'J')
1394 /* Imaginary part */
1401 *p_start
= tok
->start
;
1408 if (c
== '\'' || c
== '"') {
1409 Py_ssize_t quote2
= tok
->cur
- tok
->start
+ 1;
1422 tok
->cont_line
= 1; /* multiline string. */
1424 else if (c
== EOF
) {
1429 tok
->cur
= tok
->inp
;
1432 else if (c
== quote
) {
1434 if (tok
->cur
- tok
->start
== quote2
) {
1443 if (!triple
|| tripcount
== 3)
1446 else if (c
== '\\') {
1451 tok
->cur
= tok
->inp
;
1458 *p_start
= tok
->start
;
1463 /* Line continuation */
1467 tok
->done
= E_LINECONT
;
1468 tok
->cur
= tok
->inp
;
1472 goto again
; /* Read next line */
1475 /* Check for two-character token */
1477 int c2
= tok_nextc(tok
);
1478 int token
= PyToken_TwoChars(c
, c2
);
1480 int c3
= tok_nextc(tok
);
1481 int token3
= PyToken_ThreeChars(c
, c2
, c3
);
1485 tok_backup(tok
, c3
);
1487 *p_start
= tok
->start
;
1491 tok_backup(tok
, c2
);
1494 /* Keep track of parentheses nesting level */
1508 /* Punctuation character */
1509 *p_start
= tok
->start
;
1511 return PyToken_OneChar(c
);
/* PyTokenizer_Get: public entry point -- run tok_get, but report
   ERRORTOKEN/E_DECODE whenever a decoding error was latched in
   tok->decoding_erred at any point during the read.
   NOTE(review): fragment -- the final return of `result` is missing
   from this extract. */
1515 PyTokenizer_Get(struct tok_state
*tok
, char **p_start
, char **p_end
)
1517 int result
= tok_get(tok
, p_start
, p_end
);
1518 if (tok
->decoding_erred
) {
1519 result
= ERRORTOKEN
;
1520 tok
->done
= E_DECODE
;
/* tok_dump: debug helper -- print a token's name and, for NAME /
   NUMBER / STRING / OP, its text between START and END. */
1528 tok_dump(int type
, char *start
, char *end
)
1530 printf("%s", _PyParser_TokenNames
[type
]);
1531 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
1532 printf("(%.*s)", (int)(end
- start
), start
);