2 /* Tokenizer implementation */
5 #include "pgenheaders.h"
10 #include "tokenizer.h"
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
21 #define is_potential_identifier_start(c) (\
22 (c >= 'a' && c <= 'z')\
23 || (c >= 'A' && c <= 'Z')\
27 #define is_potential_identifier_char(c) (\
28 (c >= 'a' && c <= 'z')\
29 || (c >= 'A' && c <= 'Z')\
30 || (c >= '0' && c <= '9')\
34 extern char *PyOS_Readline(FILE *, FILE *, char *);
35 /* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
39 /* Don't ever change this -- it would break the portability of Python code */
43 static struct tok_state
*tok_new(void);
44 static int tok_nextc(struct tok_state
*tok
);
45 static void tok_backup(struct tok_state
*tok
, int c
);
/* Token-name lookup table indexed by token type.
 * NOTE(review): fragmentary listing — the table's entries are missing here;
 * the comment below says it must stay in sync with token.h's #defines. */
50 char *_PyParser_TokenNames
[] = {
103 /* This table must match the #defines in token.h! */
/* tok_new(): allocate a tok_state with PyMem_MALLOC and set its fields to
 * safe defaults (NULL buffers, STATE_INIT decoding state, no encoding yet).
 * NOTE(review): fragmentary listing — the NULL-allocation check, several
 * field initializations and the return statement are not visible here. */
110 /* Create and initialize a new tok_state structure */
112 static struct tok_state
*
115 struct tok_state
*tok
= (struct tok_state
*)PyMem_MALLOC(
116 sizeof(struct tok_state
));
/* All buffer pointers start NULL; tok_nextc allocates lazily. */
119 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
122 tok
->tabsize
= TABSIZE
;
124 tok
->indstack
[0] = 0;
127 tok
->prompt
= tok
->nextprompt
= NULL
;
130 tok
->filename
= NULL
;
134 tok
->altindstack
[0] = 0;
/* Encoding detection has not run yet. */
135 tok
->decoding_state
= STATE_INIT
;
136 tok
->decoding_erred
= 0;
137 tok
->read_coding_spec
= 0;
139 tok
->encoding
= NULL
;
142 tok
->decoding_readline
= NULL
;
143 tok
->decoding_buffer
= NULL
;
/* decoding_fgets() (plain variant): read a raw line with fgets from tok->fp.
 * NOTE(review): fragmentary — presumably the PGEN build's stub; confirm
 * against the surrounding #ifdef, which is not visible here. */
151 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
153 return fgets(s
, size
, tok
->fp
);
/* decoding_feof() (plain variant): end-of-file test on tok->fp.
 * NOTE(review): fragmentary listing; return type line is missing. */
157 decoding_feof(struct tok_state
*tok
)
159 return feof(tok
->fp
);
/* decode_str() (plain variant) — only the signature survives in this
 * listing; body not visible. */
163 decode_str(const char *str
, struct tok_state
*tok
)
/* error_ret(): record a decoding error on TOK and return NULL so callers
 * treat it like EOF. Frees tok->buf when reading from a file, because
 * PyTokenizer_Free only frees buf when fp is set (see that function).
 * NOTE(review): fragmentary — the line that NULLs tok->buf after the free
 * is not visible here; confirm buf is reset to avoid a dangling pointer. */
171 error_ret(struct tok_state
*tok
) /* XXX */
173 tok
->decoding_erred
= 1;
174 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
) /* see PyTokenizer_Free */
175 PyMem_FREE(tok
->buf
);
177 return NULL
; /* as if it were EOF */
/* new_string(): heap-copy LEN bytes of S via PyMem_MALLOC, allocating one
 * extra byte. NOTE(review): fragmentary — the NUL-termination of the extra
 * byte and the return statement are not visible in this listing. */
181 new_string(const char *s
, Py_ssize_t len
)
183 char* result
= (char *)PyMem_MALLOC(len
+ 1);
184 if (result
!= NULL
) {
185 memcpy(result
, s
, len
);
/* get_normal_name(): canonicalize an encoding name — anything starting
 * with "utf-8" collapses to utf-8, and the latin-1 family of aliases
 * collapses to a single name. NOTE(review): fragmentary — the local `buf`
 * (a lowercased/truncated copy built by the visible 12-char loop, per the
 * strcmp targets) and the return statements are not visible here. */
192 get_normal_name(char *s
) /* for utf-8 and latin-1 */
196 for (i
= 0; i
< 12; i
++) {
206 if (strcmp(buf
, "utf-8") == 0 ||
207 strncmp(buf
, "utf-8-", 6) == 0)
209 else if (strcmp(buf
, "latin-1") == 0 ||
210 strcmp(buf
, "iso-8859-1") == 0 ||
211 strcmp(buf
, "iso-latin-1") == 0 ||
212 strncmp(buf
, "latin-1-", 8) == 0 ||
213 strncmp(buf
, "iso-8859-1-", 11) == 0 ||
214 strncmp(buf
, "iso-latin-1-", 12) == 0)
/* get_coding_spec(): scan a source line for a PEP 263 "coding: <name>"
 * declaration inside a comment and return a heap copy of the normalized
 * encoding name (via new_string/get_normal_name), or NULL if absent.
 * NOTE(review): fragmentary — the '#'-comment check, the begin/t pointer
 * advances, and the returns are not visible in this listing. */
220 /* Return the coding spec in S, or NULL if none is found. */
223 get_coding_spec(const char *s
, Py_ssize_t size
)
226 /* Coding spec must be in a comment, and that comment must be
227 * the only statement on the source code line. */
/* Skip leading whitespace/formfeed before the comment. */
228 for (i
= 0; i
< size
- 6; i
++) {
231 if (s
[i
] != ' ' && s
[i
] != '\t' && s
[i
] != '\014')
234 for (; i
< size
- 6; i
++) { /* XXX inefficient search */
235 const char* t
= s
+ i
;
236 if (strncmp(t
, "coding", 6) == 0) {
237 const char* begin
= NULL
;
239 if (t
[0] != ':' && t
[0] != '=')
243 } while (t
[0] == '\x20' || t
[0] == '\t');
/* Encoding names are alnum plus '-', '_', '.'. */
246 while (isalnum(Py_CHARMASK(t
[0])) ||
247 t
[0] == '-' || t
[0] == '_' || t
[0] == '.')
251 char* r
= new_string(begin
, t
- begin
);
252 char* q
= get_normal_name(r
);
/* Replace with the canonical spelling when it differs. */
255 r
= new_string(q
, strlen(q
));
/* check_coding_spec(): if LINE carries a coding spec, install it — either
 * by switching the reader via set_readline() (non-utf-8) or by recording
 * it; if a BOM already fixed tok->encoding, the spec must agree with it.
 * Returns 1 on success, 0 on failure (SyntaxError set below).
 * NOTE(review): fragmentary — continuation-line early return, the assignment
 * of cs into tok->encoding, and error cleanup paths are not visible. */
264 /* Check whether the line contains a coding spec. If it does,
265 invoke the set_readline function for the new encoding.
266 This function receives the tok_state and the new encoding.
267 Return 1 on success, 0 on failure. */
270 check_coding_spec(const char* line
, Py_ssize_t size
, struct tok_state
*tok
,
271 int set_readline(struct tok_state
*, const char *))
277 /* It's a continuation line, so it can't be a coding spec. */
279 cs
= get_coding_spec(line
, size
);
281 tok
->read_coding_spec
= 1;
282 if (tok
->encoding
== NULL
) {
283 assert(tok
->decoding_state
== STATE_RAW
);
/* utf-8 needs no reader switch; anything else does. */
284 if (strcmp(cs
, "utf-8") == 0) {
287 r
= set_readline(tok
, cs
);
290 tok
->decoding_state
= STATE_NORMAL
;
295 } else { /* then, compare cs with BOM */
296 r
= (strcmp(tok
->encoding
, cs
) == 0);
304 PyErr_Format(PyExc_SyntaxError
, "encoding problem: %s", cs
);
/* check_bom(): peek at the first bytes via get_char/unget_char; on a UTF-8
 * BOM set tok->encoding to "utf-8", on UTF-16 BOMs (currently disabled by
 * the #if'd-out region noted below) switch the reader. Non-BOM bytes are
 * pushed back so tokenization sees them. Returns 1 on success, 0 on failure.
 * NOTE(review): fragmentary — the get_char() calls for ch1/ch2/ch3, the
 * 0xBB/0xBF byte comparisons and the returns are not visible here. */
309 /* See whether the file starts with a BOM. If it does,
310 invoke the set_readline function with the new encoding.
311 Return 1 on success, 0 on failure. */
314 check_bom(int get_char(struct tok_state
*),
315 void unget_char(int, struct tok_state
*),
316 int set_readline(struct tok_state
*, const char *),
317 struct tok_state
*tok
)
321 tok
->decoding_state
= STATE_RAW
;
324 } else if (ch1
== 0xEF) {
/* Not a complete UTF-8 BOM: push both bytes back. */
327 unget_char(ch2
, tok
);
328 unget_char(ch1
, tok
);
333 unget_char(ch3
, tok
);
334 unget_char(ch2
, tok
);
335 unget_char(ch1
, tok
);
339 /* Disable support for UTF-16 BOMs until a decision
340 is made whether this needs to be supported. */
341 } else if (ch1
== 0xFE) {
344 unget_char(ch2
, tok
);
345 unget_char(ch1
, tok
);
348 if (!set_readline(tok
, "utf-16-be"))
350 tok
->decoding_state
= STATE_NORMAL
;
351 } else if (ch1
== 0xFF) {
354 unget_char(ch2
, tok
);
355 unget_char(ch1
, tok
);
358 if (!set_readline(tok
, "utf-16-le"))
360 tok
->decoding_state
= STATE_NORMAL
;
363 unget_char(ch1
, tok
);
/* Replace any previously recorded encoding with "utf-8". */
366 if (tok
->encoding
!= NULL
)
367 PyMem_FREE(tok
->encoding
);
368 tok
->encoding
= new_string("utf-8", 5); /* resulting is in utf-8 */
369 /* No need to set_readline: input is already utf-8 */
/* fp_readl(): fetch one decoded line into S (at most size-1 bytes), pulling
 * either the overflow stored in tok->decoding_buffer or a fresh line from
 * tok->decoding_readline; excess bytes are stashed back into
 * tok->decoding_buffer as a bytearray for the next call.
 * NOTE(review): fragmentary — NUL-termination of s, the final return of s,
 * and several error branches are not visible; the trailing error_ret() line
 * presumably belongs to an error label. */
373 /* Read a line of text from TOK into S, using the stream in TOK.
374 Return NULL on failure, else S.
376 On entry, tok->decoding_buffer will be one of:
377 1) NULL: need to call tok->decoding_readline to get a new line
378 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
379 stored the result in tok->decoding_buffer
380 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
381 (in the s buffer) to copy entire contents of the line read
382 by tok->decoding_readline. tok->decoding_buffer has the overflow.
383 In this case, fp_readl is called in a loop (with an expanded buffer)
384 until the buffer ends with a '\n' (or until the end of the file is
385 reached): see tok_nextc and its calls to decoding_fgets.
389 fp_readl(char *s
, int size
, struct tok_state
*tok
)
395 /* Ask for one less byte so we can terminate it */
/* Prefer a pending buffered line over calling readline again. */
399 if (tok
->decoding_buffer
) {
400 bufobj
= tok
->decoding_buffer
;
405 bufobj
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
409 if (PyUnicode_CheckExact(bufobj
))
411 buf
= _PyUnicode_AsStringAndSize(bufobj
, &buflen
);
418 buf
= PyByteArray_AsString(bufobj
);
422 buflen
= PyByteArray_GET_SIZE(bufobj
);
425 Py_XDECREF(tok
->decoding_buffer
);
427 /* Too many chars, the rest goes into tok->decoding_buffer */
428 tok
->decoding_buffer
= PyByteArray_FromStringAndSize(buf
+size
,
430 if (tok
->decoding_buffer
== NULL
)
435 tok
->decoding_buffer
= NULL
;
437 memcpy(s
, buf
, buflen
);
439 if (buflen
== 0) /* EOF */
446 return error_ret(tok
);
/* fp_setreadl(): reopen the input through io.open() with encoding ENC
 * (by filename when available, else by dup of fileno) and store the
 * stream's bound readline in tok->decoding_readline. Returns nonzero iff
 * readline was obtained. NOTE(review): fragmentary — the cleanup label
 * that DECREFs io/stream and the lineno reset described by the comment
 * are not visible in this listing. */
449 /* Set the readline function for TOK to a StreamReader's
450 readline function. The StreamReader is named ENC.
452 This function is called from check_bom and check_coding_spec.
454 ENC is usually identical to the future value of tok->encoding,
455 except for the (currently unsupported) case of UTF-16.
457 Return 1 on success, 0 on failure. */
460 fp_setreadl(struct tok_state
*tok
, const char* enc
)
462 PyObject
*readline
= NULL
, *stream
= NULL
, *io
= NULL
;
464 io
= PyImport_ImportModuleNoBlock("io");
/* With a filename, let io reopen the file from scratch... */
469 stream
= PyObject_CallMethod(io
, "open", "ssis",
470 tok
->filename
, "r", -1, enc
);
/* ...otherwise wrap the existing file descriptor (closefd=False). */
472 stream
= PyObject_CallMethod(io
, "open", "isisOOO",
473 fileno(tok
->fp
), "r", -1, enc
, Py_None
, Py_None
, Py_False
);
477 Py_XDECREF(tok
->decoding_readline
);
478 readline
= PyObject_GetAttrString(stream
, "readline");
479 tok
->decoding_readline
= readline
;
481 /* The file has been reopened; parsing will restart from
482 * the beginning of the file, we have to reset the line number.
483 * But this function has been called from inside tok_nextc() which
484 * will increment lineno before it returns. So we set it -1 so that
485 * the next call to tok_nextc() will start with tok->lineno == 0.
492 return readline
!= NULL
;
/* fp_getc(): one-byte reader used as the get_char callback for check_bom
 * in file mode. */
495 /* Fetch the next byte from TOK. */
497 static int fp_getc(struct tok_state
*tok
) {
498 return getc(tok
->fp
);
/* fp_ungetc(): push-back callback paired with fp_getc.
 * NOTE(review): fragmentary — the body (presumably ungetc on tok->fp)
 * is not visible in this listing. */
501 /* Unfetch the last byte back into TOK. */
503 static void fp_ungetc(int c
, struct tok_state
*tok
) {
/* valid_utf8(): return the byte length of the UTF-8 sequence starting at s,
 * or 0 if the bytes are not well-formed. Continuation bytes must be in
 * [0x80, 0xC0). NOTE(review): fragmentary — the lead-byte classification
 * that sets `expected` and the return statements are not visible here. */
507 /* Check whether the characters at s start a valid
508 UTF-8 sequence. Return the number of characters forming
509 the sequence if yes, 0 if not. */
510 static int valid_utf8(const unsigned char* s
)
515 /* single-byte code */
528 length
= expected
+ 1;
529 for (; expected
; expected
--)
530 if (s
[expected
] < 0x80 || s
[expected
] >= 0xC0)
/* decoding_fgets(): read one line from TOK into S, dispatching on the
 * decoding state — codec-based fp_readl when NORMAL, raw
 * Py_UniversalNewlineFgets when RAW, and BOM detection first when INIT.
 * On the first two lines it also looks for a PEP 263 coding spec, and
 * with no declared encoding it validates the line as UTF-8, raising
 * SyntaxError (via PyErr_Format into buf, then error_ret) otherwise.
 * NOTE(review): fragmentary — the enclosing for(;;) retry loop, `badchar`
 * capture, and the final `return line;` are not visible in this listing. */
535 /* Read a line of input from TOK. Determine encoding
539 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
544 if (tok
->decoding_state
== STATE_NORMAL
) {
545 /* We already have a codec associated with
547 line
= fp_readl(s
, size
, tok
);
549 } else if (tok
->decoding_state
== STATE_RAW
) {
550 /* We want a 'raw' read. */
551 line
= Py_UniversalNewlineFgets(s
, size
,
555 /* We have not yet determined the encoding.
556 If an encoding is found, use the file-pointer
557 reader functions from now on. */
558 if (!check_bom(fp_getc
, fp_ungetc
, fp_setreadl
, tok
))
559 return error_ret(tok
);
560 assert(tok
->decoding_state
!= STATE_INIT
);
/* Coding spec may only appear on line 1 or 2. */
563 if (line
!= NULL
&& tok
->lineno
< 2 && !tok
->read_coding_spec
) {
564 if (!check_coding_spec(line
, strlen(line
), tok
, fp_setreadl
)) {
565 return error_ret(tok
);
569 /* The default encoding is UTF-8, so make sure we don't have any
570 non-UTF-8 sequences in it. */
571 if (line
&& !tok
->encoding
) {
574 for (c
= (unsigned char *)line
; *c
; c
+= length
)
575 if (!(length
= valid_utf8(c
))) {
582 /* Need to add 1 to the line number, since this line
583 has not been counted, yet. */
585 "Non-UTF-8 code starting with '\\x%.2x' "
586 "in file %.200s on line %i, "
587 "but no encoding declared; "
588 "see http://python.org/dev/peps/pep-0263/ for details",
589 badchar
, tok
->filename
, tok
->lineno
+ 1);
590 PyErr_SetString(PyExc_SyntaxError
, buf
);
591 return error_ret(tok
);
/* decoding_feof(): EOF test — plain feof() unless a codec reader is
 * active, in which case peek a line via decoding_readline, cache it in
 * tok->decoding_buffer, and report EOF when its length is 0.
 * NOTE(review): fragmentary — the NULL-buf branch structure and error
 * handling around the readline call are not fully visible. */
598 decoding_feof(struct tok_state
*tok
)
600 if (tok
->decoding_state
!= STATE_NORMAL
) {
601 return feof(tok
->fp
);
603 PyObject
* buf
= tok
->decoding_buffer
;
605 buf
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
/* Cache the peeked line so fp_readl can consume it later. */
610 tok
->decoding_buffer
= buf
;
613 return PyObject_Length(buf
) == 0;
/* buf_getc(): get_char callback for string-based tokenizing; advances
 * tok->str and masks to an unsigned byte. */
617 /* Fetch a byte from TOK, using the string buffer. */
620 buf_getc(struct tok_state
*tok
) {
621 return Py_CHARMASK(*tok
->str
++);
/* buf_ungetc(): push-back for the string buffer. Only asserts the byte
 * matches rather than writing it, since the buffer may be read-only.
 * NOTE(review): fragmentary — the tok->str decrement preceding the assert
 * is not visible in this listing. */
624 /* Unfetch a byte from TOK, using the string buffer. */
627 buf_ungetc(int c
, struct tok_state
*tok
) {
629 assert(Py_CHARMASK(*tok
->str
) == c
); /* tok->cur may point to read-only segment */
/* buf_setreadl(): string-mode counterpart of fp_setreadl — per the
 * comment it only records ENC. NOTE(review): body not visible here
 * (presumably stores enc into tok->enc). */
632 /* Set the readline function for TOK to ENC. For the string-based
633 tokenizer, this means to just record the encoding. */
636 buf_setreadl(struct tok_state
*tok
, const char* enc
) {
/* translate_into_utf8(): decode STR from ENC to a unicode object, then
 * re-encode as a UTF-8 bytes object. NOTE(review): fragmentary — the NULL
 * check on buf, the DECREF, and the return are not visible here. */
641 /* Return a UTF-8 encoding Python string object from the
642 C byte string STR, which is encoded with ENC. */
645 translate_into_utf8(const char* str
, const char* enc
) {
647 PyObject
* buf
= PyUnicode_Decode(str
, strlen(str
), enc
, NULL
);
650 utf8
= PyUnicode_AsUTF8String(buf
);
/* decode_str(): prepare an in-memory source string for tokenizing — strip
 * a BOM (check_bom with the buf_* callbacks), scan the first two physical
 * lines for a coding spec, and when an encoding was found translate the
 * whole buffer to UTF-8, keeping the bytes object alive in
 * tok->decoding_buffer so the returned char* stays valid.
 * NOTE(review): fragmentary — the newl[] assignments inside the scan loop,
 * lineno bookkeeping, and the final return of str are not visible. */
655 /* Decode a byte string STR for use as the buffer of TOK.
656 Look for encoding declarations inside STR, and record them
660 decode_str(const char *str
, struct tok_state
*tok
)
662 PyObject
* utf8
= NULL
;
664 const char *newl
[2] = {NULL
, NULL
};
668 if (!check_bom(buf_getc
, buf_ungetc
, buf_setreadl
, tok
))
669 return error_ret(tok
);
670 str
= tok
->str
; /* string after BOM if any */
/* A BOM fixes the encoding up front: convert immediately. */
672 if (tok
->enc
!= NULL
) {
673 utf8
= translate_into_utf8(str
, tok
->enc
);
675 return error_ret(tok
);
676 str
= PyBytes_AsString(utf8
);
/* Locate the first two line breaks for the coding-spec scan. */
678 for (s
= str
;; s
++) {
679 if (*s
== '\0') break;
680 else if (*s
== '\n') {
684 if (lineno
== 2) break;
688 /* need to check line 1 and 2 separately since check_coding_spec
689 assumes a single line as input */
691 if (!check_coding_spec(str
, newl
[0] - str
, tok
, buf_setreadl
))
692 return error_ret(tok
);
693 if (tok
->enc
== NULL
&& newl
[1]) {
694 if (!check_coding_spec(newl
[0]+1, newl
[1] - newl
[0],
696 return error_ret(tok
);
699 if (tok
->enc
!= NULL
) {
700 assert(utf8
== NULL
);
701 utf8
= translate_into_utf8(str
, tok
->enc
);
703 return error_ret(tok
);
704 str
= PyBytes_AS_STRING(utf8
);
705 assert(tok
->decoding_buffer
== NULL
);
706 assert(tok
->decoding_buffer
== NULL
);
707 tok
->decoding_buffer
= utf8
; /* CAUTION */
/* PyTokenizer_FromString(): build a tok_state over an in-memory string;
 * decode_str handles BOM/coding-spec, then all buffer pointers are aimed
 * at the (possibly re-encoded) string. Frees the tok_state when decoding
 * fails. NOTE(review): fragmentary — NULL checks and the return statement
 * are not visible in this listing. */
713 /* Set up tokenizer for string */
716 PyTokenizer_FromString(const char *str
)
718 struct tok_state
*tok
= tok_new();
721 str
= (char *)decode_str(str
, tok
);
723 PyTokenizer_Free(tok
);
727 /* XXX: constify members. */
728 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
/* PyTokenizer_FromUTF8(): like PyTokenizer_FromString but the caller
 * guarantees STR is already UTF-8 — so the state is forced to RAW, the
 * coding spec is marked as read, and tok->encoding is a heap copy of
 * "utf-8" (6 bytes incl. NUL). Frees tok on allocation failure.
 * NOTE(review): fragmentary — returns and intermediate NULL checks are
 * not visible in this listing. */
733 PyTokenizer_FromUTF8(const char *str
)
735 struct tok_state
*tok
= tok_new();
738 tok
->decoding_state
= STATE_RAW
;
739 tok
->read_coding_spec
= 1;
742 tok
->encoding
= (char *)PyMem_MALLOC(6);
743 if (!tok
->encoding
) {
744 PyTokenizer_Free(tok
);
747 strcpy(tok
->encoding
, "utf-8");
749 /* XXX: constify members. */
750 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
/* PyTokenizer_FromFile(): build a tok_state over FILE *fp with a fresh
 * BUFSIZ buffer; ps1/ps2 become the interactive prompts. When ENC is
 * given, it is heap-copied into tok->encoding (the parse tree takes the
 * copy, per the comment) and decoding jumps straight to NORMAL.
 * NOTE(review): fragmentary — tok->fp/prompt assignments and the return
 * statement are not visible in this listing. */
755 /* Set up tokenizer for file */
758 PyTokenizer_FromFile(FILE *fp
, char* enc
, char *ps1
, char *ps2
)
760 struct tok_state
*tok
= tok_new();
763 if ((tok
->buf
= (char *)PyMem_MALLOC(BUFSIZ
)) == NULL
) {
764 PyTokenizer_Free(tok
);
767 tok
->cur
= tok
->inp
= tok
->buf
;
768 tok
->end
= tok
->buf
+ BUFSIZ
;
771 tok
->nextprompt
= ps2
;
773 /* Must copy encoding declaration since it
774 gets copied into the parse tree. */
775 tok
->encoding
= PyMem_MALLOC(strlen(enc
)+1);
776 if (!tok
->encoding
) {
777 PyTokenizer_Free(tok
);
780 strcpy(tok
->encoding
, enc
);
781 tok
->decoding_state
= STATE_NORMAL
;
/* PyTokenizer_Free(): release everything a tok_state owns — the encoding
 * string, the readline/decoding-buffer references, and (file mode only)
 * the line buffer; string mode does not own tok->buf.
 * NOTE(review): fragmentary — the final PyMem_FREE(tok) is not visible
 * in this listing. */
787 /* Free a tok_state structure */
790 PyTokenizer_Free(struct tok_state
*tok
)
792 if (tok
->encoding
!= NULL
)
793 PyMem_FREE(tok
->encoding
);
795 Py_XDECREF(tok
->decoding_readline
);
796 Py_XDECREF(tok
->decoding_buffer
);
/* buf is owned only when reading from a file (see error_ret). */
798 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
799 PyMem_FREE(tok
->buf
);
/* tok_nextc(): return the next input character. Fast path serves bytes
 * already in [cur, inp). Otherwise refill: string mode advances to the
 * next '\n'/'\0'; interactive mode pulls a line via PyOS_Readline
 * (re-encoding it to UTF-8 when an encoding is set) and splices it onto
 * the existing buffer when a token is in progress; file mode reads via
 * decoding_fgets, growing the buffer by BUFSIZ until the line ends in
 * '\n' or EOF, then strips "\r\n" to "\n". Error codes land in tok->done.
 * NOTE(review): fragmentary listing — the outer for(;;) loop, many EOF/
 * error branches, and several assignments between the visible lines are
 * missing; do not infer completeness of any one path. */
803 /* Get next char, updating state; error code goes into tok->done */
806 tok_nextc(register struct tok_state
*tok
)
809 if (tok
->cur
!= tok
->inp
) {
810 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
812 if (tok
->done
!= E_OK
)
/* ---- string mode: no file pointer ---- */
814 if (tok
->fp
== NULL
) {
815 char *end
= strchr(tok
->inp
, '\n');
819 end
= strchr(tok
->inp
, '\0');
820 if (end
== tok
->inp
) {
825 if (tok
->start
== NULL
)
827 tok
->line_start
= tok
->cur
;
830 return Py_CHARMASK(*tok
->cur
++);
/* ---- interactive mode: prompt + PyOS_Readline ---- */
832 if (tok
->prompt
!= NULL
) {
833 char *newtok
= PyOS_Readline(stdin
, stdout
, tok
->prompt
);
835 if (tok
->encoding
&& newtok
&& *newtok
) {
836 /* Recode to UTF-8 */
839 PyObject
*u
= translate_into_utf8(newtok
, tok
->encoding
);
842 tok
->done
= E_DECODE
;
845 buflen
= PyBytes_GET_SIZE(u
);
846 buf
= PyBytes_AS_STRING(u
);
849 tok
->done
= E_DECODE
;
852 newtok
= PyMem_MALLOC(buflen
+1);
/* After the first line, switch to the continuation prompt. */
857 if (tok
->nextprompt
!= NULL
)
858 tok
->prompt
= tok
->nextprompt
;
861 else if (*newtok
== '\0') {
/* A token spans lines: append the new line to the old buffer. */
865 else if (tok
->start
!= NULL
) {
866 size_t start
= tok
->start
- tok
->buf
;
867 size_t oldlen
= tok
->cur
- tok
->buf
;
868 size_t newlen
= oldlen
+ strlen(newtok
);
869 char *buf
= tok
->buf
;
870 buf
= (char *)PyMem_REALLOC(buf
, newlen
+1);
873 PyMem_FREE(tok
->buf
);
880 tok
->cur
= tok
->buf
+ oldlen
;
881 tok
->line_start
= tok
->cur
;
882 strcpy(tok
->buf
+ oldlen
, newtok
);
884 tok
->inp
= tok
->buf
+ newlen
;
885 tok
->end
= tok
->inp
+ 1;
886 tok
->start
= tok
->buf
+ start
;
890 if (tok
->buf
!= NULL
)
891 PyMem_FREE(tok
->buf
);
893 tok
->line_start
= tok
->buf
;
895 tok
->line_start
= tok
->buf
;
896 tok
->inp
= strchr(tok
->buf
, '\0');
897 tok
->end
= tok
->inp
+ 1;
/* ---- file mode: decoding_fgets into a growable buffer ---- */
904 if (tok
->start
== NULL
) {
905 if (tok
->buf
== NULL
) {
907 PyMem_MALLOC(BUFSIZ
);
908 if (tok
->buf
== NULL
) {
912 tok
->end
= tok
->buf
+ BUFSIZ
;
914 if (decoding_fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
921 tok
->inp
= strchr(tok
->buf
, '\0');
922 done
= tok
->inp
[-1] == '\n';
926 cur
= tok
->cur
- tok
->buf
;
927 if (decoding_feof(tok
)) {
935 /* Read until '\n' or EOF */
937 Py_ssize_t curstart
= tok
->start
== NULL
? -1 :
938 tok
->start
- tok
->buf
;
939 Py_ssize_t curvalid
= tok
->inp
- tok
->buf
;
940 Py_ssize_t newsize
= curvalid
+ BUFSIZ
;
941 char *newbuf
= tok
->buf
;
942 newbuf
= (char *)PyMem_REALLOC(newbuf
,
944 if (newbuf
== NULL
) {
950 tok
->inp
= tok
->buf
+ curvalid
;
951 tok
->end
= tok
->buf
+ newsize
;
952 tok
->start
= curstart
< 0 ? NULL
:
954 if (decoding_fgets(tok
->inp
,
955 (int)(tok
->end
- tok
->inp
),
957 /* Break out early on decoding
958 errors, as tok->buf will be NULL
960 if (tok
->decoding_erred
)
962 /* Last line does not end in \n,
964 strcpy(tok
->inp
, "\n");
966 tok
->inp
= strchr(tok
->inp
, '\0');
967 done
= tok
->inp
[-1] == '\n';
969 if (tok
->buf
!= NULL
) {
970 tok
->cur
= tok
->buf
+ cur
;
971 tok
->line_start
= tok
->cur
;
972 /* replace "\r\n" with "\n" */
973 /* For Mac leave the \r, giving a syntax error */
975 if (pt
>= tok
->buf
&& *pt
== '\r') {
982 if (tok
->done
!= E_OK
) {
983 if (tok
->prompt
!= NULL
)
984 PySys_WriteStderr("\n");
/* tok_backup(): step tok->cur back one position; fatal if that would
 * move before the buffer start. NOTE(review): fragmentary — the EOF
 * guard and the write-back of c are not visible in this listing. */
993 /* Back-up one character */
996 tok_backup(register struct tok_state
*tok
, register int c
)
999 if (--tok
->cur
< tok
->buf
)
1000 Py_FatalError("tok_backup: beginning of buffer");
/* PyToken_OneChar(): map a single punctuation character to its token
 * constant. NOTE(review): fragmentary — the switch header and the default
 * (OP/ERRORTOKEN) case are not visible in this listing. */
1007 /* Return the token corresponding to a single character */
1010 PyToken_OneChar(int c
)
1013 case '(': return LPAR
;
1014 case ')': return RPAR
;
1015 case '[': return LSQB
;
1016 case ']': return RSQB
;
1017 case ':': return COLON
;
1018 case ',': return COMMA
;
1019 case ';': return SEMI
;
1020 case '+': return PLUS
;
1021 case '-': return MINUS
;
1022 case '*': return STAR
;
1023 case '/': return SLASH
;
1024 case '|': return VBAR
;
1025 case '&': return AMPER
;
1026 case '<': return LESS
;
1027 case '>': return GREATER
;
1028 case '=': return EQUAL
;
1029 case '.': return DOT
;
1030 case '%': return PERCENT
;
1031 case '{': return LBRACE
;
1032 case '}': return RBRACE
;
1033 case '^': return CIRCUMFLEX
;
1034 case '~': return TILDE
;
1035 case '@': return AT
;
/* PyToken_TwoChars(): map a two-character operator (c1 then c2) to its
 * token constant. NOTE(review): fragmentary — the outer switch on c1 and
 * the fallback return are not visible; each visible case is the inner
 * switch on c2 for a different c1. */
1042 PyToken_TwoChars(int c1
, int c2
)
1047 case '=': return EQEQUAL
;
1052 case '=': return NOTEQUAL
;
1057 case '>': return NOTEQUAL
;
1058 case '=': return LESSEQUAL
;
1059 case '<': return LEFTSHIFT
;
1064 case '=': return GREATEREQUAL
;
1065 case '>': return RIGHTSHIFT
;
1070 case '=': return PLUSEQUAL
;
1075 case '=': return MINEQUAL
;
1076 case '>': return RARROW
;
1081 case '*': return DOUBLESTAR
;
1082 case '=': return STAREQUAL
;
1087 case '/': return DOUBLESLASH
;
1088 case '=': return SLASHEQUAL
;
1093 case '=': return VBAREQUAL
;
1098 case '=': return PERCENTEQUAL
;
1103 case '=': return AMPEREQUAL
;
1108 case '=': return CIRCUMFLEXEQUAL
;
/* PyToken_ThreeChars(): map a three-character operator to its token
 * constant (<<=, >>=, **=, //=). NOTE(review): fragmentary — the nested
 * switch structure and the fallback return are not visible. */
1116 PyToken_ThreeChars(int c1
, int c2
, int c3
)
1124 return LEFTSHIFTEQUAL
;
1134 return RIGHTSHIFTEQUAL
;
1144 return DOUBLESTAREQUAL
;
1154 return DOUBLESLASHEQUAL
;
/* indenterror(): react to inconsistent tab/space indentation — hard error
 * (E_TABSPACE, input discarded) when alterror is set, otherwise a one-time
 * warning to stderr controlled by altwarning.
 * NOTE(review): fragmentary — the return values of the two branches are
 * not visible in this listing. */
1174 indenterror(struct tok_state
*tok
)
1176 if (tok
->alterror
) {
1177 tok
->done
= E_TABSPACE
;
1178 tok
->cur
= tok
->inp
;
1181 if (tok
->altwarning
) {
1182 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1183 "in indentation\n", tok
->filename
);
/* Warn only once per tokenizer. */
1184 tok
->altwarning
= 0;
/* verify_identifier(): validate the token text [tok->start, tok->cur) as a
 * PEP 3131 identifier — decode as UTF-8 and ask PyUnicode_IsIdentifier;
 * sets tok->done to E_IDENTIFIER (bad identifier / bad UTF-8) or E_ERROR
 * (other failure). The #define above is the trivial always-true stub used
 * when this build does not perform the check.
 * NOTE(review): fragmentary — the #ifdef/#else framing, DECREF of s, and
 * return statements are not visible in this listing. */
1190 #define verify_identifier(tok) 1
1192 /* Verify that the identifier follows PEP 3131. */
1194 verify_identifier(struct tok_state
*tok
)
1198 s
= PyUnicode_DecodeUTF8(tok
->start
, tok
->cur
- tok
->start
, NULL
);
1200 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
)) {
1202 tok
->done
= E_IDENTIFIER
;
1204 tok
->done
= E_ERROR
;
1208 result
= PyUnicode_IsIdentifier(s
);
1211 tok
->done
= E_IDENTIFIER
;
/* tok_get(): the core tokenizer loop. Computes indentation (col counts
 * tabs via tabsize, altcol via alttabsize to detect tab/space mixing),
 * emits INDENT/DEDENT via the indstack/pendin machinery, then scans the
 * next token: identifiers (with b/r prefixes and PEP 3131 verification),
 * NEWLINE, numbers (hex/octal/binary/decimal/float/imaginary), strings
 * with 1- or 3-quote delimiters, line continuations, and one/two/three-
 * character operators. *p_start/*p_end delimit the token text in tok's
 * buffer. NOTE(review): this listing is heavily fragmentary — labels
 * (again/nextline/fraction/exponent etc.), many tok_nextc calls, the
 * paren-level bookkeeping and numerous returns are missing between the
 * visible lines; do not treat any branch shown as complete. */
1216 /* Get next token, after space stripping etc. */
1219 tok_get(register struct tok_state
*tok
, char **p_start
, char **p_end
)
1222 int blankline
, nonascii
;
1224 *p_start
= *p_end
= NULL
;
1229 /* Get indentation level */
1231 register int col
= 0;
1232 register int altcol
= 0;
/* Tabs round col up to the next tab stop; altcol uses the
 * alternate tab size to catch inconsistent tab/space usage. */
1238 else if (c
== '\t') {
1239 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
1240 altcol
= (altcol
/tok
->alttabsize
+ 1)
1243 else if (c
== '\014') /* Control-L (formfeed) */
1244 col
= altcol
= 0; /* For Emacs users */
1249 if (c
== '#' || c
== '\n') {
1250 /* Lines with only whitespace and/or comments
1251 shouldn't affect the indentation and are
1252 not passed to the parser as NEWLINE tokens,
1253 except *totally* empty lines in interactive
1254 mode, which signal the end of a command group. */
1255 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
1256 blankline
= 0; /* Let it through */
1258 blankline
= 1; /* Ignore completely */
1259 /* We can't jump back right here since we still
1260 may need to skip to the end of a comment */
/* Indentation changes are suppressed inside brackets (level>0). */
1262 if (!blankline
&& tok
->level
== 0) {
1263 if (col
== tok
->indstack
[tok
->indent
]) {
1265 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1266 if (indenterror(tok
))
1270 else if (col
> tok
->indstack
[tok
->indent
]) {
1271 /* Indent -- always one */
1272 if (tok
->indent
+1 >= MAXINDENT
) {
1273 tok
->done
= E_TOODEEP
;
1274 tok
->cur
= tok
->inp
;
1277 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
1278 if (indenterror(tok
))
1282 tok
->indstack
[++tok
->indent
] = col
;
1283 tok
->altindstack
[tok
->indent
] = altcol
;
1285 else /* col < tok->indstack[tok->indent] */ {
1286 /* Dedent -- any number, must be consistent */
1287 while (tok
->indent
> 0 &&
1288 col
< tok
->indstack
[tok
->indent
]) {
1292 if (col
!= tok
->indstack
[tok
->indent
]) {
1293 tok
->done
= E_DEDENT
;
1294 tok
->cur
= tok
->inp
;
1297 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1298 if (indenterror(tok
))
1305 tok
->start
= tok
->cur
;
1307 /* Return pending indents/dedents */
1308 if (tok
->pendin
!= 0) {
1309 if (tok
->pendin
< 0) {
/* Skip spaces/tabs/formfeeds between tokens. */
1324 } while (c
== ' ' || c
== '\t' || c
== '\014');
1326 /* Set start of current token */
1327 tok
->start
= tok
->cur
- 1;
/* Comment: consume to end of line. */
1331 while (c
!= EOF
&& c
!= '\n')
1334 /* Check for EOF and errors now */
1336 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
1339 /* Identifier (most frequent token!) */
1341 if (is_potential_identifier_start(c
)) {
1342 /* Process b"", r"" and br"" */
1343 if (c
== 'b' || c
== 'B') {
1345 if (c
== '"' || c
== '\'')
1348 if (c
== 'r' || c
== 'R') {
1350 if (c
== '"' || c
== '\'')
1353 while (is_potential_identifier_char(c
)) {
1360 !verify_identifier(tok
)) {
1361 tok
->done
= E_IDENTIFIER
;
1364 *p_start
= tok
->start
;
/* Newline: swallowed when blank or inside brackets. */
1372 if (blankline
|| tok
->level
> 0)
1374 *p_start
= tok
->start
;
1375 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
1380 /* Period or number starting with period? */
1385 } else if (c
== '.') {
1388 *p_start
= tok
->start
;
1394 tok_backup(tok
, '.');
1398 *p_start
= tok
->start
;
1406 /* Hex, octal or binary -- maybe. */
1410 #ifndef WITHOUT_COMPLEX
1411 if (c
== 'j' || c
== 'J')
1414 if (c
== 'x' || c
== 'X') {
1419 tok
->done
= E_TOKEN
;
1425 } while (isxdigit(c
));
1427 else if (c
== 'o' || c
== 'O') {
1430 if (c
< '0' || c
>= '8') {
1431 tok
->done
= E_TOKEN
;
1437 } while ('0' <= c
&& c
< '8');
1439 else if (c
== 'b' || c
== 'B') {
1442 if (c
!= '0' && c
!= '1') {
1443 tok
->done
= E_TOKEN
;
1449 } while (c
== '0' || c
== '1');
1453 /* maybe old-style octal; c is first char of it */
1454 /* in any case, allow '0' as a literal */
1457 while (isdigit(c
)) {
1463 else if (c
== 'e' || c
== 'E')
1465 #ifndef WITHOUT_COMPLEX
1466 else if (c
== 'j' || c
== 'J')
1470 tok
->done
= E_TOKEN
;
1480 } while (isdigit(c
));
1482 /* Accept floating point numbers. */
1488 } while (isdigit(c
));
1490 if (c
== 'e' || c
== 'E') {
1494 if (c
== '+' || c
== '-')
1497 tok
->done
= E_TOKEN
;
1503 } while (isdigit(c
));
1505 #ifndef WITHOUT_COMPLEX
1506 if (c
== 'j' || c
== 'J')
1507 /* Imaginary part */
1514 *p_start
= tok
->start
;
/* String literal: detect ' or ", then 1- vs 3-quote form. */
1521 if (c
== '\'' || c
== '"') {
1523 int quote_size
= 1; /* 1 or 3 */
1524 int end_quote_size
= 0;
1526 /* Find the quote size and start of string */
1533 end_quote_size
= 1; /* empty string found */
1538 /* Get rest of string */
1539 while (end_quote_size
!= quote_size
) {
1542 if (quote_size
== 3)
1546 tok
->cur
= tok
->inp
;
1549 if (quote_size
== 1 && c
== '\n') {
1551 tok
->cur
= tok
->inp
;
1555 end_quote_size
+= 1;
1559 c
= tok_nextc(tok
); /* skip escaped char */
1563 *p_start
= tok
->start
;
1568 /* Line continuation */
1572 tok
->done
= E_LINECONT
;
1573 tok
->cur
= tok
->inp
;
1577 goto again
; /* Read next line */
1580 /* Check for two-character token */
1582 int c2
= tok_nextc(tok
);
1583 int token
= PyToken_TwoChars(c
, c2
);
1585 int c3
= tok_nextc(tok
);
1586 int token3
= PyToken_ThreeChars(c
, c2
, c3
);
1590 tok_backup(tok
, c3
);
1592 *p_start
= tok
->start
;
1596 tok_backup(tok
, c2
);
1599 /* Keep track of parentheses nesting level */
1613 /* Punctuation character */
1614 *p_start
= tok
->start
;
1616 return PyToken_OneChar(c
);
/* PyTokenizer_Get(): public wrapper over tok_get() that converts any
 * pending decoding error into ERRORTOKEN/E_DECODE.
 * NOTE(review): fragmentary — the return of result is not visible. */
1620 PyTokenizer_Get(struct tok_state
*tok
, char **p_start
, char **p_end
)
1622 int result
= tok_get(tok
, p_start
, p_end
);
1623 if (tok
->decoding_erred
) {
1624 result
= ERRORTOKEN
;
1625 tok
->done
= E_DECODE
;
/* PyTokenizer_FindEncoding(): tokenize the first two lines of the file on
 * FD to trigger coding-spec/BOM detection, then return a PyMem_MALLOC'ed
 * copy of tok->encoding (caller frees), or NULL when none was declared.
 * NOTE(review): fragmentary — fdopen/FromFile NULL checks, the fclose,
 * and the return statement are not visible in this listing. */
1630 /* Get -*- encoding -*- from a Python file.
1632 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1633 the first or second line of the file (in which case the encoding
1634 should be assumed to be PyUnicode_GetDefaultEncoding()).
1636 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1640 PyTokenizer_FindEncoding(int fd
)
1642 struct tok_state
*tok
;
1644 char *p_start
=NULL
, *p_end
=NULL
, *encoding
= NULL
;
1650 fp
= fdopen(fd
, "r");
1654 tok
= PyTokenizer_FromFile(fp
, NULL
, NULL
, NULL
);
/* Encoding declarations can only appear on lines 1-2. */
1659 while (tok
->lineno
< 2 && tok
->done
== E_OK
) {
1660 PyTokenizer_Get(tok
, &p_start
, &p_end
);
1663 if (tok
->encoding
) {
1664 encoding
= (char *)PyMem_MALLOC(strlen(tok
->encoding
) + 1);
1666 strcpy(encoding
, tok
->encoding
);
1668 PyTokenizer_Free(tok
);
/* tok_dump(): debug helper — print the token's name and, for NAME/NUMBER/
 * STRING/OP, the token text between START and END.
 * NOTE(review): fragmentary — presumably compiled only under a debug
 * #ifdef; the surrounding guard is not visible in this listing. */
1675 tok_dump(int type
, char *start
, char *end
)
1677 printf("%s", _PyParser_TokenNames
[type
]);
1678 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
1679 printf("(%.*s)", (int)(end
- start
), start
);