2 /* Tokenizer implementation */
5 #include "pgenheaders.h"
10 #include "tokenizer.h"
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
21 #define is_potential_identifier_start(c) (\
22 (c >= 'a' && c <= 'z')\
23 || (c >= 'A' && c <= 'Z')\
27 #define is_potential_identifier_char(c) (\
28 (c >= 'a' && c <= 'z')\
29 || (c >= 'A' && c <= 'Z')\
30 || (c >= '0' && c <= '9')\
34 extern char *PyOS_Readline(FILE *, FILE *, char *);
35 /* Return malloc'ed string including trailing \n;
36 empty malloc'ed string for EOF;
37 NULL if interrupted */
39 /* Don't ever change this -- it would break the portability of Python code */
43 static struct tok_state
*tok_new(void);
44 static int tok_nextc(struct tok_state
*tok
);
45 static void tok_backup(struct tok_state
*tok
, int c
);
50 char *_PyParser_TokenNames
[] = {
103 /* This table must match the #defines in token.h! */
110 /* Create and initialize a new tok_state structure */
112 static struct tok_state
*
115 struct tok_state
*tok
= (struct tok_state
*)PyMem_MALLOC(
116 sizeof(struct tok_state
));
119 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
122 tok
->tabsize
= TABSIZE
;
124 tok
->indstack
[0] = 0;
127 tok
->prompt
= tok
->nextprompt
= NULL
;
130 tok
->filename
= NULL
;
134 tok
->altindstack
[0] = 0;
135 tok
->decoding_state
= STATE_INIT
;
136 tok
->decoding_erred
= 0;
137 tok
->read_coding_spec
= 0;
139 tok
->encoding
= NULL
;
142 tok
->decoding_readline
= NULL
;
143 tok
->decoding_buffer
= NULL
;
151 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
153 return fgets(s
, size
, tok
->fp
);
157 decoding_feof(struct tok_state
*tok
)
159 return feof(tok
->fp
);
163 decode_str(const char *str
, struct tok_state
*tok
)
171 error_ret(struct tok_state
*tok
) /* XXX */
173 tok
->decoding_erred
= 1;
174 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
) /* see PyTokenizer_Free */
175 PyMem_FREE(tok
->buf
);
177 return NULL
; /* as if it were EOF */
181 new_string(const char *s
, Py_ssize_t len
)
183 char* result
= (char *)PyMem_MALLOC(len
+ 1);
184 if (result
!= NULL
) {
185 memcpy(result
, s
, len
);
/* Normalize an encoding name: lowercase it, map '_' to '-', and fold
   common spellings of utf-8 and latin-1 onto their canonical names.
   Only the first 12 characters are significant.  Returns a string
   literal for recognized aliases, otherwise S itself (not a copy). */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
214 /* Return the coding spec in S, or NULL if none is found. */
217 get_coding_spec(const char *s
, Py_ssize_t size
)
220 /* Coding spec must be in a comment, and that comment must be
221 * the only statement on the source code line. */
222 for (i
= 0; i
< size
- 6; i
++) {
225 if (s
[i
] != ' ' && s
[i
] != '\t' && s
[i
] != '\014')
228 for (; i
< size
- 6; i
++) { /* XXX inefficient search */
229 const char* t
= s
+ i
;
230 if (strncmp(t
, "coding", 6) == 0) {
231 const char* begin
= NULL
;
233 if (t
[0] != ':' && t
[0] != '=')
237 } while (t
[0] == '\x20' || t
[0] == '\t');
240 while (isalnum(Py_CHARMASK(t
[0])) ||
241 t
[0] == '-' || t
[0] == '_' || t
[0] == '.')
245 char* r
= new_string(begin
, t
- begin
);
246 char* q
= get_normal_name(r
);
249 r
= new_string(q
, strlen(q
));
258 /* Check whether the line contains a coding spec. If it does,
259 invoke the set_readline function for the new encoding.
260 This function receives the tok_state and the new encoding.
261 Return 1 on success, 0 on failure. */
264 check_coding_spec(const char* line
, Py_ssize_t size
, struct tok_state
*tok
,
265 int set_readline(struct tok_state
*, const char *))
271 /* It's a continuation line, so it can't be a coding spec. */
273 cs
= get_coding_spec(line
, size
);
275 tok
->read_coding_spec
= 1;
276 if (tok
->encoding
== NULL
) {
277 assert(tok
->decoding_state
== STATE_RAW
);
278 if (strcmp(cs
, "utf-8") == 0) {
281 r
= set_readline(tok
, cs
);
284 tok
->decoding_state
= STATE_NORMAL
;
289 } else { /* then, compare cs with BOM */
290 r
= (strcmp(tok
->encoding
, cs
) == 0);
298 PyErr_Format(PyExc_SyntaxError
, "encoding problem: %s", cs
);
303 /* See whether the file starts with a BOM. If it does,
304 invoke the set_readline function with the new encoding.
305 Return 1 on success, 0 on failure. */
308 check_bom(int get_char(struct tok_state
*),
309 void unget_char(int, struct tok_state
*),
310 int set_readline(struct tok_state
*, const char *),
311 struct tok_state
*tok
)
313 int ch
= get_char(tok
);
314 tok
->decoding_state
= STATE_RAW
;
317 } else if (ch
== 0xEF) {
321 unget_char(0xEF, tok
);
322 /* any token beginning with '\xEF' is a bad token */
328 unget_char(0xBB, tok
);
329 unget_char(0xEF, tok
);
330 /* any token beginning with '\xEF' is a bad token */
334 /* Disable support for UTF-16 BOMs until a decision
335 is made whether this needs to be supported. */
336 } else if (ch
== 0xFE) {
337 ch
= get_char(tok
); if (ch
!= 0xFF) goto NON_BOM
;
338 if (!set_readline(tok
, "utf-16-be")) return 0;
339 tok
->decoding_state
= STATE_NORMAL
;
340 } else if (ch
== 0xFF) {
341 ch
= get_char(tok
); if (ch
!= 0xFE) goto NON_BOM
;
342 if (!set_readline(tok
, "utf-16-le")) return 0;
343 tok
->decoding_state
= STATE_NORMAL
;
349 if (tok
->encoding
!= NULL
)
350 PyMem_FREE(tok
->encoding
);
351 tok
->encoding
= new_string("utf-8", 5); /* resulting is in utf-8 */
352 /* No need to set_readline: input is already utf-8 */
356 /* Read a line of text from TOK into S, using the stream in TOK.
357 Return NULL on failure, else S.
359 On entry, tok->decoding_buffer will be one of:
360 1) NULL: need to call tok->decoding_readline to get a new line
361 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
362 stored the result in tok->decoding_buffer
363 3) PyByteArrayObject *: previous call to fp_readl did not have enough room
364 (in the s buffer) to copy entire contents of the line read
365 by tok->decoding_readline. tok->decoding_buffer has the overflow.
366 In this case, fp_readl is called in a loop (with an expanded buffer)
367 until the buffer ends with a '\n' (or until the end of the file is
368 reached): see tok_nextc and its calls to decoding_fgets.
372 fp_readl(char *s
, int size
, struct tok_state
*tok
)
378 /* Ask for one less byte so we can terminate it */
382 if (tok
->decoding_buffer
) {
383 bufobj
= tok
->decoding_buffer
;
388 bufobj
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
392 if (PyUnicode_CheckExact(bufobj
))
394 buf
= _PyUnicode_AsStringAndSize(bufobj
, &buflen
);
401 buf
= PyByteArray_AsString(bufobj
);
405 buflen
= PyByteArray_GET_SIZE(bufobj
);
408 Py_XDECREF(tok
->decoding_buffer
);
410 /* Too many chars, the rest goes into tok->decoding_buffer */
411 tok
->decoding_buffer
= PyByteArray_FromStringAndSize(buf
+size
,
413 if (tok
->decoding_buffer
== NULL
)
418 tok
->decoding_buffer
= NULL
;
420 memcpy(s
, buf
, buflen
);
422 if (buflen
== 0) /* EOF */
429 return error_ret(tok
);
432 /* Set the readline function for TOK to a StreamReader's
433 readline function. The StreamReader is named ENC.
435 This function is called from check_bom and check_coding_spec.
437 ENC is usually identical to the future value of tok->encoding,
438 except for the (currently unsupported) case of UTF-16.
440 Return 1 on success, 0 on failure. */
443 fp_setreadl(struct tok_state
*tok
, const char* enc
)
445 PyObject
*readline
= NULL
, *stream
= NULL
, *io
= NULL
;
447 io
= PyImport_ImportModuleNoBlock("io");
452 stream
= PyObject_CallMethod(io
, "open", "ssis",
453 tok
->filename
, "r", -1, enc
);
455 stream
= PyObject_CallMethod(io
, "open", "isisOOO",
456 fileno(tok
->fp
), "r", -1, enc
, Py_None
, Py_None
, Py_False
);
460 Py_XDECREF(tok
->decoding_readline
);
461 readline
= PyObject_GetAttrString(stream
, "readline");
462 tok
->decoding_readline
= readline
;
464 /* The file has been reopened; parsing will restart from
465 * the beginning of the file, we have to reset the line number.
466 * But this function has been called from inside tok_nextc() which
467 * will increment lineno before it returns. So we set it -1 so that
468 * the next call to tok_nextc() will start with tok->lineno == 0.
475 return readline
!= NULL
;
478 /* Fetch the next byte from TOK. */
480 static int fp_getc(struct tok_state
*tok
) {
481 return getc(tok
->fp
);
484 /* Unfetch the last byte back into TOK. */
486 static void fp_ungetc(int c
, struct tok_state
*tok
) {
/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not. */
static int
valid_utf8(const unsigned char *s)
{
    int expected = 0;
    int length;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* lone continuation byte: invalid as a sequence start */
        return 0;
    if (*s < 0xE0)
        expected = 1;           /* 2-byte sequence */
    else if (*s < 0xF0)
        expected = 2;           /* 3-byte sequence */
    else if (*s < 0xF8)
        expected = 3;           /* 4-byte sequence */
    else
        return 0;               /* 0xF8..0xFF never start UTF-8 */
    length = expected + 1;
    /* Every continuation byte must be in 0x80..0xBF. */
    for (; expected; expected--)
        if (s[expected] < 0x80 || s[expected] >= 0xC0)
            return 0;
    return length;
}
518 /* Read a line of input from TOK. Determine encoding
522 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
527 if (tok
->decoding_state
== STATE_NORMAL
) {
528 /* We already have a codec associated with
530 line
= fp_readl(s
, size
, tok
);
532 } else if (tok
->decoding_state
== STATE_RAW
) {
533 /* We want a 'raw' read. */
534 line
= Py_UniversalNewlineFgets(s
, size
,
538 /* We have not yet determined the encoding.
539 If an encoding is found, use the file-pointer
540 reader functions from now on. */
541 if (!check_bom(fp_getc
, fp_ungetc
, fp_setreadl
, tok
))
542 return error_ret(tok
);
543 assert(tok
->decoding_state
!= STATE_INIT
);
546 if (line
!= NULL
&& tok
->lineno
< 2 && !tok
->read_coding_spec
) {
547 if (!check_coding_spec(line
, strlen(line
), tok
, fp_setreadl
)) {
548 return error_ret(tok
);
552 /* The default encoding is UTF-8, so make sure we don't have any
553 non-UTF-8 sequences in it. */
554 if (line
&& !tok
->encoding
) {
557 for (c
= (unsigned char *)line
; *c
; c
+= length
)
558 if (!(length
= valid_utf8(c
))) {
565 /* Need to add 1 to the line number, since this line
566 has not been counted, yet. */
568 "Non-UTF-8 code starting with '\\x%.2x' "
569 "in file %.200s on line %i, "
570 "but no encoding declared; "
571 "see http://python.org/dev/peps/pep-0263/ for details",
572 badchar
, tok
->filename
, tok
->lineno
+ 1);
573 PyErr_SetString(PyExc_SyntaxError
, buf
);
574 return error_ret(tok
);
581 decoding_feof(struct tok_state
*tok
)
583 if (tok
->decoding_state
!= STATE_NORMAL
) {
584 return feof(tok
->fp
);
586 PyObject
* buf
= tok
->decoding_buffer
;
588 buf
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
593 tok
->decoding_buffer
= buf
;
596 return PyObject_Length(buf
) == 0;
600 /* Fetch a byte from TOK, using the string buffer. */
603 buf_getc(struct tok_state
*tok
) {
604 return Py_CHARMASK(*tok
->str
++);
607 /* Unfetch a byte from TOK, using the string buffer. */
610 buf_ungetc(int c
, struct tok_state
*tok
) {
612 assert(Py_CHARMASK(*tok
->str
) == c
); /* tok->cur may point to read-only segment */
615 /* Set the readline function for TOK to ENC. For the string-based
616 tokenizer, this means to just record the encoding. */
619 buf_setreadl(struct tok_state
*tok
, const char* enc
) {
624 /* Return a UTF-8 encoding Python string object from the
625 C byte string STR, which is encoded with ENC. */
628 translate_into_utf8(const char* str
, const char* enc
) {
630 PyObject
* buf
= PyUnicode_Decode(str
, strlen(str
), enc
, NULL
);
633 utf8
= PyUnicode_AsUTF8String(buf
);
638 /* Decode a byte string STR for use as the buffer of TOK.
639 Look for encoding declarations inside STR, and record them
643 decode_str(const char *str
, struct tok_state
*tok
)
645 PyObject
* utf8
= NULL
;
647 const char *newl
[2] = {NULL
, NULL
};
651 if (!check_bom(buf_getc
, buf_ungetc
, buf_setreadl
, tok
))
652 return error_ret(tok
);
653 str
= tok
->str
; /* string after BOM if any */
655 if (tok
->enc
!= NULL
) {
656 utf8
= translate_into_utf8(str
, tok
->enc
);
658 return error_ret(tok
);
659 str
= PyBytes_AsString(utf8
);
661 for (s
= str
;; s
++) {
662 if (*s
== '\0') break;
663 else if (*s
== '\n') {
667 if (lineno
== 2) break;
671 /* need to check line 1 and 2 separately since check_coding_spec
672 assumes a single line as input */
674 if (!check_coding_spec(str
, newl
[0] - str
, tok
, buf_setreadl
))
675 return error_ret(tok
);
676 if (tok
->enc
== NULL
&& newl
[1]) {
677 if (!check_coding_spec(newl
[0]+1, newl
[1] - newl
[0],
679 return error_ret(tok
);
682 if (tok
->enc
!= NULL
) {
683 assert(utf8
== NULL
);
684 utf8
= translate_into_utf8(str
, tok
->enc
);
686 return error_ret(tok
);
687 str
= PyBytes_AS_STRING(utf8
);
689 assert(tok
->decoding_buffer
== NULL
);
690 tok
->decoding_buffer
= utf8
; /* CAUTION */
696 /* Set up tokenizer for string */
699 PyTokenizer_FromString(const char *str
)
701 struct tok_state
*tok
= tok_new();
704 str
= (char *)decode_str(str
, tok
);
706 PyTokenizer_Free(tok
);
710 /* XXX: constify members. */
711 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
716 PyTokenizer_FromUTF8(const char *str
)
718 struct tok_state
*tok
= tok_new();
721 tok
->decoding_state
= STATE_RAW
;
722 tok
->read_coding_spec
= 1;
725 tok
->encoding
= (char *)PyMem_MALLOC(6);
726 if (!tok
->encoding
) {
727 PyTokenizer_Free(tok
);
730 strcpy(tok
->encoding
, "utf-8");
732 /* XXX: constify members. */
733 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
738 /* Set up tokenizer for file */
741 PyTokenizer_FromFile(FILE *fp
, char* enc
, char *ps1
, char *ps2
)
743 struct tok_state
*tok
= tok_new();
746 if ((tok
->buf
= (char *)PyMem_MALLOC(BUFSIZ
)) == NULL
) {
747 PyTokenizer_Free(tok
);
750 tok
->cur
= tok
->inp
= tok
->buf
;
751 tok
->end
= tok
->buf
+ BUFSIZ
;
754 tok
->nextprompt
= ps2
;
756 /* Must copy encoding declaration since it
757 gets copied into the parse tree. */
758 tok
->encoding
= PyMem_MALLOC(strlen(enc
)+1);
759 if (!tok
->encoding
) {
760 PyTokenizer_Free(tok
);
763 strcpy(tok
->encoding
, enc
);
764 tok
->decoding_state
= STATE_NORMAL
;
770 /* Free a tok_state structure */
773 PyTokenizer_Free(struct tok_state
*tok
)
775 if (tok
->encoding
!= NULL
)
776 PyMem_FREE(tok
->encoding
);
778 Py_XDECREF(tok
->decoding_readline
);
779 Py_XDECREF(tok
->decoding_buffer
);
781 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
782 PyMem_FREE(tok
->buf
);
786 /* Get next char, updating state; error code goes into tok->done */
789 tok_nextc(register struct tok_state
*tok
)
792 if (tok
->cur
!= tok
->inp
) {
793 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
795 if (tok
->done
!= E_OK
)
797 if (tok
->fp
== NULL
) {
798 char *end
= strchr(tok
->inp
, '\n');
802 end
= strchr(tok
->inp
, '\0');
803 if (end
== tok
->inp
) {
808 if (tok
->start
== NULL
)
810 tok
->line_start
= tok
->cur
;
813 return Py_CHARMASK(*tok
->cur
++);
815 if (tok
->prompt
!= NULL
) {
816 char *newtok
= PyOS_Readline(stdin
, stdout
, tok
->prompt
);
818 if (tok
->encoding
&& newtok
&& *newtok
) {
819 /* Recode to UTF-8 */
822 PyObject
*u
= translate_into_utf8(newtok
, tok
->encoding
);
825 tok
->done
= E_DECODE
;
828 buflen
= PyBytes_GET_SIZE(u
);
829 buf
= PyBytes_AS_STRING(u
);
832 tok
->done
= E_DECODE
;
835 newtok
= PyMem_MALLOC(buflen
+1);
840 if (tok
->nextprompt
!= NULL
)
841 tok
->prompt
= tok
->nextprompt
;
844 else if (*newtok
== '\0') {
848 else if (tok
->start
!= NULL
) {
849 size_t start
= tok
->start
- tok
->buf
;
850 size_t oldlen
= tok
->cur
- tok
->buf
;
851 size_t newlen
= oldlen
+ strlen(newtok
);
852 char *buf
= tok
->buf
;
853 buf
= (char *)PyMem_REALLOC(buf
, newlen
+1);
856 PyMem_FREE(tok
->buf
);
863 tok
->cur
= tok
->buf
+ oldlen
;
864 tok
->line_start
= tok
->cur
;
865 strcpy(tok
->buf
+ oldlen
, newtok
);
867 tok
->inp
= tok
->buf
+ newlen
;
868 tok
->end
= tok
->inp
+ 1;
869 tok
->start
= tok
->buf
+ start
;
873 if (tok
->buf
!= NULL
)
874 PyMem_FREE(tok
->buf
);
876 tok
->line_start
= tok
->buf
;
878 tok
->line_start
= tok
->buf
;
879 tok
->inp
= strchr(tok
->buf
, '\0');
880 tok
->end
= tok
->inp
+ 1;
887 if (tok
->start
== NULL
) {
888 if (tok
->buf
== NULL
) {
890 PyMem_MALLOC(BUFSIZ
);
891 if (tok
->buf
== NULL
) {
895 tok
->end
= tok
->buf
+ BUFSIZ
;
897 if (decoding_fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
904 tok
->inp
= strchr(tok
->buf
, '\0');
905 done
= tok
->inp
[-1] == '\n';
909 cur
= tok
->cur
- tok
->buf
;
910 if (decoding_feof(tok
)) {
918 /* Read until '\n' or EOF */
920 Py_ssize_t curstart
= tok
->start
== NULL
? -1 :
921 tok
->start
- tok
->buf
;
922 Py_ssize_t curvalid
= tok
->inp
- tok
->buf
;
923 Py_ssize_t newsize
= curvalid
+ BUFSIZ
;
924 char *newbuf
= tok
->buf
;
925 newbuf
= (char *)PyMem_REALLOC(newbuf
,
927 if (newbuf
== NULL
) {
933 tok
->inp
= tok
->buf
+ curvalid
;
934 tok
->end
= tok
->buf
+ newsize
;
935 tok
->start
= curstart
< 0 ? NULL
:
937 if (decoding_fgets(tok
->inp
,
938 (int)(tok
->end
- tok
->inp
),
940 /* Break out early on decoding
941 errors, as tok->buf will be NULL
943 if (tok
->decoding_erred
)
945 /* Last line does not end in \n,
947 strcpy(tok
->inp
, "\n");
949 tok
->inp
= strchr(tok
->inp
, '\0');
950 done
= tok
->inp
[-1] == '\n';
952 if (tok
->buf
!= NULL
) {
953 tok
->cur
= tok
->buf
+ cur
;
954 tok
->line_start
= tok
->cur
;
955 /* replace "\r\n" with "\n" */
956 /* For Mac leave the \r, giving a syntax error */
958 if (pt
>= tok
->buf
&& *pt
== '\r') {
965 if (tok
->done
!= E_OK
) {
966 if (tok
->prompt
!= NULL
)
967 PySys_WriteStderr("\n");
976 /* Back-up one character */
979 tok_backup(register struct tok_state
*tok
, register int c
)
982 if (--tok
->cur
< tok
->buf
)
983 Py_FatalError("tok_backup: begin of buffer");
990 /* Return the token corresponding to a single character */
993 PyToken_OneChar(int c
)
996 case '(': return LPAR
;
997 case ')': return RPAR
;
998 case '[': return LSQB
;
999 case ']': return RSQB
;
1000 case ':': return COLON
;
1001 case ',': return COMMA
;
1002 case ';': return SEMI
;
1003 case '+': return PLUS
;
1004 case '-': return MINUS
;
1005 case '*': return STAR
;
1006 case '/': return SLASH
;
1007 case '|': return VBAR
;
1008 case '&': return AMPER
;
1009 case '<': return LESS
;
1010 case '>': return GREATER
;
1011 case '=': return EQUAL
;
1012 case '.': return DOT
;
1013 case '%': return PERCENT
;
1014 case '{': return LBRACE
;
1015 case '}': return RBRACE
;
1016 case '^': return CIRCUMFLEX
;
1017 case '~': return TILDE
;
1018 case '@': return AT
;
1025 PyToken_TwoChars(int c1
, int c2
)
1030 case '=': return EQEQUAL
;
1035 case '=': return NOTEQUAL
;
1040 case '>': return NOTEQUAL
;
1041 case '=': return LESSEQUAL
;
1042 case '<': return LEFTSHIFT
;
1047 case '=': return GREATEREQUAL
;
1048 case '>': return RIGHTSHIFT
;
1053 case '=': return PLUSEQUAL
;
1058 case '=': return MINEQUAL
;
1059 case '>': return RARROW
;
1064 case '*': return DOUBLESTAR
;
1065 case '=': return STAREQUAL
;
1070 case '/': return DOUBLESLASH
;
1071 case '=': return SLASHEQUAL
;
1076 case '=': return VBAREQUAL
;
1081 case '=': return PERCENTEQUAL
;
1086 case '=': return AMPEREQUAL
;
1091 case '=': return CIRCUMFLEXEQUAL
;
1099 PyToken_ThreeChars(int c1
, int c2
, int c3
)
1107 return LEFTSHIFTEQUAL
;
1117 return RIGHTSHIFTEQUAL
;
1127 return DOUBLESTAREQUAL
;
1137 return DOUBLESLASHEQUAL
;
1157 indenterror(struct tok_state
*tok
)
1159 if (tok
->alterror
) {
1160 tok
->done
= E_TABSPACE
;
1161 tok
->cur
= tok
->inp
;
1164 if (tok
->altwarning
) {
1165 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1166 "in indentation\n", tok
->filename
);
1167 tok
->altwarning
= 0;
1173 #define verify_identifier(s,e) 1
1175 /* Verify that the identifier follows PEP 3131. */
1177 verify_identifier(char *start
, char *end
)
1181 s
= PyUnicode_DecodeUTF8(start
, end
-start
, NULL
);
1186 result
= PyUnicode_IsIdentifier(s
);
1192 /* Get next token, after space stripping etc. */
1195 tok_get(register struct tok_state
*tok
, char **p_start
, char **p_end
)
1198 int blankline
, nonascii
;
1200 *p_start
= *p_end
= NULL
;
1205 /* Get indentation level */
1207 register int col
= 0;
1208 register int altcol
= 0;
1214 else if (c
== '\t') {
1215 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
1216 altcol
= (altcol
/tok
->alttabsize
+ 1)
1219 else if (c
== '\014') /* Control-L (formfeed) */
1220 col
= altcol
= 0; /* For Emacs users */
1225 if (c
== '#' || c
== '\n') {
1226 /* Lines with only whitespace and/or comments
1227 shouldn't affect the indentation and are
1228 not passed to the parser as NEWLINE tokens,
1229 except *totally* empty lines in interactive
1230 mode, which signal the end of a command group. */
1231 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
1232 blankline
= 0; /* Let it through */
1234 blankline
= 1; /* Ignore completely */
1235 /* We can't jump back right here since we still
1236 may need to skip to the end of a comment */
1238 if (!blankline
&& tok
->level
== 0) {
1239 if (col
== tok
->indstack
[tok
->indent
]) {
1241 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1242 if (indenterror(tok
))
1246 else if (col
> tok
->indstack
[tok
->indent
]) {
1247 /* Indent -- always one */
1248 if (tok
->indent
+1 >= MAXINDENT
) {
1249 tok
->done
= E_TOODEEP
;
1250 tok
->cur
= tok
->inp
;
1253 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
1254 if (indenterror(tok
))
1258 tok
->indstack
[++tok
->indent
] = col
;
1259 tok
->altindstack
[tok
->indent
] = altcol
;
1261 else /* col < tok->indstack[tok->indent] */ {
1262 /* Dedent -- any number, must be consistent */
1263 while (tok
->indent
> 0 &&
1264 col
< tok
->indstack
[tok
->indent
]) {
1268 if (col
!= tok
->indstack
[tok
->indent
]) {
1269 tok
->done
= E_DEDENT
;
1270 tok
->cur
= tok
->inp
;
1273 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1274 if (indenterror(tok
))
1281 tok
->start
= tok
->cur
;
1283 /* Return pending indents/dedents */
1284 if (tok
->pendin
!= 0) {
1285 if (tok
->pendin
< 0) {
1300 } while (c
== ' ' || c
== '\t' || c
== '\014');
1302 /* Set start of current token */
1303 tok
->start
= tok
->cur
- 1;
1307 while (c
!= EOF
&& c
!= '\n')
1310 /* Check for EOF and errors now */
1312 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
1315 /* Identifier (most frequent token!) */
1317 if (is_potential_identifier_start(c
)) {
1318 /* Process b"", r"" and br"" */
1319 if (c
== 'b' || c
== 'B') {
1321 if (c
== '"' || c
== '\'')
1324 if (c
== 'r' || c
== 'R') {
1326 if (c
== '"' || c
== '\'')
1329 while (is_potential_identifier_char(c
)) {
1336 !verify_identifier(tok
->start
, tok
->cur
)) {
1337 tok
->done
= E_IDENTIFIER
;
1340 *p_start
= tok
->start
;
1348 if (blankline
|| tok
->level
> 0)
1350 *p_start
= tok
->start
;
1351 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
1356 /* Period or number starting with period? */
1361 } else if (c
== '.') {
1364 *p_start
= tok
->start
;
1370 tok_backup(tok
, '.');
1374 *p_start
= tok
->start
;
1382 /* Hex, octal or binary -- maybe. */
1386 #ifndef WITHOUT_COMPLEX
1387 if (c
== 'j' || c
== 'J')
1390 if (c
== 'x' || c
== 'X') {
1395 tok
->done
= E_TOKEN
;
1401 } while (isxdigit(c
));
1403 else if (c
== 'o' || c
== 'O') {
1406 if (c
< '0' || c
>= '8') {
1407 tok
->done
= E_TOKEN
;
1413 } while ('0' <= c
&& c
< '8');
1415 else if (c
== 'b' || c
== 'B') {
1418 if (c
!= '0' && c
!= '1') {
1419 tok
->done
= E_TOKEN
;
1425 } while (c
== '0' || c
== '1');
1429 /* maybe old-style octal; c is first char of it */
1430 /* in any case, allow '0' as a literal */
1433 while (isdigit(c
)) {
1439 else if (c
== 'e' || c
== 'E')
1441 #ifndef WITHOUT_COMPLEX
1442 else if (c
== 'j' || c
== 'J')
1446 tok
->done
= E_TOKEN
;
1456 } while (isdigit(c
));
1458 /* Accept floating point numbers. */
1464 } while (isdigit(c
));
1466 if (c
== 'e' || c
== 'E') {
1470 if (c
== '+' || c
== '-')
1473 tok
->done
= E_TOKEN
;
1479 } while (isdigit(c
));
1481 #ifndef WITHOUT_COMPLEX
1482 if (c
== 'j' || c
== 'J')
1483 /* Imaginary part */
1490 *p_start
= tok
->start
;
1497 if (c
== '\'' || c
== '"') {
1499 int quote_size
= 1; /* 1 or 3 */
1500 int end_quote_size
= 0;
1502 /* Find the quote size and start of string */
1509 end_quote_size
= 1; /* empty string found */
1514 /* Get rest of string */
1515 while (end_quote_size
!= quote_size
) {
1518 if (quote_size
== 3)
1522 tok
->cur
= tok
->inp
;
1525 if (quote_size
== 1 && c
== '\n') {
1527 tok
->cur
= tok
->inp
;
1531 end_quote_size
+= 1;
1535 c
= tok_nextc(tok
); /* skip escaped char */
1539 *p_start
= tok
->start
;
1544 /* Line continuation */
1548 tok
->done
= E_LINECONT
;
1549 tok
->cur
= tok
->inp
;
1553 goto again
; /* Read next line */
1556 /* Check for two-character token */
1558 int c2
= tok_nextc(tok
);
1559 int token
= PyToken_TwoChars(c
, c2
);
1561 int c3
= tok_nextc(tok
);
1562 int token3
= PyToken_ThreeChars(c
, c2
, c3
);
1566 tok_backup(tok
, c3
);
1568 *p_start
= tok
->start
;
1572 tok_backup(tok
, c2
);
1575 /* Keep track of parentheses nesting level */
1589 /* Punctuation character */
1590 *p_start
= tok
->start
;
1592 return PyToken_OneChar(c
);
1596 PyTokenizer_Get(struct tok_state
*tok
, char **p_start
, char **p_end
)
1598 int result
= tok_get(tok
, p_start
, p_end
);
1599 if (tok
->decoding_erred
) {
1600 result
= ERRORTOKEN
;
1601 tok
->done
= E_DECODE
;
1606 /* Get -*- encoding -*- from a Python file.
1608 PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
1609 the first or second line of the file (in which case the encoding
1610 should be assumed to be PyUnicode_GetDefaultEncoding()).
1612 The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1616 PyTokenizer_FindEncoding(int fd
)
1618 struct tok_state
*tok
;
1620 char *p_start
=NULL
, *p_end
=NULL
, *encoding
= NULL
;
1626 fp
= fdopen(fd
, "r");
1630 tok
= PyTokenizer_FromFile(fp
, NULL
, NULL
, NULL
);
1635 while (tok
->lineno
< 2 && tok
->done
== E_OK
) {
1636 PyTokenizer_Get(tok
, &p_start
, &p_end
);
1639 if (tok
->encoding
) {
1640 encoding
= (char *)PyMem_MALLOC(strlen(tok
->encoding
) + 1);
1642 strcpy(encoding
, tok
->encoding
);
1644 PyTokenizer_Free(tok
);
1651 tok_dump(int type
, char *start
, char *end
)
1653 printf("%s", _PyParser_TokenNames
[type
]);
1654 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
1655 printf("(%.*s)", (int)(end
- start
), start
);