1 /* This file is included (from xmltok.c, 1-3 times depending on XML_MIN_SIZE)!
4 / _ \\ /| '_ \ / _` | __|
5 | __// \| |_) | (_| | |_
6 \___/_/\_\ .__/ \__,_|\__|
9 Copyright (c) 1997-2000 Thai Open Source Software Center Ltd
10 Copyright (c) 2000 Clark Cooper <coopercc@users.sourceforge.net>
11 Copyright (c) 2002 Fred L. Drake, Jr. <fdrake@users.sourceforge.net>
12 Copyright (c) 2002-2016 Karl Waclawek <karl@waclawek.net>
13 Copyright (c) 2016-2022 Sebastian Pipping <sebastian@pipping.org>
14 Copyright (c) 2017 Rhodri James <rhodri@wildebeest.org.uk>
15 Copyright (c) 2018 Benjamin Peterson <benjamin@python.org>
16 Copyright (c) 2018 Anton Maklakov <antmak.pub@gmail.com>
17 Copyright (c) 2019 David Loffredo <loffredo@steptools.com>
18 Copyright (c) 2020 Boris Kolpackov <boris@codesynthesis.com>
19 Copyright (c) 2022 Martin Ettl <ettl.martin78@googlemail.com>
20 Licensed under the MIT license:
22 Permission is hereby granted, free of charge, to any person obtaining
23 a copy of this software and associated documentation files (the
24 "Software"), to deal in the Software without restriction, including
25 without limitation the rights to use, copy, modify, merge, publish,
26 distribute, sublicense, and/or sell copies of the Software, and to permit
27 persons to whom the Software is furnished to do so, subject to the following conditions:
30 The above copyright notice and this permission notice shall be included
31 in all copies or substantial portions of the Software.
33 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
34 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
35 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN
36 NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
37 DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
38 OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
39 USE OR OTHER DEALINGS IN THE SOFTWARE.
// Helper macros used by the tokenizer functions below.
// NOTE(review): several continuation lines of these macros (case labels,
// closing braces) are not present in this listing; the notes below describe
// only the visible lines.
// IS_INVALID_CHAR falls back to the constant (0) when it is not already defined.
44 # ifndef IS_INVALID_CHAR // i.e. for UTF-16 and XML_MIN_SIZE not defined
45 # define IS_INVALID_CHAR(enc, ptr, n) (0)
// INVALID_LEAD_CASE: visible lines return XML_TOK_PARTIAL_CHAR, or store ptr in
// *nextTokPtr and return XML_TOK_INVALID when IS_INVALID_CHAR(enc, ptr, n) holds.
// INVALID_CASES expands INVALID_LEAD_CASE for n = 2, 3 and 4.
48 # define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
51 return XML_TOK_PARTIAL_CHAR; \
52 if (IS_INVALID_CHAR(enc, ptr, n)) { \
53 *(nextTokPtr) = (ptr); \
54 return XML_TOK_INVALID; \
59 # define INVALID_CASES(ptr, nextTokPtr) \
60 INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
61 INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
62 INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
66 *(nextTokPtr) = (ptr); \
67 return XML_TOK_INVALID;
// CHECK_NAME_CASE: returns XML_TOK_INVALID when the n-byte character is invalid
// or is not a name character (IS_NAME_CHAR). CHECK_NAME_CASES applies it for
// n = 2, 3 and 4 after an IS_NAME_CHAR_MINBPC test.
69 # define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
72 return XML_TOK_PARTIAL_CHAR; \
73 if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \
75 return XML_TOK_INVALID; \
80 # define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
82 if (! IS_NAME_CHAR_MINBPC(enc, ptr)) { \
84 return XML_TOK_INVALID; \
94 CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
95 CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
96 CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
// CHECK_NMSTRT_CASE: returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain,
// and XML_TOK_INVALID when the character is invalid or is not a name-start
// character (IS_NMSTRT_CHAR). CHECK_NMSTRT_CASES applies it for n = 2, 3 and 4.
98 # define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
100 if ((end) - (ptr) < (n)) \
101 return XML_TOK_PARTIAL_CHAR; \
102 if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \
104 return XML_TOK_INVALID; \
109 # define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
111 if (! IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
113 return XML_TOK_INVALID; \
118 ptr += MINBPC(enc); \
120 CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
121 CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
122 CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
// PREFIX(ident) expands to ident unchanged in this configuration.
125 # define PREFIX(ident) ident
// HAS_CHARS is true when at least count * MINBPC(enc) bytes lie between ptr and end.
128 # define HAS_CHARS(enc, ptr, end, count) \
129 ((end) - (ptr) >= ((count)*MINBPC(enc)))
131 # define HAS_CHAR(enc, ptr, end) HAS_CHARS(enc, ptr, end, 1)
// REQUIRE_CHARS returns XML_TOK_PARTIAL from the enclosing function when
// HAS_CHARS fails; REQUIRE_CHAR is the single-character form.
133 # define REQUIRE_CHARS(enc, ptr, end, count) \
135 if (! HAS_CHARS(enc, ptr, end, count)) { \
136 return XML_TOK_PARTIAL; \
140 # define REQUIRE_CHAR(enc, ptr, end) REQUIRE_CHARS(enc, ptr, end, 1)
142 /* ptr points to character following "<!-" */
/* Scans the rest of a comment between ptr and end.
 * Visible results: XML_TOK_INVALID when the first character is not
 * ASCII_MINUS or when a later ASCII_GT check fails, XML_TOK_COMMENT with
 * *nextTokPtr set to ptr + MINBPC(enc), and XML_TOK_PARTIAL at the end of
 * the function.
 * NOTE(review): some lines (case labels, pointer advances, closing braces)
 * are not present in this listing; verify nesting against the full file.
 */
145 PREFIX(scanComment
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
146 const char **nextTokPtr
) {
147 if (HAS_CHAR(enc
, ptr
, end
)) {
148 if (! CHAR_MATCHES(enc
, ptr
, ASCII_MINUS
)) {
150 return XML_TOK_INVALID
;
153 while (HAS_CHAR(enc
, ptr
, end
)) {
154 switch (BYTE_TYPE(enc
, ptr
)) {
155 INVALID_CASES(ptr
, nextTokPtr
)
158 REQUIRE_CHAR(enc
, ptr
, end
);
159 if (CHAR_MATCHES(enc
, ptr
, ASCII_MINUS
)) {
161 REQUIRE_CHAR(enc
, ptr
, end
);
162 if (! CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
164 return XML_TOK_INVALID
;
/* The token ends just after the character at ptr. */
166 *nextTokPtr
= ptr
+ MINBPC(enc
);
167 return XML_TOK_COMMENT
;
176 return XML_TOK_PARTIAL
;
179 /* ptr points to character following "<!" */
/* Classifies what follows the declaration opener.
 * After REQUIRE_CHAR, a switch on BYTE_TYPE(enc, ptr) has visible branches
 * that delegate to scanComment (starting at ptr + MINBPC(enc)), return
 * XML_TOK_COND_SECT_OPEN with *nextTokPtr = ptr + MINBPC(enc), or return
 * XML_TOK_INVALID.  The following loop can return XML_TOK_DECL_OPEN or
 * XML_TOK_INVALID; XML_TOK_PARTIAL is returned at the end.
 * NOTE(review): case labels and several statements are not present in this
 * listing, so which byte type selects which branch cannot be seen here.
 */
182 PREFIX(scanDecl
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
183 const char **nextTokPtr
) {
184 REQUIRE_CHAR(enc
, ptr
, end
);
185 switch (BYTE_TYPE(enc
, ptr
)) {
187 return PREFIX(scanComment
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
189 *nextTokPtr
= ptr
+ MINBPC(enc
);
190 return XML_TOK_COND_SECT_OPEN
;
197 return XML_TOK_INVALID
;
199 while (HAS_CHAR(enc
, ptr
, end
)) {
200 switch (BYTE_TYPE(enc
, ptr
)) {
/* Two characters are needed because the next character is inspected too. */
202 REQUIRE_CHARS(enc
, ptr
, end
, 2);
203 /* don't allow <!ENTITY% foo "whatever"> */
204 switch (BYTE_TYPE(enc
, ptr
+ MINBPC(enc
))) {
210 return XML_TOK_INVALID
;
217 return XML_TOK_DECL_OPEN
;
224 return XML_TOK_INVALID
;
227 return XML_TOK_PARTIAL
;
/* Classifies a processing-instruction target in [ptr, end).
 * *tokPtr is first set to XML_TOK_PI.  The length test below singles out
 * targets that are not exactly three characters long; three successive
 * BYTE_TO_ASCII switches follow, after which *tokPtr is set to
 * XML_TOK_XML_DECL.
 * NOTE(review): the return type, the rest of the parameter list, the case
 * labels and the return statements are not present in this listing.
 */
231 PREFIX(checkPiTarget
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
235 *tokPtr
= XML_TOK_PI
;
236 if (end
- ptr
!= MINBPC(enc
) * 3)
238 switch (BYTE_TO_ASCII(enc
, ptr
)) {
248 switch (BYTE_TO_ASCII(enc
, ptr
)) {
258 switch (BYTE_TO_ASCII(enc
, ptr
)) {
269 *tokPtr
= XML_TOK_XML_DECL
;
273 /* ptr points to character following "<?" */
/* Scans a processing instruction.
 * The start of the target name is remembered in `target`; the name is
 * checked with CHECK_NMSTRT_CASES and CHECK_NAME_CASES, and
 * checkPiTarget(enc, target, ptr, &tok) is called at two visible places,
 * a zero result leading to XML_TOK_INVALID.  Visible results are
 * XML_TOK_INVALID and XML_TOK_PARTIAL; *nextTokPtr is set to
 * ptr + MINBPC(enc) after an ASCII_GT match.
 * NOTE(review): the declaration of `tok`, the case labels and the returns
 * that follow the *nextTokPtr assignments are not present in this listing.
 */
276 PREFIX(scanPi
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
277 const char **nextTokPtr
) {
279 const char *target
= ptr
;
280 REQUIRE_CHAR(enc
, ptr
, end
);
281 switch (BYTE_TYPE(enc
, ptr
)) {
282 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
285 return XML_TOK_INVALID
;
287 while (HAS_CHAR(enc
, ptr
, end
)) {
288 switch (BYTE_TYPE(enc
, ptr
)) {
289 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
293 if (! PREFIX(checkPiTarget
)(enc
, target
, ptr
, &tok
)) {
295 return XML_TOK_INVALID
;
298 while (HAS_CHAR(enc
, ptr
, end
)) {
299 switch (BYTE_TYPE(enc
, ptr
)) {
300 INVALID_CASES(ptr
, nextTokPtr
)
303 REQUIRE_CHAR(enc
, ptr
, end
);
304 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
305 *nextTokPtr
= ptr
+ MINBPC(enc
);
314 return XML_TOK_PARTIAL
;
316 if (! PREFIX(checkPiTarget
)(enc
, target
, ptr
, &tok
)) {
318 return XML_TOK_INVALID
;
321 REQUIRE_CHAR(enc
, ptr
, end
);
322 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
323 *nextTokPtr
= ptr
+ MINBPC(enc
);
329 return XML_TOK_INVALID
;
332 return XML_TOK_PARTIAL
;
/* Matches the six characters C, D, A, T, A, '[' (held in CDATA_LSQB) at ptr.
 * Requires six characters to be available (XML_TOK_PARTIAL otherwise, via
 * REQUIRE_CHARS), returns XML_TOK_INVALID on the first mismatch, and
 * XML_TOK_CDATA_SECT_OPEN when all six match.
 * NOTE(review): the declaration of `i` and the *nextTokPtr assignments are
 * not present in this listing.
 */
336 PREFIX(scanCdataSection
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
337 const char **nextTokPtr
) {
338 static const char CDATA_LSQB
[]
339 = {ASCII_C
, ASCII_D
, ASCII_A
, ASCII_T
, ASCII_A
, ASCII_LSQB
};
343 REQUIRE_CHARS(enc
, ptr
, end
, 6);
/* Compare one expected character per iteration, advancing by MINBPC(enc). */
344 for (i
= 0; i
< 6; i
++, ptr
+= MINBPC(enc
)) {
345 if (! CHAR_MATCHES(enc
, ptr
, CDATA_LSQB
[i
])) {
347 return XML_TOK_INVALID
;
351 return XML_TOK_CDATA_SECT_OPEN
;
/* Tokenizes content inside a CDATA section.
 * When MINBPC(enc) > 1, the byte count end - ptr is rounded down to a
 * multiple of MINBPC(enc) (n &= ~(MINBPC(enc) - 1)).  Visible results:
 * XML_TOK_PARTIAL, XML_TOK_CDATA_SECT_CLOSE (with *nextTokPtr =
 * ptr + MINBPC(enc) after ASCII_RSQB / ASCII_GT checks),
 * XML_TOK_DATA_NEWLINE and XML_TOK_DATA_CHARS.
 * NOTE(review): case labels, the use of `n` after masking and several
 * closing braces are not present in this listing.
 */
355 PREFIX(cdataSectionTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
356 const char **nextTokPtr
) {
359 if (MINBPC(enc
) > 1) {
360 size_t n
= end
- ptr
;
361 if (n
& (MINBPC(enc
) - 1)) {
362 n
&= ~(MINBPC(enc
) - 1);
364 return XML_TOK_PARTIAL
;
368 switch (BYTE_TYPE(enc
, ptr
)) {
371 REQUIRE_CHAR(enc
, ptr
, end
);
372 if (! CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
))
375 REQUIRE_CHAR(enc
, ptr
, end
);
376 if (! CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
380 *nextTokPtr
= ptr
+ MINBPC(enc
);
381 return XML_TOK_CDATA_SECT_CLOSE
;
384 REQUIRE_CHAR(enc
, ptr
, end
);
385 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
388 return XML_TOK_DATA_NEWLINE
;
390 *nextTokPtr
= ptr
+ MINBPC(enc
);
391 return XML_TOK_DATA_NEWLINE
;
392 INVALID_CASES(ptr
, nextTokPtr
)
397 while (HAS_CHAR(enc
, ptr
, end
)) {
398 switch (BYTE_TYPE(enc
, ptr
)) {
399 # define LEAD_CASE(n) \
401 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
403 return XML_TOK_DATA_CHARS; \
418 return XML_TOK_DATA_CHARS
;
425 return XML_TOK_DATA_CHARS
;
428 /* ptr points to character following "</" */
/* Scans an end tag.
 * The first character is checked with CHECK_NMSTRT_CASES and later ones
 * with CHECK_NAME_CASES.  Visible results: XML_TOK_END_TAG (two places,
 * each with *nextTokPtr = ptr + MINBPC(enc)), XML_TOK_INVALID and
 * XML_TOK_PARTIAL.
 * NOTE(review): case labels and closing braces are not present in this
 * listing.
 */
431 PREFIX(scanEndTag
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
432 const char **nextTokPtr
) {
433 REQUIRE_CHAR(enc
, ptr
, end
);
434 switch (BYTE_TYPE(enc
, ptr
)) {
435 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
438 return XML_TOK_INVALID
;
440 while (HAS_CHAR(enc
, ptr
, end
)) {
441 switch (BYTE_TYPE(enc
, ptr
)) {
442 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
446 for (ptr
+= MINBPC(enc
); HAS_CHAR(enc
, ptr
, end
); ptr
+= MINBPC(enc
)) {
447 switch (BYTE_TYPE(enc
, ptr
)) {
453 *nextTokPtr
= ptr
+ MINBPC(enc
);
454 return XML_TOK_END_TAG
;
457 return XML_TOK_INVALID
;
460 return XML_TOK_PARTIAL
;
463 /* no need to check qname syntax here,
464 since end-tag must match exactly */
469 *nextTokPtr
= ptr
+ MINBPC(enc
);
470 return XML_TOK_END_TAG
;
473 return XML_TOK_INVALID
;
476 return XML_TOK_PARTIAL
;
479 /* ptr points to character following "&#X" */
/* Scans the digits of a hexadecimal character reference.
 * The first character is classified by BYTE_TYPE (XML_TOK_INVALID is a
 * visible outcome); the for loop then advances one MINBPC(enc) unit at a
 * time and can return XML_TOK_CHAR_REF with *nextTokPtr =
 * ptr + MINBPC(enc), or XML_TOK_INVALID.  XML_TOK_PARTIAL is returned when
 * input runs out.
 * NOTE(review): the case labels naming the accepted byte types are not
 * present in this listing.
 */
482 PREFIX(scanHexCharRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
483 const char **nextTokPtr
) {
484 if (HAS_CHAR(enc
, ptr
, end
)) {
485 switch (BYTE_TYPE(enc
, ptr
)) {
491 return XML_TOK_INVALID
;
493 for (ptr
+= MINBPC(enc
); HAS_CHAR(enc
, ptr
, end
); ptr
+= MINBPC(enc
)) {
494 switch (BYTE_TYPE(enc
, ptr
)) {
499 *nextTokPtr
= ptr
+ MINBPC(enc
);
500 return XML_TOK_CHAR_REF
;
503 return XML_TOK_INVALID
;
507 return XML_TOK_PARTIAL
;
510 /* ptr points to character following "&#" */
/* Scans a numeric character reference.
 * If the first character matches ASCII_x, the work is delegated to
 * scanHexCharRef starting at ptr + MINBPC(enc).  Otherwise the characters
 * are classified by BYTE_TYPE; visible results are XML_TOK_CHAR_REF with
 * *nextTokPtr = ptr + MINBPC(enc), XML_TOK_INVALID, and XML_TOK_PARTIAL
 * when input runs out.
 * NOTE(review): the case labels naming the accepted byte types are not
 * present in this listing.
 */
513 PREFIX(scanCharRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
514 const char **nextTokPtr
) {
515 if (HAS_CHAR(enc
, ptr
, end
)) {
516 if (CHAR_MATCHES(enc
, ptr
, ASCII_x
))
517 return PREFIX(scanHexCharRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
518 switch (BYTE_TYPE(enc
, ptr
)) {
523 return XML_TOK_INVALID
;
525 for (ptr
+= MINBPC(enc
); HAS_CHAR(enc
, ptr
, end
); ptr
+= MINBPC(enc
)) {
526 switch (BYTE_TYPE(enc
, ptr
)) {
530 *nextTokPtr
= ptr
+ MINBPC(enc
);
531 return XML_TOK_CHAR_REF
;
534 return XML_TOK_INVALID
;
538 return XML_TOK_PARTIAL
;
541 /* ptr points to character following "&" */
/* Scans a reference.
 * The first character is checked with CHECK_NMSTRT_CASES; one visible
 * branch delegates to scanCharRef starting at ptr + MINBPC(enc).  The
 * name loop uses CHECK_NAME_CASES and can return XML_TOK_ENTITY_REF with
 * *nextTokPtr = ptr + MINBPC(enc).  Other visible results are
 * XML_TOK_INVALID and XML_TOK_PARTIAL.
 * NOTE(review): case labels are not present in this listing.
 */
544 PREFIX(scanRef
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
545 const char **nextTokPtr
) {
546 REQUIRE_CHAR(enc
, ptr
, end
);
547 switch (BYTE_TYPE(enc
, ptr
)) {
548 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
550 return PREFIX(scanCharRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
553 return XML_TOK_INVALID
;
555 while (HAS_CHAR(enc
, ptr
, end
)) {
556 switch (BYTE_TYPE(enc
, ptr
)) {
557 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
559 *nextTokPtr
= ptr
+ MINBPC(enc
);
560 return XML_TOK_ENTITY_REF
;
563 return XML_TOK_INVALID
;
566 return XML_TOK_PARTIAL
;
569 /* ptr points to character following first character of attribute name */
/* Scans the attribute list of a start tag or empty-element tag.
 * Names are checked with CHECK_NAME_CASES / CHECK_NMSTRT_CASES; the byte
 * type of the opening quote is stored in `open` and tested against
 * BT_QUOT and BT_APOS; references inside a value are handed to scanRef.
 * Visible results: XML_TOK_START_TAG_WITH_ATTS and
 * XML_TOK_EMPTY_ELEMENT_WITH_ATTS (each with *nextTokPtr =
 * ptr + MINBPC(enc)), XML_TOK_INVALID and XML_TOK_PARTIAL.
 * NOTE(review): the declarations of `t` and `open`, the case labels and
 * many braces are not present in this listing.
 */
572 PREFIX(scanAtts
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
573 const char **nextTokPtr
) {
577 while (HAS_CHAR(enc
, ptr
, end
)) {
578 switch (BYTE_TYPE(enc
, ptr
)) {
579 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
584 return XML_TOK_INVALID
;
588 REQUIRE_CHAR(enc
, ptr
, end
);
589 switch (BYTE_TYPE(enc
, ptr
)) {
590 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
593 return XML_TOK_INVALID
;
604 REQUIRE_CHAR(enc
, ptr
, end
);
605 t
= BYTE_TYPE(enc
, ptr
);
615 return XML_TOK_INVALID
;
626 REQUIRE_CHAR(enc
, ptr
, end
);
627 open
= BYTE_TYPE(enc
, ptr
);
628 if (open
== BT_QUOT
|| open
== BT_APOS
)
637 return XML_TOK_INVALID
;
641 /* in attribute value */
644 REQUIRE_CHAR(enc
, ptr
, end
);
645 t
= BYTE_TYPE(enc
, ptr
);
649 INVALID_CASES(ptr
, nextTokPtr
)
/* &ptr is passed as scanRef's nextTokPtr, so scanRef's writes update ptr. */
651 int tok
= PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, &ptr
);
653 if (tok
== XML_TOK_INVALID
)
661 return XML_TOK_INVALID
;
668 REQUIRE_CHAR(enc
, ptr
, end
);
669 switch (BYTE_TYPE(enc
, ptr
)) {
680 return XML_TOK_INVALID
;
682 /* ptr points to closing quote */
685 REQUIRE_CHAR(enc
, ptr
, end
);
686 switch (BYTE_TYPE(enc
, ptr
)) {
687 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
694 *nextTokPtr
= ptr
+ MINBPC(enc
);
695 return XML_TOK_START_TAG_WITH_ATTS
;
699 REQUIRE_CHAR(enc
, ptr
, end
);
700 if (! CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
702 return XML_TOK_INVALID
;
704 *nextTokPtr
= ptr
+ MINBPC(enc
);
705 return XML_TOK_EMPTY_ELEMENT_WITH_ATTS
;
708 return XML_TOK_INVALID
;
716 return XML_TOK_INVALID
;
719 return XML_TOK_PARTIAL
;
722 /* ptr points to character following "<" */
/* Decides which kind of markup follows and scans it.
 * Visible branches delegate to scanComment, scanCdataSection, scanPi and
 * scanEndTag (each starting at ptr + MINBPC(enc)) and to scanAtts (at
 * ptr).  For a start tag, names are checked with CHECK_NAME_CASES /
 * CHECK_NMSTRT_CASES; visible results are XML_TOK_START_TAG_NO_ATTS and
 * XML_TOK_EMPTY_ELEMENT_NO_ATTS (each with *nextTokPtr =
 * ptr + MINBPC(enc)), XML_TOK_INVALID and XML_TOK_PARTIAL.
 * NOTE(review): case labels and many braces are not present in this
 * listing, so the byte types that select each branch cannot be seen here.
 */
725 PREFIX(scanLt
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
726 const char **nextTokPtr
) {
730 REQUIRE_CHAR(enc
, ptr
, end
);
731 switch (BYTE_TYPE(enc
, ptr
)) {
732 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
735 REQUIRE_CHAR(enc
, ptr
, end
);
736 switch (BYTE_TYPE(enc
, ptr
)) {
738 return PREFIX(scanComment
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
740 return PREFIX(scanCdataSection
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
743 return XML_TOK_INVALID
;
745 return PREFIX(scanPi
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
747 return PREFIX(scanEndTag
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
750 return XML_TOK_INVALID
;
755 /* we have a start-tag */
756 while (HAS_CHAR(enc
, ptr
, end
)) {
757 switch (BYTE_TYPE(enc
, ptr
)) {
758 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
763 return XML_TOK_INVALID
;
767 REQUIRE_CHAR(enc
, ptr
, end
);
768 switch (BYTE_TYPE(enc
, ptr
)) {
769 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
772 return XML_TOK_INVALID
;
780 while (HAS_CHAR(enc
, ptr
, end
)) {
781 switch (BYTE_TYPE(enc
, ptr
)) {
782 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
794 return XML_TOK_INVALID
;
796 return PREFIX(scanAtts
)(enc
, ptr
, end
, nextTokPtr
);
798 return XML_TOK_PARTIAL
;
802 *nextTokPtr
= ptr
+ MINBPC(enc
);
803 return XML_TOK_START_TAG_NO_ATTS
;
807 REQUIRE_CHAR(enc
, ptr
, end
);
808 if (! CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
810 return XML_TOK_INVALID
;
812 *nextTokPtr
= ptr
+ MINBPC(enc
);
813 return XML_TOK_EMPTY_ELEMENT_NO_ATTS
;
816 return XML_TOK_INVALID
;
819 return XML_TOK_PARTIAL
;
/* Tokenizes element content.
 * When MINBPC(enc) > 1, the byte count end - ptr is rounded down to a
 * multiple of MINBPC(enc).  The first character selects a branch by
 * BYTE_TYPE: visible branches delegate to scanLt and scanRef (both
 * starting at ptr + MINBPC(enc)), or return XML_TOK_TRAILING_CR /
 * XML_TOK_DATA_NEWLINE, XML_TOK_TRAILING_RSQB or XML_TOK_INVALID.  The
 * later loop returns XML_TOK_DATA_CHARS, and contains checks for
 * ASCII_RSQB at ptr + MINBPC(enc) and ASCII_GT at ptr + 2 * MINBPC(enc)
 * that can return XML_TOK_INVALID with *nextTokPtr =
 * ptr + 2 * MINBPC(enc).
 * NOTE(review): case labels, pointer advances and many braces are not
 * present in this listing.
 */
823 PREFIX(contentTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
824 const char **nextTokPtr
) {
827 if (MINBPC(enc
) > 1) {
828 size_t n
= end
- ptr
;
829 if (n
& (MINBPC(enc
) - 1)) {
830 n
&= ~(MINBPC(enc
) - 1);
832 return XML_TOK_PARTIAL
;
836 switch (BYTE_TYPE(enc
, ptr
)) {
838 return PREFIX(scanLt
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
840 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
843 if (! HAS_CHAR(enc
, ptr
, end
))
844 return XML_TOK_TRAILING_CR
;
845 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
848 return XML_TOK_DATA_NEWLINE
;
850 *nextTokPtr
= ptr
+ MINBPC(enc
);
851 return XML_TOK_DATA_NEWLINE
;
854 if (! HAS_CHAR(enc
, ptr
, end
))
855 return XML_TOK_TRAILING_RSQB
;
856 if (! CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
))
859 if (! HAS_CHAR(enc
, ptr
, end
))
860 return XML_TOK_TRAILING_RSQB
;
861 if (! CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
866 return XML_TOK_INVALID
;
867 INVALID_CASES(ptr
, nextTokPtr
)
872 while (HAS_CHAR(enc
, ptr
, end
)) {
873 switch (BYTE_TYPE(enc
, ptr
)) {
874 # define LEAD_CASE(n) \
876 if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
878 return XML_TOK_DATA_CHARS; \
887 if (HAS_CHARS(enc
, ptr
, end
, 2)) {
888 if (! CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_RSQB
)) {
892 if (HAS_CHARS(enc
, ptr
, end
, 3)) {
893 if (! CHAR_MATCHES(enc
, ptr
+ 2 * MINBPC(enc
), ASCII_GT
)) {
897 *nextTokPtr
= ptr
+ 2 * MINBPC(enc
);
898 return XML_TOK_INVALID
;
910 return XML_TOK_DATA_CHARS
;
917 return XML_TOK_DATA_CHARS
;
920 /* ptr points to character following "%" */
/* Scans what follows a percent sign.
 * The first character is checked with CHECK_NMSTRT_CASES; one visible
 * branch returns XML_TOK_PERCENT.  The name loop uses CHECK_NAME_CASES
 * and can return XML_TOK_PARAM_ENTITY_REF with *nextTokPtr =
 * ptr + MINBPC(enc).  Other visible results are XML_TOK_INVALID and
 * XML_TOK_PARTIAL.
 * NOTE(review): case labels are not present in this listing.
 */
923 PREFIX(scanPercent
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
924 const char **nextTokPtr
) {
925 REQUIRE_CHAR(enc
, ptr
, end
);
926 switch (BYTE_TYPE(enc
, ptr
)) {
927 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
933 return XML_TOK_PERCENT
;
936 return XML_TOK_INVALID
;
938 while (HAS_CHAR(enc
, ptr
, end
)) {
939 switch (BYTE_TYPE(enc
, ptr
)) {
940 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
942 *nextTokPtr
= ptr
+ MINBPC(enc
);
943 return XML_TOK_PARAM_ENTITY_REF
;
946 return XML_TOK_INVALID
;
949 return XML_TOK_PARTIAL
;
/* Scans a name whose first character is at ptr.
 * The first character is checked with CHECK_NMSTRT_CASES and later ones
 * with CHECK_NAME_CASES.  Visible results: XML_TOK_POUND_NAME,
 * XML_TOK_INVALID, and the negated value -XML_TOK_POUND_NAME when the
 * loop runs out of input.
 * NOTE(review): case labels and the *nextTokPtr assignments are not
 * present in this listing.
 */
953 PREFIX(scanPoundName
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
954 const char **nextTokPtr
) {
955 REQUIRE_CHAR(enc
, ptr
, end
);
956 switch (BYTE_TYPE(enc
, ptr
)) {
957 CHECK_NMSTRT_CASES(enc
, ptr
, end
, nextTokPtr
)
960 return XML_TOK_INVALID
;
962 while (HAS_CHAR(enc
, ptr
, end
)) {
963 switch (BYTE_TYPE(enc
, ptr
)) {
964 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
973 return XML_TOK_POUND_NAME
;
976 return XML_TOK_INVALID
;
979 return -XML_TOK_POUND_NAME
;
/* Scans a quoted literal; `open` is supplied by the caller (prologTok
 * passes BT_QUOT or BT_APOS).
 * Each character's byte type is read into `t`; invalid sequences are
 * handled by INVALID_CASES.  Visible results: -XML_TOK_LITERAL when no
 * character is available at the point of the HAS_CHAR check,
 * XML_TOK_LITERAL, XML_TOK_INVALID, and XML_TOK_PARTIAL when the loop
 * runs out of input.
 * NOTE(review): the comparison of `t` with `open`, the case labels and
 * the *nextTokPtr assignments are not present in this listing.
 */
983 PREFIX(scanLit
)(int open
, const ENCODING
*enc
, const char *ptr
, const char *end
,
984 const char **nextTokPtr
) {
985 while (HAS_CHAR(enc
, ptr
, end
)) {
986 int t
= BYTE_TYPE(enc
, ptr
);
988 INVALID_CASES(ptr
, nextTokPtr
)
994 if (! HAS_CHAR(enc
, ptr
, end
))
995 return -XML_TOK_LITERAL
;
997 switch (BYTE_TYPE(enc
, ptr
)) {
1004 return XML_TOK_LITERAL
;
1006 return XML_TOK_INVALID
;
1013 return XML_TOK_PARTIAL
;
/* Tokenizes the prolog.
 * When MINBPC(enc) > 1, the byte count end - ptr is rounded down to a
 * multiple of MINBPC(enc).  The first character selects a branch by
 * BYTE_TYPE.  Visible branches:
 *  - delegate to scanLit (with BT_QUOT or BT_APOS), scanDecl, scanPi,
 *    scanPercent and scanPoundName, each starting at ptr + MINBPC(enc);
 *  - return single-character tokens with *nextTokPtr = ptr + MINBPC(enc):
 *    XML_TOK_COMMA, XML_TOK_OPEN_BRACKET, XML_TOK_OPEN_PAREN,
 *    XML_TOK_DECL_CLOSE, and the XML_TOK_CLOSE_PAREN_ASTERISK / _QUESTION /
 *    _PLUS variants;
 *  - return XML_TOK_COND_SECT_CLOSE with *nextTokPtr = ptr + 2 * MINBPC(enc);
 *  - return negated tokens (-XML_TOK_PROLOG_S, -XML_TOK_CLOSE_BRACKET,
 *    -XML_TOK_CLOSE_PAREN) at points where the input ends;
 *  - scan names, tracking the kind in `tok` (XML_TOK_NAME,
 *    XML_TOK_NMTOKEN, XML_TOK_PREFIXED_NAME); XML_TOK_NAME_PLUS,
 *    XML_TOK_NAME_ASTERISK and XML_TOK_NAME_QUESTION are each preceded by
 *    a tok == XML_TOK_NMTOKEN check that returns XML_TOK_INVALID.
 * Other visible results: XML_TOK_NONE, XML_TOK_PARTIAL,
 * XML_TOK_INSTANCE_START, XML_TOK_PROLOG_S, XML_TOK_CLOSE_BRACKET,
 * XML_TOK_CLOSE_PAREN and XML_TOK_INVALID.
 * NOTE(review): the declaration of `tok`, the case labels, the condition
 * guarding XML_TOK_NONE and many braces are not present in this listing.
 */
1017 PREFIX(prologTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1018 const char **nextTokPtr
) {
1021 return XML_TOK_NONE
;
1022 if (MINBPC(enc
) > 1) {
1023 size_t n
= end
- ptr
;
1024 if (n
& (MINBPC(enc
) - 1)) {
1025 n
&= ~(MINBPC(enc
) - 1);
1027 return XML_TOK_PARTIAL
;
1031 switch (BYTE_TYPE(enc
, ptr
)) {
1033 return PREFIX(scanLit
)(BT_QUOT
, enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1035 return PREFIX(scanLit
)(BT_APOS
, enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1038 REQUIRE_CHAR(enc
, ptr
, end
);
1039 switch (BYTE_TYPE(enc
, ptr
)) {
1041 return PREFIX(scanDecl
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1043 return PREFIX(scanPi
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
/* Note the minus: here *nextTokPtr is set one character before ptr. */
1050 *nextTokPtr
= ptr
- MINBPC(enc
);
1051 return XML_TOK_INSTANCE_START
;
1054 return XML_TOK_INVALID
;
1057 if (ptr
+ MINBPC(enc
) == end
) {
1059 /* indicate that this might be part of a CR/LF pair */
1060 return -XML_TOK_PROLOG_S
;
1067 if (! HAS_CHAR(enc
, ptr
, end
))
1069 switch (BYTE_TYPE(enc
, ptr
)) {
1074 /* don't split CR/LF pair */
1075 if (ptr
+ MINBPC(enc
) != end
)
1080 return XML_TOK_PROLOG_S
;
1084 return XML_TOK_PROLOG_S
;
1086 return PREFIX(scanPercent
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1088 *nextTokPtr
= ptr
+ MINBPC(enc
);
1089 return XML_TOK_COMMA
;
1091 *nextTokPtr
= ptr
+ MINBPC(enc
);
1092 return XML_TOK_OPEN_BRACKET
;
1095 if (! HAS_CHAR(enc
, ptr
, end
))
1096 return -XML_TOK_CLOSE_BRACKET
;
1097 if (CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
)) {
1098 REQUIRE_CHARS(enc
, ptr
, end
, 2);
1099 if (CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_GT
)) {
1100 *nextTokPtr
= ptr
+ 2 * MINBPC(enc
);
1101 return XML_TOK_COND_SECT_CLOSE
;
1105 return XML_TOK_CLOSE_BRACKET
;
1107 *nextTokPtr
= ptr
+ MINBPC(enc
);
1108 return XML_TOK_OPEN_PAREN
;
1111 if (! HAS_CHAR(enc
, ptr
, end
))
1112 return -XML_TOK_CLOSE_PAREN
;
1113 switch (BYTE_TYPE(enc
, ptr
)) {
1115 *nextTokPtr
= ptr
+ MINBPC(enc
);
1116 return XML_TOK_CLOSE_PAREN_ASTERISK
;
1118 *nextTokPtr
= ptr
+ MINBPC(enc
);
1119 return XML_TOK_CLOSE_PAREN_QUESTION
;
1121 *nextTokPtr
= ptr
+ MINBPC(enc
);
1122 return XML_TOK_CLOSE_PAREN_PLUS
;
1131 return XML_TOK_CLOSE_PAREN
;
1134 return XML_TOK_INVALID
;
1136 *nextTokPtr
= ptr
+ MINBPC(enc
);
1139 *nextTokPtr
= ptr
+ MINBPC(enc
);
1140 return XML_TOK_DECL_CLOSE
;
1142 return PREFIX(scanPoundName
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1143 # define LEAD_CASE(n) \
1145 if (end - ptr < n) \
1146 return XML_TOK_PARTIAL_CHAR; \
1147 if (IS_INVALID_CHAR(enc, ptr, n)) { \
1148 *nextTokPtr = ptr; \
1149 return XML_TOK_INVALID; \
1151 if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
1153 tok = XML_TOK_NAME; \
1156 if (IS_NAME_CHAR(enc, ptr, n)) { \
1158 tok = XML_TOK_NMTOKEN; \
1161 *nextTokPtr = ptr; \
1162 return XML_TOK_INVALID;
1178 tok
= XML_TOK_NMTOKEN
;
1182 if (IS_NMSTRT_CHAR_MINBPC(enc
, ptr
)) {
1187 if (IS_NAME_CHAR_MINBPC(enc
, ptr
)) {
1189 tok
= XML_TOK_NMTOKEN
;
1195 return XML_TOK_INVALID
;
1197 while (HAS_CHAR(enc
, ptr
, end
)) {
1198 switch (BYTE_TYPE(enc
, ptr
)) {
1199 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
1216 REQUIRE_CHAR(enc
, ptr
, end
);
1217 tok
= XML_TOK_PREFIXED_NAME
;
1218 switch (BYTE_TYPE(enc
, ptr
)) {
1219 CHECK_NAME_CASES(enc
, ptr
, end
, nextTokPtr
)
1221 tok
= XML_TOK_NMTOKEN
;
1225 case XML_TOK_PREFIXED_NAME
:
1226 tok
= XML_TOK_NMTOKEN
;
1232 if (tok
== XML_TOK_NMTOKEN
) {
1234 return XML_TOK_INVALID
;
1236 *nextTokPtr
= ptr
+ MINBPC(enc
);
1237 return XML_TOK_NAME_PLUS
;
1239 if (tok
== XML_TOK_NMTOKEN
) {
1241 return XML_TOK_INVALID
;
1243 *nextTokPtr
= ptr
+ MINBPC(enc
);
1244 return XML_TOK_NAME_ASTERISK
;
1246 if (tok
== XML_TOK_NMTOKEN
) {
1248 return XML_TOK_INVALID
;
1250 *nextTokPtr
= ptr
+ MINBPC(enc
);
1251 return XML_TOK_NAME_QUESTION
;
1254 return XML_TOK_INVALID
;
/* Tokenizes the contents of an attribute value.
 * Visible results: XML_TOK_NONE, XML_TOK_PARTIAL (the paranoia check
 * described below), delegation to scanRef starting at ptr + MINBPC(enc),
 * XML_TOK_DATA_CHARS, XML_TOK_INVALID, XML_TOK_DATA_NEWLINE (with
 * *nextTokPtr = ptr + MINBPC(enc)), XML_TOK_TRAILING_CR and
 * XML_TOK_ATTRIBUTE_VALUE_S.  Multi-byte lead bytes are skipped with
 * ptr += n without re-validation.
 * NOTE(review): case labels, the condition guarding XML_TOK_NONE and the
 * terminator of the first comment inside the body are not present in this
 * listing.
 */
1261 PREFIX(attributeValueTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1262 const char **nextTokPtr
) {
1265 return XML_TOK_NONE
;
1266 else if (! HAS_CHAR(enc
, ptr
, end
)) {
1267 /* This line cannot be executed. The incoming data has already
1268 * been tokenized once, so incomplete characters like this have
1269 * already been eliminated from the input. Retaining the paranoia
1270 * check is still valuable, however.
1272 return XML_TOK_PARTIAL
; /* LCOV_EXCL_LINE */
1275 while (HAS_CHAR(enc
, ptr
, end
)) {
1276 switch (BYTE_TYPE(enc
, ptr
)) {
1277 # define LEAD_CASE(n) \
1279 ptr += n; /* NOTE: The encoding has already been validated. */ \
1287 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1289 return XML_TOK_DATA_CHARS
;
1291 /* this is for inside entity references */
1293 return XML_TOK_INVALID
;
1296 *nextTokPtr
= ptr
+ MINBPC(enc
);
1297 return XML_TOK_DATA_NEWLINE
;
1300 return XML_TOK_DATA_CHARS
;
1304 if (! HAS_CHAR(enc
, ptr
, end
))
1305 return XML_TOK_TRAILING_CR
;
1306 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
1309 return XML_TOK_DATA_NEWLINE
;
1312 return XML_TOK_DATA_CHARS
;
1315 *nextTokPtr
= ptr
+ MINBPC(enc
);
1316 return XML_TOK_ATTRIBUTE_VALUE_S
;
1319 return XML_TOK_DATA_CHARS
;
1326 return XML_TOK_DATA_CHARS
;
/* Tokenizes the contents of an entity value.
 * Visible results: XML_TOK_NONE, XML_TOK_PARTIAL (the paranoia check
 * described below), delegation to scanRef and scanPercent (both starting
 * at ptr + MINBPC(enc)), XML_TOK_DATA_CHARS, XML_TOK_DATA_NEWLINE (with
 * *nextTokPtr = ptr + MINBPC(enc)) and XML_TOK_TRAILING_CR.  A
 * XML_TOK_PERCENT result from scanPercent is turned into
 * XML_TOK_INVALID; any other result is passed through.
 * NOTE(review): case labels, the condition guarding XML_TOK_NONE and the
 * terminator of the first comment inside the body are not present in this
 * listing.
 */
1330 PREFIX(entityValueTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1331 const char **nextTokPtr
) {
1334 return XML_TOK_NONE
;
1335 else if (! HAS_CHAR(enc
, ptr
, end
)) {
1336 /* This line cannot be executed. The incoming data has already
1337 * been tokenized once, so incomplete characters like this have
1338 * already been eliminated from the input. Retaining the paranoia
1339 * check is still valuable, however.
1341 return XML_TOK_PARTIAL
; /* LCOV_EXCL_LINE */
1344 while (HAS_CHAR(enc
, ptr
, end
)) {
1345 switch (BYTE_TYPE(enc
, ptr
)) {
1346 # define LEAD_CASE(n) \
1348 ptr += n; /* NOTE: The encoding has already been validated. */ \
1356 return PREFIX(scanRef
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1358 return XML_TOK_DATA_CHARS
;
1361 int tok
= PREFIX(scanPercent
)(enc
, ptr
+ MINBPC(enc
), end
, nextTokPtr
);
1362 return (tok
== XML_TOK_PERCENT
) ? XML_TOK_INVALID
: tok
;
1365 return XML_TOK_DATA_CHARS
;
1368 *nextTokPtr
= ptr
+ MINBPC(enc
);
1369 return XML_TOK_DATA_NEWLINE
;
1372 return XML_TOK_DATA_CHARS
;
1376 if (! HAS_CHAR(enc
, ptr
, end
))
1377 return XML_TOK_TRAILING_CR
;
1378 if (BYTE_TYPE(enc
, ptr
) == BT_LF
)
1381 return XML_TOK_DATA_NEWLINE
;
1384 return XML_TOK_DATA_CHARS
;
1391 return XML_TOK_DATA_CHARS
;
/* Scans an ignored conditional section.
 * When MINBPC(enc) > 1, the byte count end - ptr is rounded down to a
 * multiple of MINBPC(enc).  The loop classifies each character by
 * BYTE_TYPE, handles invalid sequences through INVALID_CASES, and has two
 * visible character sequences of interest: ASCII_EXCL followed by
 * ASCII_LSQB, and ASCII_RSQB followed by ASCII_GT.  Visible results are
 * XML_TOK_IGNORE_SECT and XML_TOK_PARTIAL.
 * NOTE(review): case labels, pointer advances, any nesting counter and the
 * *nextTokPtr assignments are not present in this listing.
 */
1397 PREFIX(ignoreSectionTok
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1398 const char **nextTokPtr
) {
1400 if (MINBPC(enc
) > 1) {
1401 size_t n
= end
- ptr
;
1402 if (n
& (MINBPC(enc
) - 1)) {
1403 n
&= ~(MINBPC(enc
) - 1);
1407 while (HAS_CHAR(enc
, ptr
, end
)) {
1408 switch (BYTE_TYPE(enc
, ptr
)) {
1409 INVALID_CASES(ptr
, nextTokPtr
)
1412 REQUIRE_CHAR(enc
, ptr
, end
);
1413 if (CHAR_MATCHES(enc
, ptr
, ASCII_EXCL
)) {
1415 REQUIRE_CHAR(enc
, ptr
, end
);
1416 if (CHAR_MATCHES(enc
, ptr
, ASCII_LSQB
)) {
1424 REQUIRE_CHAR(enc
, ptr
, end
);
1425 if (CHAR_MATCHES(enc
, ptr
, ASCII_RSQB
)) {
1427 REQUIRE_CHAR(enc
, ptr
, end
);
1428 if (CHAR_MATCHES(enc
, ptr
, ASCII_GT
)) {
1432 return XML_TOK_IGNORE_SECT
;
1443 return XML_TOK_PARTIAL
;
1446 # endif /* XML_DTD */
/* Walks [ptr, end) one MINBPC(enc) unit at a time, classifying each
 * character by BYTE_TYPE.  Visible checks: a comparison with ASCII_TAB, a
 * test that the character's ASCII value has no bits above 0x7f, and a
 * switch on that ASCII value.
 * NOTE(review): the return type, case labels, return statements and every
 * use of badPtr are not present in this listing; presumably badPtr
 * receives the position of an offending character - confirm against the
 * full file.
 */
1449 PREFIX(isPublicId
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1450 const char **badPtr
) {
1453 for (; HAS_CHAR(enc
, ptr
, end
); ptr
+= MINBPC(enc
)) {
1454 switch (BYTE_TYPE(enc
, ptr
)) {
1478 if (CHAR_MATCHES(enc
, ptr
, ASCII_TAB
)) {
1485 if (! (BYTE_TO_ASCII(enc
, ptr
) & ~0x7f))
1489 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1503 /* This must only be called for a well-formed start-tag or empty
1504 element tag. Returns the number of attributes. Pointers to the
1505 first attsMax attributes are stored in atts.
// Review notes, based on the visible lines only:
// - a three-value state (other, inName, inValue) starts as inName;
// - `open` holds the quote byte type while inside a value and is compared
//   with BT_QUOT and BT_APOS;
// - for each attribute index nAtts below attsMax the code fills in
//   atts[nAtts].name, .valuePtr (ptr + MINBPC(enc)), .valueEnd (ptr) and
//   .normalized (set to 1 at a name start, cleared to 0 in several cases);
// - entries are written only while nAtts < attsMax.
// NOTE(review): the terminator of the comment above, the return type, the
// rest of the parameter list, the case labels and the return statement are
// not present in this listing.
1509 PREFIX(getAtts
)(const ENCODING
*enc
, const char *ptr
, int attsMax
,
1511 enum { other
, inName
, inValue
} state
= inName
;
1513 int open
= 0; /* defined when state == inValue;
1514 initialization just to shut up compilers */
1516 for (ptr
+= MINBPC(enc
);; ptr
+= MINBPC(enc
)) {
1517 switch (BYTE_TYPE(enc
, ptr
)) {
1518 # define START_NAME \
1519 if (state == other) { \
1520 if (nAtts < attsMax) { \
1521 atts[nAtts].name = ptr; \
1522 atts[nAtts].normalized = 1; \
1526 # define LEAD_CASE(n) \
1527 case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \
1528 START_NAME ptr += (n - MINBPC(enc)); \
1541 if (state
!= inValue
) {
1542 if (nAtts
< attsMax
)
1543 atts
[nAtts
].valuePtr
= ptr
+ MINBPC(enc
);
1546 } else if (open
== BT_QUOT
) {
1548 if (nAtts
< attsMax
)
1549 atts
[nAtts
].valueEnd
= ptr
;
1554 if (state
!= inValue
) {
1555 if (nAtts
< attsMax
)
1556 atts
[nAtts
].valuePtr
= ptr
+ MINBPC(enc
);
1559 } else if (open
== BT_APOS
) {
1561 if (nAtts
< attsMax
)
1562 atts
[nAtts
].valueEnd
= ptr
;
1567 if (nAtts
< attsMax
)
1568 atts
[nAtts
].normalized
= 0;
1571 if (state
== inName
)
1573 else if (state
== inValue
&& nAtts
< attsMax
&& atts
[nAtts
].normalized
1574 && (ptr
== atts
[nAtts
].valuePtr
1575 || BYTE_TO_ASCII(enc
, ptr
) != ASCII_SPACE
1576 || BYTE_TO_ASCII(enc
, ptr
+ MINBPC(enc
)) == ASCII_SPACE
1577 || BYTE_TYPE(enc
, ptr
+ MINBPC(enc
)) == open
))
1578 atts
[nAtts
].normalized
= 0;
1582 /* This case ensures that the first attribute name is counted
1583 Apart from that we could just change state on the quote. */
1584 if (state
== inName
)
1586 else if (state
== inValue
&& nAtts
< attsMax
)
1587 atts
[nAtts
].normalized
= 0;
1591 if (state
!= inValue
)
/* Computes the code point of a numeric character reference and returns
 * checkCharRefNumber(result).
 * ptr is first advanced by two characters.  If the next character matches
 * ASCII_x, the following characters up to ASCII_SEMI are combined as
 * hexadecimal digits (offsets from ASCII_0, ASCII_A and ASCII_a);
 * otherwise characters up to ASCII_SEMI are combined as offsets from
 * ASCII_0.  Both loops compare result with 0x110000.
 * NOTE(review): the declaration of `result`, the shift/multiply steps, the
 * case labels and the action taken when result >= 0x110000 are not present
 * in this listing.
 */
1601 static int PTRFASTCALL
1602 PREFIX(charRefNumber
)(const ENCODING
*enc
, const char *ptr
) {
1606 ptr
+= 2 * MINBPC(enc
);
1607 if (CHAR_MATCHES(enc
, ptr
, ASCII_x
)) {
1608 for (ptr
+= MINBPC(enc
); ! CHAR_MATCHES(enc
, ptr
, ASCII_SEMI
);
1609 ptr
+= MINBPC(enc
)) {
1610 int c
= BYTE_TO_ASCII(enc
, ptr
);
1623 result
|= (c
- ASCII_0
);
1632 result
+= 10 + (c
- ASCII_A
);
1641 result
+= 10 + (c
- ASCII_a
);
1644 if (result
>= 0x110000)
1648 for (; ! CHAR_MATCHES(enc
, ptr
, ASCII_SEMI
); ptr
+= MINBPC(enc
)) {
1649 int c
= BYTE_TO_ASCII(enc
, ptr
);
1651 result
+= (c
- ASCII_0
);
1652 if (result
>= 0x110000)
1656 return checkCharRefNumber(result
);
/* Recognizes a name by its length and characters.
 * The outer switch is on the name length in characters,
 * (end - ptr) / MINBPC(enc).  Visible character tests: ASCII_t at
 * ptr + MINBPC(enc) followed by a switch on the first character; the
 * sequence ASCII_a, ASCII_m, ASCII_p; and, under a switch on the first
 * character, the sequences ASCII_u, ASCII_o, ASCII_t and ASCII_p,
 * ASCII_o, ASCII_s.
 * NOTE(review): the return type, the rest of the parameter list, the case
 * labels, pointer advances and return values are not present in this
 * listing; presumably these tests spell lt/gt, amp, quot and apos -
 * confirm against the full file.
 */
1660 PREFIX(predefinedEntityName
)(const ENCODING
*enc
, const char *ptr
,
1663 switch ((end
- ptr
) / MINBPC(enc
)) {
1665 if (CHAR_MATCHES(enc
, ptr
+ MINBPC(enc
), ASCII_t
)) {
1666 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1675 if (CHAR_MATCHES(enc
, ptr
, ASCII_a
)) {
1677 if (CHAR_MATCHES(enc
, ptr
, ASCII_m
)) {
1679 if (CHAR_MATCHES(enc
, ptr
, ASCII_p
))
1685 switch (BYTE_TO_ASCII(enc
, ptr
)) {
1688 if (CHAR_MATCHES(enc
, ptr
, ASCII_u
)) {
1690 if (CHAR_MATCHES(enc
, ptr
, ASCII_o
)) {
1692 if (CHAR_MATCHES(enc
, ptr
, ASCII_t
))
1699 if (CHAR_MATCHES(enc
, ptr
, ASCII_p
)) {
1701 if (CHAR_MATCHES(enc
, ptr
, ASCII_o
)) {
1703 if (CHAR_MATCHES(enc
, ptr
, ASCII_s
))
/* Compares the encoded name in [ptr1, end1) with the NUL-terminated
 * string ptr2, one character per iteration (ptr1 advances by MINBPC(enc),
 * ptr2 by one byte).  The final result is ptr1 == end1, i.e. true only
 * when the encoded name ends exactly where ptr2 does.
 * NOTE(review): the return type, the statement executed on a character
 * mismatch and the terminator of the comment inside the loop are not
 * present in this listing.
 */
1714 PREFIX(nameMatchesAscii
)(const ENCODING
*enc
, const char *ptr1
,
1715 const char *end1
, const char *ptr2
) {
1717 for (; *ptr2
; ptr1
+= MINBPC(enc
), ptr2
++) {
1718 if (end1
- ptr1
< MINBPC(enc
)) {
1719 /* This line cannot be executed. The incoming data has already
1720 * been tokenized once, so incomplete characters like this have
1721 * already been eliminated from the input. Retaining the
1722 * paranoia check is still valuable, however.
1724 return 0; /* LCOV_EXCL_LINE */
1726 if (! CHAR_MATCHES(enc
, ptr1
, *ptr2
))
1729 return ptr1
== end1
;
/* Returns the length in bytes of the name starting at ptr, computed as
 * the distance between the final ptr and its starting value.  Multi-byte
 * lead bytes advance ptr by n without re-validation.
 * NOTE(review): the enclosing loop, the case labels and the terminating
 * condition are not present in this listing.
 */
1732 static int PTRFASTCALL
1733 PREFIX(nameLength
)(const ENCODING
*enc
, const char *ptr
) {
1734 const char *start
= ptr
;
1736 switch (BYTE_TYPE(enc
, ptr
)) {
1737 # define LEAD_CASE(n) \
1739 ptr += n; /* NOTE: The encoding has already been validated. */ \
1757 return (int)(ptr
- start
);
/* Returns a pointer into the input, chosen by switching on
 * BYTE_TYPE(enc, ptr).
 * NOTE(review): only the signature and the switch header are present in
 * this listing; presumably the function skips leading whitespace and
 * returns the first non-whitespace position - confirm against the full
 * file.
 */
1762 static const char *PTRFASTCALL
1763 PREFIX(skipS
)(const ENCODING
*enc
, const char *ptr
) {
1765 switch (BYTE_TYPE(enc
, ptr
)) {
/* Advances a position record over the characters in [ptr, end).
 * Visible effects on pos: columnNumber is incremented for multi-byte
 * characters (which advance ptr by n) and in one further branch, and is
 * reset to 0 in two branches; one of those follows a check for a BT_LF
 * character after the current one.
 * NOTE(review): the return type, the `pos` parameter declaration, the case
 * labels and any line-number updates are not present in this listing.
 */
1778 PREFIX(updatePosition
)(const ENCODING
*enc
, const char *ptr
, const char *end
,
1780 while (HAS_CHAR(enc
, ptr
, end
)) {
1781 switch (BYTE_TYPE(enc
, ptr
)) {
1782 # define LEAD_CASE(n) \
1784 ptr += n; /* NOTE: The encoding has already been validated. */ \
1785 pos->columnNumber++; \
1792 pos
->columnNumber
= 0;
1799 if (HAS_CHAR(enc
, ptr
, end
) && BYTE_TYPE(enc
, ptr
) == BT_LF
)
1801 pos
->columnNumber
= 0;
1805 pos
->columnNumber
++;
1811 # undef DO_LEAD_CASE
1812 # undef MULTIBYTE_CASES
1813 # undef INVALID_CASES
1814 # undef CHECK_NAME_CASE
1815 # undef CHECK_NAME_CASES
1816 # undef CHECK_NMSTRT_CASE
1817 # undef CHECK_NMSTRT_CASES
1819 #endif /* XML_TOK_IMPL_C */