2 * parserInternals.c : Internal routines (and obsolete ones) needed for the
3 * XML and HTML parsers.
5 * See Copyright for the status of this software.
14 #define XML_DIR_SEP '\\'
16 #define XML_DIR_SEP '/'
23 #include <libxml/xmlmemory.h>
24 #include <libxml/tree.h>
25 #include <libxml/parser.h>
26 #include <libxml/parserInternals.h>
27 #include <libxml/valid.h>
28 #include <libxml/entities.h>
29 #include <libxml/xmlerror.h>
30 #include <libxml/encoding.h>
31 #include <libxml/valid.h>
32 #include <libxml/xmlIO.h>
33 #include <libxml/uri.h>
34 #include <libxml/dict.h>
35 #include <libxml/SAX.h>
36 #ifdef LIBXML_CATALOG_ENABLED
37 #include <libxml/catalog.h>
39 #include <libxml/globals.h>
40 #include <libxml/chvalid.h>
42 #define CUR(ctxt) ctxt->input->cur
43 #define END(ctxt) ctxt->input->end
44 #define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))
46 #include "private/buf.h"
47 #include "private/enc.h"
48 #include "private/error.h"
49 #include "private/io.h"
50 #include "private/parser.h"
53 * Various global defaults for parsing
58 * @version: the include version number
60 * check the compiled lib version against the include one.
61 * This can warn or immediately kill the application
64 xmlCheckVersion(int version
) {
65 int myversion
= LIBXML_VERSION
;
69 if ((myversion
/ 10000) != (version
/ 10000)) {
70 xmlGenericError(xmlGenericErrorContext
,
71 "Fatal: program compiled against libxml %d using libxml %d\n",
72 (version
/ 10000), (myversion
/ 10000));
74 "Fatal: program compiled against libxml %d using libxml %d\n",
75 (version
/ 10000), (myversion
/ 10000));
77 if ((myversion
/ 100) < (version
/ 100)) {
78 xmlGenericError(xmlGenericErrorContext
,
79 "Warning: program compiled against libxml %d using older %d\n",
80 (version
/ 100), (myversion
/ 100));
85 /************************************************************************
87 * Some factorized error routines *
89 ************************************************************************/
94 * @ctxt: an XML parser context
95 * @extra: extra information
97 * Handle a redefinition of attribute error
100 xmlErrMemory(xmlParserCtxtPtr ctxt
, const char *extra
)
102 if ((ctxt
!= NULL
) && (ctxt
->disableSAX
!= 0) &&
103 (ctxt
->instate
== XML_PARSER_EOF
))
106 ctxt
->errNo
= XML_ERR_NO_MEMORY
;
107 ctxt
->instate
= XML_PARSER_EOF
;
108 ctxt
->disableSAX
= 1;
111 __xmlRaiseError(NULL
, NULL
, NULL
, ctxt
, NULL
, XML_FROM_PARSER
,
112 XML_ERR_NO_MEMORY
, XML_ERR_FATAL
, NULL
, 0, extra
,
114 "Memory allocation failed : %s\n", extra
);
116 __xmlRaiseError(NULL
, NULL
, NULL
, ctxt
, NULL
, XML_FROM_PARSER
,
117 XML_ERR_NO_MEMORY
, XML_ERR_FATAL
, NULL
, 0, NULL
,
118 NULL
, NULL
, 0, 0, "Memory allocation failed\n");
123 * @ctxt: an XML parser context
124 * @xmlerr: the error number
125 * @msg: the error message
126 * @str1: an string info
127 * @str2: an string info
129 * Handle an encoding error
132 __xmlErrEncoding(xmlParserCtxtPtr ctxt
, xmlParserErrors xmlerr
,
133 const char *msg
, const xmlChar
* str1
, const xmlChar
* str2
)
135 if ((ctxt
!= NULL
) && (ctxt
->disableSAX
!= 0) &&
136 (ctxt
->instate
== XML_PARSER_EOF
))
139 ctxt
->errNo
= xmlerr
;
140 __xmlRaiseError(NULL
, NULL
, NULL
,
141 ctxt
, NULL
, XML_FROM_PARSER
, xmlerr
, XML_ERR_FATAL
,
142 NULL
, 0, (const char *) str1
, (const char *) str2
,
143 NULL
, 0, 0, msg
, str1
, str2
);
145 ctxt
->wellFormed
= 0;
146 if (ctxt
->recovery
== 0)
147 ctxt
->disableSAX
= 1;
153 * @ctxt: an XML parser context
154 * @msg: the error message
155 * @str: error information
157 * Handle an internal error
159 static void LIBXML_ATTR_FORMAT(2,0)
160 xmlErrInternal(xmlParserCtxtPtr ctxt
, const char *msg
, const xmlChar
* str
)
162 if ((ctxt
!= NULL
) && (ctxt
->disableSAX
!= 0) &&
163 (ctxt
->instate
== XML_PARSER_EOF
))
166 ctxt
->errNo
= XML_ERR_INTERNAL_ERROR
;
167 __xmlRaiseError(NULL
, NULL
, NULL
,
168 ctxt
, NULL
, XML_FROM_PARSER
, XML_ERR_INTERNAL_ERROR
,
169 XML_ERR_FATAL
, NULL
, 0, (const char *) str
, NULL
, NULL
,
172 ctxt
->wellFormed
= 0;
173 if (ctxt
->recovery
== 0)
174 ctxt
->disableSAX
= 1;
180 * @ctxt: an XML parser context
181 * @error: the error number
182 * @msg: the error message
183 * @val: an integer value
187 static void LIBXML_ATTR_FORMAT(3,0)
188 xmlErrEncodingInt(xmlParserCtxtPtr ctxt
, xmlParserErrors error
,
189 const char *msg
, int val
)
191 if ((ctxt
!= NULL
) && (ctxt
->disableSAX
!= 0) &&
192 (ctxt
->instate
== XML_PARSER_EOF
))
196 __xmlRaiseError(NULL
, NULL
, NULL
,
197 ctxt
, NULL
, XML_FROM_PARSER
, error
, XML_ERR_FATAL
,
198 NULL
, 0, NULL
, NULL
, NULL
, val
, 0, msg
, val
);
200 ctxt
->wellFormed
= 0;
201 if (ctxt
->recovery
== 0)
202 ctxt
->disableSAX
= 1;
208 * @c: a Unicode character (int)
210 * Check whether the character is allowed by the production
211 * [84] Letter ::= BaseChar | Ideographic
213 * Returns 0 if not, non-zero otherwise
217 return(IS_BASECHAR(c
) || IS_IDEOGRAPHIC(c
));
220 /************************************************************************
222 * Input handling functions for progressive parsing *
224 ************************************************************************/
226 /* #define DEBUG_INPUT */
227 /* #define DEBUG_STACK */
228 /* #define DEBUG_PUSH */
231 /* we need to keep enough input to show errors in context */
235 #define CHECK_BUFFER(in) check_buffer(in)
238 void check_buffer(xmlParserInputPtr in
) {
239 if (in
->base
!= xmlBufContent(in
->buf
->buffer
)) {
240 xmlGenericError(xmlGenericErrorContext
,
241 "xmlParserInput: base mismatch problem\n");
243 if (in
->cur
< in
->base
) {
244 xmlGenericError(xmlGenericErrorContext
,
245 "xmlParserInput: cur < base problem\n");
247 if (in
->cur
> in
->base
+ xmlBufUse(in
->buf
->buffer
)) {
248 xmlGenericError(xmlGenericErrorContext
,
249 "xmlParserInput: cur > base + use problem\n");
251 xmlGenericError(xmlGenericErrorContext
,"buffer %p : content %x, cur %d, use %d\n",
252 (void *) in
, (int) xmlBufContent(in
->buf
->buffer
),
253 in
->cur
- in
->base
, xmlBufUse(in
->buf
->buffer
));
257 #define CHECK_BUFFER(in)
263 * @ctxt: an XML parser context
265 * Blocks further parser processing don't override error
269 xmlHaltParser(xmlParserCtxtPtr ctxt
) {
272 ctxt
->instate
= XML_PARSER_EOF
;
273 ctxt
->disableSAX
= 1;
274 while (ctxt
->inputNr
> 1)
275 xmlFreeInputStream(inputPop(ctxt
));
276 if (ctxt
->input
!= NULL
) {
278 * in case there was a specific allocation deallocate before
281 if (ctxt
->input
->free
!= NULL
) {
282 ctxt
->input
->free((xmlChar
*) ctxt
->input
->base
);
283 ctxt
->input
->free
= NULL
;
285 if (ctxt
->input
->buf
!= NULL
) {
286 xmlFreeParserInputBuffer(ctxt
->input
->buf
);
287 ctxt
->input
->buf
= NULL
;
289 ctxt
->input
->cur
= BAD_CAST
"";
290 ctxt
->input
->length
= 0;
291 ctxt
->input
->base
= ctxt
->input
->cur
;
292 ctxt
->input
->end
= ctxt
->input
->cur
;
297 * xmlParserInputRead:
298 * @in: an XML parser input
299 * @len: an indicative size for the lookahead
301 * DEPRECATED: This function was internal and is deprecated.
303 * Returns -1 as this is an error to use it.
306 xmlParserInputRead(xmlParserInputPtr in ATTRIBUTE_UNUSED
, int len ATTRIBUTE_UNUSED
) {
312 * @ctxt: an XML parser context
315 xmlParserGrow(xmlParserCtxtPtr ctxt
) {
316 xmlParserInputPtr in
= ctxt
->input
;
317 xmlParserInputBufferPtr buf
= in
->buf
;
318 ptrdiff_t curEnd
= in
->end
- in
->cur
;
319 ptrdiff_t curBase
= in
->cur
- in
->base
;
324 /* Don't grow push parser buffer. */
325 if (ctxt
->progressive
)
327 /* Don't grow memory buffers. */
328 if ((buf
->encoder
== NULL
) && (buf
->readcallback
== NULL
))
331 if (((curEnd
> XML_MAX_LOOKUP_LIMIT
) ||
332 (curBase
> XML_MAX_LOOKUP_LIMIT
)) &&
333 ((ctxt
->options
& XML_PARSE_HUGE
) == 0)) {
334 xmlErrInternal(ctxt
, "Huge input lookup", NULL
);
339 if (curEnd
>= INPUT_CHUNK
)
342 ret
= xmlParserInputBufferGrow(buf
, INPUT_CHUNK
);
343 xmlBufSetInputBaseCur(buf
->buffer
, in
, 0, curBase
);
345 /* TODO: Get error code from xmlParserInputBufferGrow */
347 xmlErrInternal(ctxt
, "Growing input buffer", NULL
);
355 * xmlParserInputGrow:
356 * @in: an XML parser input
357 * @len: an indicative size for the lookahead
359 * DEPRECATED: Don't use.
361 * This function increases the input for the parser. It tries to
362 * preserve pointers to the input buffer, and keep already read data
364 * Returns the amount of char read, or -1 in case of error, 0 indicate the
368 xmlParserInputGrow(xmlParserInputPtr in
, int len
) {
372 if ((in
== NULL
) || (len
< 0)) return(-1);
374 xmlGenericError(xmlGenericErrorContext
, "Grow\n");
376 if (in
->buf
== NULL
) return(-1);
377 if (in
->base
== NULL
) return(-1);
378 if (in
->cur
== NULL
) return(-1);
379 if (in
->buf
->buffer
== NULL
) return(-1);
381 /* Don't grow memory buffers. */
382 if ((in
->buf
->encoder
== NULL
) && (in
->buf
->readcallback
== NULL
))
387 indx
= in
->cur
- in
->base
;
388 if (xmlBufUse(in
->buf
->buffer
) > (unsigned int) indx
+ INPUT_CHUNK
) {
394 ret
= xmlParserInputBufferGrow(in
->buf
, len
);
396 in
->base
= xmlBufContent(in
->buf
->buffer
);
397 if (in
->base
== NULL
) {
398 in
->base
= BAD_CAST
"";
403 in
->cur
= in
->base
+ indx
;
404 in
->end
= xmlBufEnd(in
->buf
->buffer
);
413 * @ctxt: an XML parser context
416 xmlParserShrink(xmlParserCtxtPtr ctxt
) {
417 xmlParserInputPtr in
= ctxt
->input
;
418 xmlParserInputBufferPtr buf
= in
->buf
;
421 /* Don't shrink pull parser memory buffers. */
423 ((ctxt
->progressive
== 0) &&
424 (buf
->encoder
== NULL
) && (buf
->readcallback
== NULL
)))
427 used
= in
->cur
- in
->base
;
429 * Do not shrink on large buffers whose only a tiny fraction
432 if (used
> INPUT_CHUNK
) {
433 size_t res
= xmlBufShrink(buf
->buffer
, used
- LINE_LEN
);
437 if ((res
> ULONG_MAX
) ||
438 (in
->consumed
> ULONG_MAX
- (unsigned long)res
))
439 in
->consumed
= ULONG_MAX
;
445 xmlBufSetInputBaseCur(buf
->buffer
, in
, 0, used
);
449 * xmlParserInputShrink:
450 * @in: an XML parser input
452 * DEPRECATED: Don't use.
454 * This function removes used input for the parser.
457 xmlParserInputShrink(xmlParserInputPtr in
) {
462 xmlGenericError(xmlGenericErrorContext
, "Shrink\n");
464 if (in
== NULL
) return;
465 if (in
->buf
== NULL
) return;
466 if (in
->base
== NULL
) return;
467 if (in
->cur
== NULL
) return;
468 if (in
->buf
->buffer
== NULL
) return;
472 used
= in
->cur
- in
->base
;
474 * Do not shrink on large buffers whose only a tiny fraction
477 if (used
> INPUT_CHUNK
) {
478 ret
= xmlBufShrink(in
->buf
->buffer
, used
- LINE_LEN
);
481 if ((ret
> ULONG_MAX
) ||
482 (in
->consumed
> ULONG_MAX
- (unsigned long)ret
))
483 in
->consumed
= ULONG_MAX
;
489 if (xmlBufUse(in
->buf
->buffer
) <= INPUT_CHUNK
) {
490 xmlParserInputBufferRead(in
->buf
, 2 * INPUT_CHUNK
);
493 in
->base
= xmlBufContent(in
->buf
->buffer
);
494 if (in
->base
== NULL
) {
495 /* TODO: raise error */
496 in
->base
= BAD_CAST
"";
501 in
->cur
= in
->base
+ used
;
502 in
->end
= xmlBufEnd(in
->buf
->buffer
);
507 /************************************************************************
509 * UTF8 character input and related functions *
511 ************************************************************************/
515 * @ctxt: the XML parser context
517 * DEPRECATED: Internal function, do not use.
519 * Skip to the next input char.
523 xmlNextChar(xmlParserCtxtPtr ctxt
)
525 if ((ctxt
== NULL
) || (ctxt
->instate
== XML_PARSER_EOF
) ||
526 (ctxt
->input
== NULL
))
529 if (!(VALID_CTXT(ctxt
))) {
530 xmlErrInternal(ctxt
, "Parser input data memory error\n", NULL
);
531 ctxt
->errNo
= XML_ERR_INTERNAL_ERROR
;
536 if (ctxt
->input
->end
- ctxt
->input
->cur
< INPUT_CHUNK
) {
537 if (xmlParserGrow(ctxt
) < 0)
539 if (ctxt
->input
->cur
>= ctxt
->input
->end
)
543 if (ctxt
->charset
== XML_CHAR_ENCODING_UTF8
) {
544 const unsigned char *cur
;
548 * 2.11 End-of-Line Handling
549 * the literal two-character sequence "#xD#xA" or a standalone
550 * literal #xD, an XML processor must pass to the application
551 * the single character #xA.
553 if (*(ctxt
->input
->cur
) == '\n') {
554 ctxt
->input
->line
++; ctxt
->input
->col
= 1;
559 * We are supposed to handle UTF8, check it's valid
560 * From rfc2044: encoding of the Unicode values on UTF-8:
562 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
563 * 0000 0000-0000 007F 0xxxxxxx
564 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
565 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
567 * Check for the 0x110000 limit too
569 cur
= ctxt
->input
->cur
;
578 avail
= ctxt
->input
->end
- ctxt
->input
->cur
;
580 if ((avail
< 2) || (cur
[1] & 0xc0) != 0x80)
582 if ((c
& 0xe0) == 0xe0) {
585 if ((avail
< 3) || (cur
[2] & 0xc0) != 0x80)
587 if ((c
& 0xf0) == 0xf0) {
588 if (((c
& 0xf8) != 0xf0) ||
589 (avail
< 4) || ((cur
[3] & 0xc0) != 0x80))
592 ctxt
->input
->cur
+= 4;
593 val
= (cur
[0] & 0x7) << 18;
594 val
|= (cur
[1] & 0x3f) << 12;
595 val
|= (cur
[2] & 0x3f) << 6;
596 val
|= cur
[3] & 0x3f;
599 ctxt
->input
->cur
+= 3;
600 val
= (cur
[0] & 0xf) << 12;
601 val
|= (cur
[1] & 0x3f) << 6;
602 val
|= cur
[2] & 0x3f;
604 if (((val
> 0xd7ff) && (val
< 0xe000)) ||
605 ((val
> 0xfffd) && (val
< 0x10000)) ||
607 xmlErrEncodingInt(ctxt
, XML_ERR_INVALID_CHAR
,
608 "Char 0x%X out of allowed range\n",
613 ctxt
->input
->cur
+= 2;
619 * Assume it's a fixed length encoding (1) with
620 * a compatible encoding for the ASCII set, since
621 * XML constructs only use < 128 chars
624 if (*(ctxt
->input
->cur
) == '\n') {
625 ctxt
->input
->line
++; ctxt
->input
->col
= 1;
633 * If we detect an UTF8 error that probably mean that the
634 * input encoding didn't get properly advertised in the
635 * declaration header. Report the error and switch the encoding
636 * to ISO-Latin-1 (if you don't like this policy, just declare the
639 if ((ctxt
== NULL
) || (ctxt
->input
== NULL
) ||
640 (ctxt
->input
->end
- ctxt
->input
->cur
< 4)) {
641 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
642 "Input is not proper UTF-8, indicate encoding !\n",
647 snprintf(buffer
, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
648 ctxt
->input
->cur
[0], ctxt
->input
->cur
[1],
649 ctxt
->input
->cur
[2], ctxt
->input
->cur
[3]);
650 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
651 "Input is not proper UTF-8, indicate encoding !\n%s",
652 BAD_CAST buffer
, NULL
);
654 ctxt
->charset
= XML_CHAR_ENCODING_8859_1
;
661 * @ctxt: the XML parser context
662 * @len: pointer to the length of the char read
664 * DEPRECATED: Internal function, do not use.
666 * The current char value, if using UTF-8 this may actually span multiple
667 * bytes in the input buffer. Implement the end of line normalization:
668 * 2.11 End-of-Line Handling
669 * Wherever an external parsed entity or the literal entity value
670 * of an internal parsed entity contains either the literal two-character
671 * sequence "#xD#xA" or a standalone literal #xD, an XML processor
672 * must pass to the application the single character #xA.
673 * This behavior can conveniently be produced by normalizing all
674 * line breaks to #xA on input, before parsing.)
676 * Returns the current char value and its length
680 xmlCurrentChar(xmlParserCtxtPtr ctxt
, int *len
) {
681 if ((ctxt
== NULL
) || (len
== NULL
) || (ctxt
->input
== NULL
)) return(0);
682 if (ctxt
->instate
== XML_PARSER_EOF
)
685 if ((ctxt
->input
->end
- ctxt
->input
->cur
< INPUT_CHUNK
) &&
686 (xmlParserGrow(ctxt
) < 0))
689 if ((*ctxt
->input
->cur
>= 0x20) && (*ctxt
->input
->cur
<= 0x7F)) {
691 return(*ctxt
->input
->cur
);
693 if (ctxt
->charset
== XML_CHAR_ENCODING_UTF8
) {
695 * We are supposed to handle UTF8, check it's valid
696 * From rfc2044: encoding of the Unicode values on UTF-8:
698 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
699 * 0000 0000-0000 007F 0xxxxxxx
700 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
701 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
703 * Check for the 0x110000 limit too
705 const unsigned char *cur
= ctxt
->input
->cur
;
713 if (((c
& 0x40) == 0) || (c
== 0xC0))
716 avail
= ctxt
->input
->end
- ctxt
->input
->cur
;
719 goto incomplete_sequence
;
720 if ((cur
[1] & 0xc0) != 0x80)
722 if ((c
& 0xe0) == 0xe0) {
724 goto incomplete_sequence
;
725 if ((cur
[2] & 0xc0) != 0x80)
727 if ((c
& 0xf0) == 0xf0) {
729 goto incomplete_sequence
;
730 if (((c
& 0xf8) != 0xf0) ||
731 ((cur
[3] & 0xc0) != 0x80))
735 val
= (cur
[0] & 0x7) << 18;
736 val
|= (cur
[1] & 0x3f) << 12;
737 val
|= (cur
[2] & 0x3f) << 6;
738 val
|= cur
[3] & 0x3f;
744 val
= (cur
[0] & 0xf) << 12;
745 val
|= (cur
[1] & 0x3f) << 6;
746 val
|= cur
[2] & 0x3f;
753 val
= (cur
[0] & 0x1f) << 6;
754 val
|= cur
[1] & 0x3f;
759 xmlErrEncodingInt(ctxt
, XML_ERR_INVALID_CHAR
,
760 "Char 0x%X out of allowed range\n", val
);
766 if ((*ctxt
->input
->cur
== 0) &&
767 (ctxt
->input
->end
> ctxt
->input
->cur
)) {
768 xmlErrEncodingInt(ctxt
, XML_ERR_INVALID_CHAR
,
769 "Char 0x0 out of allowed range\n", 0);
771 if (*ctxt
->input
->cur
== 0xD) {
772 if (ctxt
->input
->cur
[1] == 0xA) {
777 return(*ctxt
->input
->cur
);
781 * Assume it's a fixed length encoding (1) with
782 * a compatible encoding for the ASCII set, since
783 * XML constructs only use < 128 chars
786 if (*ctxt
->input
->cur
== 0xD) {
787 if (ctxt
->input
->cur
[1] == 0xA) {
792 return(*ctxt
->input
->cur
);
796 * If we detect an UTF8 error that probably mean that the
797 * input encoding didn't get properly advertised in the
798 * declaration header. Report the error and switch the encoding
799 * to ISO-Latin-1 (if you don't like this policy, just declare the
802 if (ctxt
->input
->end
- ctxt
->input
->cur
< 4) {
803 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
804 "Input is not proper UTF-8, indicate encoding !\n",
809 snprintf(&buffer
[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
810 ctxt
->input
->cur
[0], ctxt
->input
->cur
[1],
811 ctxt
->input
->cur
[2], ctxt
->input
->cur
[3]);
812 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
813 "Input is not proper UTF-8, indicate encoding !\n%s",
814 BAD_CAST buffer
, NULL
);
816 ctxt
->charset
= XML_CHAR_ENCODING_8859_1
;
818 return(*ctxt
->input
->cur
);
822 * An encoding problem may arise from a truncated input buffer
823 * splitting a character in the middle. In that case do not raise
824 * an error but return 0. This should only happen when push parsing
832 * xmlStringCurrentChar:
833 * @ctxt: the XML parser context
834 * @cur: pointer to the beginning of the char
835 * @len: pointer to the length of the char read
837 * DEPRECATED: Internal function, do not use.
839 * The current char value, if using UTF-8 this may actually span multiple
840 * bytes in the input buffer.
842 * Returns the current char value and its length
846 xmlStringCurrentChar(xmlParserCtxtPtr ctxt
, const xmlChar
* cur
, int *len
)
848 if ((len
== NULL
) || (cur
== NULL
)) return(0);
849 if ((ctxt
== NULL
) || (ctxt
->charset
== XML_CHAR_ENCODING_UTF8
)) {
851 * We are supposed to handle UTF8, check it's valid
852 * From rfc2044: encoding of the Unicode values on UTF-8:
854 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
855 * 0000 0000-0000 007F 0xxxxxxx
856 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
857 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
859 * Check for the 0x110000 limit too
866 if ((cur
[1] & 0xc0) != 0x80)
868 if ((c
& 0xe0) == 0xe0) {
870 if ((cur
[2] & 0xc0) != 0x80)
872 if ((c
& 0xf0) == 0xf0) {
873 if (((c
& 0xf8) != 0xf0) || ((cur
[3] & 0xc0) != 0x80))
877 val
= (cur
[0] & 0x7) << 18;
878 val
|= (cur
[1] & 0x3f) << 12;
879 val
|= (cur
[2] & 0x3f) << 6;
880 val
|= cur
[3] & 0x3f;
884 val
= (cur
[0] & 0xf) << 12;
885 val
|= (cur
[1] & 0x3f) << 6;
886 val
|= cur
[2] & 0x3f;
891 val
= (cur
[0] & 0x1f) << 6;
892 val
|= cur
[1] & 0x3f;
895 xmlErrEncodingInt(ctxt
, XML_ERR_INVALID_CHAR
,
896 "Char 0x%X out of allowed range\n", val
);
906 * Assume it's a fixed length encoding (1) with
907 * a compatible encoding for the ASCII set, since
908 * XML constructs only use < 128 chars
915 * An encoding problem may arise from a truncated input buffer
916 * splitting a character in the middle. In that case do not raise
917 * an error but return 0 to indicate an end of stream problem
919 if ((ctxt
== NULL
) || (ctxt
->input
== NULL
) ||
920 (ctxt
->input
->end
- ctxt
->input
->cur
< 4)) {
925 * If we detect an UTF8 error that probably mean that the
926 * input encoding didn't get properly advertised in the
927 * declaration header. Report the error and switch the encoding
928 * to ISO-Latin-1 (if you don't like this policy, just declare the
934 snprintf(buffer
, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
935 ctxt
->input
->cur
[0], ctxt
->input
->cur
[1],
936 ctxt
->input
->cur
[2], ctxt
->input
->cur
[3]);
937 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
938 "Input is not proper UTF-8, indicate encoding !\n%s",
939 BAD_CAST buffer
, NULL
);
946 * xmlCopyCharMultiByte:
947 * @out: pointer to an array of xmlChar
948 * @val: the char value
950 * append the char value in the array
952 * Returns the number of xmlChar written
955 xmlCopyCharMultiByte(xmlChar
*out
, int val
) {
956 if ((out
== NULL
) || (val
< 0)) return(0);
958 * We are supposed to handle UTF8, check it's valid
959 * From rfc2044: encoding of the Unicode values on UTF-8:
961 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
962 * 0000 0000-0000 007F 0xxxxxxx
963 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
964 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
967 xmlChar
*savedout
= out
;
969 if (val
< 0x800) { *out
++= (val
>> 6) | 0xC0; bits
= 0; }
970 else if (val
< 0x10000) { *out
++= (val
>> 12) | 0xE0; bits
= 6;}
971 else if (val
< 0x110000) { *out
++= (val
>> 18) | 0xF0; bits
= 12; }
973 xmlErrEncodingInt(NULL
, XML_ERR_INVALID_CHAR
,
974 "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n",
978 for ( ; bits
>= 0; bits
-= 6)
979 *out
++= ((val
>> bits
) & 0x3F) | 0x80 ;
980 return (out
- savedout
);
988 * @len: Ignored, compatibility
989 * @out: pointer to an array of xmlChar
990 * @val: the char value
992 * append the char value in the array
994 * Returns the number of xmlChar written
998 xmlCopyChar(int len ATTRIBUTE_UNUSED
, xmlChar
*out
, int val
) {
999 if ((out
== NULL
) || (val
< 0)) return(0);
1000 /* the len parameter is ignored */
1002 return(xmlCopyCharMultiByte (out
, val
));
1008 /************************************************************************
1010 * Commodity functions to switch encodings *
1012 ************************************************************************/
1014 static xmlCharEncodingHandlerPtr
1015 xmlDetectEBCDIC(xmlParserInputPtr input
) {
1017 xmlCharEncodingHandlerPtr handler
;
1018 int inlen
, outlen
, res
, i
;
1021 * To detect the EBCDIC code page, we convert the first 200 bytes
1022 * to EBCDIC-US and try to find the encoding declaration.
1024 handler
= xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC
);
1025 if (handler
== NULL
)
1027 outlen
= sizeof(out
) - 1;
1028 inlen
= input
->end
- input
->cur
;
1029 res
= xmlEncInputChunk(handler
, out
, &outlen
, input
->cur
, &inlen
, 0);
1034 for (i
= 0; i
< outlen
; i
++) {
1037 if ((out
[i
] == 'e') &&
1038 (xmlStrncmp(out
+ i
, BAD_CAST
"encoding", 8) == 0)) {
1039 int start
, cur
, quote
;
1042 while (IS_BLANK_CH(out
[i
]))
1044 if (out
[i
++] != '=')
1046 while (IS_BLANK_CH(out
[i
]))
1049 if ((quote
!= '\'') && (quote
!= '"'))
1053 while (((cur
>= 'a') && (cur
<= 'z')) ||
1054 ((cur
>= 'A') && (cur
<= 'Z')) ||
1055 ((cur
>= '0') && (cur
<= '9')) ||
1056 (cur
== '.') || (cur
== '_') ||
1062 xmlCharEncCloseFunc(handler
);
1063 handler
= xmlFindCharEncodingHandler((char *) out
+ start
);
1072 * xmlSwitchEncoding:
1073 * @ctxt: the parser context
1074 * @enc: the encoding value (number)
1076 * change the input functions when discovering the character encoding
1077 * of a given entity.
1079 * Returns 0 in case of success, -1 otherwise
1082 xmlSwitchEncoding(xmlParserCtxtPtr ctxt
, xmlCharEncoding enc
)
1084 xmlCharEncodingHandlerPtr handler
;
1087 if (ctxt
== NULL
) return(-1);
1090 * FIXME: The BOM shouldn't be skipped here, but in the parsing code.
1092 * Note that we look for a decoded UTF-8 BOM when switching to UTF-16.
1093 * This is mostly useless but Webkit/Chromium relies on this behavior.
1094 * See https://bugs.chromium.org/p/chromium/issues/detail?id=1451026
1096 if ((ctxt
->input
!= NULL
) &&
1097 (ctxt
->input
->consumed
== 0) &&
1098 (ctxt
->input
->cur
!= NULL
) &&
1099 (ctxt
->input
->cur
== ctxt
->input
->base
) &&
1100 ((enc
== XML_CHAR_ENCODING_UTF8
) ||
1101 (enc
== XML_CHAR_ENCODING_UTF16LE
) ||
1102 (enc
== XML_CHAR_ENCODING_UTF16BE
))) {
1104 * Errata on XML-1.0 June 20 2001
1105 * Specific handling of the Byte Order Mark for
1108 if ((ctxt
->input
->cur
[0] == 0xEF) &&
1109 (ctxt
->input
->cur
[1] == 0xBB) &&
1110 (ctxt
->input
->cur
[2] == 0xBF)) {
1111 ctxt
->input
->cur
+= 3;
1116 case XML_CHAR_ENCODING_ERROR
:
1117 __xmlErrEncoding(ctxt
, XML_ERR_UNKNOWN_ENCODING
,
1118 "encoding unknown\n", NULL
, NULL
);
1120 case XML_CHAR_ENCODING_NONE
:
1121 /* let's assume it's UTF-8 without the XML decl */
1122 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1124 case XML_CHAR_ENCODING_UTF8
:
1125 /* default encoding, no conversion should be needed */
1126 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1128 case XML_CHAR_ENCODING_EBCDIC
:
1129 handler
= xmlDetectEBCDIC(ctxt
->input
);
1132 handler
= xmlGetCharEncodingHandler(enc
);
1135 if (handler
== NULL
) {
1140 case XML_CHAR_ENCODING_ASCII
:
1141 /* default encoding, no conversion should be needed */
1142 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1144 case XML_CHAR_ENCODING_8859_1
:
1145 if ((ctxt
->inputNr
== 1) &&
1146 (ctxt
->encoding
== NULL
) &&
1147 (ctxt
->input
!= NULL
) &&
1148 (ctxt
->input
->encoding
!= NULL
)) {
1149 ctxt
->encoding
= xmlStrdup(ctxt
->input
->encoding
);
1151 ctxt
->charset
= enc
;
1154 __xmlErrEncoding(ctxt
, XML_ERR_UNSUPPORTED_ENCODING
,
1155 "encoding not supported: %s\n",
1156 BAD_CAST
xmlGetCharEncodingName(enc
), NULL
);
1158 * TODO: We could recover from errors in external entities
1159 * if we didn't stop the parser. But most callers of this
1160 * function don't check the return value.
1162 xmlStopParser(ctxt
);
1166 ret
= xmlSwitchInputEncoding(ctxt
, ctxt
->input
, handler
);
1167 if ((ret
< 0) || (ctxt
->errNo
== XML_I18N_CONV_FAILED
)) {
1169 * on encoding conversion errors, stop the parser
1171 xmlStopParser(ctxt
);
1172 ctxt
->errNo
= XML_I18N_CONV_FAILED
;
1178 * xmlSwitchInputEncoding:
1179 * @ctxt: the parser context
1180 * @input: the input stream
1181 * @handler: the encoding handler
1183 * change the input functions when discovering the character encoding
1184 * of a given entity.
1186 * Returns 0 in case of success, -1 otherwise
1189 xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt
, xmlParserInputPtr input
,
1190 xmlCharEncodingHandlerPtr handler
)
1193 xmlParserInputBufferPtr in
;
1195 if (handler
== NULL
)
1201 xmlErrInternal(ctxt
,
1202 "static memory buffer doesn't support encoding\n", NULL
);
1204 * Callers assume that the input buffer takes ownership of the
1205 * encoding handler. xmlCharEncCloseFunc frees unregistered
1206 * handlers and avoids a memory leak.
1208 xmlCharEncCloseFunc(handler
);
1212 if (in
->encoder
!= NULL
) {
1213 if (in
->encoder
== handler
)
1217 * Switching encodings during parsing is a really bad idea,
1218 * but Chromium can switch between ISO-8859-1 and UTF-16 before
1219 * separate calls to xmlParseChunk.
1221 * TODO: We should check whether the "raw" input buffer is empty and
1222 * convert the old content using the old encoder.
1225 xmlCharEncCloseFunc(in
->encoder
);
1226 in
->encoder
= handler
;
1230 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1231 in
->encoder
= handler
;
1234 * Is there already some content down the pipe to convert ?
1236 if (xmlBufIsEmpty(in
->buffer
) == 0) {
1237 size_t processed
, use
, consumed
;
1240 * FIXME: The BOM shouldn't be skipped here, but in the parsing code.
1244 * Specific handling of the Byte Order Mark for
1247 if ((handler
->name
!= NULL
) &&
1248 (!strcmp(handler
->name
, "UTF-16LE") ||
1249 !strcmp(handler
->name
, "UTF-16")) &&
1250 (input
->cur
[0] == 0xFF) && (input
->cur
[1] == 0xFE)) {
1253 if ((handler
->name
!= NULL
) &&
1254 (!strcmp(handler
->name
, "UTF-16BE")) &&
1255 (input
->cur
[0] == 0xFE) && (input
->cur
[1] == 0xFF)) {
1259 * Errata on XML-1.0 June 20 2001
1260 * Specific handling of the Byte Order Mark for
1263 if ((handler
->name
!= NULL
) &&
1264 (!strcmp(handler
->name
, "UTF-8")) &&
1265 (input
->cur
[0] == 0xEF) &&
1266 (input
->cur
[1] == 0xBB) && (input
->cur
[2] == 0xBF)) {
1271 * Shrink the current input buffer.
1272 * Move it as the raw buffer and create a new input buffer
1274 processed
= input
->cur
- input
->base
;
1275 xmlBufShrink(in
->buffer
, processed
);
1276 input
->consumed
+= processed
;
1277 in
->raw
= in
->buffer
;
1278 in
->buffer
= xmlBufCreate();
1279 in
->rawconsumed
= processed
;
1280 use
= xmlBufUse(in
->raw
);
1283 * TODO: We must flush and decode the whole buffer to make functions
1284 * like xmlReadMemory work with a user-provided encoding. If the
1285 * encoding is specified directly, we should probably set
1286 * XML_PARSE_IGNORE_ENC in xmlDoRead to avoid switching encodings
1287 * twice. Then we could set "flush" to false which should save
1288 * a considerable amount of memory when parsing from memory.
1289 * It's probably even possible to remove this whole if-block
1292 nbchars
= xmlCharEncInput(in
, 1);
1293 xmlBufResetInput(in
->buffer
, input
);
1295 /* TODO: This could be an out of memory or an encoding error. */
1296 xmlErrInternal(ctxt
,
1297 "switching encoding: encoder error\n",
1299 xmlHaltParser(ctxt
);
1302 consumed
= use
- xmlBufUse(in
->raw
);
1303 if ((consumed
> ULONG_MAX
) ||
1304 (in
->rawconsumed
> ULONG_MAX
- (unsigned long)consumed
))
1305 in
->rawconsumed
= ULONG_MAX
;
1307 in
->rawconsumed
+= consumed
;
1313 * xmlSwitchToEncoding:
1314 * @ctxt: the parser context
1315 * @handler: the encoding handler
1317 * change the input functions when discovering the character encoding
1318 * of a given entity.
1320 * Returns 0 in case of success, -1 otherwise
1323 xmlSwitchToEncoding(xmlParserCtxtPtr ctxt
, xmlCharEncodingHandlerPtr handler
)
1327 return(xmlSwitchInputEncoding(ctxt
, ctxt
->input
, handler
));
1330 /************************************************************************
1332 * Commodity functions to handle entities processing *
1334 ************************************************************************/
1337 * xmlFreeInputStream:
1338 * @input: an xmlParserInputPtr
1340 * Free up an input stream.
1343 xmlFreeInputStream(xmlParserInputPtr input
) {
1344 if (input
== NULL
) return;
1346 if (input
->filename
!= NULL
) xmlFree((char *) input
->filename
);
1347 if (input
->directory
!= NULL
) xmlFree((char *) input
->directory
);
1348 if (input
->encoding
!= NULL
) xmlFree((char *) input
->encoding
);
1349 if (input
->version
!= NULL
) xmlFree((char *) input
->version
);
1350 if ((input
->free
!= NULL
) && (input
->base
!= NULL
))
1351 input
->free((xmlChar
*) input
->base
);
1352 if (input
->buf
!= NULL
)
1353 xmlFreeParserInputBuffer(input
->buf
);
1358 * xmlNewInputStream:
1359 * @ctxt: an XML parser context
1361 * Create a new input stream structure.
1363 * Returns the new input stream or NULL
1366 xmlNewInputStream(xmlParserCtxtPtr ctxt
) {
1367 xmlParserInputPtr input
;
1369 input
= (xmlParserInputPtr
) xmlMalloc(sizeof(xmlParserInput
));
1370 if (input
== NULL
) {
1371 xmlErrMemory(ctxt
, "couldn't allocate a new input stream\n");
1374 memset(input
, 0, sizeof(xmlParserInput
));
1377 input
->standalone
= -1;
1380 * If the context is NULL the id cannot be initialized, but that
1381 * should not happen while parsing which is the situation where
1382 * the id is actually needed.
1385 if (input
->id
>= INT_MAX
) {
1386 xmlErrMemory(ctxt
, "Input ID overflow\n");
1389 input
->id
= ctxt
->input_id
++;
1396 * xmlNewIOInputStream:
1397 * @ctxt: an XML parser context
1398 * @input: an I/O Input
1399 * @enc: the charset encoding if known
1401 * Create a new input stream structure encapsulating the @input into
1402 * a stream suitable for the parser.
1404 * Returns the new input stream or NULL
1407 xmlNewIOInputStream(xmlParserCtxtPtr ctxt
, xmlParserInputBufferPtr input
,
1408 xmlCharEncoding enc
) {
1409 xmlParserInputPtr inputStream
;
1411 if (input
== NULL
) return(NULL
);
1412 if (xmlParserDebugEntities
)
1413 xmlGenericError(xmlGenericErrorContext
, "new input from I/O\n");
1414 inputStream
= xmlNewInputStream(ctxt
);
1415 if (inputStream
== NULL
) {
1418 inputStream
->filename
= NULL
;
1419 inputStream
->buf
= input
;
1420 xmlBufResetInput(inputStream
->buf
->buffer
, inputStream
);
1422 if (enc
!= XML_CHAR_ENCODING_NONE
) {
1423 xmlSwitchEncoding(ctxt
, enc
);
1426 return(inputStream
);
1430 * xmlNewEntityInputStream:
1431 * @ctxt: an XML parser context
1432 * @entity: an Entity pointer
1434 * DEPRECATED: Internal function, do not use.
1436 * Create a new input stream based on an xmlEntityPtr
1438 * Returns the new input stream or NULL
1441 xmlNewEntityInputStream(xmlParserCtxtPtr ctxt
, xmlEntityPtr entity
) {
1442 xmlParserInputPtr input
;
1444 if (entity
== NULL
) {
1445 xmlErrInternal(ctxt
, "xmlNewEntityInputStream entity = NULL\n",
1449 if (xmlParserDebugEntities
)
1450 xmlGenericError(xmlGenericErrorContext
,
1451 "new input from entity: %s\n", entity
->name
);
1452 if (entity
->content
== NULL
) {
1453 switch (entity
->etype
) {
1454 case XML_EXTERNAL_GENERAL_UNPARSED_ENTITY
:
1455 xmlErrInternal(ctxt
, "Cannot parse entity %s\n",
1458 case XML_EXTERNAL_GENERAL_PARSED_ENTITY
:
1459 case XML_EXTERNAL_PARAMETER_ENTITY
:
1460 input
= xmlLoadExternalEntity((char *) entity
->URI
,
1461 (char *) entity
->ExternalID
, ctxt
);
1463 input
->entity
= entity
;
1465 case XML_INTERNAL_GENERAL_ENTITY
:
1466 xmlErrInternal(ctxt
,
1467 "Internal entity %s without content !\n",
1470 case XML_INTERNAL_PARAMETER_ENTITY
:
1471 xmlErrInternal(ctxt
,
1472 "Internal parameter entity %s without content !\n",
1475 case XML_INTERNAL_PREDEFINED_ENTITY
:
1476 xmlErrInternal(ctxt
,
1477 "Predefined entity %s without content !\n",
1483 input
= xmlNewInputStream(ctxt
);
1484 if (input
== NULL
) {
1487 if (entity
->URI
!= NULL
)
1488 input
->filename
= (char *) xmlStrdup((xmlChar
*) entity
->URI
);
1489 input
->base
= entity
->content
;
1490 if (entity
->length
== 0)
1491 entity
->length
= xmlStrlen(entity
->content
);
1492 input
->cur
= entity
->content
;
1493 input
->length
= entity
->length
;
1494 input
->end
= &entity
->content
[input
->length
];
1495 input
->entity
= entity
;
1500 * xmlNewStringInputStream:
1501 * @ctxt: an XML parser context
1502 * @buffer: an memory buffer
1504 * Create a new input stream based on a memory buffer.
1505 * Returns the new input stream
1508 xmlNewStringInputStream(xmlParserCtxtPtr ctxt
, const xmlChar
*buffer
) {
1509 xmlParserInputPtr input
;
1510 xmlParserInputBufferPtr buf
;
1512 if (buffer
== NULL
) {
1513 xmlErrInternal(ctxt
, "xmlNewStringInputStream string = NULL\n",
1517 if (xmlParserDebugEntities
)
1518 xmlGenericError(xmlGenericErrorContext
,
1519 "new fixed input: %.30s\n", buffer
);
1520 buf
= xmlParserInputBufferCreateMem((const char *) buffer
,
1522 XML_CHAR_ENCODING_NONE
);
1524 xmlErrMemory(ctxt
, NULL
);
1527 input
= xmlNewInputStream(ctxt
);
1528 if (input
== NULL
) {
1529 xmlErrMemory(ctxt
, "couldn't allocate a new input stream\n");
1530 xmlFreeParserInputBuffer(buf
);
1534 xmlBufResetInput(input
->buf
->buffer
, input
);
1539 * xmlNewInputFromFile:
1540 * @ctxt: an XML parser context
1541 * @filename: the filename to use as entity
1543 * Create a new input stream based on a file or an URL.
1545 * Returns the new input stream or NULL in case of error
1548 xmlNewInputFromFile(xmlParserCtxtPtr ctxt
, const char *filename
) {
1549 xmlParserInputBufferPtr buf
;
1550 xmlParserInputPtr inputStream
;
1551 char *directory
= NULL
;
1552 xmlChar
*URI
= NULL
;
1554 if (xmlParserDebugEntities
)
1555 xmlGenericError(xmlGenericErrorContext
,
1556 "new input from file: %s\n", filename
);
1557 if (ctxt
== NULL
) return(NULL
);
1558 buf
= xmlParserInputBufferCreateFilename(filename
, XML_CHAR_ENCODING_NONE
);
1560 if (filename
== NULL
)
1561 __xmlLoaderErr(ctxt
,
1562 "failed to load external entity: NULL filename \n",
1565 __xmlLoaderErr(ctxt
, "failed to load external entity \"%s\"\n",
1566 (const char *) filename
);
1570 inputStream
= xmlNewInputStream(ctxt
);
1571 if (inputStream
== NULL
) {
1572 xmlFreeParserInputBuffer(buf
);
1576 inputStream
->buf
= buf
;
1577 inputStream
= xmlCheckHTTPInput(ctxt
, inputStream
);
1578 if (inputStream
== NULL
)
1581 if (inputStream
->filename
== NULL
)
1582 URI
= xmlStrdup((xmlChar
*) filename
);
1584 URI
= xmlStrdup((xmlChar
*) inputStream
->filename
);
1585 directory
= xmlParserGetDirectory((const char *) URI
);
1586 if (inputStream
->filename
!= NULL
) xmlFree((char *)inputStream
->filename
);
1587 inputStream
->filename
= (char *) xmlCanonicPath((const xmlChar
*) URI
);
1588 if (URI
!= NULL
) xmlFree((char *) URI
);
1589 inputStream
->directory
= directory
;
1591 xmlBufResetInput(inputStream
->buf
->buffer
, inputStream
);
1592 if ((ctxt
->directory
== NULL
) && (directory
!= NULL
))
1593 ctxt
->directory
= (char *) xmlStrdup((const xmlChar
*) directory
);
1594 return(inputStream
);
1597 /************************************************************************
1599 * Commodity functions to handle parser contexts *
1601 ************************************************************************/
1604 * xmlInitSAXParserCtxt:
1605 * @ctxt: XML parser context
1606 * @sax: SAX handlert
1607 * @userData: user data
1609 * Initialize a SAX parser context
1611 * Returns 0 in case of success and -1 in case of error
1615 xmlInitSAXParserCtxt(xmlParserCtxtPtr ctxt
, const xmlSAXHandler
*sax
,
1618 xmlParserInputPtr input
;
1621 xmlErrInternal(NULL
, "Got NULL parser context\n", NULL
);
1627 if (ctxt
->dict
== NULL
)
1628 ctxt
->dict
= xmlDictCreate();
1629 if (ctxt
->dict
== NULL
) {
1630 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1633 xmlDictSetLimit(ctxt
->dict
, XML_MAX_DICTIONARY_LIMIT
);
1635 if (ctxt
->sax
== NULL
)
1636 ctxt
->sax
= (xmlSAXHandler
*) xmlMalloc(sizeof(xmlSAXHandler
));
1637 if (ctxt
->sax
== NULL
) {
1638 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1642 memset(ctxt
->sax
, 0, sizeof(xmlSAXHandler
));
1643 xmlSAXVersion(ctxt
->sax
, 2);
1644 ctxt
->userData
= ctxt
;
1646 if (sax
->initialized
== XML_SAX2_MAGIC
) {
1647 memcpy(ctxt
->sax
, sax
, sizeof(xmlSAXHandler
));
1649 memset(ctxt
->sax
, 0, sizeof(xmlSAXHandler
));
1650 memcpy(ctxt
->sax
, sax
, sizeof(xmlSAXHandlerV1
));
1652 ctxt
->userData
= userData
? userData
: ctxt
;
1657 /* Allocate the Input stack */
1658 if (ctxt
->inputTab
== NULL
) {
1659 ctxt
->inputTab
= (xmlParserInputPtr
*)
1660 xmlMalloc(5 * sizeof(xmlParserInputPtr
));
1663 if (ctxt
->inputTab
== NULL
) {
1664 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1670 while ((input
= inputPop(ctxt
)) != NULL
) { /* Non consuming */
1671 xmlFreeInputStream(input
);
1676 ctxt
->version
= NULL
;
1677 ctxt
->encoding
= NULL
;
1678 ctxt
->standalone
= -1;
1679 ctxt
->hasExternalSubset
= 0;
1680 ctxt
->hasPErefs
= 0;
1683 ctxt
->instate
= XML_PARSER_START
;
1685 ctxt
->directory
= NULL
;
1687 /* Allocate the Node stack */
1688 if (ctxt
->nodeTab
== NULL
) {
1689 ctxt
->nodeTab
= (xmlNodePtr
*) xmlMalloc(10 * sizeof(xmlNodePtr
));
1692 if (ctxt
->nodeTab
== NULL
) {
1693 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1705 /* Allocate the Name stack */
1706 if (ctxt
->nameTab
== NULL
) {
1707 ctxt
->nameTab
= (const xmlChar
**) xmlMalloc(10 * sizeof(xmlChar
*));
1710 if (ctxt
->nameTab
== NULL
) {
1711 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1726 /* Allocate the space stack */
1727 if (ctxt
->spaceTab
== NULL
) {
1728 ctxt
->spaceTab
= (int *) xmlMalloc(10 * sizeof(int));
1729 ctxt
->spaceMax
= 10;
1731 if (ctxt
->spaceTab
== NULL
) {
1732 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1748 ctxt
->spaceMax
= 10;
1749 ctxt
->spaceTab
[0] = -1;
1750 ctxt
->space
= &ctxt
->spaceTab
[0];
1752 ctxt
->wellFormed
= 1;
1753 ctxt
->nsWellFormed
= 1;
1755 ctxt
->loadsubset
= xmlLoadExtDtdDefaultValue
;
1756 if (ctxt
->loadsubset
) {
1757 ctxt
->options
|= XML_PARSE_DTDLOAD
;
1759 ctxt
->validate
= xmlDoValidityCheckingDefaultValue
;
1760 ctxt
->pedantic
= xmlPedanticParserDefaultValue
;
1761 if (ctxt
->pedantic
) {
1762 ctxt
->options
|= XML_PARSE_PEDANTIC
;
1764 ctxt
->linenumbers
= xmlLineNumbersDefaultValue
;
1765 ctxt
->keepBlanks
= xmlKeepBlanksDefaultValue
;
1766 if (ctxt
->keepBlanks
== 0) {
1767 ctxt
->sax
->ignorableWhitespace
= xmlSAX2IgnorableWhitespace
;
1768 ctxt
->options
|= XML_PARSE_NOBLANKS
;
1771 ctxt
->vctxt
.flags
= XML_VCTXT_USE_PCTXT
;
1772 ctxt
->vctxt
.userData
= ctxt
;
1773 ctxt
->vctxt
.error
= xmlParserValidityError
;
1774 ctxt
->vctxt
.warning
= xmlParserValidityWarning
;
1775 if (ctxt
->validate
) {
1776 if (xmlGetWarningsDefaultValue
== 0)
1777 ctxt
->vctxt
.warning
= NULL
;
1779 ctxt
->vctxt
.warning
= xmlParserValidityWarning
;
1780 ctxt
->vctxt
.nodeMax
= 0;
1781 ctxt
->options
|= XML_PARSE_DTDVALID
;
1783 ctxt
->replaceEntities
= xmlSubstituteEntitiesDefaultValue
;
1784 if (ctxt
->replaceEntities
) {
1785 ctxt
->options
|= XML_PARSE_NOENT
;
1787 ctxt
->record_info
= 0;
1788 ctxt
->checkIndex
= 0;
1790 ctxt
->errNo
= XML_ERR_OK
;
1792 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1793 ctxt
->catalogs
= NULL
;
1794 ctxt
->sizeentities
= 0;
1795 ctxt
->sizeentcopy
= 0;
1797 xmlInitNodeInfoSeq(&ctxt
->node_seq
);
1802 * xmlInitParserCtxt:
1803 * @ctxt: an XML parser context
1805 * DEPRECATED: Internal function which will be made private in a future
1808 * Initialize a parser context
1810 * Returns 0 in case of success and -1 in case of error
1814 xmlInitParserCtxt(xmlParserCtxtPtr ctxt
)
1816 return(xmlInitSAXParserCtxt(ctxt
, NULL
, NULL
));
1820 * xmlFreeParserCtxt:
1821 * @ctxt: an XML parser context
1823 * Free all the memory used by a parser context. However the parsed
1824 * document in ctxt->myDoc is not freed.
1828 xmlFreeParserCtxt(xmlParserCtxtPtr ctxt
)
1830 xmlParserInputPtr input
;
1832 if (ctxt
== NULL
) return;
1834 while ((input
= inputPop(ctxt
)) != NULL
) { /* Non consuming */
1835 xmlFreeInputStream(input
);
1837 if (ctxt
->spaceTab
!= NULL
) xmlFree(ctxt
->spaceTab
);
1838 if (ctxt
->nameTab
!= NULL
) xmlFree((xmlChar
* *)ctxt
->nameTab
);
1839 if (ctxt
->nodeTab
!= NULL
) xmlFree(ctxt
->nodeTab
);
1840 if (ctxt
->nodeInfoTab
!= NULL
) xmlFree(ctxt
->nodeInfoTab
);
1841 if (ctxt
->inputTab
!= NULL
) xmlFree(ctxt
->inputTab
);
1842 if (ctxt
->version
!= NULL
) xmlFree((char *) ctxt
->version
);
1843 if (ctxt
->encoding
!= NULL
) xmlFree((char *) ctxt
->encoding
);
1844 if (ctxt
->extSubURI
!= NULL
) xmlFree((char *) ctxt
->extSubURI
);
1845 if (ctxt
->extSubSystem
!= NULL
) xmlFree((char *) ctxt
->extSubSystem
);
1846 #ifdef LIBXML_SAX1_ENABLED
1847 if ((ctxt
->sax
!= NULL
) &&
1848 (ctxt
->sax
!= (xmlSAXHandlerPtr
) &xmlDefaultSAXHandler
))
1850 if (ctxt
->sax
!= NULL
)
1851 #endif /* LIBXML_SAX1_ENABLED */
1853 if (ctxt
->directory
!= NULL
) xmlFree((char *) ctxt
->directory
);
1854 if (ctxt
->vctxt
.nodeTab
!= NULL
) xmlFree(ctxt
->vctxt
.nodeTab
);
1855 if (ctxt
->atts
!= NULL
) xmlFree((xmlChar
* *)ctxt
->atts
);
1856 if (ctxt
->dict
!= NULL
) xmlDictFree(ctxt
->dict
);
1857 if (ctxt
->nsTab
!= NULL
) xmlFree((char *) ctxt
->nsTab
);
1858 if (ctxt
->pushTab
!= NULL
) xmlFree(ctxt
->pushTab
);
1859 if (ctxt
->attallocs
!= NULL
) xmlFree(ctxt
->attallocs
);
1860 if (ctxt
->attsDefault
!= NULL
)
1861 xmlHashFree(ctxt
->attsDefault
, xmlHashDefaultDeallocator
);
1862 if (ctxt
->attsSpecial
!= NULL
)
1863 xmlHashFree(ctxt
->attsSpecial
, NULL
);
1864 if (ctxt
->freeElems
!= NULL
) {
1865 xmlNodePtr cur
, next
;
1867 cur
= ctxt
->freeElems
;
1868 while (cur
!= NULL
) {
1874 if (ctxt
->freeAttrs
!= NULL
) {
1875 xmlAttrPtr cur
, next
;
1877 cur
= ctxt
->freeAttrs
;
1878 while (cur
!= NULL
) {
1885 * cleanup the error strings
1887 if (ctxt
->lastError
.message
!= NULL
)
1888 xmlFree(ctxt
->lastError
.message
);
1889 if (ctxt
->lastError
.file
!= NULL
)
1890 xmlFree(ctxt
->lastError
.file
);
1891 if (ctxt
->lastError
.str1
!= NULL
)
1892 xmlFree(ctxt
->lastError
.str1
);
1893 if (ctxt
->lastError
.str2
!= NULL
)
1894 xmlFree(ctxt
->lastError
.str2
);
1895 if (ctxt
->lastError
.str3
!= NULL
)
1896 xmlFree(ctxt
->lastError
.str3
);
1898 #ifdef LIBXML_CATALOG_ENABLED
1899 if (ctxt
->catalogs
!= NULL
)
1900 xmlCatalogFreeLocal(ctxt
->catalogs
);
1908 * Allocate and initialize a new parser context.
1910 * Returns the xmlParserCtxtPtr or NULL
1914 xmlNewParserCtxt(void)
1916 return(xmlNewSAXParserCtxt(NULL
, NULL
));
1920 * xmlNewSAXParserCtxt:
1922 * @userData: user data
1924 * Allocate and initialize a new SAX parser context. If userData is NULL,
1925 * the parser context will be passed as user data.
1927 * Returns the xmlParserCtxtPtr or NULL if memory allocation failed.
1931 xmlNewSAXParserCtxt(const xmlSAXHandler
*sax
, void *userData
)
1933 xmlParserCtxtPtr ctxt
;
1935 ctxt
= (xmlParserCtxtPtr
) xmlMalloc(sizeof(xmlParserCtxt
));
1937 xmlErrMemory(NULL
, "cannot allocate parser context\n");
1940 memset(ctxt
, 0, sizeof(xmlParserCtxt
));
1941 if (xmlInitSAXParserCtxt(ctxt
, sax
, userData
) < 0) {
1942 xmlFreeParserCtxt(ctxt
);
1948 /************************************************************************
1950 * Handling of node information *
1952 ************************************************************************/
1955 * xmlClearParserCtxt:
1956 * @ctxt: an XML parser context
1958 * Clear (release owned resources) and reinitialize a parser context
1962 xmlClearParserCtxt(xmlParserCtxtPtr ctxt
)
1966 xmlClearNodeInfoSeq(&ctxt
->node_seq
);
1972 * xmlParserFindNodeInfo:
1973 * @ctx: an XML parser context
1974 * @node: an XML node within the tree
1976 * DEPRECATED: Don't use.
1978 * Find the parser node info struct for a given node
1980 * Returns an xmlParserNodeInfo block pointer or NULL
1982 const xmlParserNodeInfo
*
1983 xmlParserFindNodeInfo(const xmlParserCtxtPtr ctx
, const xmlNodePtr node
)
1987 if ((ctx
== NULL
) || (node
== NULL
))
1989 /* Find position where node should be at */
1990 pos
= xmlParserFindNodeInfoIndex(&ctx
->node_seq
, node
);
1991 if (pos
< ctx
->node_seq
.length
1992 && ctx
->node_seq
.buffer
[pos
].node
== node
)
1993 return &ctx
->node_seq
.buffer
[pos
];
2000 * xmlInitNodeInfoSeq:
2001 * @seq: a node info sequence pointer
2003 * DEPRECATED: Don't use.
2005 * -- Initialize (set to initial state) node info sequence
2008 xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq
)
2018 * xmlClearNodeInfoSeq:
2019 * @seq: a node info sequence pointer
2021 * DEPRECATED: Don't use.
2023 * -- Clear (release memory and reinitialize) node
2027 xmlClearNodeInfoSeq(xmlParserNodeInfoSeqPtr seq
)
2031 if (seq
->buffer
!= NULL
)
2032 xmlFree(seq
->buffer
);
2033 xmlInitNodeInfoSeq(seq
);
2037 * xmlParserFindNodeInfoIndex:
2038 * @seq: a node info sequence pointer
2039 * @node: an XML node pointer
2041 * DEPRECATED: Don't use.
2043 * xmlParserFindNodeInfoIndex : Find the index that the info record for
2044 * the given node is or should be at in a sorted sequence
2046 * Returns a long indicating the position of the record
2049 xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeqPtr seq
,
2050 const xmlNodePtr node
)
2052 unsigned long upper
, lower
, middle
;
2055 if ((seq
== NULL
) || (node
== NULL
))
2056 return ((unsigned long) -1);
2058 /* Do a binary search for the key */
2060 upper
= seq
->length
;
2062 while (lower
<= upper
&& !found
) {
2063 middle
= lower
+ (upper
- lower
) / 2;
2064 if (node
== seq
->buffer
[middle
- 1].node
)
2066 else if (node
< seq
->buffer
[middle
- 1].node
)
2072 /* Return position */
2073 if (middle
== 0 || seq
->buffer
[middle
- 1].node
< node
)
2081 * xmlParserAddNodeInfo:
2082 * @ctxt: an XML parser context
2083 * @info: a node info sequence pointer
2085 * DEPRECATED: Don't use.
2087 * Insert node info record into the sorted sequence
2090 xmlParserAddNodeInfo(xmlParserCtxtPtr ctxt
,
2091 const xmlParserNodeInfoPtr info
)
2095 if ((ctxt
== NULL
) || (info
== NULL
)) return;
2097 /* Find pos and check to see if node is already in the sequence */
2098 pos
= xmlParserFindNodeInfoIndex(&ctxt
->node_seq
, (xmlNodePtr
)
2101 if ((pos
< ctxt
->node_seq
.length
) &&
2102 (ctxt
->node_seq
.buffer
!= NULL
) &&
2103 (ctxt
->node_seq
.buffer
[pos
].node
== info
->node
)) {
2104 ctxt
->node_seq
.buffer
[pos
] = *info
;
2107 /* Otherwise, we need to add new node to buffer */
2109 if ((ctxt
->node_seq
.length
+ 1 > ctxt
->node_seq
.maximum
) ||
2110 (ctxt
->node_seq
.buffer
== NULL
)) {
2111 xmlParserNodeInfo
*tmp_buffer
;
2112 unsigned int byte_size
;
2114 if (ctxt
->node_seq
.maximum
== 0)
2115 ctxt
->node_seq
.maximum
= 2;
2116 byte_size
= (sizeof(*ctxt
->node_seq
.buffer
) *
2117 (2 * ctxt
->node_seq
.maximum
));
2119 if (ctxt
->node_seq
.buffer
== NULL
)
2120 tmp_buffer
= (xmlParserNodeInfo
*) xmlMalloc(byte_size
);
2123 (xmlParserNodeInfo
*) xmlRealloc(ctxt
->node_seq
.buffer
,
2126 if (tmp_buffer
== NULL
) {
2127 xmlErrMemory(ctxt
, "failed to allocate buffer\n");
2130 ctxt
->node_seq
.buffer
= tmp_buffer
;
2131 ctxt
->node_seq
.maximum
*= 2;
2134 /* If position is not at end, move elements out of the way */
2135 if (pos
!= ctxt
->node_seq
.length
) {
2138 for (i
= ctxt
->node_seq
.length
; i
> pos
; i
--)
2139 ctxt
->node_seq
.buffer
[i
] = ctxt
->node_seq
.buffer
[i
- 1];
2142 /* Copy element and increase length */
2143 ctxt
->node_seq
.buffer
[pos
] = *info
;
2144 ctxt
->node_seq
.length
++;
2148 /************************************************************************
2150 * Defaults settings *
2152 ************************************************************************/
2154 * xmlPedanticParserDefault:
2157 * DEPRECATED: Use the modern options API with XML_PARSE_PEDANTIC.
2159 * Set and return the previous value for enabling pedantic warnings.
2161 * Returns the last value for 0 for no substitution, 1 for substitution.
2165 xmlPedanticParserDefault(int val
) {
2166 int old
= xmlPedanticParserDefaultValue
;
2168 xmlPedanticParserDefaultValue
= val
;
2173 * xmlLineNumbersDefault:
2176 * DEPRECATED: The modern options API always enables line numbers.
2178 * Set and return the previous value for enabling line numbers in elements
2179 * contents. This may break on old application and is turned off by default.
2181 * Returns the last value for 0 for no substitution, 1 for substitution.
2185 xmlLineNumbersDefault(int val
) {
2186 int old
= xmlLineNumbersDefaultValue
;
2188 xmlLineNumbersDefaultValue
= val
;
2193 * xmlSubstituteEntitiesDefault:
2196 * DEPRECATED: Use the modern options API with XML_PARSE_NOENT.
2198 * Set and return the previous value for default entity support.
2199 * Initially the parser always keep entity references instead of substituting
2200 * entity values in the output. This function has to be used to change the
2201 * default parser behavior
2202 * SAX::substituteEntities() has to be used for changing that on a file by
2205 * Returns the last value for 0 for no substitution, 1 for substitution.
2209 xmlSubstituteEntitiesDefault(int val
) {
2210 int old
= xmlSubstituteEntitiesDefaultValue
;
2212 xmlSubstituteEntitiesDefaultValue
= val
;
2217 * xmlKeepBlanksDefault:
2220 * DEPRECATED: Use the modern options API with XML_PARSE_NOBLANKS.
2222 * Set and return the previous value for default blanks text nodes support.
2223 * The 1.x version of the parser used an heuristic to try to detect
2224 * ignorable white spaces. As a result the SAX callback was generating
2225 * xmlSAX2IgnorableWhitespace() callbacks instead of characters() one, and when
2226 * using the DOM output text nodes containing those blanks were not generated.
2227 * The 2.x and later version will switch to the XML standard way and
2228 * ignorableWhitespace() are only generated when running the parser in
2229 * validating mode and when the current element doesn't allow CDATA or
2231 * This function is provided as a way to force the standard behavior
2232 * on 1.X libs and to switch back to the old mode for compatibility when
2233 * running 1.X client code on 2.X . Upgrade of 1.X code should be done
2234 * by using xmlIsBlankNode() commodity function to detect the "empty"
2236 * This value also affect autogeneration of indentation when saving code
2237 * if blanks sections are kept, indentation is not generated.
2239 * Returns the last value for 0 for no substitution, 1 for substitution.
2243 xmlKeepBlanksDefault(int val
) {
2244 int old
= xmlKeepBlanksDefaultValue
;
2246 xmlKeepBlanksDefaultValue
= val
;
2247 if (!val
) xmlIndentTreeOutput
= 1;