2 * parserInternals.c : Internal routines (and obsolete ones) needed for the
3 * XML and HTML parsers.
5 * See Copyright for the status of this software.
14 #define XML_DIR_SEP '\\'
16 #define XML_DIR_SEP '/'
23 #include <libxml/xmlmemory.h>
24 #include <libxml/tree.h>
25 #include <libxml/parser.h>
26 #include <libxml/parserInternals.h>
27 #include <libxml/valid.h>
28 #include <libxml/entities.h>
29 #include <libxml/xmlerror.h>
30 #include <libxml/encoding.h>
31 #include <libxml/valid.h>
32 #include <libxml/xmlIO.h>
33 #include <libxml/uri.h>
34 #include <libxml/dict.h>
35 #include <libxml/SAX.h>
36 #ifdef LIBXML_CATALOG_ENABLED
37 #include <libxml/catalog.h>
39 #include <libxml/globals.h>
40 #include <libxml/chvalid.h>
42 #define CUR(ctxt) ctxt->input->cur
43 #define END(ctxt) ctxt->input->end
44 #define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))
46 #include "private/buf.h"
47 #include "private/enc.h"
48 #include "private/error.h"
49 #include "private/io.h"
50 #include "private/parser.h"
53 * Various global defaults for parsing
58 * @version: the include version number
60 * Check the compiled lib version against the include one.
61 * A mismatch is only reported as a fatal or warning message; this function does not itself stop the application
64 xmlCheckVersion(int version
) {
65 int myversion
= LIBXML_VERSION
;
69 if ((myversion
/ 10000) != (version
/ 10000)) {
70 xmlGenericError(xmlGenericErrorContext
,
71 "Fatal: program compiled against libxml %d using libxml %d\n",
72 (version
/ 10000), (myversion
/ 10000));
74 "Fatal: program compiled against libxml %d using libxml %d\n",
75 (version
/ 10000), (myversion
/ 10000));
77 if ((myversion
/ 100) < (version
/ 100)) {
78 xmlGenericError(xmlGenericErrorContext
,
79 "Warning: program compiled against libxml %d using older %d\n",
80 (version
/ 100), (myversion
/ 100));
85 /************************************************************************
87 * Some factorized error routines *
89 ************************************************************************/
94 * @ctxt: an XML parser context
95 * @extra: extra information
97 * Handle an out of memory condition
100 xmlErrMemory(xmlParserCtxtPtr ctxt
, const char *extra
)
102 if ((ctxt
!= NULL
) && (ctxt
->disableSAX
!= 0) &&
103 (ctxt
->instate
== XML_PARSER_EOF
))
106 ctxt
->errNo
= XML_ERR_NO_MEMORY
;
107 ctxt
->instate
= XML_PARSER_EOF
;
108 ctxt
->disableSAX
= 1;
111 __xmlRaiseError(NULL
, NULL
, NULL
, ctxt
, NULL
, XML_FROM_PARSER
,
112 XML_ERR_NO_MEMORY
, XML_ERR_FATAL
, NULL
, 0, extra
,
114 "Memory allocation failed : %s\n", extra
);
116 __xmlRaiseError(NULL
, NULL
, NULL
, ctxt
, NULL
, XML_FROM_PARSER
,
117 XML_ERR_NO_MEMORY
, XML_ERR_FATAL
, NULL
, 0, NULL
,
118 NULL
, NULL
, 0, 0, "Memory allocation failed\n");
123 * @ctxt: an XML parser context
124 * @xmlerr: the error number
125 * @msg: the error message
126 * @str1: a string info
127 * @str2: a string info
129 * Handle an encoding error
132 __xmlErrEncoding(xmlParserCtxtPtr ctxt
, xmlParserErrors xmlerr
,
133 const char *msg
, const xmlChar
* str1
, const xmlChar
* str2
)
135 if ((ctxt
!= NULL
) && (ctxt
->disableSAX
!= 0) &&
136 (ctxt
->instate
== XML_PARSER_EOF
))
139 ctxt
->errNo
= xmlerr
;
140 __xmlRaiseError(NULL
, NULL
, NULL
,
141 ctxt
, NULL
, XML_FROM_PARSER
, xmlerr
, XML_ERR_FATAL
,
142 NULL
, 0, (const char *) str1
, (const char *) str2
,
143 NULL
, 0, 0, msg
, str1
, str2
);
145 ctxt
->wellFormed
= 0;
146 if (ctxt
->recovery
== 0)
147 ctxt
->disableSAX
= 1;
153 * @ctxt: an XML parser context
154 * @msg: the error message
155 * @str: error information
157 * Handle an internal error
159 static void LIBXML_ATTR_FORMAT(2,0)
160 xmlErrInternal(xmlParserCtxtPtr ctxt
, const char *msg
, const xmlChar
* str
)
162 if ((ctxt
!= NULL
) && (ctxt
->disableSAX
!= 0) &&
163 (ctxt
->instate
== XML_PARSER_EOF
))
166 ctxt
->errNo
= XML_ERR_INTERNAL_ERROR
;
167 __xmlRaiseError(NULL
, NULL
, NULL
,
168 ctxt
, NULL
, XML_FROM_PARSER
, XML_ERR_INTERNAL_ERROR
,
169 XML_ERR_FATAL
, NULL
, 0, (const char *) str
, NULL
, NULL
,
172 ctxt
->wellFormed
= 0;
173 if (ctxt
->recovery
== 0)
174 ctxt
->disableSAX
= 1;
180 * @ctxt: an XML parser context
181 * @error: the error number
182 * @msg: the error message
183 * @val: an integer value
187 static void LIBXML_ATTR_FORMAT(3,0)
188 xmlErrEncodingInt(xmlParserCtxtPtr ctxt
, xmlParserErrors error
,
189 const char *msg
, int val
)
191 if ((ctxt
!= NULL
) && (ctxt
->disableSAX
!= 0) &&
192 (ctxt
->instate
== XML_PARSER_EOF
))
196 __xmlRaiseError(NULL
, NULL
, NULL
,
197 ctxt
, NULL
, XML_FROM_PARSER
, error
, XML_ERR_FATAL
,
198 NULL
, 0, NULL
, NULL
, NULL
, val
, 0, msg
, val
);
200 ctxt
->wellFormed
= 0;
201 if (ctxt
->recovery
== 0)
202 ctxt
->disableSAX
= 1;
208 * @c: a Unicode character (int)
210 * Check whether the character is allowed by the production
211 * [84] Letter ::= BaseChar | Ideographic
213 * Returns 0 if not, non-zero otherwise
217 return(IS_BASECHAR(c
) || IS_IDEOGRAPHIC(c
));
220 /************************************************************************
222 * Input handling functions for progressive parsing *
224 ************************************************************************/
226 /* #define DEBUG_INPUT */
227 /* #define DEBUG_STACK */
228 /* #define DEBUG_PUSH */
231 /* we need to keep enough input to show errors in context */
235 #define CHECK_BUFFER(in) check_buffer(in)
238 void check_buffer(xmlParserInputPtr in
) {
239 if (in
->base
!= xmlBufContent(in
->buf
->buffer
)) {
240 xmlGenericError(xmlGenericErrorContext
,
241 "xmlParserInput: base mismatch problem\n");
243 if (in
->cur
< in
->base
) {
244 xmlGenericError(xmlGenericErrorContext
,
245 "xmlParserInput: cur < base problem\n");
247 if (in
->cur
> in
->base
+ xmlBufUse(in
->buf
->buffer
)) {
248 xmlGenericError(xmlGenericErrorContext
,
249 "xmlParserInput: cur > base + use problem\n");
251 xmlGenericError(xmlGenericErrorContext
,"buffer %p : content %x, cur %d, use %d\n",
252 (void *) in
, (int) xmlBufContent(in
->buf
->buffer
),
253 in
->cur
- in
->base
, xmlBufUse(in
->buf
->buffer
));
257 #define CHECK_BUFFER(in)
263 * @ctxt: an XML parser context
265 * Blocks further parser processing without overriding the error
269 xmlHaltParser(xmlParserCtxtPtr ctxt
) {
272 ctxt
->instate
= XML_PARSER_EOF
;
273 ctxt
->disableSAX
= 1;
274 while (ctxt
->inputNr
> 1)
275 xmlFreeInputStream(inputPop(ctxt
));
276 if (ctxt
->input
!= NULL
) {
278 * in case there was a specific allocation deallocate before
281 if (ctxt
->input
->free
!= NULL
) {
282 ctxt
->input
->free((xmlChar
*) ctxt
->input
->base
);
283 ctxt
->input
->free
= NULL
;
285 if (ctxt
->input
->buf
!= NULL
) {
286 xmlFreeParserInputBuffer(ctxt
->input
->buf
);
287 ctxt
->input
->buf
= NULL
;
289 ctxt
->input
->cur
= BAD_CAST
"";
290 ctxt
->input
->length
= 0;
291 ctxt
->input
->base
= ctxt
->input
->cur
;
292 ctxt
->input
->end
= ctxt
->input
->cur
;
297 * xmlParserInputRead:
298 * @in: an XML parser input
299 * @len: an indicative size for the lookahead
301 * DEPRECATED: This function was internal and is deprecated.
303 * Returns -1 as this is an error to use it.
306 xmlParserInputRead(xmlParserInputPtr in ATTRIBUTE_UNUSED
, int len ATTRIBUTE_UNUSED
) {
312 * @ctxt: an XML parser context
315 xmlParserGrow(xmlParserCtxtPtr ctxt
) {
316 xmlParserInputPtr in
= ctxt
->input
;
317 xmlParserInputBufferPtr buf
= in
->buf
;
318 ptrdiff_t curEnd
= in
->end
- in
->cur
;
319 ptrdiff_t curBase
= in
->cur
- in
->base
;
324 /* Don't grow push parser buffer. */
325 if (ctxt
->progressive
)
327 /* Don't grow memory buffers. */
328 if ((buf
->encoder
== NULL
) && (buf
->readcallback
== NULL
))
331 if (((curEnd
> XML_MAX_LOOKUP_LIMIT
) ||
332 (curBase
> XML_MAX_LOOKUP_LIMIT
)) &&
333 ((ctxt
->options
& XML_PARSE_HUGE
) == 0)) {
334 xmlErrInternal(ctxt
, "Huge input lookup", NULL
);
339 if (curEnd
>= INPUT_CHUNK
)
342 ret
= xmlParserInputBufferGrow(buf
, INPUT_CHUNK
);
343 xmlBufSetInputBaseCur(buf
->buffer
, in
, 0, curBase
);
345 /* TODO: Get error code from xmlParserInputBufferGrow */
347 xmlErrInternal(ctxt
, "Growing input buffer", NULL
);
355 * xmlParserInputGrow:
356 * @in: an XML parser input
357 * @len: an indicative size for the lookahead
359 * DEPRECATED: Don't use.
361 * This function increases the input for the parser. It tries to
362 * preserve pointers to the input buffer, and keeps already read data
364 * Returns the number of chars read, or -1 in case of error; 0 indicates the
368 xmlParserInputGrow(xmlParserInputPtr in
, int len
) {
372 if ((in
== NULL
) || (len
< 0)) return(-1);
374 xmlGenericError(xmlGenericErrorContext
, "Grow\n");
376 if (in
->buf
== NULL
) return(-1);
377 if (in
->base
== NULL
) return(-1);
378 if (in
->cur
== NULL
) return(-1);
379 if (in
->buf
->buffer
== NULL
) return(-1);
381 /* Don't grow memory buffers. */
382 if ((in
->buf
->encoder
== NULL
) && (in
->buf
->readcallback
== NULL
))
387 indx
= in
->cur
- in
->base
;
388 if (xmlBufUse(in
->buf
->buffer
) > (unsigned int) indx
+ INPUT_CHUNK
) {
394 ret
= xmlParserInputBufferGrow(in
->buf
, len
);
396 in
->base
= xmlBufContent(in
->buf
->buffer
);
397 if (in
->base
== NULL
) {
398 in
->base
= BAD_CAST
"";
403 in
->cur
= in
->base
+ indx
;
404 in
->end
= xmlBufEnd(in
->buf
->buffer
);
413 * @ctxt: an XML parser context
416 xmlParserShrink(xmlParserCtxtPtr ctxt
) {
417 xmlParserInputPtr in
= ctxt
->input
;
418 xmlParserInputBufferPtr buf
= in
->buf
;
421 /* Don't shrink pull parser memory buffers. */
423 ((ctxt
->progressive
== 0) &&
424 (buf
->encoder
== NULL
) && (buf
->readcallback
== NULL
)))
427 used
= in
->cur
- in
->base
;
429 * Do not shrink large buffers of which only a tiny fraction
432 if (used
> INPUT_CHUNK
) {
433 size_t res
= xmlBufShrink(buf
->buffer
, used
- LINE_LEN
);
437 if ((res
> ULONG_MAX
) ||
438 (in
->consumed
> ULONG_MAX
- (unsigned long)res
))
439 in
->consumed
= ULONG_MAX
;
445 xmlBufSetInputBaseCur(buf
->buffer
, in
, 0, used
);
449 * xmlParserInputShrink:
450 * @in: an XML parser input
452 * DEPRECATED: Don't use.
454 * This function removes used input for the parser.
457 xmlParserInputShrink(xmlParserInputPtr in
) {
462 xmlGenericError(xmlGenericErrorContext
, "Shrink\n");
464 if (in
== NULL
) return;
465 if (in
->buf
== NULL
) return;
466 if (in
->base
== NULL
) return;
467 if (in
->cur
== NULL
) return;
468 if (in
->buf
->buffer
== NULL
) return;
472 used
= in
->cur
- in
->base
;
474 * Do not shrink large buffers of which only a tiny fraction
477 if (used
> INPUT_CHUNK
) {
478 ret
= xmlBufShrink(in
->buf
->buffer
, used
- LINE_LEN
);
481 if ((ret
> ULONG_MAX
) ||
482 (in
->consumed
> ULONG_MAX
- (unsigned long)ret
))
483 in
->consumed
= ULONG_MAX
;
489 if (xmlBufUse(in
->buf
->buffer
) <= INPUT_CHUNK
) {
490 xmlParserInputBufferRead(in
->buf
, 2 * INPUT_CHUNK
);
493 in
->base
= xmlBufContent(in
->buf
->buffer
);
494 if (in
->base
== NULL
) {
495 /* TODO: raise error */
496 in
->base
= BAD_CAST
"";
501 in
->cur
= in
->base
+ used
;
502 in
->end
= xmlBufEnd(in
->buf
->buffer
);
507 /************************************************************************
509 * UTF8 character input and related functions *
511 ************************************************************************/
515 * @ctxt: the XML parser context
517 * DEPRECATED: Internal function, do not use.
519 * Skip to the next input char.
523 xmlNextChar(xmlParserCtxtPtr ctxt
)
525 if ((ctxt
== NULL
) || (ctxt
->instate
== XML_PARSER_EOF
) ||
526 (ctxt
->input
== NULL
))
529 if (!(VALID_CTXT(ctxt
))) {
530 xmlErrInternal(ctxt
, "Parser input data memory error\n", NULL
);
531 ctxt
->errNo
= XML_ERR_INTERNAL_ERROR
;
536 if (ctxt
->input
->end
- ctxt
->input
->cur
< INPUT_CHUNK
) {
537 if (xmlParserGrow(ctxt
) < 0)
539 if (ctxt
->input
->cur
>= ctxt
->input
->end
)
543 if (ctxt
->charset
== XML_CHAR_ENCODING_UTF8
) {
544 const unsigned char *cur
;
548 * 2.11 End-of-Line Handling
549 * the literal two-character sequence "#xD#xA" or a standalone
550 * literal #xD, an XML processor must pass to the application
551 * the single character #xA.
553 if (*(ctxt
->input
->cur
) == '\n') {
554 ctxt
->input
->line
++; ctxt
->input
->col
= 1;
559 * We are supposed to handle UTF8, check it's valid
560 * From rfc2044: encoding of the Unicode values on UTF-8:
562 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
563 * 0000 0000-0000 007F 0xxxxxxx
564 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
565 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
567 * Check for the 0x110000 limit too
569 cur
= ctxt
->input
->cur
;
578 avail
= ctxt
->input
->end
- ctxt
->input
->cur
;
580 if ((avail
< 2) || (cur
[1] & 0xc0) != 0x80)
582 if ((c
& 0xe0) == 0xe0) {
585 if ((avail
< 3) || (cur
[2] & 0xc0) != 0x80)
587 if ((c
& 0xf0) == 0xf0) {
588 if (((c
& 0xf8) != 0xf0) ||
589 (avail
< 4) || ((cur
[3] & 0xc0) != 0x80))
592 ctxt
->input
->cur
+= 4;
593 val
= (cur
[0] & 0x7) << 18;
594 val
|= (cur
[1] & 0x3f) << 12;
595 val
|= (cur
[2] & 0x3f) << 6;
596 val
|= cur
[3] & 0x3f;
599 ctxt
->input
->cur
+= 3;
600 val
= (cur
[0] & 0xf) << 12;
601 val
|= (cur
[1] & 0x3f) << 6;
602 val
|= cur
[2] & 0x3f;
604 if (((val
> 0xd7ff) && (val
< 0xe000)) ||
605 ((val
> 0xfffd) && (val
< 0x10000)) ||
607 xmlErrEncodingInt(ctxt
, XML_ERR_INVALID_CHAR
,
608 "Char 0x%X out of allowed range\n",
613 ctxt
->input
->cur
+= 2;
619 * Assume it's a fixed length encoding (1) with
620 * a compatible encoding for the ASCII set, since
621 * XML constructs only use < 128 chars
624 if (*(ctxt
->input
->cur
) == '\n') {
625 ctxt
->input
->line
++; ctxt
->input
->col
= 1;
633 * If we detect a UTF-8 error, that probably means that the
634 * input encoding didn't get properly advertised in the
635 * declaration header. Report the error and switch the encoding
636 * to ISO-Latin-1 (if you don't like this policy, just declare the
639 if ((ctxt
== NULL
) || (ctxt
->input
== NULL
) ||
640 (ctxt
->input
->end
- ctxt
->input
->cur
< 4)) {
641 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
642 "Input is not proper UTF-8, indicate encoding !\n",
647 snprintf(buffer
, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
648 ctxt
->input
->cur
[0], ctxt
->input
->cur
[1],
649 ctxt
->input
->cur
[2], ctxt
->input
->cur
[3]);
650 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
651 "Input is not proper UTF-8, indicate encoding !\n%s",
652 BAD_CAST buffer
, NULL
);
654 ctxt
->charset
= XML_CHAR_ENCODING_8859_1
;
661 * @ctxt: the XML parser context
662 * @len: pointer to the length of the char read
664 * DEPRECATED: Internal function, do not use.
666 * The current char value, if using UTF-8 this may actually span multiple
667 * bytes in the input buffer. Implement the end of line normalization:
668 * 2.11 End-of-Line Handling
669 * Wherever an external parsed entity or the literal entity value
670 * of an internal parsed entity contains either the literal two-character
671 * sequence "#xD#xA" or a standalone literal #xD, an XML processor
672 * must pass to the application the single character #xA.
673 * This behavior can conveniently be produced by normalizing all
674 * line breaks to #xA on input, before parsing.)
676 * Returns the current char value and its length
680 xmlCurrentChar(xmlParserCtxtPtr ctxt
, int *len
) {
681 if ((ctxt
== NULL
) || (len
== NULL
) || (ctxt
->input
== NULL
)) return(0);
682 if (ctxt
->instate
== XML_PARSER_EOF
)
685 if ((ctxt
->input
->end
- ctxt
->input
->cur
< INPUT_CHUNK
) &&
686 (xmlParserGrow(ctxt
) < 0))
689 if ((*ctxt
->input
->cur
>= 0x20) && (*ctxt
->input
->cur
<= 0x7F)) {
691 return(*ctxt
->input
->cur
);
693 if (ctxt
->charset
== XML_CHAR_ENCODING_UTF8
) {
695 * We are supposed to handle UTF8, check it's valid
696 * From rfc2044: encoding of the Unicode values on UTF-8:
698 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
699 * 0000 0000-0000 007F 0xxxxxxx
700 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
701 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
703 * Check for the 0x110000 limit too
705 const unsigned char *cur
= ctxt
->input
->cur
;
713 if (((c
& 0x40) == 0) || (c
== 0xC0))
716 avail
= ctxt
->input
->end
- ctxt
->input
->cur
;
719 goto incomplete_sequence
;
720 if ((cur
[1] & 0xc0) != 0x80)
722 if ((c
& 0xe0) == 0xe0) {
724 goto incomplete_sequence
;
725 if ((cur
[2] & 0xc0) != 0x80)
727 if ((c
& 0xf0) == 0xf0) {
729 goto incomplete_sequence
;
730 if (((c
& 0xf8) != 0xf0) ||
731 ((cur
[3] & 0xc0) != 0x80))
735 val
= (cur
[0] & 0x7) << 18;
736 val
|= (cur
[1] & 0x3f) << 12;
737 val
|= (cur
[2] & 0x3f) << 6;
738 val
|= cur
[3] & 0x3f;
744 val
= (cur
[0] & 0xf) << 12;
745 val
|= (cur
[1] & 0x3f) << 6;
746 val
|= cur
[2] & 0x3f;
753 val
= (cur
[0] & 0x1f) << 6;
754 val
|= cur
[1] & 0x3f;
759 xmlErrEncodingInt(ctxt
, XML_ERR_INVALID_CHAR
,
760 "Char 0x%X out of allowed range\n", val
);
766 if ((*ctxt
->input
->cur
== 0) &&
767 (ctxt
->input
->end
> ctxt
->input
->cur
)) {
768 xmlErrEncodingInt(ctxt
, XML_ERR_INVALID_CHAR
,
769 "Char 0x0 out of allowed range\n", 0);
771 if (*ctxt
->input
->cur
== 0xD) {
772 if (ctxt
->input
->cur
[1] == 0xA) {
777 return(*ctxt
->input
->cur
);
781 * Assume it's a fixed length encoding (1) with
782 * a compatible encoding for the ASCII set, since
783 * XML constructs only use < 128 chars
786 if (*ctxt
->input
->cur
== 0xD) {
787 if (ctxt
->input
->cur
[1] == 0xA) {
792 return(*ctxt
->input
->cur
);
796 * If we detect a UTF-8 error, that probably means that the
797 * input encoding didn't get properly advertised in the
798 * declaration header. Report the error and switch the encoding
799 * to ISO-Latin-1 (if you don't like this policy, just declare the
802 if (ctxt
->input
->end
- ctxt
->input
->cur
< 4) {
803 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
804 "Input is not proper UTF-8, indicate encoding !\n",
809 snprintf(&buffer
[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
810 ctxt
->input
->cur
[0], ctxt
->input
->cur
[1],
811 ctxt
->input
->cur
[2], ctxt
->input
->cur
[3]);
812 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
813 "Input is not proper UTF-8, indicate encoding !\n%s",
814 BAD_CAST buffer
, NULL
);
816 ctxt
->charset
= XML_CHAR_ENCODING_8859_1
;
818 return(*ctxt
->input
->cur
);
822 * An encoding problem may arise from a truncated input buffer
823 * splitting a character in the middle. In that case do not raise
824 * an error but return 0. This should only happen when push parsing
832 * xmlStringCurrentChar:
833 * @ctxt: the XML parser context
834 * @cur: pointer to the beginning of the char
835 * @len: pointer to the length of the char read
837 * DEPRECATED: Internal function, do not use.
839 * The current char value, if using UTF-8 this may actually span multiple
840 * bytes in the input buffer.
842 * Returns the current char value and its length
846 xmlStringCurrentChar(xmlParserCtxtPtr ctxt
, const xmlChar
* cur
, int *len
)
848 if ((len
== NULL
) || (cur
== NULL
)) return(0);
849 if ((ctxt
== NULL
) || (ctxt
->charset
== XML_CHAR_ENCODING_UTF8
)) {
851 * We are supposed to handle UTF8, check it's valid
852 * From rfc2044: encoding of the Unicode values on UTF-8:
854 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
855 * 0000 0000-0000 007F 0xxxxxxx
856 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
857 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
859 * Check for the 0x110000 limit too
866 if ((cur
[1] & 0xc0) != 0x80)
868 if ((c
& 0xe0) == 0xe0) {
870 if ((cur
[2] & 0xc0) != 0x80)
872 if ((c
& 0xf0) == 0xf0) {
873 if (((c
& 0xf8) != 0xf0) || ((cur
[3] & 0xc0) != 0x80))
877 val
= (cur
[0] & 0x7) << 18;
878 val
|= (cur
[1] & 0x3f) << 12;
879 val
|= (cur
[2] & 0x3f) << 6;
880 val
|= cur
[3] & 0x3f;
884 val
= (cur
[0] & 0xf) << 12;
885 val
|= (cur
[1] & 0x3f) << 6;
886 val
|= cur
[2] & 0x3f;
891 val
= (cur
[0] & 0x1f) << 6;
892 val
|= cur
[1] & 0x3f;
895 xmlErrEncodingInt(ctxt
, XML_ERR_INVALID_CHAR
,
896 "Char 0x%X out of allowed range\n", val
);
906 * Assume it's a fixed length encoding (1) with
907 * a compatible encoding for the ASCII set, since
908 * XML constructs only use < 128 chars
915 * An encoding problem may arise from a truncated input buffer
916 * splitting a character in the middle. In that case do not raise
917 * an error but return 0 to indicate an end of stream problem
919 if ((ctxt
== NULL
) || (ctxt
->input
== NULL
) ||
920 (ctxt
->input
->end
- ctxt
->input
->cur
< 4)) {
925 * If we detect a UTF-8 error, that probably means that the
926 * input encoding didn't get properly advertised in the
927 * declaration header. Report the error and switch the encoding
928 * to ISO-Latin-1 (if you don't like this policy, just declare the
934 snprintf(buffer
, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
935 ctxt
->input
->cur
[0], ctxt
->input
->cur
[1],
936 ctxt
->input
->cur
[2], ctxt
->input
->cur
[3]);
937 __xmlErrEncoding(ctxt
, XML_ERR_INVALID_CHAR
,
938 "Input is not proper UTF-8, indicate encoding !\n%s",
939 BAD_CAST buffer
, NULL
);
946 * xmlCopyCharMultiByte:
947 * @out: pointer to an array of xmlChar
948 * @val: the char value
950 * Append the char value to the array
952 * Returns the number of xmlChar written
955 xmlCopyCharMultiByte(xmlChar
*out
, int val
) {
956 if ((out
== NULL
) || (val
< 0)) return(0);
958 * We are supposed to handle UTF8, check it's valid
959 * From rfc2044: encoding of the Unicode values on UTF-8:
961 * UCS-4 range (hex.) UTF-8 octet sequence (binary)
962 * 0000 0000-0000 007F 0xxxxxxx
963 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
964 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
967 xmlChar
*savedout
= out
;
969 if (val
< 0x800) { *out
++= (val
>> 6) | 0xC0; bits
= 0; }
970 else if (val
< 0x10000) { *out
++= (val
>> 12) | 0xE0; bits
= 6;}
971 else if (val
< 0x110000) { *out
++= (val
>> 18) | 0xF0; bits
= 12; }
973 xmlErrEncodingInt(NULL
, XML_ERR_INVALID_CHAR
,
974 "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n",
978 for ( ; bits
>= 0; bits
-= 6)
979 *out
++= ((val
>> bits
) & 0x3F) | 0x80 ;
980 return (out
- savedout
);
988 * @len: ignored, kept for API compatibility
989 * @out: pointer to an array of xmlChar
990 * @val: the char value
992 * Append the char value to the array
994 * Returns the number of xmlChar written
998 xmlCopyChar(int len ATTRIBUTE_UNUSED
, xmlChar
*out
, int val
) {
999 if ((out
== NULL
) || (val
< 0)) return(0);
1000 /* the len parameter is ignored */
1002 return(xmlCopyCharMultiByte (out
, val
));
1008 /************************************************************************
1010 * Commodity functions to switch encodings *
1012 ************************************************************************/
1014 static xmlCharEncodingHandlerPtr
1015 xmlDetectEBCDIC(xmlParserInputPtr input
) {
1017 xmlCharEncodingHandlerPtr handler
;
1018 int inlen
, outlen
, res
, i
;
1021 * To detect the EBCDIC code page, we convert the first 200 bytes
1022 * to EBCDIC-US and try to find the encoding declaration.
1024 handler
= xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC
);
1025 if (handler
== NULL
)
1027 outlen
= sizeof(out
) - 1;
1028 inlen
= input
->end
- input
->cur
;
1029 res
= xmlEncInputChunk(handler
, out
, &outlen
, input
->cur
, &inlen
, 0);
1034 for (i
= 0; i
< outlen
; i
++) {
1037 if ((out
[i
] == 'e') &&
1038 (xmlStrncmp(out
+ i
, BAD_CAST
"encoding", 8) == 0)) {
1039 int start
, cur
, quote
;
1042 while (IS_BLANK_CH(out
[i
]))
1044 if (out
[i
++] != '=')
1046 while (IS_BLANK_CH(out
[i
]))
1049 if ((quote
!= '\'') && (quote
!= '"'))
1053 while (((cur
>= 'a') && (cur
<= 'z')) ||
1054 ((cur
>= 'A') && (cur
<= 'Z')) ||
1055 ((cur
>= '0') && (cur
<= '9')) ||
1056 (cur
== '.') || (cur
== '_') ||
1062 xmlCharEncCloseFunc(handler
);
1063 handler
= xmlFindCharEncodingHandler((char *) out
+ start
);
1072 * xmlSwitchEncoding:
1073 * @ctxt: the parser context
1074 * @enc: the encoding value (number)
1076 * change the input functions when discovering the character encoding
1077 * of a given entity.
1079 * Returns 0 in case of success, -1 otherwise
1082 xmlSwitchEncoding(xmlParserCtxtPtr ctxt
, xmlCharEncoding enc
)
1084 xmlCharEncodingHandlerPtr handler
;
1087 if (ctxt
== NULL
) return(-1);
1089 case XML_CHAR_ENCODING_ERROR
:
1090 __xmlErrEncoding(ctxt
, XML_ERR_UNKNOWN_ENCODING
,
1091 "encoding unknown\n", NULL
, NULL
);
1093 case XML_CHAR_ENCODING_NONE
:
1094 /* let's assume it's UTF-8 without the XML decl */
1095 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1097 case XML_CHAR_ENCODING_UTF8
:
1098 /* default encoding, no conversion should be needed */
1099 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1102 * Errata on XML-1.0 June 20 2001
1103 * Specific handling of the Byte Order Mark for
1106 if ((ctxt
->input
!= NULL
) &&
1107 (ctxt
->input
->cur
[0] == 0xEF) &&
1108 (ctxt
->input
->cur
[1] == 0xBB) &&
1109 (ctxt
->input
->cur
[2] == 0xBF)) {
1110 ctxt
->input
->cur
+= 3;
1113 case XML_CHAR_ENCODING_EBCDIC
:
1114 handler
= xmlDetectEBCDIC(ctxt
->input
);
1117 handler
= xmlGetCharEncodingHandler(enc
);
1120 if (handler
== NULL
) {
1125 case XML_CHAR_ENCODING_ASCII
:
1126 /* default encoding, no conversion should be needed */
1127 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1129 case XML_CHAR_ENCODING_8859_1
:
1130 if ((ctxt
->inputNr
== 1) &&
1131 (ctxt
->encoding
== NULL
) &&
1132 (ctxt
->input
!= NULL
) &&
1133 (ctxt
->input
->encoding
!= NULL
)) {
1134 ctxt
->encoding
= xmlStrdup(ctxt
->input
->encoding
);
1136 ctxt
->charset
= enc
;
1139 __xmlErrEncoding(ctxt
, XML_ERR_UNSUPPORTED_ENCODING
,
1140 "encoding not supported: %s\n",
1141 BAD_CAST
xmlGetCharEncodingName(enc
), NULL
);
1143 * TODO: We could recover from errors in external entities
1144 * if we didn't stop the parser. But most callers of this
1145 * function don't check the return value.
1147 xmlStopParser(ctxt
);
1151 ret
= xmlSwitchInputEncoding(ctxt
, ctxt
->input
, handler
);
1152 if ((ret
< 0) || (ctxt
->errNo
== XML_I18N_CONV_FAILED
)) {
1154 * on encoding conversion errors, stop the parser
1156 xmlStopParser(ctxt
);
1157 ctxt
->errNo
= XML_I18N_CONV_FAILED
;
1163 * xmlSwitchInputEncoding:
1164 * @ctxt: the parser context
1165 * @input: the input stream
1166 * @handler: the encoding handler
1168 * change the input functions when discovering the character encoding
1169 * of a given entity.
1171 * Returns 0 in case of success, -1 otherwise
1174 xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt
, xmlParserInputPtr input
,
1175 xmlCharEncodingHandlerPtr handler
)
1178 xmlParserInputBufferPtr in
;
1180 if (handler
== NULL
)
1186 xmlErrInternal(ctxt
,
1187 "static memory buffer doesn't support encoding\n", NULL
);
1189 * Callers assume that the input buffer takes ownership of the
1190 * encoding handler. xmlCharEncCloseFunc frees unregistered
1191 * handlers and avoids a memory leak.
1193 xmlCharEncCloseFunc(handler
);
1197 if (in
->encoder
!= NULL
) {
1198 if (in
->encoder
== handler
)
1202 * Switching encodings during parsing is a really bad idea,
1203 * but WebKit/Chromium switches from ISO-8859-1 to UTF-16 as soon as
1204 * it finds Unicode characters with code points larger than 255.
1206 * TODO: We should check whether the "raw" input buffer is empty and
1207 * convert the old content using the old encoder.
1210 xmlCharEncCloseFunc(in
->encoder
);
1211 in
->encoder
= handler
;
1215 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1216 in
->encoder
= handler
;
1219 * Is there already some content down the pipe to convert ?
1221 if (xmlBufIsEmpty(in
->buffer
) == 0) {
1222 size_t processed
, use
, consumed
;
1225 * Specific handling of the Byte Order Mark for
1228 if ((handler
->name
!= NULL
) &&
1229 (!strcmp(handler
->name
, "UTF-16LE") ||
1230 !strcmp(handler
->name
, "UTF-16")) &&
1231 (input
->cur
[0] == 0xFF) && (input
->cur
[1] == 0xFE)) {
1234 if ((handler
->name
!= NULL
) &&
1235 (!strcmp(handler
->name
, "UTF-16BE")) &&
1236 (input
->cur
[0] == 0xFE) && (input
->cur
[1] == 0xFF)) {
1240 * Errata on XML-1.0 June 20 2001
1241 * Specific handling of the Byte Order Mark for
1244 if ((handler
->name
!= NULL
) &&
1245 (!strcmp(handler
->name
, "UTF-8")) &&
1246 (input
->cur
[0] == 0xEF) &&
1247 (input
->cur
[1] == 0xBB) && (input
->cur
[2] == 0xBF)) {
1252 * Shrink the current input buffer.
1253 * Move it as the raw buffer and create a new input buffer
1255 processed
= input
->cur
- input
->base
;
1256 xmlBufShrink(in
->buffer
, processed
);
1257 input
->consumed
+= processed
;
1258 in
->raw
= in
->buffer
;
1259 in
->buffer
= xmlBufCreate();
1260 in
->rawconsumed
= processed
;
1261 use
= xmlBufUse(in
->raw
);
1264 * TODO: We must flush and decode the whole buffer to make functions
1265 * like xmlReadMemory work with a user-provided encoding. If the
1266 * encoding is specified directly, we should probably set
1267 * XML_PARSE_IGNORE_ENC in xmlDoRead to avoid switching encodings
1268 * twice. Then we could set "flush" to false which should save
1269 * a considerable amount of memory when parsing from memory.
1270 * It's probably even possible to remove this whole if-block
1273 nbchars
= xmlCharEncInput(in
, 1);
1274 xmlBufResetInput(in
->buffer
, input
);
1276 /* TODO: This could be an out of memory or an encoding error. */
1277 xmlErrInternal(ctxt
,
1278 "switching encoding: encoder error\n",
1280 xmlHaltParser(ctxt
);
1283 consumed
= use
- xmlBufUse(in
->raw
);
1284 if ((consumed
> ULONG_MAX
) ||
1285 (in
->rawconsumed
> ULONG_MAX
- (unsigned long)consumed
))
1286 in
->rawconsumed
= ULONG_MAX
;
1288 in
->rawconsumed
+= consumed
;
1294 * xmlSwitchToEncoding:
1295 * @ctxt: the parser context
1296 * @handler: the encoding handler
1298 * change the input functions when discovering the character encoding
1299 * of a given entity.
1301 * Returns 0 in case of success, -1 otherwise
1304 xmlSwitchToEncoding(xmlParserCtxtPtr ctxt
, xmlCharEncodingHandlerPtr handler
)
1308 return(xmlSwitchInputEncoding(ctxt
, ctxt
->input
, handler
));
1311 /************************************************************************
1313 * Commodity functions to handle entities processing *
1315 ************************************************************************/
1318 * xmlFreeInputStream:
1319 * @input: an xmlParserInputPtr
1321 * Free up an input stream.
1324 xmlFreeInputStream(xmlParserInputPtr input
) {
1325 if (input
== NULL
) return;
1327 if (input
->filename
!= NULL
) xmlFree((char *) input
->filename
);
1328 if (input
->directory
!= NULL
) xmlFree((char *) input
->directory
);
1329 if (input
->encoding
!= NULL
) xmlFree((char *) input
->encoding
);
1330 if (input
->version
!= NULL
) xmlFree((char *) input
->version
);
1331 if ((input
->free
!= NULL
) && (input
->base
!= NULL
))
1332 input
->free((xmlChar
*) input
->base
);
1333 if (input
->buf
!= NULL
)
1334 xmlFreeParserInputBuffer(input
->buf
);
1339 * xmlNewInputStream:
1340 * @ctxt: an XML parser context
1342 * Create a new input stream structure.
1344 * Returns the new input stream or NULL
1347 xmlNewInputStream(xmlParserCtxtPtr ctxt
) {
1348 xmlParserInputPtr input
;
1350 input
= (xmlParserInputPtr
) xmlMalloc(sizeof(xmlParserInput
));
1351 if (input
== NULL
) {
1352 xmlErrMemory(ctxt
, "couldn't allocate a new input stream\n");
1355 memset(input
, 0, sizeof(xmlParserInput
));
1358 input
->standalone
= -1;
1361 * If the context is NULL the id cannot be initialized, but that
1362 * should not happen while parsing which is the situation where
1363 * the id is actually needed.
1366 if (input
->id
>= INT_MAX
) {
1367 xmlErrMemory(ctxt
, "Input ID overflow\n");
1370 input
->id
= ctxt
->input_id
++;
1377 * xmlNewIOInputStream:
1378 * @ctxt: an XML parser context
1379 * @input: an I/O Input
1380 * @enc: the charset encoding if known
1382 * Create a new input stream structure encapsulating the @input into
1383 * a stream suitable for the parser.
1385 * Returns the new input stream or NULL
/*
 * xmlNewIOInputStream: wrap an existing xmlParserInputBuffer (@input) in a
 * freshly created parser input stream and return that stream.
 * Returns NULL immediately when @input is NULL.
 * NOTE(review): some source lines are not visible in this view (gaps in
 * the embedded numbering); comments cover the visible statements only.
 */
1388 xmlNewIOInputStream(xmlParserCtxtPtr ctxt
, xmlParserInputBufferPtr input
,
1389 xmlCharEncoding enc
) {
1390 xmlParserInputPtr inputStream
;
1392 if (input
== NULL
) return(NULL
);
/* Trace message, emitted only when xmlParserDebugEntities is non-zero. */
1393 if (xmlParserDebugEntities
)
1394 xmlGenericError(xmlGenericErrorContext
, "new input from I/O\n");
1395 inputStream
= xmlNewInputStream(ctxt
);
1396 if (inputStream
== NULL
) {
/* The stream has no file name; it reads from the supplied I/O buffer. */
1399 inputStream
->filename
= NULL
;
1400 inputStream
->buf
= input
;
/* Hand the buffer content and the stream to xmlBufResetInput. */
1401 xmlBufResetInput(inputStream
->buf
->buffer
, inputStream
);
/*
 * A known encoding is applied through ctxt.
 * NOTE(review): the call receives ctxt, not inputStream; whether
 * inputStream is the context's current input at this point is not
 * visible here -- confirm.
 */
1403 if (enc
!= XML_CHAR_ENCODING_NONE
) {
1404 xmlSwitchEncoding(ctxt
, enc
);
1407 return(inputStream
);
1411 * xmlNewEntityInputStream:
1412 * @ctxt: an XML parser context
1413 * @entity: an Entity pointer
1415 * DEPRECATED: Internal function, do not use.
1417 * Create a new input stream based on an xmlEntityPtr
1419 * Returns the new input stream or NULL
/*
 * xmlNewEntityInputStream: build a parser input stream for @entity.
 * Entities without content are dispatched on entity->etype: external
 * parsed/parameter entities go through xmlLoadExternalEntity, every other
 * visible case reports an internal error. Entities with content get a
 * new stream whose base/cur/end point into entity->content.
 * NOTE(review): several source lines (trailing call arguments, returns,
 * breaks, closing braces) are not visible in this view.
 */
1422 xmlNewEntityInputStream(xmlParserCtxtPtr ctxt
, xmlEntityPtr entity
) {
1423 xmlParserInputPtr input
;
1425 if (entity
== NULL
) {
1426 xmlErrInternal(ctxt
, "xmlNewEntityInputStream entity = NULL\n",
/* Trace message, emitted only when xmlParserDebugEntities is non-zero. */
1430 if (xmlParserDebugEntities
)
1431 xmlGenericError(xmlGenericErrorContext
,
1432 "new input from entity: %s\n", entity
->name
);
/* No in-memory content: what happens next depends on the entity type. */
1433 if (entity
->content
== NULL
) {
1434 switch (entity
->etype
) {
1435 case XML_EXTERNAL_GENERAL_UNPARSED_ENTITY
:
1436 xmlErrInternal(ctxt
, "Cannot parse entity %s\n",
1439 case XML_EXTERNAL_GENERAL_PARSED_ENTITY
:
1440 case XML_EXTERNAL_PARAMETER_ENTITY
:
1441 input
= xmlLoadExternalEntity((char *) entity
->URI
,
1442 (char *) entity
->ExternalID
, ctxt
);
/* NOTE(review): any NULL test on the loader result is not visible here. */
1444 input
->entity
= entity
;
1446 case XML_INTERNAL_GENERAL_ENTITY
:
1447 xmlErrInternal(ctxt
,
1448 "Internal entity %s without content !\n",
1451 case XML_INTERNAL_PARAMETER_ENTITY
:
1452 xmlErrInternal(ctxt
,
1453 "Internal parameter entity %s without content !\n",
1456 case XML_INTERNAL_PREDEFINED_ENTITY
:
1457 xmlErrInternal(ctxt
,
1458 "Predefined entity %s without content !\n",
/* Content is available: create a stream that reads it in place. */
1464 input
= xmlNewInputStream(ctxt
);
1465 if (input
== NULL
) {
/* The file name is a private copy of the entity URI, when there is one. */
1468 if (entity
->URI
!= NULL
)
1469 input
->filename
= (char *) xmlStrdup((xmlChar
*) entity
->URI
);
/* base aliases entity->content; no copy of the content is made here. */
1470 input
->base
= entity
->content
;
/* A length of 0 is treated as "not computed yet" and filled in lazily. */
1471 if (entity
->length
== 0)
1472 entity
->length
= xmlStrlen(entity
->content
);
1473 input
->cur
= entity
->content
;
1474 input
->length
= entity
->length
;
/* end is the address of the element at index length in the content. */
1475 input
->end
= &entity
->content
[input
->length
];
1476 input
->entity
= entity
;
1481 * xmlNewStringInputStream:
1482 * @ctxt: an XML parser context
1483 * @buffer: a memory buffer
1485 * Create a new input stream based on a memory buffer.
1486 * Returns the new input stream
/*
 * xmlNewStringInputStream: create a parser input stream that reads from
 * the memory buffer @buffer, via an xmlParserInputBuffer created with
 * xmlParserInputBufferCreateMem.
 * NOTE(review): some source lines (a call argument, NULL tests, returns,
 * the assignment that links buf to the stream) are not visible in this
 * view; comments cover the visible statements only.
 */
1489 xmlNewStringInputStream(xmlParserCtxtPtr ctxt
, const xmlChar
*buffer
) {
1490 xmlParserInputPtr input
;
1491 xmlParserInputBufferPtr buf
;
1493 if (buffer
== NULL
) {
1494 xmlErrInternal(ctxt
, "xmlNewStringInputStream string = NULL\n",
/* Trace message; "%.30s" prints at most the first 30 bytes of buffer. */
1498 if (xmlParserDebugEntities
)
1499 xmlGenericError(xmlGenericErrorContext
,
1500 "new fixed input: %.30s\n", buffer
);
1501 buf
= xmlParserInputBufferCreateMem((const char *) buffer
,
1503 XML_CHAR_ENCODING_NONE
);
1505 xmlErrMemory(ctxt
, NULL
);
1508 input
= xmlNewInputStream(ctxt
);
1509 if (input
== NULL
) {
1510 xmlErrMemory(ctxt
, "couldn't allocate a new input stream\n");
/* The I/O buffer created above is released when no stream can hold it. */
1511 xmlFreeParserInputBuffer(buf
);
1515 xmlBufResetInput(input
->buf
->buffer
, input
);
1520 * xmlNewInputFromFile:
1521 * @ctxt: an XML parser context
1522 * @filename: the filename to use as entity
1524 * Create a new input stream based on a file or a URL.
1526 * Returns the new input stream or NULL in case of error
/*
 * xmlNewInputFromFile: open @filename through
 * xmlParserInputBufferCreateFilename, wrap the result in a new input
 * stream, pass it through xmlCheckHTTPInput, then record a canonical
 * file name and a directory on the stream.
 * Returns the stream; returns NULL when ctxt is NULL.
 * NOTE(review): some source lines (NULL tests, else keywords, returns)
 * are not visible in this view; comments cover visible statements only.
 */
1529 xmlNewInputFromFile(xmlParserCtxtPtr ctxt
, const char *filename
) {
1530 xmlParserInputBufferPtr buf
;
1531 xmlParserInputPtr inputStream
;
1532 char *directory
= NULL
;
1533 xmlChar
*URI
= NULL
;
/*
 * NOTE(review): filename is formatted with "%s" here, before the
 * filename == NULL test further down -- confirm a NULL filename cannot
 * reach this trace when xmlParserDebugEntities is set.
 */
1535 if (xmlParserDebugEntities
)
1536 xmlGenericError(xmlGenericErrorContext
,
1537 "new input from file: %s\n", filename
);
1538 if (ctxt
== NULL
) return(NULL
);
1539 buf
= xmlParserInputBufferCreateFilename(filename
, XML_CHAR_ENCODING_NONE
);
/* Two distinct loader messages: one for a NULL name, one quoting the name. */
1541 if (filename
== NULL
)
1542 __xmlLoaderErr(ctxt
,
1543 "failed to load external entity: NULL filename \n",
1546 __xmlLoaderErr(ctxt
, "failed to load external entity \"%s\"\n",
1547 (const char *) filename
);
1551 inputStream
= xmlNewInputStream(ctxt
);
1552 if (inputStream
== NULL
) {
/* No stream to own the buffer, so it is released here. */
1553 xmlFreeParserInputBuffer(buf
);
1557 inputStream
->buf
= buf
;
/* xmlCheckHTTPInput returns the stream to continue with, possibly NULL. */
1558 inputStream
= xmlCheckHTTPInput(ctxt
, inputStream
);
1559 if (inputStream
== NULL
)
/* URI is a private copy of either the caller's name or the stream's own. */
1562 if (inputStream
->filename
== NULL
)
1563 URI
= xmlStrdup((xmlChar
*) filename
);
1565 URI
= xmlStrdup((xmlChar
*) inputStream
->filename
);
1566 directory
= xmlParserGetDirectory((const char *) URI
);
/* Replace the stream's file name with the canonical form of URI. */
1567 if (inputStream
->filename
!= NULL
) xmlFree((char *)inputStream
->filename
);
1568 inputStream
->filename
= (char *) xmlCanonicPath((const xmlChar
*) URI
);
1569 if (URI
!= NULL
) xmlFree((char *) URI
);
/* The stream stores the directory pointer itself, not a copy. */
1570 inputStream
->directory
= directory
;
1572 xmlBufResetInput(inputStream
->buf
->buffer
, inputStream
);
/* The context gets its own duplicate, and only if it has none yet. */
1573 if ((ctxt
->directory
== NULL
) && (directory
!= NULL
))
1574 ctxt
->directory
= (char *) xmlStrdup((const xmlChar
*) directory
);
1575 return(inputStream
);
1578 /************************************************************************
1580 * Commodity functions to handle parser contexts *
1582 ************************************************************************/
1585 * xmlInitSAXParserCtxt:
1586 * @ctxt: XML parser context
1587 * @sax: SAX handler
1588 * @userData: user data
1590 * Initialize a SAX parser context
1592 * Returns 0 in case of success and -1 in case of error
/*
 * xmlInitSAXParserCtxt: put a parser context into its initial state.
 * Visible work: create the dictionary and SAX handler block if absent,
 * install either a default SAX2 handler or a copy of @sax, allocate the
 * input/node/name/space stacks if absent, drop any pending inputs, and
 * seed the per-context flags and options from the library-wide default
 * values.
 * NOTE(review): many source lines (the userData parameter, NULL tests,
 * else keywords, returns, counter resets) are not visible in this view;
 * comments cover the visible statements only.
 */
1596 xmlInitSAXParserCtxt(xmlParserCtxtPtr ctxt
, const xmlSAXHandler
*sax
,
1599 xmlParserInputPtr input
;
1602 xmlErrInternal(NULL
, "Got NULL parser context\n", NULL
);
/* Create the dictionary only if the context does not have one yet. */
1608 if (ctxt
->dict
== NULL
)
1609 ctxt
->dict
= xmlDictCreate();
1610 if (ctxt
->dict
== NULL
) {
1611 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1614 xmlDictSetLimit(ctxt
->dict
, XML_MAX_DICTIONARY_LIMIT
);
/* Same pattern for the SAX handler block: allocate only when absent. */
1616 if (ctxt
->sax
== NULL
)
1617 ctxt
->sax
= (xmlSAXHandler
*) xmlMalloc(sizeof(xmlSAXHandler
));
1618 if (ctxt
->sax
== NULL
) {
1619 xmlErrMemory(NULL
, "cannot initialize parser context\n");
/*
 * Default handler: cleared, filled by xmlSAXVersion(…, 2), with the
 * context itself as user data.
 * NOTE(review): the condition selecting this path versus the copy of
 * @sax below is not visible here; presumably a sax == NULL test.
 */
1623 memset(ctxt
->sax
, 0, sizeof(xmlSAXHandler
));
1624 xmlSAXVersion(ctxt
->sax
, 2);
1625 ctxt
->userData
= ctxt
;
/*
 * Caller-supplied handler: a full-size copy when it carries the SAX2
 * magic; the other visible path clears the block and copies only
 * sizeof(xmlSAXHandlerV1) bytes.
 */
1627 if (sax
->initialized
== XML_SAX2_MAGIC
) {
1628 memcpy(ctxt
->sax
, sax
, sizeof(xmlSAXHandler
));
1630 memset(ctxt
->sax
, 0, sizeof(xmlSAXHandler
));
1631 memcpy(ctxt
->sax
, sax
, sizeof(xmlSAXHandlerV1
));
/* A NULL userData falls back to the context itself. */
1633 ctxt
->userData
= userData
? userData
: ctxt
;
1638 /* Allocate the Input stack */
1639 if (ctxt
->inputTab
== NULL
) {
1640 ctxt
->inputTab
= (xmlParserInputPtr
*)
1641 xmlMalloc(5 * sizeof(xmlParserInputPtr
));
1644 if (ctxt
->inputTab
== NULL
) {
1645 xmlErrMemory(NULL
, "cannot initialize parser context\n");
/* Pop and free every input still on the stack. */
1651 while ((input
= inputPop(ctxt
)) != NULL
) { /* Non consuming */
1652 xmlFreeInputStream(input
);
/*
 * Reset document-level fields.
 * NOTE(review): version, encoding and directory are overwritten with
 * NULL without a visible free; confirm they are expected to be unset
 * (or released elsewhere) when this runs.
 */
1657 ctxt
->version
= NULL
;
1658 ctxt
->encoding
= NULL
;
1659 ctxt
->standalone
= -1;
1660 ctxt
->hasExternalSubset
= 0;
1661 ctxt
->hasPErefs
= 0;
1664 ctxt
->instate
= XML_PARSER_START
;
1666 ctxt
->directory
= NULL
;
1668 /* Allocate the Node stack */
1669 if (ctxt
->nodeTab
== NULL
) {
1670 ctxt
->nodeTab
= (xmlNodePtr
*) xmlMalloc(10 * sizeof(xmlNodePtr
));
1673 if (ctxt
->nodeTab
== NULL
) {
1674 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1686 /* Allocate the Name stack */
1687 if (ctxt
->nameTab
== NULL
) {
1688 ctxt
->nameTab
= (const xmlChar
**) xmlMalloc(10 * sizeof(xmlChar
*));
1691 if (ctxt
->nameTab
== NULL
) {
1692 xmlErrMemory(NULL
, "cannot initialize parser context\n");
1707 /* Allocate the space stack */
1708 if (ctxt
->spaceTab
== NULL
) {
1709 ctxt
->spaceTab
= (int *) xmlMalloc(10 * sizeof(int));
1710 ctxt
->spaceMax
= 10;
1712 if (ctxt
->spaceTab
== NULL
) {
1713 xmlErrMemory(NULL
, "cannot initialize parser context\n");
/*
 * NOTE(review): spaceMax is set to 10 a second time here; when spaceTab
 * already existed its real capacity is not visible from this view --
 * confirm 10 is always correct.
 */
1729 ctxt
->spaceMax
= 10;
/* Slot 0 of the space stack holds -1 and becomes the current entry. */
1730 ctxt
->spaceTab
[0] = -1;
1731 ctxt
->space
= &ctxt
->spaceTab
[0];
1733 ctxt
->wellFormed
= 1;
1734 ctxt
->nsWellFormed
= 1;
/*
 * From here on each per-context flag is copied from the matching
 * library-wide default, and the corresponding XML_PARSE_* bit is OR-ed
 * into ctxt->options when the flag calls for it.
 */
1736 ctxt
->loadsubset
= xmlLoadExtDtdDefaultValue
;
1737 if (ctxt
->loadsubset
) {
1738 ctxt
->options
|= XML_PARSE_DTDLOAD
;
1740 ctxt
->validate
= xmlDoValidityCheckingDefaultValue
;
1741 ctxt
->pedantic
= xmlPedanticParserDefaultValue
;
1742 if (ctxt
->pedantic
) {
1743 ctxt
->options
|= XML_PARSE_PEDANTIC
;
1745 ctxt
->linenumbers
= xmlLineNumbersDefaultValue
;
1746 ctxt
->keepBlanks
= xmlKeepBlanksDefaultValue
;
/* With keepBlanks off, the SAX2 ignorable-whitespace callback is installed. */
1747 if (ctxt
->keepBlanks
== 0) {
1748 ctxt
->sax
->ignorableWhitespace
= xmlSAX2IgnorableWhitespace
;
1749 ctxt
->options
|= XML_PARSE_NOBLANKS
;
/* The validation context reports through the parser's validity callbacks. */
1752 ctxt
->vctxt
.flags
= XML_VCTXT_USE_PCTXT
;
1753 ctxt
->vctxt
.userData
= ctxt
;
1754 ctxt
->vctxt
.error
= xmlParserValidityError
;
1755 ctxt
->vctxt
.warning
= xmlParserValidityWarning
;
/* When validating, a zero xmlGetWarningsDefaultValue clears the warning callback. */
1756 if (ctxt
->validate
) {
1757 if (xmlGetWarningsDefaultValue
== 0)
1758 ctxt
->vctxt
.warning
= NULL
;
1760 ctxt
->vctxt
.warning
= xmlParserValidityWarning
;
1761 ctxt
->vctxt
.nodeMax
= 0;
1762 ctxt
->options
|= XML_PARSE_DTDVALID
;
1764 ctxt
->replaceEntities
= xmlSubstituteEntitiesDefaultValue
;
1765 if (ctxt
->replaceEntities
) {
1766 ctxt
->options
|= XML_PARSE_NOENT
;
/* Remaining bookkeeping fields start from zero / OK / UTF-8 / NULL. */
1768 ctxt
->record_info
= 0;
1769 ctxt
->checkIndex
= 0;
1771 ctxt
->errNo
= XML_ERR_OK
;
1773 ctxt
->charset
= XML_CHAR_ENCODING_UTF8
;
1774 ctxt
->catalogs
= NULL
;
1775 ctxt
->sizeentities
= 0;
1776 ctxt
->sizeentcopy
= 0;
1778 xmlInitNodeInfoSeq(&ctxt
->node_seq
);
1783 * xmlInitParserCtxt:
1784 * @ctxt: an XML parser context
1786 * DEPRECATED: Internal function which will be made private in a future
1789 * Initialize a parser context
1791 * Returns 0 in case of success and -1 in case of error
/*
 * xmlInitParserCtxt: thin wrapper that forwards ctxt to
 * xmlInitSAXParserCtxt with NULL for both the SAX handler and the user
 * data, and returns that call's result unchanged.
 */
1795 xmlInitParserCtxt(xmlParserCtxtPtr ctxt
)
1797 return(xmlInitSAXParserCtxt(ctxt
, NULL
, NULL
));
1801 * xmlFreeParserCtxt:
1802 * @ctxt: an XML parser context
1804 * Free all the memory used by a parser context. However the parsed
1805 * document in ctxt->myDoc is not freed.
/*
 * xmlFreeParserCtxt: release the resources attached to a parser context:
 * pending input streams, the parser stacks and tables, strings, the
 * dictionary, the attribute hash tables, the recycled element/attribute
 * lists, the last-error strings and (when catalogs are compiled in) the
 * local catalogs. A NULL ctxt is accepted and ignored.
 * NOTE(review): some source lines (loop bodies, the release of ctxt->sax
 * and of ctxt itself, #else/#endif) are not visible in this view;
 * comments cover the visible statements only.
 */
1809 xmlFreeParserCtxt(xmlParserCtxtPtr ctxt
)
1811 xmlParserInputPtr input
;
1813 if (ctxt
== NULL
) return;
/* Pop and free every input stream still on the stack. */
1815 while ((input
= inputPop(ctxt
)) != NULL
) { /* Non consuming */
1816 xmlFreeInputStream(input
);
/* Each table and string below is freed only when its pointer is non-NULL. */
1818 if (ctxt
->spaceTab
!= NULL
) xmlFree(ctxt
->spaceTab
);
1819 if (ctxt
->nameTab
!= NULL
) xmlFree((xmlChar
* *)ctxt
->nameTab
);
1820 if (ctxt
->nodeTab
!= NULL
) xmlFree(ctxt
->nodeTab
);
1821 if (ctxt
->nodeInfoTab
!= NULL
) xmlFree(ctxt
->nodeInfoTab
);
1822 if (ctxt
->inputTab
!= NULL
) xmlFree(ctxt
->inputTab
);
1823 if (ctxt
->version
!= NULL
) xmlFree((char *) ctxt
->version
);
1824 if (ctxt
->encoding
!= NULL
) xmlFree((char *) ctxt
->encoding
);
1825 if (ctxt
->extSubURI
!= NULL
) xmlFree((char *) ctxt
->extSubURI
);
1826 if (ctxt
->extSubSystem
!= NULL
) xmlFree((char *) ctxt
->extSubSystem
);
/*
 * With SAX1 support compiled in, the test also excludes the address of
 * the global xmlDefaultSAXHandler; the other visible test checks only
 * for non-NULL. The statement these conditions guard is not visible.
 */
1827 #ifdef LIBXML_SAX1_ENABLED
1828 if ((ctxt
->sax
!= NULL
) &&
1829 (ctxt
->sax
!= (xmlSAXHandlerPtr
) &xmlDefaultSAXHandler
))
1831 if (ctxt
->sax
!= NULL
)
1832 #endif /* LIBXML_SAX1_ENABLED */
1834 if (ctxt
->directory
!= NULL
) xmlFree((char *) ctxt
->directory
);
1835 if (ctxt
->vctxt
.nodeTab
!= NULL
) xmlFree(ctxt
->vctxt
.nodeTab
);
1836 if (ctxt
->atts
!= NULL
) xmlFree((xmlChar
* *)ctxt
->atts
);
/* The dictionary is released through xmlDictFree rather than xmlFree. */
1837 if (ctxt
->dict
!= NULL
) xmlDictFree(ctxt
->dict
);
1838 if (ctxt
->nsTab
!= NULL
) xmlFree((char *) ctxt
->nsTab
);
1839 if (ctxt
->pushTab
!= NULL
) xmlFree(ctxt
->pushTab
);
1840 if (ctxt
->attallocs
!= NULL
) xmlFree(ctxt
->attallocs
);
/* attsDefault is freed with a deallocator callback, attsSpecial with none. */
1841 if (ctxt
->attsDefault
!= NULL
)
1842 xmlHashFree(ctxt
->attsDefault
, xmlHashDefaultDeallocator
);
1843 if (ctxt
->attsSpecial
!= NULL
)
1844 xmlHashFree(ctxt
->attsSpecial
, NULL
);
/* Walk the freeElems list; the loop body is not visible in this view. */
1845 if (ctxt
->freeElems
!= NULL
) {
1846 xmlNodePtr cur
, next
;
1848 cur
= ctxt
->freeElems
;
1849 while (cur
!= NULL
) {
/* Same walk for the freeAttrs list. */
1855 if (ctxt
->freeAttrs
!= NULL
) {
1856 xmlAttrPtr cur
, next
;
1858 cur
= ctxt
->freeAttrs
;
1859 while (cur
!= NULL
) {
1866 * cleanup the error strings
1868 if (ctxt
->lastError
.message
!= NULL
)
1869 xmlFree(ctxt
->lastError
.message
);
1870 if (ctxt
->lastError
.file
!= NULL
)
1871 xmlFree(ctxt
->lastError
.file
);
1872 if (ctxt
->lastError
.str1
!= NULL
)
1873 xmlFree(ctxt
->lastError
.str1
);
1874 if (ctxt
->lastError
.str2
!= NULL
)
1875 xmlFree(ctxt
->lastError
.str2
);
1876 if (ctxt
->lastError
.str3
!= NULL
)
1877 xmlFree(ctxt
->lastError
.str3
);
/* Local catalogs exist only in builds with LIBXML_CATALOG_ENABLED. */
1879 #ifdef LIBXML_CATALOG_ENABLED
1880 if (ctxt
->catalogs
!= NULL
)
1881 xmlCatalogFreeLocal(ctxt
->catalogs
);
1889 * Allocate and initialize a new parser context.
1891 * Returns the xmlParserCtxtPtr or NULL
/*
 * xmlNewParserCtxt: convenience wrapper returning the result of
 * xmlNewSAXParserCtxt called with NULL for both the SAX handler and the
 * user data.
 */
1895 xmlNewParserCtxt(void)
1897 return(xmlNewSAXParserCtxt(NULL
, NULL
));
1901 * xmlNewSAXParserCtxt:
1903 * @userData: user data
1905 * Allocate and initialize a new SAX parser context. If userData is NULL,
1906 * the parser context will be passed as user data.
1908 * Returns the xmlParserCtxtPtr or NULL if memory allocation failed.
/*
 * xmlNewSAXParserCtxt: allocate a parser context, zero it and initialize
 * it with xmlInitSAXParserCtxt using @sax and @userData. If the
 * initialization reports a negative result the context is released with
 * xmlFreeParserCtxt.
 * NOTE(review): the NULL test after the allocation and the return
 * statements are not visible in this view.
 */
1912 xmlNewSAXParserCtxt(const xmlSAXHandler
*sax
, void *userData
)
1914 xmlParserCtxtPtr ctxt
;
1916 ctxt
= (xmlParserCtxtPtr
) xmlMalloc(sizeof(xmlParserCtxt
));
1918 xmlErrMemory(NULL
, "cannot allocate parser context\n");
/* Zero fill first so the initializer sees every pointer field as NULL. */
1921 memset(ctxt
, 0, sizeof(xmlParserCtxt
));
1922 if (xmlInitSAXParserCtxt(ctxt
, sax
, userData
) < 0) {
1923 xmlFreeParserCtxt(ctxt
);
1929 /************************************************************************
1931 * Handling of node information *
1933 ************************************************************************/
1936 * xmlClearParserCtxt:
1937 * @ctxt: an XML parser context
1939 * Clear (release owned resources) and reinitialize a parser context
/*
 * xmlClearParserCtxt: the visible statement clears the context's node
 * info sequence through xmlClearNodeInfoSeq.
 * NOTE(review): other lines of this function (e.g. a NULL test or a
 * re-initialization call) are not visible in this view.
 */
1943 xmlClearParserCtxt(xmlParserCtxtPtr ctxt
)
1947 xmlClearNodeInfoSeq(&ctxt
->node_seq
);
1953 * xmlParserFindNodeInfo:
1954 * @ctx: an XML parser context
1955 * @node: an XML node within the tree
1957 * DEPRECATED: Don't use.
1959 * Find the parser node info struct for a given node
1961 * Returns an xmlParserNodeInfo block pointer or NULL
/*
 * xmlParserFindNodeInfo: look up the node info record stored for @node
 * in ctx->node_seq. The index comes from xmlParserFindNodeInfoIndex; the
 * record's address is returned only when that index is inside the
 * sequence and the record there refers to exactly @node.
 * NOTE(review): the declaration of pos and the returns taken on the
 * failure paths are not visible in this view.
 */
1963 const xmlParserNodeInfo
*
1964 xmlParserFindNodeInfo(const xmlParserCtxtPtr ctx
, const xmlNodePtr node
)
1968 if ((ctx
== NULL
) || (node
== NULL
))
1970 /* Find position where node should be at */
1971 pos
= xmlParserFindNodeInfoIndex(&ctx
->node_seq
, node
);
/* pos may be an insertion point, so the stored node must match too. */
1972 if (pos
< ctx
->node_seq
.length
1973 && ctx
->node_seq
.buffer
[pos
].node
== node
)
1974 return &ctx
->node_seq
.buffer
[pos
];
1981 * xmlInitNodeInfoSeq:
1982 * @seq: a node info sequence pointer
1984 * DEPRECATED: Don't use.
1986 * -- Initialize (set to initial state) node info sequence
/*
 * xmlInitNodeInfoSeq: set a node info sequence to its initial state.
 * NOTE(review): only the signature is visible in this view; the body is
 * not shown, so the fields it resets cannot be listed from here.
 */
1989 xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq
)
1999 * xmlClearNodeInfoSeq:
2000 * @seq: a node info sequence pointer
2002 * DEPRECATED: Don't use.
2004 * -- Clear (release memory and reinitialize) node
/*
 * xmlClearNodeInfoSeq: free the sequence's buffer when it has one, then
 * put the sequence back into its initial state via xmlInitNodeInfoSeq.
 * NOTE(review): any NULL test on seq is not visible in this view.
 */
2008 xmlClearNodeInfoSeq(xmlParserNodeInfoSeqPtr seq
)
2012 if (seq
->buffer
!= NULL
)
2013 xmlFree(seq
->buffer
);
2014 xmlInitNodeInfoSeq(seq
);
2018 * xmlParserFindNodeInfoIndex:
2019 * @seq: a node info sequence pointer
2020 * @node: an XML node pointer
2022 * DEPRECATED: Don't use.
2024 * xmlParserFindNodeInfoIndex : Find the index that the info record for
2025 * the given node is or should be at in a sorted sequence
2027 * Returns a long indicating the position of the record
/*
 * xmlParserFindNodeInfoIndex: binary search of seq->buffer, comparing
 * node pointers, for the record that belongs to @node.
 * Returns (unsigned long) -1 when seq or node is NULL.
 * NOTE(review): the initial values of lower/middle/found, the branch
 * bodies that narrow the range and the final returns are not visible in
 * this view; comments cover the visible statements only.
 */
2030 xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeqPtr seq
,
2031 const xmlNodePtr node
)
2033 unsigned long upper
, lower
, middle
;
2036 if ((seq
== NULL
) || (node
== NULL
))
2037 return ((unsigned long) -1);
2039 /* Do a binary search for the key */
2041 upper
= seq
->length
;
2043 while (lower
<= upper
&& !found
) {
/* Midpoint written as lower + half the distance, not (lower + upper) / 2. */
2044 middle
= lower
+ (upper
- lower
) / 2;
/* The buffer is read at middle - 1: the search bounds are one-based. */
2045 if (node
== seq
->buffer
[middle
- 1].node
)
2047 else if (node
< seq
->buffer
[middle
- 1].node
)
2053 /* Return position */
2054 if (middle
== 0 || seq
->buffer
[middle
- 1].node
< node
)
2062 * xmlParserAddNodeInfo:
2063 * @ctxt: an XML parser context
2064 * @info: a node info sequence pointer
2066 * DEPRECATED: Don't use.
2068 * Insert node info record into the sorted sequence
/*
 * xmlParserAddNodeInfo: store *info in ctxt->node_seq, which is kept
 * ordered by node pointer. An existing record for the same node is
 * overwritten in place; otherwise the buffer is grown when needed, the
 * records from the insertion point onward are shifted up by one, and
 * the new record is copied in. NULL ctxt or info is ignored.
 * NOTE(review): some source lines (declarations of pos and i, the else
 * keywords, a call argument, the return after an allocation failure)
 * are not visible in this view.
 */
2071 xmlParserAddNodeInfo(xmlParserCtxtPtr ctxt
,
2072 const xmlParserNodeInfoPtr info
)
2076 if ((ctxt
== NULL
) || (info
== NULL
)) return;
2078 /* Find pos and check to see if node is already in the sequence */
2079 pos
= xmlParserFindNodeInfoIndex(&ctxt
->node_seq
, (xmlNodePtr
)
2082 if ((pos
< ctxt
->node_seq
.length
) &&
2083 (ctxt
->node_seq
.buffer
!= NULL
) &&
2084 (ctxt
->node_seq
.buffer
[pos
].node
== info
->node
)) {
2085 ctxt
->node_seq
.buffer
[pos
] = *info
;
2088 /* Otherwise, we need to add new node to buffer */
2090 if ((ctxt
->node_seq
.length
+ 1 > ctxt
->node_seq
.maximum
) ||
2091 (ctxt
->node_seq
.buffer
== NULL
)) {
2092 xmlParserNodeInfo
*tmp_buffer
;
/*
 * NOTE(review): byte_size is an unsigned int while the product below
 * involves sizeof (a size_t); for a very large maximum the stored size
 * could be truncated -- confirm the range of node_seq.maximum.
 */
2093 unsigned int byte_size
;
/* An empty sequence starts from a nominal capacity of 2 before doubling. */
2095 if (ctxt
->node_seq
.maximum
== 0)
2096 ctxt
->node_seq
.maximum
= 2;
/* The request is for twice the current capacity, in bytes. */
2097 byte_size
= (sizeof(*ctxt
->node_seq
.buffer
) *
2098 (2 * ctxt
->node_seq
.maximum
));
2100 if (ctxt
->node_seq
.buffer
== NULL
)
2101 tmp_buffer
= (xmlParserNodeInfo
*) xmlMalloc(byte_size
);
2104 (xmlParserNodeInfo
*) xmlRealloc(ctxt
->node_seq
.buffer
,
/* The result goes to a temporary so the old buffer pointer survives a failure. */
2107 if (tmp_buffer
== NULL
) {
2108 xmlErrMemory(ctxt
, "failed to allocate buffer\n");
2111 ctxt
->node_seq
.buffer
= tmp_buffer
;
2112 ctxt
->node_seq
.maximum
*= 2;
2115 /* If position is not at end, move elements out of the way */
2116 if (pos
!= ctxt
->node_seq
.length
) {
/* Copy from the top down so no record is overwritten before it is moved. */
2119 for (i
= ctxt
->node_seq
.length
; i
> pos
; i
--)
2120 ctxt
->node_seq
.buffer
[i
] = ctxt
->node_seq
.buffer
[i
- 1];
2123 /* Copy element and increase length */
2124 ctxt
->node_seq
.buffer
[pos
] = *info
;
2125 ctxt
->node_seq
.length
++;
2129 /************************************************************************
2131 * Defaults settings *
2133 ************************************************************************/
2135 * xmlPedanticParserDefault:
2138 * DEPRECATED: Use the modern options API with XML_PARSE_PEDANTIC.
2140 * Set and return the previous value for enabling pedantic warnings.
2142 * Returns the previous value: 0 if pedantic warnings were disabled, non-zero if enabled.
/*
 * xmlPedanticParserDefault: save the current value of
 * xmlPedanticParserDefaultValue in old, then overwrite the global with
 * @val.
 * NOTE(review): the return statement is not visible in this view;
 * presumably old is returned.
 */
2146 xmlPedanticParserDefault(int val
) {
2147 int old
= xmlPedanticParserDefaultValue
;
2149 xmlPedanticParserDefaultValue
= val
;
2154 * xmlLineNumbersDefault:
2157 * DEPRECATED: The modern options API always enables line numbers.
2159 * Set and return the previous value for enabling line numbers in elements
2160 * contents. This may break old applications and is turned off by default.
2162 * Returns the previous value: 0 if line numbers were disabled, non-zero if enabled.
/*
 * xmlLineNumbersDefault: save the current value of
 * xmlLineNumbersDefaultValue in old, then overwrite the global with @val.
 * NOTE(review): the return statement is not visible in this view;
 * presumably old is returned.
 */
2166 xmlLineNumbersDefault(int val
) {
2167 int old
= xmlLineNumbersDefaultValue
;
2169 xmlLineNumbersDefaultValue
= val
;
2174 * xmlSubstituteEntitiesDefault:
2177 * DEPRECATED: Use the modern options API with XML_PARSE_NOENT.
2179 * Set and return the previous value for default entity support.
2180 * Initially the parser always keeps entity references instead of substituting
2181 * entity values in the output. This function has to be used to change the
2182 * default parser behavior
2183 * SAX::substituteEntities() has to be used for changing that on a file by
2186 * Returns the last value for 0 for no substitution, 1 for substitution.
/*
 * xmlSubstituteEntitiesDefault: save the current value of
 * xmlSubstituteEntitiesDefaultValue in old, then overwrite the global
 * with @val.
 * NOTE(review): the return statement is not visible in this view;
 * presumably old is returned.
 */
2190 xmlSubstituteEntitiesDefault(int val
) {
2191 int old
= xmlSubstituteEntitiesDefaultValue
;
2193 xmlSubstituteEntitiesDefaultValue
= val
;
2198 * xmlKeepBlanksDefault:
2201 * DEPRECATED: Use the modern options API with XML_PARSE_NOBLANKS.
2203 * Set and return the previous value for default blanks text nodes support.
2204 * The 1.x version of the parser used a heuristic to try to detect
2205 * ignorable white spaces. As a result the SAX callback was generating
2206 * xmlSAX2IgnorableWhitespace() callbacks instead of characters() one, and when
2207 * using the DOM output text nodes containing those blanks were not generated.
2208 * The 2.x and later version will switch to the XML standard way and
2209 * ignorableWhitespace() are only generated when running the parser in
2210 * validating mode and when the current element doesn't allow CDATA or
2212 * This function is provided as a way to force the standard behavior
2213 * on 1.X libs and to switch back to the old mode for compatibility when
2214 * running 1.X client code on 2.X . Upgrade of 1.X code should be done
2215 * by using xmlIsBlankNode() commodity function to detect the "empty"
2217 * This value also affects autogeneration of indentation when saving code
2218 * if blanks sections are kept, indentation is not generated.
2220 * Returns the previous value: 0 if blank text nodes were dropped, non-zero if kept.
2224 xmlKeepBlanksDefault(int val
) {
2225 int old
= xmlKeepBlanksDefaultValue
;
2227 xmlKeepBlanksDefaultValue
= val
;
2228 if (!val
) xmlIndentTreeOutput
= 1;