2 * testHTML.c : a small tester program for HTML input.
4 * See Copyright for the status of this software.
11 #ifdef LIBXML_HTML_ENABLED
17 #ifdef HAVE_SYS_TYPES_H
18 #include <sys/types.h>
20 #ifdef HAVE_SYS_STAT_H
#include <stdarg.h>

#include <libxml/xmlmemory.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/debugXML.h>
#include <libxml/xmlerror.h>
#include <libxml/globals.h>
40 #ifdef LIBXML_DEBUG_ENABLED
/* Number of --repeat flags seen; each one multiplies the parse count by 100. */
static int repeat = 0;
#ifdef LIBXML_PUSH_ENABLED
#endif /* LIBXML_PUSH_ENABLED */
/* Output encoding requested with --encode, or NULL when none was given. */
static char *encoding = NULL;
/* Parser option bits handed to htmlReadFile(). */
static int options = 0;
/*
 * SAX handler table in which every visible callback slot is NULL.
 * It is handed to the parser for the "empty callbacks" pass, so parsing
 * through it produces no callback output.
 */
53 static xmlSAXHandler emptySAXHandlerStruct
= {
54 NULL
, /* internalSubset */
55 NULL
, /* isStandalone */
56 NULL
, /* hasInternalSubset */
57 NULL
, /* hasExternalSubset */
58 NULL
, /* resolveEntity */
60 NULL
, /* entityDecl */
61 NULL
, /* notationDecl */
62 NULL
, /* attributeDecl */
63 NULL
, /* elementDecl */
64 NULL
, /* unparsedEntityDecl */
65 NULL
, /* setDocumentLocator */
66 NULL
, /* startDocument */
67 NULL
, /* endDocument */
68 NULL
, /* startElement */
69 NULL
, /* endElement */
71 NULL
, /* characters */
72 NULL
, /* ignorableWhitespace */
73 NULL
, /* processingInstruction */
75 NULL
, /* warning (xmlParserWarning) */
76 NULL
, /* error (xmlParserError) */
77 NULL
, /* fatalError -- NOTE(review): confirm slot order against xmlSAXHandler */
78 NULL
, /* getParameterEntity */
79 NULL
, /* cdataBlock */
80 NULL
, /* externalSubset */
83 NULL
, /* startElementNsSAX2Func */
84 NULL
, /* endElementNsSAX2Func */
85 NULL
/* xmlStructuredErrorFunc */
88 static xmlSAXHandlerPtr emptySAXHandler
= &emptySAXHandlerStruct
;
89 extern xmlSAXHandlerPtr debugSAXHandler
;
91 /************************************************************************
95 ************************************************************************/
99 * @ctxt: An XML parser context
101 * Is this document tagged standalone ?
106 isStandaloneDebug(void *ctx ATTRIBUTE_UNUSED
)
108 fprintf(stdout
, "SAX.isStandalone()\n");
113 * hasInternalSubsetDebug:
114 * @ctxt: An XML parser context
116 * Does this document has an internal subset
121 hasInternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED
)
123 fprintf(stdout
, "SAX.hasInternalSubset()\n");
128 * hasExternalSubsetDebug:
129 * @ctxt: An XML parser context
131 * Does this document has an external subset
136 hasExternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED
)
138 fprintf(stdout
, "SAX.hasExternalSubset()\n");
143 * hasInternalSubsetDebug:
144 * @ctxt: An XML parser context
146 * Does this document has an internal subset
149 internalSubsetDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
,
150 const xmlChar
*ExternalID
, const xmlChar
*SystemID
)
152 fprintf(stdout
, "SAX.internalSubset(%s,", name
);
153 if (ExternalID
== NULL
)
154 fprintf(stdout
, " ,");
156 fprintf(stdout
, " %s,", ExternalID
);
157 if (SystemID
== NULL
)
158 fprintf(stdout
, " )\n");
160 fprintf(stdout
, " %s)\n", SystemID
);
164 * resolveEntityDebug:
165 * @ctxt: An XML parser context
166 * @publicId: The public ID of the entity
167 * @systemId: The system ID of the entity
169 * Special entity resolver, better left to the parser, it has
170 * more context than the application layer.
171 * The default behaviour is to NOT resolve the entities, in that case
172 * the ENTITY_REF nodes are built in the structure (and the parameter
175 * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
177 static xmlParserInputPtr
178 resolveEntityDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*publicId
, const xmlChar
*systemId
)
180 /* xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx; */
183 fprintf(stdout
, "SAX.resolveEntity(");
184 if (publicId
!= NULL
)
185 fprintf(stdout
, "%s", (char *)publicId
);
187 fprintf(stdout
, " ");
188 if (systemId
!= NULL
)
189 fprintf(stdout
, ", %s)\n", (char *)systemId
);
191 fprintf(stdout
, ", )\n");
193 if (systemId != NULL) {
194 return(xmlNewInputFromFile(ctxt, (char *) systemId));
202 * @ctxt: An XML parser context
203 * @name: The entity name
205 * Get an entity by name
207 * Returns the xmlParserInputPtr if inlined or NULL for DOM behaviour.
210 getEntityDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
)
212 fprintf(stdout
, "SAX.getEntity(%s)\n", name
);
217 * getParameterEntityDebug:
218 * @ctxt: An XML parser context
219 * @name: The entity name
221 * Get a parameter entity by name
223 * Returns the xmlParserInputPtr
226 getParameterEntityDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
)
228 fprintf(stdout
, "SAX.getParameterEntity(%s)\n", name
);
235 * @ctxt: An XML parser context
236 * @name: the entity name
237 * @type: the entity type
238 * @publicId: The public ID of the entity
239 * @systemId: The system ID of the entity
240 * @content: the entity value (without processing).
242 * An entity definition has been parsed
245 entityDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
, int type
,
246 const xmlChar
*publicId
, const xmlChar
*systemId
, xmlChar
*content
)
248 fprintf(stdout
, "SAX.entityDecl(%s, %d, %s, %s, %s)\n",
249 name
, type
, publicId
, systemId
, content
);
253 * attributeDeclDebug:
254 * @ctxt: An XML parser context
255 * @name: the attribute name
256 * @type: the attribute type
258 * An attribute definition has been parsed
261 attributeDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*elem
, const xmlChar
*name
,
262 int type
, int def
, const xmlChar
*defaultValue
,
263 xmlEnumerationPtr tree ATTRIBUTE_UNUSED
)
265 fprintf(stdout
, "SAX.attributeDecl(%s, %s, %d, %d, %s, ...)\n",
266 elem
, name
, type
, def
, defaultValue
);
271 * @ctxt: An XML parser context
272 * @name: the element name
273 * @type: the element type
274 * @content: the element value (without processing).
276 * An element definition has been parsed
279 elementDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
, int type
,
280 xmlElementContentPtr content ATTRIBUTE_UNUSED
)
282 fprintf(stdout
, "SAX.elementDecl(%s, %d, ...)\n",
288 * @ctxt: An XML parser context
289 * @name: The name of the notation
290 * @publicId: The public ID of the entity
291 * @systemId: The system ID of the entity
293 * What to do when a notation declaration has been parsed.
296 notationDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
,
297 const xmlChar
*publicId
, const xmlChar
*systemId
)
299 fprintf(stdout
, "SAX.notationDecl(%s, %s, %s)\n",
300 (char *) name
, (char *) publicId
, (char *) systemId
);
304 * unparsedEntityDeclDebug:
305 * @ctxt: An XML parser context
306 * @name: The name of the entity
307 * @publicId: The public ID of the entity
308 * @systemId: The system ID of the entity
309 * @notationName: the name of the notation
311 * What to do when an unparsed entity declaration is parsed
314 unparsedEntityDeclDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
,
315 const xmlChar
*publicId
, const xmlChar
*systemId
,
316 const xmlChar
*notationName
)
318 fprintf(stdout
, "SAX.unparsedEntityDecl(%s, %s, %s, %s)\n",
319 (char *) name
, (char *) publicId
, (char *) systemId
,
320 (char *) notationName
);
324 * setDocumentLocatorDebug:
325 * @ctxt: An XML parser context
326 * @loc: A SAX Locator
328 * Receive the document locator at startup, actually xmlDefaultSAXLocator
329 * Everything is available on the context, so this is useless in our case.
332 setDocumentLocatorDebug(void *ctx ATTRIBUTE_UNUSED
, xmlSAXLocatorPtr loc ATTRIBUTE_UNUSED
)
334 fprintf(stdout
, "SAX.setDocumentLocator()\n");
338 * startDocumentDebug:
339 * @ctxt: An XML parser context
341 * called when the document start being processed.
344 startDocumentDebug(void *ctx ATTRIBUTE_UNUSED
)
346 fprintf(stdout
, "SAX.startDocument()\n");
351 * @ctxt: An XML parser context
353 * called when the document end has been detected.
356 endDocumentDebug(void *ctx ATTRIBUTE_UNUSED
)
358 fprintf(stdout
, "SAX.endDocument()\n");
363 * @ctxt: An XML parser context
364 * @name: The element name
366 * called when an opening tag has been processed.
369 startElementDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
, const xmlChar
**atts
)
373 fprintf(stdout
, "SAX.startElement(%s", (char *) name
);
375 for (i
= 0;(atts
[i
] != NULL
);i
++) {
376 fprintf(stdout
, ", %s", atts
[i
++]);
377 if (atts
[i
] != NULL
) {
378 unsigned char output
[40];
379 const unsigned char *att
= atts
[i
];
381 fprintf(stdout
, "='");
382 while ((attlen
= strlen((char*)att
)) > 0) {
383 outlen
= sizeof output
- 1;
384 htmlEncodeEntities(output
, &outlen
, att
, &attlen
, '\'');
386 fprintf(stdout
, "%s", (char *) output
);
389 fprintf(stdout
, "'");
393 fprintf(stdout
, ")\n");
398 * @ctxt: An XML parser context
399 * @name: The element name
401 * called when the end of an element has been detected.
404 endElementDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
)
406 fprintf(stdout
, "SAX.endElement(%s)\n", (char *) name
);
411 * @ctxt: An XML parser context
412 * @ch: a xmlChar string
413 * @len: the number of xmlChar
415 * receiving some chars from the parser.
416 * Question: how much at a time ???
419 charactersDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*ch
, int len
)
421 unsigned char output
[40];
422 int inlen
= len
, outlen
= 30;
424 htmlEncodeEntities(output
, &outlen
, ch
, &inlen
, 0);
427 fprintf(stdout
, "SAX.characters(%s, %d)\n", output
, len
);
432 * @ctxt: An XML parser context
433 * @ch: a xmlChar string
434 * @len: the number of xmlChar
436 * receiving some cdata chars from the parser.
437 * Question: how much at a time ???
440 cdataDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*ch
, int len
)
442 unsigned char output
[40];
443 int inlen
= len
, outlen
= 30;
445 htmlEncodeEntities(output
, &outlen
, ch
, &inlen
, 0);
448 fprintf(stdout
, "SAX.cdata(%s, %d)\n", output
, len
);
453 * @ctxt: An XML parser context
454 * @name: The entity name
456 * called when an entity reference is detected.
459 referenceDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*name
)
461 fprintf(stdout
, "SAX.reference(%s)\n", name
);
465 * ignorableWhitespaceDebug:
466 * @ctxt: An XML parser context
467 * @ch: a xmlChar string
468 * @start: the first char in the string
469 * @len: the number of xmlChar
471 * receiving some ignorable whitespaces from the parser.
472 * Question: how much at a time ???
475 ignorableWhitespaceDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*ch
, int len
)
480 for (i
= 0;(i
<len
) && (i
< 30);i
++)
484 fprintf(stdout
, "SAX.ignorableWhitespace(%s, %d)\n", output
, len
);
488 * processingInstructionDebug:
489 * @ctxt: An XML parser context
490 * @target: the target name
491 * @data: the PI data's
492 * @len: the number of xmlChar
494 * A processing instruction has been parsed.
497 processingInstructionDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*target
,
500 fprintf(stdout
, "SAX.processingInstruction(%s, %s)\n",
501 (char *) target
, (char *) data
);
506 * @ctxt: An XML parser context
507 * @value: the comment content
509 * A comment has been parsed.
512 commentDebug(void *ctx ATTRIBUTE_UNUSED
, const xmlChar
*value
)
514 fprintf(stdout
, "SAX.comment(%s)\n", value
);
519 * @ctxt: An XML parser context
520 * @msg: the message to display/transmit
521 * @...: extra parameters for the message display
523 * Display and format a warning messages, gives file, line, position and
527 warningDebug(void *ctx ATTRIBUTE_UNUSED
, const char *msg
, ...)
532 fprintf(stdout
, "SAX.warning: ");
533 vfprintf(stdout
, msg
, args
);
539 * @ctxt: An XML parser context
540 * @msg: the message to display/transmit
541 * @...: extra parameters for the message display
543 * Display and format a error messages, gives file, line, position and
547 errorDebug(void *ctx ATTRIBUTE_UNUSED
, const char *msg
, ...)
552 fprintf(stdout
, "SAX.error: ");
553 vfprintf(stdout
, msg
, args
);
559 * @ctxt: An XML parser context
560 * @msg: the message to display/transmit
561 * @...: extra parameters for the message display
563 * Display and format a fatalError messages, gives file, line, position and
567 fatalErrorDebug(void *ctx ATTRIBUTE_UNUSED
, const char *msg
, ...)
572 fprintf(stdout
, "SAX.fatalError: ");
573 vfprintf(stdout
, msg
, args
);
/*
 * SAX handler table wired to the *Debug callbacks, each of which logs
 * its event to stdout.
 * NOTE(review): only some of the slots are present here; confirm the
 * full initializer order against the xmlSAXHandler definition.
 */
577 static xmlSAXHandler debugSAXHandlerStruct
= {
580 hasInternalSubsetDebug
,
581 hasExternalSubsetDebug
,
588 unparsedEntityDeclDebug
,
589 setDocumentLocatorDebug
,
596 ignorableWhitespaceDebug
,
597 processingInstructionDebug
,
602 getParameterEntityDebug
,
612 xmlSAXHandlerPtr debugSAXHandler
= &debugSAXHandlerStruct
;
613 /************************************************************************
617 ************************************************************************/
620 parseSAXFile(char *filename
) {
621 htmlDocPtr doc
= NULL
;
624 * Empty callbacks for checking
626 #ifdef LIBXML_PUSH_ENABLED
630 #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
631 f
= fopen(filename
, "rb");
633 f
= fopen(filename
, "r");
638 htmlParserCtxtPtr ctxt
;
642 res
= fread(chars
, 1, 4, f
);
644 ctxt
= htmlCreatePushParserCtxt(emptySAXHandler
, NULL
,
645 chars
, res
, filename
, XML_CHAR_ENCODING_NONE
);
646 while ((res
= fread(chars
, 1, size
, f
)) > 0) {
647 htmlParseChunk(ctxt
, chars
, res
, 0);
649 htmlParseChunk(ctxt
, chars
, 0, 1);
651 htmlFreeParserCtxt(ctxt
);
654 fprintf(stdout
, "htmlSAXParseFile returned non-NULL\n");
660 #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
661 f
= fopen(filename
, "rb");
663 f
= fopen(filename
, "r");
668 htmlParserCtxtPtr ctxt
;
672 res
= fread(chars
, 1, 4, f
);
674 ctxt
= htmlCreatePushParserCtxt(debugSAXHandler
, NULL
,
675 chars
, res
, filename
, XML_CHAR_ENCODING_NONE
);
676 while ((res
= fread(chars
, 1, size
, f
)) > 0) {
677 htmlParseChunk(ctxt
, chars
, res
, 0);
679 htmlParseChunk(ctxt
, chars
, 0, 1);
681 htmlFreeParserCtxt(ctxt
);
684 fprintf(stdout
, "htmlSAXParseFile returned non-NULL\n");
691 #endif /* LIBXML_PUSH_ENABLED */
692 doc
= htmlSAXParseFile(filename
, NULL
, emptySAXHandler
, NULL
);
694 fprintf(stdout
, "htmlSAXParseFile returned non-NULL\n");
702 doc
= htmlSAXParseFile(filename
, NULL
, debugSAXHandler
, NULL
);
704 fprintf(stdout
, "htmlSAXParseFile returned non-NULL\n");
708 #ifdef LIBXML_PUSH_ENABLED
710 #endif /* LIBXML_PUSH_ENABLED */
714 parseAndPrintFile(char *filename
) {
715 htmlDocPtr doc
= NULL
;
718 * build an HTML tree from the given file;
720 #ifdef LIBXML_PUSH_ENABLED
724 #if defined(_WIN32) || defined (__DJGPP__) && !defined (__CYGWIN__)
725 f
= fopen(filename
, "rb");
727 f
= fopen(filename
, "r");
732 htmlParserCtxtPtr ctxt
;
736 res
= fread(chars
, 1, 4, f
);
738 ctxt
= htmlCreatePushParserCtxt(NULL
, NULL
,
739 chars
, res
, filename
, XML_CHAR_ENCODING_NONE
);
740 while ((res
= fread(chars
, 1, size
, f
)) > 0) {
741 htmlParseChunk(ctxt
, chars
, res
, 0);
743 htmlParseChunk(ctxt
, chars
, 0, 1);
745 htmlFreeParserCtxt(ctxt
);
750 doc
= htmlReadFile(filename
, NULL
, options
);
753 doc
= htmlReadFile(filename
,NULL
,options
);
756 xmlGenericError(xmlGenericErrorContext
,
757 "Could not parse %s\n", filename
);
760 #ifdef LIBXML_TREE_ENABLED
762 * test intermediate copy if needed.
768 doc
= xmlCopyDoc(doc
, 1);
773 #ifdef LIBXML_OUTPUT_ENABLED
778 #ifdef LIBXML_DEBUG_ENABLED
781 htmlSaveFileEnc("-", doc
, encoding
);
783 htmlDocDump(stdout
, doc
);
785 xmlDebugDumpDocument(stdout
, doc
);
788 htmlSaveFileEnc("-", doc
, encoding
);
790 htmlDocDump(stdout
, doc
);
793 #endif /* LIBXML_OUTPUT_ENABLED */
801 int main(int argc
, char **argv
) {
805 for (i
= 1; i
< argc
; i
++) {
806 #ifdef LIBXML_DEBUG_ENABLED
807 if ((!strcmp(argv
[i
], "-debug")) || (!strcmp(argv
[i
], "--debug")))
811 if ((!strcmp(argv
[i
], "-copy")) || (!strcmp(argv
[i
], "--copy")))
813 #ifdef LIBXML_PUSH_ENABLED
814 else if ((!strcmp(argv
[i
], "-push")) || (!strcmp(argv
[i
], "--push")))
816 #endif /* LIBXML_PUSH_ENABLED */
817 else if ((!strcmp(argv
[i
], "-sax")) || (!strcmp(argv
[i
], "--sax")))
819 else if ((!strcmp(argv
[i
], "-noout")) || (!strcmp(argv
[i
], "--noout")))
821 else if ((!strcmp(argv
[i
], "-repeat")) ||
822 (!strcmp(argv
[i
], "--repeat")))
824 else if ((!strcmp(argv
[i
], "-encode")) ||
825 (!strcmp(argv
[i
], "--encode"))) {
830 for (i
= 1; i
< argc
; i
++) {
831 if ((!strcmp(argv
[i
], "-encode")) ||
832 (!strcmp(argv
[i
], "--encode"))) {
836 if (argv
[i
][0] != '-') {
838 for (count
= 0;count
< 100 * repeat
;count
++) {
840 parseSAXFile(argv
[i
]);
842 parseAndPrintFile(argv
[i
]);
846 parseSAXFile(argv
[i
]);
848 parseAndPrintFile(argv
[i
]);
854 printf("Usage : %s [--debug] [--copy] [--copy] HTMLfiles ...\n",
856 printf("\tParse the HTML files and output the result of the parsing\n");
857 #ifdef LIBXML_DEBUG_ENABLED
858 printf("\t--debug : dump a debug tree of the in-memory document\n");
860 printf("\t--copy : used to test the internal copy implementation\n");
861 printf("\t--sax : debug the sequence of SAX callbacks\n");
862 printf("\t--repeat : parse the file 100 times, for timing\n");
863 printf("\t--noout : do not print the result\n");
864 #ifdef LIBXML_PUSH_ENABLED
865 printf("\t--push : use the push mode parser\n");
866 #endif /* LIBXML_PUSH_ENABLED */
867 printf("\t--encode encoding : output in the given encoding\n");
874 #else /* !LIBXML_HTML_ENABLED */
876 int main(int argc ATTRIBUTE_UNUSED
, char **argv ATTRIBUTE_UNUSED
) {
877 printf("%s : HTML support not compiled in\n", argv
[0]);