1 /* StreamSerializer.java --
2 Copyright (C) 2004,2006 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
38 package gnu
.xml
.transform
;
40 import java
.io
.ByteArrayOutputStream
;
41 import java
.io
.IOException
;
42 import java
.io
.OutputStream
;
43 import java
.io
.UnsupportedEncodingException
;
44 import java
.nio
.ByteBuffer
;
45 import java
.nio
.CharBuffer
;
46 import java
.nio
.charset
.Charset
;
47 import java
.nio
.charset
.CharsetEncoder
;
48 import java
.util
.Collection
;
49 import java
.util
.Collections
;
50 import java
.util
.HashMap
;
51 import java
.util
.HashSet
;
52 import java
.util
.Iterator
;
53 import java
.util
.LinkedList
;
55 import javax
.xml
.XMLConstants
;
56 import org
.w3c
.dom
.Attr
;
57 import org
.w3c
.dom
.Document
;
58 import org
.w3c
.dom
.DocumentType
;
59 import org
.w3c
.dom
.NamedNodeMap
;
60 import org
.w3c
.dom
.Node
;
63 * Serializes a DOM node to an output stream.
65 * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
67 public class StreamSerializer
70 static final int SPACE
= 0x20;
71 static final int BANG
= 0x21; // !
72 static final int APOS
= 0x27; // '
73 static final int SLASH
= 0x2f; // /
74 static final int BRA
= 0x3c; // <
75 static final int KET
= 0x3e; // >
76 static final int EQ
= 0x3d; // =
// Lookup table consulted by isHTMLBoolean: each key is a lower-case HTML
// element name and each value is the collection of attribute names that are
// boolean attributes on that element.
// NOTE(review): the statements that create and fill each `set` are not
// present in this view, so the attribute names cannot be confirmed from here.
79 * HTML 4.01 boolean attributes
81 static final Map HTML_BOOLEAN_ATTRIBUTES
= new HashMap();
88 HTML_BOOLEAN_ATTRIBUTES
.put("area", set
);
92 HTML_BOOLEAN_ATTRIBUTES
.put("img", set
);
96 HTML_BOOLEAN_ATTRIBUTES
.put("object", set
);
100 HTML_BOOLEAN_ATTRIBUTES
.put("hr", set
);
// The five list-like elements below are registered back to back with the
// `set` variable; no reassignment of `set` is visible between them.
104 HTML_BOOLEAN_ATTRIBUTES
.put("dl", set
);
105 HTML_BOOLEAN_ATTRIBUTES
.put("ol", set
);
106 HTML_BOOLEAN_ATTRIBUTES
.put("ul", set
);
107 HTML_BOOLEAN_ATTRIBUTES
.put("dir", set
);
108 HTML_BOOLEAN_ATTRIBUTES
.put("menu", set
);
115 HTML_BOOLEAN_ATTRIBUTES
.put("input", set
);
120 HTML_BOOLEAN_ATTRIBUTES
.put("select", set
);
124 HTML_BOOLEAN_ATTRIBUTES
.put("optgroup", set
);
129 HTML_BOOLEAN_ATTRIBUTES
.put("option", set
);
134 HTML_BOOLEAN_ATTRIBUTES
.put("textarea", set
);
138 HTML_BOOLEAN_ATTRIBUTES
.put("button", set
);
// Table cells: "th" and "td" are registered back to back in the same way.
142 HTML_BOOLEAN_ATTRIBUTES
.put("th", set
);
143 HTML_BOOLEAN_ATTRIBUTES
.put("td", set
);
147 HTML_BOOLEAN_ATTRIBUTES
.put("frame", set
);
151 HTML_BOOLEAN_ATTRIBUTES
.put("script", set
);
154 // HTML namespace URIs
155 static final HashSet HTML_URIS
= new HashSet();
157 HTML_URIS
.add("http://www.w3.org/1999/xhtml");
// Name of the output character encoding; the three-argument constructor
// stores the interned name here.
160 protected final String encoding
;
// Charset looked up from the encoding name.
161 final Charset charset
;
// Encoder created from the charset; encodeText uses it both to test whether
// characters are representable and to produce the output bytes.
162 final CharsetEncoder encoder
;
// Stack of namespace scopes: pushNamespaceContext adds a new HashMap at the
// front and popNamespaceContext removes the front entry.
164 final LinkedList namespaces
;
// Line terminator written after the XML declaration, the doctype and
// top-level comments and processing instructions.
165 protected String eol
;
// Element names whose child text is written as CDATA sections; doSerialize
// tests membership with contains(), so this is expected to be non-null.
166 Collection cdataSectionElements
= Collections
.EMPTY_SET
;
// When true, attributes that were not explicitly specified are tested for
// skipping in doSerialize.
168 protected boolean discardDefaultContent
;
// NOTE(review): not read in the lines visible in this view — presumably
// controls whether the XML declaration is written; confirm against the full
// file.
169 protected boolean xmlDeclaration
= true;
171 // has a META element with the encoding been added?
172 private boolean htmlEncoded
;
/**
 * Creates a serializer in XML output mode, leaving the encoding and the
 * line terminator to the defaults chosen by the three-argument constructor.
 */
public StreamSerializer()
{
  this(Stylesheet.OUTPUT_XML, null, null);
}
/**
 * Creates a serializer in XML output mode that writes in the given
 * encoding, leaving the line terminator to the default chosen by the
 * three-argument constructor.
 *
 * @param encoding the name of the output encoding, or null for the default
 */
public StreamSerializer(String encoding)
{
  this(Stylesheet.OUTPUT_XML, encoding, null);
}
// Creates a serializer for the given output mode, encoding name and line
// terminator.
// NOTE(review): no assignment of the `mode` argument to a field is visible in
// these lines although other methods read `mode`; confirm against the full
// file.
184 public StreamSerializer(int mode
, String encoding
, String eol
)
// A null encoding defaults to ISO-8859-1 for HTML output and UTF-8 otherwise.
187 if (encoding
== null)
188 encoding
= (mode
== Stylesheet
.OUTPUT_HTML
) ?
"ISO-8859-1" : "UTF-8";
// The interned name is stored, then the charset and its encoder are derived
// from it; Charset.forName rejects unknown or illegal names.
189 this.encoding
= encoding
.intern();
190 charset
= Charset
.forName(this.encoding
);
191 encoder
= charset
.newEncoder();
// A null eol falls back to the platform line separator.
192 this.eol
= (eol
!= null) ? eol
: System
.getProperty("line.separator");
// The namespace scope stack starts out empty.
193 namespaces
= new LinkedList();
196 void setCdataSectionElements(Collection c
)
198 cdataSectionElements
= c
;
// Serializes the given node to the stream by delegating to the
// three-argument overload with CDATA conversion switched off.
201 public void serialize(final Node node
, final OutputStream out
)
204 serialize(node
, out
, false);
// Serializes a node through doSerialize, passing the CDATA conversion flag
// along. The node's next sibling is fetched before the node is written.
// NOTE(review): `next` is not used in the lines visible here; presumably a
// loop over siblings surrounds these statements — confirm against the full
// file.
207 void serialize(Node node
, final OutputStream out
,
208 boolean convertToCdata
)
213 Node next
= node
.getNextSibling();
214 doSerialize(node
, out
, convertToCdata
);
// Writes one node to the stream, choosing the output form from the node type
// (attribute, element, text, CDATA section, comment, document or fragment,
// doctype, entity reference, processing instruction); any other type is
// reported on System.err.
// NOTE(review): many lines of this method (braces, the switch header and
// several statements) are not present in this view; the comments below
// describe only what the visible lines do.
219 private void doSerialize(final Node node
, final OutputStream out
,
220 boolean convertToCdata
)
224 throw new NullPointerException("no output stream");
226 String value
, prefix
;
228 String uri
= node
.getNamespaceURI();
229 short nt
= node
.getNodeType();
// A text node is handled as a CDATA section when the caller requested
// conversion.
230 if (convertToCdata
&& nt
== Node
.TEXT_NODE
)
231 nt
= Node
.CDATA_SECTION_NODE
;
// Attribute: first recognise namespace declarations (xmlns namespace URI,
// "xmlns" prefix, or a prefix starting with "xmlns:").
234 case Node
.ATTRIBUTE_NODE
:
235 prefix
= node
.getPrefix();
236 if (XMLConstants
.XMLNS_ATTRIBUTE_NS_URI
.equals(uri
) ||
237 XMLConstants
.XMLNS_ATTRIBUTE
.equals(prefix
) ||
238 (prefix
!= null && prefix
.startsWith("xmlns:")))
// For a declaration, the attribute value is the namespace URI being bound.
240 String nsuri
= node
.getNodeValue();
241 if (isDefined(nsuri
, prefix
))
243 String name
= node
.getLocalName();
// The part of the qualified name after the colon is used as the name.
247 name
= node
.getNodeName();
248 int ci
= name
.indexOf(':');
250 name
= name
.substring(ci
+ 1);
// Ordinary attribute in a namespace that is not yet bound to its prefix:
// register the binding and write a matching xmlns declaration.
254 else if (uri
!= null && !isDefined(uri
, prefix
))
256 prefix
= define(uri
, prefix
);
257 String nsname
= (prefix
== null) ?
"xmlns" : "xmlns:" + prefix
;
259 out
.write(encodeText(nsname
));
261 String nsvalue
= "\"" + encode(uri
, true, true) + "\"";
262 out
.write(nsvalue
.getBytes(encoding
));
// Write the attribute name, then its quoted and escaped value.
265 String a_nodeName
= node
.getNodeName();
266 out
.write(encodeText(a_nodeName
));
267 String a_nodeValue
= node
.getNodeValue();
// HTML output: an attribute whose value equals its own name and which is a
// known boolean attribute of its element is tested for here.
268 if (mode
== Stylesheet
.OUTPUT_HTML
&&
269 a_nodeName
.equals(a_nodeValue
) &&
270 isHTMLBoolean((Attr
) node
, a_nodeName
))
273 value
= "\"" + encode(a_nodeValue
, true, true) + "\"";
274 out
.write(encodeText(value
));
// Element: open a namespace scope, write the name, declare the element's
// namespace if it is not yet bound, then write attributes and children.
276 case Node
.ELEMENT_NODE
:
277 pushNamespaceContext();
278 value
= node
.getNodeName();
280 out
.write(encodeText(value
));
281 prefix
= node
.getPrefix();
282 if (uri
!= null && !isDefined(uri
, prefix
))
284 prefix
= define(uri
, prefix
);
285 String nsname
= (prefix
== null) ?
"xmlns" : "xmlns:" + prefix
;
287 out
.write(encodeText(nsname
));
289 String nsvalue
= "\"" + encode(uri
, true, true) + "\"";
290 out
.write(encodeText(nsvalue
));
292 NamedNodeMap attrs
= node
.getAttributes();
295 int len
= attrs
.getLength();
296 for (int i
= 0; i
< len
; i
++)
298 Attr attr
= (Attr
) attrs
.item(i
);
// Attributes that were not explicitly specified are tested for when
// discardDefaultContent is set.
299 if (discardDefaultContent
&& !attr
.getSpecified())
304 serialize(attr
, out
, false);
// Text children are converted to CDATA when this element's name is listed
// in cdataSectionElements.
307 convertToCdata
= cdataSectionElements
.contains(value
);
308 children
= node
.getFirstChild();
309 if (children
== null)
317 serialize(children
, out
, convertToCdata
);
320 out
.write(encodeText(value
));
// Leave the namespace scope opened for this element.
323 popNamespaceContext();
// Text: the value is escaped unless the node carries the user data
// "disable-output-escaping" with the value "yes".
326 value
= node
.getNodeValue();
327 if (!"yes".equals(node
.getUserData("disable-output-escaping")))
328 value
= encode(value
, false, false);
329 out
.write(encodeText(value
));
331 case Node
.CDATA_SECTION_NODE
:
332 value
= node
.getNodeValue();
333 // Where any instance of ]]> occurs, split into multiple CDATA sections
335 int bbk
= value
.indexOf("]]>");
// Each piece ends just after the "]]" so that the terminating ">" starts
// the next section.
338 String head
= value
.substring(0, bbk
+ 2);
339 out
.write(encodeText("<![CDATA[" + head
+ "]]>"));
340 value
= value
.substring(bbk
+ 2);
341 bbk
= value
.indexOf("]]>");
343 // Write final tail value
344 out
.write(encodeText("<![CDATA[" + value
+ "]]>"));
// Comment: a comment that is a direct child of the document is followed by
// the line terminator.
346 case Node
.COMMENT_NODE
:
347 value
= "<!--" + node
.getNodeValue() + "-->";
348 out
.write(encodeText(value
));
349 Node cp
= node
.getParentNode();
350 if (cp
!= null && cp
.getNodeType() == Node
.DOCUMENT_NODE
)
351 out
.write(encodeText(eol
));
// Document or fragment: in XML mode an XML declaration may be written; in
// HTML mode a Content-Type META element is maintained; then the children are
// serialized.
353 case Node
.DOCUMENT_NODE
:
354 case Node
.DOCUMENT_FRAGMENT_NODE
:
355 if (mode
== Stylesheet
.OUTPUT_XML
)
357 if ("UTF-16".equalsIgnoreCase(encoding
))
// The declaration is subject to the "omit-xml-declaration" user data not
// being "yes".
362 if (!"yes".equals(node
.getUserData("omit-xml-declaration")) &&
365 Document doc
= (node
instanceof Document
) ?
366 (Document
) node
: null;
// The version comes from the document when available, or from the "version"
// user data.
367 String version
= (doc
!= null) ? doc
.getXmlVersion() : null;
369 version
= (String
) node
.getUserData("version");
374 out
.write("xml version=\"".getBytes("US-ASCII"));
375 out
.write(version
.getBytes("US-ASCII"));
// The encoding pseudo-attribute is written only for encodings other than
// UTF-8.
377 if (!("UTF-8".equalsIgnoreCase(encoding
)))
379 out
.write(" encoding=\"".getBytes("US-ASCII"));
380 out
.write(encoding
.getBytes("US-ASCII"));
383 if ((doc
!= null && doc
.getXmlStandalone()) ||
384 "yes".equals(node
.getUserData("standalone")))
385 out
.write(" standalone=\"yes\"".getBytes("US-ASCII"));
388 out
.write(encodeText(eol
));
390 // TODO warn if not outputting the declaration would be a
393 else if (mode
== Stylesheet
.OUTPUT_HTML
)
395 // Ensure that encoding is accessible if head element is present
// The media type defaults to text/html; an encoding name containing a space
// is quoted inside the charset parameter.
396 String mediaType
= (String
) node
.getUserData("media-type");
397 if (mediaType
== null)
398 mediaType
= "text/html";
399 String contentType
= mediaType
+ "; charset=" +
400 ((encoding
.indexOf(' ') != -1) ?
401 "\"" + encoding
+ "\"" :
403 Document doc
= (node
instanceof Document
) ?
(Document
) node
:
404 node
.getOwnerDocument();
// Look for the html element among the node's children.
406 for (Node ctx
= node
.getFirstChild(); ctx
!= null;
407 ctx
= ctx
.getNextSibling())
409 if (ctx
.getNodeType() == Node
.ELEMENT_NODE
&&
410 isHTMLElement(ctx
, "html"))
// Look for the head element among the children of html.
419 for (Node ctx
= html
.getFirstChild(); ctx
!= null;
420 ctx
= ctx
.getNextSibling())
422 if (isHTMLElement(ctx
, "head"))
// Scan the meta elements in head for one whose http-equiv attribute is
// Content-Type, remembering its content attribute.
431 Node metaContent
= null;
432 for (Node ctx
= head
.getFirstChild(); ctx
!= null;
433 ctx
= ctx
.getNextSibling())
435 if (isHTMLElement(ctx
, "meta"))
437 NamedNodeMap metaAttrs
= ctx
.getAttributes();
438 int len
= metaAttrs
.getLength();
439 String httpEquiv
= null;
441 for (int i
= 0; i
< len
; i
++)
443 Node attr
= metaAttrs
.item(i
);
444 String attrName
= attr
.getNodeName();
445 if ("http-equiv".equalsIgnoreCase(attrName
))
446 httpEquiv
= attr
.getNodeValue();
447 else if ("content".equalsIgnoreCase(attrName
))
450 if ("Content-Type".equalsIgnoreCase(httpEquiv
))
453 metaContent
= content
;
// A new meta element is created, appended to head or inserted before the
// existing first child, and given an http-equiv="Content-Type" attribute.
460 meta
= doc
.createElement("meta");
462 Node first
= head
.getFirstChild();
464 head
.appendChild(meta
);
466 head
.insertBefore(meta
, first
);
467 Node metaHttpEquiv
= doc
.createAttribute("http-equiv");
468 meta
.getAttributes().setNamedItem(metaHttpEquiv
);
469 metaHttpEquiv
.setNodeValue("Content-Type");
// A content attribute is created when none was found, then set to the
// computed content type.
471 if (metaContent
== null)
473 metaContent
= doc
.createAttribute("content");
474 meta
.getAttributes().setNamedItem(metaContent
);
476 metaContent
.setNodeValue(contentType
);
481 children
= node
.getFirstChild();
482 if (children
!= null)
483 serialize(children
, out
, convertToCdata
);
// Doctype: name, optional PUBLIC and SYSTEM identifiers, optional internal
// subset, followed by the line terminator.
485 case Node
.DOCUMENT_TYPE_NODE
:
486 DocumentType doctype
= (DocumentType
) node
;
489 out
.write(encodeText("DOCTYPE "));
490 value
= doctype
.getNodeName();
491 out
.write(encodeText(value
));
492 String publicId
= doctype
.getPublicId();
493 if (publicId
!= null)
495 out
.write(encodeText(" PUBLIC "));
497 out
.write(encodeText(publicId
));
500 String systemId
= doctype
.getSystemId();
501 if (systemId
!= null)
503 out
.write(encodeText(" SYSTEM "));
505 out
.write(encodeText(systemId
));
508 String internalSubset
= doctype
.getInternalSubset();
509 if (internalSubset
!= null)
511 out
.write(encodeText(internalSubset
));
514 out
.write(eol
.getBytes(encoding
));
// Entity reference: written as &value;.
516 case Node
.ENTITY_REFERENCE_NODE
:
517 value
= "&" + node
.getNodeValue() + ";";
518 out
.write(encodeText(value
));
// Processing instruction: written as <?target data?>; one that is a direct
// child of the document is followed by the line terminator.
520 case Node
.PROCESSING_INSTRUCTION_NODE
:
521 value
= "<?" + node
.getNodeName() + " " + node
.getNodeValue() + "?>";
522 out
.write(encodeText(value
));
523 Node pp
= node
.getParentNode();
524 if (pp
!= null && pp
.getNodeType() == Node
.DOCUMENT_NODE
)
526 out
.write(encodeText(eol
));
// Any other node type is only reported, not written.
530 System
.err
.println("Unhandled node type: "+nt
);
534 boolean isHTMLElement(Node node
, String name
)
536 if (node
.getNodeType() != Node
.ELEMENT_NODE
)
538 String localName
= node
.getLocalName();
539 if (localName
== null)
540 localName
= node
.getNodeName();
541 if (!name
.equalsIgnoreCase(localName
))
543 String uri
= node
.getNamespaceURI();
544 return (uri
== null || HTML_URIS
.contains(uri
));
// Tests whether the given namespace URI is already bound to the given prefix.
// NOTE(review): some statements and the return statements after the loop
// test are not present in this view.
547 boolean isDefined(String uri
, String prefix
)
// The reserved XML namespace counts as bound only to the prefix "xml", and
// the xmlns namespace only to "xmlns".
549 if (XMLConstants
.XML_NS_URI
.equals(uri
))
550 return "xml".equals(prefix
);
551 if (XMLConstants
.XMLNS_ATTRIBUTE_NS_URI
.equals(uri
))
552 return "xmlns".equals(prefix
);
// Otherwise every scope on the namespaces stack is searched for an entry
// mapping the URI to a value equal to the prefix.
555 for (Iterator i
= namespaces
.iterator(); i
.hasNext(); )
557 Map ctx
= (Map
) i
.next();
558 String val
= (String
) ctx
.get(uri
);
559 if (val
!= null && val
.equals(prefix
))
565 void pushNamespaceContext()
567 namespaces
.addFirst(new HashMap());
570 String
define(String uri
, String prefix
)
572 if (namespaces
.isEmpty())
574 HashMap ctx
= (HashMap
) namespaces
.getFirst();
575 while (ctx
.containsValue(prefix
))
577 // Fabricate new prefix
578 prefix
= prefix
+ "_";
580 ctx
.put(uri
, prefix
);
584 void popNamespaceContext()
586 namespaces
.removeFirst();
// Converts text to bytes with this serializer's charset encoder.
// NOTE(review): many lines of this method (braces, the appends that build
// the character references, the return statements) are not present in this
// view; the comments below describe only what the visible lines do.
589 final byte[] encodeText(String text
)
// In HTML mode, named entities are considered for as long as htmlEncoded is
// false.
593 boolean htmlNeedingEncoding
=
594 (mode
== Stylesheet
.OUTPUT_HTML
&& !htmlEncoded
);
// The per-character path is taken when the encoder cannot represent the
// whole text, or when HTML entity substitution is required.
595 if (!encoder
.canEncode(text
) || htmlNeedingEncoding
)
597 // Check each character
598 StringBuffer buf
= new StringBuffer();
599 int len
= text
.length();
600 for (int i
= 0; i
< len
; i
++)
602 char c
= text
.charAt(i
);
603 if (!encoder
.canEncode(c
))
605 // Replace with character entity reference
606 String hex
= Integer
.toHexString((int) c
);
// Encodable characters that have a named HTML entity get the entity name
// appended.
611 else if (htmlNeedingEncoding
)
613 String entityName
= getHTMLCharacterEntity(c
);
614 if (entityName
!= null)
617 buf
.append(entityName
);
626 text
= buf
.toString();
// Encode the (possibly rewritten) text; len is the number of bytes produced.
628 ByteBuffer encoded
= encoder
.encode(CharBuffer
.wrap(text
));
629 int len
= encoded
.limit() - encoded
.position();
// With a backing array available, a trimmed copy of exactly len bytes is
// made when the array is longer than the encoded content.
630 if (encoded
.hasArray())
632 byte[] ret
= encoded
.array();
633 if (ret
.length
> len
)
636 byte[] ret2
= new byte[len
];
637 System
.arraycopy(ret
, 0, ret2
, 0, len
);
// Without a backing array the bytes are read out of the buffer.
643 byte[] ret
= new byte[len
];
644 encoded
.get(ret
, 0, len
);
// Escapes markup-significant characters in text. A StringBuffer holding the
// text seen so far is created only when the first character needing
// replacement is met; if none is met the original string is returned
// unchanged.
// NOTE(review): most of the branch conditions and appended replacement
// strings are not present in this view. The literals that are visible in
// the quote branches look like entity references that were decoded during
// extraction (for example a bare double quote inside a string literal);
// verify them against the original file.
648 String
encode(String text
, boolean encodeCtl
, boolean inAttr
)
650 int len
= text
.length();
651 StringBuffer buf
= null;
652 for (int i
= 0; i
< len
; i
++)
654 char c
= text
.charAt(i
);
658 buf
= new StringBuffer(text
.substring(0, i
));
664 buf
= new StringBuffer(text
.substring(0, i
));
// HTML output: a character immediately followed by '{' is tested for here.
669 if (mode
== Stylesheet
.OUTPUT_HTML
&& (i
+ 1) < len
&&
670 text
.charAt(i
+ 1) == '{')
678 buf
= new StringBuffer(text
.substring(0, i
));
// Apostrophes are replaced only inside attribute values; HTML and non-HTML
// output use separate appends.
682 else if (c
== '\'' && inAttr
)
685 buf
= new StringBuffer(text
.substring(0, i
));
686 if (mode
== Stylesheet
.OUTPUT_HTML
)
687 // HTML does not define &apos;, use character entity ref
688 buf
.append("'");
690 buf
.append("'");
// Double quotes are likewise replaced only inside attribute values.
692 else if (c
== '"' && inAttr
)
695 buf
= new StringBuffer(text
.substring(0, i
));
696 buf
.append(""");
703 buf
= new StringBuffer(text
.substring(0, i
));
709 else if (buf
!= null)
712 else if (buf
!= null)
// No buffer means nothing was replaced, so the input string is returned.
715 return (buf
== null) ? text
: buf
.toString();
718 String
toString(Node node
)
720 ByteArrayOutputStream out
= new ByteArrayOutputStream();
723 serialize(node
, out
);
724 return new String(out
.toByteArray(), encoding
);
726 catch (IOException e
)
728 throw new RuntimeException(e
.getMessage());
732 boolean isHTMLBoolean(Attr attr
, String attrName
)
734 attrName
= attrName
.toLowerCase();
735 Node element
= attr
.getOwnerElement();
736 String elementName
= element
.getLocalName();
737 if (elementName
== null)
739 elementName
= element
.getNodeName();
741 elementName
= elementName
.toLowerCase();
742 Collection attributes
=
743 (Collection
) HTML_BOOLEAN_ATTRIBUTES
.get(elementName
);
744 return (attributes
!= null && attributes
.contains(attrName
));
747 static String
getHTMLCharacterEntity(char c
)
749 // Hardcode these here to avoid loading the HTML DTD
752 case 160: return "nbsp";
753 case 161: return "iexcl";
754 case 162: return "cent";
755 case 163: return "pound";
756 case 164: return "curren";
757 case 165: return "yen";
758 case 166: return "brvbar";
759 case 167: return "sect";
760 case 168: return "uml";
761 case 169: return "copy";
762 case 170: return "ordf";
763 case 171: return "laquo";
764 case 172: return "not";
765 case 173: return "shy";
766 case 174: return "reg";
767 case 175: return "macr";
768 case 176: return "deg";
769 case 177: return "plusmn";
770 case 178: return "sup2";
771 case 179: return "sup3";
772 case 180: return "acute";
773 case 181: return "micro";
774 case 182: return "para";
775 case 183: return "middot";
776 case 184: return "cedil";
777 case 185: return "sup1";
778 case 186: return "ordm";
779 case 187: return "raquo";
780 case 188: return "frac14";
781 case 189: return "frac12";
782 case 190: return "frac34";
783 case 191: return "iquest";
784 case 192: return "Agrave";
785 case 193: return "Aacute";
786 case 194: return "Acirc";
787 case 195: return "Atilde";
788 case 196: return "Auml";
789 case 197: return "Aring";
790 case 198: return "AElig";
791 case 199: return "Ccedil";
792 case 200: return "Egrave";
793 case 201: return "Eacute";
794 case 202: return "Ecirc";
795 case 203: return "Euml";
796 case 204: return "Igrave";
797 case 205: return "Iacute";
798 case 206: return "Icirc";
799 case 207: return "Iuml";
800 case 208: return "ETH";
801 case 209: return "Ntilde";
802 case 210: return "Ograve";
803 case 211: return "Oacute";
804 case 212: return "Ocirc";
805 case 213: return "Otilde";
806 case 214: return "Ouml";
807 case 215: return "times";
808 case 216: return "Oslash";
809 case 217: return "Ugrave";
810 case 218: return "Uacute";
811 case 219: return "Ucirc";
812 case 220: return "Uuml";
813 case 221: return "Yacute";
814 case 222: return "THORN";
815 case 223: return "szlig";
816 case 224: return "agrave";
817 case 225: return "aacute";
818 case 226: return "acirc";
819 case 227: return "atilde";
820 case 228: return "auml";
821 case 229: return "aring";
822 case 230: return "aelig";
823 case 231: return "ccedil";
824 case 232: return "egrave";
825 case 233: return "eacute";
826 case 234: return "ecirc";
827 case 235: return "euml";
828 case 236: return "igrave";
829 case 237: return "iacute";
830 case 238: return "icirc";
831 case 239: return "iuml";
832 case 240: return "eth";
833 case 241: return "ntilde";
834 case 242: return "ograve";
835 case 243: return "oacute";
836 case 244: return "ocirc";
837 case 245: return "otilde";
838 case 246: return "ouml";
839 case 247: return "divide";
840 case 248: return "oslash";
841 case 249: return "ugrave";
842 case 250: return "uacute";
843 case 251: return "ucirc";
844 case 252: return "uuml";
845 case 253: return "yacute";
846 case 254: return "thorn";
847 case 255: return "yuml";
848 default: return null;