/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.tika
.parser
.html
;
19 import java
.io
.IOException
;
20 import java
.io
.InputStream
;
21 import java
.util
.HashMap
;
22 import java
.util
.HashSet
;
26 import org
.apache
.commons
.io
.input
.CloseShieldInputStream
;
27 import org
.apache
.tika
.exception
.TikaException
;
28 import org
.apache
.tika
.metadata
.Metadata
;
29 import org
.apache
.tika
.parser
.AbstractParser
;
30 import org
.apache
.tika
.sax
.TeeContentHandler
;
31 import org
.apache
.tika
.sax
.TextContentHandler
;
32 import org
.apache
.tika
.sax
.WriteOutContentHandler
;
33 import org
.apache
.tika
.sax
.XHTMLContentHandler
;
34 import org
.apache
.tika
.sax
.xpath
.Matcher
;
35 import org
.apache
.tika
.sax
.xpath
.MatchingContentHandler
;
36 import org
.apache
.tika
.sax
.xpath
.XPathParser
;
37 import org
.apache
.tika
.utils
.Utils
;
38 import org
.cyberneko
.html
.parsers
.SAXParser
;
39 import org
.xml
.sax
.Attributes
;
40 import org
.xml
.sax
.ContentHandler
;
41 import org
.xml
.sax
.InputSource
;
42 import org
.xml
.sax
.SAXException
;
/**
 * HTML parser. Uses CyberNeko to turn the input document into HTML SAX events,
 * and post-processes those events to produce the XHTML and metadata expected by
 * Tika clients.
 */
49 public class HtmlParser
extends AbstractParser
{
52 * Set of safe mappings from incoming HTML elements to outgoing
53 * XHTML elements. Ensures that the output is valid XHTML 1.0 Strict.
55 private static final Map
<String
, String
> SAFE_ELEMENTS
=
56 new HashMap
<String
, String
>();
59 * Set of HTML elements whose content will be discarded.
61 private static final Set
<String
> DISCARD_ELEMENTS
= new HashSet
<String
>();
static {
    // Element whitelist, based on
    // http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd

    // Paragraphs and headings
    SAFE_ELEMENTS.put("P", "p");
    SAFE_ELEMENTS.put("H1", "h1");
    SAFE_ELEMENTS.put("H2", "h2");
    SAFE_ELEMENTS.put("H3", "h3");
    SAFE_ELEMENTS.put("H4", "h4");
    SAFE_ELEMENTS.put("H5", "h5");
    SAFE_ELEMENTS.put("H6", "h6");

    // Ordered, unordered and definition lists
    SAFE_ELEMENTS.put("UL", "ul");
    SAFE_ELEMENTS.put("OL", "ol");
    SAFE_ELEMENTS.put("LI", "li");
    SAFE_ELEMENTS.put("DL", "dl");
    SAFE_ELEMENTS.put("DT", "dt");
    SAFE_ELEMENTS.put("DD", "dd");

    // Preformatted and quoted text
    SAFE_ELEMENTS.put("PRE", "pre");
    SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");

    // Tables are flattened into a paragraph for now
    SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues

    // Elements whose content is dropped
    DISCARD_ELEMENTS.add("STYLE");
    DISCARD_ELEMENTS.add("SCRIPT");
}
87 InputStream stream
, ContentHandler handler
, Metadata metadata
)
88 throws IOException
, SAXException
, TikaException
{
89 // Protect the stream from being closed by CyberNeko
90 stream
= new CloseShieldInputStream(stream
);
92 // Prepare the HTML content handler that generates proper
93 // XHTML events to records relevant document metadata
94 XHTMLContentHandler xhtml
= new XHTMLContentHandler(handler
, metadata
);
95 XPathParser xpath
= new XPathParser(null, "");
96 Matcher body
= xpath
.parse("/HTML/BODY//node()");
97 Matcher title
= xpath
.parse("/HTML/HEAD/TITLE//node()");
98 handler
= new TeeContentHandler(
99 new MatchingContentHandler(getBodyHandler(xhtml
), body
),
100 new MatchingContentHandler(getTitleHandler(metadata
), title
));
102 // Parse the HTML document
103 xhtml
.startDocument();
104 SAXParser parser
= new SAXParser();
105 parser
.setContentHandler(handler
);
106 parser
.parse(new InputSource(Utils
.getUTF8Reader(stream
, metadata
)));
110 private ContentHandler
getTitleHandler(final Metadata metadata
) {
111 return new WriteOutContentHandler() {
113 public void endElement(String u
, String l
, String n
) {
114 metadata
.set(Metadata
.TITLE
, toString());
119 private ContentHandler
getBodyHandler(final XHTMLContentHandler xhtml
) {
120 return new TextContentHandler(xhtml
) {
122 private int discardLevel
= 0;
125 public void startElement(
126 String uri
, String local
, String name
, Attributes atts
)
127 throws SAXException
{
128 if (discardLevel
!= 0) {
130 } else if (DISCARD_ELEMENTS
.contains(name
)) {
132 } else if (SAFE_ELEMENTS
.containsKey(name
)) {
133 xhtml
.startElement(SAFE_ELEMENTS
.get(name
));
134 } else if ("A".equals(name
)) {
135 String href
= atts
.getValue("href");
139 xhtml
.startElement("a", "href", href
);
144 public void endElement(
145 String uri
, String local
, String name
) throws SAXException
{
146 if (discardLevel
!= 0) {
148 } else if (SAFE_ELEMENTS
.containsKey(name
)) {
149 xhtml
.endElement(SAFE_ELEMENTS
.get(name
));
150 } else if ("A".equals(name
)) {
151 xhtml
.endElement("a");
156 public void characters(char[] ch
, int start
, int length
)
157 throws SAXException
{
158 if (discardLevel
== 0) {
159 super.characters(ch
, start
, length
);
164 public void ignorableWhitespace(char[] ch
, int start
, int length
)
165 throws SAXException
{
166 if (discardLevel
== 0) {
167 super.ignorableWhitespace(ch
, start
, length
);