TIKA-113: Metadata (such as title) should not be part of content
[tika.git] / src / main / java / org / apache / tika / parser / html / HtmlParser.java
blobd0f9d8a81e7926e26bf4daafa87f7f63c7933587
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.parser.html;
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.util.HashMap;
22 import java.util.HashSet;
23 import java.util.Map;
24 import java.util.Set;
26 import org.apache.commons.io.input.CloseShieldInputStream;
27 import org.apache.tika.exception.TikaException;
28 import org.apache.tika.metadata.Metadata;
29 import org.apache.tika.parser.AbstractParser;
30 import org.apache.tika.sax.TeeContentHandler;
31 import org.apache.tika.sax.TextContentHandler;
32 import org.apache.tika.sax.WriteOutContentHandler;
33 import org.apache.tika.sax.XHTMLContentHandler;
34 import org.apache.tika.sax.xpath.Matcher;
35 import org.apache.tika.sax.xpath.MatchingContentHandler;
36 import org.apache.tika.sax.xpath.XPathParser;
37 import org.apache.tika.utils.Utils;
38 import org.cyberneko.html.parsers.SAXParser;
39 import org.xml.sax.Attributes;
40 import org.xml.sax.ContentHandler;
41 import org.xml.sax.InputSource;
42 import org.xml.sax.SAXException;
44 /**
45 * HTML parser. Uses CyberNeko to turn the input document to HTML SAX events,
46 * and post-processes the events to produce XHTML and metadata expected by
47 * Tika clients.
49 public class HtmlParser extends AbstractParser {
51 /**
52 * Set of safe mappings from incoming HTML elements to outgoing
53 * XHTML elements. Ensures that the output is valid XHTML 1.0 Strict.
55 private static final Map<String, String> SAFE_ELEMENTS =
56 new HashMap<String, String>();
58 /**
59 * Set of HTML elements whose content will be discarded.
61 private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>();
63 static {
64 // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
65 SAFE_ELEMENTS.put("P", "p");
66 SAFE_ELEMENTS.put("H1", "h1");
67 SAFE_ELEMENTS.put("H2", "h2");
68 SAFE_ELEMENTS.put("H3", "h3");
69 SAFE_ELEMENTS.put("H4", "h4");
70 SAFE_ELEMENTS.put("H5", "h5");
71 SAFE_ELEMENTS.put("H6", "h6");
72 SAFE_ELEMENTS.put("UL", "ul");
73 SAFE_ELEMENTS.put("OL", "ol");
74 SAFE_ELEMENTS.put("LI", "li");
75 SAFE_ELEMENTS.put("DL", "dl");
76 SAFE_ELEMENTS.put("DT", "dt");
77 SAFE_ELEMENTS.put("DD", "dd");
78 SAFE_ELEMENTS.put("PRE", "pre");
79 SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
80 SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
82 DISCARD_ELEMENTS.add("STYLE");
83 DISCARD_ELEMENTS.add("SCRIPT");
86 public void parse(
87 InputStream stream, ContentHandler handler, Metadata metadata)
88 throws IOException, SAXException, TikaException {
89 // Protect the stream from being closed by CyberNeko
90 stream = new CloseShieldInputStream(stream);
92 // Prepare the HTML content handler that generates proper
93 // XHTML events to records relevant document metadata
94 XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
95 XPathParser xpath = new XPathParser(null, "");
96 Matcher body = xpath.parse("/HTML/BODY//node()");
97 Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
98 handler = new TeeContentHandler(
99 new MatchingContentHandler(getBodyHandler(xhtml), body),
100 new MatchingContentHandler(getTitleHandler(metadata), title));
102 // Parse the HTML document
103 xhtml.startDocument();
104 SAXParser parser = new SAXParser();
105 parser.setContentHandler(handler);
106 parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
107 xhtml.endDocument();
110 private ContentHandler getTitleHandler(final Metadata metadata) {
111 return new WriteOutContentHandler() {
112 @Override
113 public void endElement(String u, String l, String n) {
114 metadata.set(Metadata.TITLE, toString());
119 private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
120 return new TextContentHandler(xhtml) {
122 private int discardLevel = 0;
124 @Override
125 public void startElement(
126 String uri, String local, String name, Attributes atts)
127 throws SAXException {
128 if (discardLevel != 0) {
129 discardLevel++;
130 } else if (DISCARD_ELEMENTS.contains(name)) {
131 discardLevel = 1;
132 } else if (SAFE_ELEMENTS.containsKey(name)) {
133 xhtml.startElement(SAFE_ELEMENTS.get(name));
134 } else if ("A".equals(name)) {
135 String href = atts.getValue("href");
136 if (href == null) {
137 href = "";
139 xhtml.startElement("a", "href", href);
143 @Override
144 public void endElement(
145 String uri, String local, String name) throws SAXException {
146 if (discardLevel != 0) {
147 discardLevel--;
148 } else if (SAFE_ELEMENTS.containsKey(name)) {
149 xhtml.endElement(SAFE_ELEMENTS.get(name));
150 } else if ("A".equals(name)) {
151 xhtml.endElement("a");
155 @Override
156 public void characters(char[] ch, int start, int length)
157 throws SAXException {
158 if (discardLevel == 0) {
159 super.characters(ch, start, length);
163 @Override
164 public void ignorableWhitespace(char[] ch, int start, int length)
165 throws SAXException {
166 if (discardLevel == 0) {
167 super.ignorableWhitespace(ch, start, length);