src/main/java/org/apache/tika/parser/html/HtmlParser.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17 package org.apache.tika.parser.html;
  18
  19 import java.io.IOException;
  20 import java.io.InputStream;
  21 import java.util.HashMap;
  22 import java.util.HashSet;
  23 import java.util.Map;
  24 import java.util.Set;
  25
  26 import org.apache.commons.io.input.CloseShieldInputStream;
  27 import org.apache.tika.exception.TikaException;
  28 import org.apache.tika.metadata.Metadata;
  29 import org.apache.tika.parser.AbstractParser;
  30 import org.apache.tika.sax.TeeContentHandler;
  31 import org.apache.tika.sax.TextContentHandler;
  32 import org.apache.tika.sax.WriteOutContentHandler;
  33 import org.apache.tika.sax.XHTMLContentHandler;
  34 import org.apache.tika.sax.xpath.Matcher;
  35 import org.apache.tika.sax.xpath.MatchingContentHandler;
  36 import org.apache.tika.sax.xpath.XPathParser;
  37 import org.apache.tika.utils.Utils;
  38 import org.cyberneko.html.parsers.SAXParser;
  39 import org.xml.sax.Attributes;
  40 import org.xml.sax.ContentHandler;
  41 import org.xml.sax.InputSource;
  42 import org.xml.sax.SAXException;
  43
  44 /**
  45  * HTML parser. Uses CyberNeko to turn the input document to HTML SAX events,
  46  * and post-processes the events to produce XHTML and metadata expected by
  47  * Tika clients.
  48  */
  49 public class HtmlParser extends AbstractParser {
  50
  51     /**
  52      * Set of safe mappings from incoming HTML elements to outgoing
  53      * XHTML elements. Ensures that the output is valid XHTML 1.0 Strict.
  54      */
  55     private static final Map<String, String> SAFE_ELEMENTS =
  56         new HashMap<String, String>();
  57
  58     /**
  59      * Set of HTML elements whose content will be discarded.
  60      */
  61     private static final Set<String> DISCARD_ELEMENTS = new HashSet<String>();
  62
  63     static {
  64         // Based on http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
  65         SAFE_ELEMENTS.put("P", "p");
  66         SAFE_ELEMENTS.put("H1", "h1");
  67         SAFE_ELEMENTS.put("H2", "h2");
  68         SAFE_ELEMENTS.put("H3", "h3");
  69         SAFE_ELEMENTS.put("H4", "h4");
  70         SAFE_ELEMENTS.put("H5", "h5");
  71         SAFE_ELEMENTS.put("H6", "h6");
  72         SAFE_ELEMENTS.put("UL", "ul");
  73         SAFE_ELEMENTS.put("OL", "ol");
  74         SAFE_ELEMENTS.put("LI", "li");
  75         SAFE_ELEMENTS.put("DL", "dl");
  76         SAFE_ELEMENTS.put("DT", "dt");
  77         SAFE_ELEMENTS.put("DD", "dd");
  78         SAFE_ELEMENTS.put("PRE", "pre");
  79         SAFE_ELEMENTS.put("BLOCKQUOTE", "blockquote");
  80         SAFE_ELEMENTS.put("TABLE", "p"); // TODO colspan/rowspan issues
  81
  82         DISCARD_ELEMENTS.add("STYLE");
  83         DISCARD_ELEMENTS.add("SCRIPT");
  84     }
  85
  86     public void parse(
  87             InputStream stream, ContentHandler handler, Metadata metadata)
  88             throws IOException, SAXException, TikaException {
  89         // Protect the stream from being closed by CyberNeko
  90         stream = new CloseShieldInputStream(stream);
  91
  92         // Prepare the HTML content handler that generates proper
  93         // XHTML events to records relevant document metadata
  94         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
  95         XPathParser xpath = new XPathParser(null, "");
  96         Matcher body = xpath.parse("/HTML/BODY//node()");
  97         Matcher title = xpath.parse("/HTML/HEAD/TITLE//node()");
  98         handler = new TeeContentHandler(
  99                 new MatchingContentHandler(getBodyHandler(xhtml), body),
 100                 new MatchingContentHandler(getTitleHandler(metadata), title));
 101
 102         // Parse the HTML document
 103         xhtml.startDocument();
 104         SAXParser parser = new SAXParser();
 105         parser.setContentHandler(handler);
 106         parser.parse(new InputSource(Utils.getUTF8Reader(stream, metadata)));
 107         xhtml.endDocument();
 108     }
 109
 110     private ContentHandler getTitleHandler(final Metadata metadata) {
 111         return new WriteOutContentHandler() {
 112             @Override
 113             public void endElement(String u, String l, String n) {
 114                 metadata.set(Metadata.TITLE, toString());
 115             }
 116         };
 117     }
 118
 119     private ContentHandler getBodyHandler(final XHTMLContentHandler xhtml) {
 120         return new TextContentHandler(xhtml) {
 121
 122             private int discardLevel = 0;
 123
 124             @Override
 125             public void startElement(
 126                     String uri, String local, String name, Attributes atts)
 127                     throws SAXException {
 128                 if (discardLevel != 0) {
 129                     discardLevel++;
 130                 } else if (DISCARD_ELEMENTS.contains(name)) {
 131                     discardLevel = 1;
 132                 } else if (SAFE_ELEMENTS.containsKey(name)) {
 133                     xhtml.startElement(SAFE_ELEMENTS.get(name));
 134                 } else if ("A".equals(name)) {
 135                     String href = atts.getValue("href");
 136                     if (href == null) {
 137                         href = "";
 138                     }
 139                     xhtml.startElement("a", "href", href);
 140                 }
 141             }
 142
 143             @Override
 144             public void endElement(
 145                     String uri, String local, String name) throws SAXException {
 146                 if (discardLevel != 0) {
 147                     discardLevel--;
 148                 } else if (SAFE_ELEMENTS.containsKey(name)) {
 149                     xhtml.endElement(SAFE_ELEMENTS.get(name));
 150                 } else if ("A".equals(name)) {
 151                     xhtml.endElement("a");
 152                 }
 153             }
 154
 155             @Override
 156             public void characters(char[] ch, int start, int length)
 157                     throws SAXException {
 158                 if (discardLevel == 0) {
 159                     super.characters(ch, start, length);
 160                 }
 161             }
 162
 163             @Override
 164             public void ignorableWhitespace(char[] ch, int start, int length)
 165                     throws SAXException {
 166                 if (discardLevel == 0) {
 167                     super.ignorableWhitespace(ch, start, length);
 168                 }
 169             }
 170
 171         };
 172     }
 173
 174 }