TIKA-87 - MimeTypes should allow modification of MIME types
[tika.git] / src / main / java / org / apache / tika / mime / MimeTypesReader.java
blobd1582aa2ea32f13d8ca2ff8122a1bf3d11841611
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.mime;
19 // Commons Logging imports
20 import org.apache.commons.logging.Log;
21 import org.apache.commons.logging.LogFactory;
23 // DOM imports
24 import org.w3c.dom.Attr;
25 import org.w3c.dom.Node;
26 import org.w3c.dom.Element;
27 import org.w3c.dom.Document;
28 import org.w3c.dom.NodeList;
29 import org.w3c.dom.NamedNodeMap;
30 import org.xml.sax.InputSource;
32 // JDK imports
33 import java.io.InputStream;
34 import java.util.ArrayList;
35 import javax.xml.parsers.DocumentBuilder;
36 import javax.xml.parsers.DocumentBuilderFactory;
38 /**
39 * A reader for XML files compliant with the freedesktop MIME-info DTD.
41 * <pre>
42 * &lt;!DOCTYPE mime-info [
43 * &lt;!ELEMENT mime-info (mime-type)+&gt;
44 * &lt;!ATTLIST mime-info xmlns CDATA #FIXED &quot;http://www.freedesktop.org/standards/shared-mime-info&quot;&gt;
46 * &lt;!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*&gt;
47 * &lt;!ATTLIST mime-type type CDATA #REQUIRED&gt;
49 * &lt;!-- a comment describing a document with the respective MIME type. Example: &quot;WMV video&quot; --&gt;
50 * &lt;!ELEMENT comment (#PCDATA)&gt;
51 * &lt;!ATTLIST comment xml:lang CDATA #IMPLIED&gt;
53 * &lt;!-- a comment describing a the respective unexpanded MIME type acronym. Example: &quot;WMV&quot; --&gt;
54 * &lt;!ELEMENT acronym (#PCDATA)&gt;
55 * &lt;!ATTLIST acronym xml:lang CDATA #IMPLIED&gt;
57 * &lt;!-- a comment describing a the respective unexpanded MIME type acronym. Example: &quot;Windows Media Video&quot; --&gt;
58 * &lt;!ELEMENT expanded-acronym (#PCDATA)&gt;
59 * &lt;!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED&gt;
61 * &lt;!ELEMENT glob EMPTY&gt;
62 * &lt;!ATTLIST glob pattern CDATA #REQUIRED&gt;
64 * &lt;!ELEMENT magic (match)+&gt;
65 * &lt;!ATTLIST magic priority CDATA #IMPLIED&gt;
67 * &lt;!ELEMENT match (match)*&gt;
68 * &lt;!ATTLIST match offset CDATA #REQUIRED&gt;
69 * &lt;!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED&gt;
70 * &lt;!ATTLIST match value CDATA #REQUIRED&gt;
71 * &lt;!ATTLIST match mask CDATA #IMPLIED&gt;
73 * &lt;!ELEMENT root-XML EMPTY&gt;
74 * &lt;!ATTLIST root-XML
75 * namespaceURI CDATA #REQUIRED
76 * localName CDATA #REQUIRED&gt;
78 * &lt;!ELEMENT alias EMPTY&gt;
79 * &lt;!ATTLIST alias
80 * type CDATA #REQUIRED&gt;
82 * &lt;!ELEMENT sub-class-of EMPTY&gt;
83 * &lt;!ATTLIST sub-class-of
84 * type CDATA #REQUIRED&gt;
85 * ]&gt;
86 * </pre>
89 * @see http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec
92 final class MimeTypesReader {
94 /** The logger to use */
95 private Log logger = null;
97 private final MimeTypes types;
99 MimeTypesReader(MimeTypes types) {
100 this(types, null);
103 MimeTypesReader(MimeTypes types, Log logger) {
104 this.types = types;
105 if (logger == null) {
106 this.logger = LogFactory.getLog(this.getClass());
107 } else {
108 this.logger = logger;
112 void read(String filepath) {
113 read(MimeTypesReader.class.getClassLoader().getResourceAsStream(filepath));
116 void read(InputStream stream) {
117 try {
118 DocumentBuilderFactory factory = DocumentBuilderFactory
119 .newInstance();
120 DocumentBuilder builder = factory.newDocumentBuilder();
121 Document document = builder.parse(new InputSource(stream));
122 read(document);
123 } catch (Exception e) {
124 if (logger.isWarnEnabled()) {
125 logger.warn(e.toString() + " while loading mime-types");
130 void read(Document document) {
131 Element element = document.getDocumentElement();
132 if (element != null && element.getTagName().equals("mime-info")) {
133 readMimeInfo(element);
137 /** Read Element named mime-info. */
138 private MimeType[] readMimeInfo(Element element) {
139 ArrayList<MimeType> types = new ArrayList<MimeType>();
140 NodeList nodes = element.getChildNodes();
141 for (int i = 0; i < nodes.getLength(); i++) {
142 Node node = nodes.item(i);
143 if (node.getNodeType() == Node.ELEMENT_NODE) {
144 Element nodeElement = (Element) node;
145 if (nodeElement.getTagName().equals("mime-type")) {
146 readMimeType(nodeElement);
150 return types.toArray(new MimeType[types.size()]);
153 /** Read Element named mime-type. */
154 private void readMimeType(Element element) {
156 MimeType type = null;
158 try {
159 type = new MimeType(element.getAttribute("type"));
160 } catch (MimeTypeException mte) {
161 // Mime Type not valid... just ignore it
162 if (logger.isInfoEnabled()) {
163 logger.info(mte.toString() + " ... Ignoring!");
165 return;
168 NodeList nodes = element.getChildNodes();
169 for (int i = 0; i < nodes.getLength(); i++) {
170 Node node = nodes.item(i);
171 if (node.getNodeType() == Node.ELEMENT_NODE) {
172 Element nodeElement = (Element) node;
173 if (nodeElement.getTagName().equals("_comment")) {
174 type.setDescription(nodeElement.getFirstChild()
175 .getNodeValue());
176 } else if (nodeElement.getTagName().equals("glob")) {
177 readGlob(nodeElement, type);
178 } else if (nodeElement.getTagName().equals("magic")) {
179 readMagic(nodeElement, type);
180 } else if (nodeElement.getTagName().equals("alias")) {
181 readAlias(nodeElement, type);
182 } else if (nodeElement.getTagName().equals("root-XML")) {
183 readRootXML(nodeElement, type);
184 } else if (nodeElement.getTagName().equals("sub-class-of")) {
185 readSubClassOf(nodeElement, type);
190 types.add(type);
193 /** Read Element named glob. */
194 private void readGlob(Element element, MimeType type) {
195 type.addPattern(element.getAttribute("pattern"));
198 /** Read Element named alias. */
199 private void readAlias(Element element, MimeType type) {
200 type.addAlias(element.getAttribute("type"));
203 /** Read Element named magic. */
204 private void readMagic(Element element, MimeType mimeType) {
206 Magic magic = null;
207 try {
208 magic = new Magic(Integer
209 .parseInt(element.getAttribute("priority")));
210 } catch (Exception e) {
211 magic = new Magic();
213 magic.setType(mimeType);
214 magic.setClause(readMatches(element));
215 mimeType.addMagic(magic);
218 private Clause readMatches(Element element) {
219 Clause sub = null;
220 Clause prev = Clause.FALSE;
221 Clause clause = null;
222 NodeList nodes = element.getChildNodes();
223 for (int i = 0; i < nodes.getLength(); i++) {
224 Node node = nodes.item(i);
225 if (node.getNodeType() == Node.ELEMENT_NODE) {
226 Element nodeElement = (Element) node;
227 if (nodeElement.getTagName().equals("match")) {
228 sub = readMatches(nodeElement);
229 try {
230 if (sub != null) {
231 clause = new MagicClause(Operator.AND,
232 readMatch(nodeElement), sub);
233 } else {
234 clause = readMatch(nodeElement);
236 clause = new MagicClause(Operator.OR, prev, clause);
237 prev = clause;
238 } catch (MimeTypeException mte) {
239 logger.warn(mte + " while reading magic-match ["
240 + nodeElement + "], Ignoring!");
245 return clause;
248 /** Read Element named match. */
249 private MagicMatch readMatch(Element element) throws MimeTypeException {
251 String offset = null;
252 String value = null;
253 String mask = null;
254 String type = null;
256 NamedNodeMap attrs = element.getAttributes();
257 for (int i = 0; i < attrs.getLength(); i++) {
258 Attr attr = (Attr) attrs.item(i);
259 if (attr.getName().equals("offset")) {
260 offset = attr.getValue();
261 } else if (attr.getName().equals("type")) {
262 type = attr.getValue();
263 } else if (attr.getName().equals("value")) {
264 value = attr.getValue();
265 } else if (attr.getName().equals("mask")) {
266 mask = attr.getValue();
269 // Parse OffSet
270 String[] offsets = offset.split(":");
271 int offStart = 0;
272 int offEnd = 0;
273 try {
274 offStart = Integer.parseInt(offsets[0]);
275 } catch (Exception e) {
276 // WARN log + avoid loading
278 try {
279 offEnd = Integer.parseInt(offsets[1]);
280 } catch (Exception e) {
281 // WARN log
283 offEnd = Math.max(offStart, offEnd);
285 return new MagicMatch(offStart, offEnd, type, mask, value);
288 /** Read Element named root-XML. */
289 private void readRootXML(Element element, MimeType mimeType) {
291 mimeType.addRootXML(element.getAttribute("namespaceURI"), element
292 .getAttribute("localName"));
295 /** Read Element named sub-class-of. */
296 private void readSubClassOf(Element element, MimeType mimeType) {
298 mimeType.addSuperType(element.getAttribute("type"));
301 /** Prints the specified node, then prints all of its children. */
302 public static void printDOM(Node node) {
303 int type = node.getNodeType();
304 switch (type) {
305 // print the document element
306 case Node.DOCUMENT_NODE: {
307 System.out.println("&lt;?xml version=\"1.0\" ?>");
308 printDOM(((Document) node).getDocumentElement());
309 break;
312 // print element with attributes
313 case Node.ELEMENT_NODE: {
314 System.out.print("<");
315 System.out.print(node.getNodeName());
316 NamedNodeMap attrs = node.getAttributes();
317 for (int i = 0; i < attrs.getLength(); i++) {
318 Node attr = attrs.item(i);
319 System.out.print(" " + attr.getNodeName().trim() + "=\""
320 + attr.getNodeValue().trim() + "\"");
322 System.out.println(">");
324 NodeList children = node.getChildNodes();
325 if (children != null) {
326 int len = children.getLength();
327 for (int i = 0; i < len; i++)
328 printDOM(children.item(i));
331 break;
334 // handle entity reference nodes
335 case Node.ENTITY_REFERENCE_NODE: {
336 System.out.print("&");
337 System.out.print(node.getNodeName().trim());
338 System.out.print(";");
339 break;
342 // print cdata sections
343 case Node.CDATA_SECTION_NODE: {
344 System.out.print("<![CDATA[");
345 System.out.print(node.getNodeValue().trim());
346 System.out.print("]]>");
347 break;
350 // print text
351 case Node.TEXT_NODE: {
352 System.out.print(node.getNodeValue().trim());
353 break;
356 // print processing instruction
357 case Node.PROCESSING_INSTRUCTION_NODE: {
358 System.out.print("<?");
359 System.out.print(node.getNodeName().trim());
360 String data = node.getNodeValue().trim();
362 System.out.print(" ");
363 System.out.print(data);
365 System.out.print("?>");
366 break;
370 if (type == Node.ELEMENT_NODE) {
371 System.out.println();
372 System.out.print("</");
373 System.out.print(node.getNodeName().trim());
374 System.out.print('>');