/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.tika
.mime
;
19 // Commons Logging imports
20 import org
.apache
.commons
.logging
.Log
;
21 import org
.apache
.commons
.logging
.LogFactory
;
24 import org
.w3c
.dom
.Attr
;
25 import org
.w3c
.dom
.Node
;
26 import org
.w3c
.dom
.Element
;
27 import org
.w3c
.dom
.Document
;
28 import org
.w3c
.dom
.NodeList
;
29 import org
.w3c
.dom
.NamedNodeMap
;
30 import org
.xml
.sax
.InputSource
;
33 import java
.io
.InputStream
;
34 import java
.util
.ArrayList
;
35 import javax
.xml
.parsers
.DocumentBuilder
;
36 import javax
.xml
.parsers
.DocumentBuilderFactory
;
/**
 * A reader for XML files compliant with the freedesktop MIME-info DTD.
 *
 * <pre>
 *  &lt;!DOCTYPE mime-info [
 *    &lt;!ELEMENT mime-info (mime-type)+&gt;
 *    &lt;!ATTLIST mime-info xmlns CDATA #FIXED "http://www.freedesktop.org/standards/shared-mime-info"&gt;
 *
 *    &lt;!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*&gt;
 *    &lt;!ATTLIST mime-type type CDATA #REQUIRED&gt;
 *
 *    &lt;!-- a comment describing a document with the respective MIME type. Example: "WMV video" --&gt;
 *    &lt;!ELEMENT comment (#PCDATA)&gt;
 *    &lt;!ATTLIST comment xml:lang CDATA #IMPLIED&gt;
 *
 *    &lt;!-- a comment describing the respective unexpanded MIME type acronym. Example: "WMV" --&gt;
 *    &lt;!ELEMENT acronym (#PCDATA)&gt;
 *    &lt;!ATTLIST acronym xml:lang CDATA #IMPLIED&gt;
 *
 *    &lt;!-- a comment describing the respective expanded MIME type acronym. Example: "Windows Media Video" --&gt;
 *    &lt;!ELEMENT expanded-acronym (#PCDATA)&gt;
 *    &lt;!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED&gt;
 *
 *    &lt;!ELEMENT glob EMPTY&gt;
 *    &lt;!ATTLIST glob pattern CDATA #REQUIRED&gt;
 *
 *    &lt;!ELEMENT magic (match)+&gt;
 *    &lt;!ATTLIST magic priority CDATA #IMPLIED&gt;
 *
 *    &lt;!ELEMENT match (match)*&gt;
 *    &lt;!ATTLIST match offset CDATA #REQUIRED&gt;
 *    &lt;!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED&gt;
 *    &lt;!ATTLIST match value CDATA #REQUIRED&gt;
 *    &lt;!ATTLIST match mask CDATA #IMPLIED&gt;
 *
 *    &lt;!ELEMENT root-XML EMPTY&gt;
 *    &lt;!ATTLIST root-XML
 *          namespaceURI CDATA #REQUIRED
 *          localName CDATA #REQUIRED&gt;
 *
 *    &lt;!ELEMENT alias EMPTY&gt;
 *    &lt;!ATTLIST alias
 *          type CDATA #REQUIRED&gt;
 *
 *    &lt;!ELEMENT sub-class-of EMPTY&gt;
 *    &lt;!ATTLIST sub-class-of
 *          type CDATA #REQUIRED&gt;
 *  ]&gt;
 * </pre>
 *
 * @see <a href="http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec">http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec</a>
 */
92 final class MimeTypesReader
{
94 /** The logger to use */
95 private Log logger
= null;
97 private final MimeTypes types
;
/**
 * Creates a reader for the given {@link MimeTypes} that reports through a
 * logger obtained from {@link LogFactory}.
 *
 * NOTE(review): the body of this constructor is not visible in the damaged
 * listing; delegating with a null logger is the reconstruction - confirm.
 *
 * @param types the MimeTypes instance this reader works for
 */
MimeTypesReader(MimeTypes types) {
    this(types, null);
}

/**
 * Creates a reader for the given {@link MimeTypes}.
 *
 * @param types  the MimeTypes instance this reader works for
 * @param logger the logger to report to; when {@code null}, a logger for
 *               this class is obtained from {@link LogFactory}
 */
MimeTypesReader(MimeTypes types, Log logger) {
    // NOTE(review): this assignment is not visible in the listing; it is
    // required because the field is final - confirm.
    this.types = types;
    this.logger = (logger != null) ? logger : LogFactory.getLog(this.getClass());
}
112 void read(String filepath
) {
113 read(MimeTypesReader
.class.getClassLoader().getResourceAsStream(filepath
));
116 void read(InputStream stream
) {
118 DocumentBuilderFactory factory
= DocumentBuilderFactory
120 DocumentBuilder builder
= factory
.newDocumentBuilder();
121 Document document
= builder
.parse(new InputSource(stream
));
123 } catch (Exception e
) {
124 if (logger
.isWarnEnabled()) {
125 logger
.warn(e
.toString() + " while loading mime-types");
130 void read(Document document
) {
131 Element element
= document
.getDocumentElement();
132 if (element
!= null && element
.getTagName().equals("mime-info")) {
133 readMimeInfo(element
);
137 /** Read Element named mime-info. */
138 private MimeType
[] readMimeInfo(Element element
) {
139 ArrayList
<MimeType
> types
= new ArrayList
<MimeType
>();
140 NodeList nodes
= element
.getChildNodes();
141 for (int i
= 0; i
< nodes
.getLength(); i
++) {
142 Node node
= nodes
.item(i
);
143 if (node
.getNodeType() == Node
.ELEMENT_NODE
) {
144 Element nodeElement
= (Element
) node
;
145 if (nodeElement
.getTagName().equals("mime-type")) {
146 readMimeType(nodeElement
);
150 return types
.toArray(new MimeType
[types
.size()]);
153 /** Read Element named mime-type. */
154 private void readMimeType(Element element
) {
156 MimeType type
= null;
159 type
= new MimeType(element
.getAttribute("type"));
160 } catch (MimeTypeException mte
) {
161 // Mime Type not valid... just ignore it
162 if (logger
.isInfoEnabled()) {
163 logger
.info(mte
.toString() + " ... Ignoring!");
168 NodeList nodes
= element
.getChildNodes();
169 for (int i
= 0; i
< nodes
.getLength(); i
++) {
170 Node node
= nodes
.item(i
);
171 if (node
.getNodeType() == Node
.ELEMENT_NODE
) {
172 Element nodeElement
= (Element
) node
;
173 if (nodeElement
.getTagName().equals("_comment")) {
174 type
.setDescription(nodeElement
.getFirstChild()
176 } else if (nodeElement
.getTagName().equals("glob")) {
177 readGlob(nodeElement
, type
);
178 } else if (nodeElement
.getTagName().equals("magic")) {
179 readMagic(nodeElement
, type
);
180 } else if (nodeElement
.getTagName().equals("alias")) {
181 readAlias(nodeElement
, type
);
182 } else if (nodeElement
.getTagName().equals("root-XML")) {
183 readRootXML(nodeElement
, type
);
184 } else if (nodeElement
.getTagName().equals("sub-class-of")) {
185 readSubClassOf(nodeElement
, type
);
193 /** Read Element named glob. */
194 private void readGlob(Element element
, MimeType type
) {
195 type
.addPattern(element
.getAttribute("pattern"));
198 /** Read Element named alias. */
199 private void readAlias(Element element
, MimeType type
) {
200 type
.addAlias(element
.getAttribute("type"));
203 /** Read Element named magic. */
204 private void readMagic(Element element
, MimeType mimeType
) {
208 magic
= new Magic(Integer
209 .parseInt(element
.getAttribute("priority")));
210 } catch (Exception e
) {
213 magic
.setType(mimeType
);
214 magic
.setClause(readMatches(element
));
215 mimeType
.addMagic(magic
);
/**
 * Reads the match children of the given element into a clause.
 *
 * Walks the child nodes of the element and, for every child element named
 * "match", first recurses into that child to read its own nested matches,
 * then builds a clause for the child itself with readMatch, either alone
 * or AND-ed with the nested clause, and finally OR-s it with "prev".
 * A MimeTypeException raised while reading a match is logged as a warning
 * and that match is ignored.
 *
 * NOTE(review): this listing has lost lines. Not visible here: the
 * declaration of "sub", the opening of the try block, the condition that
 * chooses between the AND form and the plain form, any update of "prev"
 * inside the loop, and the return statement. Confirm all of these against
 * the original file before changing this method.
 */
218 private Clause
readMatches(Element element
) {
// Starting point of the OR chain built below.
220 Clause prev
= Clause
.FALSE
;
221 Clause clause
= null;
222 NodeList nodes
= element
.getChildNodes();
223 for (int i
= 0; i
< nodes
.getLength(); i
++) {
224 Node node
= nodes
.item(i
);
// Only element children are considered; other node types are skipped.
225 if (node
.getNodeType() == Node
.ELEMENT_NODE
) {
226 Element nodeElement
= (Element
) node
;
227 if (nodeElement
.getTagName().equals("match")) {
// Read the matches nested inside this match first.
228 sub
= readMatches(nodeElement
);
// Form 1: this match AND its nested matches.
231 clause
= new MagicClause(Operator
.AND
,
232 readMatch(nodeElement
), sub
);
// Form 2: this match on its own.
234 clause
= readMatch(nodeElement
);
// Combine with the previous clause as an alternative.
236 clause
= new MagicClause(Operator
.OR
, prev
, clause
);
// A match that cannot be read is reported and ignored.
238 } catch (MimeTypeException mte
) {
239 logger
.warn(mte
+ " while reading magic-match ["
240 + nodeElement
+ "], Ignoring!");
248 /** Read Element named match. */
249 private MagicMatch
readMatch(Element element
) throws MimeTypeException
{
251 String offset
= null;
256 NamedNodeMap attrs
= element
.getAttributes();
257 for (int i
= 0; i
< attrs
.getLength(); i
++) {
258 Attr attr
= (Attr
) attrs
.item(i
);
259 if (attr
.getName().equals("offset")) {
260 offset
= attr
.getValue();
261 } else if (attr
.getName().equals("type")) {
262 type
= attr
.getValue();
263 } else if (attr
.getName().equals("value")) {
264 value
= attr
.getValue();
265 } else if (attr
.getName().equals("mask")) {
266 mask
= attr
.getValue();
270 String
[] offsets
= offset
.split(":");
274 offStart
= Integer
.parseInt(offsets
[0]);
275 } catch (Exception e
) {
276 // WARN log + avoid loading
279 offEnd
= Integer
.parseInt(offsets
[1]);
280 } catch (Exception e
) {
283 offEnd
= Math
.max(offStart
, offEnd
);
285 return new MagicMatch(offStart
, offEnd
, type
, mask
, value
);
288 /** Read Element named root-XML. */
289 private void readRootXML(Element element
, MimeType mimeType
) {
291 mimeType
.addRootXML(element
.getAttribute("namespaceURI"), element
292 .getAttribute("localName"));
295 /** Read Element named sub-class-of. */
296 private void readSubClassOf(Element element
, MimeType mimeType
) {
298 mimeType
.addSuperType(element
.getAttribute("type"));
301 /**
 * Prints the specified node, then prints all of its children.
 *
 * Everything is written to System.out. Node names, attribute values and
 * text are trimmed and printed as they are, without XML escaping.
 * Handled node types: document, element, entity reference, CDATA section,
 * text and processing instruction.
 *
 * NOTE(review): this listing has lost lines - presumably the switch on
 * the node type, the break statements, the closing braces and the guard
 * around the processing-instruction data. Confirm against the original
 * file before changing this method.
 *
 * @param node the node to print
 */
302 public static void printDOM(Node node
) {
303 int type
= node
.getNodeType();
305 // print the document element
306 case Node
.DOCUMENT_NODE
: {
307 System
.out
.println("<?xml version=\"1.0\" ?>");
308 printDOM(((Document
) node
).getDocumentElement());
312 // print element with attributes
313 case Node
.ELEMENT_NODE
: {
314 System
.out
.print("<");
315 System
.out
.print(node
.getNodeName());
316 NamedNodeMap attrs
= node
.getAttributes();
// each attribute is printed as: space, name, ="value"
317 for (int i
= 0; i
< attrs
.getLength(); i
++) {
318 Node attr
= attrs
.item(i
);
319 System
.out
.print(" " + attr
.getNodeName().trim() + "=\""
320 + attr
.getNodeValue().trim() + "\"");
322 System
.out
.println(">");
// recurse into the children of the element, in order
324 NodeList children
= node
.getChildNodes();
325 if (children
!= null) {
326 int len
= children
.getLength();
327 for (int i
= 0; i
< len
; i
++)
328 printDOM(children
.item(i
));
334 // handle entity reference nodes
335 case Node
.ENTITY_REFERENCE_NODE
: {
336 System
.out
.print("&");
337 System
.out
.print(node
.getNodeName().trim());
338 System
.out
.print(";");
342 // print cdata sections
343 case Node
.CDATA_SECTION_NODE
: {
344 System
.out
.print("<![CDATA[");
345 System
.out
.print(node
.getNodeValue().trim());
346 System
.out
.print("]]>");
// print text nodes, trimmed of leading and trailing whitespace
351 case Node
.TEXT_NODE
: {
352 System
.out
.print(node
.getNodeValue().trim());
356 // print processing instruction
357 case Node
.PROCESSING_INSTRUCTION_NODE
: {
358 System
.out
.print("<?");
359 System
.out
.print(node
.getNodeName().trim());
360 String data
= node
.getNodeValue().trim();
// NOTE(review): the next two prints are presumably guarded by a check on
// "data" that is missing from this listing - confirm.
362 System
.out
.print(" ");
363 System
.out
.print(data
);
365 System
.out
.print("?>");
// for elements, finish with a line break followed by the end tag
370 if (type
== Node
.ELEMENT_NODE
) {
371 System
.out
.println();
372 System
.out
.print("</");
373 System
.out
.print(node
.getNodeName().trim());
374 System
.out
.print('>');