/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.tika
.utils
;
20 import java
.io
.BufferedInputStream
;
22 import java
.io
.FileInputStream
;
23 import java
.io
.FileNotFoundException
;
24 import java
.io
.IOException
;
25 import java
.io
.InputStream
;
26 import java
.io
.StringWriter
;
28 import java
.util
.ArrayList
;
29 import java
.util
.List
;
31 import org
.apache
.tika
.config
.TikaConfig
;
32 import org
.apache
.tika
.exception
.TikaException
;
33 import org
.apache
.tika
.metadata
.Metadata
;
34 import org
.apache
.tika
.metadata
.TikaMimeKeys
;
35 import org
.apache
.tika
.parser
.Parser
;
36 import org
.apache
.tika
.parser
.WriteOutContentHandler
;
37 import org
.xml
.sax
.SAXException
;
/**
 * Contains utility methods for parsing documents. Intended to provide simple
 * entry points into the Tika framework.
 */
43 public class ParseUtils
implements TikaMimeKeys
{
46 * Returns a parser that can handle the specified MIME type, and is set to
47 * receive input from a stream opened from the specified URL. NB: Close the
48 * input stream when it is no longer needed!
52 * the document's MIME type
53 * @return a parser appropriate to this MIME type
54 * @throws TikaException
56 public static Parser
getParser(String mimeType
, TikaConfig config
)
57 throws TikaException
{
58 return config
.getParser(mimeType
);
62 * Returns a parser that can handle the specified MIME type, and is set to
63 * receive input from a stream opened from the specified URL. The MIME type
64 * is determined automatically. NB: Close the input stream when it is no
68 * URL pointing to the document to parse
70 * @return a parser appropriate to this MIME type and ready to read input
71 * from the specified document
72 * @throws TikaException
74 public static Parser
getParser(URL documentUrl
, TikaConfig config
)
75 throws TikaException
{
76 String mimetype
= config
.getMimeRepository().getMimeType(documentUrl
)
78 return getParser(mimetype
, config
);
82 * Returns a parser that can handle the specified MIME type, and is set to
83 * receive input from a stream opened from the specified URL. NB: Close the
84 * input stream when it is no longer needed!
87 * File object pointing to the document to parse
89 * @return a parser appropriate to this MIME type and ready to read input
90 * from the specified document
91 * @throws TikaException
93 public static Parser
getParser(File documentFile
, TikaConfig config
)
94 throws TikaException
{
95 String mimetype
= config
.getMimeRepository().getMimeType(documentFile
)
97 return getParser(mimetype
, config
);
101 * Returns a list of parsers from zip InputStream
106 * @return a list of parsers from zip file
107 * @throws TikaException
109 private static List
<Parser
> getParsersFromZip(InputStream zipIs
,
110 TikaConfig config
) throws TikaException
{
111 List
<Parser
> parsers
= new ArrayList
<Parser
>();
112 List
<File
> zipFiles
= Utils
.unzip(zipIs
);
113 for (int i
= 0; i
< zipFiles
.size(); i
++) {
114 File zipEntry
= zipFiles
.get(i
);
115 parsers
.add(getParser(zipEntry
, config
));
121 * Returns a list of parsers from zip File
126 * @return a list of parsers from zip file
127 * @throws TikaException
128 * @throws FileNotFoundException
130 public static List
<Parser
> getParsersFromZip(File zip
, TikaConfig config
)
131 throws TikaException
, FileNotFoundException
{
132 String zipMimeType
= config
.getMimeRepository().getMimeType(zip
)
134 if (!zipMimeType
.equalsIgnoreCase("application/zip")) {
135 throw new TikaException("The file you are using is note a zip file");
137 return getParsersFromZip(new FileInputStream(zip
), config
);
141 * Returns a list of parsers from URL
145 * @return a list of parsers from zip file
146 * @throws TikaException
147 * @throws IOException
149 public static List
<Parser
> getParsersFromZip(URL zip
, TikaConfig config
)
150 throws TikaException
, IOException
{
151 String zipMimeType
= config
.getMimeRepository().getMimeType(zip
)
153 if (!zipMimeType
.equalsIgnoreCase("application/zip")) {
154 throw new TikaException("The file you are using is note a zip file");
156 return getParsersFromZip(zip
.openStream(), config
);
160 * Gets the string content of a document read from an input stream.
162 * @param stream the stream from which to read document data
164 * @param mimeType MIME type of the data
165 * @return the string content parsed from the document
167 public static String
getStringContent(
168 InputStream stream
, TikaConfig config
, String mimeType
)
169 throws TikaException
, IOException
{
171 Parser parser
= config
.getParser(mimeType
);
172 StringWriter writer
= new StringWriter();
174 stream
, new WriteOutContentHandler(writer
), new Metadata());
175 return writer
.toString();
176 } catch (SAXException e
) {
177 throw new TikaException("Unexpected SAX error", e
);
182 * Gets the string content of a document read from an input stream.
185 * URL pointing to the document to parse
187 * @return the string content parsed from the document
189 public static String
getStringContent(URL documentUrl
, TikaConfig config
)
190 throws TikaException
, IOException
{
191 String mime
= config
.getMimeRepository().getMimeType(documentUrl
)
193 return getStringContent(documentUrl
, config
, mime
);
197 * Gets the string content of a document read from an input stream.
200 * URL pointing to the document to parse
203 * MIME type of the data
204 * @return the string content parsed from the document
206 public static String
getStringContent(
207 URL documentUrl
, TikaConfig config
, String mimeType
)
208 throws TikaException
, IOException
{
209 InputStream stream
= documentUrl
.openStream();
211 return getStringContent(stream
, config
, mimeType
);
218 * Gets the string content of a document read from an input stream.
220 * @param documentFile
221 * File object pointing to the document to parse
224 * MIME type of the data
225 * @return the string content parsed from the document
227 public static String
getStringContent(
228 File documentFile
, TikaConfig config
, String mimeType
)
229 throws TikaException
, IOException
{
230 InputStream stream
= new BufferedInputStream(new FileInputStream(
233 return getStringContent(stream
, config
, mimeType
);
240 * Gets the string content of a document read from an input stream.
242 * @param documentFile
243 * File object pointing to the document to parse
245 * @return the string content parsed from the document
247 public static String
getStringContent(File documentFile
, TikaConfig config
)
248 throws TikaException
, IOException
{
250 config
.getMimeRepository().getMimeType(documentFile
).getName();
251 return getStringContent(documentFile
, config
, mime
);