TIKA-71 - Remove ParserConfig and ParserFactory
[tika.git] / src / main / java / org / apache / tika / utils / ParseUtils.java
blob18e93d33eb0f6c61bfa00261552b508e33ce510f
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.utils;
19 //JDK imports
20 import java.io.BufferedInputStream;
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.FileNotFoundException;
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.StringWriter;
27 import java.net.URL;
28 import java.util.ArrayList;
29 import java.util.List;
31 import org.apache.tika.config.TikaConfig;
32 import org.apache.tika.exception.TikaException;
33 import org.apache.tika.metadata.Metadata;
34 import org.apache.tika.metadata.TikaMimeKeys;
35 import org.apache.tika.parser.Parser;
36 import org.apache.tika.parser.WriteOutContentHandler;
37 import org.xml.sax.SAXException;
39 /**
40 * Contains utility methods for parsing documents. Intended to provide simple
41 * entry points into the Tika framework.
43 public class ParseUtils implements TikaMimeKeys {
45 /**
46 * Returns a parser that can handle the specified MIME type, and is set to
47 * receive input from a stream opened from the specified URL. NB: Close the
48 * input stream when it is no longer needed!
50 * @param config
51 * @param mimeType
52 * the document's MIME type
53 * @return a parser appropriate to this MIME type
54 * @throws TikaException
56 public static Parser getParser(String mimeType, TikaConfig config)
57 throws TikaException {
58 return config.getParser(mimeType);
61 /**
62 * Returns a parser that can handle the specified MIME type, and is set to
63 * receive input from a stream opened from the specified URL. The MIME type
64 * is determined automatically. NB: Close the input stream when it is no
65 * longer needed!
67 * @param documentUrl
68 * URL pointing to the document to parse
69 * @param config
70 * @return a parser appropriate to this MIME type and ready to read input
71 * from the specified document
72 * @throws TikaException
74 public static Parser getParser(URL documentUrl, TikaConfig config)
75 throws TikaException {
76 String mimetype = config.getMimeRepository().getMimeType(documentUrl)
77 .getName();
78 return getParser(mimetype, config);
81 /**
82 * Returns a parser that can handle the specified MIME type, and is set to
83 * receive input from a stream opened from the specified URL. NB: Close the
84 * input stream when it is no longer needed!
86 * @param documentFile
87 * File object pointing to the document to parse
88 * @param config
89 * @return a parser appropriate to this MIME type and ready to read input
90 * from the specified document
91 * @throws TikaException
93 public static Parser getParser(File documentFile, TikaConfig config)
94 throws TikaException {
95 String mimetype = config.getMimeRepository().getMimeType(documentFile)
96 .getName();
97 return getParser(mimetype, config);
101 * Returns a list of parsers from zip InputStream
103 * @param zip
104 * InputStream
105 * @param config
106 * @return a list of parsers from zip file
107 * @throws TikaException
109 private static List<Parser> getParsersFromZip(InputStream zipIs,
110 TikaConfig config) throws TikaException {
111 List<Parser> parsers = new ArrayList<Parser>();
112 List<File> zipFiles = Utils.unzip(zipIs);
113 for (int i = 0; i < zipFiles.size(); i++) {
114 File zipEntry = zipFiles.get(i);
115 parsers.add(getParser(zipEntry, config));
117 return parsers;
121 * Returns a list of parsers from zip File
123 * @param zip
124 * File
125 * @param config
126 * @return a list of parsers from zip file
127 * @throws TikaException
128 * @throws FileNotFoundException
130 public static List<Parser> getParsersFromZip(File zip, TikaConfig config)
131 throws TikaException, FileNotFoundException {
132 String zipMimeType = config.getMimeRepository().getMimeType(zip)
133 .getName();
134 if (!zipMimeType.equalsIgnoreCase("application/zip")) {
135 throw new TikaException("The file you are using is note a zip file");
137 return getParsersFromZip(new FileInputStream(zip), config);
141 * Returns a list of parsers from URL
143 * @param URL
144 * @param config
145 * @return a list of parsers from zip file
146 * @throws TikaException
147 * @throws IOException
149 public static List<Parser> getParsersFromZip(URL zip, TikaConfig config)
150 throws TikaException, IOException {
151 String zipMimeType = config.getMimeRepository().getMimeType(zip)
152 .getName();
153 if (!zipMimeType.equalsIgnoreCase("application/zip")) {
154 throw new TikaException("The file you are using is note a zip file");
156 return getParsersFromZip(zip.openStream(), config);
160 * Gets the string content of a document read from an input stream.
162 * @param stream the stream from which to read document data
163 * @param config
164 * @param mimeType MIME type of the data
165 * @return the string content parsed from the document
167 public static String getStringContent(
168 InputStream stream, TikaConfig config, String mimeType)
169 throws TikaException, IOException {
170 try {
171 Parser parser = config.getParser(mimeType);
172 StringWriter writer = new StringWriter();
173 parser.parse(
174 stream, new WriteOutContentHandler(writer), new Metadata());
175 return writer.toString();
176 } catch (SAXException e) {
177 throw new TikaException("Unexpected SAX error", e);
182 * Gets the string content of a document read from an input stream.
184 * @param documentUrl
185 * URL pointing to the document to parse
186 * @param config
187 * @return the string content parsed from the document
189 public static String getStringContent(URL documentUrl, TikaConfig config)
190 throws TikaException, IOException {
191 String mime = config.getMimeRepository().getMimeType(documentUrl)
192 .getName();
193 return getStringContent(documentUrl, config, mime);
197 * Gets the string content of a document read from an input stream.
199 * @param documentUrl
200 * URL pointing to the document to parse
201 * @param config
202 * @param mimeType
203 * MIME type of the data
204 * @return the string content parsed from the document
206 public static String getStringContent(
207 URL documentUrl, TikaConfig config, String mimeType)
208 throws TikaException, IOException {
209 InputStream stream = documentUrl.openStream();
210 try {
211 return getStringContent(stream, config, mimeType);
212 } finally {
213 stream.close();
218 * Gets the string content of a document read from an input stream.
220 * @param documentFile
221 * File object pointing to the document to parse
222 * @param config
223 * @param mimeType
224 * MIME type of the data
225 * @return the string content parsed from the document
227 public static String getStringContent(
228 File documentFile, TikaConfig config, String mimeType)
229 throws TikaException, IOException {
230 InputStream stream = new BufferedInputStream(new FileInputStream(
231 documentFile));
232 try {
233 return getStringContent(stream, config, mimeType);
234 } finally {
235 stream.close();
240 * Gets the string content of a document read from an input stream.
242 * @param documentFile
243 * File object pointing to the document to parse
244 * @param config
245 * @return the string content parsed from the document
247 public static String getStringContent(File documentFile, TikaConfig config)
248 throws TikaException, IOException {
249 String mime =
250 config.getMimeRepository().getMimeType(documentFile).getName();
251 return getStringContent(documentFile, config, mime);