TIKA-46 - Use Metadata in Parser
[tika.git] / src / test / java / org / apache / tika / TestParsers.java
blobfc193d99d648e27d0d81abae7820701ffff5b5db
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika;
19 import java.io.File;
20 import java.io.FileInputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.Collection;
24 import java.util.List;
26 import junit.framework.TestCase;
28 import org.apache.tika.config.Content;
29 import org.apache.tika.config.ParserConfig;
30 import org.apache.tika.config.TikaConfig;
31 import org.apache.tika.metadata.Metadata;
32 import org.apache.tika.parser.Parser;
33 import org.apache.tika.parser.ParserFactory;
34 import org.apache.tika.utils.ParseUtils;
35 import org.apache.tika.utils.Utils;
36 import org.jdom.JDOMException;
38 /**
39 * JUnit test class for Tika {@link Parser}s.
41 public class TestParsers extends TestCase {
43 private TikaConfig tc;
45 private File testFilesBaseDir;
47 public void setUp() throws JDOMException, IOException {
49 * FIXME the old mechanism does not work anymore when running the tests
50 * with Maven - need a resource-based one, but this means more changes
51 * to classes which rely on filenames.
53 * String sep = File.separator; StringTokenizer st = new
54 * StringTokenizer(System.getProperty( "java.class.path"),
55 * File.pathSeparator);
57 * classDir = new File(st.nextToken());
59 * config = classDir.getParent() + sep + "config" + sep + "config.xml";
61 * String log4j = classDir.getParent() + sep + "Config" + sep + "log4j" +
62 * sep + "log4j.properties";
65 // FIXME for now, fix filenames according to Maven testing layout
66 // The file below should be the default configuration for the test of
67 // getDefaultConfig() to be legitimate.
68 final String tikaConfigFilename = "target/classes/org/apache/tika/tika-config.xml";
70 testFilesBaseDir = new File("src/test/resources/test-documents");
72 tc = new TikaConfig(tikaConfigFilename);
75 public void testPDFExtraction() throws Exception {
76 File file = getTestFile("testPDF.pdf");
77 String s1 = ParseUtils.getStringContent(file, tc);
78 String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");
79 String s3 = ParseUtils.getStringContent(file, TikaConfig
80 .getDefaultConfig());
81 assertEquals(s1, s2);
82 assertEquals(s1, s3);
85 public void testTXTExtraction() throws Exception {
86 File file = getTestFile("testTXT.txt");
87 String s1 = ParseUtils.getStringContent(file, tc);
88 String s2 = ParseUtils.getStringContent(file, tc, "text/plain");
89 assertEquals(s1, s2);
92 public void testRTFExtraction() throws Exception {
93 File file = getTestFile("testRTF.rtf");
94 String s1 = ParseUtils.getStringContent(file, tc);
95 String s2 = ParseUtils.getStringContent(file, tc, "application/rtf");
96 assertEquals(s1, s2);
99 public void testXMLExtraction() throws Exception {
100 File file = getTestFile("testXML.xml");
101 String s1 = ParseUtils.getStringContent(file, tc);
102 String s2 = ParseUtils.getStringContent(file, tc, "application/xml");
103 assertEquals(s1, s2);
106 public void testPPTExtraction() throws Exception {
107 File file = getTestFile("testPPT.ppt");
108 String s1 = ParseUtils.getStringContent(file, tc);
109 String s2 = ParseUtils.getStringContent(
110 file, tc, "application/vnd.ms-powerpoint");
111 assertEquals(s1, s2);
112 ParserConfig config =
113 tc.getParserConfig("application/vnd.ms-powerpoint");
114 Parser parser = ParserFactory.getParser(config);
115 Collection<Content> contents = config.getContents();
116 assertNotNull(contents);
117 Metadata metadata = new Metadata();
118 InputStream stream = new FileInputStream(file);
119 try {
120 parser.parse(stream, contents, metadata);
121 } finally {
122 stream.close();
124 assertEquals("Sample Powerpoint Slide", metadata.get(Metadata.TITLE));
127 public void testWORDxtraction() throws Exception {
128 File file = getTestFile("testWORD.doc");
129 String s1 = ParseUtils.getStringContent(file, tc);
130 String s2 = ParseUtils.getStringContent(file, tc, "application/msword");
131 assertEquals(s1, s2);
132 ParserConfig config = tc.getParserConfig("application/msword");
133 Parser parser = ParserFactory.getParser(config);
134 Collection<Content> contents = config.getContents();
135 assertNotNull(contents);
136 Metadata metadata = new Metadata();
137 InputStream stream = new FileInputStream(file);
138 try {
139 parser.parse(stream, contents, metadata);
140 } finally {
141 stream.close();
143 assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
146 public void testEXCELExtraction() throws Exception {
147 final String expected = "Numbers and their Squares Number Square 1.0 "
148 + "1.0 2.0 4.0 3.0 9.0 4.0 16.0 5.0 25.0 6.0 36.0 7.0 49.0 8.0 "
149 + "64.0 9.0 81.0 10.0 100.0 11.0 121.0 12.0 144.0 13.0 169.0 "
150 + "14.0 196.0 15.0 225.0 Written and saved in Microsoft Excel "
151 + "X for Mac Service Release 1.";
152 File file = getTestFile("testEXCEL.xls");
153 String s1 = ParseUtils.getStringContent(file, tc);
154 String s2 = ParseUtils.getStringContent(file, tc,
155 "application/vnd.ms-excel");
156 assertEquals(s1, s2);
157 assertTrue("Text does not contain '" + expected + "'", s1
158 .contains(expected));
159 ParserConfig config = tc.getParserConfig("application/vnd.ms-excel");
160 Parser parser = ParserFactory.getParser(config);
161 Collection<Content> contents = config.getContents();
162 assertNotNull(contents);
163 Metadata metadata = new Metadata();
164 InputStream stream = new FileInputStream(file);
165 try {
166 parser.parse(stream, contents, metadata);
167 } finally {
168 stream.close();
170 assertEquals("Simple Excel document", metadata.get(Metadata.TITLE));
173 public void testOOExtraction() throws Exception {
174 File file = getTestFile("testOpenOffice2.odt");
175 String s1 = ParseUtils.getStringContent(file, tc);
176 String s2 = ParseUtils.getStringContent(file, tc,
177 "application/vnd.oasis.opendocument.text");
178 assertEquals(s1, s2);
181 public void testHTMLExtraction() throws Exception {
182 File file = getTestFile("testHTML.html");
183 String s1 = ParseUtils.getStringContent(file, tc);
184 String s2 = ParseUtils.getStringContent(file, tc, "text/html");
185 assertEquals(s1, s2);
187 ParserConfig config = tc.getParserConfig("text/html");
188 Parser parser = ParserFactory.getParser(config);
189 assertNotNull(parser);
191 Collection<Content> contents = config.getContents();
192 assertNotNull(contents);
193 Metadata metadata = new Metadata();
194 InputStream stream = new FileInputStream(file);
195 try {
196 parser.parse(stream, contents, metadata);
197 } finally {
198 stream.close();
200 assertEquals("Title : Test Indexation Html", metadata.get(Metadata.TITLE));
202 final String text = metadata.toString();
203 final String expected = "Test Indexation Html";
204 assertTrue("text contains '" + expected + "'", text.contains(expected));
207 public void testZipExtraction() throws Exception {
208 File zip = getTestFile("test-documents.zip");
209 List<Parser> parsers = ParseUtils.getParsersFromZip(zip, tc);
210 List<File> zipFiles = Utils.unzip(new FileInputStream(zip));
211 for (int i = 0; i < parsers.size(); i++) {
212 Parser zipEntryParser = parsers.get(i);
213 assertNotNull(zipEntryParser);
214 for (int j = 0; j < zipFiles.size(); j++) {
215 /* FIXME: Doesn't work with the new Parser interface
216 ParserConfig config = tc.getParserConfig(
217 zipEntryParser.getMimeType());
218 Map<String, Content> contents = config.getContents();
219 assertNotNull(contents);
220 InputStream stream = new FileInputStream(zipFiles.get(j));
221 try {
222 zipEntryParser.getContents(stream, contents);
223 assertNotNull(contents.get("fullText"));
224 } finally {
225 stream.close();
232 private File getTestFile(String filename) {
233 return new File(testFilesBaseDir, filename);