/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.tika
;
20 import java
.io
.FileInputStream
;
21 import java
.io
.IOException
;
22 import java
.io
.InputStream
;
23 import java
.util
.Collection
;
24 import java
.util
.List
;
26 import junit
.framework
.TestCase
;
28 import org
.apache
.tika
.config
.Content
;
29 import org
.apache
.tika
.config
.ParserConfig
;
30 import org
.apache
.tika
.config
.TikaConfig
;
31 import org
.apache
.tika
.metadata
.Metadata
;
32 import org
.apache
.tika
.parser
.Parser
;
33 import org
.apache
.tika
.parser
.ParserFactory
;
34 import org
.apache
.tika
.utils
.ParseUtils
;
35 import org
.apache
.tika
.utils
.Utils
;
36 import org
.jdom
.JDOMException
;
/** JUnit test class for Tika {@link Parser}s. */
41 public class TestParsers
extends TestCase
{
/** Tika configuration built in {@code setUp()} and passed to every extraction call. */
private TikaConfig tc;

/** Directory against which {@code getTestFile(String)} resolves test document names. */
private File testFilesBaseDir;
47 public void setUp() throws JDOMException
, IOException
{
49 * FIXME the old mechanism does not work anymore when running the tests
50 * with Maven - need a resource-based one, but this means more changes
51 * to classes which rely on filenames.
53 * String sep = File.separator; StringTokenizer st = new
54 * StringTokenizer(System.getProperty( "java.class.path"),
55 * File.pathSeparator);
57 * classDir = new File(st.nextToken());
59 * config = classDir.getParent() + sep + "config" + sep + "config.xml";
61 * String log4j = classDir.getParent() + sep + "Config" + sep + "log4j" +
62 * sep + "log4j.properties";
65 // FIXME for now, fix filenames according to Maven testing layout
66 // The file below should be the default configuration for the test of
67 // getDefaultConfig() to be legitimate.
68 final String tikaConfigFilename
= "target/classes/org/apache/tika/tika-config.xml";
70 testFilesBaseDir
= new File("src/test/resources/test-documents");
72 tc
= new TikaConfig(tikaConfigFilename
);
75 public void testPDFExtraction() throws Exception
{
76 File file
= getTestFile("testPDF.pdf");
77 String s1
= ParseUtils
.getStringContent(file
, tc
);
78 String s2
= ParseUtils
.getStringContent(file
, tc
, "application/pdf");
79 String s3
= ParseUtils
.getStringContent(file
, TikaConfig
85 public void testTXTExtraction() throws Exception
{
86 File file
= getTestFile("testTXT.txt");
87 String s1
= ParseUtils
.getStringContent(file
, tc
);
88 String s2
= ParseUtils
.getStringContent(file
, tc
, "text/plain");
92 public void testRTFExtraction() throws Exception
{
93 File file
= getTestFile("testRTF.rtf");
94 String s1
= ParseUtils
.getStringContent(file
, tc
);
95 String s2
= ParseUtils
.getStringContent(file
, tc
, "application/rtf");
99 public void testXMLExtraction() throws Exception
{
100 File file
= getTestFile("testXML.xml");
101 String s1
= ParseUtils
.getStringContent(file
, tc
);
102 String s2
= ParseUtils
.getStringContent(file
, tc
, "application/xml");
103 assertEquals(s1
, s2
);
106 public void testPPTExtraction() throws Exception
{
107 File file
= getTestFile("testPPT.ppt");
108 String s1
= ParseUtils
.getStringContent(file
, tc
);
109 String s2
= ParseUtils
.getStringContent(
110 file
, tc
, "application/vnd.ms-powerpoint");
111 assertEquals(s1
, s2
);
112 ParserConfig config
=
113 tc
.getParserConfig("application/vnd.ms-powerpoint");
114 Parser parser
= ParserFactory
.getParser(config
);
115 Collection
<Content
> contents
= config
.getContents();
116 assertNotNull(contents
);
117 Metadata metadata
= new Metadata();
118 InputStream stream
= new FileInputStream(file
);
120 parser
.parse(stream
, contents
, metadata
);
124 assertEquals("Sample Powerpoint Slide", metadata
.get(Metadata
.TITLE
));
127 public void testWORDxtraction() throws Exception
{
128 File file
= getTestFile("testWORD.doc");
129 String s1
= ParseUtils
.getStringContent(file
, tc
);
130 String s2
= ParseUtils
.getStringContent(file
, tc
, "application/msword");
131 assertEquals(s1
, s2
);
132 ParserConfig config
= tc
.getParserConfig("application/msword");
133 Parser parser
= ParserFactory
.getParser(config
);
134 Collection
<Content
> contents
= config
.getContents();
135 assertNotNull(contents
);
136 Metadata metadata
= new Metadata();
137 InputStream stream
= new FileInputStream(file
);
139 parser
.parse(stream
, contents
, metadata
);
143 assertEquals("Sample Word Document", metadata
.get(Metadata
.TITLE
));
146 public void testEXCELExtraction() throws Exception
{
147 final String expected
= "Numbers and their Squares Number Square 1.0 "
148 + "1.0 2.0 4.0 3.0 9.0 4.0 16.0 5.0 25.0 6.0 36.0 7.0 49.0 8.0 "
149 + "64.0 9.0 81.0 10.0 100.0 11.0 121.0 12.0 144.0 13.0 169.0 "
150 + "14.0 196.0 15.0 225.0 Written and saved in Microsoft Excel "
151 + "X for Mac Service Release 1.";
152 File file
= getTestFile("testEXCEL.xls");
153 String s1
= ParseUtils
.getStringContent(file
, tc
);
154 String s2
= ParseUtils
.getStringContent(file
, tc
,
155 "application/vnd.ms-excel");
156 assertEquals(s1
, s2
);
157 assertTrue("Text does not contain '" + expected
+ "'", s1
158 .contains(expected
));
159 ParserConfig config
= tc
.getParserConfig("application/vnd.ms-excel");
160 Parser parser
= ParserFactory
.getParser(config
);
161 Collection
<Content
> contents
= config
.getContents();
162 assertNotNull(contents
);
163 Metadata metadata
= new Metadata();
164 InputStream stream
= new FileInputStream(file
);
166 parser
.parse(stream
, contents
, metadata
);
170 assertEquals("Simple Excel document", metadata
.get(Metadata
.TITLE
));
173 public void testOOExtraction() throws Exception
{
174 File file
= getTestFile("testOpenOffice2.odt");
175 String s1
= ParseUtils
.getStringContent(file
, tc
);
176 String s2
= ParseUtils
.getStringContent(file
, tc
,
177 "application/vnd.oasis.opendocument.text");
178 assertEquals(s1
, s2
);
181 public void testHTMLExtraction() throws Exception
{
182 File file
= getTestFile("testHTML.html");
183 String s1
= ParseUtils
.getStringContent(file
, tc
);
184 String s2
= ParseUtils
.getStringContent(file
, tc
, "text/html");
185 assertEquals(s1
, s2
);
187 ParserConfig config
= tc
.getParserConfig("text/html");
188 Parser parser
= ParserFactory
.getParser(config
);
189 assertNotNull(parser
);
191 Collection
<Content
> contents
= config
.getContents();
192 assertNotNull(contents
);
193 Metadata metadata
= new Metadata();
194 InputStream stream
= new FileInputStream(file
);
196 parser
.parse(stream
, contents
, metadata
);
200 assertEquals("Title : Test Indexation Html", metadata
.get(Metadata
.TITLE
));
202 final String text
= metadata
.toString();
203 final String expected
= "Test Indexation Html";
204 assertTrue("text contains '" + expected
+ "'", text
.contains(expected
));
207 public void testZipExtraction() throws Exception
{
208 File zip
= getTestFile("test-documents.zip");
209 List
<Parser
> parsers
= ParseUtils
.getParsersFromZip(zip
, tc
);
210 List
<File
> zipFiles
= Utils
.unzip(new FileInputStream(zip
));
211 for (int i
= 0; i
< parsers
.size(); i
++) {
212 Parser zipEntryParser
= parsers
.get(i
);
213 assertNotNull(zipEntryParser
);
214 for (int j
= 0; j
< zipFiles
.size(); j
++) {
215 /* FIXME: Doesn't work with the new Parser interface
216 ParserConfig config = tc.getParserConfig(
217 zipEntryParser.getMimeType());
218 Map<String, Content> contents = config.getContents();
219 assertNotNull(contents);
220 InputStream stream = new FileInputStream(zipFiles.get(j));
222 zipEntryParser.getContents(stream, contents);
223 assertNotNull(contents.get("fullText"));
232 private File
getTestFile(String filename
) {
233 return new File(testFilesBaseDir
, filename
);