TIKA-113: Metadata (such as title) should not be part of content
[tika.git] / src / test / java / org / apache / tika / parser / microsoft / WordParserTest.java
blob 7e702395019c5b619ddfc5aed9f93d8c9b1273b6
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.parser.microsoft;
19 import java.io.InputStream;
21 import org.apache.tika.metadata.Metadata;
22 import org.apache.tika.sax.BodyContentHandler;
23 import org.xml.sax.ContentHandler;
25 import junit.framework.TestCase;
27 public class WordParserTest extends TestCase {
29 public void testWordParser() throws Exception {
30 InputStream input = WordParserTest.class.getResourceAsStream(
31 "/test-documents/testWORD.doc");
32 try {
33 ContentHandler handler = new BodyContentHandler();
34 Metadata metadata = new Metadata();
35 new OfficeParser().parse(input, handler, metadata);
37 assertEquals(
38 "application/msword",
39 metadata.get(Metadata.CONTENT_TYPE));
40 assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
41 assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
42 assertTrue(handler.toString().contains("Sample Word Document"));
43 } finally {
44 input.close();