TIKA-113: Metadata (such as title) should not be part of content
[tika.git] / src / test / java / org / apache / tika / parser / html / HtmlParserTest.java
blob11c7cccae34082edb214c5fdcb02566356cae738
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.parser.html;
19 import java.io.ByteArrayInputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.StringWriter;
24 import junit.framework.TestCase;
26 import org.apache.tika.exception.TikaException;
27 import org.apache.tika.metadata.Metadata;
28 import org.apache.tika.parser.Parser;
29 import org.apache.tika.sax.BodyContentHandler;
30 import org.apache.tika.sax.TeeContentHandler;
31 import org.xml.sax.Attributes;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.SAXException;
34 import org.xml.sax.helpers.DefaultHandler;
36 public class HtmlParserTest extends TestCase {
38 private Parser parser = new HtmlParser();
40 private static InputStream getStream(String name) {
41 return Thread.currentThread().getContextClassLoader()
42 .getResourceAsStream(name);
45 public void testParseAscii() throws Exception {
46 final StringWriter href = new StringWriter();
48 ContentHandler body = new BodyContentHandler();
49 ContentHandler link = new DefaultHandler() {
50 @Override
51 public void startElement(
52 String u, String l, String n, Attributes a)
53 throws SAXException {
54 if ("a".equals(l)) {
55 href.append(a.getValue("href"));
59 Metadata metadata = new Metadata();
60 InputStream stream = getStream("test-documents/testHTML.html");
61 try {
62 parser.parse(stream, new TeeContentHandler(body, link), metadata);
63 } finally {
64 stream.close();
67 assertEquals(
68 "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
69 assertEquals("http://www.apache.org/", href.toString());
71 String content = body.toString();
72 assertTrue(
73 "Did not contain expected text:" + "Test Indexation Html",
74 content.contains("Test Indexation Html"));
75 assertTrue(
76 "Did not contain expected text:" + "Indexation du fichier",
77 content.contains("Indexation du fichier"));
81 public void XtestParseUTF8() throws IOException, SAXException, TikaException {
82 ContentHandler handler = new BodyContentHandler();
83 Metadata metadata = new Metadata();
85 parser.parse(
86 getStream("test-documents/testHTML_utf8.html"),
87 handler, metadata);
88 String content = handler.toString();
90 assertTrue("Did not contain expected text:"
91 + "Title : Tilte with UTF-8 chars öäå", content
92 .contains("Title : Tilte with UTF-8 chars öäå"));
94 assertTrue("Did not contain expected text:"
95 + "Content with UTF-8 chars", content
96 .contains("Content with UTF-8 chars"));
98 assertTrue("Did not contain expected text:" + "åäö", content
99 .contains("åäö"));
103 public void testParseEmpty() throws Exception {
104 Metadata metadata = new Metadata();
105 StringWriter writer = new StringWriter();
106 parser.parse(
107 new ByteArrayInputStream(new byte[0]),
108 new BodyContentHandler(writer), metadata);
109 String content = writer.toString();
110 assertEquals("", content);