/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 package org
.apache
.tika
.parser
.html
;
19 import java
.io
.ByteArrayInputStream
;
20 import java
.io
.IOException
;
21 import java
.io
.InputStream
;
22 import java
.io
.StringWriter
;
24 import junit
.framework
.TestCase
;
26 import org
.apache
.tika
.exception
.TikaException
;
27 import org
.apache
.tika
.metadata
.Metadata
;
28 import org
.apache
.tika
.parser
.Parser
;
29 import org
.apache
.tika
.sax
.BodyContentHandler
;
30 import org
.apache
.tika
.sax
.TeeContentHandler
;
31 import org
.xml
.sax
.Attributes
;
32 import org
.xml
.sax
.ContentHandler
;
33 import org
.xml
.sax
.SAXException
;
34 import org
.xml
.sax
.helpers
.DefaultHandler
;
36 public class HtmlParserTest
extends TestCase
{
38 private Parser parser
= new HtmlParser();
40 private static InputStream
getStream(String name
) {
41 return Thread
.currentThread().getContextClassLoader()
42 .getResourceAsStream(name
);
45 public void testParseAscii() throws Exception
{
46 final StringWriter href
= new StringWriter();
48 ContentHandler body
= new BodyContentHandler();
49 ContentHandler link
= new DefaultHandler() {
51 public void startElement(
52 String u
, String l
, String n
, Attributes a
)
55 href
.append(a
.getValue("href"));
59 Metadata metadata
= new Metadata();
60 InputStream stream
= getStream("test-documents/testHTML.html");
62 parser
.parse(stream
, new TeeContentHandler(body
, link
), metadata
);
68 "Title : Test Indexation Html", metadata
.get(Metadata
.TITLE
));
69 assertEquals("http://www.apache.org/", href
.toString());
71 String content
= body
.toString();
73 "Did not contain expected text:" + "Test Indexation Html",
74 content
.contains("Test Indexation Html"));
76 "Did not contain expected text:" + "Indexation du fichier",
77 content
.contains("Indexation du fichier"));
81 public void XtestParseUTF8() throws IOException
, SAXException
, TikaException
{
82 ContentHandler handler
= new BodyContentHandler();
83 Metadata metadata
= new Metadata();
86 getStream("test-documents/testHTML_utf8.html"),
88 String content
= handler
.toString();
90 assertTrue("Did not contain expected text:"
91 + "Title : Tilte with UTF-8 chars öäå", content
92 .contains("Title : Tilte with UTF-8 chars öäå"));
94 assertTrue("Did not contain expected text:"
95 + "Content with UTF-8 chars", content
96 .contains("Content with UTF-8 chars"));
98 assertTrue("Did not contain expected text:" + "åäö", content
103 public void testParseEmpty() throws Exception
{
104 Metadata metadata
= new Metadata();
105 StringWriter writer
= new StringWriter();
107 new ByteArrayInputStream(new byte[0]),
108 new BodyContentHandler(writer
), metadata
);
109 String content
= writer
.toString();
110 assertEquals("", content
);