2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org
.apache
.tika
.parser
;
19 import java
.io
.IOException
;
20 import java
.io
.InputStream
;
22 import org
.apache
.tika
.exception
.TikaException
;
23 import org
.apache
.tika
.metadata
.Metadata
;
24 import org
.apache
.tika
.sax
.BodyContentHandler
;
25 import org
.apache
.tika
.sax
.TeeContentHandler
;
26 import org
.apache
.tika
.utils
.RegexUtils
;
27 import org
.xml
.sax
.ContentHandler
;
28 import org
.xml
.sax
.SAXException
;
31 * Parser decorator that post-processes the results from a decorated parser.
32 * The post-processing sets the "fulltext" and "summary" (at most 500 characters)
33 * metadata entries and adds an "outlinks" entry for each link extracted from
34 * the full text content returned by the decorated parser. Exceptions thrown
35 * by the decorated parser are not caught here; they propagate to the caller.
37 public class ParserPostProcessor
extends ParserDecorator
{
40 * Creates a post-processing decorator for the given parser.
42 * @param parser the parser to be decorated
44 public ParserPostProcessor(Parser parser
) {
49 * Forwards the call to the delegated parser and post-processes the
50 * results as described above.
53 InputStream stream
, ContentHandler handler
, Metadata metadata
)
54 throws IOException
, SAXException
, TikaException
{
55 ContentHandler body
= new BodyContentHandler();
56 super.parse(stream
, new TeeContentHandler(handler
, body
), metadata
);
58 String content
= body
.toString();
59 metadata
.set("fulltext", content
);
61 int length
= Math
.min(content
.length(), 500);
62 metadata
.set("summary", content
.substring(0, length
));
64 for (String link
: RegexUtils
.extractLinks(content
)) {
65 metadata
.add("outlinks", link
);