TIKA-113: Metadata (such as title) should not be part of content
[tika.git] / src / main / java / org / apache / tika / parser / ParserPostProcessor.java
blob18dd7123053ed8b08af11f21937c0f946727717a
1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
17 package org.apache.tika.parser;
19 import java.io.IOException;
20 import java.io.InputStream;
22 import org.apache.tika.exception.TikaException;
23 import org.apache.tika.metadata.Metadata;
24 import org.apache.tika.sax.BodyContentHandler;
25 import org.apache.tika.sax.TeeContentHandler;
26 import org.apache.tika.utils.RegexUtils;
27 import org.xml.sax.ContentHandler;
28 import org.xml.sax.SAXException;
30 /**
31 * Parser decorator that post-processes the results from a decorated parser.
32 * The post-processing takes care of filling in any "fulltext", "summary", and
33 * regexp {@link Content} objects with the full text content returned by
34 * the decorated parser. The post-processing also catches and logs any
35 * exceptions thrown by the decorated parser.
37 public class ParserPostProcessor extends ParserDecorator {
39 /**
40 * Creates a post-processing decorator for the given parser.
42 * @param parser the parser to be decorated
44 public ParserPostProcessor(Parser parser) {
45 super(parser);
48 /**
49 * Forwards the call to the delegated parser and post-processes the
50 * results as described above.
52 public void parse(
53 InputStream stream, ContentHandler handler, Metadata metadata)
54 throws IOException, SAXException, TikaException {
55 ContentHandler body = new BodyContentHandler();
56 super.parse(stream, new TeeContentHandler(handler, body), metadata);
58 String content = body.toString();
59 metadata.set("fulltext", content);
61 int length = Math.min(content.length(), 500);
62 metadata.set("summary", content.substring(0, length));
64 for (String link : RegexUtils.extractLinks(content)) {
65 metadata.add("outlinks", link);