libjava/gnu/xml/pipeline/LinkFilter.java

   1 /* LinkFilter.java --
   2    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19 02111-1307 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 package gnu.xml.pipeline;
  39
  40 import java.io.IOException;
  41 import java.net.URL;
  42 import java.util.Enumeration;
  43 import java.util.Vector;
  44
  45 import org.xml.sax.Attributes;
  46 import org.xml.sax.SAXException;
  47
  48
  49 /**
  50  * Pipeline filter to remember XHTML links found in a document,
  51  * so they can later be crawled.  Fragments are not counted, and duplicates
  52  * are ignored.  Callers are responsible for filtering out URLs they aren't
  53  * interested in.  Events are passed through unmodified.
  54  *
  55  * <p> Input MUST include a setDocumentLocator() call, as it's used to
  56  * resolve relative links in the absence of a "base" element.  Input MUST
  57  * also include namespace identifiers, since it is the XHTML namespace
  58  * identifier which is used to identify the relevant elements.
  59  *
  60  * <p><em>FIXME:</em> handle xml:base attribute ... in association with
  61  * a stack of base URIs.  Similarly, recognize/support XLink data.
  62  *
  63  * @author David Brownell
  64  */
  65 public class LinkFilter extends EventFilter
  66 {
  67     // for storing URIs
  68     private Vector              vector = new Vector ();
  69
  70         // struct for "full" link record (tbd)
  71         // these for troubleshooting original source:
  72         //      original uri
  73         //      uri as resolved (base, relative, etc)
  74         //      URI of originating doc
  75         //      line #
  76         //      original element + attrs (img src, desc, etc)
  77
  78         // XLink model of the link ... for inter-site pairups ?
  79
  80     private String              baseURI;
  81
  82     private boolean             siteRestricted = false;
  83
  84     //
  85     // XXX leverage blacklist info (like robots.txt)
  86     //
  87     // XXX constructor w/param ... pipeline for sending link data
  88     // probably XHTML --> XLink, providing info as sketched above
  89     //
  90
  91
  92     /**
  93      * Constructs a new event filter, which collects links in private data
  94      * structure for later enumeration.
  95      */
  96         // constructor used by PipelineFactory
  97     public LinkFilter ()
  98     {
  99         super.setContentHandler (this);
 100     }
 101
 102
 103     /**
 104      * Constructs a new event filter, which collects links in private data
 105      * structure for later enumeration and passes all events, unmodified,
 106      * to the next consumer.
 107      */
 108         // constructor used by PipelineFactory
 109     public LinkFilter (EventConsumer next)
 110     {
 111         super (next);
 112         super.setContentHandler (this);
 113     }
 114
 115
 116     /**
 117      * Returns an enumeration of the links found since the filter
 118      * was constructed, or since removeAllLinks() was called.
 119      *
 120      * @return enumeration of strings.
 121      */
 122     public Enumeration getLinks ()
 123     {
 124         return vector.elements ();
 125     }
 126
 127     /**
 128      * Removes records about all links reported to the event
 129      * stream, as if the filter were newly created.
 130      */
 131     public void removeAllLinks ()
 132     {
 133         vector = new Vector ();
 134     }
 135
 136
 137     /**
 138      * Collects URIs for (X)HTML content from elements which hold them.
 139      */
 140     public void startElement (
 141         String          uri,
 142         String          localName,
 143         String          qName,
 144         Attributes      atts
 145     ) throws SAXException
 146     {
 147         String  link;
 148
 149         // Recognize XHTML links.
 150         if ("http://www.w3.org/1999/xhtml".equals (uri)) {
 151
 152             if ("a".equals (localName) || "base".equals (localName)
 153                     || "area".equals (localName))
 154                 link = atts.getValue ("href");
 155             else if ("iframe".equals (localName) || "frame".equals (localName))
 156                 link = atts.getValue ("src");
 157             else if ("blockquote".equals (localName) || "q".equals (localName)
 158                     || "ins".equals (localName) || "del".equals (localName))
 159                 link = atts.getValue ("cite");
 160             else
 161                 link = null;
 162             link = maybeAddLink (link);
 163
 164             // "base" modifies designated baseURI
 165             if ("base".equals (localName) && link != null)
 166                 baseURI = link;
 167
 168             if ("iframe".equals (localName) || "img".equals (localName))
 169                 maybeAddLink (atts.getValue ("longdesc"));
 170         }
 171
 172         super.startElement (uri, localName, qName, atts);
 173     }
 174
 175     private String maybeAddLink (String link)
 176     {
 177         int             index;
 178
 179         // ignore empty links and fragments inside docs
 180         if (link == null)
 181             return null;
 182         if ((index = link.indexOf ("#")) >= 0)
 183             link = link.substring (0, index);
 184         if (link.equals (""))
 185             return null;
 186
 187         try {
 188             // get the real URI
 189             URL         base = new URL ((baseURI != null)
 190                                     ? baseURI
 191                                     : getDocumentLocator ().getSystemId ());
 192             URL         url = new URL (base, link);
 193
 194             link = url.toString ();
 195
 196             // ignore duplicates
 197             if (vector.contains (link))
 198                 return link;
 199
 200             // other than what "base" does, stick to original site:
 201             if (siteRestricted) {
 202                 // don't switch protocols
 203                 if (!base.getProtocol ().equals (url.getProtocol ()))
 204                     return link;
 205                 // don't switch servers
 206                 if (base.getHost () != null
 207                         && !base.getHost ().equals (url.getHost ()))
 208                     return link;
 209             }
 210
 211             vector.addElement (link);
 212
 213             return link;
 214
 215         } catch (IOException e) {
 216             // bad URLs we don't want
 217         }
 218         return null;
 219     }
 220
 221     /**
 222      * Reports an error if no Locator has been made available.
 223      */
 224     public void startDocument ()
 225     throws SAXException
 226     {
 227         if (getDocumentLocator () == null)
 228             throw new SAXException ("no Locator!");
 229     }
 230
 231     /**
 232      * Forgets about any base URI information that may be recorded.
 233      * Applications will often want to call removeAllLinks(), likely
 234      * after examining the links which were reported.
 235      */
 236     public void endDocument ()
 237     throws SAXException
 238     {
 239         baseURI = null;
 240         super.endDocument ();
 241     }
 242 }