libjava/gnu/xml/pipeline/LinkFilter.java

   1 /* LinkFilter.java --
   2    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19 02111-1307 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 package gnu.xml.pipeline;
  39
  40 import java.io.IOException;
  41 import java.net.URL;
  42 import java.util.Enumeration;
  43 import java.util.Vector;
  44
  45 import org.xml.sax.Attributes;
  46 import org.xml.sax.Locator;
  47 import org.xml.sax.SAXException;
  48
  49
  50 /**
  51  * Pipeline filter to remember XHTML links found in a document,
  52  * so they can later be crawled.  Fragments are not counted, and duplicates
  53  * are ignored.  Callers are responsible for filtering out URLs they aren't
  54  * interested in.  Events are passed through unmodified.
  55  *
  56  * <p> Input MUST include a setDocumentLocator() call, as it's used to
  57  * resolve relative links in the absence of a "base" element.  Input MUST
  58  * also include namespace identifiers, since it is the XHTML namespace
  59  * identifier which is used to identify the relevant elements.
  60  *
  61  * <p><em>FIXME:</em> handle xml:base attribute ... in association with
  62  * a stack of base URIs.  Similarly, recognize/support XLink data.
  63  *
  64  * @author David Brownell
  65  */
  66 public class LinkFilter extends EventFilter
  67 {
  68     // for storing URIs
  69     private Vector              vector = new Vector ();
  70
  71         // struct for "full" link record (tbd)
  72         // these for troubleshooting original source:
  73         //      original uri
  74         //      uri as resolved (base, relative, etc)
  75         //      URI of originating doc
  76         //      line #
  77         //      original element + attrs (img src, desc, etc)
  78
  79         // XLink model of the link ... for inter-site pairups ?
  80
  81     private String              baseURI;
  82
  83     private boolean             siteRestricted = false;
  84
  85     //
  86     // XXX leverage blacklist info (like robots.txt)
  87     //
  88     // XXX constructor w/param ... pipeline for sending link data
  89     // probably XHTML --> XLink, providing info as sketched above
  90     //
  91
  92
  93     /**
  94      * Constructs a new event filter, which collects links in private data
  95      * structure for later enumeration.
  96      */
  97         // constructor used by PipelineFactory
  98     public LinkFilter ()
  99     {
 100         super.setContentHandler (this);
 101     }
 102
 103
 104     /**
 105      * Constructs a new event filter, which collects links in private data
 106      * structure for later enumeration and passes all events, unmodified,
 107      * to the next consumer.
 108      */
 109         // constructor used by PipelineFactory
 110     public LinkFilter (EventConsumer next)
 111     {
 112         super (next);
 113         super.setContentHandler (this);
 114     }
 115
 116
 117     /**
 118      * Returns an enumeration of the links found since the filter
 119      * was constructed, or since removeAllLinks() was called.
 120      *
 121      * @return enumeration of strings.
 122      */
 123     public Enumeration getLinks ()
 124     {
 125         return vector.elements ();
 126     }
 127
 128     /**
 129      * Removes records about all links reported to the event
 130      * stream, as if the filter were newly created.
 131      */
 132     public void removeAllLinks ()
 133     {
 134         vector = new Vector ();
 135     }
 136
 137
 138     /**
 139      * Collects URIs for (X)HTML content from elements which hold them.
 140      */
 141     public void startElement (
 142         String          uri,
 143         String          localName,
 144         String          qName,
 145         Attributes      atts
 146     ) throws SAXException
 147     {
 148         String  link;
 149
 150         // Recognize XHTML links.
 151         if ("http://www.w3.org/1999/xhtml".equals (uri)) {
 152
 153             if ("a".equals (localName) || "base".equals (localName)
 154                     || "area".equals (localName))
 155                 link = atts.getValue ("href");
 156             else if ("iframe".equals (localName) || "frame".equals (localName))
 157                 link = atts.getValue ("src");
 158             else if ("blockquote".equals (localName) || "q".equals (localName)
 159                     || "ins".equals (localName) || "del".equals (localName))
 160                 link = atts.getValue ("cite");
 161             else
 162                 link = null;
 163             link = maybeAddLink (link);
 164
 165             // "base" modifies designated baseURI
 166             if ("base".equals (localName) && link != null)
 167                 baseURI = link;
 168
 169             if ("iframe".equals (localName) || "img".equals (localName))
 170                 maybeAddLink (atts.getValue ("longdesc"));
 171         }
 172
 173         super.startElement (uri, localName, qName, atts);
 174     }
 175
 176     private String maybeAddLink (String link)
 177     {
 178         int             index;
 179
 180         // ignore empty links and fragments inside docs
 181         if (link == null)
 182             return null;
 183         if ((index = link.indexOf ("#")) >= 0)
 184             link = link.substring (0, index);
 185         if (link.equals (""))
 186             return null;
 187
 188         try {
 189             // get the real URI
 190             URL         base = new URL ((baseURI != null)
 191                                     ? baseURI
 192                                     : getDocumentLocator ().getSystemId ());
 193             URL         url = new URL (base, link);
 194
 195             link = url.toString ();
 196
 197             // ignore duplicates
 198             if (vector.contains (link))
 199                 return link;
 200
 201             // other than what "base" does, stick to original site:
 202             if (siteRestricted) {
 203                 // don't switch protocols
 204                 if (!base.getProtocol ().equals (url.getProtocol ()))
 205                     return link;
 206                 // don't switch servers
 207                 if (base.getHost () != null
 208                         && !base.getHost ().equals (url.getHost ()))
 209                     return link;
 210             }
 211
 212             vector.addElement (link);
 213
 214             return link;
 215
 216         } catch (IOException e) {
 217             // bad URLs we don't want
 218         }
 219         return null;
 220     }
 221
 222     /**
 223      * Reports an error if no Locator has been made available.
 224      */
 225     public void startDocument ()
 226     throws SAXException
 227     {
 228         if (getDocumentLocator () == null)
 229             throw new SAXException ("no Locator!");
 230     }
 231
 232     /**
 233      * Forgets about any base URI information that may be recorded.
 234      * Applications will often want to call removeAllLinks(), likely
 235      * after examining the links which were reported.
 236      */
 237     public void endDocument ()
 238     throws SAXException
 239     {
 240         baseURI = null;
 241         super.endDocument ();
 242     }
 243 }