maintenance/check_URLs.pl

   1 #!/usr/bin/perl -w
   2
   3 =head1 NAME
   4
   5 check_URLs.pl - validate URLs located in module code and POD
   6
   7 =head1 SYNOPSIS
   8
   9 B<check_URLs.pl> [B<-d|--dir> path] [B<-v|--verbose>] [B<-?|-h|--help>]
  10
  11 =head1 DESCRIPTION
  12
  13 Checks code and POD of all bioperl-live modules for URLs, and validates them.
  14
  15 Output is a series of lines containing two fields, tab separated.
  16 The first field is the file with the bad URL, the second is the URL itself.
  17
  18 The whole URL is not retrieved, only a HTTP "HEAD" request is done
  19 to see if the URL exists on the server. The request is done using
  20 B<LWP::Simple> so the B<http_proxy> environmental variable will be
  21 honoured.
  22
  23 The URL parsing may not be perfect - although I use the B<Regexp::Common::URI>
  24 module, I have to manually clean up some URLs which are embedded in Perl
  25 strings to convert the matched URL to a more probable real world URL,
  26 e.g. most URLs don\'t end in "'," or ")" :-)
  27
  28 =cut
  29
  30 use strict;
  31 use Data::Dumper;
  32 use File::Find;
  33 use Getopt::Long;
  34 use Regexp::Common qw(URI);
  35 use LWP::Simple;
  36
  37 #
  38 # command line options
  39 #
  40
  41 my ($verbose, $dir, $help) = (0, '../Bio/', undef);
  42 GetOptions(
  43     'v|verbose' => \$verbose,
  44     'd|dir:s' => \$dir,
  45     'h|help|?' => sub{ exec('perldoc',$0); exit(0) }
  46 );
  47
  48 #
  49 # globals
  50 #
  51
  52 my %URL;
  53
  54 #
  55 # find all modules
  56 #
  57
  58 find( \&find_modules, $dir );
  59
  60 #
  61 # validate unique URLs and print fail cases to stdout
  62 #
  63
  64 for my $url (keys %URL) {
  65     print STDERR "Checking $url ... ";
  66     my $ok = head($url);
  67     print STDERR ($ok ? 'ok' : 'FAIL!'), "\n";
  68     if (not $ok) {
  69          for my $file (@{ $URL{$url} }) {
  70              print "$file\t$url\n";
  71          }
  72     }
  73 }
  74
  75 print STDERR Dumper(\%URL) if $verbose;
  76
  77 #
  78 # this is where the action is
  79 #
  80
  81 sub find_modules {
  82     # only want files with .pm
  83     return unless m/\.pm$/;
  84     return unless -f $_;
  85
  86     my $fname = $_;
  87     print STDERR "$fname\n" if $verbose;
  88
  89     # slurp in the file
  90     my $text = do { local( @ARGV, $/ ) = $fname ; <> } ;
  91
  92     # keep track of URLs
  93     while ($text =~ m/$RE{URI}{HTTP}{-keep}/g) {
  94         my $url = $1 or next;
  95         # remove Perl code if URL was embedded in string and other stuff
  96         $url =~ s/\s*[.,;'")]*\s*$//;
  97         print STDERR "$url\n" if $verbose;
  98         push @{ $URL{$url} } , $File::Find::name;
  99     }
 100 }
 101
 102
 103 =head1 OPTIONS
 104
 105 =over 3
 106
 107 =item B<-d | --dir> path
 108
 109 Overides the default directory to recursively look for .pm file
 110 (Default is '../Bio')
 111
 112 =item B<-v | --verbose>
 113
 114 Show the progress through files during the POD checking.
 115
 116 =item B<-? | -h  | --help>
 117
 118 This help text.
 119
 120 =back
 121
 122 =head1 FEEDBACK
 123
 124 =head2 Mailing Lists
 125
 126 User feedback is an integral part of the evolution of this and other
 127 Bioperl modules. Send your comments and suggestions preferably to
 128 the Bioperl mailing list.  Your participation is much appreciated.
 129
 130   bioperl-l@bioperl.org                  - General discussion
 131   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
 132
 133 =head2 Reporting Bugs
 134
 135 Report bugs to the Bioperl bug tracking system to help us keep track
 136 of the bugs and their resolution. Bug reports can be submitted via the
 137 web:
 138
 139   http://bugzilla.open-bio.org/
 140
 141 =head1 AUTHOR - Torsten Seemann
 142
 143 Email: torsten-dot-seemann-at-infotech-dot-monash-dot-edu-dot-au
 144
 145 =cut