Merge pull request #168 from bioperl/fix-remote-taxonomy-test
[bioperl-live.git] / maintenance / check_URLs.pl
blob2e33fd8bfc2478bebafc7f50c8aa7178bca5b0dd
1 #!/usr/bin/perl
3 =head1 NAME
5 check_URLs.pl - validate URLs located in module code and POD
7 =head1 SYNOPSIS
9 B<check_URLs.pl> [B<-d|--dir> path] [B<-v|--verbose>] [B<-?|-h|--help>]
10 [B<-o|--outfile> filename]
12 =head1 DESCRIPTION
14 Checks code and POD of all bioperl-live modules for URLs, and validates them.
16 Output is a series of lines containing two fields, tab separated.
17 The first field is the file with the bad URL, the second is the URL itself.
19 The whole URL is not retrieved, only a HTTP "HEAD" request is done
20 to see if the URL exists on the server. The request is done using
21 B<LWP::Simple> so the B<http_proxy> environmental variable will be
22 honoured.
24 The URL parsing may not be perfect - although I use the B<Regexp::Common::URI>
25 module, I have to manually clean up some URLs which are embedded in Perl
26 strings to convert the matched URL to a more probable real world URL,
27 e.g. most URLs don\'t end in "'," or ")" :-)
29 =cut
31 use strict;
32 use Data::Dumper;
33 use File::Find;
34 use Getopt::Long;
35 use Regexp::Common qw(URI);
36 use LWP::Simple qw($ua head);
38 $ua->timeout(15);
41 # command line options
44 my ($verbose, $dir, $help) = (0, '../Bio/', undef);
45 my $file;
46 GetOptions(
47 'v|verbose' => \$verbose,
48 'd|dir:s' => \$dir,
49 'o|outfile:s' => \$file,
50 'h|help|?' => sub{ exec('perldoc',$0); exit(0) }
53 my $fh;
55 if (defined $file) {
56 open $fh, '>', $file or die "Could not write file '$file': $!\n";
57 } else {
58 $fh = \*STDOUT;
62 # find all modules
65 find( \&find_modules, $dir );
68 # validate unique URLs and print fail cases to stdout
71 my %cached_urls;
73 sub check_url {
74 my ($url, $file) = @_;
75 if (exists $cached_urls{$url}) {
76 print STDERR "$url checked in ".$cached_urls{$url}[0].":".$cached_urls{$url}[1]."\n" if $verbose;
77 print $fh "$file\t$url\n" if $cached_urls{$url}[1] ne 'ok';
78 return;
80 print STDERR "Checking $url in $file... " if $verbose;
81 my $ok = head($url);
82 my $status = $ok ? 'ok' : 'FAIL!';
83 print STDERR "$status!\n" if $verbose;
84 print $fh "$file\t$url\n" if !$ok;
85 $cached_urls{$url} = [$file, $status];
88 close $fh if $file; # don't close STDOUT
91 # this is where the action is
94 sub find_modules {
95 # only want files with .pm
96 return unless m/\.pm$/;
97 return unless -f $_;
99 my $fname = $_;
101 # slurp in the file
102 my $text = do { local( @ARGV, $/ ) = $fname ; <> } ;
104 # keep track of URLs
105 while ($text =~ m/$RE{URI}{HTTP}{-keep}/g) {
106 my $url = $1 or next;
107 # remove Perl code if URL was embedded in string and other stuff
108 $url =~ s/\s*[.,;'")]*\s*$//;
109 check_url($url, $File::Find::name);
114 =head1 OPTIONS
116 =over 3
118 =item B<-d | --dir> path
120 Overides the default directory to recursively look for .pm file
121 (Default is '../Bio')
123 =item B<-v | --verbose>
125 Show the progress through files during the POD checking.
127 =item B<-? | -h | --help>
129 This help text.
131 =back
133 =head1 FEEDBACK
135 =head2 Mailing Lists
137 User feedback is an integral part of the evolution of this and other
138 Bioperl modules. Send your comments and suggestions preferably to
139 the Bioperl mailing list. Your participation is much appreciated.
141 bioperl-l@bioperl.org - General discussion
142 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
144 =head2 Reporting Bugs
146 Report bugs to the Bioperl bug tracking system to help us keep track
147 of the bugs and their resolution. Bug reports can be submitted via the
148 web:
150 https://github.com/bioperl/bioperl-live/issues
152 =head1 AUTHOR - Torsten Seemann
154 Email: torsten-dot-seemann-at-infotech-dot-monash-dot-edu-dot-au
156 =cut