[bug 2714]
[bioperl-live.git] / maintenance / check_URLs.pl
blob14d5f08c0eb7cbb15eb21955de65e8438df93daa
1 #!/usr/bin/perl -w
3 =head1 NAME
5 check_URLs.pl - validate URLs located in module code and POD
7 =head1 SYNOPSIS
9 B<check_URLs.pl> [B<-d|--dir> path] [B<-v|--verbose>] [B<-?|-h|--help>]
11 =head1 DESCRIPTION
13 Checks code and POD of all bioperl-live modules for URLs, and validates them.
15 Output is a series of lines containing two fields, tab separated.
16 The first field is the file with the bad URL, the second is the URL itself.
18 The whole URL is not retrieved, only a HTTP "HEAD" request is done
19 to see if the URL exists on the server. The request is done using
20 B<LWP::Simple> so the B<http_proxy> environmental variable will be
21 honoured.
23 The URL parsing may not be perfect - although I use the B<Regexp::Common::URI>
24 module, I have to manually clean up some URLs which are embedded in Perl
25 strings to convert the matched URL to a more probable real world URL,
26 e.g. most URLs don\'t end in "'," or ")" :-)
28 =cut
30 use strict;
31 use Data::Dumper;
32 use File::Find;
33 use Getopt::Long;
34 use Regexp::Common qw(URI);
35 use LWP::Simple;
38 # command line options
41 my ($verbose, $dir, $help) = (0, '../Bio/', undef);
42 GetOptions(
43 'v|verbose' => \$verbose,
44 'd|dir:s' => \$dir,
45 'h|help|?' => sub{ exec('perldoc',$0); exit(0) }
49 # globals
52 my %URL;
55 # find all modules
58 find( \&find_modules, $dir );
61 # validate unique URLs and print fail cases to stdout
64 for my $url (keys %URL) {
65 print STDERR "Checking $url ... ";
66 my $ok = head($url);
67 print STDERR ($ok ? 'ok' : 'FAIL!'), "\n";
68 if (not $ok) {
69 for my $file (@{ $URL{$url} }) {
70 print "$file\t$url\n";
75 print STDERR Dumper(\%URL) if $verbose;
78 # this is where the action is
81 sub find_modules {
82 # only want files with .pm
83 return unless m/\.pm$/;
84 return unless -f $_;
86 my $fname = $_;
87 print STDERR "$fname\n" if $verbose;
89 # slurp in the file
90 my $text = do { local( @ARGV, $/ ) = $fname ; <> } ;
92 # keep track of URLs
93 while ($text =~ m/$RE{URI}{HTTP}{-keep}/g) {
94 my $url = $1 or next;
95 # remove Perl code if URL was embedded in string and other stuff
96 $url =~ s/\s*[.,;'")]*\s*$//;
97 print STDERR "$url\n" if $verbose;
98 push @{ $URL{$url} } , $File::Find::name;
103 =head1 OPTIONS
105 =over 3
107 =item B<-d | --dir> path
109 Overides the default directory to recursively look for .pm file
110 (Default is '../Bio')
112 =item B<-v | --verbose>
114 Show the progress through files during the POD checking.
116 =item B<-? | -h | --help>
118 This help text.
120 =back
122 =head1 FEEDBACK
124 =head2 Mailing Lists
126 User feedback is an integral part of the evolution of this and other
127 Bioperl modules. Send your comments and suggestions preferably to
128 the Bioperl mailing list. Your participation is much appreciated.
130 bioperl-l@bioperl.org - General discussion
131 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
133 =head2 Reporting Bugs
135 Report bugs to the Bioperl bug tracking system to help us keep track
136 of the bugs and their resolution. Bug reports can be submitted via the
137 web:
139 http://bugzilla.open-bio.org/
141 =head1 AUTHOR - Torsten Seemann
143 Email: torsten-dot-seemann-at-infotech-dot-monash-dot-edu-dot-au
145 =cut