#!/usr/bin/perl -w

=head1 NAME

lwp-rget - Retrieve web documents recursively

=head1 SYNOPSIS

 lwp-rget [--verbose] [--auth=USER:PASS] [--depth=N] [--hier] [--iis]
          [--keepext=mime/type[,mime/type]] [--limit=N] [--nospace]
          [--prefix=URL] [--referer=URL] [--sleep=N] [--tolower] <URL>
 lwp-rget --version

=head1 DESCRIPTION
This program will retrieve a document and store it in a local file.  It
will follow any links found in the document and store these documents
as well, patching links so that they refer to these local copies.
This process continues until there are no more unvisited links or the
process is stopped by one or more of the limits which can be
controlled by command line arguments.
This program is useful if you want to make a local copy of a
collection of documents or want to do web reading off-line.

All documents are stored as plain files in the current directory.  The
file names chosen are derived from the last component of URL paths.
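For example, a single page together with its inline images can be
fetched into the current directory with an invocation like the
following (a sketch; the URL is just a placeholder):

 lwp-rget --depth=0 http://www.sn.no/foo/bar.html

The name of the local copy of the page is printed on stdout when the
program exits.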
The options are:

=over 3
=item --auth=I<USER:PASS>

Set the authentication credentials to user "USER" and password "PASS" if
any restricted parts of the web site are hit.  If there are restricted
parts of the web site and authentication credentials are not available,
those pages will not be downloaded.
=item --depth=I<n>

Limit the recursive level.  Embedded images are always loaded, even if
they fall outside the I<--depth>.  This means that one can use
I<--depth=0> in order to fetch a single document together with all
inline graphics.

The default depth is 5.

=item --hier

Download files into a hierarchy that mimics the web site structure.
The default is to put all files in the current directory.
=item --referer=I<URI>

Set the value of the Referer header for the initial request.  The
special value C<"NONE"> can be used to suppress the Referer header in
any subsequent request.  The Referer header will always be suppressed
in all normal C<http> requests if the referring page was transmitted
over C<https>, as recommended in RFC 2616.
=item --iis

Sends an "Accept: */*" on all URL requests as a workaround for a bug in
IIS 2.0.  If no Accept MIME header is present, IIS 2.0 returns with a
"406 No acceptable objects were found" error.  Also converts any back
slashes (\\) in URLs to forward slashes (/).
=item --keepext=I<mime/type[,mime/type]>

Keeps the current extension for the listed MIME types.  Useful when
downloading text/plain documents that shouldn't all be translated to
*.txt files.
=item --limit=I<n>

Limit the number of documents to get.  The default limit is 50.

=item --nospace

Changes spaces in all URLs to underscore characters (_).  Useful when
downloading files from sites serving URLs with spaces in them.  Does not
remove spaces from fragments, e.g., "file.html#somewhere in here".
=item --prefix=I<url_prefix>

Limit the links to follow.  Only URLs that start with the prefix string
are followed (see the example after this list).

The default prefix is set as the "directory" of the initial URL to
follow.  For instance, if we start lwp-rget with the URL
C<http://www.sn.no/foo/bar.html>, then the prefix will be set to
C<http://www.sn.no/foo/>.

Use C<--prefix=''> if you don't want the fetching to be limited by any
prefix.
=item --sleep=I<n>

Sleep I<n> seconds before retrieving each document.  This option allows
you to go slowly and avoid putting too much load on the server you are
visiting.
=item --tolower

Translates all links to lowercase.  Useful when downloading files from
IIS since it does not serve files in a case-sensitive manner.

=item --verbose

Make more noise while running.

=item --quiet

Don't make any noise.

=item --version

Print program version number and quit.

=item --help

Print the usage message and quit.

=back
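Combining the options above (a sketch, reusing the placeholder site
from the C<--prefix> description), the following invocation would
mirror the pages below C<http://www.sn.no/foo/> into a directory
hierarchy, fetching at most 10 documents and pausing one second
between requests:

 lwp-rget --hier --limit=10 --sleep=1 \
     --prefix=http://www.sn.no/foo/ http://www.sn.no/foo/bar.html

The C<--prefix> given here equals the default derived from the start
URL; it is spelled out only for clarity.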
Before the program exits, the name of the file where the initial URL
is stored is printed on stdout.  All used filenames are also printed
on stderr as they are loaded.  This printing can be suppressed with
the I<--quiet> option.
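Since that final filename is the only thing written to stdout (the
per-document progress goes to stderr), it can be captured from a
shell; for instance (a sketch, using the placeholder URL from above):

 file=`lwp-rget http://www.sn.no/foo/bar.html`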
=head1 SEE ALSO

L<lwp-request>, L<LWP>

=head1 AUTHOR

Gisle Aas <aas@sn.no>

=cut
use strict;

use Getopt::Long    qw(GetOptions);
use URI::URL        qw(url);
use LWP::MediaTypes qw(media_suffix);
use HTML::Entities  ();

use vars qw($VERSION);
use vars qw($MAX_DEPTH $MAX_DOCS $PREFIX $REFERER $VERBOSE $QUIET $SLEEP $HIER $AUTH $IIS $TOLOWER $NOSPACE %KEEPEXT);
my $progname = $0;
$progname =~ s|.*/||;     # only basename left
$progname =~ s/\.\w*$//;  # strip extension, if any

$VERSION = "5.818";

#$Getopt::Long::debug = 1;
#$Getopt::Long::ignorecase = 0;

# Defaults
$MAX_DEPTH = 5;
$MAX_DOCS  = 50;
GetOptions('version'   => \&print_version,
           'help'      => \&usage,
           'depth=i'   => \$MAX_DEPTH,
           'limit=i'   => \$MAX_DOCS,
           'verbose!'  => \$VERBOSE,
           'quiet!'    => \$QUIET,
           'sleep=i'   => \$SLEEP,
           'prefix:s'  => \$PREFIX,
           'referer:s' => \$REFERER,
           'hier'      => \$HIER,
           'auth=s'    => \$AUTH,
           'iis'       => \$IIS,
           'tolower'   => \$TOLOWER,
           'nospace'   => \$NOSPACE,
           'keepext=s' => \$KEEPEXT{'OPT'},
          ) || usage();
sub print_version {
    require LWP;
    my $DISTNAME = 'libwww-perl-' . LWP::Version();
    print <<"EOT";
This is lwp-rget version $VERSION ($DISTNAME)

Copyright 1996-1998, Gisle Aas.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
EOT
    exit 0;
}
my $start_url = shift || usage();
usage() if @ARGV;

require LWP::UserAgent;
my $ua = LWP::UserAgent->new;
$ua->agent("$progname/$VERSION ");
$ua->env_proxy;
unless (defined $PREFIX) {
    $PREFIX = url($start_url);  # limit to URLs below this one
    eval {
        $PREFIX->eparams(undef);
        $PREFIX->equery(undef);
    };

    $_ = $PREFIX->epath;
    s|[^/]+$||;
    $PREFIX->epath($_);
    $PREFIX = $PREFIX->as_string;
}

%KEEPEXT = map { lc($_) => 1 } split(/\s*,\s*/, ($KEEPEXT{'OPT'} || ""));
my $SUPPRESS_REFERER;
$SUPPRESS_REFERER++ if ($REFERER || "") eq "NONE";

print <<"" if $VERBOSE;
START     = $start_url
MAX_DEPTH = $MAX_DEPTH
MAX_DOCS  = $MAX_DOCS
PREFIX    = $PREFIX

my $no_docs = 0;
my %seen = ();  # mapping from URL => local_file

my $filename = fetch($start_url, undef, $REFERER);
print "$filename\n" unless $QUIET;
sub fetch
{
    my($url, $type, $referer, $depth) = @_;

    # Fix http://sitename.com/../blah/blah.html to
    #     http://sitename.com/blah/blah.html
    $url = $url->as_string if (ref($url));
    while ($url =~ s#(https?://[^/]+/)\.\.\/#$1#) {}

    # Fix backslashes (\) in URL if $IIS defined
    $url = fix_backslashes($url) if (defined $IIS);

    $url = url($url);
    $type ||= 'a';
    # Might be the background attribute
    $type = 'img' if ($type eq 'body' || $type eq 'td');
    $depth ||= 0;

    # Print the URL before we start checking...
    my $out = (" " x $depth) . $url . " ";
    $out .= "." x (60 - length($out));
    print STDERR $out . " " if $VERBOSE;

    # Can't get mailto things
    if ($url->scheme eq 'mailto') {
        print STDERR "*skipping mailto*\n" if $VERBOSE;
        return $url->as_string;
    }
    # The $plain_url is a URL without the fragment part
    my $plain_url = $url->clone;
    $plain_url->frag(undef);

    # Check PREFIX, but not for <IMG ...> links
    if ($type ne 'img' and $url->as_string !~ /^\Q$PREFIX/o) {
        print STDERR "*outsider*\n" if $VERBOSE;
        return $url->as_string;
    }

    # Translate URL to lowercase if $TOLOWER defined
    $plain_url = to_lower($plain_url) if (defined $TOLOWER);

    # If we already have it, then there is nothing to be done
    my $seen = $seen{$plain_url->as_string};
    if ($seen) {
        my $frag = $url->frag;
        $seen .= "#$frag" if defined($frag);
        $seen = protect_frag_spaces($seen);
        print STDERR "$seen (again)\n" if $VERBOSE;
        return $seen;
    }
    # Too much or too deep
    if ($depth > $MAX_DEPTH and $type ne 'img') {
        print STDERR "*too deep*\n" if $VERBOSE;
        return $url;
    }
    if ($no_docs > $MAX_DOCS) {
        print STDERR "*too many*\n" if $VERBOSE;
        return $url;
    }

    # Fetch document
    $no_docs++;
    sleep($SLEEP) if $SLEEP;
    my $req = HTTP::Request->new(GET => $url);
    # See: http://ftp.sunet.se/pub/NT/mirror-microsoft/kb/Q163/7/74.TXT
    $req->header('Accept', '*/*') if (defined $IIS);  # GIF/JPG from IIS 2.0
    $req->authorization_basic(split(/:/, $AUTH)) if (defined $AUTH);
    if ($referer && !$SUPPRESS_REFERER) {
        if ($req->url->scheme eq 'http') {
            # Don't leak a Referer from an https page into an http
            # request (RFC 2616, section 15.1.3)
            $referer = url($referer) unless ref($referer);
            undef $referer if ($referer->scheme || '') eq 'https';
        }
        $req->referer($referer) if $referer;
    }
    my $res = $ua->request($req);
    # Check outcome
    if ($res->is_success) {
        my $doc = $res->content;
        my $ct = $res->content_type;
        my $name = find_name($res->request->url, $ct);
        print STDERR "$name\n" unless $QUIET;
        $seen{$plain_url->as_string} = $name;

        # If the file is HTML, then we look for internal links
        if ($ct eq "text/html") {
            # Save an unprocessed version of the HTML document.  This
            # both reserves the name used, and it also ensures that we
            # don't lose everything if this program is killed before
            # we finish.
            save($name, $doc);
            my $base = $res->base;

            # Follow and substitute links...
            $doc =~
                s/
                  (
                    <(img|a|body|area|frame|td)\b   # some interesting tag
                    [^>]+                           # still inside tag (not strictly correct)
                    \b(?:src|href|background)       # some link attribute
                    \s*=\s*                         # =
                  )
                    (?:                             # scope of OR-ing
                     (")([^"]*)"    |               # value in double quotes  OR
                     (')([^']*)'    |               # value in single quotes  OR
                      ([^\s>]+)                     # quoteless value
                    )
                /
                  new_link($1, lc($2), $3||$5, HTML::Entities::decode($4||$6||$7),
                           $base, $name, "$url", $depth+1)
                /giex;
            # XXX
            # The regular expression above is not strictly correct.
            # It is not really possible to parse HTML with a single
            # regular expression, but it is faster.  Tags that might
            # confuse us include:
            #    <a alt="href" href=link.html>
            #    <a alt=">" href="link.html">
        }
        save($name, $doc);
        return $name;
    }
    else {
        print STDERR $res->code . " " . $res->message . "\n" if $VERBOSE;
        $seen{$plain_url->as_string} = $url->as_string;
        return $url->as_string;
    }
}
sub new_link
{
    my($pre, $type, $quote, $url, $base, $localbase, $referer, $depth) = @_;

    $url = protect_frag_spaces($url);

    $url = fetch(url($url, $base)->abs, $type, $referer, $depth);
    $url = url("file:$url", "file:$localbase")->rel
        unless $url =~ /^[.+\-\w]+:/;

    $url = unprotect_frag_spaces($url);

    return $pre . $quote . $url . $quote;
}
sub protect_frag_spaces
{
    my ($url) = @_;

    $url = $url->as_string if (ref($url));

    if ($url =~ m/^([^#]*#)(.+)$/)
    {
        my ($base, $frag) = ($1, $2);
        $frag =~ s/ /%20/g;
        $url = $base . $frag;
    }

    return $url;
}
sub unprotect_frag_spaces
{
    my ($url) = @_;

    $url = $url->as_string if (ref($url));

    if ($url =~ m/^([^#]*#)(.+)$/)
    {
        my ($base, $frag) = ($1, $2);
        $frag =~ s/%20/ /g;
        $url = $base . $frag;
    }

    return $url;
}
sub fix_backslashes
{
    my ($url) = @_;
    my ($base, $frag);

    $url = $url->as_string if (ref($url));

    if ($url =~ m/([^#]+)(#.*)/)
    {
        ($base, $frag) = ($1, $2);
    }
    else
    {
        $base = $url;
        $frag = "";
    }

    $base =~ tr/\\/\//;
    $base =~ s/%5[cC]/\//g;  # URL-encoded back slash is %5C

    return $base . $frag;
}
sub to_lower
{
    my ($url) = @_;
    my $was_object = 0;

    if (ref($url))
    {
        $url = $url->as_string;
        $was_object = 1;
    }

    if ($url =~ m/([^#]+)(#.*)/)
    {
        $url = lc($1) . $2;  # leave the fragment alone
    }
    else
    {
        $url = lc($url);
    }

    if ($was_object == 1)
    {
        return url($url);
    }
    else
    {
        return $url;
    }
}
sub translate_spaces
{
    my ($url) = @_;
    my ($base, $frag);

    $url = $url->as_string if (ref($url));

    if ($url =~ m/([^#]+)(#.*)/)
    {
        ($base, $frag) = ($1, $2);
    }
    else
    {
        $base = $url;
        $frag = "";
    }

    $base =~ s/^ *//;  # Remove initial spaces from base
    $base =~ s/ *$//;  # Remove trailing spaces from base

    $base =~ tr/ /_/;
    $base =~ s/%20/_/g;  # URL-encoded space is %20

    return $base . $frag;
}
sub mkdirp
{
    my($directory, $mode) = @_;
    my @dirs = split(/\//, $directory);
    my $path = shift(@dirs);  # build it as we go
    my $result = 1;           # assume it will work

    unless (-d $path) {
        $result &&= mkdir($path, $mode);
    }

    foreach (@dirs) {
        $path .= "/$_";
        if (! -d $path) {
            $result &&= mkdir($path, $mode);
        }
    }

    return $result;
}
sub find_name
{
    my($url, $type) = @_;
    #print "find_name($url, $type)\n";

    # Translate spaces in URL to underscores (_) if $NOSPACE defined
    $url = translate_spaces($url) if (defined $NOSPACE);

    # Translate URL to lowercase if $TOLOWER defined
    $url = to_lower($url) if (defined $TOLOWER);

    $url = url($url) unless ref($url);

    my $path = $url->path;

    # trim path until only the basename is left
    $path =~ s|(.*/)||;
    my $dirname = ".$1";
    if (!$HIER) {
        $dirname = "";
    }
    elsif (! -d $dirname) {
        mkdirp($dirname, 0775);
    }

    my $extra = "";  # something to make the name unique
    my $suffix;

    if ($KEEPEXT{lc($type)}) {
        $suffix = ($path =~ m/\.(.*)/) ? $1 : "";
    }
    else {
        $suffix = media_suffix($type);
    }

    $path =~ s|\..*||;  # trim suffix
    $path = "index" unless length $path;

    while (1) {
        # Construct a new file name
        my $file = $dirname . $path . $extra;
        $file .= ".$suffix" if $suffix;
        # Check if it is unique
        return $file unless -f $file;

        # Try something extra
        unless ($extra) {
            $extra = "001";
            next;
        }
        $extra++;
    }
}
sub save
{
    my $name = shift;
    #print "save($name,...)\n";
    open(FILE, ">$name") || die "Can't save $name: $!";
    binmode FILE;
    print FILE $_[0];
    close(FILE);
}
sub usage
{
    print <<""; exit 1;
Usage: $progname [options] <URL>
Allowed options are:
  --auth=USER:PASS  Set authentication credentials for web site
  --depth=N         Maximum depth to traverse (default is $MAX_DEPTH)
  --hier            Download into hierarchy (not all files into cwd)
  --referer=URI     Set initial referer header (or "NONE")
  --iis             Workaround IIS 2.0 bug by sending "Accept: */*" MIME
                    header; translates backslashes (\\) to forward slashes (/)
  --keepext=type    Keep file extension for MIME types (comma-separated list)
  --limit=N         A limit on the number of documents to get (default is $MAX_DOCS)
  --nospace         Translate spaces in URLs (not #fragments) to underscores (_)
  --version         Print version number and quit
  --verbose         More output
  --quiet           No output
  --sleep=SECS      Sleep between gets, i.e. go slowly
  --prefix=PREFIX   Limit URLs to follow to those which begin with PREFIX
  --tolower         Translate all URLs to lowercase (useful with IIS servers)

}