This is a multi-tasking file downloader with automatic unpacking. It uses curl to download files, and tar or unzip to unpack them.

Usage: download.py [options] url url url ...

By default, the program will only download one file at a time. If you specify the -j option, it will download that many files at a time, except that it imposes an internal limit of one download per server (so as not to rudely swamp a single server with concurrent download requests).

You may also pass the -v option if you want to read debugging information.

After the options, the program takes one or more URLs to download. You can set up a pipe from the xargs utility if you would like to read URLs from a file.
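For example (a hypothetical invocation; urls.txt stands in for any file listing one URL per line), to download up to four files at a time from a list of URLs, you might run:

    xargs python download.py -j 4 < urls.txt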

import optparse
import os
import sys
import time
import urlparse

def debug_log(msg, *pieces):
    if not verbose: return

    # Change pieces to be all but the last; it starts out being all but the first.
    all = ((msg,) + pieces)
    pieces, last = all[:-1], all[-1]

    print >>sys.stderr, os.getpid(),
    for x in pieces:
        print >>sys.stderr, x,
    print >>sys.stderr, last

def print_argv(argv):
    if verbose: print os.getpid(),
    for arg in argv: print arg,
    print

# This doesn't go by URL alone because we don't want to let the user spawn 20 jobs that download from a single server. That would be rude to the server.
parser = optparse.OptionParser(usage='%prog [options] url [url [url ...]]')
parser.add_option('-j', '--jobs', help='number of domains to download from at once', type='int', default=1)
parser.add_option('-v', '--verbose', help='print debug logging', default=False, action='store_true')

opts, args = parser.parse_args()
verbose = opts.verbose

download_urls = {} # Domain => [URL]

# Map archive-filename extensions to the argv of the program that unpacks them.
argvs = {
    'tar.gz': ['tar', 'xz'],
    'tar.bz2': ['tar', 'xj'],
    'zip': ['unzip'],
}

def argv_from_filename(filename):
    # Look through the extensions we know we can handle. Stop looking if we find the extension of this filename.
    chunks = filename.split('.')
    for range_len in xrange(1, len(chunks) + 1):
        # Join the last range_len chunks.
        ext = '.'.join(chunks[-range_len:])
        try: argv = argvs[ext]
        except KeyError: continue

        # Include our filename as the last argument.
        # Remember to copy the list here, so we don't modify the list in the dictionary.
        return argv + [filename]

known_extensions = set()
for key in argvs:
    # Add these extensions to the set by union.
    known_extensions.update(set(key.split('.')))

# Sort the URLs into download_urls by domain, skipping any the user declines to redownload.
for url in args:
    parsed = urlparse.urlparse(url)
    path = parsed.path

    # Find out whether this URL has already been downloaded and unpacked. If it has, ask the user whether we should clobber it (as in the case of an aborted download). We could just use tar -U, but letting the user answer no here saves bandwidth usage when redownloading isn't necessary.
    filename = os.path.split(path)[1]
    chunks = filename.rsplit('.', 1) #For example: 'SurfWriter-1.0.tar', 'gz'
    while len(chunks) > 1 and chunks[1] in known_extensions:
        chunks = chunks[0].rsplit('.', 1)

    #Last part isn't a filename extension. For example, it may be the 0 in 'SurfWriter-1.0'. Summon all the horses and all the king's men.
    dirname = '.'.join(chunks)

    if os.path.exists(dirname):
        # Note: We can't use raw_input here because it reads from stdin, and we may be running under xargs. We must use the terminal instead.
        tty = file('/dev/tty', 'r+')
        tty.write('%s already exists. Remove and redownload it? [yN] ' % (dirname,))
        tty.flush()
        answer = tty.readline()
        # lstrip: Remove leading whitespace.
        # [:1]: Get only the first character, returning empty (rather than raising IndexError) if the string is empty.
        # lower: Drop case of all (one) characters.
        if answer.lstrip()[:1].lower() == 'y':
            # Rename the directory out of the way, then remove it in a child process, so we don't have to wait for the removal before redownloading.
            remover_pid = os.fork()
            if remover_pid == 0:
                dirname_old = dirname + '-old'
                os.rename(dirname, dirname_old)
                os.execvp('rm', ['rm', '-Rf', dirname_old])
        else:
            # Skip this URL by not adding it to download_urls.
            continue

    # netloc = domain[:port]. Split on the port number (if present) and get everything before it.
    domain = urlparse.urlparse(url).netloc.rsplit(':', 1)[0]
    download_urls.setdefault(domain, [])
    download_urls[domain].append(url)

children = []
debug_log(download_urls)
for domain in download_urls:
    # If we have the maximum number of child processes already, wait for one to exit before spawning more.
    debug_log('children:', children)
    while len(children) >= opts.jobs:
        exited_pid, exit_status = os.waitpid(-1, os.WNOHANG)
        if exited_pid:
            # A child exited! We can proceed.
            # Don't forget to remove it from the list of children, so we don't have to wait for it anymore.
            del children[children.index(exited_pid)]
        else:
            # Nothing has exited yet; don't spin.
            time.sleep(1)

    child = os.fork()
    if child:
        # We're the parent; remember this child so we can wait for it at the end.
        children.append(child)
        # This sleep is mainly so the debug logs from different children don't collide.
        time.sleep(1)
        continue

    # We're the child; download everything for this domain, then unpack it.

    downloaded_files = set()

    urls = download_urls[domain]
    debug_log('URLs to download:', urls)
    for url in urls:
        curl_argv = ['curl', url]
        filename = os.path.split(urlparse.urlparse(url).path)[-1]
        tar_argv = argv_from_filename(filename)
        if not tar_argv or tar_argv[0] != 'tar':
            #Insert the -O option just before the URL, since we'll be saving the archive file to disk to pass to the unpacker.
            curl_argv.insert(1, '-O')
        whole_argv = (curl_argv + ['|'] + tar_argv) if tar_argv else curl_argv
        print_argv(whole_argv)

        pid = os.fork()
        if pid == 0:
            # We are the child, which will become curl or tar.
            read_end, write_end = os.pipe() if tar_argv else (None, None)

            curl_pid = os.fork()
            if curl_pid == 0:
                # Send our output into the pipe (when there is one), and close the end we don't use.
                if write_end is not None: os.dup2(write_end, sys.stdout.fileno())
                if read_end is not None: os.close(read_end)
                # We are the grandchild that will become curl.
                os.execvp(curl_argv[0], curl_argv)

            if tar_argv:
                tar_pid = os.fork()
                if tar_pid == 0:
                    # We are the grandchild that will become tar.
                    os.dup2(read_end, sys.stdin.fileno())
                    os.close(write_end)
                    os.execvp(tar_argv[0], tar_argv)

            # Only the grandchildren need the pipe. Close our copies of both ends of it.
            if read_end is not None:
                os.close(read_end)
                os.close(write_end)
            # We're still the child; wait for the grandchildren to exit.
            exited_pid, curl_exit_status = os.waitpid(curl_pid, 0)
            # waitpid gives a raw wait status; extract the actual exit code.
            curl_exit_status = os.WEXITSTATUS(curl_exit_status)

            tar_exit_status = 0 # for sys.exit below
            if tar_argv:
                exited_pid, tar_exit_status = os.waitpid(tar_pid, 0)
                tar_exit_status = os.WEXITSTATUS(tar_exit_status)

            if curl_exit_status:
                print >>sys.stderr, 'curl exited with status', curl_exit_status
                sys.exit(curl_exit_status if curl_exit_status else tar_exit_status)
            if tar_exit_status:
                print >>sys.stderr, 'tar exited with status', tar_exit_status
                sys.exit(tar_exit_status)
            sys.exit(0)

        # Back in the per-domain child: wait for the downloader to finish.
        exited_pid, exit_status = os.waitpid(pid, 0)
        if exit_status == 0:
            if tar_argv and tar_argv[0] != 'tar':
                # We need to unzip this, so add it to the set of filenames to invoke unzip for.
                # The filename is the last component of the pathname.
                downloaded_files.add(url.rsplit('/', 1)[1])

    debug_log('Files to unpack:', list(downloaded_files))

    # Unpack all the files.
    if downloaded_files:
        # It's now more convenient to have this as a sequence than a set.
        downloaded_files = list(downloaded_files)

        # Use spawnvp (which creates a new process) for all but the last.
        for filename in downloaded_files[:-1]:
            debug_log('Unpacking file:', filename)
            argv = argv_from_filename(filename)
            if argv:
                os.spawnvp(os.P_WAIT, argv[0], argv)
            else:
                print >>sys.stderr, 'Unrecognized or no extension on filename:', filename

        filename = downloaded_files[-1]
        # Use execvp (which reuses this process) for the last.
        debug_log('Unpacking file:', filename)
        argv = argv_from_filename(filename)
        if argv:
            os.execvp(argv[0], argv)
        else:
            print >>sys.stderr, 'Unrecognized or no extension on filename:', filename
            # Don't fall through into the rest of the program; this child is done.
            sys.exit(1)
    else: #if not downloaded_files:
        # If no files were successfully downloaded, then we're not going to exec to an unpacker. Exit, so we don't leak this child process (particularly because if we do, it will go do a bunch of duplicate downloading).
        # We exit with status 1 because we must have had at least one URL to download to even get here, but none of our URLs worked.
        sys.exit(1)

# Clean up all our child processes.
exit_status = 0
for child in children:
    exited_pid, tmp_exit_status = os.waitpid(child, 0)
    if tmp_exit_status != 0:
        # Again, extract the exit code from the raw wait status before passing it to sys.exit.
        exit_status = os.WEXITSTATUS(tmp_exit_status)
sys.exit(exit_status)