adiumx.git: Utilities/dep-build-scripts/download.py

#!/usr/bin/env python

"""
Downloader
by Peter Hosey

This is a multi-tasking file downloader with automatic unpacking. It uses curl to download files, and tar or unzip to unpack them.

Usage: download.py [options] url url url ...

By default, the program will only download one file at a time. If you specify the -j option, it will download that many files at a time, except that it imposes an internal limit of one download per server (so as not to rudely swamp a single server with concurrent download requests).

You may also pass the -v option if you want to read debugging information.

After the options, the program takes one or more URLs to download. You can set up a pipe from the xargs utility if you would like to read URLs from a file.
"""

verbose = False
def debug_log(msg, *pieces):
    if not verbose: return

    import sys, os

    # Change pieces to be all but the last; it starts out being all but the first.
    all = ((msg,) + pieces)
    pieces, last = all[:-1], all[-1]

    print >>sys.stderr, os.getpid(),
    for x in pieces:
        print >>sys.stderr, x,
    print >>sys.stderr, last

def print_argv(argv):
    if verbose: print os.getpid(),
    print '>',
    for arg in argv: print arg,
    print
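# Note: print_argv reads the global `verbose` and relies on the module-level `import os, sys` below, both of which are set up before the first call.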

import optparse

# This doesn't go by URL alone because we don't want to let the user spawn 20 jobs that download from a single server. That would be rude to the server.
parser = optparse.OptionParser(usage='%prog [options] url [url [url ...]]')
parser.add_option('-j', '--jobs', help='number of domains to download from at once', type='int', default=1)
parser.add_option('-v', '--verbose', help='print debug logging', default=False, action='store_true')

opts, args = parser.parse_args()
verbose = opts.verbose

download_urls = {} # Domain => [URL]

# Map from archive filename extension to the argv of the command that unpacks it.
argvs = {
    'tar.gz': ['tar', 'xz'],
    'tgz': ['tar', 'xz'],
    'tar.bz2': ['tar', 'xj'],
    'tbz': ['tar', 'xj'],
    'zip': ['unzip'],
}
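
# In tar's argv, 'xz' means extract through gzip and 'xj' means extract through bzip2.
# The tar entries carry no filename because tar reads the archive from a pipe below; zip files are instead saved to disk, and the filename is appended for unzip.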
def argv_from_filename(filename):
    # Look through the extensions we know we can handle. Stop looking if we find the extension of this filename.
    chunks = filename.split('.')
    for range_len in xrange(1, len(chunks) + 1):
        # Join the last range_len chunks.
        ext = '.'.join(chunks[-range_len:])
        try: argv = argvs[ext]
        except KeyError: pass
        else:
            # Include our filename as the last argument.
            # Remember to copy the list here, so we don't modify the list in the dictionary.
            argv = list(argv)
            if argv[0] != 'tar':
                argv.append(filename)
            return argv
    # No known extension matched; signal that by returning None.
    return None
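
# Worked example: 'SurfWriter-1.0.tar.gz' splits into ['SurfWriter-1', '0', 'tar', 'gz'];
# range_len 1 tries 'gz' (no match), then range_len 2 tries 'tar.gz' and returns ['tar', 'xz'].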

known_extensions = set()
for key in argvs:
    # Add these extensions to the set by union.
    known_extensions.update(set(key.split('.')))
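# Given the table above, known_extensions comes out to set(['tar', 'gz', 'tgz', 'bz2', 'tbz', 'zip']).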

import os, sys
children = []

import urlparse
for url in args:
    parsed = urlparse.urlparse(url)

    # Find out whether this URL has already been downloaded and unpacked. If it has, ask the user whether we should clobber it (as in the case of an aborted download). We could just use tar -U, but letting the user answer no here saves bandwidth usage when redownloading isn't necessary.
    path = parsed.path
    filename = os.path.split(path)[1]
    chunks = filename.rsplit('.', 1) # For example: 'SurfWriter-1.0.tar', 'gz'
    # The len() guard protects against a filename with no dot at all.
    while len(chunks) > 1 and chunks[1] in known_extensions:
        chunks = chunks[0].rsplit('.', 1)
    else:
        # Last part isn't a filename extension. For example, it may be the 0 in 'SurfWriter-1.0'. Summon all the horses and all the king's men.
        dirname = '.'.join(chunks)
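    # e.g. ['SurfWriter-1.0.tar', 'gz'] -> ['SurfWriter-1.0', 'tar'] -> ['SurfWriter-1', '0']; '0' is not a known extension, so dirname is 'SurfWriter-1.0'.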

    if os.path.exists(dirname):
        # Note: We can't use raw_input here because it reads from stdin, and we may be running under xargs. We must use the terminal instead.
        tty = file('/dev/tty', 'r+')
        tty.write('%s already exists. Remove and redownload it? [yN] ' % (dirname,))
        tty.flush()
        answer = tty.readline()
        # lstrip: Remove leading whitespace.
        # [:1]: Get only the first character, returning empty (rather than raising IndexError) if the string is empty.
        # lower: Lowercase the (single) character.
        if answer.lstrip()[:1].lower() == 'y':
            pid = os.fork()
            if pid > 0:
                children.append(pid)
            else:
                assert pid == 0
                dirname_old = dirname + '-old'
                os.rename(dirname, dirname_old)
                os.execvp('rm', ['rm', '-Rf', dirname_old])
        else:
            # Skip this URL by not adding it to download_urls.
            continue
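
    # Design note: the fork + rename + 'rm -Rf' combination above makes the old directory vanish from its path immediately, so the redownload can start while a child process does the (possibly slow) recursive delete in the background.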

    # netloc = domain[:port]. Split on the port number (if present) and take everything before it.
    domain = parsed.netloc.rsplit(':', 1)[0]
    download_urls.setdefault(domain, [])
    download_urls[domain].append(url)
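    # After this loop, download_urls groups work by server, e.g. (hypothetical URLs):
    #     {'example.com': ['http://example.com/a.tgz', 'http://example.com/b.zip']}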

import time
debug_log(download_urls)
for domain in download_urls:
    # If we have the maximum number of child processes already, wait for one to exit before spawning more.
    debug_log('children:', children)
    while len(children) >= opts.jobs:
        exited_pid, exit_status = os.waitpid(-1, os.WNOHANG)
        if exited_pid > 0:
            # A child exited! We can proceed.
            # Don't forget to remove it from the list of children, so we don't have to wait for it anymore.
            del children[children.index(exited_pid)]
            break
        time.sleep(1)
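    # os.waitpid(-1, os.WNOHANG) reaps whichever child exits first, but returns (0, 0) while none has exited yet; hence the once-a-second poll instead of blocking on a single PID.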

    child = os.fork()
    if child > 0:
        # We're the parent.
        children.append(child)
        # This sleep is mainly so the debug logs from different children don't collide.
        time.sleep(0.1)
        continue

    assert child == 0
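
    # From here down, we are the per-domain worker child; it exits or execs an unpacker before this loop would repeat. Process tree: main process -> one worker per domain -> one supervisor per URL -> curl and (for tar formats) tar grandchildren.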

    downloaded_files = set()

    urls = download_urls[domain]
    debug_log('URLs to download:', urls)
    for url in urls:
        curl_argv = ['curl', url]
        filename = os.path.split(urlparse.urlparse(url).path)[-1]
        tar_argv = argv_from_filename(filename)
        if tar_argv is None or tar_argv[0] != 'tar':
            # Not a tar format (or not a recognized archive at all), so there's no pipe; curl will write the file to disk.
            tar_argv = None
            # Insert the -O option just before the URL, since we'll be saving the archive file to disk to pass to the unpacker.
            curl_argv.insert(1, '-O')
        whole_argv = (curl_argv + ['|'] + tar_argv) if tar_argv else curl_argv
        print_argv(whole_argv)
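        # The '|' in whole_argv is purely cosmetic, for the log line above; the real pipe is built below with os.pipe and os.dup2.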

        pid = os.fork()
        if pid == 0:
            # We are the child that supervises curl and (if needed) tar.
            read_end, write_end = os.pipe() if tar_argv else (None, None)

            curl_pid = os.fork()
            assert curl_pid >= 0
            if curl_pid == 0:
                # We are the grandchild that will become curl.
                if tar_argv:
                    # Point stdout at the pipe, and close the read end we don't use.
                    os.dup2(write_end, sys.stdout.fileno())
                    os.close(read_end)
                os.execvp(curl_argv[0], curl_argv)

            if not tar_argv:
                tar_pid = None
            else:
                tar_pid = os.fork()
                assert tar_pid >= 0
                if tar_pid == 0:
                    # We are the grandchild that will become tar.
                    os.dup2(read_end, sys.stdin.fileno())
                    os.close(write_end)
                    os.execvp(tar_argv[0], tar_argv)

            if tar_argv:
                # Only the grandchildren need the pipe. Close our copies of both ends of it.
                os.close(read_end)
                os.close(write_end)
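
            # Closing our copies matters for more than tidiness: tar sees EOF only when the last open write descriptor on the pipe closes, so keeping ours open would leave tar hanging after curl exits.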

            # We're still the supervisor; wait for the grandchildren to exit.
            # waitpid returns an encoded status; decode it, because passing e.g. 256 (exit code 1) straight to sys.exit would truncate to 0. A signal death maps to 1.
            exited_pid, curl_wait_status = os.waitpid(curl_pid, 0)
            curl_exit_status = os.WEXITSTATUS(curl_wait_status) if os.WIFEXITED(curl_wait_status) else 1
            if tar_pid is None:
                tar_exit_status = 0 # for sys.exit below
            else:
                exited_pid, tar_wait_status = os.waitpid(tar_pid, 0)
                tar_exit_status = os.WEXITSTATUS(tar_wait_status) if os.WIFEXITED(tar_wait_status) else 1

            if curl_exit_status:
                print >>sys.stderr, 'curl exited with status', curl_exit_status
            if tar_exit_status:
                print >>sys.stderr, 'tar exited with status', tar_exit_status
            # Exit unconditionally; the supervisor must never fall through into the per-URL loop. curl's status wins if both commands failed.
            sys.exit(curl_exit_status if curl_exit_status else tar_exit_status)
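
        # Back in the worker: the supervisor's exit status tells us whether this URL was fully downloaded (and, for tar formats, unpacked).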
        assert pid > 0
        exited_pid, exit_status = os.waitpid(pid, 0)
        if exit_status == 0:
            if not tar_argv:
                # We need to unzip this, so add it to the set of filenames to invoke unzip for.
                # The filename is the last component of the pathname.
                downloaded_files.add(url.rsplit('/', 1)[1])

    debug_log('Files to unpack:', list(downloaded_files))
    if downloaded_files:
        # Unpack all the files.

        # It's now more convenient to have this as a sequence than a set.
        downloaded_files = list(downloaded_files)

        # Use spawnvp (which creates a new process) for all but the last.
        for filename in downloaded_files[:-1]:
            debug_log('Unpacking file:', filename)
            argv = argv_from_filename(filename)
            if argv:
                print_argv(argv)
                os.spawnvp(os.P_WAIT, argv[0], argv)
            else:
                print >>sys.stderr, 'Unrecognized or no extension on filename:', filename
        else:
            # (This else always runs, since the loop has no break; it handles the last file.)
            filename = downloaded_files[-1]
            # Use execvp (which reuses this process) for the last.
            debug_log('Unpacking file:', filename)
            argv = argv_from_filename(filename)
            if argv:
                print_argv(argv)
                os.execvp(argv[0], argv)
            else:
                print >>sys.stderr, 'Unrecognized or no extension on filename:', filename
                # We're a worker child; exit rather than falling through into the parent's cleanup code below.
                sys.exit(1)
    else: #if not downloaded_files:
        # If no files were successfully downloaded, then we're not going to exec to an unpacker. Exit, so we don't leak this child process (particularly because if we do, it will go do a bunch of duplicate downloading).
        # We exit with status 1 because we must have had at least one URL to download to even get here, but none of our URLs worked.
        sys.exit(1)

# Clean up all our child processes. (Only the original parent gets here; every worker either execs an unpacker or exits above.)
exit_status = 0
for child in children:
    exited_pid, tmp_status = os.waitpid(child, 0)
    if tmp_status != 0:
        # Decode the encoded wait status so sys.exit doesn't truncate it; map a signal death to 1.
        exit_status = os.WEXITSTATUS(tmp_status) if os.WIFEXITED(tmp_status) else 1
sys.exit(exit_status)