Utilities/dep-build-scripts/download.py

   1 #!/usr/bin/env python
   2
   3 """
   4 Downloader
   5 by Peter Hosey
   6
   7 This is a multi-tasking file downloader with automatic unpacking. It uses curl to download files, and tar or unzip to unpack the files.
   8
   9 Usage: download.py [options] url url url ...
  10
  11 By default, the program will only download one file at a time. If you specify the -j option, it will download that many files at a time, except that it imposes an internal limit of one download per server (so as to not rudely swamp a single server with concurrent download requests).
  12
  13 You may also pass the -v option if you want to read debugging information.
  14
  15 After the options, the program takes one or more URLs to download. You can set up a pipe from the xargs utility if you would like to read URLs from a file.
  16 """
  17
  18 verbose = False
  19 def debug_log(msg, *pieces):
  20         if not verbose: return
  21
  22         import sys, os
  23
  24         # Change pieces to be all but the last; it starts out being all but the first.
  25         all = ((msg,) + pieces)
  26         pieces, last = all[:-1], all[-1]
  27
  28         print >>sys.stderr, os.getpid(),
  29         for x in pieces:
  30                 print >>sys.stderr, x,
  31         print >>sys.stderr, last
  32
  33 def print_argv(argv):
  34         if verbose: print os.getpid(),
  35         print '>',
  36         for arg in argv: print arg,
  37         print
  38
  39 import optparse
  40
  41 # This doesn't go by URL alone because we don't want to let the user spawn 20 jobs that download from a single server. That would be rude to the server.
  42 parser = optparse.OptionParser(usage='%prog [options] url [url [url ...]]')
  43 parser.add_option('-j', '--jobs', help='number of domains to download from at once', type='int', default=1)
  44 parser.add_option('-v', '--verbose', help='print debug logging', default=False, action='store_true')
  45
  46 opts, args = parser.parse_args()
  47 verbose = opts.verbose
  48
  49 download_urls = {} # Domain => [URL]
  50
  51 argvs = {
  52         'tar.gz':  ['tar', 'xz'],
  53         'tgz':     ['tar', 'xz'],
  54         'tar.bz2': ['tar', 'xj'],
  55         'tbz':     ['tar', 'xj'],
  56         'zip':     ['unzip'],
  57 }
  58 def argv_from_filename(filename):
  59         # Look through the extensions we know we can handle. Stop looking if we find the extension of this filename.
  60         chunks = filename.split('.')
  61         for range_len in xrange(1, len(chunks) + 1):
  62                 # Join the last range_len chunks.
  63                 ext = '.'.join(chunks[-range_len:])
  64                 try:             argv = argvs[ext]
  65                 except KeyError: pass
  66                 else:
  67                         # Include our filename as the last argument.
  68                         # Remember to copy the list here, so we don't modify the list in the dictionary.
  69                         argv = list(argv)
  70                         if argv[0] != 'tar':
  71                                 argv.append(filename)
  72                         return argv
  73 known_extensions = set()
  74 for key in argvs:
  75         # Add these extensions to the set by union.
  76         known_extensions.update(set(key.split('.')))
  77
  78 import os, sys
  79 children = []
  80
  81 import urlparse
  82 for url in args:
  83         parsed = urlparse.urlparse(url)
  84
  85         # Find out whether this URL has already been downloaded and unpacked. If it has, ask the user whether we should clobber it (as in the case of an aborted download). We could just use tar -U, but letting the user answer no here saves bandwidth usage when redownloading isn't necessary.
  86         path = parsed.path
  87         filename = os.path.split(path)[1]
  88         chunks = filename.rsplit('.', 1) #For example: 'SurfWriter-1.0.tar', 'gz'
  89         while chunks[1] in known_extensions:
  90                 chunks = chunks[0].rsplit('.', 1)
  91         else:
  92                 #Last part isn't a filename extension. For example, it may be the 0 in 'SurfWriter-1.0'. Summon all the horses and all the king's men.
  93                 dirname = '.'.join(chunks)
  94
  95         if os.path.exists(dirname):
  96                 # Note: We can't use raw_input here because it reads from stdin, and we may be running under xargs. We must use the terminal instead.
  97                 tty = file('/dev/tty', 'r+')
  98                 tty.write('%s already exists. Remove and redownload it? [yN] ' % (dirname,))
  99                 tty.flush()
 100                 answer = tty.readline()
 101                 # lstrip: Remove leading whitespace.
 102                 # [:1]: Get only the first character, returning empty (rather than raising IndexError) if the string is empty.
 103                 # lower: Drop case of all (one) characters.
 104                 if answer.lstrip()[:1].lower() == 'y':
 105                         pid = os.fork()
 106                         if pid > 0:
 107                                 children.append(pid)
 108                         else:
 109                                 assert pid == 0
 110                                 dirname_old = dirname + '-old'
 111                                 os.rename(dirname, dirname_old)
 112                                 os.execvp('rm', ['rm', '-Rf', dirname_old])
 113                 else:
 114                         # Skip this URL by not adding it to download_urls.
 115                         continue
 116
 117         # netloc = domain[:port]. Split on the port number (if present) and get everything before it.
 118         domain = urlparse.urlparse(url).netloc.rsplit(':', 1)[0]
 119         download_urls.setdefault(domain, [])
 120         download_urls[domain].append(url)
 121
 122 import time
 123 debug_log(download_urls)
 124 for domain in download_urls:
 125         # If we have the maximum number of child processes already, wait for one to exit before spawning more.
 126         debug_log('children:', children)
 127         while len(children) >= opts.jobs:
 128                 exited_pid, exit_status = os.waitpid(-1, os.WNOHANG)
 129                 if exited_pid > 0:
 130                         # A child exited! We can proceed.
 131                         # Don't forget to remove it from the list of children, so we don't have to wait for it anymore.
 132                         del children[children.index(exited_pid)]
 133                         break
 134                 time.sleep(1)
 135
 136         child = os.fork()
 137         if child > 0:
 138                 # We're the parent.
 139                 children.append(child)
 140                 # This sleep is mainly so the debug logs from different children don't collide.
 141                 time.sleep(0.1)
 142                 continue
 143
 144         assert child == 0
 145
 146         downloaded_files = set()
 147
 148         urls = download_urls[domain]
 149         debug_log('URLs to download:', urls)
 150         for url in urls:
 151                 curl_argv = ['curl', url]
 152                 filename = os.path.split(urlparse.urlparse(url).path)[-1]
 153                 tar_argv = argv_from_filename(filename)
 154                 if tar_argv[0] != 'tar':
 155                         tar_argv = None
 156                         #Insert the -O option just before the URL, since we'll be saving the archive file to disk to pass to the unpacker.
 157                         curl_argv.insert(3, '-O')
 158                 whole_argv = (curl_argv + ['|'] + tar_argv) if tar_argv else curl_argv
 159                 print_argv(whole_argv)
 160
 161                 pid = os.fork()
 162                 if pid == 0:
 163                         # We are the child, which will become curl or tar.
 164                         read_end, write_end = os.pipe() if tar_argv else (None, None)
 165
 166                         curl_pid = os.fork()
 167                         assert curl_pid >= 0
 168                         if curl_pid == 0:
 169                                 if write_end is not None: os.dup2(write_end, sys.stdout.fileno())
 170                                 os.close(read_end)
 171                                 # We are the grandchild that will become curl.
 172                                 os.execvp(curl_argv[0], curl_argv)
 173
 174                         if not tar_argv:
 175                                 tar_pid = None
 176                         else:
 177                                 tar_pid = os.fork()
 178                                 assert tar_pid >= 0
 179                                 if tar_pid == 0:
 180                                         # We are the grandchild that will become tar.
 181                                         os.dup2(read_end, sys.stdin.fileno())
 182                                         os.close(write_end)
 183                                         os.execvp(tar_argv[0], tar_argv)
 184
 185                         # Only the grandchildren need the pipe. Close our copies of both ends of it.
 186                         os.close(read_end)
 187                         os.close(write_end)
 188
 189                         # We're still the child; wait for the grandchildren to exit.
 190                         exited_pid, curl_exit_status = os.waitpid(curl_pid, 0)
 191                         if tar_pid is None:
 192                                 tar_exit_status = 0 # for sys.exit below
 193                         else:
 194                                 exited_pid, tar_exit_status = os.waitpid(tar_pid, 0)
 195
 196                         if curl_exit_status:
 197                                 print >>sys.stderr, 'curl exited with status', curl_exit_status
 198                                 sys.exit(curl_exit_status if curl_exit_status else tar_exit_status)
 199                         if tar_exit_status:
 200                                 print >>sys.stderr, 'tar exited with status', tar_exit_status
 201                         sys.exit(tar_exit_status)
 202
 203                 assert pid > 0
 204                 exited_pid, exit_status = os.waitpid(pid, 0)
 205                 if exit_status == 0:
 206                         if not tar_argv:
 207                                 # We need to unzip this, so add it to the set of filenames to invoke unzip for.
 208                                 # The filename is the last component of the pathname.
 209                                 downloaded_files.add(url.rsplit('/', 1)[1])
 210
 211         debug_log('Files to unpack:', list(downloaded_files))
 212         if downloaded_files:
 213                 # Unpack all the files.
 214
 215                 # It's now more convenient to have this as a sequence than a set.
 216                 downloaded_files = list(downloaded_files)
 217
 218                 # Use spawnvp (which creates a new process) for all but the last.
 219                 for filename in downloaded_files[:-1]:
 220                         debug_log('Unpacking file:', filename)
 221                         argv = argv_from_filename(filename)
 222                         if argv:
 223                                 print_argv(argv)
 224                                 os.spawnvp(os.P_WAIT, argv[0], argv)
 225                         else:
 226                                 print >>sys.stderr, 'Unrecognized or no extension on filename:', filename
 227                 else:
 228                         filename = downloaded_files[-1]
 229                         # Use execvp (which reuses this process) for the last.
 230                         debug_log('Unpacking file:', filename)
 231                         argv = argv_from_filename(filename)
 232                         if argv:
 233                                 print_argv(argv)
 234                                 os.execvp(argv[0], argv)
 235                         else:
 236                                 print >>sys.stderr, 'Unrecognized or no extension on filename:', filename
 237         else: #if not downloaded_files:
 238                 # If no files were successfully downloaded, then we're not going to exec to an unpacker. Exit, so we don't leak this child process (particularly because if we do, it will go do a bunch of duplicate downloading).
 239                 # We exit with status 1 because we must have had at least one URL to download to even get here, but none of our URLs worked.
 240                 sys.exit(1)
 241
 242 # Clean up all our child processes.
 243 exit_status = 0
 244 for child in children:
 245         exited_pid, tmp_exit_status = os.waitpid(child, 0)
 246         if tmp_exit_status != 0:
 247                 exit_status = tmp_exit_status
 248 sys.exit(exit_status)