1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 Fetch and cache artifacts from URLs.
This module manages fetching artifacts from URLs and purging old
artifacts using a simple Least Recently Used cache.
11 This module requires certain modules be importable from the ambient Python
12 environment. Consumers will need to arrange this themselves.
14 The bulk of the complexity is in managing and persisting several caches. If
15 we found a Python LRU cache that pickled cleanly, we could remove a lot of
16 this code! Sadly, I found no such candidate implementations, so we pickle
17 pylru caches manually.
19 None of the instances (or the underlying caches) are safe for concurrent use.
20 A future need, perhaps.
30 import mozpack
.path
as mozpath
32 import six
.moves
.urllib
.parse
as urlparse
34 from mozbuild
.util
import mkdir
# Using 'DownloadManager' through the provided interface we
# can't directly specify a 'chunk_size' for the 'Download' it manages.
# One way to get it to use the 'chunk_size' we want is to monkeypatch
# the defaults of the init function for the 'Download' class.
CHUNK_SIZE = 16 * 1024 * 1024  # 16 MB in bytes.
dl_init = dlmanager.Download.__init__
# Rebuild the defaults tuple, swapping in our CHUNK_SIZE for the second
# default (the 'chunk_size' parameter) while leaving the others intact.
dl_init.__defaults__ = (
    dl_init.__defaults__[:1] + (CHUNK_SIZE,) + dl_init.__defaults__[2:]
)

# Minimum number of downloaded artifacts to keep. Each artifact can be very large,
# so don't make this too large!
MIN_CACHED_ARTIFACTS = 12

# Maximum size of the downloaded artifacts to keep in cache, in bytes (2GiB).
MAX_CACHED_ARTIFACTS_SIZE = 2 * 1024 * 1024 * 1024
class ArtifactPersistLimit(dlmanager.PersistLimit):
    """Handle persistence for a cache of artifacts.

    When instantiating a DownloadManager, it starts by filling the
    PersistLimit instance it's given with register_dir_content.
    In practice, this registers all the files already in the cache directory.
    After a download finishes, the newly downloaded file is registered, and the
    oldest files registered to the PersistLimit instance are removed depending
    on the size and file limits it's configured for.

    This is all good, but there are a few tweaks we want here:

    - We have pickle files in the cache directory that we don't want purged.
    - Files that were just downloaded in the same session shouldn't be
      purged. (if for some reason we end up downloading more than the default
      max size, we don't want the files to be purged)

    To achieve this, this subclass of PersistLimit inhibits the register_file
    method for pickle files and tracks what files were downloaded in the same
    session to avoid removing them.

    The register_file method may be used to register cache matches too, so that
    later sessions know they were freshly used.
    """

    def __init__(self, log=None):
        super(ArtifactPersistLimit, self).__init__(
            size_limit=MAX_CACHED_ARTIFACTS_SIZE, file_limit=MIN_CACHED_ARTIFACTS
        )
        # Optional mach-style logging callable; see log() below.
        self._log = log
        # True only while register_dir_content() is scanning the existing
        # cache directory, so register_file() can tell pre-existing files
        # apart from fresh downloads/cache hits.
        self._registering_dir = False
        # Paths registered during this session; remove_old_files() never
        # purges these.
        self._downloaded_now = set()

    def log(self, *args, **kwargs):
        # The logger is optional; drop messages silently when absent.
        if self._log:
            self._log(*args, **kwargs)

    def register_file(self, path):
        # Never register (and therefore never purge) our own metadata files.
        if (
            path.endswith(".pickle")
            or path.endswith(".checksum")
            or os.path.basename(path) == ".metadata_never_index"
        ):
            return
        if not self._registering_dir:
            # Touch the file so that subsequent calls to a mach artifact
            # command know it was recently used. While remove_old_files
            # is based on access time, in various cases, the access time is not
            # updated when just reading the file, so we force an update.
            # NOTE(review): the utime call was garbled in the source; this is
            # the reconstruction implied by the comment above — confirm.
            try:
                os.utime(path, None)
            except OSError:
                pass
            self._downloaded_now.add(path)
        super(ArtifactPersistLimit, self).register_file(path)

    def register_dir_content(self, directory, pattern="*"):
        # Flag the scan so register_file() skips the touch/track logic for
        # files that were already in the cache directory.
        self._registering_dir = True
        super(ArtifactPersistLimit, self).register_dir_content(directory, pattern)
        self._registering_dir = False

    def remove_old_files(self):
        from dlmanager import fs

        # Purge least-recently-accessed files first.
        files = sorted(self.files, key=lambda f: f.stat.st_atime)
        kept = []
        while len(files) > self.file_limit and self._files_size >= self.size_limit:
            f = files.pop(0)
            if f.path in self._downloaded_now:
                # Files from this session are exempt from purging.
                kept.append(f)
                continue
            try:
                fs.remove(f.path)
            # Was `WindowsError` — undefined (NameError) on non-Windows
            # Python 3; OSError covers it on all platforms.
            except OSError:
                # For some reason, on automation, we can't remove those files.
                # So for now, ignore the error.
                continue
            self.log(
                logging.INFO,
                "artifact",
                {"filename": f.path},
                # Was "Purged artifact (unknown)", which ignored the
                # {"filename": ...} format params passed above.
                "Purged artifact {filename}",
            )
            self._files_size -= f.stat.st_size
        self.files = files + kept

    def remove_all(self):
        from dlmanager import fs

        # NOTE(review): body reconstructed from the method's evident intent
        # (drop every registered file and reset the accounting) — confirm.
        for f in self.files:
            fs.remove(f.path)
        self.files = []
        self._files_size = 0
class ArtifactCache(object):
    """Fetch artifacts from URLs and purge least recently used artifacts from disk."""

    def __init__(self, cache_dir, log=None, skip_cache=False):
        # Create the cache directory, flagged so desktop indexers skip it.
        mkdir(cache_dir, not_indexed=True)
        self._cache_dir = cache_dir
        # Optional mach-style logging callable; see log() below.
        self._log = log
        # When True, cached copies are deleted and re-downloaded on fetch,
        # and clear_cache() becomes a no-op.
        self._skip_cache = skip_cache
        self._persist_limit = ArtifactPersistLimit(log)
        self._download_manager = dlmanager.DownloadManager(
            self._cache_dir, persist_limit=self._persist_limit
        )
        # Last 5%-progress bucket reported; -1 means nothing reported yet.
        self._last_dl_update = -1

    def log(self, *args, **kwargs):
        # The logger is optional; drop messages silently when absent.
        if self._log:
            self._log(*args, **kwargs)

    def fetch(self, url, force=False):
        # `force` is accepted for interface compatibility; it is not used in
        # the visible logic — TODO confirm against callers.
        fname = os.path.basename(url)
        try:
            # Use the file name from the url if it looks like a hash digest.
            if len(fname) not in (32, 40, 56, 64, 96, 128):
                raise TypeError()
            binascii.unhexlify(fname)
        except (TypeError, binascii.Error):
            # We download to a temporary name like HASH[:16]-basename to
            # differentiate among URLs with the same basenames. We used to then
            # extract the build ID from the downloaded artifact and use it to make a
            # human readable unique name, but extracting build IDs is time consuming
            # (especially on Mac OS X, where we must mount a large DMG file).
            # (renamed from `hash`, which shadowed the builtin)
            url_hash = hashlib.sha256(six.ensure_binary(url)).hexdigest()[:16]
            # Strip query string and fragments.
            basename = os.path.basename(urlparse.urlparse(url).path)
            fname = url_hash + "-" + basename

        path = os.path.abspath(mozpath.join(self._cache_dir, fname))
        if self._skip_cache and os.path.exists(path):
            self.log(
                logging.INFO,
                "artifact",
                {"path": path},
                "Skipping cache: removing cached downloaded artifact {path}",
            )
            os.remove(path)

        try:
            # Returns a Download when a network fetch is needed, or a falsy
            # value when the file is already in the cache.
            dl = self._download_manager.download(url, fname)

            def download_progress(dl, bytes_so_far, total_size):
                # Log at most once per 5% step to keep output readable.
                percent = (float(bytes_so_far) / total_size) * 100
                now = int(percent / 5)
                if now == self._last_dl_update:
                    return
                self._last_dl_update = now
                self.log(
                    logging.INFO,
                    "artifact",
                    {
                        "bytes_so_far": bytes_so_far,
                        "total_size": total_size,
                        "percent": percent,
                    },
                    "Downloading... {percent:02.1f} %",
                )

            if dl:
                self.log(
                    logging.INFO,
                    "artifact",
                    {"path": path},
                    "Downloading artifact to local cache: {path}",
                )
                dl.set_progress(download_progress)
                # Block until the download completes.
                dl.wait()
            else:
                self.log(
                    logging.INFO,
                    "artifact",
                    {"path": path},
                    "Using artifact from local cache: {path}",
                )
                # Avoid the file being removed if it was in the cache already.
                path = os.path.join(self._cache_dir, fname)
                self._persist_limit.register_file(path)

            return os.path.abspath(mozpath.join(self._cache_dir, fname))
        finally:
            # Cancel any background downloads in progress.
            self._download_manager.cancel()

    def clear_cache(self):
        if self._skip_cache:
            self.log(
                logging.INFO, "artifact", {}, "Skipping cache: ignoring clear_cache!"
            )
            return

        self._persist_limit.remove_all()