# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 """
6 Fetch and cache artifacts from URLs.
8 This module manages fetching artifacts from URLS and purging old
9 artifacts using a simple Least Recently Used cache.
11 This module requires certain modules be importable from the ambient Python
12 environment. Consumers will need to arrange this themselves.
14 The bulk of the complexity is in managing and persisting several caches. If
15 we found a Python LRU cache that pickled cleanly, we could remove a lot of
16 this code! Sadly, I found no such candidate implementations, so we pickle
17 pylru caches manually.
19 None of the instances (or the underlying caches) are safe for concurrent use.
20 A future need, perhaps.
21 """

import binascii
import hashlib
import logging
import os

import dlmanager
import mozpack.path as mozpath
import six
import six.moves.urllib.parse as urlparse

from mozbuild.util import mkdir

# When using 'DownloadManager' through the provided interface, we can't
# directly specify a 'chunk_size' for the 'Download' objects it manages.
# One way to get it to use the 'chunk_size' we want is to monkeypatch
# the defaults of the init function for the 'Download' class.
CHUNK_SIZE = 16 * 1024 * 1024  # 16 MB in bytes.
dl_init = dlmanager.Download.__init__
dl_init.__defaults__ = (
    dl_init.__defaults__[:1] + (CHUNK_SIZE,) + dl_init.__defaults__[2:]
)
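
# The same technique in a generic sketch (the 'download' function here is
# hypothetical, not part of dlmanager): given
#
#   def download(url, dest=None, chunk_size=4096, timeout=30): ...
#
# download.__defaults__ is the tuple (None, 4096, 30); rebuilding it with a
# new middle element changes the default 'chunk_size' for all future calls:
#
#   download.__defaults__ = (
#       download.__defaults__[:1] + (CHUNK_SIZE,) + download.__defaults__[2:]
#   )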

# Minimum number of downloaded artifacts to keep. Each artifact can be very
# large, so don't make this too large!
MIN_CACHED_ARTIFACTS = 12

# Maximum size of the downloaded artifacts to keep in cache, in bytes (2 GiB).
MAX_CACHED_ARTIFACTS_SIZE = 2 * 1024 * 1024 * 1024


class ArtifactPersistLimit(dlmanager.PersistLimit):
    """Handle persistence for a cache of artifacts.

    When instantiating a DownloadManager, it starts by filling the
    PersistLimit instance it's given with register_dir_content.
    In practice, this registers all the files already in the cache directory.
    After a download finishes, the newly downloaded file is registered, and
    the oldest files registered to the PersistLimit instance are removed,
    depending on the size and file limits it's configured for.

    This is all good, but there are a few tweaks we want here:

    - We have pickle files in the cache directory that we don't want purged.
    - Files that were just downloaded in the same session shouldn't be
      purged. (If for some reason we end up downloading more than the default
      max size, we don't want those files to be purged.)

    To achieve this, this subclass of PersistLimit inhibits the register_file
    method for pickle files and tracks which files were downloaded in the same
    session, to avoid removing them.

    The register_file method may also be used to register cache hits, so that
    later sessions know they were freshly used.
    """

    def __init__(self, log=None):
        super(ArtifactPersistLimit, self).__init__(
            size_limit=MAX_CACHED_ARTIFACTS_SIZE, file_limit=MIN_CACHED_ARTIFACTS
        )
        self._log = log
        self._registering_dir = False
        self._downloaded_now = set()

    def log(self, *args, **kwargs):
        if self._log:
            self._log(*args, **kwargs)

    def register_file(self, path):
        if (
            path.endswith(".pickle")
            or path.endswith(".checksum")
            or os.path.basename(path) == ".metadata_never_index"
        ):
            return
        if not self._registering_dir:
            # Touch the file so that subsequent calls to a mach artifact
            # command know it was recently used. While remove_old_files
            # is based on access time, in various cases the access time is not
            # updated when just reading the file, so we force an update.
            try:
                os.utime(path, None)
            except OSError:
                pass
            self._downloaded_now.add(path)
        super(ArtifactPersistLimit, self).register_file(path)

    def register_dir_content(self, directory, pattern="*"):
        self._registering_dir = True
        super(ArtifactPersistLimit, self).register_dir_content(directory, pattern)
        self._registering_dir = False

    def remove_old_files(self):
        from dlmanager import fs

        files = sorted(self.files, key=lambda f: f.stat.st_atime)
        kept = []
        # Purge the least recently accessed files, but only while we hold more
        # than the minimum number of artifacts and exceed the size cap.
        while len(files) > self.file_limit and self._files_size >= self.size_limit:
            f = files.pop(0)
            if f.path in self._downloaded_now:
                kept.append(f)
                continue
            try:
                fs.remove(f.path)
            except WindowsError:
                # For some reason, on automation, we can't remove those files.
                # So for now, ignore the error.
                kept.append(f)
                continue
            self.log(
                logging.INFO,
                "artifact",
                {"filename": f.path},
                "Purged artifact {filename}",
            )
            self._files_size -= f.stat.st_size
        self.files = files + kept

    def remove_all(self):
        from dlmanager import fs

        for f in self.files:
            fs.remove(f.path)
        self._files_size = 0
        self.files = []


class ArtifactCache(object):
    """Fetch artifacts from URLs and purge least recently used artifacts from disk."""

    def __init__(self, cache_dir, log=None, skip_cache=False):
        mkdir(cache_dir, not_indexed=True)
        self._cache_dir = cache_dir
        self._log = log
        self._skip_cache = skip_cache
        self._persist_limit = ArtifactPersistLimit(log)
        self._download_manager = dlmanager.DownloadManager(
            self._cache_dir, persist_limit=self._persist_limit
        )
        self._last_dl_update = -1

    def log(self, *args, **kwargs):
        if self._log:
            self._log(*args, **kwargs)

    def fetch(self, url, force=False):
        fname = os.path.basename(url)
        try:
            # Use the file name from the url if it looks like a hash digest.
            if len(fname) not in (32, 40, 56, 64, 96, 128):
                raise TypeError()
            binascii.unhexlify(fname)
        except (TypeError, binascii.Error):
            # We download to a temporary name like HASH[:16]-basename to
            # differentiate among URLs with the same basenames. We used to then
            # extract the build ID from the downloaded artifact and use it to
            # make a human readable unique name, but extracting build IDs is
            # time consuming (especially on Mac OS X, where we must mount a
            # large DMG file).
            hash = hashlib.sha256(six.ensure_binary(url)).hexdigest()[:16]
            # Strip query string and fragments.
            basename = os.path.basename(urlparse.urlparse(url).path)
            fname = hash + "-" + basename
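
        # For example (hypothetical URL and digest, for illustration only):
        #   url   = "https://example.com/builds/target.tar.gz?sig=abc#frag"
        #   fname = "0f1e2d3c4b5a6978-target.tar.gz"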

        path = os.path.abspath(mozpath.join(self._cache_dir, fname))
        if self._skip_cache and os.path.exists(path):
            self.log(
                logging.INFO,
                "artifact",
                {"path": path},
                "Skipping cache: removing cached downloaded artifact {path}",
            )
            os.remove(path)

        try:
            dl = self._download_manager.download(url, fname)

            def download_progress(dl, bytes_so_far, total_size):
                if not total_size:
                    return
                percent = (float(bytes_so_far) / total_size) * 100
                # Only log when the download crosses into a new 5% bucket,
                # to avoid flooding the output with progress updates.
                now = int(percent / 5)
                if now == self._last_dl_update:
                    return
                self._last_dl_update = now
                self.log(
                    logging.INFO,
                    "artifact",
                    {
                        "bytes_so_far": bytes_so_far,
                        "total_size": total_size,
                        "percent": percent,
                    },
                    "Downloading... {percent:02.1f} %",
                )

            if dl:
                self.log(
                    logging.INFO,
                    "artifact",
                    {"path": path},
                    "Downloading artifact to local cache: {path}",
                )
                dl.set_progress(download_progress)
                dl.wait()
            else:
                self.log(
                    logging.INFO,
                    "artifact",
                    {"path": path},
                    "Using artifact from local cache: {path}",
                )
                # Avoid the file being removed if it was in the cache already.
                path = os.path.join(self._cache_dir, fname)
                self._persist_limit.register_file(path)

            return os.path.abspath(mozpath.join(self._cache_dir, fname))
        finally:
            # Cancel any background downloads in progress.
            self._download_manager.cancel()

    def clear_cache(self):
        if self._skip_cache:
            self.log(
                logging.INFO, "artifact", {}, "Skipping cache: ignoring clear_cache!"
            )
            return

        self._persist_limit.remove_all()