1 # This Source Code Form is subject to the terms of the Mozilla Public
2 # License, v. 2.0. If a copy of the MPL was not distributed with this
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 Fetch and cache artifacts from URLs.
This module manages fetching artifacts from URLs and purging old
artifacts using a simple Least Recently Used cache.
11 This module requires certain modules be importable from the ambient Python
12 environment. Consumers will need to arrange this themselves.
14 The bulk of the complexity is in managing and persisting several caches. If
15 we found a Python LRU cache that pickled cleanly, we could remove a lot of
16 this code! Sadly, I found no such candidate implementations, so we pickle
17 pylru caches manually.
19 None of the instances (or the underlying caches) are safe for concurrent use.
20 A future need, perhaps.
30 import mozpack
.path
as mozpath
32 import six
.moves
.urllib
.parse
as urlparse
34 from mozbuild
.util
import mkdir
# Using 'DownloadManager' through the provided interface we
# can't directly specify a 'chunk_size' for the 'Download' it manages.
# One way to get it to use the 'chunk_size' we want is to monkeypatch
# the defaults of the init function for the 'Download' class.
CHUNK_SIZE = 16 * 1024 * 1024  # 16 MB in bytes.
dl_init = dlmanager.Download.__init__
# Rebuild the defaults tuple, swapping in our CHUNK_SIZE for the second
# default (the 'chunk_size' parameter) while leaving the others intact.
dl_init.__defaults__ = (
    dl_init.__defaults__[:1] + (CHUNK_SIZE,) + dl_init.__defaults__[2:]
)

# Minimum number of downloaded artifacts to keep. Each artifact can be very large,
# so don't make this too large!
MIN_CACHED_ARTIFACTS = 12

# Maximum size of the downloaded artifacts to keep in cache, in bytes (2GiB).
MAX_CACHED_ARTIFACTS_SIZE = 2 * 1024 * 1024 * 1024
class ArtifactPersistLimit(dlmanager.PersistLimit):
    """Handle persistence for a cache of artifacts.

    When instantiating a DownloadManager, it starts by filling the
    PersistLimit instance it's given with register_dir_content.
    In practice, this registers all the files already in the cache directory.
    After a download finishes, the newly downloaded file is registered, and the
    oldest files registered to the PersistLimit instance are removed depending
    on the size and file limits it's configured for.

    This is all good, but there are a few tweaks we want here:

    - We have pickle files in the cache directory that we don't want purged.
    - Files that were just downloaded in the same session shouldn't be
      purged. (if for some reason we end up downloading more than the default
      max size, we don't want the files to be purged)

    To achieve this, this subclass of PersistLimit inhibits the register_file
    method for pickle files and tracks what files were downloaded in the same
    session to avoid removing them.

    The register_file method may be used to register cache matches too, so that
    later sessions know they were freshly used.
    """

    def __init__(self, log=None):
        super(ArtifactPersistLimit, self).__init__(
            size_limit=MAX_CACHED_ARTIFACTS_SIZE, file_limit=MIN_CACHED_ARTIFACTS
        )
        # Optional mach-style logging callable; see log() below.
        self._log = log
        # True only while register_dir_content() is scanning the existing
        # cache directory, so register_file() can tell pre-existing files
        # apart from fresh downloads/cache hits.
        self._registering_dir = False
        # Paths registered during this session; remove_old_files() never
        # purges these.
        self._downloaded_now = set()

    def log(self, *args, **kwargs):
        # The logger is optional; drop messages silently when absent.
        if self._log:
            self._log(*args, **kwargs)

    def register_file(self, path):
        # Never register (and therefore never purge) our own metadata files.
        if (
            path.endswith(".pickle")
            or path.endswith(".checksum")
            or os.path.basename(path) == ".metadata_never_index"
        ):
            return
        if not self._registering_dir:
            # Touch the file so that subsequent calls to a mach artifact
            # command know it was recently used. While remove_old_files
            # is based on access time, in various cases, the access time is not
            # updated when just reading the file, so we force an update.
            # NOTE(review): the utime call was garbled in the source; this is
            # the reconstruction implied by the comment above — confirm.
            try:
                os.utime(path, None)
            except OSError:
                pass
            self._downloaded_now.add(path)
        super(ArtifactPersistLimit, self).register_file(path)

    def register_dir_content(self, directory, pattern="*"):
        # Flag the scan so register_file() skips the touch/track logic for
        # files that were already in the cache directory.
        self._registering_dir = True
        super(ArtifactPersistLimit, self).register_dir_content(directory, pattern)
        self._registering_dir = False

    def remove_old_files(self):
        from dlmanager import fs

        # Purge least-recently-accessed files first.
        files = sorted(self.files, key=lambda f: f.stat.st_atime)
        kept = []
        while len(files) > self.file_limit and self._files_size >= self.size_limit:
            f = files.pop(0)
            if f.path in self._downloaded_now:
                # Files from this session are exempt from purging.
                kept.append(f)
                continue
            try:
                fs.remove(f.path)
            # Was `WindowsError` — undefined (NameError) on non-Windows
            # Python 3; OSError covers it on all platforms.
            except OSError:
                # For some reason, on automation, we can't remove those files.
                # So for now, ignore the error.
                continue
            self.log(
                logging.INFO,
                "artifact",
                {"filename": f.path},
                # Was "Purged artifact (unknown)", which ignored the
                # {"filename": ...} format params passed above.
                "Purged artifact {filename}",
            )
            self._files_size -= f.stat.st_size
        self.files = files + kept

    def remove_all(self):
        from dlmanager import fs

        # NOTE(review): body reconstructed from the method's evident intent
        # (drop every registered file and reset the accounting) — confirm.
        for f in self.files:
            fs.remove(f.path)
        self.files = []
        self._files_size = 0
class ArtifactCache(object):
    """Fetch artifacts from URLs and purge least recently used artifacts from disk."""

    def __init__(self, cache_dir, log=None, skip_cache=False):
        # Create the cache directory, flagged so desktop indexers skip it.
        mkdir(cache_dir, not_indexed=True)
        self._cache_dir = cache_dir
        # Optional mach-style logging callable; see log() below.
        self._log = log
        # When True, cached copies are deleted and re-downloaded on fetch,
        # and clear_cache() becomes a no-op.
        self._skip_cache = skip_cache
        self._persist_limit = ArtifactPersistLimit(log)
        self._download_manager = dlmanager.DownloadManager(
            self._cache_dir, persist_limit=self._persist_limit
        )
        # Last 5%-progress bucket reported; -1 means nothing reported yet.
        self._last_dl_update = -1

    def log(self, *args, **kwargs):
        # The logger is optional; drop messages silently when absent.
        if self._log:
            self._log(*args, **kwargs)

    def fetch(self, url, force=False):
        # `force` is accepted for interface compatibility; it is not used in
        # the visible logic — TODO confirm against callers.
        fname = os.path.basename(url)
        try:
            # Use the file name from the url if it looks like a hash digest.
            if len(fname) not in (32, 40, 56, 64, 96, 128):
                raise TypeError()
            binascii.unhexlify(fname)
        except (TypeError, binascii.Error):
            # We download to a temporary name like HASH[:16]-basename to
            # differentiate among URLs with the same basenames. We used to then
            # extract the build ID from the downloaded artifact and use it to make a
            # human readable unique name, but extracting build IDs is time consuming
            # (especially on Mac OS X, where we must mount a large DMG file).
            # (renamed from `hash`, which shadowed the builtin)
            url_hash = hashlib.sha256(six.ensure_binary(url)).hexdigest()[:16]
            # Strip query string and fragments.
            basename = os.path.basename(urlparse.urlparse(url).path)
            fname = url_hash + "-" + basename

        path = os.path.abspath(mozpath.join(self._cache_dir, fname))
        if self._skip_cache and os.path.exists(path):
            self.log(
                logging.INFO,
                "artifact",
                {"path": path},
                "Skipping cache: removing cached downloaded artifact {path}",
            )
            os.remove(path)

        try:
            # Returns a Download when a network fetch is needed, or a falsy
            # value when the file is already in the cache.
            dl = self._download_manager.download(url, fname)

            def download_progress(dl, bytes_so_far, total_size):
                # Log at most once per 5% step to keep output readable.
                percent = (float(bytes_so_far) / total_size) * 100
                now = int(percent / 5)
                if now == self._last_dl_update:
                    return
                self._last_dl_update = now
                self.log(
                    logging.INFO,
                    "artifact",
                    {
                        "bytes_so_far": bytes_so_far,
                        "total_size": total_size,
                        "percent": percent,
                    },
                    "Downloading... {percent:02.1f} %",
                )

            if dl:
                self.log(
                    logging.INFO,
                    "artifact",
                    {"path": path},
                    "Downloading artifact to local cache: {path}",
                )
                dl.set_progress(download_progress)
                # Block until the download completes.
                dl.wait()
            else:
                self.log(
                    logging.INFO,
                    "artifact",
                    {"path": path},
                    "Using artifact from local cache: {path}",
                )
                # Avoid the file being removed if it was in the cache already.
                path = os.path.join(self._cache_dir, fname)
                self._persist_limit.register_file(path)

            return os.path.abspath(mozpath.join(self._cache_dir, fname))
        finally:
            # Cancel any background downloads in progress.
            self._download_manager.cancel()

    def clear_cache(self):
        if self._skip_cache:
            self.log(
                logging.INFO, "artifact", {}, "Skipping cache: ignoring clear_cache!"
            )
            return

        self._persist_limit.remove_all()