Cope with older versions of bzr.
[bzr-fastimport.git] / cache_manager.py
blob4321fe6219801b868d85b669726c7fd3fef7ab12
1 # Copyright (C) 2009 Canonical Ltd
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 """A manager of caches."""
19 import atexit
20 import os
21 import shutil
22 import tempfile
23 import weakref
25 from bzrlib import lru_cache, trace
26 from bzrlib.plugins.fastimport import (
27 branch_mapper,
29 from fastimport.helpers import (
30 single_plural,
32 from fastimport.reftracker import (
33 RefTracker,
37 class _Cleanup(object):
38 """This class makes sure we clean up when CacheManager goes away.
40 We use a helper class to ensure that we are never in a refcycle.
41 """
43 def __init__(self, disk_blobs):
44 self.disk_blobs = disk_blobs
45 self.tempdir = None
46 self.small_blobs = None
48 def __del__(self):
49 self.finalize()
51 def finalize(self):
52 if self.disk_blobs is not None:
53 for info in self.disk_blobs.itervalues():
54 if info[-1] is not None:
55 os.unlink(info[-1])
56 self.disk_blobs = None
57 if self.small_blobs is not None:
58 self.small_blobs.close()
59 self.small_blobs = None
60 if self.tempdir is not None:
61 shutil.rmtree(self.tempdir)
class CacheManager(object):
    """A manager of the caches used during an import.

    It holds blobs (in memory, spilling "sticky" blobs to temporary files
    once they use too much memory), an LRU cache of inventories, the
    mark -> commit-id table, a BranchMapper and a RefTracker.
    """

    # Flushed blobs smaller than this many bytes are appended to one shared
    # temporary file; larger ones get a file each.
    _small_blob_threshold = 25*1024
    # Sticky blobs are flushed to disk once they use more than this many
    # bytes of memory ...
    _sticky_cache_size = 300*1024*1024
    # ... and flushing stops once they use no more than this many bytes.
    _sticky_flushed_size = 100*1024*1024

    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Create a manager of caches.

        :param info: a ConfigObj holding the output from
            the --info processor, or None if no hints are available
        :param verbose: stored on the instance as ``self.verbose``
        :param inventory_cache_size: size passed to the LRU cache that
            holds inventories
        """
        self.verbose = verbose

        # dataref -> data. dataref is either :mark or the sha-1.
        # Sticky blobs are referenced more than once, and are saved until
        # their refcount goes to 0
        self._blobs = {}
        self._sticky_blobs = {}
        self._sticky_memory_bytes = 0
        # if we overflow our memory cache, then we will dump large blobs to
        # disk in this directory
        self._tempdir = None
        # id => (offset, n_bytes, fname)
        #   if fname is None, then the content is stored in the small file
        self._disk_blobs = {}
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory cache
        # these are large and we probably don't need too many as
        # most parents are recent in history
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commit-ids -> revision-id lookup table
        # we need to keep all of these but they are small
        self.marks = {}

        # NOTE(review): a "(path, branch_ref) -> file-ids" table with
        # store_file_id/fetch_fileid accessors was described at this point,
        # but no such attribute is created here - confirm it is obsolete.

        # Work out the blobs to make sticky. An empty map means that no
        # reference counts are known, in which case store_blob() treats
        # every blob as sticky.
        self._blob_ref_counts = {}
        if info is not None:
            try:
                blobs_by_counts = info['Blob reference counts']
            except KeyError:
                # info not in file - possible when no blobs used
                pass
            else:
                # The parser hands values back as lists, already parsed
                for count, blob_list in blobs_by_counts.items():
                    n = int(count)
                    for b in blob_list:
                        self._blob_ref_counts[b] = n

        # BranchMapper has no state (for now?), but we keep it around rather
        # than reinstantiate on every usage
        self.branch_mapper = branch_mapper.BranchMapper()

        self.reftracker = RefTracker()

    def add_mark(self, mark, commit_id):
        """Record the commit id for a mark.

        :param mark: the mark, without its leading ':'
        :param commit_id: the value to associate with the mark
        """
        assert mark[0] != ':'
        self.marks[mark] = commit_id

    def lookup_committish(self, committish):
        """Resolve a 'committish' to a revision id.

        :param committish: A "committish" string; it must start with ':'
        :return: Bazaar revision id
        :raises KeyError: if the mark has not been added
        """
        assert committish[0] == ':'
        return self.marks[committish.lstrip(':')]

    def dump_stats(self, note=trace.note):
        """Dump some statistics about what we cached.

        :param note: callable used to emit each line of output
        """
        # TODO: add in inventory statistics
        note("Cache statistics:")
        self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
        self._show_stats_for(self.marks, "revision-ids", note=note)
        # These aren't interesting so omit from the output, at least for now
        #self._show_stats_for(self._blobs, "other blobs", note=note)
        #self.reftracker.dump_stats(note=note)

    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Dump statistics about a given dictionary.

        Both the keys and the values need to support len().

        :param dict: the dictionary to report on
        :param label: name shown for the dictionary in the output
        :param note: callable used to emit the line of output
        :param tuple_key: if True, each key is a tuple of strings and its
            size is the length of those strings joined together
        """
        count = len(dict)
        if tuple_key:
            size = sum(map(len, (''.join(k) for k in dict.keys())))
        else:
            size = sum(map(len, dict.keys()))
        size += sum(map(len, dict.values()))
        size = size * 1.0 / 1024
        unit = 'K'
        if size > 1024:
            size = size / 1024
            unit = 'M'
            if size > 1024:
                size = size / 1024
                unit = 'G'
        note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
            single_plural(count, "item", "items")))

    def clear_all(self):
        """Free up any memory used by the caches."""
        self._blobs.clear()
        self._sticky_blobs.clear()
        # No sticky blob is left in memory, so the byte counter must follow;
        # otherwise a later store_blob() would flush based on a stale total.
        self._sticky_memory_bytes = 0
        self.marks.clear()
        self.reftracker.clear()
        self.inventories.clear()

    def _flush_blobs_to_disk(self):
        """Move sticky blobs from memory to disk, largest first.

        Blobs are written out until the sticky memory use is no more than
        ``_sticky_flushed_size``. Small blobs are appended to one shared
        temporary file; every other blob gets a file of its own. The
        temporary directory and the shared file are created on first use.
        """
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Sorted by ascending size so that pop() hands back the largest
        # blob that is still in memory.
        blobs = sorted(sticky_blobs, key=lambda k: len(sticky_blobs[k]))
        if self._tempdir is None:
            tempdir = tempfile.mkdtemp(prefix='fastimport_blobs-')
            self._tempdir = tempdir
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-', dir=self._tempdir)
            small_blob_ref = weakref.ref(self._cleanup.small_blobs)
            # Even though we add it to _Cleanup it seems that the object can
            # be destroyed 'too late' for cleanup to actually occur. Probably
            # a combination of bzr's "die directly, don't clean up" and how
            # exceptions close the running stack.
            def exit_cleanup():
                small_blob = small_blob_ref()
                if small_blob is not None:
                    small_blob.close()
                shutil.rmtree(tempdir, ignore_errors=True)
            atexit.register(exit_cleanup)
        count = 0
        flushed_bytes = 0
        n_small_bytes = 0
        # Also stop when nothing is left to flush: if the byte counter has
        # drifted above the real total, popping from an empty list would
        # raise IndexError.
        while blobs and self._sticky_memory_bytes > self._sticky_flushed_size:
            id = blobs.pop()
            blob = sticky_blobs.pop(id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                f = self._cleanup.small_blobs
                f.seek(0, os.SEEK_END)
                self._disk_blobs[id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                # Wrap the descriptor in a file object so that the whole
                # blob is written and the descriptor is closed on error.
                blob_file = os.fdopen(fd, 'wb')
                try:
                    blob_file.write(blob)
                finally:
                    blob_file.close()
                self._disk_blobs[id] = (0, n_bytes, name)
            flushed_bytes += n_bytes
            del blob
            count += 1
        if not sticky_blobs:
            # Everything was flushed, so nothing is held in memory any more.
            self._sticky_memory_bytes = 0
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, flushed_bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))

    def store_blob(self, id, data):
        """Store a blob of data.

        :param id: the dataref the blob is stored under
        :param data: the blob content
        """
        # Note: If we're not reference counting, everything has to be sticky
        if not self._blob_ref_counts or id in self._blob_ref_counts:
            previous = self._sticky_blobs.get(id)
            if previous is not None:
                # Replacing a blob: stop counting the bytes of the old one.
                self._sticky_memory_bytes -= len(previous)
            self._sticky_blobs[id] = data
            self._sticky_memory_bytes += len(data)
            if self._sticky_memory_bytes > self._sticky_cache_size:
                self._flush_blobs_to_disk()
        elif data == '':
            # Empty data is always sticky
            self._sticky_blobs[id] = data
        else:
            self._blobs[id] = data

    def _decref(self, id, cache, fn):
        """Drop one reference to a blob, removing it when none are left.

        :param id: the dataref of the blob
        :param cache: the dict the blob entry is deleted from once its
            reference count reaches zero
        :param fn: name of a file to delete together with the entry, or None
        :return: True if the entry was removed, False otherwise (including
            when no reference counts are known at all)
        """
        if not self._blob_ref_counts:
            return False
        count = self._blob_ref_counts.get(id, None)
        if count is not None:
            count -= 1
            if count <= 0:
                del cache[id]
                if fn is not None:
                    os.unlink(fn)
                del self._blob_ref_counts[id]
                return True
            else:
                self._blob_ref_counts[id] = count
        return False

    def fetch_blob(self, id):
        """Fetch a blob of data.

        Non-sticky blobs are removed from the cache as they are returned.
        Blobs on disk and sticky blobs are only released once their
        reference count is used up.

        :param id: the dataref the blob was stored under
        :return: the blob content
        :raises KeyError: if no blob is stored under id
        """
        if id in self._blobs:
            return self._blobs.pop(id)
        if id in self._disk_blobs:
            (offset, n_bytes, fn) = self._disk_blobs[id]
            if fn is None:
                f = self._cleanup.small_blobs
                f.seek(offset)
                content = f.read(n_bytes)
            else:
                fp = open(fn, 'rb')
                try:
                    content = fp.read()
                finally:
                    fp.close()
            self._decref(id, self._disk_blobs, fn)
            return content
        content = self._sticky_blobs[id]
        if self._decref(id, self._sticky_blobs, None):
            self._sticky_memory_bytes -= len(content)
        return content