# Strip leading slashes when exporting Subversion repositories.
# [bzr-fastimport.git] / cache_manager.py
# blob 6d8ef05cfdb40d06024dcd82ea4ed4226222e7e8
1 # Copyright (C) 2009 Canonical Ltd
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
13 # You should have received a copy of the GNU General Public License
14 # along with this program; if not, write to the Free Software
15 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 """A manager of caches."""
19 import atexit
20 import os
21 import shutil
22 import tempfile
23 import weakref
25 from bzrlib import lru_cache, trace
26 from bzrlib.plugins.fastimport import (
27 branch_mapper,
29 from fastimport.helpers import (
30 single_plural,
32 from fastimport.reftracker import (
33 RefTracker,
37 class _Cleanup(object):
38 """This class makes sure we clean up when CacheManager goes away.
40 We use a helper class to ensure that we are never in a refcycle.
41 """
43 def __init__(self, disk_blobs):
44 self.disk_blobs = disk_blobs
45 self.tempdir = None
46 self.small_blobs = None
48 def __del__(self):
49 self.finalize()
51 def finalize(self):
52 if self.disk_blobs is not None:
53 for info in self.disk_blobs.itervalues():
54 if info[-1] is not None:
55 os.unlink(info[-1])
56 self.disk_blobs = None
57 if self.small_blobs is not None:
58 self.small_blobs.close()
59 self.small_blobs = None
60 if self.tempdir is not None:
61 shutil.rmtree(self.tempdir)
64 class _Cleanup(object):
65 """This class makes sure we clean up when CacheManager goes away.
67 We use a helper class to ensure that we are never in a refcycle.
68 """
70 def __init__(self, disk_blobs):
71 self.disk_blobs = disk_blobs
72 self.tempdir = None
73 self.small_blobs = None
75 def __del__(self):
76 self.finalize()
78 def finalize(self):
79 if self.disk_blobs is not None:
80 for info in self.disk_blobs.itervalues():
81 if info[-1] is not None:
82 os.unlink(info[-1])
83 self.disk_blobs = None
84 if self.small_blobs is not None:
85 self.small_blobs.close()
86 self.small_blobs = None
87 if self.tempdir is not None:
88 shutil.rmtree(self.tempdir)
91 class CacheManager(object):
93 _small_blob_threshold = 25*1024
94 _sticky_cache_size = 300*1024*1024
95 _sticky_flushed_size = 100*1024*1024
97 def __init__(self, info=None, verbose=False, inventory_cache_size=10):
98 """Create a manager of caches.
100 :param info: a ConfigObj holding the output from
101 the --info processor, or None if no hints are available
103 self.verbose = verbose
105 # dataref -> data. datref is either :mark or the sha-1.
106 # Sticky blobs are referenced more than once, and are saved until their
107 # refcount goes to 0
108 self._blobs = {}
109 self._sticky_blobs = {}
110 self._sticky_memory_bytes = 0
111 # if we overflow our memory cache, then we will dump large blobs to
112 # disk in this directory
113 self._tempdir = None
114 # id => (offset, n_bytes, fname)
115 # if fname is None, then the content is stored in the small file
116 self._disk_blobs = {}
117 self._cleanup = _Cleanup(self._disk_blobs)
119 # revision-id -> Inventory cache
120 # these are large and we probably don't need too many as
121 # most parents are recent in history
122 self.inventories = lru_cache.LRUCache(inventory_cache_size)
124 # import commmit-ids -> revision-id lookup table
125 # we need to keep all of these but they are small
126 self.revision_ids = {}
128 # (path, branch_ref) -> file-ids - as generated.
129 # (Use store_file_id/fetch_fileid methods rather than direct access.)
131 # Work out the blobs to make sticky - None means all
132 self._blob_ref_counts = {}
133 if info is not None:
134 try:
135 blobs_by_counts = info['Blob reference counts']
136 # The parser hands values back as lists, already parsed
137 for count, blob_list in blobs_by_counts.items():
138 n = int(count)
139 for b in blob_list:
140 self._blob_ref_counts[b] = n
141 except KeyError:
142 # info not in file - possible when no blobs used
143 pass
145 # BranchMapper has no state (for now?), but we keep it around rather
146 # than reinstantiate on every usage
147 self.branch_mapper = branch_mapper.BranchMapper()
149 self.reftracker = RefTracker()
151 def dump_stats(self, note=trace.note):
152 """Dump some statistics about what we cached."""
153 # TODO: add in inventory stastistics
154 note("Cache statistics:")
155 self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
156 self._show_stats_for(self.revision_ids, "revision-ids", note=note)
157 # These aren't interesting so omit from the output, at least for now
158 #self._show_stats_for(self._blobs, "other blobs", note=note)
159 #self.reftracker.dump_stats(note=note)
161 def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
162 """Dump statistics about a given dictionary.
164 By the key and value need to support len().
166 count = len(dict)
167 if tuple_key:
168 size = sum(map(len, (''.join(k) for k in dict.keys())))
169 else:
170 size = sum(map(len, dict.keys()))
171 size += sum(map(len, dict.values()))
172 size = size * 1.0 / 1024
173 unit = 'K'
174 if size > 1024:
175 size = size / 1024
176 unit = 'M'
177 if size > 1024:
178 size = size / 1024
179 unit = 'G'
180 note(" %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
181 single_plural(count, "item", "items")))
183 def clear_all(self):
184 """Free up any memory used by the caches."""
185 self._blobs.clear()
186 self._sticky_blobs.clear()
187 self.revision_ids.clear()
188 self.reftracker.clear()
189 self.inventories.clear()
191 def _flush_blobs_to_disk(self):
192 blobs = self._sticky_blobs.keys()
193 sticky_blobs = self._sticky_blobs
194 total_blobs = len(sticky_blobs)
195 blobs.sort(key=lambda k:len(sticky_blobs[k]))
196 if self._tempdir is None:
197 tempdir = tempfile.mkdtemp(prefix='fastimport_blobs-')
198 self._tempdir = tempdir
199 self._cleanup.tempdir = self._tempdir
200 self._cleanup.small_blobs = tempfile.TemporaryFile(
201 prefix='small-blobs-', dir=self._tempdir)
202 small_blob_ref = weakref.ref(self._cleanup.small_blobs)
203 # Even though we add it to _Cleanup it seems that the object can be
204 # destroyed 'too late' for cleanup to actually occur. Probably a
205 # combination of bzr's "die directly, don't clean up" and how
206 # exceptions close the running stack.
207 def exit_cleanup():
208 small_blob = small_blob_ref()
209 if small_blob is not None:
210 small_blob.close()
211 shutil.rmtree(tempdir, ignore_errors=True)
212 atexit.register(exit_cleanup)
213 count = 0
214 bytes = 0
215 n_small_bytes = 0
216 while self._sticky_memory_bytes > self._sticky_flushed_size:
217 id = blobs.pop()
218 blob = self._sticky_blobs.pop(id)
219 n_bytes = len(blob)
220 self._sticky_memory_bytes -= n_bytes
221 if n_bytes < self._small_blob_threshold:
222 f = self._cleanup.small_blobs
223 f.seek(0, os.SEEK_END)
224 self._disk_blobs[id] = (f.tell(), n_bytes, None)
225 f.write(blob)
226 n_small_bytes += n_bytes
227 else:
228 fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
229 os.write(fd, blob)
230 os.close(fd)
231 self._disk_blobs[id] = (0, n_bytes, name)
232 bytes += n_bytes
233 del blob
234 count += 1
235 trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
236 % (count, total_blobs, bytes / 1024. / 1024,
237 n_small_bytes / 1024. / 1024))
239 def store_blob(self, id, data):
240 """Store a blob of data."""
241 # Note: If we're not reference counting, everything has to be sticky
242 if not self._blob_ref_counts or id in self._blob_ref_counts:
243 self._sticky_blobs[id] = data
244 self._sticky_memory_bytes += len(data)
245 if self._sticky_memory_bytes > self._sticky_cache_size:
246 self._flush_blobs_to_disk()
247 elif data == '':
248 # Empty data is always sticky
249 self._sticky_blobs[id] = data
250 else:
251 self._blobs[id] = data
253 def _decref(self, id, cache, fn):
254 if not self._blob_ref_counts:
255 return False
256 count = self._blob_ref_counts.get(id, None)
257 if count is not None:
258 count -= 1
259 if count <= 0:
260 del cache[id]
261 if fn is not None:
262 os.unlink(fn)
263 del self._blob_ref_counts[id]
264 return True
265 else:
266 self._blob_ref_counts[id] = count
267 return False
269 def fetch_blob(self, id):
270 """Fetch a blob of data."""
271 if id in self._blobs:
272 return self._blobs.pop(id)
273 if id in self._disk_blobs:
274 (offset, n_bytes, fn) = self._disk_blobs[id]
275 if fn is None:
276 f = self._cleanup.small_blobs
277 f.seek(offset)
278 content = f.read(n_bytes)
279 else:
280 fp = open(fn, 'rb')
281 try:
282 content = fp.read()
283 finally:
284 fp.close()
285 self._decref(id, self._disk_blobs, fn)
286 return content
287 content = self._sticky_blobs[id]
288 if self._decref(id, self._sticky_blobs, None):
289 self._sticky_memory_bytes -= len(content)
290 return content