# Copyright (C) 2009 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
"""A manager of caches."""
import atexit
import os
import shutil
import tempfile
import weakref

from bzrlib import lru_cache, trace
from bzrlib.plugins.fastimport import branch_mapper, helpers
29 class _Cleanup(object):
30 """This class makes sure we clean up when CacheManager goes away.
32 We use a helper class to ensure that we are never in a refcycle.
35 def __init__(self
, disk_blobs
):
36 self
.disk_blobs
= disk_blobs
38 self
.small_blobs
= None
44 if self
.disk_blobs
is not None:
45 for info
in self
.disk_blobs
.itervalues():
46 if info
[-1] is not None:
48 self
.disk_blobs
= None
49 if self
.small_blobs
is not None:
50 self
.small_blobs
.close()
51 self
.small_blobs
= None
52 if self
.tempdir
is not None:
53 shutil
.rmtree(self
.tempdir
)
56 class _Cleanup(object):
57 """This class makes sure we clean up when CacheManager goes away.
59 We use a helper class to ensure that we are never in a refcycle.
62 def __init__(self
, disk_blobs
):
63 self
.disk_blobs
= disk_blobs
65 self
.small_blobs
= None
71 if self
.disk_blobs
is not None:
72 for info
in self
.disk_blobs
.itervalues():
73 if info
[-1] is not None:
75 self
.disk_blobs
= None
76 if self
.small_blobs
is not None:
77 self
.small_blobs
.close()
78 self
.small_blobs
= None
79 if self
.tempdir
is not None:
80 shutils
.rmtree(self
.tempdir
)
class CacheManager(object):
    """Caches used while importing a fast-import stream.

    Holds blob data (in memory, spilling to disk when the cache grows
    too large), an LRU cache of inventories, commit-id -> revision-id
    mappings and branch head tracking.
    """

    # Flushed blobs smaller than this are appended to one shared
    # temporary file instead of getting a file each.
    _small_blob_threshold = 25*1024

    # Once the in-memory sticky blob cache exceeds this many bytes,
    # blobs are flushed to disk ...
    _sticky_cache_size = 300*1024*1024

    # ... until memory usage drops back below this many bytes.
    _sticky_flushed_size = 100*1024*1024

    def __init__(self, info=None, verbose=False, inventory_cache_size=10):
        """Create a manager of caches.

        :param info: a ConfigObj holding the output from
            the --info processor, or None if no hints are available
        """
        self.verbose = verbose

        # dataref -> data. datref is either :mark or the sha-1.
        # Sticky blobs are referenced more than once, and are saved until
        # their refcount goes to 0
        self._blobs = {}
        self._sticky_blobs = {}
        self._sticky_memory_bytes = 0

        # if we overflow our memory cache, then we will dump large blobs to
        # disk in this directory
        self._tempdir = None
        # id => (offset, n_bytes, fname)
        # if fname is None, then the content is stored in the small file
        self._disk_blobs = {}
        self._cleanup = _Cleanup(self._disk_blobs)

        # revision-id -> Inventory cache
        # these are large and we probably don't need too many as
        # most parents are recent in history
        self.inventories = lru_cache.LRUCache(inventory_cache_size)

        # import commmit-ids -> revision-id lookup table
        # we need to keep all of these but they are small
        self.revision_ids = {}

        # (path, branch_ref) -> file-ids - as generated.
        # (Use store_file_id/fetch_fileid methods rather than direct access.)
        self.file_ids = {}

        # Head tracking: last ref, last id per ref & map of commit ids to
        # ref*s*
        self.last_ref = None
        self.last_ids = {}
        self.heads = {}

        # Work out the blobs to make sticky - None means all
        self._blob_ref_counts = {}
        if info is not None:
            try:
                blobs_by_counts = info['Blob reference counts']
                # The parser hands values back as lists, already parsed
                for count, blob_list in blobs_by_counts.items():
                    n = int(count)
                    for b in blob_list:
                        self._blob_ref_counts[b] = n
            except KeyError:
                # info not in file - possible when no blobs used
                pass

        # BranchMapper has no state (for now?), but we keep it around rather
        # than reinstantiate on every usage
        self.branch_mapper = branch_mapper.BranchMapper()

    def dump_stats(self, note=trace.note):
        """Dump some statistics about what we cached."""
        # TODO: add in inventory stastistics
        note("Cache statistics:")
        self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
        self._show_stats_for(self.revision_ids, "revision-ids", note=note)
        # These aren't interesting so omit from the output, at least for now
        #self._show_stats_for(self._blobs, "other blobs", note=note)
        #self._show_stats_for(self.last_ids, "last-ids", note=note)
        #self._show_stats_for(self.heads, "heads", note=note)

    def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
        """Dump statistics about a given dictionary.

        By the key and value need to support len().
        """
        count = len(dict)
        if tuple_key:
            # Keys are tuples of strings; join before measuring.
            size = sum(map(len, (''.join(k) for k in dict.keys())))
        else:
            size = sum(map(len, dict.keys()))
        size += sum(map(len, dict.values()))
        # Scale the byte total into K/M/G for display.
        size = size * 1.0 / 1024
        unit = 'K'
        if size > 1024:
            size = size / 1024
            unit = 'M'
            if size > 1024:
                size = size / 1024
                unit = 'G'
        note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
            helpers.single_plural(count, "item", "items")))

    def clear_all(self):
        """Free up any memory used by the caches."""
        self._blobs.clear()
        self._sticky_blobs.clear()
        self.revision_ids.clear()
        self.last_ids.clear()
        self.heads.clear()
        self.inventories.clear()

    def _flush_blobs_to_disk(self):
        """Spill sticky blobs to disk until memory use is acceptable.

        The largest blobs are flushed first; small ones are appended to a
        single shared temporary file, large ones each get their own file.
        """
        blobs = self._sticky_blobs.keys()
        sticky_blobs = self._sticky_blobs
        total_blobs = len(sticky_blobs)
        # Sort ascending by size so pop() hands back the largest first.
        blobs.sort(key=lambda k:len(sticky_blobs[k]))
        if self._tempdir is None:
            tempdir = tempfile.mkdtemp(prefix='bzr_fastimport_blobs-')
            self._tempdir = tempdir
            self._cleanup.tempdir = self._tempdir
            self._cleanup.small_blobs = tempfile.TemporaryFile(
                prefix='small-blobs-', dir=self._tempdir)
            small_blob_ref = weakref.ref(self._cleanup.small_blobs)
            # Even though we add it to _Cleanup it seems that the object can
            # be destroyed 'too late' for cleanup to actually occur. Probably
            # a combination of bzr's "die directly, don't clean up" and how
            # exceptions close the running stack.
            def exit_cleanup():
                small_blob = small_blob_ref()
                if small_blob is not None:
                    small_blob.close()
                shutil.rmtree(tempdir, ignore_errors=True)
            atexit.register(exit_cleanup)
        count = 0
        bytes = 0
        n_small_bytes = 0
        while self._sticky_memory_bytes > self._sticky_flushed_size:
            id = blobs.pop()
            blob = self._sticky_blobs.pop(id)
            n_bytes = len(blob)
            self._sticky_memory_bytes -= n_bytes
            if n_bytes < self._small_blob_threshold:
                # Small blob: append to the shared small-blobs file and
                # remember its offset.
                f = self._cleanup.small_blobs
                f.seek(0, os.SEEK_END)
                self._disk_blobs[id] = (f.tell(), n_bytes, None)
                f.write(blob)
                n_small_bytes += n_bytes
            else:
                # Large blob: give it a file of its own.
                fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
                os.write(fd, blob)
                os.close(fd)
                self._disk_blobs[id] = (0, n_bytes, name)
            bytes += n_bytes
            del blob
            count += 1
        trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
                   % (count, total_blobs, bytes / 1024. / 1024,
                      n_small_bytes / 1024. / 1024))

    def store_blob(self, id, data):
        """Store a blob of data."""
        # Note: If we're not reference counting, everything has to be sticky
        if not self._blob_ref_counts or id in self._blob_ref_counts:
            self._sticky_blobs[id] = data
            self._sticky_memory_bytes += len(data)
            if self._sticky_memory_bytes > self._sticky_cache_size:
                self._flush_blobs_to_disk()
        elif data == '':
            # Empty data is always sticky
            self._sticky_blobs[id] = data
        else:
            self._blobs[id] = data

    def _decref(self, id, cache, fn):
        """Decrement the reference count for a blob.

        When the count reaches zero, remove the blob from *cache* and
        unlink *fn* if given.

        :return: True if the blob was removed entirely, False otherwise.
        """
        if not self._blob_ref_counts:
            # Not reference counting: blobs are never discarded.
            return False
        count = self._blob_ref_counts.get(id, None)
        if count is not None:
            count -= 1
            if count <= 0:
                del cache[id]
                if fn is not None:
                    os.unlink(fn)
                del self._blob_ref_counts[id]
                return True
            else:
                self._blob_ref_counts[id] = count
        return False

    def fetch_blob(self, id):
        """Fetch a blob of data."""
        if id in self._blobs:
            # Non-sticky blobs are referenced exactly once.
            return self._blobs.pop(id)
        if id in self._disk_blobs:
            (offset, n_bytes, fn) = self._disk_blobs[id]
            if fn is None:
                # Stored inside the shared small-blobs file.
                f = self._cleanup.small_blobs
                f.seek(offset)
                content = f.read(n_bytes)
            else:
                fp = open(fn, 'rb')
                try:
                    content = fp.read()
                finally:
                    fp.close()
            self._decref(id, self._disk_blobs, fn)
            return content
        content = self._sticky_blobs[id]
        if self._decref(id, self._sticky_blobs, None):
            self._sticky_memory_bytes -= len(content)
        return content

    def track_heads(self, cmd):
        """Track the repository heads given a CommitCommand.

        :param cmd: the CommitCommand
        :return: the list of parents in terms of commit-ids
        """
        # Get the true set of parents
        if cmd.from_ is not None:
            parents = [cmd.from_]
        else:
            last_id = self.last_ids.get(cmd.ref)
            if last_id is not None:
                parents = [last_id]
            else:
                parents = []
        parents.extend(cmd.merges)

        # Track the heads
        self.track_heads_for_ref(cmd.ref, cmd.id, parents)
        return parents

    def track_heads_for_ref(self, cmd_ref, cmd_id, parents=None):
        """Record *cmd_id* as the new head of *cmd_ref*.

        Any of *parents* previously recorded as heads stop being heads.
        """
        if parents is not None:
            for parent in parents:
                if parent in self.heads:
                    del self.heads[parent]
        self.heads.setdefault(cmd_id, set()).add(cmd_ref)
        self.last_ids[cmd_ref] = cmd_id
        self.last_ref = cmd_ref