cache_manager.py

   1 # Copyright (C) 2009 Canonical Ltd
   2 #
   3 # This program is free software; you can redistribute it and/or modify
   4 # it under the terms of the GNU General Public License as published by
   5 # the Free Software Foundation; either version 2 of the License, or
   6 # (at your option) any later version.
   7 #
   8 # This program is distributed in the hope that it will be useful,
   9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 # GNU General Public License for more details.
  12 #
  13 # You should have received a copy of the GNU General Public License
  14 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  15
  16 """A manager of caches."""
  17
  18 import atexit
  19 import os
  20 import shutil
  21 import tempfile
  22 import weakref
  23
  24 from bzrlib import lru_cache, trace
  25 from bzrlib.plugins.fastimport import (
  26     branch_mapper,
  27     )
  28 from bzrlib.plugins.fastimport.reftracker import (
  29     RefTracker,
  30     )
  31 from fastimport.helpers import (
  32     single_plural,
  33     )
  34
  35
  36 class _Cleanup(object):
  37     """This class makes sure we clean up when CacheManager goes away.
  38
  39     We use a helper class to ensure that we are never in a refcycle.
  40     """
  41
  42     def __init__(self, disk_blobs):
  43         self.disk_blobs = disk_blobs
  44         self.tempdir = None
  45         self.small_blobs = None
  46
  47     def __del__(self):
  48         self.finalize()
  49
  50     def finalize(self):
  51         if self.disk_blobs is not None:
  52             for info in self.disk_blobs.itervalues():
  53                 if info[-1] is not None:
  54                     os.unlink(info[-1])
  55             self.disk_blobs = None
  56         if self.small_blobs is not None:
  57             self.small_blobs.close()
  58             self.small_blobs = None
  59         if self.tempdir is not None:
  60             shutil.rmtree(self.tempdir)
  61
  62
  63 class CacheManager(object):
  64
  65     _small_blob_threshold = 25*1024
  66     _sticky_cache_size = 300*1024*1024
  67     _sticky_flushed_size = 100*1024*1024
  68
  69     def __init__(self, info=None, verbose=False, inventory_cache_size=10):
  70         """Create a manager of caches.
  71
  72         :param info: a ConfigObj holding the output from
  73             the --info processor, or None if no hints are available
  74         """
  75         self.verbose = verbose
  76
  77         # dataref -> data. datref is either :mark or the sha-1.
  78         # Sticky blobs are referenced more than once, and are saved until their
  79         # refcount goes to 0
  80         self._blobs = {}
  81         self._sticky_blobs = {}
  82         self._sticky_memory_bytes = 0
  83         # if we overflow our memory cache, then we will dump large blobs to
  84         # disk in this directory
  85         self._tempdir = None
  86         # id => (offset, n_bytes, fname)
  87         #   if fname is None, then the content is stored in the small file
  88         self._disk_blobs = {}
  89         self._cleanup = _Cleanup(self._disk_blobs)
  90
  91         # revision-id -> Inventory cache
  92         # these are large and we probably don't need too many as
  93         # most parents are recent in history
  94         self.inventories = lru_cache.LRUCache(inventory_cache_size)
  95
  96         # import commmit-ids -> revision-id lookup table
  97         # we need to keep all of these but they are small
  98         self.marks = {}
  99
 100         # (path, branch_ref) -> file-ids - as generated.
 101         # (Use store_file_id/fetch_fileid methods rather than direct access.)
 102
 103         # Work out the blobs to make sticky - None means all
 104         self._blob_ref_counts = {}
 105         if info is not None:
 106             try:
 107                 blobs_by_counts = info['Blob reference counts']
 108                 # The parser hands values back as lists, already parsed
 109                 for count, blob_list in blobs_by_counts.items():
 110                     n = int(count)
 111                     for b in blob_list:
 112                         self._blob_ref_counts[b] = n
 113             except KeyError:
 114                 # info not in file - possible when no blobs used
 115                 pass
 116
 117         # BranchMapper has no state (for now?), but we keep it around rather
 118         # than reinstantiate on every usage
 119         self.branch_mapper = branch_mapper.BranchMapper()
 120
 121         self.reftracker = RefTracker()
 122
 123     def add_mark(self, mark, commit_id):
 124         assert mark[0] != ':'
 125         self.marks[mark] = commit_id
 126
 127     def lookup_committish(self, committish):
 128         """Resolve a 'committish' to a revision id.
 129
 130         :param committish: A "committish" string
 131         :return: Bazaar revision id
 132         """
 133         assert committish[0] == ':'
 134         return self.marks[committish.lstrip(':')]
 135
 136     def dump_stats(self, note=trace.note):
 137         """Dump some statistics about what we cached."""
 138         # TODO: add in inventory stastistics
 139         note("Cache statistics:")
 140         self._show_stats_for(self._sticky_blobs, "sticky blobs", note=note)
 141         self._show_stats_for(self.marks, "revision-ids", note=note)
 142         # These aren't interesting so omit from the output, at least for now
 143         #self._show_stats_for(self._blobs, "other blobs", note=note)
 144         #self.reftracker.dump_stats(note=note)
 145
 146     def _show_stats_for(self, dict, label, note=trace.note, tuple_key=False):
 147         """Dump statistics about a given dictionary.
 148
 149         By the key and value need to support len().
 150         """
 151         count = len(dict)
 152         if tuple_key:
 153             size = sum(map(len, (''.join(k) for k in dict.keys())))
 154         else:
 155             size = sum(map(len, dict.keys()))
 156         size += sum(map(len, dict.values()))
 157         size = size * 1.0 / 1024
 158         unit = 'K'
 159         if size > 1024:
 160             size = size / 1024
 161             unit = 'M'
 162             if size > 1024:
 163                 size = size / 1024
 164                 unit = 'G'
 165         note("    %-12s: %8.1f %s (%d %s)" % (label, size, unit, count,
 166             single_plural(count, "item", "items")))
 167
 168     def clear_all(self):
 169         """Free up any memory used by the caches."""
 170         self._blobs.clear()
 171         self._sticky_blobs.clear()
 172         self.marks.clear()
 173         self.reftracker.clear()
 174         self.inventories.clear()
 175
 176     def _flush_blobs_to_disk(self):
 177         blobs = self._sticky_blobs.keys()
 178         sticky_blobs = self._sticky_blobs
 179         total_blobs = len(sticky_blobs)
 180         blobs.sort(key=lambda k:len(sticky_blobs[k]))
 181         if self._tempdir is None:
 182             tempdir = tempfile.mkdtemp(prefix='fastimport_blobs-')
 183             self._tempdir = tempdir
 184             self._cleanup.tempdir = self._tempdir
 185             self._cleanup.small_blobs = tempfile.TemporaryFile(
 186                 prefix='small-blobs-', dir=self._tempdir)
 187             small_blob_ref = weakref.ref(self._cleanup.small_blobs)
 188             # Even though we add it to _Cleanup it seems that the object can be
 189             # destroyed 'too late' for cleanup to actually occur. Probably a
 190             # combination of bzr's "die directly, don't clean up" and how
 191             # exceptions close the running stack.
 192             def exit_cleanup():
 193                 small_blob = small_blob_ref()
 194                 if small_blob is not None:
 195                     small_blob.close()
 196                 shutil.rmtree(tempdir, ignore_errors=True)
 197             atexit.register(exit_cleanup)
 198         count = 0
 199         bytes = 0
 200         n_small_bytes = 0
 201         while self._sticky_memory_bytes > self._sticky_flushed_size:
 202             id = blobs.pop()
 203             blob = self._sticky_blobs.pop(id)
 204             n_bytes = len(blob)
 205             self._sticky_memory_bytes -= n_bytes
 206             if n_bytes < self._small_blob_threshold:
 207                 f = self._cleanup.small_blobs
 208                 f.seek(0, os.SEEK_END)
 209                 self._disk_blobs[id] = (f.tell(), n_bytes, None)
 210                 f.write(blob)
 211                 n_small_bytes += n_bytes
 212             else:
 213                 fd, name = tempfile.mkstemp(prefix='blob-', dir=self._tempdir)
 214                 os.write(fd, blob)
 215                 os.close(fd)
 216                 self._disk_blobs[id] = (0, n_bytes, name)
 217             bytes += n_bytes
 218             del blob
 219             count += 1
 220         trace.note('flushed %d/%d blobs w/ %.1fMB (%.1fMB small) to disk'
 221                    % (count, total_blobs, bytes / 1024. / 1024,
 222                       n_small_bytes / 1024. / 1024))
 223
 224     def store_blob(self, id, data):
 225         """Store a blob of data."""
 226         # Note: If we're not reference counting, everything has to be sticky
 227         if not self._blob_ref_counts or id in self._blob_ref_counts:
 228             self._sticky_blobs[id] = data
 229             self._sticky_memory_bytes += len(data)
 230             if self._sticky_memory_bytes > self._sticky_cache_size:
 231                 self._flush_blobs_to_disk()
 232         elif data == '':
 233             # Empty data is always sticky
 234             self._sticky_blobs[id] = data
 235         else:
 236             self._blobs[id] = data
 237
 238     def _decref(self, id, cache, fn):
 239         if not self._blob_ref_counts:
 240             return False
 241         count = self._blob_ref_counts.get(id, None)
 242         if count is not None:
 243             count -= 1
 244             if count <= 0:
 245                 del cache[id]
 246                 if fn is not None:
 247                     os.unlink(fn)
 248                 del self._blob_ref_counts[id]
 249                 return True
 250             else:
 251                 self._blob_ref_counts[id] = count
 252         return False
 253
 254     def fetch_blob(self, id):
 255         """Fetch a blob of data."""
 256         if id in self._blobs:
 257             return self._blobs.pop(id)
 258         if id in self._disk_blobs:
 259             (offset, n_bytes, fn) = self._disk_blobs[id]
 260             if fn is None:
 261                 f = self._cleanup.small_blobs
 262                 f.seek(offset)
 263                 content = f.read(n_bytes)
 264             else:
 265                 fp = open(fn, 'rb')
 266                 try:
 267                     content = fp.read()
 268                 finally:
 269                     fp.close()
 270             self._decref(id, self._disk_blobs, fn)
 271             return content
 272         content = self._sticky_blobs[id]
 273         if self._decref(id, self._sticky_blobs, None):
 274             self._sticky_memory_bytes -= len(content)
 275         return content
 276
 277