cvs2svn_lib/record_table.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2000-2008 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """Classes to manage Databases of fixed-length records.
  18
  19 The databases map small, non-negative integers to fixed-size records.
  20 The records are written in index order to a disk file.  Gaps in the
  21 index sequence leave gaps in the data file, so for best space
  22 efficiency the indexes of existing records should be approximately
  23 continuous.
  24
  25 To use a RecordTable, you need a class derived from Packer which can
  26 serialize/deserialize your records into fixed-size strings.  Deriving
  27 classes have to specify how to pack records into strings and unpack
  28 strings into records by overwriting the pack() and unpack() methods
  29 respectively.
  30
  31 Note that these classes keep track of gaps in the records that have
  32 been written by filling them with packer.empty_value.  If a record is
  33 read which contains packer.empty_value, then a KeyError is raised."""
  34
  35
  36 import os
  37 import types
  38 import struct
  39 import mmap
  40
  41 from cvs2svn_lib.common import DB_OPEN_READ
  42 from cvs2svn_lib.common import DB_OPEN_WRITE
  43 from cvs2svn_lib.common import DB_OPEN_NEW
  44 from cvs2svn_lib.log import logger
  45
  46
  47 # A unique value that can be used to stand for "unset" without
  48 # preventing the use of None.
  49 _unset = object()
  50
  51
  52 class Packer(object):
  53   def __init__(self, record_len, empty_value=None):
  54     self.record_len = record_len
  55     if empty_value is None:
  56       self.empty_value = '\0' * self.record_len
  57     else:
  58       assert type(empty_value) is types.StringType
  59       assert len(empty_value) == self.record_len
  60       self.empty_value = empty_value
  61
  62   def pack(self, v):
  63     """Pack record V into a string of length self.record_len."""
  64
  65     raise NotImplementedError()
  66
  67   def unpack(self, s):
  68     """Unpack string S into a record."""
  69
  70     raise NotImplementedError()
  71
  72
  73 class StructPacker(Packer):
  74   def __init__(self, format, empty_value=_unset):
  75     self.format = format
  76     if empty_value is not _unset:
  77       empty_value = self.pack(empty_value)
  78     else:
  79       empty_value = None
  80
  81     Packer.__init__(self, struct.calcsize(self.format),
  82                     empty_value=empty_value)
  83
  84   def pack(self, v):
  85     return struct.pack(self.format, v)
  86
  87   def unpack(self, v):
  88     return struct.unpack(self.format, v)[0]
  89
  90
  91 class UnsignedIntegerPacker(StructPacker):
  92   def __init__(self, empty_value=0):
  93     StructPacker.__init__(self, '=I', empty_value)
  94
  95
  96 class SignedIntegerPacker(StructPacker):
  97   def __init__(self, empty_value=0):
  98     StructPacker.__init__(self, '=i', empty_value)
  99
 100
 101 class FileOffsetPacker(Packer):
 102   """A packer suitable for file offsets.
 103
 104   We store the 5 least significant bytes of the file offset.  This is
 105   enough bits to represent 1 TiB.  Of course if the computer
 106   doesn't have large file support, only the lowest 31 bits can be
 107   nonzero, and the offsets are limited to 2 GiB."""
 108
 109   # Convert file offsets to 8-bit little-endian unsigned longs...
 110   INDEX_FORMAT = '<Q'
 111   # ...but then truncate to 5 bytes.
 112   INDEX_FORMAT_LEN = 5
 113
 114   PAD = '\0' * (struct.calcsize(INDEX_FORMAT) - INDEX_FORMAT_LEN)
 115
 116   def __init__(self):
 117     Packer.__init__(self, self.INDEX_FORMAT_LEN)
 118
 119   def pack(self, v):
 120     return struct.pack(self.INDEX_FORMAT, v)[:self.INDEX_FORMAT_LEN]
 121
 122   def unpack(self, s):
 123     return struct.unpack(self.INDEX_FORMAT, s + self.PAD)[0]
 124
 125
 126 class RecordTableAccessError(RuntimeError):
 127   pass
 128
 129
 130 class AbstractRecordTable:
 131   def __init__(self, filename, mode, packer):
 132     self.filename = filename
 133     self.mode = mode
 134     self.packer = packer
 135     # Simplify and speed access to this oft-needed quantity:
 136     self._record_len = self.packer.record_len
 137
 138   def __str__(self):
 139     return '%s(%r)' % (self.__class__.__name__, self.filename,)
 140
 141   def _set_packed_record(self, i, s):
 142     """Set the value for index I to the packed value S."""
 143
 144     raise NotImplementedError()
 145
 146   def __setitem__(self, i, v):
 147     self._set_packed_record(i, self.packer.pack(v))
 148
 149   def _get_packed_record(self, i):
 150     """Return the packed record for index I.
 151
 152     Raise KeyError if it is not present."""
 153
 154     raise NotImplementedError()
 155
 156   def __getitem__(self, i):
 157     """Return the item for index I.
 158
 159     Raise KeyError if that item has never been set (or if it was set
 160     to self.packer.empty_value)."""
 161
 162     s = self._get_packed_record(i)
 163
 164     if s == self.packer.empty_value:
 165       raise KeyError(i)
 166
 167     return self.packer.unpack(s)
 168
 169   def get_many(self, indexes, default=None):
 170     """Yield (index, item) typles for INDEXES in arbitrary order.
 171
 172     Yield (index,default) for indices for which not item is defined."""
 173
 174     indexes = list(indexes)
 175     # Sort the indexes to reduce disk seeking:
 176     indexes.sort()
 177     for i in indexes:
 178       yield (i, self.get(i, default))
 179
 180   def get(self, i, default=None):
 181     try:
 182       return self[i]
 183     except KeyError:
 184       return default
 185
 186   def __delitem__(self, i):
 187     """Delete the item for index I.
 188
 189     Raise KeyError if that item has never been set (or if it was set
 190     to self.packer.empty_value)."""
 191
 192     if self.mode == DB_OPEN_READ:
 193       raise RecordTableAccessError()
 194
 195     # Check that the value was set (otherwise raise KeyError):
 196     self[i]
 197     self._set_packed_record(i, self.packer.empty_value)
 198
 199   def iterkeys(self):
 200     """Yield the keys in the map in key order."""
 201
 202     for i in xrange(0, self._limit):
 203       try:
 204         self[i]
 205         yield i
 206       except KeyError:
 207         pass
 208
 209   def itervalues(self):
 210     """Yield the values in the map in key order.
 211
 212     Skip over values that haven't been defined."""
 213
 214     for i in xrange(0, self._limit):
 215       try:
 216         yield self[i]
 217       except KeyError:
 218         pass
 219
 220
 221 class RecordTable(AbstractRecordTable):
 222   # The approximate amount of memory that should be used for the cache
 223   # for each instance of this class:
 224   CACHE_MEMORY = 4 * 1024 * 1024
 225
 226   # Empirically, each entry in the cache table has an overhead of
 227   # about 96 bytes on a 32-bit computer.
 228   CACHE_OVERHEAD_PER_ENTRY = 96
 229
 230   def __init__(self, filename, mode, packer, cache_memory=CACHE_MEMORY):
 231     AbstractRecordTable.__init__(self, filename, mode, packer)
 232     if self.mode == DB_OPEN_NEW:
 233       self.f = open(self.filename, 'wb+')
 234     elif self.mode == DB_OPEN_WRITE:
 235       self.f = open(self.filename, 'rb+')
 236     elif self.mode == DB_OPEN_READ:
 237       self.f = open(self.filename, 'rb')
 238     else:
 239       raise RuntimeError('Invalid mode %r' % self.mode)
 240     self.cache_memory = cache_memory
 241
 242     # Number of items that can be stored in the write cache.
 243     self._max_memory_cache = (
 244         self.cache_memory
 245         / (self.CACHE_OVERHEAD_PER_ENTRY + self._record_len))
 246
 247     # Read and write cache; a map {i : (dirty, s)}, where i is an
 248     # index, dirty indicates whether the value has to be written to
 249     # disk, and s is the packed value for the index.  Up to
 250     # self._max_memory_cache items can be stored here.  When the cache
 251     # fills up, it is written to disk in one go and then cleared.
 252     self._cache = {}
 253
 254     # The index just beyond the last record ever written:
 255     self._limit = os.path.getsize(self.filename) // self._record_len
 256
 257     # The index just beyond the last record ever written to disk:
 258     self._limit_written = self._limit
 259
 260   def flush(self):
 261     logger.debug('Flushing cache for %s' % (self,))
 262
 263     pairs = [(i, s) for (i, (dirty, s)) in self._cache.items() if dirty]
 264
 265     if pairs:
 266       pairs.sort()
 267       old_i = None
 268       f = self.f
 269       for (i, s) in pairs:
 270         if i == old_i:
 271           # No seeking needed
 272           pass
 273         elif i <= self._limit_written:
 274           # Just jump there:
 275           f.seek(i * self._record_len)
 276         else:
 277           # Jump to the end of the file then write _empty_values until
 278           # we reach the correct location:
 279           f.seek(self._limit_written * self._record_len)
 280           while self._limit_written < i:
 281             f.write(self.packer.empty_value)
 282             self._limit_written += 1
 283         f.write(s)
 284         old_i = i + 1
 285         self._limit_written = max(self._limit_written, old_i)
 286
 287       self.f.flush()
 288
 289     self._cache.clear()
 290
 291   def _set_packed_record(self, i, s):
 292     if self.mode == DB_OPEN_READ:
 293       raise RecordTableAccessError()
 294     if i < 0:
 295       raise KeyError()
 296     self._cache[i] = (True, s)
 297     if len(self._cache) >= self._max_memory_cache:
 298       self.flush()
 299     self._limit = max(self._limit, i + 1)
 300
 301   def _get_packed_record(self, i):
 302     try:
 303       return self._cache[i][1]
 304     except KeyError:
 305       if not 0 <= i < self._limit_written:
 306         raise KeyError(i)
 307       self.f.seek(i * self._record_len)
 308       s = self.f.read(self._record_len)
 309       self._cache[i] = (False, s)
 310       if len(self._cache) >= self._max_memory_cache:
 311         self.flush()
 312
 313       return s
 314
 315   def close(self):
 316     self.flush()
 317     self._cache = None
 318     self.f.close()
 319     self.f = None
 320
 321
 322 class MmapRecordTable(AbstractRecordTable):
 323   GROWTH_INCREMENT = 65536
 324
 325   def __init__(self, filename, mode, packer):
 326     AbstractRecordTable.__init__(self, filename, mode, packer)
 327     if self.mode == DB_OPEN_NEW:
 328       self.python_file = open(self.filename, 'wb+')
 329       self.python_file.write('\0' * self.GROWTH_INCREMENT)
 330       self.python_file.flush()
 331       self._filesize = self.GROWTH_INCREMENT
 332       self.f = mmap.mmap(
 333           self.python_file.fileno(), self._filesize,
 334           access=mmap.ACCESS_WRITE
 335           )
 336
 337       # The index just beyond the last record ever written:
 338       self._limit = 0
 339     elif self.mode == DB_OPEN_WRITE:
 340       self.python_file = open(self.filename, 'rb+')
 341       self._filesize = os.path.getsize(self.filename)
 342       self.f = mmap.mmap(
 343           self.python_file.fileno(), self._filesize,
 344           access=mmap.ACCESS_WRITE
 345           )
 346
 347       # The index just beyond the last record ever written:
 348       self._limit = os.path.getsize(self.filename) // self._record_len
 349     elif self.mode == DB_OPEN_READ:
 350       self.python_file = open(self.filename, 'rb')
 351       self._filesize = os.path.getsize(self.filename)
 352       self.f = mmap.mmap(
 353           self.python_file.fileno(), self._filesize,
 354           access=mmap.ACCESS_READ
 355           )
 356
 357       # The index just beyond the last record ever written:
 358       self._limit = os.path.getsize(self.filename) // self._record_len
 359     else:
 360       raise RuntimeError('Invalid mode %r' % self.mode)
 361
 362   def flush(self):
 363     self.f.flush()
 364
 365   def _set_packed_record(self, i, s):
 366     if self.mode == DB_OPEN_READ:
 367       raise RecordTableAccessError()
 368     if i < 0:
 369       raise KeyError()
 370     if i >= self._limit:
 371       # This write extends the range of valid indices.  First check
 372       # whether the file has to be enlarged:
 373       new_size = (i + 1) * self._record_len
 374       if new_size > self._filesize:
 375         self._filesize = (
 376             (new_size + self.GROWTH_INCREMENT - 1)
 377             // self.GROWTH_INCREMENT
 378             * self.GROWTH_INCREMENT
 379             )
 380         self.f.resize(self._filesize)
 381       if i > self._limit:
 382         # Pad up to the new record with empty_value:
 383         self.f[self._limit * self._record_len:i * self._record_len] = \
 384             self.packer.empty_value * (i - self._limit)
 385       self._limit = i + 1
 386
 387     self.f[i * self._record_len:(i + 1) * self._record_len] = s
 388
 389   def _get_packed_record(self, i):
 390     if not 0 <= i < self._limit:
 391       raise KeyError(i)
 392     return self.f[i * self._record_len:(i + 1) * self._record_len]
 393
 394   def close(self):
 395     self.flush()
 396     self.f.close()
 397     self.python_file.close()
 398
 399