1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2007 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """This module contains database facilities used by cvs2svn."""
import os
import sys
import anydbm
import cPickle

from cvs2svn_lib.common import DB_OPEN_READ
from cvs2svn_lib.common import DB_OPEN_WRITE
from cvs2svn_lib.common import DB_OPEN_NEW
from cvs2svn_lib.common import warning_prefix
from cvs2svn_lib.common import error_prefix
from cvs2svn_lib.log import Log
from cvs2svn_lib.record_table import FileOffsetPacker
from cvs2svn_lib.record_table import RecordTable
34 # DBM module selection
36 # 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
37 # so that the dbhash module used by anydbm will use bsddb3.
40 sys
.modules
['bsddb'] = sys
.modules
['bsddb3']
44 # 2. These DBM modules are not good for cvs2svn.
46 if anydbm
._defaultmod
.__name
__ in ['dumbdbm', 'dbm']:
48 '%s: cvs2svn uses the anydbm package, which depends on lower level '
50 'libraries. Your system has %s, with which cvs2svn is known to have\n'
51 'problems. To use cvs2svn, you must install a Python dbm library '
53 'dumbdbm or dbm. See '
54 'http://python.org/doc/current/lib/module-anydbm.html\n'
55 'for more information.\n'
56 % (error_prefix
, anydbm
._defaultmod
.__name
__,)
60 # 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
61 # Unfortunately, gdbm appears not to be trouble free, either.
62 if hasattr(anydbm
._defaultmod
, 'bsddb') \
63 and not hasattr(anydbm
._defaultmod
.bsddb
, '__version__'):
65 gdbm
= __import__('gdbm')
68 '%s: The version of the bsddb module found on your computer '
70 'reported to malfunction on some datasets, causing KeyError '
75 anydbm
._defaultmod
= gdbm
78 class AbstractDatabase
:
79 """An abstract base class for anydbm-based databases."""
81 def __init__(self
, filename
, mode
):
82 """A convenience function for opening an anydbm database."""
84 # pybsddb3 has a bug which prevents it from working with
85 # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
86 # causes the DB_TRUNCATE flag to be passed, which is disallowed
87 # for databases protected by lock and transaction support
88 # (bsddb databases use locking from bsddb version 4.2.4 onwards).
90 # Therefore, manually perform the removal (we can do this, because
91 # we know that for bsddb - but *not* anydbm in general - the database
92 # consists of one file with the name we specify, rather than several
93 # based on that name).
94 if mode
== 'n' and anydbm
._defaultmod
.__name
__ == 'dbhash':
95 if os
.path
.isfile(filename
):
99 self
.db
= anydbm
.open(filename
, mode
)
101 # Import implementations for many mapping interface methods. Note
102 # that we specifically do not do this for any method which handles
103 # *values*, because our derived classes define __getitem__ and
104 # __setitem__ to override the storage of values, and grabbing
105 # methods directly from the dbm object would bypass this.
106 for meth_name
in ('__delitem__',
107 '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
108 meth_ref
= getattr(self
.db
, meth_name
, None)
110 setattr(self
, meth_name
, meth_ref
)
112 def __delitem__(self
, key
):
113 # gdbm defines a __delitem__ method, but it cannot be assigned. So
114 # this method provides a fallback definition via explicit delegation:
118 return self
.db
.keys()
121 for key
in self
.keys():
124 def has_key(self
, key
):
131 def __contains__(self
, key
):
132 return self
.has_key(key
)
135 return self
.__iter
__()
138 for key
in self
.keys():
142 return [(key
, self
[key
],) for key
in self
.keys()]
145 return [self
[key
] for key
in self
.keys()]
147 def get(self
, key
, default
=None):
158 class Database(AbstractDatabase
):
159 """A database that uses a Serializer to store objects of a certain type.
161 Since the database entry with the key self.serializer_key is used to
162 store the serializer, self.serializer_key may not be used as a key for
165 serializer_key
= '_.%$1\t;_ '
167 def __init__(self
, filename
, mode
, serializer
=None):
170 The database stores its Serializer, so none needs to be supplied
171 when opening an existing database."""
173 AbstractDatabase
.__init
__(self
, filename
, mode
)
175 if mode
== DB_OPEN_NEW
:
176 self
.serializer
= serializer
177 self
.db
[self
.serializer_key
] = cPickle
.dumps(self
.serializer
)
179 self
.serializer
= cPickle
.loads(self
.db
[self
.serializer_key
])
181 def __getitem__(self
, key
):
182 return self
.serializer
.loads(self
.db
[key
])
184 def __setitem__(self
, key
, value
):
185 self
.db
[key
] = self
.serializer
.dumps(value
)
187 def keys(self
): # TODO: Once needed, handle iterators as well.
188 retval
= self
.db
.keys()
189 retval
.remove(self
.serializer_key
)
193 class IndexedDatabase
:
194 """A file of objects that are written sequentially and read randomly.
196 The objects are indexed by small non-negative integers, and a
197 RecordTable is used to store the index -> fileoffset map.
198 fileoffset=0 is used to represent an empty record. (An offset of 0
199 cannot occur for a legitimate record because the serializer is
202 The main file consists of a sequence of pickles (or other serialized
203 data format). The zeroth record is a pickled Serializer.
204 Subsequent ones are objects serialized using the serializer. The
205 offset of each object in the file is stored to an index table so
206 that the data can later be retrieved randomly.
208 Objects are always stored to the end of the file. If an object is
209 deleted or overwritten, the fact is recorded in the index_table but
210 the space in the pickle file is not garbage collected. This has the
211 advantage that one can create a modified version of a database that
212 shares the main data file with an old version by copying the index
213 file. But it has the disadvantage that space is wasted whenever
214 objects are written multiple times."""
216 def __init__(self
, filename
, index_filename
, mode
, serializer
=None):
217 """Initialize an IndexedDatabase, writing the serializer if necessary.
219 SERIALIZER is only used if MODE is DB_OPEN_NEW; otherwise the
220 serializer is read from the file."""
222 self
.filename
= filename
223 self
.index_filename
= index_filename
225 if self
.mode
== DB_OPEN_NEW
:
226 self
.f
= open(self
.filename
, 'wb+')
227 elif self
.mode
== DB_OPEN_WRITE
:
228 self
.f
= open(self
.filename
, 'rb+')
229 elif self
.mode
== DB_OPEN_READ
:
230 self
.f
= open(self
.filename
, 'rb')
232 raise RuntimeError('Invalid mode %r' % self
.mode
)
234 self
.index_table
= RecordTable(
235 self
.index_filename
, self
.mode
, FileOffsetPacker()
238 if self
.mode
== DB_OPEN_NEW
:
239 assert serializer
is not None
240 self
.serializer
= serializer
241 cPickle
.dump(self
.serializer
, self
.f
, -1)
243 # Read the memo from the first pickle:
244 self
.serializer
= cPickle
.load(self
.f
)
246 # Seek to the end of the file, and record that position:
248 self
.fp
= self
.f
.tell()
251 def __setitem__(self
, index
, item
):
252 """Write ITEM into the database indexed by INDEX."""
254 # Make sure we're at the end of the file:
255 if self
.fp
!= self
.eofp
:
256 self
.f
.seek(self
.eofp
)
257 self
.index_table
[index
] = self
.eofp
258 s
= self
.serializer
.dumps(item
)
263 def _fetch(self
, offset
):
264 if self
.fp
!= offset
:
267 # There is no easy way to tell how much data will be read, so just
268 # indicate that we don't know the current file pointer:
271 return self
.serializer
.loadf(self
.f
)
274 return self
.index_table
.iterkeys()
276 def itervalues(self
):
277 for offset
in self
.index_table
.itervalues():
278 yield self
._fetch
(offset
)
280 def __getitem__(self
, index
):
281 offset
= self
.index_table
[index
]
282 return self
._fetch
(offset
)
284 def get(self
, item
, default
=None):
290 def get_many(self
, indexes
, default
=None):
291 """Yield (index,item) tuples for INDEXES, in arbitrary order.
293 Yield (index,default) for indexes with no defined values."""
296 for (index
, offset
) in self
.index_table
.get_many(indexes
):
298 yield (index
, default
)
300 offsets
.append((offset
, index
))
302 # Sort the offsets to reduce disk seeking:
304 for (offset
,index
) in offsets
:
305 yield (index
, self
._fetch
(offset
))
307 def __delitem__(self
, index
):
308 # We don't actually free the data in self.f.
309 del self
.index_table
[index
]
312 self
.index_table
.close()
313 self
.index_table
= None
318 return 'IndexedDatabase(%r)' % (self
.filename
,)
321 class IndexedStore(IndexedDatabase
):
322 """A file of items that is written sequentially and read randomly.
324 This is just like IndexedDatabase, except that it has an additional
325 add() method which assumes that the object to be written to the
326 database has an 'id' member, which is used as its database index.
327 See IndexedDatabase for more information."""
330 """Write ITEM into the database indexed by ITEM.id."""