1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """This module contains database facilities used by cvs2svn."""
import sys
import os
import cPickle

from cvs2svn_lib.common import DB_OPEN_READ
from cvs2svn_lib.common import DB_OPEN_WRITE
from cvs2svn_lib.common import DB_OPEN_NEW
from cvs2svn_lib.common import warning_prefix
from cvs2svn_lib.common import error_prefix
from cvs2svn_lib.log import Log
from cvs2svn_lib.record_table import FileOffsetPacker
from cvs2svn_lib.record_table import RecordTable
34 # DBM module selection
36 # 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
37 # so that the dbhash module used by anydbm will use bsddb3.
40 sys
.modules
['bsddb'] = sys
.modules
['bsddb3']
44 # 2. These DBM modules are not good for cvs2svn.
46 if anydbm
._defaultmod
.__name
__ in ['dumbdbm', 'dbm']:
48 '%s: cvs2svn uses the anydbm package, which depends on lower level '
50 'libraries. Your system has %s, with which cvs2svn is known to have\n'
51 'problems. To use cvs2svn, you must install a Python dbm library '
53 'dumbdbm or dbm. See '
54 'http://python.org/doc/current/lib/module-anydbm.html\n'
55 'for more information.\n'
56 % (error_prefix
, anydbm
._defaultmod
.__name
__,)
60 # 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
61 # Unfortunately, gdbm appears not to be trouble free, either.
62 if hasattr(anydbm
._defaultmod
, 'bsddb') \
63 and not hasattr(anydbm
._defaultmod
.bsddb
, '__version__'):
65 gdbm
= __import__('gdbm')
68 '%s: The version of the bsddb module found on your computer '
70 'reported to malfunction on some datasets, causing KeyError '
75 anydbm
._defaultmod
= gdbm
79 """A database that uses a Serializer to store objects of a certain type.
81 The serializer is stored in the database under the key
82 self.serializer_key. (This implies that self.serializer_key may not
83 be used as a key for normal entries.)
85 The backing database is an anydbm-based DBM.
89 serializer_key
= '_.%$1\t;_ '
91 def __init__(self
, filename
, mode
, serializer
=None):
94 The database stores its Serializer, so none needs to be supplied
95 when opening an existing database."""
97 # pybsddb3 has a bug which prevents it from working with
98 # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
99 # causes the DB_TRUNCATE flag to be passed, which is disallowed
100 # for databases protected by lock and transaction support
101 # (bsddb databases use locking from bsddb version 4.2.4 onwards).
103 # Therefore, manually perform the removal (we can do this, because
104 # we know that for bsddb - but *not* anydbm in general - the database
105 # consists of one file with the name we specify, rather than several
106 # based on that name).
107 if mode
== DB_OPEN_NEW
and anydbm
._defaultmod
.__name
__ == 'dbhash':
108 if os
.path
.isfile(filename
):
110 self
.db
= anydbm
.open(filename
, 'c')
112 self
.db
= anydbm
.open(filename
, mode
)
114 # Import implementations for many mapping interface methods.
115 for meth_name
in ('__delitem__',
116 '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
117 meth_ref
= getattr(self
.db
, meth_name
, None)
119 setattr(self
, meth_name
, meth_ref
)
121 if mode
== DB_OPEN_NEW
:
122 self
.serializer
= serializer
123 self
.db
[self
.serializer_key
] = cPickle
.dumps(self
.serializer
)
125 self
.serializer
= cPickle
.loads(self
.db
[self
.serializer_key
])
127 def __getitem__(self
, key
):
128 return self
.serializer
.loads(self
.db
[key
])
130 def __setitem__(self
, key
, value
):
131 self
.db
[key
] = self
.serializer
.dumps(value
)
133 def __delitem__(self
, key
):
134 # gdbm defines a __delitem__ method, but it cannot be assigned. So
135 # this method provides a fallback definition via explicit delegation:
139 retval
= self
.db
.keys()
140 retval
.remove(self
.serializer_key
)
144 for key
in self
.keys():
147 def has_key(self
, key
):
154 def __contains__(self
, key
):
155 return self
.has_key(key
)
158 return self
.__iter
__()
161 for key
in self
.keys():
165 return [(key
, self
[key
],) for key
in self
.keys()]
168 return [self
[key
] for key
in self
.keys()]
170 def get(self
, key
, default
=None):
181 class IndexedDatabase
:
182 """A file of objects that are written sequentially and read randomly.
184 The objects are indexed by small non-negative integers, and a
185 RecordTable is used to store the index -> fileoffset map.
186 fileoffset=0 is used to represent an empty record. (An offset of 0
187 cannot occur for a legitimate record because the serializer is
190 The main file consists of a sequence of pickles (or other serialized
191 data format). The zeroth record is a pickled Serializer.
192 Subsequent ones are objects serialized using the serializer. The
193 offset of each object in the file is stored to an index table so
194 that the data can later be retrieved randomly.
196 Objects are always stored to the end of the file. If an object is
197 deleted or overwritten, the fact is recorded in the index_table but
198 the space in the pickle file is not garbage collected. This has the
199 advantage that one can create a modified version of a database that
200 shares the main data file with an old version by copying the index
201 file. But it has the disadvantage that space is wasted whenever
202 objects are written multiple times."""
204 def __init__(self
, filename
, index_filename
, mode
, serializer
=None):
205 """Initialize an IndexedDatabase, writing the serializer if necessary.
207 SERIALIZER is only used if MODE is DB_OPEN_NEW; otherwise the
208 serializer is read from the file."""
210 self
.filename
= filename
211 self
.index_filename
= index_filename
213 if self
.mode
== DB_OPEN_NEW
:
214 self
.f
= open(self
.filename
, 'wb+')
215 elif self
.mode
== DB_OPEN_WRITE
:
216 self
.f
= open(self
.filename
, 'rb+')
217 elif self
.mode
== DB_OPEN_READ
:
218 self
.f
= open(self
.filename
, 'rb')
220 raise RuntimeError('Invalid mode %r' % self
.mode
)
222 self
.index_table
= RecordTable(
223 self
.index_filename
, self
.mode
, FileOffsetPacker()
226 if self
.mode
== DB_OPEN_NEW
:
227 assert serializer
is not None
228 self
.serializer
= serializer
229 cPickle
.dump(self
.serializer
, self
.f
, -1)
231 # Read the memo from the first pickle:
232 self
.serializer
= cPickle
.load(self
.f
)
234 # Seek to the end of the file, and record that position:
236 self
.fp
= self
.f
.tell()
239 def __setitem__(self
, index
, item
):
240 """Write ITEM into the database indexed by INDEX."""
242 # Make sure we're at the end of the file:
243 if self
.fp
!= self
.eofp
:
244 self
.f
.seek(self
.eofp
)
245 self
.index_table
[index
] = self
.eofp
246 s
= self
.serializer
.dumps(item
)
251 def _fetch(self
, offset
):
252 if self
.fp
!= offset
:
255 # There is no easy way to tell how much data will be read, so just
256 # indicate that we don't know the current file pointer:
259 return self
.serializer
.loadf(self
.f
)
262 return self
.index_table
.iterkeys()
264 def itervalues(self
):
265 for offset
in self
.index_table
.itervalues():
266 yield self
._fetch
(offset
)
268 def __getitem__(self
, index
):
269 offset
= self
.index_table
[index
]
270 return self
._fetch
(offset
)
272 def get(self
, item
, default
=None):
278 def get_many(self
, indexes
, default
=None):
279 """Yield (index,item) tuples for INDEXES, in arbitrary order.
281 Yield (index,default) for indexes with no defined values."""
284 for (index
, offset
) in self
.index_table
.get_many(indexes
):
286 yield (index
, default
)
288 offsets
.append((offset
, index
))
290 # Sort the offsets to reduce disk seeking:
292 for (offset
,index
) in offsets
:
293 yield (index
, self
._fetch
(offset
))
295 def __delitem__(self
, index
):
296 # We don't actually free the data in self.f.
297 del self
.index_table
[index
]
300 self
.index_table
.close()
301 self
.index_table
= None
306 return 'IndexedDatabase(%r)' % (self
.filename
,)
309 class IndexedStore(IndexedDatabase
):
310 """A file of items that is written sequentially and read randomly.
312 This is just like IndexedDatabase, except that it has an additional
313 add() method which assumes that the object to be written to the
314 database has an 'id' member, which is used as its database index.
315 See IndexedDatabase for more information."""
318 """Write ITEM into the database indexed by ITEM.id."""