run-tests.py: Only pass the --svnadmin option to cvs2svn when needed.
[cvs2svn.git] / cvs2svn_lib / indexed_database.py
blob4a8cbdd44f23e1f6e3f33cd10f34bd477792262c
1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """This module contains database facilities used by cvs2svn."""
20 import cPickle
22 from cvs2svn_lib.common import DB_OPEN_READ
23 from cvs2svn_lib.common import DB_OPEN_WRITE
24 from cvs2svn_lib.common import DB_OPEN_NEW
25 from cvs2svn_lib.record_table import FileOffsetPacker
26 from cvs2svn_lib.record_table import RecordTable
class IndexedDatabase:
  """A file of objects that are written sequentially and read randomly.

  The objects are indexed by small non-negative integers, and a
  RecordTable is used to store the index -> fileoffset map.
  fileoffset=0 is used to represent an empty record.  (An offset of 0
  cannot occur for a legitimate record because the serializer is
  written there.)

  The main file consists of a sequence of pickles (or other serialized
  data format).  The zeroth record is a pickled Serializer.
  Subsequent ones are objects serialized using the serializer.  The
  offset of each object in the file is stored to an index table so
  that the data can later be retrieved randomly.

  Objects are always stored to the end of the file.  If an object is
  deleted or overwritten, the fact is recorded in the index_table but
  the space in the pickle file is not garbage collected.  This has the
  advantage that one can create a modified version of a database that
  shares the main data file with an old version by copying the index
  file.  But it has the disadvantage that space is wasted whenever
  objects are written multiple times."""

  def __init__(self, filename, index_filename, mode, serializer=None):
    """Initialize an IndexedDatabase, writing the serializer if necessary.

    SERIALIZER is only used if MODE is DB_OPEN_NEW; otherwise the
    serializer is read from the file.

    Raises RuntimeError if MODE is not one of DB_OPEN_NEW,
    DB_OPEN_WRITE, or DB_OPEN_READ."""

    self.filename = filename
    self.index_filename = index_filename
    self.mode = mode
    if self.mode == DB_OPEN_NEW:
      self.f = open(self.filename, 'wb+')
    elif self.mode == DB_OPEN_WRITE:
      self.f = open(self.filename, 'rb+')
    elif self.mode == DB_OPEN_READ:
      self.f = open(self.filename, 'rb')
    else:
      raise RuntimeError('Invalid mode %r' % self.mode)

    # BUG FIX: the closing parenthesis of this constructor call was
    # missing, which was a syntax error.
    self.index_table = RecordTable(
        self.index_filename, self.mode, FileOffsetPacker()
        )

    if self.mode == DB_OPEN_NEW:
      assert serializer is not None
      self.serializer = serializer
      # Protocol -1 means "the highest protocol available":
      cPickle.dump(self.serializer, self.f, -1)
    else:
      # Read the memo from the first pickle:
      self.serializer = cPickle.load(self.f)

    # Seek to the end of the file, and record that position.  self.fp
    # tracks the current file position (or None when unknown);
    # self.eofp is always the end of file, where new records go:
    self.f.seek(0, 2)
    self.fp = self.f.tell()
    self.eofp = self.fp

  def __setitem__(self, index, item):
    """Write ITEM into the database indexed by INDEX."""

    # Make sure we're at the end of the file:
    if self.fp != self.eofp:
      self.f.seek(self.eofp)
    self.index_table[index] = self.eofp
    s = self.serializer.dumps(item)
    self.f.write(s)
    self.eofp += len(s)
    self.fp = self.eofp

  def _fetch(self, offset):
    """Seek to OFFSET (if necessary) and deserialize the record there."""

    if self.fp != offset:
      self.f.seek(offset)

    # There is no easy way to tell how much data will be read, so just
    # indicate that we don't know the current file pointer:
    self.fp = None

    return self.serializer.loadf(self.f)

  def iterkeys(self):
    """Iterate over the indexes that have defined values."""

    return self.index_table.iterkeys()

  def itervalues(self):
    """Iterate over the stored items, in index-table order."""

    for offset in self.index_table.itervalues():
      yield self._fetch(offset)

  def __getitem__(self, index):
    """Return the item stored at INDEX; raise KeyError if undefined."""

    offset = self.index_table[index]
    return self._fetch(offset)

  def get(self, item, default=None):
    """Return the item stored at index ITEM, or DEFAULT if undefined."""

    try:
      return self[item]
    except KeyError:
      return default

  def get_many(self, indexes, default=None):
    """Yield (index,item) tuples for INDEXES, in arbitrary order.

    Yield (index,default) for indexes with no defined values."""

    offsets = []
    for (index, offset) in self.index_table.get_many(indexes):
      if offset is None:
        yield (index, default)
      else:
        offsets.append((offset, index))

    # Sort the offsets to reduce disk seeking:
    offsets.sort()
    for (offset,index) in offsets:
      yield (index, self._fetch(offset))

  def __delitem__(self, index):
    """Remove INDEX from the index table.

    We don't actually free the data in self.f."""

    del self.index_table[index]

  def close(self):
    """Close the index table and the data file, discarding references."""

    self.index_table.close()
    self.index_table = None
    self.f.close()
    self.f = None

  def __str__(self):
    return 'IndexedDatabase(%r)' % (self.filename,)
class IndexedStore(IndexedDatabase):
  """A file of items that is written sequentially and read randomly.

  This class behaves exactly like IndexedDatabase, adding only an
  add() method.  add() expects each stored object to carry an 'id'
  member, which it uses as the object's database index.  See
  IndexedDatabase for the full description of the storage format."""

  def add(self, item):
    """Store ITEM in the database, using ITEM.id as its index."""

    index = item.id
    self[index] = item