# (Be in -*- python -*- mode.)
#
# ====================================================================
# Copyright (c) 2000-2009 CollabNet. All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals. For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================
"""This module contains database facilities used by cvs2svn."""


import sys
import os
import cPickle

from cvs2svn_lib.common import DB_OPEN_READ
from cvs2svn_lib.common import DB_OPEN_WRITE
from cvs2svn_lib.common import DB_OPEN_NEW
from cvs2svn_lib.common import warning_prefix
from cvs2svn_lib.common import error_prefix
from cvs2svn_lib.log import logger
from cvs2svn_lib.record_table import FileOffsetPacker
from cvs2svn_lib.record_table import RecordTable


# DBM module selection

# 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.
import anydbm
if anydbm._defaultmod.__name__ in ['dumbdbm', 'dbm']:
  logger.error(
      '%s: cvs2svn uses the anydbm package, which depends on lower level '
      'dbm\n'
      'libraries. Your system has %s, with which cvs2svn is known to have\n'
      'problems. To use cvs2svn, you must install a Python dbm library '
      'other than\n'
      'dumbdbm or dbm. See '
      'http://python.org/doc/current/lib/module-anydbm.html\n'
      'for more information.\n'
      % (error_prefix, anydbm._defaultmod.__name__,)
      )
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then prefer gdbm instead.
#    Unfortunately, gdbm appears not to be trouble free, either.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    logger.warn(
        '%s: The version of the bsddb module found on your computer '
        'has been\n'
        'reported to malfunction on some datasets, causing KeyError '
        'exceptions.\n'
        % (warning_prefix,)
        )
  else:
    anydbm._defaultmod = gdbm
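
# A quick way to check which low-level module the selection logic above
# settled on (a debugging sketch, not part of the module's normal operation):
#
#   print anydbm._defaultmod.__name__   # e.g. 'dbhash' or 'gdbm'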


class Database:
  """A database that uses a Serializer to store objects of a certain type.

  The serializer is stored in the database under the key
  self.serializer_key. (This implies that self.serializer_key may not
  be used as a key for normal entries.)

  The backing database is an anydbm-based DBM.

  """

  serializer_key = '_.%$1\t;_ '

  def __init__(self, filename, mode, serializer=None):
    """Constructor.

    The database stores its Serializer, so none needs to be supplied
    when opening an existing database."""

    # pybsddb3 has a bug which prevents it from working with
    # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
    # causes the DB_TRUNCATE flag to be passed, which is disallowed
    # for databases protected by lock and transaction support
    # (bsddb databases use locking from bsddb version 4.2.4 onwards).
    #
    # Therefore, manually perform the removal (we can do this, because
    # we know that for bsddb - but *not* anydbm in general - the database
    # consists of one file with the name we specify, rather than several
    # based on that name).
    if mode == DB_OPEN_NEW and anydbm._defaultmod.__name__ == 'dbhash':
      if os.path.isfile(filename):
        os.unlink(filename)
      self.db = anydbm.open(filename, 'c')
    else:
      self.db = anydbm.open(filename, mode)

    # Import implementations for many mapping interface methods.
    for meth_name in ('__delitem__',
        '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
      meth_ref = getattr(self.db, meth_name, None)
      if meth_ref:
        setattr(self, meth_name, meth_ref)

    if mode == DB_OPEN_NEW:
      self.serializer = serializer
      self.db[self.serializer_key] = cPickle.dumps(self.serializer)
    else:
      self.serializer = cPickle.loads(self.db[self.serializer_key])

  def __getitem__(self, key):
    return self.serializer.loads(self.db[key])

  def __setitem__(self, key, value):
    self.db[key] = self.serializer.dumps(value)

  def __delitem__(self, key):
    # gdbm defines a __delitem__ method, but it cannot be assigned. So
    # this method provides a fallback definition via explicit delegation:
    del self.db[key]

  def keys(self):
    retval = self.db.keys()
    retval.remove(self.serializer_key)
    return retval

  def __iter__(self):
    for key in self.keys():
      yield key

  def has_key(self, key):
    try:
      self.db[key]
      return True
    except KeyError:
      return False

  def __contains__(self, key):
    return self.has_key(key)

  def iterkeys(self):
    return self.__iter__()

  def clear(self):
    for key in self.keys():
      del self[key]

  def items(self):
    return [(key, self[key],) for key in self.keys()]

  def values(self):
    return [self[key] for key in self.keys()]

  def get(self, key, default=None):
    try:
      return self[key]
    except KeyError:
      return default

  def close(self):
    self.db.close()
    self.db = None
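
# A minimal usage sketch for Database. The serializer class below is
# hypothetical (the project's real implementations live in
# cvs2svn_lib.serializer); any object with compatible dumps()/loads()
# methods works:
#
#   class PickleSerializer:
#     def dumps(self, obj):
#       return cPickle.dumps(obj, -1)
#     def loads(self, s):
#       return cPickle.loads(s)
#
#   db = Database('/tmp/example.db', DB_OPEN_NEW, PickleSerializer())
#   db['1.1'] = {'author': 'jrandom'}   # value is serialized transparently
#   print db['1.1']
#   db.close()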


class IndexedDatabase:
  """A file of objects that are written sequentially and read randomly.

  The objects are indexed by small non-negative integers, and a
  RecordTable is used to store the index -> fileoffset map.
  fileoffset=0 is used to represent an empty record. (An offset of 0
  cannot occur for a legitimate record because the serializer is
  written there.)

  The main file consists of a sequence of pickles (or other serialized
  data format). The zeroth record is a pickled Serializer.
  Subsequent ones are objects serialized using the serializer. The
  offset of each object in the file is stored to an index table so
  that the data can later be retrieved randomly.

  Objects are always stored to the end of the file. If an object is
  deleted or overwritten, the fact is recorded in the index_table but
  the space in the pickle file is not garbage collected. This has the
  advantage that one can create a modified version of a database that
  shares the main data file with an old version by copying the index
  file. But it has the disadvantage that space is wasted whenever
  objects are written multiple times."""

  def __init__(self, filename, index_filename, mode, serializer=None):
    """Initialize an IndexedDatabase, writing the serializer if necessary.

    SERIALIZER is only used if MODE is DB_OPEN_NEW; otherwise the
    serializer is read from the file."""

    self.filename = filename
    self.index_filename = index_filename
    self.mode = mode
    if self.mode == DB_OPEN_NEW:
      self.f = open(self.filename, 'wb+')
    elif self.mode == DB_OPEN_WRITE:
      self.f = open(self.filename, 'rb+')
    elif self.mode == DB_OPEN_READ:
      self.f = open(self.filename, 'rb')
    else:
      raise RuntimeError('Invalid mode %r' % self.mode)

    self.index_table = RecordTable(
        self.index_filename, self.mode, FileOffsetPacker()
        )

    if self.mode == DB_OPEN_NEW:
      assert serializer is not None
      self.serializer = serializer
      cPickle.dump(self.serializer, self.f, -1)
    else:
      # Read the memo from the first pickle:
      self.serializer = cPickle.load(self.f)

    # Seek to the end of the file, and record that position:
    self.f.seek(0, 2)
    self.fp = self.f.tell()
    self.eofp = self.fp

  def __setitem__(self, index, item):
    """Write ITEM into the database indexed by INDEX."""

    # Make sure we're at the end of the file:
    if self.fp != self.eofp:
      self.f.seek(self.eofp)
    self.index_table[index] = self.eofp
    s = self.serializer.dumps(item)
    self.f.write(s)
    self.eofp += len(s)
    self.fp = self.eofp

  def _fetch(self, offset):
    if self.fp != offset:
      self.f.seek(offset)

    # There is no easy way to tell how much data will be read, so just
    # indicate that we don't know the current file pointer:
    self.fp = None

    return self.serializer.loadf(self.f)

  def iterkeys(self):
    return self.index_table.iterkeys()

  def itervalues(self):
    for offset in self.index_table.itervalues():
      yield self._fetch(offset)

  def __getitem__(self, index):
    offset = self.index_table[index]
    return self._fetch(offset)

  def get(self, item, default=None):
    try:
      return self[item]
    except KeyError:
      return default

  def get_many(self, indexes, default=None):
    """Yield (index,item) tuples for INDEXES, in arbitrary order.

    Yield (index,default) for indexes with no defined values."""

    offsets = []
    for (index, offset) in self.index_table.get_many(indexes):
      if offset is None:
        yield (index, default)
      else:
        offsets.append((offset, index))

    # Sort the offsets to reduce disk seeking:
    offsets.sort()
    for (offset, index) in offsets:
      yield (index, self._fetch(offset))

  def __delitem__(self, index):
    # We don't actually free the data in self.f.
    del self.index_table[index]

  def close(self):
    self.index_table.close()
    self.index_table = None
    self.f.close()
    self.f = None

  def __str__(self):
    return 'IndexedDatabase(%r)' % (self.filename,)
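
# A sketch of the write-sequentially/read-randomly cycle. The filenames and
# serializer are placeholders; note that a serializer used here also needs a
# loadf(f) method, which _fetch() calls to deserialize straight from the
# file object:
#
#   db = IndexedDatabase('items.dat', 'items.idx', DB_OPEN_NEW, serializer)
#   db[0] = first_object        # appended at the current end of the file
#   db[1] = second_object
#   db.close()
#
#   db = IndexedDatabase('items.dat', 'items.idx', DB_OPEN_READ)
#   print db[1]                 # random access via the index table
#   for (i, obj) in db.get_many([0, 1, 7], default=None):
#     print i, obj              # index 7 is undefined, so it yields None
#   db.close()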


class IndexedStore(IndexedDatabase):
  """A file of items that is written sequentially and read randomly.

  This is just like IndexedDatabase, except that it has an additional
  add() method which assumes that the object to be written to the
  database has an 'id' member, which is used as its database index.
  See IndexedDatabase for more information."""

  def add(self, item):
    """Write ITEM into the database indexed by ITEM.id."""

    self[item.id] = item
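
# For example (the item object is hypothetical; anything with an 'id'
# attribute that is a small non-negative integer will do):
#
#   store = IndexedStore('cvs-items.dat', 'cvs-items.idx',
#                        DB_OPEN_NEW, serializer)
#   store.add(item)             # equivalent to: store[item.id] = item
#   assert store[item.id].id == item.id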