1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2007 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """This module contains database facilities used by cvs2svn."""
import os
import sys
import anydbm
import cPickle

from cvs2svn_lib.common import DB_OPEN_READ
from cvs2svn_lib.common import DB_OPEN_WRITE
from cvs2svn_lib.common import DB_OPEN_NEW
from cvs2svn_lib.common import warning_prefix
from cvs2svn_lib.common import error_prefix
from cvs2svn_lib.log import Log
from cvs2svn_lib.record_table import FileOffsetPacker
from cvs2svn_lib.record_table import RecordTable
34 # DBM module selection
36 # 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
37 # so that the dbhash module used by anydbm will use bsddb3.
40 sys
.modules
['bsddb'] = sys
.modules
['bsddb3']
44 # 2. These DBM modules are not good for cvs2svn.
46 if anydbm
._defaultmod
.__name
__ in ['dumbdbm', 'dbm']:
48 '%s: cvs2svn uses the anydbm package, which depends on lower level '
50 'libraries. Your system has %s, with which cvs2svn is known to have\n'
51 'problems. To use cvs2svn, you must install a Python dbm library '
53 'dumbdbm or dbm. See '
54 'http://python.org/doc/current/lib/module-anydbm.html\n'
55 'for more information.\n'
56 % (error_prefix
, anydbm
._defaultmod
.__name
__,)
60 # 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
61 # Unfortunately, gdbm appears not to be trouble free, either.
62 if hasattr(anydbm
._defaultmod
, 'bsddb') \
63 and not hasattr(anydbm
._defaultmod
.bsddb
, '__version__'):
65 gdbm
= __import__('gdbm')
68 '%s: The version of the bsddb module found on your computer '
70 'reported to malfunction on some datasets, causing KeyError '
75 anydbm
._defaultmod
= gdbm
78 class AbstractDatabase
:
79 """An abstract base class for anydbm-based databases."""
81 def __init__(self
, filename
, mode
):
82 """A convenience function for opening an anydbm database."""
84 # pybsddb3 has a bug which prevents it from working with
85 # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
86 # causes the DB_TRUNCATE flag to be passed, which is disallowed
87 # for databases protected by lock and transaction support
88 # (bsddb databases use locking from bsddb version 4.2.4 onwards).
90 # Therefore, manually perform the removal (we can do this, because
91 # we know that for bsddb - but *not* anydbm in general - the database
92 # consists of one file with the name we specify, rather than several
93 # based on that name).
94 if mode
== 'n' and anydbm
._defaultmod
.__name
__ == 'dbhash':
95 if os
.path
.isfile(filename
):
99 self
.db
= anydbm
.open(filename
, mode
)
101 # Import implementations for many mapping interface methods. Note
102 # that we specifically do not do this for any method which handles
103 # *values*, because our derived classes define __getitem__ and
104 # __setitem__ to override the storage of values, and grabbing
105 # methods directly from the dbm object would bypass this.
106 for meth_name
in ('__delitem__',
107 '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
108 meth_ref
= getattr(self
.db
, meth_name
, None)
110 setattr(self
, meth_name
, meth_ref
)
112 def __delitem__(self
, key
):
113 # gdbm defines a __delitem__ method, but it cannot be assigned. So
114 # this method provides a fallback definition via explicit delegation:
118 return self
.db
.keys()
121 for key
in self
.keys():
124 def has_key(self
, key
):
131 def __contains__(self
, key
):
132 return self
.has_key(key
)
135 return self
.__iter
__()
138 for key
in self
.keys():
142 return [(key
, self
[key
],) for key
in self
.keys()]
145 return [self
[key
] for key
in self
.keys()]
147 def get(self
, key
, default
=None):
158 class Database(AbstractDatabase
):
159 """A database that uses a Serializer to store objects of a certain type.
161 Since the database entry with the key self.serializer_key is used to
162 store the serializer, self.serializer_key may not be used as a key for
165 serializer_key
= '_.%$1\t;_ '
167 def __init__(self
, filename
, mode
, serializer
=None):
170 The database stores its Serializer, so none needs to be supplied
171 when opening an existing database."""
173 AbstractDatabase
.__init
__(self
, filename
, mode
)
175 if mode
== DB_OPEN_NEW
:
176 self
.serializer
= serializer
177 self
.db
[self
.serializer_key
] = cPickle
.dumps(self
.serializer
)
179 self
.serializer
= cPickle
.loads(self
.db
[self
.serializer_key
])
181 def __getitem__(self
, key
):
182 return self
.serializer
.loads(self
.db
[key
])
184 def __setitem__(self
, key
, value
):
185 self
.db
[key
] = self
.serializer
.dumps(value
)
187 def keys(self
): # TODO: Once needed, handle iterators as well.
188 retval
= self
.db
.keys()
189 retval
.remove(self
.serializer_key
)
193 class IndexedDatabase
:
194 """A file of objects that are written sequentially and read randomly.
196 The objects are indexed by small non-negative integers, and a
197 RecordTable is used to store the index -> fileoffset map.
198 fileoffset=0 is used to represent an empty record. (An offset of 0
199 cannot occur for a legitimate record because the serializer is
202 The main file consists of a sequence of pickles (or other serialized
203 data format). The zeroth record is a pickled Serializer.
204 Subsequent ones are objects serialized using the serializer. The
205 offset of each object in the file is stored to an index table so
206 that the data can later be retrieved randomly.
208 Objects are always stored to the end of the file. If an object is
209 deleted or overwritten, the fact is recorded in the index_table but
210 the space in the pickle file is not garbage collected. This has the
211 advantage that one can create a modified version of a database that
212 shares the main data file with an old version by copying the index
213 file. But it has the disadvantage that space is wasted whenever
214 objects are written multiple times."""
216 def __init__(self
, filename
, index_filename
, mode
, serializer
=None):
217 """Initialize an IndexedDatabase, writing the serializer if necessary.
219 SERIALIZER is only used if MODE is DB_OPEN_NEW; otherwise the
220 serializer is read from the file."""
222 self
.filename
= filename
223 self
.index_filename
= index_filename
225 if self
.mode
== DB_OPEN_NEW
:
226 self
.f
= open(self
.filename
, 'wb+')
227 elif self
.mode
== DB_OPEN_WRITE
:
228 self
.f
= open(self
.filename
, 'rb+')
229 elif self
.mode
== DB_OPEN_READ
:
230 self
.f
= open(self
.filename
, 'rb')
232 raise RuntimeError('Invalid mode %r' % self
.mode
)
234 self
.index_table
= RecordTable(
235 self
.index_filename
, self
.mode
, FileOffsetPacker()
238 if self
.mode
== DB_OPEN_NEW
:
239 assert serializer
is not None
240 self
.serializer
= serializer
241 cPickle
.dump(self
.serializer
, self
.f
, -1)
243 # Read the memo from the first pickle:
244 self
.serializer
= cPickle
.load(self
.f
)
246 # Seek to the end of the file, and record that position:
248 self
.fp
= self
.f
.tell()
251 def __setitem__(self
, index
, item
):
252 """Write ITEM into the database indexed by INDEX."""
254 # Make sure we're at the end of the file:
255 if self
.fp
!= self
.eofp
:
256 self
.f
.seek(self
.eofp
)
257 self
.index_table
[index
] = self
.eofp
258 s
= self
.serializer
.dumps(item
)
263 def _fetch(self
, offset
):
264 if self
.fp
!= offset
:
267 # There is no easy way to tell how much data will be read, so just
268 # indicate that we don't know the current file pointer:
271 return self
.serializer
.loadf(self
.f
)
274 return self
.index_table
.iterkeys()
276 def itervalues(self
):
277 for offset
in self
.index_table
.itervalues():
278 yield self
._fetch
(offset
)
280 def __getitem__(self
, index
):
281 offset
= self
.index_table
[index
]
282 return self
._fetch
(offset
)
284 def get(self
, item
, default
=None):
290 def get_many(self
, indexes
, default
=None):
291 """Yield (index,item) tuples for INDEXES, in arbitrary order.
293 Yield (index,default) for indexes with no defined values."""
296 for (index
, offset
) in self
.index_table
.get_many(indexes
):
298 yield (index
, default
)
300 offsets
.append((offset
, index
))
302 # Sort the offsets to reduce disk seeking:
304 for (offset
,index
) in offsets
:
305 yield (index
, self
._fetch
(offset
))
307 def __delitem__(self
, index
):
308 # We don't actually free the data in self.f.
309 del self
.index_table
[index
]
312 self
.index_table
.close()
313 self
.index_table
= None
318 return 'IndexedDatabase(%r)' % (self
.filename
,)
321 class IndexedStore(IndexedDatabase
):
322 """A file of items that is written sequentially and read randomly.
324 This is just like IndexedDatabase, except that it has an additional
325 add() method which assumes that the object to be written to the
326 database has an 'id' member, which is used as its database index.
327 See IndexedDatabase for more information."""
330 """Write ITEM into the database indexed by ITEM.id."""