1 # (Be in -*- python -*- mode.)
3 # ====================================================================
4 # Copyright (c) 2000-2009 CollabNet. All rights reserved.
6 # This software is licensed as described in the file COPYING, which
7 # you should have received as part of this distribution. The terms
8 # are also available at http://subversion.tigris.org/license-1.html.
9 # If newer versions of this license are posted there, you may use a
10 # newer version instead, at your option.
12 # This software consists of voluntary contributions made by many
13 # individuals. For exact contribution history, see the revision
14 # history and logs, available at http://cvs2svn.tigris.org/.
15 # ====================================================================
17 """This module contains database facilities used by cvs2svn."""
import sys
import os
import cPickle

from cvs2svn_lib.common import DB_OPEN_READ
from cvs2svn_lib.common import DB_OPEN_WRITE
from cvs2svn_lib.common import DB_OPEN_NEW
from cvs2svn_lib.common import warning_prefix
from cvs2svn_lib.common import error_prefix
from cvs2svn_lib.log import Log
from cvs2svn_lib.record_table import FileOffsetPacker
from cvs2svn_lib.record_table import RecordTable
34 # DBM module selection
36 # 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
37 # so that the dbhash module used by anydbm will use bsddb3.
40 sys
.modules
['bsddb'] = sys
.modules
['bsddb3']
44 # 2. These DBM modules are not good for cvs2svn.
46 if anydbm
._defaultmod
.__name
__ in ['dumbdbm', 'dbm']:
48 '%s: cvs2svn uses the anydbm package, which depends on lower level '
50 'libraries. Your system has %s, with which cvs2svn is known to have\n'
51 'problems. To use cvs2svn, you must install a Python dbm library '
53 'dumbdbm or dbm. See '
54 'http://python.org/doc/current/lib/module-anydbm.html\n'
55 'for more information.\n'
56 % (error_prefix
, anydbm
._defaultmod
.__name
__,)
60 # 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
61 # Unfortunately, gdbm appears not to be trouble free, either.
62 if hasattr(anydbm
._defaultmod
, 'bsddb') \
63 and not hasattr(anydbm
._defaultmod
.bsddb
, '__version__'):
65 gdbm
= __import__('gdbm')
68 '%s: The version of the bsddb module found on your computer '
70 'reported to malfunction on some datasets, causing KeyError '
75 anydbm
._defaultmod
= gdbm
79 """A database that uses a Serializer to store objects of a certain type.
81 The serializer is stored in the database under the key
82 self.serializer_key. (This implies that self.serializer_key may not
83 be used as a key for normal entries.)
85 The backing database is an anydbm-based DBM.
89 serializer_key
= '_.%$1\t;_ '
91 def __init__(self
, filename
, mode
, serializer
=None):
94 The database stores its Serializer, so none needs to be supplied
95 when opening an existing database."""
97 # pybsddb3 has a bug which prevents it from working with
98 # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
99 # causes the DB_TRUNCATE flag to be passed, which is disallowed
100 # for databases protected by lock and transaction support
101 # (bsddb databases use locking from bsddb version 4.2.4 onwards).
103 # Therefore, manually perform the removal (we can do this, because
104 # we know that for bsddb - but *not* anydbm in general - the database
105 # consists of one file with the name we specify, rather than several
106 # based on that name).
107 if mode
== DB_OPEN_NEW
and anydbm
._defaultmod
.__name
__ == 'dbhash':
108 if os
.path
.isfile(filename
):
110 self
.db
= anydbm
.open(filename
, 'c')
112 self
.db
= anydbm
.open(filename
, mode
)
114 # Import implementations for many mapping interface methods.
115 for meth_name
in ('__delitem__',
116 '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
117 meth_ref
= getattr(self
.db
, meth_name
, None)
119 setattr(self
, meth_name
, meth_ref
)
121 if mode
== DB_OPEN_NEW
:
122 self
.serializer
= serializer
123 self
.db
[self
.serializer_key
] = cPickle
.dumps(self
.serializer
)
125 self
.serializer
= cPickle
.loads(self
.db
[self
.serializer_key
])
127 def __getitem__(self
, key
):
128 return self
.serializer
.loads(self
.db
[key
])
130 def __setitem__(self
, key
, value
):
131 self
.db
[key
] = self
.serializer
.dumps(value
)
133 def __delitem__(self
, key
):
134 # gdbm defines a __delitem__ method, but it cannot be assigned. So
135 # this method provides a fallback definition via explicit delegation:
139 retval
= self
.db
.keys()
140 retval
.remove(self
.serializer_key
)
144 for key
in self
.keys():
147 def has_key(self
, key
):
154 def __contains__(self
, key
):
155 return self
.has_key(key
)
158 return self
.__iter
__()
161 for key
in self
.keys():
165 return [(key
, self
[key
],) for key
in self
.keys()]
168 return [self
[key
] for key
in self
.keys()]
170 def get(self
, key
, default
=None):
181 class IndexedDatabase
:
182 """A file of objects that are written sequentially and read randomly.
184 The objects are indexed by small non-negative integers, and a
185 RecordTable is used to store the index -> fileoffset map.
186 fileoffset=0 is used to represent an empty record. (An offset of 0
187 cannot occur for a legitimate record because the serializer is
190 The main file consists of a sequence of pickles (or other serialized
191 data format). The zeroth record is a pickled Serializer.
192 Subsequent ones are objects serialized using the serializer. The
193 offset of each object in the file is stored to an index table so
194 that the data can later be retrieved randomly.
196 Objects are always stored to the end of the file. If an object is
197 deleted or overwritten, the fact is recorded in the index_table but
198 the space in the pickle file is not garbage collected. This has the
199 advantage that one can create a modified version of a database that
200 shares the main data file with an old version by copying the index
201 file. But it has the disadvantage that space is wasted whenever
202 objects are written multiple times."""
204 def __init__(self
, filename
, index_filename
, mode
, serializer
=None):
205 """Initialize an IndexedDatabase, writing the serializer if necessary.
207 SERIALIZER is only used if MODE is DB_OPEN_NEW; otherwise the
208 serializer is read from the file."""
210 self
.filename
= filename
211 self
.index_filename
= index_filename
213 if self
.mode
== DB_OPEN_NEW
:
214 self
.f
= open(self
.filename
, 'wb+')
215 elif self
.mode
== DB_OPEN_WRITE
:
216 self
.f
= open(self
.filename
, 'rb+')
217 elif self
.mode
== DB_OPEN_READ
:
218 self
.f
= open(self
.filename
, 'rb')
220 raise RuntimeError('Invalid mode %r' % self
.mode
)
222 self
.index_table
= RecordTable(
223 self
.index_filename
, self
.mode
, FileOffsetPacker()
226 if self
.mode
== DB_OPEN_NEW
:
227 assert serializer
is not None
228 self
.serializer
= serializer
229 cPickle
.dump(self
.serializer
, self
.f
, -1)
231 # Read the memo from the first pickle:
232 self
.serializer
= cPickle
.load(self
.f
)
234 # Seek to the end of the file, and record that position:
236 self
.fp
= self
.f
.tell()
239 def __setitem__(self
, index
, item
):
240 """Write ITEM into the database indexed by INDEX."""
242 # Make sure we're at the end of the file:
243 if self
.fp
!= self
.eofp
:
244 self
.f
.seek(self
.eofp
)
245 self
.index_table
[index
] = self
.eofp
246 s
= self
.serializer
.dumps(item
)
251 def _fetch(self
, offset
):
252 if self
.fp
!= offset
:
255 # There is no easy way to tell how much data will be read, so just
256 # indicate that we don't know the current file pointer:
259 return self
.serializer
.loadf(self
.f
)
262 return self
.index_table
.iterkeys()
264 def itervalues(self
):
265 for offset
in self
.index_table
.itervalues():
266 yield self
._fetch
(offset
)
268 def __getitem__(self
, index
):
269 offset
= self
.index_table
[index
]
270 return self
._fetch
(offset
)
272 def get(self
, item
, default
=None):
278 def get_many(self
, indexes
, default
=None):
279 """Yield (index,item) tuples for INDEXES, in arbitrary order.
281 Yield (index,default) for indexes with no defined values."""
284 for (index
, offset
) in self
.index_table
.get_many(indexes
):
286 yield (index
, default
)
288 offsets
.append((offset
, index
))
290 # Sort the offsets to reduce disk seeking:
292 for (offset
,index
) in offsets
:
293 yield (index
, self
._fetch
(offset
))
295 def __delitem__(self
, index
):
296 # We don't actually free the data in self.f.
297 del self
.index_table
[index
]
300 self
.index_table
.close()
301 self
.index_table
= None
306 return 'IndexedDatabase(%r)' % (self
.filename
,)
309 class IndexedStore(IndexedDatabase
):
310 """A file of items that is written sequentially and read randomly.
312 This is just like IndexedDatabase, except that it has an additional
313 add() method which assumes that the object to be written to the
314 database has an 'id' member, which is used as its database index.
315 See IndexedDatabase for more information."""
318 """Write ITEM into the database indexed by ITEM.id."""