cvs2svn_lib/record_table.py

# (Be in -*- python -*- mode.)

# ====================================================================
# Copyright (c) 2000-2007 CollabNet. All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://subversion.tigris.org/license-1.html.
# If newer versions of this license are posted there, you may use a
# newer version instead, at your option.
#
# This software consists of voluntary contributions made by many
# individuals. For exact contribution history, see the revision
# history and logs, available at http://cvs2svn.tigris.org/.
# ====================================================================
17 """Classes to manage Databases of fixed-length records.
19 The databases map small, non-negative integers to fixed-size records.
20 The records are written in index order to a disk file. Gaps in the
21 index sequence leave gaps in the data file, so for best space
22 efficiency the indexes of existing records should be approximately
23 continuous.
25 To use a RecordTable, you need a class derived from Packer which can
26 serialize/deserialize your records into fixed-size strings. Deriving
27 classes have to specify how to pack records into strings and unpack
28 strings into records by overwriting the pack() and unpack() methods
29 respectively.
31 Note that these classes keep track of gaps in the records that have
32 been written by filling them with packer.empty_value. If a record is
33 read which contains packer.empty_value, then a KeyError is raised."""


import os
import types
import struct
import mmap

from cvs2svn_lib.common import DB_OPEN_READ
from cvs2svn_lib.common import DB_OPEN_WRITE
from cvs2svn_lib.common import DB_OPEN_NEW
from cvs2svn_lib.log import Log


# A unique value that can be used to stand for "unset" without
# preventing the use of None.
_unset = object()


class Packer(object):
  def __init__(self, record_len, empty_value=None):
    self.record_len = record_len
    if empty_value is None:
      self.empty_value = '\0' * self.record_len
    else:
      assert type(empty_value) is types.StringType
      assert len(empty_value) == self.record_len
      self.empty_value = empty_value

  def pack(self, v):
    """Pack record V into a string of length self.record_len."""

    raise NotImplementedError()

  def unpack(self, s):
    """Unpack string S into a record."""

    raise NotImplementedError()
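

# Illustrative sketch (ours, not part of the original module): a
# minimal Packer subclass for fixed-width byte strings, NUL-padded on
# the right. The class name is hypothetical. Note that pack('') would
# collide with the default empty_value, so when empty strings are
# legitimate records, construct it with a distinctive sentinel, e.g.
# FixedStringPacker(4, empty_value='\xff' * 4).
class FixedStringPacker(Packer):
  def pack(self, v):
    assert len(v) <= self.record_len
    # Pad to exactly record_len bytes:
    return v + '\0' * (self.record_len - len(v))

  def unpack(self, s):
    # Strip the padding added by pack():
    return s.rstrip('\0')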


class StructPacker(Packer):
  def __init__(self, format, empty_value=_unset):
    self.format = format
    if empty_value is not _unset:
      empty_value = self.pack(empty_value)
    else:
      empty_value = None

    Packer.__init__(self, struct.calcsize(self.format),
                    empty_value=empty_value)

  def pack(self, v):
    return struct.pack(self.format, v)

  def unpack(self, v):
    return struct.unpack(self.format, v)[0]


class UnsignedIntegerPacker(StructPacker):
  def __init__(self, empty_value=0):
    StructPacker.__init__(self, '=I', empty_value)


class SignedIntegerPacker(StructPacker):
  def __init__(self, empty_value=0):
    StructPacker.__init__(self, '=i', empty_value)
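

# For example (illustrative, not from the original source), the integer
# packers round-trip 4-byte native-endian values:
#
#   packer = UnsignedIntegerPacker()
#   s = packer.pack(1000)   # on a little-endian machine: '\xe8\x03\x00\x00'
#   assert len(s) == packer.record_len == 4
#   assert packer.unpack(s) == 1000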


class FileOffsetPacker(Packer):
  """A packer suitable for file offsets.

  We store the 5 least significant bytes of the file offset. This is
  enough bits to represent 1 TiB. Of course if the computer doesn't
  have large file support, only the lowest 31 bits can be nonzero, and
  the offsets are limited to 2 GiB."""

  # Convert file offsets to 8-byte little-endian unsigned longs...
  INDEX_FORMAT = '<Q'
  # ...but then truncate to 5 bytes.
  INDEX_FORMAT_LEN = 5

  PAD = '\0' * (struct.calcsize(INDEX_FORMAT) - INDEX_FORMAT_LEN)

  def __init__(self):
    Packer.__init__(self, self.INDEX_FORMAT_LEN)

  def pack(self, v):
    return struct.pack(self.INDEX_FORMAT, v)[:self.INDEX_FORMAT_LEN]

  def unpack(self, s):
    return struct.unpack(self.INDEX_FORMAT, s + self.PAD)[0]
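

# A quick round-trip sketch (illustrative, not from the original
# source): a 40-bit offset survives the 5-byte truncation because
# pack() drops only the three most significant (zero) bytes, which
# unpack() restores via PAD:
#
#   packer = FileOffsetPacker()
#   s = packer.pack(0x1234567890)   # 5 bytes: '\x90\x78\x56\x34\x12'
#   assert len(s) == 5
#   assert packer.unpack(s) == 0x1234567890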


class RecordTableAccessError(RuntimeError):
  pass


class AbstractRecordTable:
  def __init__(self, filename, mode, packer):
    self.filename = filename
    self.mode = mode
    self.packer = packer
    # Simplify and speed access to this oft-needed quantity:
    self._record_len = self.packer.record_len

  def __str__(self):
    return '%s(%r)' % (self.__class__.__name__, self.filename,)

  def _set_packed_record(self, i, s):
    """Set the value for index I to the packed value S."""

    raise NotImplementedError()

  def __setitem__(self, i, v):
    self._set_packed_record(i, self.packer.pack(v))

  def _get_packed_record(self, i):
    """Return the packed record for index I.

    Raise KeyError if it is not present."""

    raise NotImplementedError()

  def __getitem__(self, i):
    """Return the item for index I.

    Raise KeyError if that item has never been set (or if it was set
    to self.packer.empty_value)."""

    s = self._get_packed_record(i)

    if s == self.packer.empty_value:
      raise KeyError(i)

    return self.packer.unpack(s)

  def get_many(self, indexes, default=None):
    """Yield (index, item) tuples for INDEXES in arbitrary order.

    Yield (index, default) for indexes for which no item is defined."""

    indexes = list(indexes)
    # Sort the indexes to reduce disk seeking:
    indexes.sort()
    for i in indexes:
      yield (i, self.get(i, default))

  def get(self, i, default=None):
    try:
      return self[i]
    except KeyError:
      return default

  def __delitem__(self, i):
    """Delete the item for index I.

    Raise KeyError if that item has never been set (or if it was set
    to self.packer.empty_value)."""

    if self.mode == DB_OPEN_READ:
      raise RecordTableAccessError()

    # Check that the value was set (otherwise raise KeyError):
    self[i]
    self._set_packed_record(i, self.packer.empty_value)

  def iterkeys(self):
    """Yield the keys in the map in key order."""

    for i in xrange(0, self._limit):
      try:
        self[i]
        yield i
      except KeyError:
        pass

  def itervalues(self):
    """Yield the values in the map in key order.

    Skip over values that haven't been defined."""

    for i in xrange(0, self._limit):
      try:
        yield self[i]
      except KeyError:
        pass
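

# Both subclasses below expose this dict-like protocol. An illustrative
# session (ours; the filename is hypothetical):
#
#   t = RecordTable('offsets.dat', DB_OPEN_NEW, UnsignedIntegerPacker())
#   t[0] = 7
#   t[5] = 42               # indexes 1..4 are padded with empty_value
#   t.get(3, 'missing')     # -> 'missing'; gap reads raise KeyError
#   list(t.itervalues())    # -> [7, 42]
#   t.close()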


class RecordTable(AbstractRecordTable):
  # The approximate amount of memory that should be used for the cache
  # for each instance of this class:
  CACHE_MEMORY = 4 * 1024 * 1024

  # Empirically, each entry in the cache table has an overhead of
  # about 96 bytes on a 32-bit computer.
  CACHE_OVERHEAD_PER_ENTRY = 96

  def __init__(self, filename, mode, packer, cache_memory=CACHE_MEMORY):
    AbstractRecordTable.__init__(self, filename, mode, packer)
    if self.mode == DB_OPEN_NEW:
      self.f = open(self.filename, 'wb+')
    elif self.mode == DB_OPEN_WRITE:
      self.f = open(self.filename, 'rb+')
    elif self.mode == DB_OPEN_READ:
      self.f = open(self.filename, 'rb')
    else:
      raise RuntimeError('Invalid mode %r' % self.mode)
    self.cache_memory = cache_memory

    # Number of items that can be stored in the write cache.
    self._max_memory_cache = (
        self.cache_memory
        / (self.CACHE_OVERHEAD_PER_ENTRY + self._record_len))
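
    # Worked example (ours, not from the original source): with the
    # default 4 MiB budget and 4-byte records, this allows roughly
    # 4*1024*1024 / (96 + 4) ~= 41943 cached entries before a flush.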

    # Read and write cache; a map {i : (dirty, s)}, where i is an
    # index, dirty indicates whether the value has to be written to
    # disk, and s is the packed value for the index. Up to
    # self._max_memory_cache items can be stored here. When the cache
    # fills up, it is written to disk in one go and then cleared.
    self._cache = {}

    # The index just beyond the last record ever written:
    self._limit = os.path.getsize(self.filename) // self._record_len

    # The index just beyond the last record ever written to disk:
    self._limit_written = self._limit

  def flush(self):
    Log().debug('Flushing cache for %s' % (self,))

    pairs = [(i, s) for (i, (dirty, s)) in self._cache.items() if dirty]

    if pairs:
      pairs.sort()
      old_i = None
      f = self.f
      for (i, s) in pairs:
        if i == old_i:
          # No seeking needed:
          pass
        elif i <= self._limit_written:
          # Just jump there:
          f.seek(i * self._record_len)
        else:
          # Jump to the end of the file then write empty_value records
          # until we reach the correct location:
          f.seek(self._limit_written * self._record_len)
          while self._limit_written < i:
            f.write(self.packer.empty_value)
            self._limit_written += 1
        f.write(s)
        old_i = i + 1
      self._limit_written = max(self._limit_written, old_i)

      self.f.flush()

    self._cache.clear()

  def _set_packed_record(self, i, s):
    if self.mode == DB_OPEN_READ:
      raise RecordTableAccessError()
    if i < 0:
      raise KeyError()
    self._cache[i] = (True, s)
    if len(self._cache) >= self._max_memory_cache:
      self.flush()
    self._limit = max(self._limit, i + 1)

  def _get_packed_record(self, i):
    try:
      return self._cache[i][1]
    except KeyError:
      if not 0 <= i < self._limit_written:
        raise KeyError(i)
      self.f.seek(i * self._record_len)
      s = self.f.read(self._record_len)
      self._cache[i] = (False, s)
      if len(self._cache) >= self._max_memory_cache:
        self.flush()

      return s

  def close(self):
    self.flush()
    self._cache = None
    self.f.close()
    self.f = None
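

# End-to-end sketch (illustrative, not from the original source; the
# filename is hypothetical):
#
#   t = RecordTable('/tmp/table.dat', DB_OPEN_NEW, SignedIntegerPacker())
#   t[10] = -3            # buffered in the write cache
#   t.close()             # flush() pads indexes 0..9 with empty_value
#
#   t = RecordTable('/tmp/table.dat', DB_OPEN_READ, SignedIntegerPacker())
#   assert t[10] == -3
#   t.close()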


class MmapRecordTable(AbstractRecordTable):
  GROWTH_INCREMENT = 65536

  def __init__(self, filename, mode, packer):
    AbstractRecordTable.__init__(self, filename, mode, packer)
    if self.mode == DB_OPEN_NEW:
      self.python_file = open(self.filename, 'wb+')
      self.python_file.write('\0' * self.GROWTH_INCREMENT)
      self.python_file.flush()
      self._filesize = self.GROWTH_INCREMENT
      self.f = mmap.mmap(
          self.python_file.fileno(), self._filesize,
          access=mmap.ACCESS_WRITE
          )

      # The index just beyond the last record ever written:
      self._limit = 0
    elif self.mode == DB_OPEN_WRITE:
      self.python_file = open(self.filename, 'rb+')
      self._filesize = os.path.getsize(self.filename)
      self.f = mmap.mmap(
          self.python_file.fileno(), self._filesize,
          access=mmap.ACCESS_WRITE
          )

      # The index just beyond the last record ever written:
      self._limit = os.path.getsize(self.filename) // self._record_len
    elif self.mode == DB_OPEN_READ:
      self.python_file = open(self.filename, 'rb')
      self._filesize = os.path.getsize(self.filename)
      self.f = mmap.mmap(
          self.python_file.fileno(), self._filesize,
          access=mmap.ACCESS_READ
          )

      # The index just beyond the last record ever written:
      self._limit = os.path.getsize(self.filename) // self._record_len
    else:
      raise RuntimeError('Invalid mode %r' % self.mode)

  def flush(self):
    self.f.flush()

  def _set_packed_record(self, i, s):
    if self.mode == DB_OPEN_READ:
      raise RecordTableAccessError()
    if i < 0:
      raise KeyError()
    if i >= self._limit:
      # This write extends the range of valid indices. First check
      # whether the file has to be enlarged:
      new_size = (i + 1) * self._record_len
      if new_size > self._filesize:
        self._filesize = (
            (new_size + self.GROWTH_INCREMENT - 1)
            // self.GROWTH_INCREMENT
            * self.GROWTH_INCREMENT
            )
        self.f.resize(self._filesize)
      if i > self._limit:
        # Pad up to the new record with empty_value:
        self.f[self._limit * self._record_len:i * self._record_len] = \
            self.packer.empty_value * (i - self._limit)
      self._limit = i + 1

    self.f[i * self._record_len:(i + 1) * self._record_len] = s

  def _get_packed_record(self, i):
    if not 0 <= i < self._limit:
      raise KeyError(i)
    return self.f[i * self._record_len:(i + 1) * self._record_len]

  def close(self):
    self.flush()
    self.f.close()
    self.python_file.close()
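

# Choosing between the two subclasses (our note, not from the original
# source): RecordTable funnels all access through an in-memory cache
# and ordinary file I/O, while MmapRecordTable maps the whole file and
# rounds its size up in GROWTH_INCREMENT steps, so writes land directly
# in the mapping:
#
#   t = MmapRecordTable('/tmp/table.mmap', DB_OPEN_NEW,
#                       UnsignedIntegerPacker())
#   t[100000] = 1         # grows the file in 64 KiB steps, pads gaps
#   assert t[100000] == 1
#   t.close()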