cvs2svn_lib/checkout_internal.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2007-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """This module contains classes that implement the --use-internal-co option.
  18
  19 The idea is to patch up the revisions' contents incrementally, thus
  20 avoiding the huge number of process spawns and the O(n^2) overhead of
  21 using 'co' and 'cvs'.
  22
  23 InternalRevisionCollector saves the RCS deltas and RCS revision trees
  24 to databases.  Notably, deltas from the trunk need to be reversed, as
  25 CVS stores them so they apply from HEAD backwards.
  26
  27 InternalRevisionReader produces the revisions' contents on demand.  To
  28 generate the text for a typical revision, we need the revision's delta
  29 text plus the fulltext of the previous revision.  Therefore, we
  30 maintain a checkout database containing a copy of the fulltext of any
  31 revision for which subsequent revisions still need to be retrieved.
  32 It is crucial to remove text from this database as soon as it is no
  33 longer needed, to prevent it from growing enormous.
  34
  35 There are two reasons that the text from a revision can be needed: (1)
  36 because the revision itself still needs to be output to a dumpfile;
  37 (2) because another revision needs it as the base of its delta.  We
  38 maintain a reference count for each revision, which includes *both*
  39 possibilities.  The first time a revision's text is needed, it is
  40 generated by applying the revision's deltatext to the previous
  41 revision's fulltext, and the resulting fulltext is stored in the
  42 checkout database.  Each time a revision's fulltext is retrieved, its
  43 reference count is decremented.  When the reference count goes to
  44 zero, then the fulltext is deleted from the checkout database.
  45
  46 The administrative data for managing this consists of one TextRecord
  47 entry for each revision.  Each TextRecord has an id, which is the same
  48 id as used for the corresponding CVSRevision instance.  It also
  49 maintains a count of the times it is expected to be retrieved.
  50 TextRecords come in several varieties:
  51
  52 FullTextRecord -- Used for revisions whose fulltext is derived
  53     directly from the RCS file by the InternalRevisionCollector (i.e.,
  54     typically revision 1.1 of each file).
  55
  56 DeltaTextRecord -- Used for revisions that are defined via a delta
  57     relative to some other TextRecord.  These records record the id of
  58     the TextRecord that holds the base text against which the delta is
  59     defined.  When the text for a DeltaTextRecord is retrieved, the
  60     DeltaTextRecord instance is deleted and a CheckedOutTextRecord
  61     instance is created to take its place.
  62
  63 CheckedOutTextRecord -- Used during OutputPass for a revision that
  64     started out as a DeltaTextRecord, but has already been retrieved
  65     (and therefore its fulltext is stored in the checkout database).
  66
  67 While a file is being processed during FilterSymbolsPass, the fulltext
  68 and deltas are stored to the delta database, and TextRecord instances
  69 are created to keep track of things.  The reference counts are all
  70 initialized: each record referred to by a delta has its refcount
  71 incremented, and each record that corresponds to a non-delete
  72 CVSRevision is incremented.  After that, any records with refcount==0
  73 are removed.  When one record is removed, that can cause another
  74 record's reference count to go to zero and be removed too,
  75 recursively.  When a TextRecord is deleted at this stage, its
  76 deltatext is also deleted from the delta database."""
  77
  78
  79 import re
  80 import time
  81
  82 from cvs2svn_lib import config
  83 from cvs2svn_lib.common import DB_OPEN_NEW
  84 from cvs2svn_lib.common import DB_OPEN_READ
  85 from cvs2svn_lib.common import warning_prefix
  86 from cvs2svn_lib.common import FatalError
  87 from cvs2svn_lib.common import InternalError
  88 from cvs2svn_lib.common import canonicalize_eol
  89 from cvs2svn_lib.common import is_trunk_revision
  90 from cvs2svn_lib.context import Ctx
  91 from cvs2svn_lib.log import logger
  92 from cvs2svn_lib.artifact_manager import artifact_manager
  93 from cvs2svn_lib.cvs_item import CVSRevisionModification
  94 from cvs2svn_lib.database import Database
  95 from cvs2svn_lib.database import IndexedDatabase
  96 from cvs2svn_lib.rcs_stream import RCSStream
  97 from cvs2svn_lib.rcs_stream import MalformedDeltaException
  98 from cvs2svn_lib.revision_manager import RevisionCollector
  99 from cvs2svn_lib.revision_manager import RevisionReader
 100 from cvs2svn_lib.serializer import MarshalSerializer
 101 from cvs2svn_lib.serializer import CompressingSerializer
 102 from cvs2svn_lib.serializer import PrimedPickleSerializer
 103 from cvs2svn_lib.apple_single_filter import get_maybe_apple_single
 104
 105 import cvs2svn_rcsparse
 106
 107
 108 class TextRecord(object):
 109   """Bookkeeping data for the text of a single CVSRevision."""
 110
 111   __slots__ = ['id', 'refcount']
 112
 113   def __init__(self, id):
 114     # The cvs_rev_id of the revision whose text this is.
 115     self.id = id
 116
 117     # The number of times that the text of this revision will be
 118     # retrieved.
 119     self.refcount = 0
 120
 121   def __getstate__(self):
 122     return (self.id, self.refcount,)
 123
 124   def __setstate__(self, state):
 125     (self.id, self.refcount,) = state
 126
 127   def increment_dependency_refcounts(self, text_record_db):
 128     """Increment the refcounts of any records that this one depends on."""
 129
 130     pass
 131
 132   def decrement_refcount(self, text_record_db):
 133     """Decrement the number of times our text still has to be checked out.
 134
 135     If the reference count goes to zero, call discard()."""
 136
 137     self.refcount -= 1
 138     if self.refcount == 0:
 139       text_record_db.discard(self.id)
 140
 141   def checkout(self, text_record_db):
 142     """Workhorse of the checkout process.
 143
 144     Return the text for this revision, decrement our reference count,
 145     and update the databases depending on whether there will be future
 146     checkouts."""
 147
 148     raise NotImplementedError()
 149
 150   def free(self, text_record_db):
 151     """This instance will never again be checked out; free it.
 152
 153     Also free any associated resources and decrement the refcounts of
 154     any other TextRecords that this one depends on."""
 155
 156     raise NotImplementedError()
 157
 158
 159 class FullTextRecord(TextRecord):
 160   """A record whose revision's fulltext is stored in the delta_db.
 161
 162   These records are used for revisions whose fulltext was determined
 163   by the InternalRevisionCollector during FilterSymbolsPass.  The
 164   fulltext for such a revision is is stored in the delta_db as a
 165   single string."""
 166
 167   __slots__ = []
 168
 169   def __getstate__(self):
 170     return (self.id, self.refcount,)
 171
 172   def __setstate__(self, state):
 173     (self.id, self.refcount,) = state
 174
 175   def checkout(self, text_record_db):
 176     text = text_record_db.delta_db[self.id]
 177     self.decrement_refcount(text_record_db)
 178     return text
 179
 180   def free(self, text_record_db):
 181     del text_record_db.delta_db[self.id]
 182
 183   def __str__(self):
 184     return 'FullTextRecord(%x, %d)' % (self.id, self.refcount,)
 185
 186
 187 class DeltaTextRecord(TextRecord):
 188   """A record whose revision's delta is stored as an RCS delta.
 189
 190   The text of this revision must be derived by applying an RCS delta
 191   to the text of the predecessor revision.  The RCS delta is stored
 192   in the delta_db."""
 193
 194   __slots__ = ['pred_id']
 195
 196   def __init__(self, id, pred_id):
 197     TextRecord.__init__(self, id)
 198
 199     # The cvs_rev_id of the revision relative to which this delta is
 200     # defined.
 201     self.pred_id = pred_id
 202
 203   def __getstate__(self):
 204     return (self.id, self.refcount, self.pred_id,)
 205
 206   def __setstate__(self, state):
 207     (self.id, self.refcount, self.pred_id,) = state
 208
 209   def increment_dependency_refcounts(self, text_record_db):
 210     text_record_db[self.pred_id].refcount += 1
 211
 212   def checkout(self, text_record_db):
 213     base_text = text_record_db[self.pred_id].checkout(text_record_db)
 214     rcs_stream = RCSStream(base_text)
 215     delta_text = text_record_db.delta_db[self.id]
 216     rcs_stream.apply_diff(delta_text)
 217     text = rcs_stream.get_text()
 218     del rcs_stream
 219     self.refcount -= 1
 220     if self.refcount == 0:
 221       # This text will never be needed again; just delete ourselves
 222       # without ever having stored the fulltext to the checkout
 223       # database:
 224       del text_record_db[self.id]
 225     else:
 226       # Store a new CheckedOutTextRecord in place of ourselves:
 227       text_record_db.checkout_db['%x' % self.id] = text
 228       new_text_record = CheckedOutTextRecord(self.id)
 229       new_text_record.refcount = self.refcount
 230       text_record_db.replace(new_text_record)
 231     return text
 232
 233   def free(self, text_record_db):
 234     del text_record_db.delta_db[self.id]
 235     text_record_db[self.pred_id].decrement_refcount(text_record_db)
 236
 237   def __str__(self):
 238     return 'DeltaTextRecord(%x -> %x, %d)' % (
 239         self.pred_id, self.id, self.refcount,
 240         )
 241
 242
 243 class CheckedOutTextRecord(TextRecord):
 244   """A record whose revision's fulltext is stored in the text_record_db.
 245
 246   These records are used for revisions whose fulltext has been
 247   computed already during OutputPass.  The fulltext for such a
 248   revision is stored in the text_record_db as a single string."""
 249
 250   __slots__ = []
 251
 252   def __getstate__(self):
 253     return (self.id, self.refcount,)
 254
 255   def __setstate__(self, state):
 256     (self.id, self.refcount,) = state
 257
 258   def checkout(self, text_record_db):
 259     text = text_record_db.checkout_db['%x' % self.id]
 260     self.decrement_refcount(text_record_db)
 261     return text
 262
 263   def free(self, text_record_db):
 264     del text_record_db.checkout_db['%x' % self.id]
 265
 266   def __str__(self):
 267     return 'CheckedOutTextRecord(%x, %d)' % (self.id, self.refcount,)
 268
 269
 270 class NullDatabase(object):
 271   """A do-nothing database that can be used with TextRecordDatabase.
 272
 273   Use this when you don't actually want to allow anything to be
 274   deleted."""
 275
 276   def __delitem__(self, id):
 277     pass
 278
 279
 280 class TextRecordDatabase:
 281   """Holds the TextRecord instances that are currently live.
 282
 283   During FilterSymbolsPass, files are processed one by one and a new
 284   TextRecordDatabase instance is used for each file.  During
 285   OutputPass, a single TextRecordDatabase instance is used for the
 286   duration of OutputPass; individual records are added and removed
 287   when they are active."""
 288
 289   def __init__(self, delta_db, checkout_db):
 290     # A map { cvs_rev_id -> TextRecord }.
 291     self.text_records = {}
 292
 293     # A database-like object using cvs_rev_ids as keys and containing
 294     # fulltext/deltatext strings as values.  Its __getitem__() method
 295     # is used to retrieve deltas when they are needed, and its
 296     # __delitem__() method is used to delete deltas when they can be
 297     # freed.  The modifiability of the delta database varies from pass
 298     # to pass, so the object stored here varies as well:
 299     #
 300     # FilterSymbolsPass: a NullDatabase.  The delta database cannot be
 301     #     modified during this pass, and we have no need to retrieve
 302     #     deltas, so we just use a dummy object here.
 303     #
 304     # OutputPass: a disabled IndexedDatabase.  During this pass we
 305     #     need to retrieve deltas, but we are not allowed to modify
 306     #     the delta database.  So we use an IndexedDatabase whose
 307     #     __del__() method has been disabled to do nothing.
 308     self.delta_db = delta_db
 309
 310     # A database-like object using cvs_rev_ids as keys and containing
 311     # fulltext strings as values.  This database is only set during
 312     # OutputPass.
 313     self.checkout_db = checkout_db
 314
 315     # If this is set to a list, then the list holds the ids of
 316     # text_records that have to be deleted; when discard() is called,
 317     # it adds the requested id to the list but does not delete it.  If
 318     # this member is set to None, then text_records are deleted
 319     # immediately when discard() is called.
 320     self.deferred_deletes = None
 321
 322   def __getstate__(self):
 323     return (self.text_records.values(),)
 324
 325   def __setstate__(self, state):
 326     (text_records,) = state
 327     self.text_records = {}
 328     for text_record in text_records:
 329       self.add(text_record)
 330     self.delta_db = NullDatabase()
 331     self.checkout_db = NullDatabase()
 332     self.deferred_deletes = None
 333
 334   def add(self, text_record):
 335     """Add TEXT_RECORD to our database.
 336
 337     There must not already be a record with the same id."""
 338
 339     assert not self.text_records.has_key(text_record.id)
 340
 341     self.text_records[text_record.id] = text_record
 342
 343   def __getitem__(self, id):
 344     return self.text_records[id]
 345
 346   def __delitem__(self, id):
 347     """Free the record with the specified ID."""
 348
 349     del self.text_records[id]
 350
 351   def replace(self, text_record):
 352     """Store TEXT_RECORD in place of the existing record with the same id.
 353
 354     Do not do anything with the old record."""
 355
 356     assert self.text_records.has_key(text_record.id)
 357     self.text_records[text_record.id] = text_record
 358
 359   def discard(self, *ids):
 360     """The text records with IDS are no longer needed; discard them.
 361
 362     This involves calling their free() methods and also removing them
 363     from SELF.
 364
 365     If SELF.deferred_deletes is not None, then the ids to be deleted
 366     are added to the list instead of deleted immediately.  This
 367     mechanism is to prevent a stack overflow from the avalanche of
 368     deletes that can result from deleting a long chain of revisions."""
 369
 370     if self.deferred_deletes is None:
 371       # This is an outer-level delete.
 372       self.deferred_deletes = list(ids)
 373       while self.deferred_deletes:
 374         id = self.deferred_deletes.pop()
 375         text_record = self[id]
 376         if text_record.refcount != 0:
 377           raise InternalError(
 378               'TextRecordDatabase.discard(%s) called with refcount = %d'
 379               % (text_record, text_record.refcount,)
 380               )
 381         # This call might cause other text_record ids to be added to
 382         # self.deferred_deletes:
 383         text_record.free(self)
 384         del self[id]
 385       self.deferred_deletes = None
 386     else:
 387       self.deferred_deletes.extend(ids)
 388
 389   def itervalues(self):
 390     return self.text_records.itervalues()
 391
 392   def recompute_refcounts(self, cvs_file_items):
 393     """Recompute the refcounts of the contained TextRecords.
 394
 395     Use CVS_FILE_ITEMS to determine which records will be needed by
 396     cvs2svn."""
 397
 398     # First clear all of the refcounts:
 399     for text_record in self.itervalues():
 400       text_record.refcount = 0
 401
 402     # Now increment the reference count of records that are needed as
 403     # the source of another record's deltas:
 404     for text_record in self.itervalues():
 405       text_record.increment_dependency_refcounts(self.text_records)
 406
 407     # Now increment the reference count of records that will be needed
 408     # by cvs2svn:
 409     for lod_items in cvs_file_items.iter_lods():
 410       for cvs_rev in lod_items.cvs_revisions:
 411         if isinstance(cvs_rev, CVSRevisionModification):
 412           self[cvs_rev.id].refcount += 1
 413
 414   def free_unused(self):
 415     """Free any TextRecords whose reference counts are zero."""
 416
 417     # The deletion of some of these text records might cause others to
 418     # be unused, in which case they will be deleted automatically.
 419     # But since the initially-unused records are not referred to by
 420     # any others, we don't have to be afraid that they will be deleted
 421     # before we get to them.  But it *is* crucial that we create the
 422     # whole unused list before starting the loop.
 423
 424     unused = [
 425         text_record.id
 426         for text_record in self.itervalues()
 427         if text_record.refcount == 0
 428         ]
 429
 430     self.discard(*unused)
 431
 432   def log_leftovers(self):
 433     """If any TextRecords still exist, log them."""
 434
 435     if self.text_records:
 436       logger.warn(
 437           "%s: internal problem: leftover revisions in the checkout cache:"
 438           % warning_prefix)
 439       for text_record in self.itervalues():
 440         logger.warn('    %s' % (text_record,))
 441
 442   def __repr__(self):
 443     """Debugging output of the current contents of the TextRecordDatabase."""
 444
 445     retval = ['TextRecordDatabase:']
 446     for text_record in self.itervalues():
 447       retval.append('    %s' % (text_record,))
 448     return '\n'.join(retval)
 449
 450
class _Sink(cvs2svn_rcsparse.Sink):
  """Parser sink that turns the revisions of one RCS file into TextRecords.

  An instance is handed to cvs2svn_rcsparse.parse() for a single file.
  For each revision text it receives, it creates a FullTextRecord or
  DeltaTextRecord and passes the record and its text to
  REVISION_COLLECTOR._writeout().  Trunk deltas are inverted on the
  way so that every stored delta applies forwards in time."""

  def __init__(self, revision_collector, cvs_file_items):
    # The object whose _writeout() method receives each record and
    # its text.
    self.revision_collector = revision_collector

    # Used to look up cvs_rev_ids (via original_ids) and the file name
    # for error messages.
    self.cvs_file_items = cvs_file_items

    # A map {rev : base_rev} indicating that the text for rev is
    # stored in CVS as a delta relative to base_rev.
    self.base_revisions = {}

    # The revision that is stored with its fulltext in CVS (usually
    # the newest revision on trunk):
    self.head_revision = None

    # The first logical revision on trunk (usually '1.1'):
    self.revision_1_1 = None

    # Keep track of the revisions whose revision info has been seen so
    # far (to avoid repeated revision info blocks):
    self.revisions_seen = set()

  def set_head_revision(self, revision):
    """Remember REVISION as the head revision of the file."""

    self.head_revision = revision

  def define_revision(
        self, revision, timestamp, author, state, branches, next
        ):
    """Record how REVISION is linked to NEXT and to its BRANCHES.

    TIMESTAMP, AUTHOR and STATE are not used here."""

    if next:
      self.base_revisions[next] = revision
    else:
      # A trunk revision without a NEXT revision is the first logical
      # revision on trunk.
      if is_trunk_revision(revision):
        self.revision_1_1 = revision

    for branch in branches:
      self.base_revisions[branch] = revision

  def set_revision_info(self, revision, log, text):
    """Create and write out the TextRecord(s) implied by REVISION's TEXT.

    LOG is not used.  A repeated info block for a revision that has
    already been seen is ignored.  If a trunk delta is malformed, log
    an error and raise RuntimeError."""

    if revision in self.revisions_seen:
      # One common form of CVS repository corruption is that the
      # Deltatext block for revision 1.1 appears twice.  CollectData
      # has already warned about this problem; here we can just ignore
      # it.
      return
    else:
      self.revisions_seen.add(revision)

    cvs_rev_id = self.cvs_file_items.original_ids[revision]
    if is_trunk_revision(revision):
      # On trunk, revisions are encountered in reverse order (1.<N>
      # ... 1.1) and deltas are inverted.  The first text that we see
      # is the fulltext for the HEAD revision.  After that, the text
      # corresponding to revision 1.N is the delta (1.<N+1> ->
      # 1.<N>)).  We have to invert the deltas here so that we can
      # read the revisions out in dependency order; that is, for
      # revision 1.1 we want the fulltext, and for revision 1.<N> we
      # want the delta (1.<N-1> -> 1.<N>).  This means that we can't
      # compute the delta for a revision until we see its logical
      # parent.  When we finally see revision 1.1 (which is recognized
      # because it doesn't have a parent), we can record the diff (1.1
      # -> 1.2) for revision 1.2, and also the fulltext for 1.1.

      if revision == self.head_revision:
        # This is HEAD, as fulltext.  Initialize the RCSStream so
        # that we can compute deltas backwards in time.
        self._rcs_stream = RCSStream(text)
        self._rcs_stream_revision = revision
      else:
        # Any other trunk revision is a backward delta.  Apply the
        # delta to the RCSStream to mutate it to the contents of this
        # revision, and also to get the reverse delta, which we store
        # as the forward delta of our child revision.
        try:
          text = self._rcs_stream.invert_diff(text)
        except MalformedDeltaException, e:
          logger.error(
              'Malformed RCS delta in %s, revision %s: %s'
              % (self.cvs_file_items.cvs_file.filename, revision, e)
              )
          raise RuntimeError()
        # The record belongs to the child revision (the one the stream
        # held before this delta was applied); its predecessor is the
        # revision that we are looking at now.
        text_record = DeltaTextRecord(
            self.cvs_file_items.original_ids[self._rcs_stream_revision],
            cvs_rev_id
            )
        self.revision_collector._writeout(text_record, text)
        self._rcs_stream_revision = revision

      if revision == self.revision_1_1:
        # This is revision 1.1.  Write its fulltext:
        text_record = FullTextRecord(cvs_rev_id)
        self.revision_collector._writeout(
            text_record, self._rcs_stream.get_text()
            )

        # There will be no more trunk revisions delivered, so free the
        # RCSStream.
        del self._rcs_stream
        del self._rcs_stream_revision

    else:
      # On branches, revisions are encountered in logical order
      # (<BRANCH>.1 ... <BRANCH>.<N>) and the text corresponding to
      # revision <BRANCH>.<N> is the forward delta (<BRANCH>.<N-1> ->
      # <BRANCH>.<N>).  That's what we need, so just store it.

      # FIXME: It would be nice to avoid writing out branch deltas
      # when --trunk-only.  (They will be deleted when finish_file()
      # is called, but if the delta db is in an IndexedDatabase the
      # deletions won't actually recover any disk space.)
      text_record = DeltaTextRecord(
          cvs_rev_id,
          self.cvs_file_items.original_ids[self.base_revisions[revision]]
          )
      self.revision_collector._writeout(text_record, text)

    return None
 565
 566
 567 class InternalRevisionCollector(RevisionCollector):
 568   """The RevisionCollector used by InternalRevisionReader."""
 569
 570   def __init__(self, compress):
 571     RevisionCollector.__init__(self)
 572     self._compress = compress
 573
 574   def register_artifacts(self, which_pass):
 575     artifact_manager.register_temp_file(
 576         config.RCS_DELTAS_INDEX_TABLE, which_pass
 577         )
 578     artifact_manager.register_temp_file(config.RCS_DELTAS_STORE, which_pass)
 579     artifact_manager.register_temp_file(
 580         config.RCS_TREES_INDEX_TABLE, which_pass
 581         )
 582     artifact_manager.register_temp_file(config.RCS_TREES_STORE, which_pass)
 583
 584   def start(self):
 585     serializer = MarshalSerializer()
 586     if self._compress:
 587       serializer = CompressingSerializer(serializer)
 588     self._delta_db = IndexedDatabase(
 589         artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
 590         artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
 591         DB_OPEN_NEW, serializer,
 592         )
 593     primer = (FullTextRecord, DeltaTextRecord)
 594     self._rcs_trees = IndexedDatabase(
 595         artifact_manager.get_temp_file(config.RCS_TREES_STORE),
 596         artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE),
 597         DB_OPEN_NEW, PrimedPickleSerializer(primer),
 598         )
 599
 600   def _writeout(self, text_record, text):
 601     self.text_record_db.add(text_record)
 602     self._delta_db[text_record.id] = text
 603
 604   def process_file(self, cvs_file_items):
 605     """Read revision information for the file described by CVS_FILE_ITEMS.
 606
 607     Compute the text record refcounts, discard any records that are
 608     unneeded, and store the text records for the file to the
 609     _rcs_trees database."""
 610
 611     # A map from cvs_rev_id to TextRecord instance:
 612     self.text_record_db = TextRecordDatabase(self._delta_db, NullDatabase())
 613
 614     cvs2svn_rcsparse.parse(
 615         open(cvs_file_items.cvs_file.filename, 'rb'),
 616         _Sink(self, cvs_file_items),
 617         )
 618
 619     self.text_record_db.recompute_refcounts(cvs_file_items)
 620     self.text_record_db.free_unused()
 621     self._rcs_trees[cvs_file_items.cvs_file.id] = self.text_record_db
 622     del self.text_record_db
 623
 624   def finish(self):
 625     self._delta_db.close()
 626     self._rcs_trees.close()
 627
 628
 629 class _KeywordExpander:
 630   """A class whose instances provide substitutions for CVS keywords.
 631
 632   This class is used via its __call__() method, which should be called
 633   with a match object representing a match for a CVS keyword string.
 634   The method returns the replacement for the matched text.
 635
 636   The __call__() method works by calling the method with the same name
 637   as that of the CVS keyword (converted to lower case).
 638
 639   Instances of this class can be passed as the REPL argument to
 640   re.sub()."""
 641
 642   date_fmt_old = "%Y/%m/%d %H:%M:%S"    # CVS 1.11, rcs
 643   date_fmt_new = "%Y-%m-%d %H:%M:%S"    # CVS 1.12
 644
 645   date_fmt = date_fmt_new
 646
 647   @classmethod
 648   def use_old_date_format(klass):
 649       """Class method to ensure exact compatibility with CVS 1.11
 650       output.  Use this if you want to verify your conversion and you're
 651       using CVS 1.11."""
 652       klass.date_fmt = klass.date_fmt_old
 653
 654   def __init__(self, cvs_rev):
 655     self.cvs_rev = cvs_rev
 656
 657   def __call__(self, match):
 658     return '$%s: %s $' % (
 659         match.group(1), getattr(self, match.group(1).lower())(),
 660         )
 661
 662   def author(self):
 663     return Ctx()._metadata_db[self.cvs_rev.metadata_id].original_author
 664
 665   def date(self):
 666     return time.strftime(self.date_fmt, time.gmtime(self.cvs_rev.timestamp))
 667
 668   def header(self):
 669     return '%s %s %s %s Exp' % (
 670         self.source(), self.cvs_rev.rev, self.date(), self.author(),
 671         )
 672
 673   def id(self):
 674     return '%s %s %s %s Exp' % (
 675         self.rcsfile(), self.cvs_rev.rev, self.date(), self.author(),
 676         )
 677
 678   def locker(self):
 679     # Handle kvl like kv, as a converted repo is supposed to have no
 680     # locks.
 681     return ''
 682
 683   def log(self):
 684     # Would need some special handling.
 685     return 'not supported by cvs2svn'
 686
 687   def name(self):
 688     # Cannot work, as just creating a new symbol does not check out
 689     # the revision again.
 690     return 'not supported by cvs2svn'
 691
 692   def rcsfile(self):
 693     return self.cvs_rev.cvs_file.basename + ",v"
 694
 695   def revision(self):
 696     return self.cvs_rev.rev
 697
 698   def source(self):
 699     project = self.cvs_rev.cvs_file.project
 700     return '%s/%s%s,v' % (
 701         project.cvs_repository_root,
 702         project.cvs_module,
 703         self.cvs_rev.cvs_file.cvs_path,
 704         )
 705
 706   def state(self):
 707     # We check out only live revisions.
 708     return 'Exp'
 709
 710
 711 class InternalRevisionReader(RevisionReader):
 712   """A RevisionReader that reads the contents from an own delta store."""
 713
 714   _kws = 'Author|Date|Header|Id|Locker|Log|Name|RCSfile|Revision|Source|State'
 715   _kw_re = re.compile(r'\$(' + _kws + r'):[^$\n]*\$')
 716   _kwo_re = re.compile(r'\$(' + _kws + r')(:[^$\n]*)?\$')
 717
 718   def __init__(self, compress):
 719     self._compress = compress
 720
 721   def register_artifacts(self, which_pass):
 722     artifact_manager.register_temp_file(config.CVS_CHECKOUT_DB, which_pass)
 723     artifact_manager.register_temp_file_needed(
 724         config.RCS_DELTAS_STORE, which_pass
 725         )
 726     artifact_manager.register_temp_file_needed(
 727         config.RCS_DELTAS_INDEX_TABLE, which_pass
 728         )
 729     artifact_manager.register_temp_file_needed(
 730         config.RCS_TREES_STORE, which_pass
 731         )
 732     artifact_manager.register_temp_file_needed(
 733         config.RCS_TREES_INDEX_TABLE, which_pass
 734         )
 735
 736   def start(self):
 737     self._delta_db = IndexedDatabase(
 738         artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
 739         artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
 740         DB_OPEN_READ,
 741         )
 742     self._delta_db.__delitem__ = lambda id: None
 743     self._tree_db = IndexedDatabase(
 744         artifact_manager.get_temp_file(config.RCS_TREES_STORE),
 745         artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE),
 746         DB_OPEN_READ,
 747         )
 748     serializer = MarshalSerializer()
 749     if self._compress:
 750       serializer = CompressingSerializer(serializer)
 751     self._co_db = Database(
 752         artifact_manager.get_temp_file(config.CVS_CHECKOUT_DB),
 753         DB_OPEN_NEW, serializer,
 754         )
 755
 756     # The set of CVSFile instances whose TextRecords have already been
 757     # read:
 758     self._loaded_files = set()
 759
 760     # A map { CVSFILE : _FileTree } for files that currently have live
 761     # revisions:
 762     self._text_record_db = TextRecordDatabase(self._delta_db, self._co_db)
 763
 764   def _get_text_record(self, cvs_rev):
 765     """Return the TextRecord instance for CVS_REV.
 766
 767     If the TextRecords for CVS_REV.cvs_file haven't been loaded yet,
 768     do so now."""
 769
 770     if cvs_rev.cvs_file not in self._loaded_files:
 771       for text_record in self._tree_db[cvs_rev.cvs_file.id].itervalues():
 772         self._text_record_db.add(text_record)
 773       self._loaded_files.add(cvs_rev.cvs_file)
 774
 775     return self._text_record_db[cvs_rev.id]
 776
 777   def get_content(self, cvs_rev):
 778     """Check out the text for revision C_REV from the repository.
 779
 780     Return the text.  If CVS_REV has a property _keyword_handling, use
 781     it to determine how to handle RCS keywords in the output:
 782
 783         'collapsed' -- collapse keywords
 784
 785         'expanded' -- expand keywords
 786
 787         'untouched' -- output keywords in the form they are found in
 788             the RCS file
 789
 790     Note that $Log$ never actually generates a log (which makes test
 791     'requires_cvs()' fail).
 792
 793     Revisions may be requested in any order, but if they are not
 794     requested in dependency order the checkout database will become
 795     very large.  Revisions may be skipped.  Each revision may be
 796     requested only once."""
 797
 798     try:
 799       text = self._get_text_record(cvs_rev).checkout(self._text_record_db)
 800     except MalformedDeltaException, (msg):
 801       raise FatalError('Malformed RCS delta in %s, revision %s: %s'
 802                        % (cvs_rev.cvs_file.get_filename(), cvs_rev.rev, msg))
 803
 804     keyword_handling = cvs_rev.get_property('_keyword_handling')
 805
 806     if keyword_handling == 'untouched':
 807       # Leave keywords in the form that they were checked in.
 808       pass
 809     elif keyword_handling == 'collapsed':
 810       text = self._kw_re.sub(r'$\1$', text)
 811     elif keyword_handling == 'expanded':
 812       text = self._kwo_re.sub(_KeywordExpander(cvs_rev), text)
 813     else:
 814       raise FatalError(
 815           'Undefined _keyword_handling property (%r) for %s'
 816           % (keyword_handling, cvs_rev,)
 817           )
 818
 819     if Ctx().decode_apple_single:
 820       # Insert a filter to decode any files that are in AppleSingle
 821       # format:
 822       text = get_maybe_apple_single(text)
 823
 824     eol_fix = cvs_rev.get_property('_eol_fix')
 825     if eol_fix:
 826       text = canonicalize_eol(text, eol_fix)
 827
 828     return text
 829
 830   def finish(self):
 831     self._text_record_db.log_leftovers()
 832
 833     del self._text_record_db
 834     self._delta_db.close()
 835     self._tree_db.close()
 836     self._co_db.close()
 837