cvs2svn_lib/checkout_internal.py

   1 # (Be in -*- python -*- mode.)
   2 #
   3 # ====================================================================
   4 # Copyright (c) 2007-2009 CollabNet.  All rights reserved.
   5 #
   6 # This software is licensed as described in the file COPYING, which
   7 # you should have received as part of this distribution.  The terms
   8 # are also available at http://subversion.tigris.org/license-1.html.
   9 # If newer versions of this license are posted there, you may use a
  10 # newer version instead, at your option.
  11 #
  12 # This software consists of voluntary contributions made by many
  13 # individuals.  For exact contribution history, see the revision
  14 # history and logs, available at http://cvs2svn.tigris.org/.
  15 # ====================================================================
  16
  17 """This module contains classes that implement the --use-internal-co option.
  18
  19 The idea is to patch up the revisions' contents incrementally, thus
  20 avoiding the huge number of process spawns and the O(n^2) overhead of
  21 using 'co' and 'cvs'.
  22
  23 InternalRevisionCollector saves the RCS deltas and RCS revision trees
  24 to databases.  Notably, deltas from the trunk need to be reversed, as
  25 CVS stores them so they apply from HEAD backwards.
  26
  27 InternalRevisionReader produces the revisions' contents on demand.  To
  28 generate the text for a typical revision, we need the revision's delta
  29 text plus the fulltext of the previous revision.  Therefore, we
  30 maintain a checkout database containing a copy of the fulltext of any
  31 revision for which subsequent revisions still need to be retrieved.
  32 It is crucial to remove text from this database as soon as it is no
  33 longer needed, to prevent it from growing enormous.
  34
  35 There are two reasons that the text from a revision can be needed: (1)
  36 because the revision itself still needs to be output to a dumpfile;
  37 (2) because another revision needs it as the base of its delta.  We
  38 maintain a reference count for each revision, which includes *both*
  39 possibilities.  The first time a revision's text is needed, it is
  40 generated by applying the revision's deltatext to the previous
  41 revision's fulltext, and the resulting fulltext is stored in the
  42 checkout database.  Each time a revision's fulltext is retrieved, its
  43 reference count is decremented.  When the reference count goes to
  44 zero, then the fulltext is deleted from the checkout database.
  45
  46 The administrative data for managing this consists of one TextRecord
  47 entry for each revision.  Each TextRecord has an id, which is the same
  48 id as used for the corresponding CVSRevision instance.  It also
  49 maintains a count of the times it is expected to be retrieved.
  50 TextRecords come in several varieties:
  51
  52 FullTextRecord -- Used for revisions whose fulltext is derived
  53     directly from the RCS file by the InternalRevisionCollector (i.e.,
  54     typically revision 1.1 of each file).
  55
  56 DeltaTextRecord -- Used for revisions that are defined via a delta
  57     relative to some other TextRecord.  These records record the id of
  58     the TextRecord that holds the base text against which the delta is
  59     defined.  When the text for a DeltaTextRecord is retrieved, the
  60     DeltaTextRecord instance is deleted and a CheckedOutTextRecord
  61     instance is created to take its place.
  62
  63 CheckedOutTextRecord -- Used during OutputPass for a revision that
  64     started out as a DeltaTextRecord, but has already been retrieved
  65     (and therefore its fulltext is stored in the checkout database).
  66
  67 While a file is being processed during FilterSymbolsPass, the fulltext
  68 and deltas are stored to the delta database, and TextRecord instances
  69 are created to keep track of things.  The reference counts are all
  70 initialized: each record referred to by a delta has its refcount
  71 incremented, and each record that corresponds to a non-delete
  72 CVSRevision is incremented.  After that, any records with refcount==0
  73 are removed.  When one record is removed, that can cause another
  74 record's reference count to go to zero and be removed too,
  75 recursively.  When a TextRecord is deleted at this stage, its
  76 deltatext is also deleted from the delta database."""
  77
  78
  79 from cStringIO import StringIO
  80 import re
  81 import time
  82
  83 from cvs2svn_lib import config
  84 from cvs2svn_lib.common import DB_OPEN_NEW
  85 from cvs2svn_lib.common import DB_OPEN_READ
  86 from cvs2svn_lib.common import warning_prefix
  87 from cvs2svn_lib.common import FatalError
  88 from cvs2svn_lib.common import InternalError
  89 from cvs2svn_lib.common import canonicalize_eol
  90 from cvs2svn_lib.common import is_trunk_revision
  91 from cvs2svn_lib.context import Ctx
  92 from cvs2svn_lib.log import Log
  93 from cvs2svn_lib.artifact_manager import artifact_manager
  94 from cvs2svn_lib.symbol import Trunk
  95 from cvs2svn_lib.cvs_item import CVSRevisionModification
  96 from cvs2svn_lib.database import Database
  97 from cvs2svn_lib.database import IndexedDatabase
  98 from cvs2svn_lib.rcs_stream import RCSStream
  99 from cvs2svn_lib.rcs_stream import MalformedDeltaException
 100 from cvs2svn_lib.revision_manager import RevisionCollector
 101 from cvs2svn_lib.revision_manager import RevisionReader
 102 from cvs2svn_lib.serializer import MarshalSerializer
 103 from cvs2svn_lib.serializer import CompressingSerializer
 104 from cvs2svn_lib.serializer import PrimedPickleSerializer
 105 from cvs2svn_lib.apple_single_filter import get_maybe_apple_single
 106
 107 import cvs2svn_rcsparse
 108
 109
 110 class TextRecord(object):
 111   """Bookkeeping data for the text of a single CVSRevision."""
 112
 113   __slots__ = ['id', 'refcount']
 114
 115   def __init__(self, id):
 116     # The cvs_rev_id of the revision whose text this is.
 117     self.id = id
 118
 119     # The number of times that the text of this revision will be
 120     # retrieved.
 121     self.refcount = 0
 122
 123   def __getstate__(self):
 124     return (self.id, self.refcount,)
 125
 126   def __setstate__(self, state):
 127     (self.id, self.refcount,) = state
 128
 129   def increment_dependency_refcounts(self, text_record_db):
 130     """Increment the refcounts of any records that this one depends on."""
 131
 132     pass
 133
 134   def decrement_refcount(self, text_record_db):
 135     """Decrement the number of times our text still has to be checked out.
 136
 137     If the reference count goes to zero, call discard()."""
 138
 139     self.refcount -= 1
 140     if self.refcount == 0:
 141       text_record_db.discard(self.id)
 142
 143   def checkout(self, text_record_db):
 144     """Workhorse of the checkout process.
 145
 146     Return the text for this revision, decrement our reference count,
 147     and update the databases depending on whether there will be future
 148     checkouts."""
 149
 150     raise NotImplementedError()
 151
 152   def free(self, text_record_db):
 153     """This instance will never again be checked out; free it.
 154
 155     Also free any associated resources and decrement the refcounts of
 156     any other TextRecords that this one depends on."""
 157
 158     raise NotImplementedError()
 159
 160
 161 class FullTextRecord(TextRecord):
 162   """A record whose revision's fulltext is stored in the delta_db.
 163
 164   These records are used for revisions whose fulltext was determined
 165   by the InternalRevisionCollector during FilterSymbolsPass.  The
 166   fulltext for such a revision is is stored in the delta_db as a
 167   single string."""
 168
 169   __slots__ = []
 170
 171   def __getstate__(self):
 172     return (self.id, self.refcount,)
 173
 174   def __setstate__(self, state):
 175     (self.id, self.refcount,) = state
 176
 177   def checkout(self, text_record_db):
 178     text = text_record_db.delta_db[self.id]
 179     self.decrement_refcount(text_record_db)
 180     return text
 181
 182   def free(self, text_record_db):
 183     del text_record_db.delta_db[self.id]
 184
 185   def __str__(self):
 186     return 'FullTextRecord(%x, %d)' % (self.id, self.refcount,)
 187
 188
 189 class DeltaTextRecord(TextRecord):
 190   """A record whose revision's delta is stored as an RCS delta.
 191
 192   The text of this revision must be derived by applying an RCS delta
 193   to the text of the predecessor revision.  The RCS delta is stored
 194   in the delta_db."""
 195
 196   __slots__ = ['pred_id']
 197
 198   def __init__(self, id, pred_id):
 199     TextRecord.__init__(self, id)
 200
 201     # The cvs_rev_id of the revision relative to which this delta is
 202     # defined.
 203     self.pred_id = pred_id
 204
 205   def __getstate__(self):
 206     return (self.id, self.refcount, self.pred_id,)
 207
 208   def __setstate__(self, state):
 209     (self.id, self.refcount, self.pred_id,) = state
 210
 211   def increment_dependency_refcounts(self, text_record_db):
 212     text_record_db[self.pred_id].refcount += 1
 213
 214   def checkout(self, text_record_db):
 215     base_text = text_record_db[self.pred_id].checkout(text_record_db)
 216     rcs_stream = RCSStream(base_text)
 217     delta_text = text_record_db.delta_db[self.id]
 218     rcs_stream.apply_diff(delta_text)
 219     text = rcs_stream.get_text()
 220     del rcs_stream
 221     self.refcount -= 1
 222     if self.refcount == 0:
 223       # This text will never be needed again; just delete ourselves
 224       # without ever having stored the fulltext to the checkout
 225       # database:
 226       del text_record_db[self.id]
 227     else:
 228       # Store a new CheckedOutTextRecord in place of ourselves:
 229       text_record_db.checkout_db['%x' % self.id] = text
 230       new_text_record = CheckedOutTextRecord(self.id)
 231       new_text_record.refcount = self.refcount
 232       text_record_db.replace(new_text_record)
 233     return text
 234
 235   def free(self, text_record_db):
 236     del text_record_db.delta_db[self.id]
 237     text_record_db[self.pred_id].decrement_refcount(text_record_db)
 238
 239   def __str__(self):
 240     return 'DeltaTextRecord(%x -> %x, %d)' % (
 241         self.pred_id, self.id, self.refcount,
 242         )
 243
 244
 245 class CheckedOutTextRecord(TextRecord):
 246   """A record whose revision's fulltext is stored in the text_record_db.
 247
 248   These records are used for revisions whose fulltext has been
 249   computed already during OutputPass.  The fulltext for such a
 250   revision is stored in the text_record_db as a single string."""
 251
 252   __slots__ = []
 253
 254   def __getstate__(self):
 255     return (self.id, self.refcount,)
 256
 257   def __setstate__(self, state):
 258     (self.id, self.refcount,) = state
 259
 260   def checkout(self, text_record_db):
 261     text = text_record_db.checkout_db['%x' % self.id]
 262     self.decrement_refcount(text_record_db)
 263     return text
 264
 265   def free(self, text_record_db):
 266     del text_record_db.checkout_db['%x' % self.id]
 267
 268   def __str__(self):
 269     return 'CheckedOutTextRecord(%x, %d)' % (self.id, self.refcount,)
 270
 271
 272 class NullDatabase(object):
 273   """A do-nothing database that can be used with TextRecordDatabase.
 274
 275   Use this when you don't actually want to allow anything to be
 276   deleted."""
 277
 278   def __delitem__(self, id):
 279     pass
 280
 281
 282 class TextRecordDatabase:
 283   """Holds the TextRecord instances that are currently live.
 284
 285   During FilterSymbolsPass, files are processed one by one and a new
 286   TextRecordDatabase instance is used for each file.  During
 287   OutputPass, a single TextRecordDatabase instance is used for the
 288   duration of OutputPass; individual records are added and removed
 289   when they are active."""
 290
 291   def __init__(self, delta_db, checkout_db):
 292     # A map { cvs_rev_id -> TextRecord }.
 293     self.text_records = {}
 294
 295     # A database-like object using cvs_rev_ids as keys and containing
 296     # fulltext/deltatext strings as values.  Its __getitem__() method
 297     # is used to retrieve deltas when they are needed, and its
 298     # __delitem__() method is used to delete deltas when they can be
 299     # freed.  The modifiability of the delta database varies from pass
 300     # to pass, so the object stored here varies as well:
 301     #
 302     # FilterSymbolsPass: a NullDatabase.  The delta database cannot be
 303     #     modified during this pass, and we have no need to retrieve
 304     #     deltas, so we just use a dummy object here.
 305     #
 306     # OutputPass: a disabled IndexedDatabase.  During this pass we
 307     #     need to retrieve deltas, but we are not allowed to modify
 308     #     the delta database.  So we use an IndexedDatabase whose
 309     #     __del__() method has been disabled to do nothing.
 310     self.delta_db = delta_db
 311
 312     # A database-like object using cvs_rev_ids as keys and containing
 313     # fulltext strings as values.  This database is only set during
 314     # OutputPass.
 315     self.checkout_db = checkout_db
 316
 317     # If this is set to a list, then the list holds the ids of
 318     # text_records that have to be deleted; when discard() is called,
 319     # it adds the requested id to the list but does not delete it.  If
 320     # this member is set to None, then text_records are deleted
 321     # immediately when discard() is called.
 322     self.deferred_deletes = None
 323
 324   def __getstate__(self):
 325     return (self.text_records.values(),)
 326
 327   def __setstate__(self, state):
 328     (text_records,) = state
 329     self.text_records = {}
 330     for text_record in text_records:
 331       self.add(text_record)
 332     self.delta_db = NullDatabase()
 333     self.checkout_db = NullDatabase()
 334     self.deferred_deletes = None
 335
 336   def add(self, text_record):
 337     """Add TEXT_RECORD to our database.
 338
 339     There must not already be a record with the same id."""
 340
 341     assert not self.text_records.has_key(text_record.id)
 342
 343     self.text_records[text_record.id] = text_record
 344
 345   def __getitem__(self, id):
 346     return self.text_records[id]
 347
 348   def __delitem__(self, id):
 349     """Free the record with the specified ID."""
 350
 351     del self.text_records[id]
 352
 353   def replace(self, text_record):
 354     """Store TEXT_RECORD in place of the existing record with the same id.
 355
 356     Do not do anything with the old record."""
 357
 358     assert self.text_records.has_key(text_record.id)
 359     self.text_records[text_record.id] = text_record
 360
 361   def discard(self, *ids):
 362     """The text records with IDS are no longer needed; discard them.
 363
 364     This involves calling their free() methods and also removing them
 365     from SELF.
 366
 367     If SELF.deferred_deletes is not None, then the ids to be deleted
 368     are added to the list instead of deleted immediately.  This
 369     mechanism is to prevent a stack overflow from the avalanche of
 370     deletes that can result from deleting a long chain of revisions."""
 371
 372     if self.deferred_deletes is None:
 373       # This is an outer-level delete.
 374       self.deferred_deletes = list(ids)
 375       while self.deferred_deletes:
 376         id = self.deferred_deletes.pop()
 377         text_record = self[id]
 378         if text_record.refcount != 0:
 379           raise InternalError(
 380               'TextRecordDatabase.discard(%s) called with refcount = %d'
 381               % (text_record, text_record.refcount,)
 382               )
 383         # This call might cause other text_record ids to be added to
 384         # self.deferred_deletes:
 385         text_record.free(self)
 386         del self[id]
 387       self.deferred_deletes = None
 388     else:
 389       self.deferred_deletes.extend(ids)
 390
 391   def itervalues(self):
 392     return self.text_records.itervalues()
 393
 394   def recompute_refcounts(self, cvs_file_items):
 395     """Recompute the refcounts of the contained TextRecords.
 396
 397     Use CVS_FILE_ITEMS to determine which records will be needed by
 398     cvs2svn."""
 399
 400     # First clear all of the refcounts:
 401     for text_record in self.itervalues():
 402       text_record.refcount = 0
 403
 404     # Now increment the reference count of records that are needed as
 405     # the source of another record's deltas:
 406     for text_record in self.itervalues():
 407       text_record.increment_dependency_refcounts(self.text_records)
 408
 409     # Now increment the reference count of records that will be needed
 410     # by cvs2svn:
 411     for lod_items in cvs_file_items.iter_lods():
 412       for cvs_rev in lod_items.cvs_revisions:
 413         if isinstance(cvs_rev, CVSRevisionModification):
 414           self[cvs_rev.id].refcount += 1
 415
 416   def free_unused(self):
 417     """Free any TextRecords whose reference counts are zero."""
 418
 419     # The deletion of some of these text records might cause others to
 420     # be unused, in which case they will be deleted automatically.
 421     # But since the initially-unused records are not referred to by
 422     # any others, we don't have to be afraid that they will be deleted
 423     # before we get to them.  But it *is* crucial that we create the
 424     # whole unused list before starting the loop.
 425
 426     unused = [
 427         text_record.id
 428         for text_record in self.itervalues()
 429         if text_record.refcount == 0
 430         ]
 431
 432     self.discard(*unused)
 433
 434   def log_leftovers(self):
 435     """If any TextRecords still exist, log them."""
 436
 437     if self.text_records:
 438       Log().warn(
 439           "%s: internal problem: leftover revisions in the checkout cache:"
 440           % warning_prefix)
 441       for text_record in self.itervalues():
 442         Log().warn('    %s' % (text_record,))
 443
 444   def __repr__(self):
 445     """Debugging output of the current contents of the TextRecordDatabase."""
 446
 447     retval = ['TextRecordDatabase:']
 448     for text_record in self.itervalues():
 449       retval.append('    %s' % (text_record,))
 450     return '\n'.join(retval)
 451
 452
 453 class _Sink(cvs2svn_rcsparse.Sink):
 454   def __init__(self, revision_collector, cvs_file_items):
 455     self.revision_collector = revision_collector
 456     self.cvs_file_items = cvs_file_items
 457
 458     # A map {rev : base_rev} indicating that the text for rev is
 459     # stored in CVS as a delta relative to base_rev.
 460     self.base_revisions = {}
 461
 462     # The revision that is stored with its fulltext in CVS (usually
 463     # the oldest revision on trunk):
 464     self.head_revision = None
 465
 466     # The first logical revision on trunk (usually '1.1'):
 467     self.revision_1_1 = None
 468
 469     # Keep track of the revisions whose revision info has been seen so
 470     # far (to avoid repeated revision info blocks):
 471     self.revisions_seen = set()
 472
 473   def set_head_revision(self, revision):
 474     self.head_revision = revision
 475
 476   def define_revision(
 477         self, revision, timestamp, author, state, branches, next
 478         ):
 479     if next:
 480       self.base_revisions[next] = revision
 481     else:
 482       if is_trunk_revision(revision):
 483         self.revision_1_1 = revision
 484
 485     for branch in branches:
 486       self.base_revisions[branch] = revision
 487
 488   def set_revision_info(self, revision, log, text):
 489     if revision in self.revisions_seen:
 490       # One common form of CVS repository corruption is that the
 491       # Deltatext block for revision 1.1 appears twice.  CollectData
 492       # has already warned about this problem; here we can just ignore
 493       # it.
 494       return
 495     else:
 496       self.revisions_seen.add(revision)
 497
 498     cvs_rev_id = self.cvs_file_items.original_ids[revision]
 499     if is_trunk_revision(revision):
 500       # On trunk, revisions are encountered in reverse order (1.<N>
 501       # ... 1.1) and deltas are inverted.  The first text that we see
 502       # is the fulltext for the HEAD revision.  After that, the text
 503       # corresponding to revision 1.N is the delta (1.<N+1> ->
 504       # 1.<N>)).  We have to invert the deltas here so that we can
 505       # read the revisions out in dependency order; that is, for
 506       # revision 1.1 we want the fulltext, and for revision 1.<N> we
 507       # want the delta (1.<N-1> -> 1.<N>).  This means that we can't
 508       # compute the delta for a revision until we see its logical
 509       # parent.  When we finally see revision 1.1 (which is recognized
 510       # because it doesn't have a parent), we can record the diff (1.1
 511       # -> 1.2) for revision 1.2, and also the fulltext for 1.1.
 512
 513       if revision == self.head_revision:
 514         # This is HEAD, as fulltext.  Initialize the RCSStream so
 515         # that we can compute deltas backwards in time.
 516         self._rcs_stream = RCSStream(text)
 517         self._rcs_stream_revision = revision
 518       else:
 519         # Any other trunk revision is a backward delta.  Apply the
 520         # delta to the RCSStream to mutate it to the contents of this
 521         # revision, and also to get the reverse delta, which we store
 522         # as the forward delta of our child revision.
 523         try:
 524           text = self._rcs_stream.invert_diff(text)
 525         except MalformedDeltaException, e:
 526           Log().error(
 527               'Malformed RCS delta in %s, revision %s: %s'
 528               % (self.cvs_file_items.cvs_file.filename, revision, e)
 529               )
 530           raise RuntimeError()
 531         text_record = DeltaTextRecord(
 532             self.cvs_file_items.original_ids[self._rcs_stream_revision],
 533             cvs_rev_id
 534             )
 535         self.revision_collector._writeout(text_record, text)
 536         self._rcs_stream_revision = revision
 537
 538       if revision == self.revision_1_1:
 539         # This is revision 1.1.  Write its fulltext:
 540         text_record = FullTextRecord(cvs_rev_id)
 541         self.revision_collector._writeout(
 542             text_record, self._rcs_stream.get_text()
 543             )
 544
 545         # There will be no more trunk revisions delivered, so free the
 546         # RCSStream.
 547         del self._rcs_stream
 548         del self._rcs_stream_revision
 549
 550     else:
 551       # On branches, revisions are encountered in logical order
 552       # (<BRANCH>.1 ... <BRANCH>.<N>) and the text corresponding to
 553       # revision <BRANCH>.<N> is the forward delta (<BRANCH>.<N-1> ->
 554       # <BRANCH>.<N>).  That's what we need, so just store it.
 555
 556       # FIXME: It would be nice to avoid writing out branch deltas
 557       # when --trunk-only.  (They will be deleted when finish_file()
 558       # is called, but if the delta db is in an IndexedDatabase the
 559       # deletions won't actually recover any disk space.)
 560       text_record = DeltaTextRecord(
 561           cvs_rev_id,
 562           self.cvs_file_items.original_ids[self.base_revisions[revision]]
 563           )
 564       self.revision_collector._writeout(text_record, text)
 565
 566     return None
 567
 568
 569 class InternalRevisionCollector(RevisionCollector):
 570   """The RevisionCollector used by InternalRevisionReader."""
 571
 572   def __init__(self, compress):
 573     RevisionCollector.__init__(self)
 574     self._compress = compress
 575
 576   def register_artifacts(self, which_pass):
 577     artifact_manager.register_temp_file(
 578         config.RCS_DELTAS_INDEX_TABLE, which_pass
 579         )
 580     artifact_manager.register_temp_file(config.RCS_DELTAS_STORE, which_pass)
 581     artifact_manager.register_temp_file(
 582         config.RCS_TREES_INDEX_TABLE, which_pass
 583         )
 584     artifact_manager.register_temp_file(config.RCS_TREES_STORE, which_pass)
 585
 586   def start(self):
 587     serializer = MarshalSerializer()
 588     if self._compress:
 589       serializer = CompressingSerializer(serializer)
 590     self._delta_db = IndexedDatabase(
 591         artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
 592         artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
 593         DB_OPEN_NEW, serializer,
 594         )
 595     primer = (FullTextRecord, DeltaTextRecord)
 596     self._rcs_trees = IndexedDatabase(
 597         artifact_manager.get_temp_file(config.RCS_TREES_STORE),
 598         artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE),
 599         DB_OPEN_NEW, PrimedPickleSerializer(primer),
 600         )
 601
 602   def _writeout(self, text_record, text):
 603     self.text_record_db.add(text_record)
 604     self._delta_db[text_record.id] = text
 605
 606   def process_file(self, cvs_file_items):
 607     """Read revision information for the file described by CVS_FILE_ITEMS.
 608
 609     Compute the text record refcounts, discard any records that are
 610     unneeded, and store the text records for the file to the
 611     _rcs_trees database."""
 612
 613     # A map from cvs_rev_id to TextRecord instance:
 614     self.text_record_db = TextRecordDatabase(self._delta_db, NullDatabase())
 615
 616     cvs2svn_rcsparse.parse(
 617         open(cvs_file_items.cvs_file.filename, 'rb'),
 618         _Sink(self, cvs_file_items),
 619         )
 620
 621     self.text_record_db.recompute_refcounts(cvs_file_items)
 622     self.text_record_db.free_unused()
 623     self._rcs_trees[cvs_file_items.cvs_file.id] = self.text_record_db
 624     del self.text_record_db
 625
 626   def finish(self):
 627     self._delta_db.close()
 628     self._rcs_trees.close()
 629
 630
 631 class _KeywordExpander:
 632   """A class whose instances provide substitutions for CVS keywords.
 633
 634   This class is used via its __call__() method, which should be called
 635   with a match object representing a match for a CVS keyword string.
 636   The method returns the replacement for the matched text.
 637
 638   The __call__() method works by calling the method with the same name
 639   as that of the CVS keyword (converted to lower case).
 640
 641   Instances of this class can be passed as the REPL argument to
 642   re.sub()."""
 643
 644   date_fmt_old = "%Y/%m/%d %H:%M:%S"    # CVS 1.11, rcs
 645   date_fmt_new = "%Y-%m-%d %H:%M:%S"    # CVS 1.12
 646
 647   date_fmt = date_fmt_new
 648
 649   @classmethod
 650   def use_old_date_format(klass):
 651       """Class method to ensure exact compatibility with CVS 1.11
 652       output.  Use this if you want to verify your conversion and you're
 653       using CVS 1.11."""
 654       klass.date_fmt = klass.date_fmt_old
 655
 656   def __init__(self, cvs_rev):
 657     self.cvs_rev = cvs_rev
 658
 659   def __call__(self, match):
 660     return '$%s: %s $' % (
 661         match.group(1), getattr(self, match.group(1).lower())(),
 662         )
 663
 664   def author(self):
 665     return Ctx()._metadata_db[self.cvs_rev.metadata_id].original_author
 666
 667   def date(self):
 668     return time.strftime(self.date_fmt, time.gmtime(self.cvs_rev.timestamp))
 669
 670   def header(self):
 671     return '%s %s %s %s Exp' % (
 672         self.source(), self.cvs_rev.rev, self.date(), self.author(),
 673         )
 674
 675   def id(self):
 676     return '%s %s %s %s Exp' % (
 677         self.rcsfile(), self.cvs_rev.rev, self.date(), self.author(),
 678         )
 679
 680   def locker(self):
 681     # Handle kvl like kv, as a converted repo is supposed to have no
 682     # locks.
 683     return ''
 684
 685   def log(self):
 686     # Would need some special handling.
 687     return 'not supported by cvs2svn'
 688
 689   def name(self):
 690     # Cannot work, as just creating a new symbol does not check out
 691     # the revision again.
 692     return 'not supported by cvs2svn'
 693
 694   def rcsfile(self):
 695     return self.cvs_rev.cvs_file.basename + ",v"
 696
 697   def revision(self):
 698     return self.cvs_rev.rev
 699
 700   def source(self):
 701     project = self.cvs_rev.cvs_file.project
 702     return '%s/%s%s,v' % (
 703         project.cvs_repository_root,
 704         project.cvs_module,
 705         self.cvs_rev.cvs_file.cvs_path,
 706         )
 707
 708   def state(self):
 709     # We check out only live revisions.
 710     return 'Exp'
 711
 712
 713 class InternalRevisionReader(RevisionReader):
 714   """A RevisionReader that reads the contents from an own delta store."""
 715
 716   _kws = 'Author|Date|Header|Id|Locker|Log|Name|RCSfile|Revision|Source|State'
 717   _kw_re = re.compile(r'\$(' + _kws + r'):[^$\n]*\$')
 718   _kwo_re = re.compile(r'\$(' + _kws + r')(:[^$\n]*)?\$')
 719
 720   def __init__(self, compress):
 721     self._compress = compress
 722
 723   def register_artifacts(self, which_pass):
 724     artifact_manager.register_temp_file(config.CVS_CHECKOUT_DB, which_pass)
 725     artifact_manager.register_temp_file_needed(
 726         config.RCS_DELTAS_STORE, which_pass
 727         )
 728     artifact_manager.register_temp_file_needed(
 729         config.RCS_DELTAS_INDEX_TABLE, which_pass
 730         )
 731     artifact_manager.register_temp_file_needed(
 732         config.RCS_TREES_STORE, which_pass
 733         )
 734     artifact_manager.register_temp_file_needed(
 735         config.RCS_TREES_INDEX_TABLE, which_pass
 736         )
 737
 738   def start(self):
 739     self._delta_db = IndexedDatabase(
 740         artifact_manager.get_temp_file(config.RCS_DELTAS_STORE),
 741         artifact_manager.get_temp_file(config.RCS_DELTAS_INDEX_TABLE),
 742         DB_OPEN_READ,
 743         )
 744     self._delta_db.__delitem__ = lambda id: None
 745     self._tree_db = IndexedDatabase(
 746         artifact_manager.get_temp_file(config.RCS_TREES_STORE),
 747         artifact_manager.get_temp_file(config.RCS_TREES_INDEX_TABLE),
 748         DB_OPEN_READ,
 749         )
 750     serializer = MarshalSerializer()
 751     if self._compress:
 752       serializer = CompressingSerializer(serializer)
 753     self._co_db = Database(
 754         artifact_manager.get_temp_file(config.CVS_CHECKOUT_DB),
 755         DB_OPEN_NEW, serializer,
 756         )
 757
 758     # The set of CVSFile instances whose TextRecords have already been
 759     # read:
 760     self._loaded_files = set()
 761
 762     # A map { CVSFILE : _FileTree } for files that currently have live
 763     # revisions:
 764     self._text_record_db = TextRecordDatabase(self._delta_db, self._co_db)
 765
 766   def _get_text_record(self, cvs_rev):
 767     """Return the TextRecord instance for CVS_REV.
 768
 769     If the TextRecords for CVS_REV.cvs_file haven't been loaded yet,
 770     do so now."""
 771
 772     if cvs_rev.cvs_file not in self._loaded_files:
 773       for text_record in self._tree_db[cvs_rev.cvs_file.id].itervalues():
 774         self._text_record_db.add(text_record)
 775       self._loaded_files.add(cvs_rev.cvs_file)
 776
 777     return self._text_record_db[cvs_rev.id]
 778
 779   def get_content(self, cvs_rev):
 780     """Check out the text for revision C_REV from the repository.
 781
 782     Return the text.  If CVS_REV has a property _keyword_handling, use
 783     it to determine how to handle RCS keywords in the output:
 784
 785         'collapsed' -- collapse keywords
 786
 787         'expanded' -- expand keywords
 788
 789         'untouched' -- output keywords in the form they are found in
 790             the RCS file
 791
 792     Note that $Log$ never actually generates a log (which makes test
 793     'requires_cvs()' fail).
 794
 795     Revisions may be requested in any order, but if they are not
 796     requested in dependency order the checkout database will become
 797     very large.  Revisions may be skipped.  Each revision may be
 798     requested only once."""
 799
 800     try:
 801       text = self._get_text_record(cvs_rev).checkout(self._text_record_db)
 802     except MalformedDeltaException, (msg):
 803       raise FatalError('Malformed RCS delta in %s, revision %s: %s'
 804                        % (cvs_rev.cvs_file.get_filename(), cvs_rev.rev, msg))
 805
 806     keyword_handling = cvs_rev.get_property('_keyword_handling')
 807
 808     if keyword_handling == 'untouched':
 809       # Leave keywords in the form that they were checked in.
 810       pass
 811     elif keyword_handling == 'collapsed':
 812       text = self._kw_re.sub(r'$\1$', text)
 813     elif keyword_handling == 'expanded':
 814       text = self._kwo_re.sub(_KeywordExpander(cvs_rev), text)
 815     else:
 816       raise FatalError(
 817           'Undefined _keyword_handling property (%r) for %s'
 818           % (keyword_handling, cvs_rev,)
 819           )
 820
 821     if Ctx().decode_apple_single:
 822       # Insert a filter to decode any files that are in AppleSingle
 823       # format:
 824       text = get_maybe_apple_single(text)
 825
 826     eol_fix = cvs_rev.get_property('_eol_fix')
 827     if eol_fix:
 828       text = canonicalize_eol(text, eol_fix)
 829
 830     return text
 831
 832   def finish(self):
 833     self._text_record_db.log_leftovers()
 834
 835     del self._text_record_db
 836     self._delta_db.close()
 837     self._tree_db.close()
 838     self._co_db.close()
 839