1 #!/usr/bin/env python
2 # (Be in -*- python -*- mode.)
4 # cvs2svn: ...
6 # ====================================================================
7 # Copyright (c) 2000-2004 CollabNet. All rights reserved.
9 # This software is licensed as described in the file COPYING, which
10 # you should have received as part of this distribution. The terms
11 # are also available at http://subversion.tigris.org/license-1.html.
12 # If newer versions of this license are posted there, you may use a
13 # newer version instead, at your option.
15 # This software consists of voluntary contributions made by many
16 # individuals. For exact contribution history, see the revision
17 # history and logs, available at http://cvs2svn.tigris.org/.
18 # ====================================================================
20 VERSION = 'r' + "$LastChangedRevision$"[22:-2]
22 import cvs2svn_rcsparse
23 import os
24 import sys
25 import sha
26 import re
27 import time
28 import fileinput
29 import string
30 import getopt
31 import stat
32 import md5
33 import marshal
34 import errno
35 import popen2
36 import types
37 try:
38 # Try to get access to a bunch of encodings for use with --encoding.
39 # See http://cjkpython.i18n.org/ for details.
40 import iconv_codec
41 except ImportError:
42 pass
44 # Warnings and errors start with these strings. They are typically
45 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
46 warning_prefix = "WARNING"
47 error_prefix = "ERROR"
49 # Make sure this Python is recent enough.
50 if sys.hexversion < 0x2000000:
51 sys.stderr.write("%s: Python 2.0 or higher required, "
52 "see www.python.org.\n" % error_prefix)
53 sys.exit(1)
55 # Pretend we have true booleans on older python versions
56 try:
57 True
58 except NameError:
59 True = 1
60 False = 0
62 # Opening pipes was a mess before Python 2.4, because some methods did
63 # not exist on some platforms, and some behaved differently on others.
64 # Python 2.4 solved this by adding the subprocess module, but since we
65 # cannot require such a new version, we cannot use it directly, but
66 # must implement a simplified Popen using the best means necessary.
68 # The SimplePopen class only has the following members and methods, all
69 # behaving as documented in the subprocess.Popen class:
70 # - stdin
71 # - stdout
72 # - stderr
73 # - wait
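#
# As an illustration only (not part of the original source; the command
# shown is arbitrary), typical use of SimplePopen looks roughly like this:
#
#   pipe = SimplePopen(['cat', '/etc/hosts'], capture_stderr=True)
#   pipe.stdin.close()
#   output = pipe.stdout.read()
#   errors = pipe.stderr.read()
#   status = pipe.wait()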
74 try:
75 # First try subprocess.Popen...
76 import subprocess
77 class SimplePopen:
78 def __init__(self, cmd, capture_stderr):
79 if capture_stderr:
80 stderr = subprocess.PIPE
81 else:
82 stderr = None
83 self._popen = subprocess.Popen(cmd, stdin=subprocess.PIPE,
84 stdout=subprocess.PIPE, stderr=stderr)
85 self.stdin = self._popen.stdin
86 self.stdout = self._popen.stdout
87 if capture_stderr:
88 self.stderr = self._popen.stderr
89 self.wait = self._popen.wait
90 except ImportError:
91 if hasattr(popen2, 'Popen3'):
92 # ...then try popen2.Popen3...
93 class SimplePopen:
94 def __init__(self, cmd, capture_stderr):
95 self._popen3 = popen2.Popen3(cmd, capture_stderr)
96 self.stdin = self._popen3.tochild
97 self.stdout = self._popen3.fromchild
98 if capture_stderr:
99 self.stderr = self._popen3.childerr
100 self.wait = self._popen3.wait
101 else:
102 # ...and if all fails, use popen2.popen3...
103 class SimplePopen:
104 def __init__(self, cmd, capture_stderr):
105 if type(cmd) != types.StringType:
106 cmd = argv_to_command_string(cmd)
107 self.stdout, self.stdin, self.stderr = popen2.popen3(cmd, mode='b')
108 def wait(self):
109 return self.stdout.close() or self.stdin.close() or \
110 self.stderr.close()
112 # DBM module selection
114 # 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
115 # so that the dbhash module used by anydbm will use bsddb3.
116 try:
117 import bsddb3
118 sys.modules['bsddb'] = sys.modules['bsddb3']
119 except ImportError:
120 pass
122 # 2. These DBM modules are not good for cvs2svn.
123 import anydbm
124 if (anydbm._defaultmod.__name__ == 'dumbdbm'
125 or anydbm._defaultmod.__name__ == 'dbm'):
126 sys.stderr.write(
127 error_prefix
128 + ': your installation of Python does not contain a suitable\n'
129 + 'DBM module -- cvs2svn cannot continue.\n'
130 + 'See http://python.org/doc/current/lib/module-anydbm.html to solve.\n')
131 sys.exit(1)
133 # 3. If we are using the old bsddb185 module, then prefer gdbm instead.
134 # Unfortunately, gdbm appears not to be trouble free, either.
135 if hasattr(anydbm._defaultmod, 'bsddb') \
136 and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
137 try:
138 gdbm = __import__('gdbm')
139 except ImportError:
140 sys.stderr.write(warning_prefix +
141 ': The version of the bsddb module found '
142 'on your computer has been reported to malfunction on some datasets, '
143 'causing KeyError exceptions. You may wish to upgrade your Python to '
144 'version 2.3 or later.\n')
145 else:
146 anydbm._defaultmod = gdbm
148 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
149 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
150 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
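# For illustration (example numbers invented for this comment): trunk_rev
# matches plain trunk revisions such as '1.7', branch_tag matches CVS branch
# numbers containing the magic '0' such as '1.7.0.2', and vendor_tag matches
# vendor branch numbers such as '1.1.1'.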
152 SVN_KEYWORDS_VALUE = 'Author Date Id Revision'
154 # This really only matches standard '1.1.1.*'-style vendor revisions.
155 # One could conceivably have a file whose default branch is 1.1.3 or
156 # whatever, or was that at some point in time, with vendor revisions
157 # 1.1.3.1, 1.1.3.2, etc. But with the default branch gone now (which
158 # is the only time this regexp gets used), we'd have no basis for
159 # assuming that the non-standard vendor branch had ever been the
160 # default branch anyway, so we don't want this to match them.
161 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
163 # If this run's output is a repository, then (in the tmpdir) we use
164 # a dumpfile of this name for repository loads.
166 # If this run's output is a dumpfile, then this is default name of
167 # that dumpfile, but in the current directory (unless the user has
168 # specified a dumpfile path, of course, in which case it will be
169 # wherever the user said).
170 DUMPFILE = 'cvs2svn-dump'
172 # This file appears with different suffixes at different stages of
173 # processing. CVS revisions are cleaned and sorted here, for commit
174 # grouping. See design-notes.txt for details.
175 DATAFILE = 'cvs2svn-data'
177 # This file contains a marshalled copy of all the statistics that we
178 # gather throughout the various runs of cvs2svn. The data is stored as a
179 # marshalled dictionary.
180 STATISTICS_FILE = 'cvs2svn-statistics'
182 # This text file contains records (1 per line) that describe svn
183 # filesystem paths that are the opening and closing source revisions
184 # for copies to tags and branches. The format is as follows:
186 # SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
188 # where TYPE is either OPENING or CLOSING. The SYMBOL_NAME and
189 # SVN_REVNUM are the primary and secondary sorting criteria for
190 # creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
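#
# For example (illustrative values only), a symbol opened in r45 and closed
# in r47 of a hypothetical file might produce the records:
#
#   MY_TAG 45 O trunk/src/foo.c
#   MY_TAG 47 C trunk/src/foo.c
#
# (using the OPENING and CLOSING markers defined further below).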
191 SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
192 # A sorted version of the above file.
193 SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'
195 # This file is a temporary file for storing symbolic_name -> closing
196 # CVSRevision until the end of our pass where we can look up the
197 # corresponding SVNRevNum for the closing revs and write these out to
198 # the SYMBOL_OPENINGS_CLOSINGS.
199 SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'
201 # Skeleton version of an svn filesystem.
202 # (These supersede and will eventually replace the two above.)
203 # See class SVNRepositoryMirror for how these work.
204 SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
205 SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'
207 # Offsets pointing to the beginning of each SYMBOLIC_NAME in
208 # SYMBOL_OPENINGS_CLOSINGS_SORTED
209 SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'
211 # Maps CVSRevision.unique_key()s to lists of symbolic names, where
212 # the CVSRevision is the last such that is a source for those symbolic
213 # names. For example, if branch B's number is 1.3.0.2 in this CVS
214 # file, and this file's 1.3 is the latest (by date) revision among
215 # *all* CVS files that is a source for branch B, then the
216 # CVSRevision.unique_key() corresponding to this file at 1.3 would
217 # list at least B in its list.
218 SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'
220 # Maps CVSRevision.unique_key() to corresponding line in s-revs.
221 ###PERF Or, we could map to an offset into s-revs, instead of dup'ing
222 ### the s-revs data in this database.
223 CVS_REVS_DB = 'cvs2svn-cvs-revs.db'
225 # Lists all symbolic names that are tags. Keys are strings (symbolic
226 # names), values are ignorable.
227 TAGS_DB = 'cvs2svn-tags.db'
229 # A list of all tags. Each line consists of the tag name and the number
230 # of files in which it exists, separated by a space.
231 TAGS_LIST = 'cvs2svn-tags.txt'
233 # A list of all branches. The file is stored as a plain text file
234 # to make it easy to look at in an editor. Each line contains the
235 # branch name, the number of files where the branch is created, the
236 # commit count, and a list of tags and branches that are defined on
237 # revisions in the branch.
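#
# For example (values invented for this comment), a line might read:
#
#   RELENG_1_0 12 34 RELENG_1_0_FIX TAG_1_0_FINAL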
238 BRANCHES_LIST = 'cvs2svn-branches.txt'
240 # These two databases provide a bidirectional mapping between
241 # CVSRevision.unique_key()s and Subversion revision numbers.
243 # The first maps CVSRevision.unique_key() to a number; the values are
244 # not unique.
246 # The second maps a number to a list of CVSRevision.unique_key()s.
247 CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
248 SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'
250 # This database maps svn_revnums to tuples of (symbolic_name, date).
252 # The svn_revnums are the revision numbers of all non-primary
253 # SVNCommits. No primary SVNCommit has a key in this database.
255 # The date is stored for all commits in this database.
257 # For commits that fill symbolic names, the symbolic_name is stored.
258 # For commits that are default branch syncs, the symbolic_name is None.
259 SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'
261 # This database maps svn_revnums of a default branch synchronization
262 # commit to the svn_revnum of the primary SVNCommit that motivated it.
264 # (NOTE: Secondary commits that fill branches and tags also have a
265 # motivating commit, but we do not record it because it is (currently)
266 # not needed for anything.)
268 # This mapping is used when generating the log message for the commit
269 # that synchronizes the default branch with trunk.
270 MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'
272 # How many bytes to read at a time from a pipe. 128 kiB should be
273 # large enough to be efficient without wasting too much memory.
274 PIPE_READ_SIZE = 128 * 1024
276 # Record the default RCS branches, if any, for CVS filepaths.
278 # The keys are CVS filepaths, relative to the top of the repository
279 # and with the ",v" stripped off, so they match the cvs paths used in
280 # Commit.commit(). The values are vendor branch revisions, such as
281 # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'. The vendor branch revision
282 # represents the highest vendor branch revision thought to have ever
283 # been head of the default branch.
285 # The reason we record a specific vendor revision, rather than a
286 # default branch number, is that there are two cases to handle:
288 # One case is simple. The RCS file lists a default branch explicitly
289 # in its header, such as '1.1.1'. In this case, we know that every
290 # revision on the vendor branch is to be treated as head of trunk at
291 # that point in time.
293 # But there's also a degenerate case. The RCS file does not currently
294 # have a default branch, yet we can deduce that for some period in the
295 # past it probably *did* have one. For example, the file has vendor
296 # revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
297 # and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2. In this
298 # case, we should record 1.1.1.96 as the last vendor revision to have
299 # been the head of the default branch.
300 DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'
302 # Records the author and log message for each changeset.
303 # The keys are author+log digests, the same kind used to identify
304 # unique revisions in the .revs, etc files. Each value is a tuple
305 # of two elements: '(author logmessage)'.
306 METADATA_DB = "cvs2svn-metadata.db"
308 # A temporary on-disk hash that maps CVSRevision unique keys to a new
309 # timestamp for that CVSRevision. These new timestamps are created in
310 # pass2, and this hash is used exclusively in pass2.
311 TWEAKED_TIMESTAMPS_DB = "cvs2svn-fixed-timestamps.db"
313 REVS_SUFFIX = '.revs'
314 CLEAN_REVS_SUFFIX = '.c-revs'
315 SORTED_REVS_SUFFIX = '.s-revs'
316 RESYNC_SUFFIX = '.resync'
318 SVN_INVALID_REVNUM = -1
320 COMMIT_THRESHOLD = 5 * 60 # flush a commit if a 5 minute gap occurs
322 # Things that can happen to a file.
323 OP_NOOP = '-'
324 OP_ADD = 'A'
325 OP_DELETE = 'D'
326 OP_CHANGE = 'C'
328 # A deltatext either does or doesn't represent some change.
329 DELTATEXT_NONEMPTY = 'N'
330 DELTATEXT_EMPTY = 'E'
332 DIGEST_END_IDX = 9 + (sha.digestsize * 2)
334 # Constants used in SYMBOL_OPENINGS_CLOSINGS
335 OPENING = 'O'
336 CLOSING = 'C'
338 class FatalException(Exception):
339 """Exception thrown on a non-recoverable error.
341 If this exception is thrown by main(), it is caught by the global
342 layer of the program, its string representation is printed, and the
343 program is ended with an exit code of 1."""
345 pass
348 class FatalError(FatalException):
349 """A FatalException that prepends error_prefix to the message."""
351 def __init__(self, msg):
352 """Use (error_prefix + ': ' + MSG + '\n') as the error message."""
354 FatalException.__init__(self, '%s: %s\n' % (error_prefix, msg,))
357 def temp(basename):
358 """Return a path to BASENAME in Ctx().tmpdir.
359 This is a convenience function to save horizontal space in source."""
360 return os.path.join(Ctx().tmpdir, basename)
362 # Since the unofficial set also includes [/\] we need to translate those
363 # into ones that don't conflict with Subversion limitations.
364 def _clean_symbolic_name(name):
365 """Return symbolic name NAME, translating characters that Subversion
366 does not allow in a pathname."""
367 name = name.replace('/','++')
368 name = name.replace('\\','--')
369 return name
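# For example (hypothetical symbol name):
#   _clean_symbolic_name('rel/1-0\\fix')  ==>  'rel++1-0--fix'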
371 def _path_join(*components):
372 """Join two or more pathname COMPONENTS, inserting '/' as needed.
373 Empty components are skipped."""
374 return string.join(filter(None, components), '/')
376 def _path_split(path):
377 """Split the svn pathname PATH into a pair, (HEAD, TAIL).
379 This is similar to os.path.split(), but always uses '/' as path
380 separator. PATH is an svn path, which should not start with a '/'.
381 HEAD is everything before the last slash, and TAIL is everything
382 after. If PATH ends in a slash, TAIL will be empty. If there is no
383 slash in PATH, HEAD will be empty. If PATH is empty, both HEAD and
384 TAIL are empty."""
386 pos = path.rfind('/')
387 if pos == -1:
388 return ('', path,)
389 else:
390 return (path[:pos], path[pos+1:],)
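# Illustrative behaviour of _path_split (paths invented for this comment):
#   _path_split('trunk/foo/bar.c')  ==>  ('trunk/foo', 'bar.c')
#   _path_split('bar.c')            ==>  ('', 'bar.c')
#   _path_split('')                 ==>  ('', '')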
392 def to_utf8(value, mode='replace'):
393 """Encode (as Unicode) VALUE, trying the encodings in Ctx.encoding
394 as valid source encodings. Raise UnicodeError on failure of all
395 source encodings."""
396 ### FIXME: The 'replace' default mode should be an option,
397 ### like --encoding is.
398 for encoding in Ctx().encoding:
399 try:
400 return unicode(value, encoding, mode).encode('utf8')
401 except UnicodeError:
402 Log().write(LOG_VERBOSE, "Encoding '%s' failed for string '%s'"
403 % (encoding, value))
404 raise UnicodeError
406 def run_command(command):
407 if os.system(command):
408 raise FatalError('Command failed: "%s"' % (command,))
411 class CommandFailedException(Exception):
412 """Exception raised if check_command_runs() fails."""
414 pass
417 def check_command_runs(cmd, cmdname):
418 """Check whether the command CMD can be executed without errors.
420 CMD is a list or string, as accepted by SimplePopen. CMDNAME is the
421 name of the command as it should be included in exception error
422 messages.
424 This function checks three things: (1) the command can be run
425 without throwing an OSError; (2) it exits with status=0; (3) it
426 doesn't output anything to stderr. If any of these conditions is
427 not met, raise a CommandFailedException describing the problem."""
429 try:
430 pipe = SimplePopen(cmd, True)
431 except OSError, e:
432 raise CommandFailedException('error executing %s: %s' % (cmdname, e,))
433 pipe.stdin.close()
434 pipe.stdout.read()
435 errmsg = pipe.stderr.read()
436 status = pipe.wait()
437 if status != 0 or errmsg:
438 msg = 'error executing %s: status %s' % (cmdname, status,)
439 if errmsg:
440 msg += ', error output:\n%s' % (errmsg,)
441 raise CommandFailedException(msg)
444 class CVSRepository:
445 """A CVS repository from which data can be extracted."""
446 def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
447 """Return a command string, and the pipe created using that
448 string. C_REV is a CVSRevision. If SUPPRESS_KEYWORD_SUBSTITUTION
449 is True, then suppress the substitution of RCS/CVS keywords in the
450 output. The pipe returns the text of that CVS Revision."""
451 raise NotImplementedError
454 class CVSRepositoryViaRCS(CVSRepository):
455 """A CVSRepository accessed via RCS."""
457 def __init__(self):
458 try:
459 check_command_runs([ 'co', '-V' ], 'co')
460 except CommandFailedException, e:
461 raise FatalError('%s\n'
462 'Please check that co is installed and in your PATH\n'
463 '(it is a part of the RCS software).' % (e,))
465 def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
466 pipe_cmd = [ 'co', '-q', '-x,v', '-p' + c_rev.rev ]
467 if suppress_keyword_substitution:
468 pipe_cmd.append('-kk')
469 pipe_cmd.append(c_rev.rcs_path())
470 pipe = SimplePopen(pipe_cmd, True)
471 pipe.stdin.close()
472 return pipe_cmd, pipe
475 class CVSRepositoryViaCVS(CVSRepository):
476 """A CVSRepository accessed via CVS."""
478 def __init__(self):
479 ctx = Ctx()
480 # Ascend above the specified root if necessary, to find the
481 # cvs_repository_root (a directory containing a CVSROOT directory)
482 # and the cvs_module (the path of the conversion root within the
483 # cvs repository). NB: cvs_module must be separated by '/', *not* by
484 # os.sep.
485 self.cvs_repository_root = os.path.abspath(ctx.cvsroot)
486 prev_cvs_repository_root = None
487 self.cvs_module = ""
488 while prev_cvs_repository_root != self.cvs_repository_root:
489 if os.path.isdir(os.path.join(self.cvs_repository_root, 'CVSROOT')):
490 break
491 prev_cvs_repository_root = self.cvs_repository_root
492 self.cvs_repository_root, module_component = \
493 os.path.split(self.cvs_repository_root)
494 self.cvs_module = module_component + "/" + self.cvs_module
495 else:
496 # Hit the root (of the drive, on Windows) without finding a CVSROOT dir.
497 raise FatalError("the path '%s' is not a CVS repository, nor a path "
498 "within a CVS repository. A CVS repository contains "
499 "a CVSROOT directory within its root directory."
500 % (ctx.cvsroot,))
501 os.environ['CVSROOT'] = self.cvs_repository_root
503 def cvs_ok(global_arguments):
504 check_command_runs(
505 [ 'cvs' ] + global_arguments + [ '--version' ], 'cvs')
507 self.global_arguments = [ "-q", "-R" ]
508 try:
509 cvs_ok(self.global_arguments)
510 except CommandFailedException, e:
511 self.global_arguments = [ "-q" ]
512 try:
513 cvs_ok(self.global_arguments)
514 except CommandFailedException, e:
515 raise FatalError(
516 '%s\n'
517 'Please check that cvs is installed and in your PATH.' % (e,))
519 def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
520 pipe_cmd = [ 'cvs' ] + self.global_arguments + \
521 [ 'co', '-r' + c_rev.rev, '-p' ]
522 if suppress_keyword_substitution:
523 pipe_cmd.append('-kk')
524 pipe_cmd.append(self.cvs_module + c_rev.cvs_path)
525 pipe = SimplePopen(pipe_cmd, True)
526 pipe.stdin.close()
527 return pipe_cmd, pipe
530 def generate_ignores(c_rev):
531 # Read in props
532 pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(c_rev)
533 buf = pipe.stdout.read(PIPE_READ_SIZE)
534 raw_ignore_val = ""
535 while buf:
536 raw_ignore_val = raw_ignore_val + buf
537 buf = pipe.stdout.read(PIPE_READ_SIZE)
538 pipe.stdout.close()
539 error_output = pipe.stderr.read()
540 exit_status = pipe.wait()
541 if exit_status:
542 raise FatalError("The command '%s' failed with exit status: %s\n"
543 "and the following output:\n"
544 "%s" % (pipe_cmd, exit_status, error_output))
546 # Tweak props: First, convert any spaces to newlines...
547 raw_ignore_val = '\n'.join(raw_ignore_val.split())
548 raw_ignores = raw_ignore_val.split('\n')
549 ignore_vals = [ ]
550 for ignore in raw_ignores:
551 # Reset the list if we encounter a '!'
552 # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
553 if ignore == '!':
554 ignore_vals = [ ]
555 continue
556 # Skip empty lines
557 if len(ignore) == 0:
558 continue
559 ignore_vals.append(ignore)
560 return ignore_vals
562 # Return a string that has not been returned by gen_key() before.
563 gen_key_base = 0L
564 def gen_key():
565 global gen_key_base
566 key = '%x' % gen_key_base
567 gen_key_base = gen_key_base + 1
568 return key
570 # ============================================================================
571 # This code is copied with a few modifications from:
572 # subversion/subversion/bindings/swig/python/svn/core.py
574 if sys.platform == "win32":
575 _escape_shell_arg_re = re.compile(r'(\\+)(\"|$)')
577 def escape_shell_arg(arg):
578 # The (very strange) parsing rules used by the C runtime library are
579 # described at:
580 # http://msdn.microsoft.com/library/en-us/vclang/html/_pluslang_Parsing_C.2b2b_.Command.2d.Line_Arguments.asp
582 # double up slashes, but only if they are followed by a quote character
583 arg = re.sub(_escape_shell_arg_re, r'\1\1\2', arg)
585 # surround by quotes and escape quotes inside
586 arg = '"' + string.replace(arg, '"', '"^""') + '"'
587 return arg
590 def argv_to_command_string(argv):
591 """Flatten a list of command line arguments into a command string.
593 The resulting command string is expected to be passed to the system
594 shell which os functions like popen() and system() invoke internally."""
597 # According to cmd's usage notes (cmd /?), it parses the command line by
598 # "seeing if the first character is a quote character and if so, stripping
599 # the leading character and removing the last quote character."
600 # So to prevent the argument string from being changed we add an extra set
601 # of quotes around it here.
602 return '"' + string.join(map(escape_shell_arg, argv), " ") + '"'
604 else:
605 def escape_shell_arg(str):
606 return "'" + string.replace(str, "'", "'\\''") + "'"
608 def argv_to_command_string(argv):
609 """Flatten a list of command line arguments into a command string.
611 The resulting command string is expected to be passed to the system
612 shell which os functions like popen() and system() invoke internally."""
615 return string.join(map(escape_shell_arg, argv), " ")
616 # ============================================================================
618 def format_date(date):
619 """Return an svn-compatible date string for DATE (seconds since epoch)."""
620 # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
621 return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
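# For instance (worked example added for this comment),
# format_date(1033310699) returns '2002-09-29T14:44:59.000000Z', the sample
# date shown above.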
623 def sort_file(infile, outfile):
624 # sort the log files
626 # GNU sort will sort our dates differently (incorrectly!) if our
627 # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
628 # it to 'C'
629 if os.environ.has_key('LC_ALL'):
630 lc_all_tmp = os.environ['LC_ALL']
631 else:
632 lc_all_tmp = None
633 os.environ['LC_ALL'] = 'C'
634 # The -T option to sort has a nice side effect. The Win32 sort is
635 # case insensitive and cannot be used, and since it does not
636 # understand the -T option and dies if we try to use it, there is
637 # no risk that we use that sort by accident.
638 run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
639 if lc_all_tmp is None:
640 del os.environ['LC_ALL']
641 else:
642 os.environ['LC_ALL'] = lc_all_tmp
644 def match_regexp_list(regexp_list, string):
645 """Test whether STRING matches any of the compiled regexps in
646 REGEXP_LIST."""
647 for regexp in regexp_list:
648 if regexp.match(string):
649 return True
650 return False
652 class LF_EOL_Filter:
653 """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
654 into LFs only."""
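# For example (illustrative input): a stream containing 'a\r\nb\rc\n' reads
# back as 'a\nb\nc\n' through this filter, even when a CR and the LF that
# follows it are split across two read() calls (handled via self.carry_cr).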
655 def __init__(self, stream):
656 self.stream = stream
657 self.carry_cr = False
658 self.eof = False
660 def read(self, size):
661 while True:
662 buf = self.stream.read(size)
663 self.eof = len(buf) == 0
664 if self.carry_cr:
665 buf = '\r' + buf
666 self.carry_cr = False
667 if not self.eof and buf[-1] == '\r':
668 self.carry_cr = True
669 buf = buf[:-1]
670 buf = string.replace(buf, '\r\n', '\n')
671 buf = string.replace(buf, '\r', '\n')
672 if len(buf) > 0 or self.eof:
673 return buf
676 # These constants represent the log levels that this script supports
677 LOG_WARN = -1
678 LOG_QUIET = 0
679 LOG_NORMAL = 1
680 LOG_VERBOSE = 2
681 class Log:
682 """A Simple logging facility. Each line will be timestamped is
683 self.use_timestamps is TRUE. This class is a Borg, see
684 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
685 __shared_state = {}
686 def __init__(self):
687 self.__dict__ = self.__shared_state
688 if self.__dict__:
689 return
690 self.log_level = LOG_NORMAL
691 # Set this to true if you want to see timestamps on each line output.
692 self.use_timestamps = None
693 self.logger = sys.stdout
695 def _timestamp(self):
696 """Output a detailed timestamp at the beginning of each line output."""
697 self.logger.write(time.strftime('[%Y-%m-%d %H:%M:%S %Z] - '))
699 def write(self, log_level, *args):
700 """This is the public method to use for writing to a file. Only
701 messages whose LOG_LEVEL is <= self.log_level will be printed. If
702 there are multiple ARGS, they will be separated by a space."""
703 if log_level > self.log_level:
704 return
705 if self.use_timestamps:
706 self._timestamp()
707 self.logger.write(' '.join(map(str,args)) + "\n")
708 # Ensure that log output doesn't get out-of-order with respect to
709 # stderr output.
710 self.logger.flush()
713 class Cleanup:
714 """This singleton class manages any files created by cvs2svn. When
715 you first create a file, call Cleanup.register, passing the
716 filename and the last pass in which you need the file. After the end
717 of that pass, your file will be cleaned up after running an optional
718 callback. This class is a Borg, see
719 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
721 __shared_state = {}
722 def __init__(self):
723 self.__dict__ = self.__shared_state
724 if self.__dict__:
725 return
726 self._log = {}
727 self._callbacks = {}
729 def register(self, file, which_pass, callback=None):
730 """Register FILE for cleanup at the end of WHICH_PASS, running
731 function CALLBACK prior to removal. Registering a given FILE is
732 idempotent; you may register as many times as you wish, but it
733 will only be cleaned up once.
735 Note that if a file is registered multiple times, only the first
736 callback registered for that file will be called at cleanup
737 time. Also note that if you register a database file you must
738 close the database before cleanup, e.g. using a callback."""
739 if not self._log.has_key(which_pass):
740 self._log[which_pass] = {}
741 self._log[which_pass][file] = 1
742 if callback and not self._callbacks.has_key(file):
743 self._callbacks[file] = callback
745 def cleanup(self, which_pass):
746 """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
747 if not self._log.has_key(which_pass):
748 return
749 for file in self._log[which_pass].keys():
750 Log().write(LOG_VERBOSE, "Deleting", file)
751 if self._callbacks.has_key(file):
752 self._callbacks[file]()
753 os.unlink(file)
756 # Always use these constants for opening databases.
757 DB_OPEN_READ = 'r'
758 DB_OPEN_NEW = 'n'
760 # A wrapper for anydbm that uses the marshal module to store items as
761 # strings.
762 class Database:
763 def __init__(self, filename, mode):
764 # pybsddb3 has a bug which prevents it from working with
765 # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
766 # causes the DB_TRUNCATE flag to be passed, which is disallowed
767 # for databases protected by lock and transaction support
768 # (bsddb databases use locking from bsddb version 4.2.4 onwards).
770 # Therefore, manually perform the removal (we can do this, because
771 # we know that for bsddb - but *not* anydbm in general - the database
772 # consists of one file with the name we specify, rather than several
773 # based on that name).
774 if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
775 if os.path.isfile(filename):
776 os.unlink(filename)
777 mode = 'c'
779 self.db = anydbm.open(filename, mode)
781 def has_key(self, key):
782 return self.db.has_key(key)
784 def __getitem__(self, key):
785 return marshal.loads(self.db[key])
787 def __setitem__(self, key, value):
788 self.db[key] = marshal.dumps(value)
790 def __delitem__(self, key):
791 del self.db[key]
793 def get(self, key, default):
794 if self.has_key(key):
795 return self.__getitem__(key)
796 return default
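# An illustrative sketch of how the Database wrapper above is used (file
# name and values invented for this comment):
#
#   db = Database(temp('example.db'), DB_OPEN_NEW)
#   db['some-key'] = {'nested': [1, 2, 3]}   # marshalled to a string on store
#   value = db['some-key']                   # unmarshalled on retrieval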
799 class StatsKeeper:
800 __shared_state = { }
801 def __init__(self):
802 self.__dict__ = self.__shared_state
803 if self.__dict__:
804 return
805 self.filename = temp(STATISTICS_FILE)
806 Cleanup().register(self.filename, pass8)
807 # This can get kinda large, so we don't store it in our data dict.
808 self.repos_files = { }
810 if os.path.exists(self.filename):
811 self.unarchive()
812 else:
813 self.data = { 'cvs_revs_count' : 0,
814 'tags': { },
815 'branches' : { },
816 'repos_size' : 0,
817 'repos_file_count' : 0,
818 'svn_rev_count' : None,
819 'first_rev_date' : 1L<<32,
820 'last_rev_date' : 0,
821 'pass_timings' : { },
822 'start_time' : 0,
823 'end_time' : 0 }
826 def log_duration_for_pass(self, duration, pass_num):
827 self.data['pass_timings'][pass_num] = duration
829 def set_start_time(self, start):
830 self.data['start_time'] = start
832 def set_end_time(self, end):
833 self.data['end_time'] = end
835 def _bump_item(self, key, amount=1):
836 self.data[key] = self.data[key] + amount
838 def reset_c_rev_info(self):
839 self.data['cvs_revs_count'] = 0
840 self.data['tags'] = { }
841 self.data['branches'] = { }
843 def record_c_rev(self, c_rev):
844 self._bump_item('cvs_revs_count')
846 for tag in c_rev.tags:
847 self.data['tags'][tag] = None
848 for branch in c_rev.branches:
849 self.data['branches'][branch] = None
851 if c_rev.timestamp < self.data['first_rev_date']:
852 self.data['first_rev_date'] = c_rev.timestamp
854 if c_rev.timestamp > self.data['last_rev_date']:
855 self.data['last_rev_date'] = c_rev.timestamp
857 # Only add the size if this is the first time we see the file.
858 if not self.repos_files.has_key(c_rev.fname):
859 self._bump_item('repos_size', c_rev.file_size)
860 self.repos_files[c_rev.fname] = None
862 self.data['repos_file_count'] = len(self.repos_files)
864 def set_svn_rev_count(self, count):
865 self.data['svn_rev_count'] = count
867 def svn_rev_count(self):
868 return self.data['svn_rev_count']
870 def archive(self):
871 open(self.filename, 'w').write(marshal.dumps(self.data))
873 def unarchive(self):
874 self.data = marshal.loads(open(self.filename, 'r').read())
876 def __str__(self):
877 svn_revs_str = ""
878 if self.data['svn_rev_count'] is not None:
879 svn_revs_str = ('Total SVN Commits: %10s\n'
880 % self.data['svn_rev_count'])
882 return ('\n' \
883 'cvs2svn Statistics:\n' \
884 '------------------\n' \
885 'Total CVS Files: %10i\n' \
886 'Total CVS Revisions: %10i\n' \
887 'Total Unique Tags: %10i\n' \
888 'Total Unique Branches: %10i\n' \
889 'CVS Repos Size in KB: %10i\n' \
890 '%s' \
891 'First Revision Date: %s\n' \
892 'Last Revision Date: %s\n' \
893 '------------------' \
894 % (self.data['repos_file_count'],
895 self.data['cvs_revs_count'],
896 len(self.data['tags']),
897 len(self.data['branches']),
898 (self.data['repos_size'] / 1024),
899 svn_revs_str,
900 time.ctime(self.data['first_rev_date']),
901 time.ctime(self.data['last_rev_date']), ))
904 def timings(self):
905 passes = self.data['pass_timings'].keys()
906 passes.sort()
907 str = 'Timings:\n------------------\n'
909 def desc(val):
910 if val == 1: return "second"
911 return "seconds"
913 for pass_num in passes:
914 duration = int(self.data['pass_timings'][pass_num])
915 p_str = ('pass %d:%6d %s\n'
916 % (pass_num, duration, desc(duration)))
917 str = str + p_str
919 total = int(self.data['end_time'] - self.data['start_time'])
920 str = str + ('total: %6d %s' % (total, desc(total)))
921 return str
924 class LastSymbolicNameDatabase:
925 """ Passing every CVSRevision in s-revs to this class will result in
926 a Database whose key is the last CVS Revision a symbolic name was
927 seen in, and whose value is a list of all symbolic names that were
928 last seen in that revision."""
929 def __init__(self, mode):
930 self.symbols = {}
931 self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
932 Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
934 # Once we've gone through all the revs,
935 # symbols.keys() will be a list of all tags and branches, and
936 # their corresponding values will be a key into the last CVS revision
937 # that they were used in.
938 def log_revision(self, c_rev):
939 # Gather last CVS Revision for symbolic name info and tag info
940 for tag in c_rev.tags:
941 self.symbols[tag] = c_rev.unique_key()
942 if c_rev.op != OP_DELETE:
943 for branch in c_rev.branches:
944 self.symbols[branch] = c_rev.unique_key()
946 # Creates an inversion of symbols above--a dictionary of lists (key
947 # = CVS rev unique_key: val = list of symbols that close in that
948 # rev).
949 def create_database(self):
950 for sym, rev_unique_key in self.symbols.items():
951 if self.symbol_revs_db.has_key(rev_unique_key):
952 ary = self.symbol_revs_db[rev_unique_key]
953 ary.append(sym)
954 self.symbol_revs_db[rev_unique_key] = ary
955 else:
956 self.symbol_revs_db[rev_unique_key] = [sym]
959 class CVSRevisionDatabase:
960 """A Database to store CVSRevision objects and retrieve them by their
961 unique_key()."""
963 def __init__(self, mode):
964 """Initialize an instance, opening database in MODE (like the MODE
965 argument to Database or anydbm.open())."""
966 self.cvs_revs_db = Database(temp(CVS_REVS_DB), mode)
967 Cleanup().register(temp(CVS_REVS_DB), pass8)
969 def log_revision(self, c_rev):
970 """Add C_REV, a CVSRevision, to the database."""
971 self.cvs_revs_db[c_rev.unique_key()] = str(c_rev)
973 def get_revision(self, unique_key):
974 """Return the CVSRevision stored under UNIQUE_KEY."""
975 return CVSRevision(Ctx(), self.cvs_revs_db[unique_key])
978 class TagsDatabase(Database):
979 """A Database to store which symbolic names are tags.
980 Each key is a tag name.
981 The value has no meaning, and should be set to None."""
982 def __init__(self, mode):
983 Database.__init__(self, temp(TAGS_DB), mode)
984 Cleanup().register(temp(TAGS_DB), pass8)
987 class Project:
988 """A project within a CVS repository."""
990 def __init__(self, cvs_root, trunk_path, branches_path, tags_path):
991 """Create a new Project record.
993 CVS_ROOT is the main CVS directory for this project (within the
994 filesystem). TRUNK_PATH, BRANCHES_PATH, and TAGS_PATH are the
995 full, normalized directory names in svn for the corresponding part
996 of the repository."""
998 self.cvs_root = os.path.normpath(cvs_root)
999 self.trunk_path = trunk_path
1000 self.branches_path = branches_path
1001 self.tags_path = tags_path
1002 verify_paths_disjoint(self.trunk_path, self.branches_path, self.tags_path)
1004 def is_source(self, svn_path):
1005 """Return True iff SVN_PATH is a legitimate source for this project.
1007 Legitimate paths are self.trunk_path or any directory directly
1008 under self.branches_path."""
1010 if svn_path == self.trunk_path:
1011 return True
1013 (head, tail,) = _path_split(svn_path)
1014 if head == self.branches_path:
1015 return True
1017 return False
1019 def is_unremovable(self, svn_path):
1020 """Return True iff the specified path must not be removed."""
1022 return svn_path in [self.trunk_path, self.branches_path, self.tags_path]
1024 def relative_name(self, fname):
1025 """Return the path to FNAME relative to cvs_root, with ',v' removed.
1027 FNAME is a filesystem name that has to begin (textually) with
1028 self.cvs_root and end with ',v'."""
1030 if not fname.startswith(self.cvs_root):
1031 raise FatalError(
1032 "relative_name: '%s' is not a sub-path of '%s'"
1033 % (fname, self.cvs_root,))
1034 if not fname.endswith(',v'):
1035 raise FatalError("relative_name: '%s' does not end with ',v'"
1036 % (fname,))
1037 l = len(self.cvs_root)
1038 if fname[l] == os.sep:
1039 l += 1
1040 return string.replace(fname[l:-2], os.sep, '/')
1042 def get_branch_path(self, branch_name):
1043 """Return the svnpath for the branch named BRANCH_NAME."""
1045 return _path_join(self.branches_path, _clean_symbolic_name(branch_name))
1047 def get_tag_path(self, tag_name):
1048 """Return the svnpath for the tag named TAG_NAME."""
1050 return _path_join(self.tags_path, _clean_symbolic_name(tag_name))
1052 def make_path(self, path, branch_name=None):
1053 """Return the trunk path or branch path for PATH.
1055 PATH is a filesystem path relative to cvs_root. If BRANCH_NAME is
1056 None, then return the svn path for this file on trunk; otherwise,
1057 return the svn path for this file on the specified branch."""
1059 # For a while, we treated each top-level subdir of the CVS
1060 # repository as a "project root" and interpolated the appropriate
1061 # genealogy (trunk|tag|branch) in according to the official
1062 # recommended layout. For example, the path '/foo/bar/baz.c' on
1063 # branch 'Rel2' would become
1065 # /foo/branches/Rel2/bar/baz.c
1067 # and on trunk it would become
1069 # /foo/trunk/bar/baz.c
1071 # However, we went back to the older and simpler method of just
1072 # prepending the genealogy to the front, instead of interpolating.
1073 # So now we produce:
1075 # /branches/Rel2/foo/bar/baz.c
1076 # /trunk/foo/bar/baz.c
1078 # Why? Well, Jack Repenning pointed out that this way is much
1079 # friendlier to "anonymously rooted subtrees" (that's a tree where
1080 # the name of the top level dir doesn't matter, the point is that if
1081 # you cd into it and, say, run 'make', something good will happen).
1082 # By interpolating, we made it impossible to point cvs2svn at some
1083 # subdir in the CVS repository and convert it as a project, because
1084 # we'd treat every subdir underneath it as an independent project
1085 # root, which is probably not what the user wanted.
1087 # Also, see Blair Zajac's post
1089 # http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
1091 # and the surrounding thread, for why what people really want is a
1092 # way of specifying an in-repository prefix path, not interpolation.
1094 if branch_name:
1095 return _path_join(self.get_branch_path(branch_name), path)
1096 else:
1097 return _path_join(self.trunk_path, path)
1100 class CVSRevision:
1101 def __init__(self, ctx, *args):
1102 """Initialize a new CVSRevision with Ctx object CTX, and ARGS.
1104 If CTX is None, the following members and methods of the
1105 instantiated CVSRevision class object will be unavailable (or
1106 simply will not work correctly, if at all):
1107 cvs_path
1108 svn_path
1109 is_default_branch_revision()
1111 (Note that this class treats CTX as const, because the caller
1112 likely passed in a Borg instance of a Ctx. The reason this class
1113 takes CTX as a parameter, instead of just instantiating a Ctx
1114 itself, is that this class should be usable outside cvs2svn.)
1116 If there is one argument in ARGS, it is a string, in the format of
1117 a line from a revs file. Do *not* include a trailing newline.
1119 If there are multiple ARGS, there must be 17 of them,
1120 comprising a parsed revs line:
1121 timestamp --> (int) date stamp for this cvs revision
1122 digest --> (string) digest of author+logmsg
1123 prev_timestamp --> (int) date stamp for the previous cvs revision
1124 next_timestamp --> (int) date stamp for the next cvs revision
1125 op --> (char) OP_ADD, OP_CHANGE, or OP_DELETE
1126 prev_rev --> (string or None) previous CVS rev, e.g., "1.2"
1127 rev --> (string) this CVS rev, e.g., "1.3"
1128 next_rev --> (string or None) next CVS rev, e.g., "1.4"
1129 file_in_attic --> (char or None) true if RCS file is in Attic
1130 file_executable --> (char or None) true if RCS file has exec bit set.
1131 file_size --> (int) size of the RCS file
1132 deltatext_code --> (char) 'N' if non-empty deltatext, else 'E'
1133 fname --> (string) relative path of file in CVS repos
1134 mode --> (string or None) "kkv", "kb", etc.
1135 branch_name --> (string or None) branch on which this rev occurred
1136 tags --> (list of strings) all tags on this revision
1137 branches --> (list of strings) all branches rooted in this rev
1139 The two forms of initialization are equivalent.
1141 WARNING: Due to the resync process in pass2, prev_timestamp or
1142 next_timestamp may be incorrect in the c-revs or s-revs files."""
1144 self._ctx = ctx
1145 if len(args) == 17:
1146 (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
1147 self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
1148 self.file_executable, self.file_size, self.deltatext_code,
1149 self.fname,
1150 self.mode, self.branch_name, self.tags, self.branches) = args
1151 elif len(args) == 1:
1152 data = args[0].split(' ', 15)
1153 (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
1154 self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
1155 self.file_executable, self.file_size, self.deltatext_code,
1156 self.mode, self.branch_name, numtags, remainder) = data
1157 # Patch up data items which are not simple strings
1158 self.timestamp = int(self.timestamp, 16)
1159 if self.prev_timestamp == "*":
1160 self.prev_timestamp = 0
1161 else:
1162 self.prev_timestamp = int(self.prev_timestamp)
1163 if self.next_timestamp == "*":
1164 self.next_timestamp = 0
1165 else:
1166 self.next_timestamp = int(self.next_timestamp)
1167 if self.prev_rev == "*":
1168 self.prev_rev = None
1169 if self.next_rev == "*":
1170 self.next_rev = None
1171 if self.file_in_attic == "*":
1172 self.file_in_attic = None
1173 if self.file_executable == "*":
1174 self.file_executable = None
1175 self.file_size = int(self.file_size)
1176 if self.mode == "*":
1177 self.mode = None
1178 if self.branch_name == "*":
1179 self.branch_name = None
1180 numtags = int(numtags)
1181 tags_and_numbranches_and_remainder = remainder.split(' ', numtags + 1)
1182 self.tags = tags_and_numbranches_and_remainder[:-2]
1183 numbranches = int(tags_and_numbranches_and_remainder[-2])
1184 remainder = tags_and_numbranches_and_remainder[-1]
1185 branches_and_fname = remainder.split(' ', numbranches)
1186 self.branches = branches_and_fname[:-1]
1187 self.fname = branches_and_fname[-1]
1188 else:
1189 raise TypeError, 'CVSRevision() takes 2 or 18 arguments (%d given)' % \
1190 (len(args) + 1)
1191 if ctx is not None:
1192 self.cvs_path = ctx.project.relative_name(self.fname)
1193 self.svn_path = ctx.project.make_path(self.cvs_path, self.branch_name)
1195 # The 'primary key' of a CVS Revision is the revision number + the
1196 # filename. To provide a unique key (say, for a dict), we just glom
1197 # them together in a string. By passing in self.prev_rev or
1198 # self.next_rev, you can get the unique key for their respective
1199 # CVSRevisions.
1200 def unique_key(self, revnum="0"):
1201 if revnum is "0":
1202 revnum = self.rev
1203 elif revnum is None:
1204 return None
1205 return revnum + "/" + self.fname
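# For example (hypothetical file), calling unique_key() for revision '1.3'
# of 'proj/foo.c,v' yields '1.3/proj/foo.c,v'.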
1207 def __str__(self):
1208 return ('%08lx %s %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s'
1209 % (self.timestamp, self.digest, self.prev_timestamp or "*",
1210 self.next_timestamp or "*", self.op, (self.prev_rev or "*"),
1211 self.rev, (self.next_rev or "*"), (self.file_in_attic or "*"),
1212 (self.file_executable or "*"),
1213 self.file_size,
1214 self.deltatext_code, (self.mode or "*"),
1215 (self.branch_name or "*"),
1216 len(self.tags), self.tags and " " or "", " ".join(self.tags),
1217 len(self.branches), self.branches and " " or "",
1218 " ".join(self.branches),
1219 self.fname, ))
1221 # Returns true if this CVSRevision is the opening CVSRevision for
1222 # NAME (for this RCS file).
1223 def opens_symbolic_name(self, name):
1224 if name in self.tags:
1225 return 1
1226 if name in self.branches:
1227 # If this c_rev opens a branch and our op is OP_DELETE, then
1228 # that means that the file that this c_rev belongs to was
1229 # created on the branch, so for all intents and purposes, this
1230 # c_rev is *technically* not an opening. See Issue #62 for more
1231 # information.
1232 if self.op != OP_DELETE:
1233 return 1
1234 return 0
1236 def is_default_branch_revision(self):
1237 """Return 1 if SELF.rev of SELF.cvs_path is a default branch
1238 revision according to DEFAULT_BRANCHES_DB (see the conditions
1239 documented there), else return None."""
1240 if self._ctx._default_branches_db.has_key(self.cvs_path):
1241 val = self._ctx._default_branches_db[self.cvs_path]
1242 val_last_dot = val.rindex(".")
1243 our_last_dot = self.rev.rindex(".")
1244 default_branch = val[:val_last_dot]
1245 our_branch = self.rev[:our_last_dot]
1246 default_rev_component = int(val[val_last_dot + 1:])
1247 our_rev_component = int(self.rev[our_last_dot + 1:])
1248 if (default_branch == our_branch
1249 and our_rev_component <= default_rev_component):
1250 return 1
1251 # else
1252 return None
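# For example (hypothetical data): if DEFAULT_BRANCHES_DB maps this file's
# cvs_path to '1.1.1.96', then is_default_branch_revision() returns 1 for
# revision 1.1.1.50 (same branch, 50 <= 96) and None for 1.1.1.97 or 1.2.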
1254 def rcs_path(self):
1255 """Returns the actual filesystem path to the RCS file of this
1256 CVSRevision."""
1257 if self.file_in_attic is None:
1258 return self.fname
1259 else:
1260 basepath, filename = os.path.split(self.fname)
1261 return os.path.join(basepath, 'Attic', filename)
1263 def filename(self):
1264 "Return the last path component of self.fname, minus the ',v'"
1265 return os.path.split(self.fname)[-1][:-2]
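# For example (hypothetical file), with self.fname == 'proj/foo.c,v',
# rcs_path() returns 'proj/foo.c,v' (or 'proj/Attic/foo.c,v' if the file is
# in the Attic, assuming '/' as os.sep) and filename() returns 'foo.c'.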
1267 class SymbolDatabase:
1268 """This database records information on all symbols in the RCS
1269 files. It is created in pass 1 and it is used in pass 2."""
1270 def __init__(self):
1271 # A hash that maps tag names to commit counts
1272 self.tags = { }
1273 # A hash that maps branch names to lists of the format
1274 # [ create_count, commit_count, blockers ], where blockers
1275 # is a hash that lists the symbols that depend on the
1276 # branch. The blockers hash is used as a set, so the
1277 # values are not used.
1278 self.branches = { }
1280 def register_tag_creation(self, name):
1281 """Register the creation of the tag NAME."""
1282 if not self.tags.has_key(name):
1283 self.tags[name] = 0
1284 self.tags[name] += 1
1286 def _branch(self, name):
1287 """Helper function to get a branch node that will create and
1288 initialize the node if it does not exist."""
1289 if not self.branches.has_key(name):
1290 self.branches[name] = [ 0, 0, { } ]
1291 return self.branches[name]
1293 def register_branch_creation(self, name):
1294 """Register the creation of the branch NAME."""
1295 self._branch(name)[0] += 1
1297 def register_branch_commit(self, name):
1298 """Register a commit on the branch NAME."""
1299 self._branch(name)[1] += 1
1301 def register_branch_blocker(self, name, blocker):
1302 """Register BLOCKER as a blocker on the branch NAME."""
1303 self._branch(name)[2][blocker] = None
1305 def branch_has_commit(self, name):
1306 """Return non-zero if NAME has commits. Returns 0 if name
1307 is not a branch or if it has no commits."""
1308 return self.branches.has_key(name) and self.branches[name][1]
1310 def find_excluded_symbols(self, regexp_list):
1311 """Returns a hash of all symbols thaht match the regexps in
1312 REGEXP_LISTE. The hash is used as a set so the values are
1313 not used."""
1314 excludes = { }
1315 for tag in self.tags.keys():
1316 if match_regexp_list(regexp_list, tag):
1317 excludes[tag] = None
1318 for branch in self.branches.keys():
1319 if match_regexp_list(regexp_list, branch):
1320 excludes[branch] = None
1321 return excludes
1323 def find_branch_exclude_blockers(self, branch, excludes):
1324 """Find all blockers of BRANCH, excluding the ones in the hash
1325 EXCLUDES."""
1326 blockers = { }
1327 if excludes.has_key(branch):
1328 for blocker in self.branches[branch][2]:
1329 if not excludes.has_key(blocker):
1330 blockers[blocker] = None
1331 return blockers
1333 def find_blocked_excludes(self, excludes):
1334 """Find all branches not in EXCLUDES that have blocking symbols that
1335 are not themselves excluded. Return a hash that maps branch names
1336 to a hash of blockers. The hash of blockers is used as a set so the
1337 values are not used."""
1338 blocked_branches = { }
1339 for branch in self.branches.keys():
1340 blockers = self.find_branch_exclude_blockers(branch, excludes)
1341 if blockers:
1342 blocked_branches[branch] = blockers
1343 return blocked_branches
1345 def find_mismatches(self, excludes=None):
1346 """Find all symbols that are defined as both tags and branches,
1347 excluding the ones in EXCLUDES. Returns a list of 4-tuples with
1348 the symbol name, tag count, branch count and commit count."""
1349 if excludes is None:
1350 excludes = { }
1351 mismatches = [ ]
1352 for branch in self.branches.keys():
1353 if not excludes.has_key(branch) and self.tags.has_key(branch):
1354 mismatches.append((branch, # name
1355 self.tags[branch], # tag count
1356 self.branches[branch][0], # branch count
1357 self.branches[branch][1])) # commit count
1358 return mismatches
1360 def read(self):
1361 """Read the symbol database from files."""
1362 f = open(temp(TAGS_LIST))
1363 while 1:
1364 line = f.readline()
1365 if not line:
1366 break
1367 tag, count = line.split()
1368 self.tags[tag] = int(count)
1370 f = open(temp(BRANCHES_LIST))
1371 while 1:
1372 line = f.readline()
1373 if not line:
1374 break
1375 words = line.split()
1376 self.branches[words[0]] = [ int(words[1]), int(words[2]), { } ]
1377 for blocker in words[3:]:
1378 self.branches[words[0]][2][blocker] = None
1380 def write(self):
1381 """Store the symbol database to files."""
1382 f = open(temp(TAGS_LIST), "w")
1383 Cleanup().register(temp(TAGS_LIST), pass2)
1384 for tag, count in self.tags.items():
1385 f.write("%s %d\n" % (tag, count))
1387 f = open(temp(BRANCHES_LIST), "w")
1388 Cleanup().register(temp(BRANCHES_LIST), pass2)
1389 for branch, info in self.branches.items():
1390 f.write("%s %d %d" % (branch, info[0], info[1]))
1391 if info[2]:
1392 f.write(" ")
1393 f.write(" ".join(info[2].keys()))
1394 f.write("\n")
1396 class CollectData(cvs2svn_rcsparse.Sink):
1397 def __init__(self):
1398 self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
1399 Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
1400 self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
1401 Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
1402 self.default_branches_db = Database(temp(DEFAULT_BRANCHES_DB),
1403 DB_OPEN_NEW)
1404 Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
1405 self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
1406 Cleanup().register(temp(METADATA_DB), pass8)
1407 self.fatal_errors = []
1408 self.num_files = 0
1409 self.symbol_db = SymbolDatabase()
1411 # 1 if we've collected data for at least one file, None otherwise.
1412 self.found_valid_file = None
1414 # See set_fname() for initializations of other variables.
1416 def set_fname(self, canonical_name, filename):
1417 """Prepare to receive data for FILENAME. FILENAME is the absolute
1418 filesystem path to the file in question, and CANONICAL_NAME is
1419 FILENAME with the 'Attic' component removed (if the file is indeed
1420 in the Attic)."""
1421 self.fname = canonical_name
1423 # We calculate and save some file metadata here, where we can do
1424 # it only once per file, instead of waiting until later where we
1425 # would have to do the same calculations once per CVS *revision*.
1427 self.rel_name = Ctx().project.relative_name(self.fname)
1429 # If the paths are not the same, then that means that the
1430 # canonical_name has had the 'Attic' component stripped out.
1431 self.file_in_attic = None
1432 if not canonical_name == filename:
1433 self.file_in_attic = 1
1435 file_stat = os.stat(filename)
1436 # The size of our file in bytes
1437 self.file_size = file_stat[stat.ST_SIZE]
1439 # Whether or not the executable bit is set.
1440 self.file_executable = None
1441 if file_stat[0] & stat.S_IXUSR:
1442 self.file_executable = 1
1444 # revision -> [timestamp, author, old-timestamp]
1445 self.rev_data = { }
1447 # Maps revision number (key) to the revision number of the
1448 # previous revision along this line of development.
1450 # For the first revision R on a branch, we consider the revision
1451 # from which R sprouted to be the 'previous'.
1453 # Note that this revision can't be determined arithmetically (due
1454 # to cvsadmin -o, which is why this is necessary).
1456 # If the key has no previous revision, then store None as key's
1457 # value.
1458 self.prev_rev = { }
1460 # This dict is essentially self.prev_rev with the values mapped in
1461 # the other direction, so following key -> value will yield you
1462 # the next revision number.
1464 # Unlike self.prev_rev, if the key has no next revision, then the
1465 # key is not present.
1466 self.next_rev = { }
1468 # Track the state of each revision so that in set_revision_info,
1469 # we can determine if our op is an add/change/delete. We can do
1470 # this because in set_revision_info, we'll have all of the
1471 # revisions for a file at our fingertips, and we need to examine
1472 # the state of our prev_rev to determine if we're an add or a
1473 # change--without the state of the prev_rev, we are unable to
1474 # distinguish between an add and a change.
1475 self.rev_state = { }
1477 # Hash mapping branch numbers, like '1.7.2', to branch names,
1478 # like 'Release_1_0_dev'.
1479 self.branch_names = { }
1481 # RCS flags (used for keyword expansion).
1482 self.mode = None
1484 # Hash mapping revision numbers, like '1.7', to lists of names
1485 # indicating which branches sprout from that revision, like
1486 # ['Release_1_0_dev', 'experimental_driver', ...].
1487 self.branchlist = { }
1489 # Like self.branchlist, but the values are lists of tag names that
1490 # apply to the key revision.
1491 self.taglist = { }
1493 # If set, this is an RCS branch number -- rcsparse calls this the
1494 # "principal branch", but CVS and RCS refer to it as the "default
1495 # branch", so that's what we call it, even though the rcsparse API
1496 # setter method is still 'set_principal_branch'.
1497 self.default_branch = None
1499 # If the RCS file doesn't have a default branch anymore, but does
1500 # have vendor revisions, then we make an educated guess that those
1501 # revisions *were* the head of the default branch up until the
1502 # commit of 1.2, at which point the file's default branch became
1503 # trunk. This records the date at which 1.2 was committed.
1504 self.first_non_vendor_revision_date = None
1506 # A list of all symbols defined for the current file. Used to
1507 # prevent multiple definitions of a symbol, something which can
1508 # easily happen when --symbol-transform is used.
1509 self.defined_symbols = { }
1511 def set_principal_branch(self, branch):
1512 self.default_branch = branch
1514 def set_expansion(self, mode):
1515 self.mode = mode
1517 def set_branch_name(self, branch_number, name):
1518 """Record that BRANCH_NUMBER is the branch number for branch NAME,
1519 and that NAME sprouts from BRANCH_NUMBER.
1520 BRANCH_NUMBER is an RCS branch number with an odd number of components,
1521 for example '1.7.2' (never '1.7.0.2')."""
1522 if not self.branch_names.has_key(branch_number):
1523 self.branch_names[branch_number] = name
1524 # The branchlist is keyed on the revision number from which the
1525 # branch sprouts, so strip off the odd final component.
1526 sprout_rev = branch_number[:branch_number.rfind(".")]
1527 if not self.branchlist.has_key(sprout_rev):
1528 self.branchlist[sprout_rev] = []
1529 self.branchlist[sprout_rev].append(name)
1530 self.symbol_db.register_branch_creation(name)
1531 else:
1532 sys.stderr.write("%s: in '%s':\n"
1533 " branch '%s' already has name '%s',\n"
1534 " cannot also have name '%s', ignoring the latter\n"
1535 % (warning_prefix, self.fname, branch_number,
1536 self.branch_names[branch_number], name))
1538 def rev_to_branch_name(self, revision):
1539 """Return the name of the branch on which REVISION lies.
1540 REVISION is a non-branch revision number with an even number of
1541 components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1542 For the convenience of callers, REVISION can also be a trunk
1543 revision such as '1.2', in which case just return None."""
1544 if trunk_rev.match(revision):
1545 return None
1546 return self.branch_names.get(revision[:revision.rindex(".")])
1548 def add_cvs_branch(self, revision, branch_name):
1549 """Record the root revision and branch revision for BRANCH_NAME,
1550 based on REVISION. REVISION is a CVS branch number having an even
1551 number of components where the second-to-last is '0'. For
1552 example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
1553 from 1.7 and has branch number 1.7.2."""
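# An illustrative trace of the slicing below, using the docstring's
# example branch number (a sketch only, no new behavior):
#   revision   = '1.7.0.2'
#   branch_rev = revision[:last_dot]                       # '1.7.0'
#   branch_rev = branch_rev[:last2_dot] + revision[last_dot:]   # '1.7.2'
#   ...so BRANCH_NAME is recorded as branch '1.7.2', sprouting from '1.7'.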
1554 last_dot = revision.rfind(".")
1555 branch_rev = revision[:last_dot]
1556 last2_dot = branch_rev.rfind(".")
1557 branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
1558 self.set_branch_name(branch_rev, branch_name)
1560 def define_tag(self, name, revision):
1561 """Record a bidirectional mapping between symbolic NAME and REVISION.
1562 REVISION is an unprocessed revision number from the RCS file's
1563 header, for example: '1.7', '1.7.0.2', '1.1.1', or '1.1.1.1'.
1564 This function will determine what kind of symbolic name it is by
1565 inspection, and record it in the right places."""
1566 for (pattern, replacement) in Ctx().symbol_transforms:
1567 newname = pattern.sub(replacement, name)
1568 if newname != name:
1569 Log().write(LOG_WARN, " symbol '%s' transformed to '%s'"
1570 % (name, newname))
1571 name = newname
1572 if self.defined_symbols.has_key(name):
1573 err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
1574 % (error_prefix, name, self.fname)
1575 sys.stderr.write(err + "\n")
1576 self.fatal_errors.append(err)
1577 self.defined_symbols[name] = None
1578 if branch_tag.match(revision):
1579 self.add_cvs_branch(revision, name)
1580 elif vendor_tag.match(revision):
1581 self.set_branch_name(revision, name)
1582 else:
1583 if not self.taglist.has_key(revision):
1584 self.taglist[revision] = []
1585 self.taglist[revision].append(name)
1586 self.symbol_db.register_tag_creation(name)
1588 def define_revision(self, revision, timestamp, author, state,
1589 branches, next):
1591 # Record the state of our revision for later calculations
1592 self.rev_state[revision] = state
1594 # store the rev_data as a list in case we have to jigger the timestamp
1595 self.rev_data[revision] = [int(timestamp), author, None]
1597 # When on trunk, the RCS 'next' revision number points to what
1598 # humans might consider to be the 'previous' revision number. For
1599 # example, 1.3's RCS 'next' is 1.2.
1601 # However, on a branch, the RCS 'next' revision number really does
1602 # point to what humans would consider to be the 'next' revision
1603 # number. For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
1605 # In other words, in RCS, 'next' always means "where to find the next
1606 # deltatext that you need this revision to retrieve".
1608 # That said, we don't *want* RCS's behavior here, so we determine
1609 # whether we're on trunk or a branch and set self.prev_rev
1610 # accordingly.
1612 # One last thing. Note that if REVISION is a branch revision,
1613 # instead of mapping REVISION to NEXT, we instead map NEXT to
1614 # REVISION. Since we loop over all revisions in the file before
1615 # doing anything with the data we gather here, this 'reverse
1616 # assignment' effectively does the following:
1618 # 1. Gives us no 'prev' value for REVISION (in this
1619 # iteration... it may have been set in a previous iteration)
1621 # 2. Sets the 'prev' value for the revision with number NEXT to
1622 # REVISION. So when we come around to the branch revision whose
1623 # revision value is NEXT, its 'prev' and 'prev_rev' are already
1624 # set.
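# For instance (using the revision numbers from the comment above): on
# trunk, revision '1.3' has RCS 'next' == '1.2', so the code below
# records prev_rev['1.3'] = '1.2' and next_rev['1.2'] = '1.3'.  On a
# branch, revision '1.1.2.1' has RCS 'next' == '1.1.2.2', so it records
# prev_rev['1.1.2.2'] = '1.1.2.1' and next_rev['1.1.2.1'] = '1.1.2.2'.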
1625 if trunk_rev.match(revision):
1626 self.prev_rev[revision] = next
1627 self.next_rev[next] = revision
1628 elif next:
1629 self.prev_rev[next] = revision
1630 self.next_rev[revision] = next
1632 for b in branches:
1633 self.prev_rev[b] = revision
1635 # Ratchet up the highest vendor head revision, if necessary.
1636 if self.default_branch:
1637 default_branch_root = self.default_branch + "."
1638 if ((revision.find(default_branch_root) == 0)
1639 and (default_branch_root.count('.') == revision.count('.'))):
1640 # This revision is on the default branch, so record that it is
1641 # the new highest default branch head revision.
1642 self.default_branches_db[self.rel_name] = revision
1643 else:
1644 # No default branch, so make an educated guess.
1645 if revision == '1.2':
1646 # This is probably the time when the file stopped having a
1647 # default branch, so make a note of it.
1648 self.first_non_vendor_revision_date = timestamp
1649 else:
1650 m = vendor_revision.match(revision)
1651 if m and ((not self.first_non_vendor_revision_date)
1652 or (timestamp < self.first_non_vendor_revision_date)):
1653 # We're looking at a vendor revision, and it wasn't
1654 # committed after this file lost its default branch, so bump
1655 # the maximum trunk vendor revision in the permanent record.
1656 self.default_branches_db[self.rel_name] = revision
1658 if not trunk_rev.match(revision):
1659 # Check for unlabeled branches, record them. We tried to collect
1660 # all branch names when we parsed the symbolic name header
1661 # earlier, of course, but that didn't catch unlabeled branches.
1662 # If a branch is unlabeled, this is our first encounter with it,
1663 # so we have to record its data now.
1664 branch_number = revision[:revision.rindex(".")]
1665 if not self.branch_names.has_key(branch_number):
1666 branch_name = "unlabeled-" + branch_number
1667 self.set_branch_name(branch_number, branch_name)
1669 # Register the commit on this non-trunk branch
1670 branch_name = self.branch_names[branch_number]
1671 self.symbol_db.register_branch_commit(branch_name)
1673 def tree_completed(self):
1674 "The revision tree has been parsed. Analyze it for consistency."
1676 # Our algorithm depends upon the timestamps on the revisions occurring
1677 # monotonically over time. That is, we want to see rev 1.34 occur in
1678 # time before rev 1.35. If we inserted 1.35 *first* (due to the time-
1679 # sorting), and then tried to insert 1.34, we'd be screwed.
1681 # To perform the analysis, we'll simply visit all of the 'previous'
1682 # links that we have recorded and validate that the timestamp on the
1683 # previous revision is before the specified revision.
1685 # If we have to resync some nodes, then we restart the scan. Just keep
1686 # looping as long as we need to restart.
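# A small worked example (hypothetical timestamps): if previous rev 1.34
# carries t_p == 1000 and current rev 1.35 carries t_c == 990, the loop
# below rewrites 1.34's timestamp to t_c - 1 == 989 (remembering 1000 as
# the old timestamp), then keeps walking backwards in case 1.33 now also
# violates the ordering.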
1687 while 1:
1688 for current, prev in self.prev_rev.items():
1689 if not prev:
1690 # no previous revision exists (i.e. the initial revision)
1691 continue
1692 t_c = self.rev_data[current][0]
1693 t_p = self.rev_data[prev][0]
1694 if t_p >= t_c:
1695 # the previous revision occurred later than the current revision.
1696 # shove the previous revision back in time (and any before it that
1697 # may need to shift).
1699 # We sync backwards and not forwards because any given CVS
1700 # Revision has only one previous revision. However, a CVS
1701 # Revision can *be* a previous revision for many other
1702 # revisions (e.g., a revision that is the source of multiple
1703 # branches). This becomes relevant when we do the secondary
1704 # synchronization in pass 2--we can make certain that we
1705 # don't resync a revision earlier than its previous
1706 # revision, but it would be non-trivial to make sure that we
1707 # don't resync revision R *after* any revisions that have R
1708 # as a previous revision.
1709 while t_p >= t_c:
1710 self.rev_data[prev][0] = t_c - 1 # new timestamp
1711 self.rev_data[prev][2] = t_p # old timestamp
1712 delta = t_c - 1 - t_p
1713 msg = "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
1714 % (self.rel_name,
1715 prev, time.ctime(t_p), delta)
1716 Log().write(LOG_VERBOSE, msg)
1717 if (delta > COMMIT_THRESHOLD
1718 or delta < (COMMIT_THRESHOLD * -1)):
1719 fmt = "%s: Significant timestamp change for '%s' (%d seconds)"
1720 Log().write(LOG_WARN, fmt % (warning_prefix, self.rel_name,
1721 delta))
1722 current = prev
1723 prev = self.prev_rev[current]
1724 if not prev:
1725 break
1726 t_c = t_c - 1 # self.rev_data[current][0]
1727 t_p = self.rev_data[prev][0]
1729 # break from the for-loop
1730 break
1731 else:
1732 # finished the for-loop (no resyncing was performed)
1733 return
1735 def set_revision_info(self, revision, log, text):
1736 timestamp, author, old_ts = self.rev_data[revision]
1737 digest = sha.new(log + '\0' + author).hexdigest()
1738 if old_ts:
1739 # The timestamp on this revision was changed. Log it for later
1740 # resynchronization of other files' revisions that occurred
1741 # for this time and log message.
1742 self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
1744 # "...Give back one kadam to honor the Hebrew God whose Ark this is."
1745 # -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
1747 # If revision 1.1 appears to have been created via 'cvs add'
1748 # instead of 'cvs import', then this file probably never had a
1749 # default branch, so retroactively remove its record in the
1750 # default branches db. The test is that the log message CVS uses
1751 # for 1.1 in imports is "Initial revision\n" with no period.
1752 if revision == '1.1' and log != 'Initial revision\n':
1753 if self.default_branches_db.has_key(self.rel_name):
1754 del self.default_branches_db[self.rel_name]
1756 # Get the timestamps of the previous and next revisions
1757 prev_rev = self.prev_rev[revision]
1758 prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])
1760 next_rev = self.next_rev.get(revision)
1761 next_timestamp, ign, ign = self.rev_data.get(next_rev, [0, None, None])
1763 # How to tell if a CVSRevision is an add, a change, or a deletion:
1765 # It's a delete if RCS state is 'dead'
1767 # It's an add if RCS state is 'Exp' and
1768 # - we either have no previous revision
1769 # or
1770 # - we have a previous revision whose state is 'dead'
1772 # Anything else is a change.
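# Concretely (hypothetical file): 1.1 in state 'Exp' with no previous
# revision -> OP_ADD; 1.2 'dead' -> OP_DELETE; 1.3 'Exp' following the
# dead 1.2 -> OP_ADD again; 1.4 'Exp' following the live 1.3 -> OP_CHANGE.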
1773 if self.rev_state[revision] == 'dead':
1774 op = OP_DELETE
1775 elif ((self.prev_rev.get(revision, None) is None)
1776 or (self.rev_state[self.prev_rev[revision]] == 'dead')):
1777 op = OP_ADD
1778 else:
1779 op = OP_CHANGE
1781 def is_branch_revision(rev):
1782 """Return True if this revision is not a trunk revision,
1783 else return False."""
1784 if rev.count('.') >= 3:
1785 return True
1786 return False
1788 def is_same_line_of_development(rev1, rev2):
1789 """Return True if rev1 and rev2 are on the same line of
1790 development (i.e., both on trunk, or both on the same branch);
1791 return False otherwise. Either rev1 or rev2 can be None, in
1792 which case automatically return False."""
1793 if rev1 is None or rev2 is None:
1794 return False
1795 if rev1.count('.') == 1 and rev2.count('.') == 1:
1796 return True
1797 if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
1798 return True
1799 return False
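# A few illustrative calls to the two helpers above (revision numbers
# made up for the example):
#   is_branch_revision('1.7')                           =>  False  (trunk)
#   is_branch_revision('1.7.2.1')                       =>  True   (branch)
#   is_same_line_of_development('1.6', '1.7')           =>  True   (both trunk)
#   is_same_line_of_development('1.7.2.1', '1.7.2.2')   =>  True   (same branch)
#   is_same_line_of_development('1.7', '1.7.2.1')       =>  False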
1801 # There can be an odd situation where the tip revision of a branch
1802 # is alive, but every predecessor on the branch is in state 'dead',
1803 # yet the revision from which the branch sprouts is alive. (This
1804 # is sort of a mirror image of the more common case of adding a
1805 # file on a branch, in which the first revision on the branch is
1806 # alive while the revision from which it sprouts is dead.)
1808 # In this odd situation, we must mark the first live revision on
1809 # the branch as an OP_CHANGE instead of an OP_ADD, because it
1810 # reflects, however indirectly, a change w.r.t. the source
1811 # revision from which the branch sprouts.
1813 # This is issue #89.
1814 cur_num = revision
1815 if (is_branch_revision(revision)
1816 and not self.rev_state[revision] == 'dead'):
1817 while 1:
1818 prev_num = self.prev_rev.get(cur_num, None)
1819 if not cur_num or not prev_num:
1820 break
1821 if (not is_same_line_of_development(cur_num, prev_num)
1822 and self.rev_state[cur_num] == 'dead'
1823 and not self.rev_state[prev_num] == 'dead'):
1824 op = OP_CHANGE
1825 cur_num = self.prev_rev.get(cur_num, None)
1827 if text:
1828 deltatext_code = DELTATEXT_NONEMPTY
1829 else:
1830 deltatext_code = DELTATEXT_EMPTY
1832 c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp,
1833 next_timestamp, op,
1834 prev_rev, revision, next_rev,
1835 self.file_in_attic, self.file_executable,
1836 self.file_size,
1837 deltatext_code, self.fname,
1838 self.mode, self.rev_to_branch_name(revision),
1839 self.taglist.get(revision, []),
1840 self.branchlist.get(revision, []))
1841 self.revs.write(str(c_rev) + "\n")
1842 StatsKeeper().record_c_rev(c_rev)
1844 if not self.metadata_db.has_key(digest):
1845 self.metadata_db[digest] = (author, log)
1847 def parse_completed(self):
1848 # Walk through all branches and tags and register them with
1849 # their parent branch in the symbol database.
1850 for revision, symbols in self.taglist.items() + self.branchlist.items():
1851 for symbol in symbols:
1852 name = self.rev_to_branch_name(revision)
1853 if name is not None:
1854 self.symbol_db.register_branch_blocker(name, symbol)
1856 self.num_files = self.num_files + 1
1858 def write_symbol_db(self):
1859 self.symbol_db.write()
1861 class SymbolingsLogger:
1862 """Manage the file that contains lines for symbol openings and
1863 closings.
1865 This data will later be used to determine valid SVNRevision ranges
1866 from which a file can be copied when creating a branch or tag in
1867 Subversion. Do this by finding "Openings" and "Closings" for each
1868 file copied onto a branch or tag.
1870 An "Opening" is the CVSRevision from which a given branch/tag
1871 sprouts on a path.
1873 The "Closing" for that branch/tag and path is the next CVSRevision
1874 on the same line of development as the opening.
1876 For example, on file 'foo.c', branch BEE has branch number 1.2.2 and
1877 obviously sprouts from revision 1.2. Therefore, 1.2 is the opening
1878 for BEE on path 'foo.c', and 1.3 is the closing for BEE on path
1879 'foo.c'. Note that there may be many revisions chronologically
1880 between 1.2 and 1.3, for example, revisions on branches of 'foo.c',
1881 perhaps even including on branch BEE itself. But 1.3 is the next
1882 revision *on the same line* as 1.2; that is why it is the closing
1883 revision for those symbolic names of which 1.2 is the opening.
1885 The reason for doing all this hullabaloo is to make branch and tag
1886 creation as efficient as possible by minimizing the number of copies
1887 and deletes per creation. For example, revisions 1.2 and 1.3 of
1888 foo.c might correspond to revisions 17 and 30 in Subversion. That
1889 means that when creating branch BEE, there is some motivation to do
1890 the copy from one of 17-30. Now if there were another file,
1891 'bar.c', whose opening and closing CVSRevisions for BEE corresponded
1892 to revisions 24 and 39 in Subversion, we would know that the ideal
1893 thing would be to copy the branch from somewhere between 24 and 29,
1894 inclusive.
1895 """
1896 def __init__(self):
1897 self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
1898 Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
1899 self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
1900 Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)
1902 # The keys of this dictionary are *source* cvs_paths for which
1903 # we've encountered an 'opening' on the default branch. The
1904 # values are the (uncleaned) symbolic names that this path has
1905 # opened.
1906 self.open_paths_with_default_branches = { }
1908 def log_revision(self, c_rev, svn_revnum):
1909 """Log any openings found in C_REV, and if C_REV.next_rev is not
1910 None, a closing. The opening uses SVN_REVNUM, but the closing (if
1911 any) will have its revnum determined later."""
1912 for name in c_rev.tags + c_rev.branches:
1913 self._note_default_branch_opening(c_rev, name)
1914 if c_rev.op != OP_DELETE:
1915 self._log(name, svn_revnum,
1916 c_rev.cvs_path, c_rev.branch_name, OPENING)
1918 # If our c_rev has a next_rev, then that's the closing rev for
1919 # this source revision. Log it to closings for later processing
1920 # since we don't know the svn_revnum yet.
1921 if c_rev.next_rev is not None:
1922 self.closings.write('%s %s\n' %
1923 (name, c_rev.unique_key(c_rev.next_rev)))
1925 def _log(self, name, svn_revnum, cvs_path, branch_name, type):
1926 """Write out a single line to the symbol_openings_closings file
1927 representing that SVN_REVNUM of CVS_PATH on BRANCH_NAME is either the
1928 opening or closing (TYPE) of NAME (a symbolic name).
1930 TYPE should only be one of the following global constants:
1931 OPENING or CLOSING."""
1932 # 8 places gives us 999,999,999 SVN revs. That *should* be enough.
1933 self.symbolings.write(
1934 '%s %.8d %s %s %s\n'
1935 % (name, svn_revnum, type, branch_name or '*', cvs_path))
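# As a hypothetical example, an opening for branch 'BEE' of 'proj/foo.c'
# at Subversion r21 would produce a line of the form
#
#   BEE 00000021 <OPENING-constant> * proj/foo.c
#
# where '*' stands in for "no branch name", i.e. the source revision
# lives on trunk.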
1937 def close(self):
1938 """Iterate through the closings file, lookup the svn_revnum for
1939 each closing CVSRevision, and write a proper line out to the
1940 symbolings file."""
1941 # Use this to get the c_rev of our rev_key
1942 cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)
1944 self.closings.close()
1945 for line in fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP)):
1946 (name, rev_key) = line.rstrip().split(" ", 1)
1947 svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)
1949 c_rev = cvs_revs_db.get_revision(rev_key)
1950 self._log(name, svn_revnum, c_rev.cvs_path, c_rev.branch_name, CLOSING)
1952 self.symbolings.close()
1954 def _note_default_branch_opening(self, c_rev, symbolic_name):
1955 """If C_REV is a default branch revision, log C_REV.cvs_path as an
1956 opening for SYMBOLIC_NAME."""
1957 path = c_rev.cvs_path
1958 if not self.open_paths_with_default_branches.has_key(path):
1959 self.open_paths_with_default_branches[path] = [ ]
1960 self.open_paths_with_default_branches[path].append(symbolic_name)
1962 def log_default_branch_closing(self, c_rev, svn_revnum):
1963 """If self.open_paths_with_default_branches contains
1964 C_REV.cvs_path, then log each name in
1965 self.open_paths_with_default_branches[C_REV.cvs_path] as a closing
1966 with SVN_REVNUM as the closing revision number."""
1967 path = c_rev.cvs_path
1968 if self.open_paths_with_default_branches.has_key(path):
1969 # log each symbol as a closing
1970 for name in self.open_paths_with_default_branches[path]:
1971 self._log(name, svn_revnum, path, None, CLOSING)
1972 # Remove them from the openings list as we're done with them.
1973 del self.open_paths_with_default_branches[path]
1976 class PersistenceManager:
1977 """The PersistenceManager allows us to effectively store SVNCommits
1978 to disk and retrieve them later using only their subversion revision
1979 number as the key. It also returns the subversion revision number
1980 for a given CVSRevision's unique key.
1982 All information pertinent to each SVNCommit is stored in a series of
1983 on-disk databases so that SVNCommits can be retrieved on-demand.
1985 MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
1986 In 'new' mode, PersistenceManager will initialize a new set of on-disk
1987 databases and be fully-featured.
1988 In 'read' mode, PersistenceManager will open existing on-disk databases
1989 and the set_* methods will be unavailable."""
1990 def __init__(self, mode):
1991 self.mode = mode
1992 if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
1993 raise RuntimeError, "Invalid 'mode' argument to PersistenceManager"
1994 self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
1995 Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
1996 self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
1997 Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
1998 self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
1999 Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
2000 self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
2001 self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
2002 ###PERF kff Elsewhere there are comments about sucking the tags db
2003 ### into memory. That seems like a good idea.
2004 if not Ctx().trunk_only:
2005 self.tags_db = TagsDatabase(DB_OPEN_READ)
2006 self.motivating_revnums = Database(temp(MOTIVATING_REVNUMS), mode)
2007 Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)
2009 # "branch_name" -> svn_revnum in which branch was last filled.
2010 # This is used by CVSCommit._pre_commit, to prevent creating a fill
2011 # revision which would have nothing to do.
2012 self.last_filled = {}
2014 def get_svn_revnum(self, cvs_rev_unique_key):
2015 """Return the Subversion revision number in which
2016 CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
2017 is no mapping for CVS_REV_UNIQUE_KEY."""
2018 return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))
2020 def get_svn_commit(self, svn_revnum):
2021 """Return an SVNCommit that corresponds to SVN_REVNUM.
2023 If no SVNCommit exists for revnum SVN_REVNUM, then return None.
2025 This method can throw SVNCommitInternalInconsistencyError.
2026 """
2027 svn_commit = SVNCommit("Retrieved from disk", svn_revnum)
2028 c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
2029 if c_rev_keys is None:
2030 return None
2032 digest = None
2033 for key in c_rev_keys:
2034 c_rev = self.cvs_revisions.get_revision(key)
2035 svn_commit.add_revision(c_rev)
2036 # Set the author and log message for this commit by using
2037 # CVSRevision metadata, but only if we haven't done so already.
2038 if digest is None:
2039 digest = c_rev.digest
2040 author, log_msg = self.svn_commit_metadata[digest]
2041 svn_commit.set_author(author)
2042 svn_commit.set_log_msg(log_msg)
2044 # If we're doing a trunk-only conversion, we don't need to do any more
2045 # work.
2046 if Ctx().trunk_only:
2047 return svn_commit
2049 name, date = self._get_name_and_date(svn_revnum)
2050 if name:
2051 svn_commit.set_symbolic_name(name)
2052 svn_commit.set_date(date)
2053 if self.tags_db.has_key(name):
2054 svn_commit.is_tag = 1
2056 motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
2057 if motivating_revnum:
2058 svn_commit.set_motivating_revnum(int(motivating_revnum))
2059 svn_commit.set_date(date)
2061 if len(svn_commit.cvs_revs) and name:
2062 raise SVNCommit.SVNCommitInternalInconsistencyError(
2063 "An SVNCommit cannot have cvs_revisions *and* a corresponding\n"
2064 "symbolic name ('%s') to fill."
2065 % (_clean_symbolic_name(name),))
2067 return svn_commit
2069 def set_cvs_revs(self, svn_revnum, cvs_revs):
2070 """Record the bidirectional mapping between SVN_REVNUM and
2071 CVS_REVS."""
2072 if self.mode == DB_OPEN_READ:
2073 raise RuntimeError, \
2074 'Write operation attempted on read-only PersistenceManager'
2075 for c_rev in cvs_revs:
2076 Log().write(LOG_VERBOSE, " ", c_rev.unique_key())
2077 self.svn2cvs_db[str(svn_revnum)] = [x.unique_key() for x in cvs_revs]
2078 for c_rev in cvs_revs:
2079 self.cvs2svn_db[c_rev.unique_key()] = svn_revnum
2081 def set_name_and_date(self, svn_revnum, name, date):
2082 """Associate symbolic name NAME and DATE with SVN_REVNUM.
2084 NAME is allowed to be None."""
2086 if self.mode == DB_OPEN_READ:
2087 raise RuntimeError, \
2088 'Write operation attempted on read-only PersistenceManager'
2089 self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
2090 self.last_filled[name] = svn_revnum
2092 def _get_name_and_date(self, svn_revnum):
2093 """Return a tuple containing the symbolic name and date associated
2094 with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
2095 associated with it."""
2096 return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))
2098 def set_motivating_revnum(self, svn_revnum, motivating_revnum):
2099 """Store MOTIVATING_REVNUM as the value of SVN_REVNUM"""
2100 if self.mode == DB_OPEN_READ:
2101 raise RuntimeError, \
2102 'Write operation attempted on read-only PersistenceManager'
2103 self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
2106 class CVSCommit:
2107 """Each instance of this class contains a number of CVS Revisions
2108 that correspond to one or more Subversion Commits. After all CVS
2109 Revisions are added to the grouping, calling process_revisions will
2110 generate a Subversion Commit (or Commits) for the set of CVS
2111 Revisions in the grouping."""
2113 def __init__(self, digest, author, log):
2114 self.digest = digest
2115 self.author = author
2116 self.log = log
2118 # Symbolic names for which the last source revision has already
2119 # been seen and for which the CVSRevisionAggregator has already
2120 # generated a fill SVNCommit. See self.process_revisions().
2121 self.done_symbols = [ ]
2123 self.files = { }
2124 # Lists of CVSRevisions
2125 self.changes = [ ]
2126 self.deletes = [ ]
2128 # Start out with a t_min higher than any incoming time T, and a
2129 # t_max lower than any incoming T. This way the first T will
2130 # push t_min down to T, and t_max up to T, naturally (without any
2131 # special-casing), and successive times will then ratchet them
2132 # outward as appropriate.
2133 self.t_min = 1L<<32
2134 self.t_max = 0
2136 # This will be set to the SVNCommit that occurs in self._commit.
2137 self.motivating_commit = None
2139 # This is a list of all non-primary commits motivated by the main
2140 # commit. We gather these so that we can set their dates to the
2141 # same date as the primary commit.
2142 self.secondary_commits = [ ]
2144 # State for handling default branches.
2146 # Here is a tempting, but ultimately nugatory, bit of logic, which
2147 # I share with you so you may appreciate the less attractive, but
2148 # refreshingly non-nugatory, logic which follows it:
2150 # If some of the commits in this txn happened on a non-trunk
2151 # default branch, then those files will have to be copied into
2152 # trunk manually after being changed on the branch (because the
2153 # RCS "default branch" appears as head, i.e., trunk, in practice).
2154 # As long as those copies don't overwrite any trunk paths that
2155 # were also changed in this commit, then we can do the copies in
2156 # the same revision, because they won't cover changes that don't
2157 # appear anywhere/anywhen else. However, if some of the trunk dst
2158 # paths *did* change in this commit, then immediately copying the
2159 # branch changes would lose those trunk mods forever. So in this
2160 # case, we need to do at least that copy in its own revision. And
2161 # for simplicity's sake, if we're creating the new revision for
2162 # even one file, then we just do all such copies together in the
2163 # new revision.
2165 # Doesn't that sound nice?
2167 # Unfortunately, Subversion doesn't support copies with sources
2168 # in the current txn. All copies must be based in committed
2169 # revisions. Therefore, we generate the above-described new
2170 # revision unconditionally.
2172 # This is a list of c_revs, and a c_rev is appended for each
2173 # default branch commit that will need to be copied to trunk (or
2174 # deleted from trunk) in some generated revision following the
2175 # "regular" revision.
2176 self.default_branch_cvs_revisions = [ ]
2178 def __cmp__(self, other):
2179 # Commits should be sorted by t_max. If both self and other have
2180 # the same t_max, break the tie using t_min, and lastly, digest
2181 return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
2182 or cmp(self.digest, other.digest))
2184 def has_file(self, fname):
2185 return self.files.has_key(fname)
2187 def revisions(self):
2188 return self.changes + self.deletes
2190 def opens_symbolic_name(self, name):
2191 """Returns true if any CVSRevision in this commit is on a tag or a
2192 branch or is the origin of a tag or branch."""
2193 for c_rev in self.revisions():
2194 if c_rev.opens_symbolic_name(name):
2195 return 1
2196 return 0
2198 def add_revision(self, c_rev):
2199 # Record the time range of this commit.
2201 # ### ISSUE: It's possible, though unlikely, that the time range
2202 # of a commit could get gradually expanded to be arbitrarily
2203 # longer than COMMIT_THRESHOLD. I'm not sure this is a huge
2204 # problem, and anyway deciding where to break it up would be a
2205 # judgement call. For now, we just print a warning in commit() if
2206 # this happens.
2207 if c_rev.timestamp < self.t_min:
2208 self.t_min = c_rev.timestamp
2209 if c_rev.timestamp > self.t_max:
2210 self.t_max = c_rev.timestamp
2212 if c_rev.op == OP_DELETE:
2213 self.deletes.append(c_rev)
2214 else:
2215 # OP_CHANGE or OP_ADD
2216 self.changes.append(c_rev)
2218 self.files[c_rev.fname] = 1
2220 def _pre_commit(self):
2221 """Generates any SVNCommits that must exist before the main
2222 commit."""
2224 # There may be multiple c_revs in this commit that would cause
2225 # branch B to be filled, but we only want to fill B once. On the
2226 # other hand, there might be multiple branches committed on in
2227 # this commit. Whatever the case, we should count exactly one
2228 # commit per branch, because we only fill a branch once per
2229 # CVSCommit. This list tracks which branches we've already
2230 # counted.
2231 accounted_for_sym_names = [ ]
2233 def fill_needed(c_rev, pm):
2234 """Return 1 if this is the first commit on a new branch (for
2235 this file) and we need to fill the branch; else return 0
2236 (meaning that some other file's first commit on the branch has
2237 already done the fill for us).
2239 If C_REV.op is OP_ADD, only return 1 if the branch that this
2240 commit is on has no last filled revision.
2242 PM is a PersistenceManager to query.
2243 """
2245 # Different '.' counts indicate that c_rev is now on a different
2246 # line of development (and may need a fill)
2247 if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
2248 svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
2249 # It should be the case that when we have a file F that
2250 # is added on branch B (thus, F on trunk is in state
2251 # 'dead'), we generate an SVNCommit to fill B iff the branch
2252 # has never been filled before.
2254 # If this c_rev.op == OP_ADD, *and* the branch has never
2255 # been filled before, then fill it now. Otherwise, no need to
2256 # fill it.
2257 if c_rev.op == OP_ADD:
2258 if pm.last_filled.get(c_rev.branch_name, None) is None:
2259 return 1
2260 elif c_rev.op == OP_CHANGE:
2261 if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
2262 return 1
2263 elif c_rev.op == OP_DELETE:
2264 if pm.last_filled.get(c_rev.branch_name, None) is None:
2265 return 1
2266 return 0
2268 for c_rev in self.changes + self.deletes:
2269 # If a commit is on a branch, we must ensure that the branch
2270 # path being committed exists (in HEAD of the Subversion
2271 # repository). If it doesn't exist, we will need to fill the
2272 # branch. After the fill, the path on which we're committing
2273 # will exist.
2274 if c_rev.branch_name \
2275 and c_rev.branch_name not in accounted_for_sym_names \
2276 and c_rev.branch_name not in self.done_symbols \
2277 and fill_needed(c_rev, Ctx()._persistence_manager):
2278 svn_commit = SVNCommit("pre-commit symbolic name '%s'"
2279 % c_rev.branch_name)
2280 svn_commit.set_symbolic_name(c_rev.branch_name)
2281 self.secondary_commits.append(svn_commit)
2282 accounted_for_sym_names.append(c_rev.branch_name)
2284 def _commit(self):
2285 """Generates the primary SVNCommit that corresponds to this
2286 CVSCommit."""
2287 # Generate an SVNCommit unconditionally. Even if the only change
2288 # in this CVSCommit is a deletion of an already-deleted file (that
2289 # is, a CVS revision in state 'dead' whose predecessor was also in
2290 # state 'dead'), the conversion will still generate a Subversion
2291 # revision containing the log message for the second dead
2292 # revision, because we don't want to lose that information.
2293 svn_commit = SVNCommit("commit")
2294 self.motivating_commit = svn_commit
2296 for c_rev in self.changes:
2297 svn_commit.add_revision(c_rev)
2298 # Only make a change if we need to. When 1.1.1.1 has an empty
2299 # deltatext, the explanation is almost always that we're looking
2300 # at an imported file whose 1.1 and 1.1.1.1 are identical. On
2301 # such imports, CVS creates an RCS file where 1.1 has the
2302 # content, and 1.1.1.1 has an empty deltatext, i.e., the same
2303 # content as 1.1. There's no reason to reflect this non-change
2304 # in the repository, so we want to do nothing in this case. (If
2305 # we were really paranoid, we could make sure 1.1's log message
2306 # is the CVS-generated "Initial revision\n", but I think the
2307 # conditions below are strict enough.)
2308 if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
2309 and (c_rev.rev == "1.1.1.1")):
2310 if c_rev.is_default_branch_revision():
2311 self.default_branch_cvs_revisions.append(c_rev)
2313 for c_rev in self.deletes:
2314 # When a file is added on a branch, CVS not only adds the file
2315 # on the branch, but generates a trunk revision (typically
2316 # 1.1) for that file in state 'dead'. We only want to add
2317 # this revision if the log message is not the standard cvs
2318 # fabricated log message.
2319 if c_rev.prev_rev is None:
2320 # c_rev.branches may be empty if the originating branch
2321 # has been excluded.
2322 if not c_rev.branches:
2323 continue
2324 cvs_generated_msg = ('file %s was initially added on branch %s.\n'
2325 % (c_rev.filename(),
2326 c_rev.branches[0]))
2327 author, log_msg = \
2328 Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
2329 if log_msg == cvs_generated_msg:
2330 continue
2332 svn_commit.add_revision(c_rev)
2333 if c_rev.is_default_branch_revision():
2334 self.default_branch_cvs_revisions.append(c_rev)
2336 # There is a slight chance that we didn't actually register any
2337 # CVSRevisions with our SVNCommit (see loop over self.deletes
2338 # above), so if we have no CVSRevisions, we don't flush the
2339 # svn_commit to disk and roll back our revnum.
2340 if len(svn_commit.cvs_revs) > 0:
2341 svn_commit.flush()
2342 else:
2343 # We will not be flushing this SVNCommit, so rollback the
2344 # SVNCommit revision counter.
2345 SVNCommit.revnum = SVNCommit.revnum - 1
2347 if not Ctx().trunk_only:
2348 for c_rev in self.revisions():
2349 Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
2351 def _post_commit(self):
2352 """Generates any SVNCommits that we can perform now that _commit
2353 has happened. That is, handle non-trunk default branches.
2354 Sometimes an RCS file has a non-trunk default branch, so a commit
2355 on that default branch would be visible in a default CVS checkout
2356 of HEAD. If we don't copy that commit over to Subversion's trunk,
2357 then there will be no Subversion tree which corresponds to that
2358 CVS checkout. Of course, in order to copy the path over, we may
2359 first need to delete the existing trunk there. """
2361 # Only generate a commit if we have default branch revs
2362 if len(self.default_branch_cvs_revisions):
2363 # Generate an SVNCommit for all of our default branch c_revs.
2364 svn_commit = SVNCommit("post-commit default branch(es)")
2365 svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
2366 for c_rev in self.default_branch_cvs_revisions:
2367 svn_commit.add_revision(c_rev)
2368 Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
2369 svn_commit.revnum)
2370 self.secondary_commits.append(svn_commit)
2372 def process_revisions(self, done_symbols):
2373 """Process all the CVSRevisions that this instance has, creating
2374 one or more SVNCommits in the process. Generate fill SVNCommits
2375 only for symbols not in DONE_SYMBOLS (avoids unnecessary
2376 fills).
2378 Return the primary SVNCommit that corresponds to this CVSCommit.
2379 The returned SVNCommit is the commit that motivated any other
2380 SVNCommits generated in this CVSCommit."""
2381 self.done_symbols = done_symbols
2382 seconds = self.t_max - self.t_min + 1
2384 Log().write(LOG_VERBOSE, '-' * 60)
2385 Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2386 if seconds == 1:
2387 Log().write(LOG_VERBOSE, ' Start time: %s (duration: 1 second)'
2388 % time.ctime(self.t_max))
2389 else:
2390 Log().write(LOG_VERBOSE, ' Start time: %s' % time.ctime(self.t_min))
2391 Log().write(LOG_VERBOSE, ' End time: %s (duration: %d seconds)'
2392 % (time.ctime(self.t_max), seconds))
2394 if seconds > COMMIT_THRESHOLD + 1:
2395 Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2396 % (warning_prefix, COMMIT_THRESHOLD))
2398 if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2399 self._commit()
2400 return self.motivating_commit
2402 self._pre_commit()
2403 self._commit()
2404 self._post_commit()
2406 for svn_commit in self.secondary_commits:
2407 svn_commit.set_date(self.motivating_commit.get_date())
2408 svn_commit.flush()
2410 return self.motivating_commit
2413 class SVNCommit:
2414 """This represents one commit to the Subversion Repository. There
2415 are three types of SVNCommits:
2417 1. Commits one or more CVSRevisions (cannot fill a symbolic name).
2419 2. Creates or fills a symbolic name (cannot commit CVSRevisions).
2421 3. Updates trunk to reflect the contents of a particular branch
2422 (this is to handle RCS default branches)."""
2424 # The revision number to assign to the next new SVNCommit.
2425 # We start at 2 because SVNRepositoryMirror uses the first commit
2426 # to create trunk, tags, and branches.
2427 revnum = 2
2429 class SVNCommitInternalInconsistencyError(Exception):
2430 """Exception raised if we encounter an impossible state in the
2431 SVNCommit Databases."""
2432 pass
2434 def __init__(self, description="", revnum=None, cvs_revs=None):
2435 """Instantiate an SVNCommit. DESCRIPTION is for debugging only.
2436 If REVNUM, the SVNCommit will correspond to that revision number;
2437 and if CVS_REVS, then they must be the exact set of CVSRevisions for
2438 REVNUM.
2440 It is an error to pass CVS_REVS without REVNUM, but you may pass
2441 REVNUM without CVS_REVS, and then add a revision at a time by
2442 invoking add_revision()."""
2443 self._description = description
2445 # Revprop metadata for this commit.
2447 # These initial values are placeholders. At least the log and the
2448 # date should be different by the time these are used.
2450 # They are private because their values should be returned encoded
2451 # in UTF8, but callers aren't required to set them in UTF8.
2452 # Therefore, accessor methods are used to set them, and
2453 # self.get_revprops() is used to get them, in dictionary form.
2454 self._author = Ctx().username
2455 self._log_msg = "This log message means an SVNCommit was used too soon."
2456 self._max_date = 0 # Latest date seen so far.
2458 self.cvs_revs = cvs_revs or []
2459 if revnum:
2460 self.revnum = revnum
2461 else:
2462 self.revnum = SVNCommit.revnum
2463 SVNCommit.revnum = SVNCommit.revnum + 1
2465 # The (uncleaned) symbolic name that is filled in this SVNCommit, if any.
2466 self.symbolic_name = None
2468 # If this commit is a default branch synchronization, this
2469 # variable represents the subversion revision number of the
2470 # *primary* commit where the default branch changes actually
2471 # happened. It is None otherwise.
2473 # It is possible for multiple synchronization commits to refer to
2474 # the same motivating commit revision number, and it is possible
2475 # for a single synchronization commit to contain CVSRevisions on
2476 # multiple different default branches.
2477 self.motivating_revnum = None
2479 # is_tag is true only if this commit is a fill of a symbolic name
2480 # that is a tag, None in all other cases.
2481 self.is_tag = None
2483 def set_symbolic_name(self, symbolic_name):
2484 "Set self.symbolic_name to SYMBOLIC_NAME."
2485 self.symbolic_name = symbolic_name
2487 def set_motivating_revnum(self, revnum):
2488 "Set self.motivating_revnum to REVNUM."
2489 self.motivating_revnum = revnum
2491 def set_author(self, author):
2492 """Set this SVNCommit's author to AUTHOR (a locally-encoded string).
2493 This is the only way to set an SVNCommit's author."""
2494 self._author = author
2496 def set_log_msg(self, msg):
2497 """Set this SVNCommit's log message to MSG (a locally-encoded string).
2498 This is the only way to set an SVNCommit's log message."""
2499 self._log_msg = msg
2501 def set_date(self, date):
2502 """Set this SVNCommit's date to DATE (an integer).
2503 Note that self.add_revision() updates this automatically based on
2504 a CVSRevision; so you may not need to call this at all, and even
2505 if you do, the value may be overwritten by a later call to
2506 self.add_revision()."""
2507 self._max_date = date
2509 def get_date(self):
2510 """Returns this SVNCommit's date as an integer."""
2511 return self._max_date
2513 def get_revprops(self):
2514 """Return the Subversion revprops for this SVNCommit."""
2515 date = format_date(self._max_date)
2516 try:
2517 utf8_author = None
2518 if self._author is not None:
2519 utf8_author = to_utf8(self._author)
2520 utf8_log = to_utf8(self.get_log_msg())
2521 return { 'svn:author' : utf8_author,
2522 'svn:log' : utf8_log,
2523 'svn:date' : date }
2524 except UnicodeError:
2525 Log().write(LOG_WARN, '%s: problem encoding author or log message:'
2526 % warning_prefix)
2527 Log().write(LOG_WARN, " author: '%s'" % self._author)
2528 Log().write(LOG_WARN, " log: '%s'" % self.get_log_msg().rstrip())
2529 Log().write(LOG_WARN, " date: '%s'" % date)
2530 Log().write(LOG_WARN,
2531 "(subversion rev %s) Related files:" % self.revnum)
2532 for c_rev in self.cvs_revs:
2533 Log().write(LOG_WARN, " ", c_rev.fname)
2535 Log().write(LOG_WARN, "Consider rerunning with (for example)",
2536 "'--encoding=latin1'.\n")
2537 # It's better to fall back to the original (unknown encoding) data
2538 # than to either 1) quit or 2) record nothing at all.
2539 return { 'svn:author' : self._author,
2540 'svn:log' : self.get_log_msg(),
2541 'svn:date' : date }
2543 def add_revision(self, cvs_rev):
2544 self.cvs_revs.append(cvs_rev)
2545 if cvs_rev.timestamp > self._max_date:
2546 self._max_date = cvs_rev.timestamp
2548 def _is_primary_commit(self):
2549 """Return true if this is a primary SVNCommit, false otherwise."""
2550 return not (self.symbolic_name or self.motivating_revnum)
2552 def flush(self):
2553 Log().write(LOG_NORMAL, "Creating Subversion r%d (%s)"
2554 % (self.revnum, self._description))
2555 Ctx()._persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)
2557 if self.motivating_revnum is not None:
2558 Ctx()._persistence_manager.set_motivating_revnum(self.revnum,
2559 self.motivating_revnum)
2561 # If we're not a primary commit, then store our date and/or our
2562 # symbolic_name
2563 if not self._is_primary_commit():
2564 Ctx()._persistence_manager.set_name_and_date(
2565 self.revnum, self.symbolic_name, self._max_date)
2567 def __str__(self):
2568 """ Print a human-readable description of this SVNCommit. This
2569 description is not intended to be machine-parseable (although
2570 we're not going to stop you if you try!)"""
2572 ret = "SVNCommit #: " + str(self.revnum) + "\n"
2573 if self.symbolic_name:
2574 ret += (" symbolic name: " + _clean_symbolic_name(self.symbolic_name)
2575 + "\n")
2576 else:
2577 ret += " NO symbolic name\n"
2578 ret += " debug description: " + self._description + "\n"
2579 ret += " cvs_revs:\n"
2580 for c_rev in self.cvs_revs:
2581 ret += " " + c_rev.unique_key() + "\n"
2582 return ret
2584 def get_log_msg(self):
2585 """Returns the actual log message for a primary commit, and the
2586 appropriate manufactured log message for a secondary commit."""
2587 if self.symbolic_name is not None:
2588 return self._log_msg_for_symbolic_name_commit()
2589 elif self.motivating_revnum is not None:
2590 return self._log_msg_for_default_branch_commit()
2591 else:
2592 return self._log_msg
2594 def _log_msg_for_symbolic_name_commit(self):
2595 """Creates a log message for a manufactured commit that fills
2596 self.symbolic_name. If self.is_tag is true, write the log message
2597 as though for a tag, else write it as though for a branch."""
2598 type = 'branch'
2599 if self.is_tag:
2600 type = 'tag'
2602 # In Python 2.2.3, we could use textwrap.fill(). Oh well :-).
2603 space_or_newline = ' '
2604 cleaned_symbolic_name = _clean_symbolic_name(self.symbolic_name)
2605 if len(cleaned_symbolic_name) >= 13:
2606 space_or_newline = '\n'
2608 return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
2609 % (type, space_or_newline, cleaned_symbolic_name)
2611 def _log_msg_for_default_branch_commit(self):
2612 """Creates a log message for a manufactured commit that
2613 synchronizes a non-trunk default branch with trunk."""
2614 msg = 'This commit was generated by cvs2svn to compensate for ' \
2615 'changes in r%d,\n' \
2616 'which included commits to RCS files with non-trunk default ' \
2617 'branches.\n' % self.motivating_revnum
2618 return msg
2620 class CVSRevisionAggregator:
2621 """This class groups CVSRevisions into CVSCommits that represent
2622 at least one SVNCommit."""
2623 def __init__(self):
2624 self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
2625 if not Ctx().trunk_only:
2626 self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB),
2627 DB_OPEN_READ)
2628 self.cvs_commits = {}
2629 self.pending_symbols = {}
2630 # A list of symbols for which we've already encountered the last
2631 # CVSRevision that is a source for that symbol. That is, the
2632 # final fill for this symbol has been done, and we never need to
2633 # fill it again.
2634 self.done_symbols = [ ]
2636 # This variable holds the most recently created primary svn_commit
2637 # object. CVSRevisionAggregator maintains this variable merely
2638 # for its date, so that it can set dates for the SVNCommits
2639 # created in self.attempt_to_commit_symbols().
2640 self.latest_primary_svn_commit = None
2642 Ctx()._symbolings_logger = SymbolingsLogger()
2643 Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
2644 Ctx()._default_branches_db = Database(temp(DEFAULT_BRANCHES_DB),
2645 DB_OPEN_READ)
2648 def process_revision(self, c_rev):
2649 # Each time we read a new line, we scan the commits we've
2650 # accumulated so far to see if any are ready for processing now.
2651 ready_queue = [ ]
2652 for digest_key, cvs_commit in self.cvs_commits.items():
2653 if cvs_commit.t_max + COMMIT_THRESHOLD < c_rev.timestamp:
2654 ready_queue.append(cvs_commit)
2655 del self.cvs_commits[digest_key]
2656 continue
2657 # If the inbound commit is on the same file as a pending commit,
2658 # close the pending commit to further changes. Don't flush it though,
2659 # as there may be other pending commits dated before this one.
2660 # ### ISSUE: the has_file() check below is not optimal.
2661 # It does fix the data loss bug where revisions would get lost
2662 # if checked in too quickly, but it can also break apart the
2663 # commits. The correct fix would require tracking the dependencies
2664 # between change sets and committing them in proper order.
2665 if cvs_commit.has_file(c_rev.fname):
2666 unused_id = digest_key + '-'
2667 # Find a string that is not already a key in
2668 # the self.cvs_commits dict.
2669 while self.cvs_commits.has_key(unused_id):
2670 unused_id = unused_id + '-'
2671 self.cvs_commits[unused_id] = cvs_commit
2672 del self.cvs_commits[digest_key]
2674 # Add this item into the set of still-available commits.
2675 if self.cvs_commits.has_key(c_rev.digest):
2676 cvs_commit = self.cvs_commits[c_rev.digest]
2677 else:
2678 author, log = self.metadata_db[c_rev.digest]
2679 self.cvs_commits[c_rev.digest] = CVSCommit(c_rev.digest,
2680 author, log)
2681 cvs_commit = self.cvs_commits[c_rev.digest]
2682 cvs_commit.add_revision(c_rev)
2684 # If there are any elements in the ready_queue at this point, they
2685 # need to be processed, because this latest rev couldn't possibly
2686 # be part of any of them. Sort them into time-order, then process
2687 # 'em.
2688 ready_queue.sort()
2690 # Make sure we attempt_to_commit_symbols for this c_rev, even if no
2691 # commits are ready.
2692 if len(ready_queue) == 0:
2693 self.attempt_to_commit_symbols(ready_queue, c_rev)
2695 for cvs_commit in ready_queue[:]:
2696 self.latest_primary_svn_commit \
2697 = cvs_commit.process_revisions(self.done_symbols)
2698 ready_queue.remove(cvs_commit)
2699 self.attempt_to_commit_symbols(ready_queue, c_rev)
2701 def flush(self):
2702 """Commit anything left in self.cvs_commits. Then inform the
2703 SymbolingsLogger that all commits are done."""
2705 ready_queue = [ ]
2706 for k, v in self.cvs_commits.items():
2707 ready_queue.append((v, k))
2709 ready_queue.sort()
2710 for cvs_commit_tuple in ready_queue[:]:
2711 self.latest_primary_svn_commit = \
2712 cvs_commit_tuple[0].process_revisions(self.done_symbols)
2713 ready_queue.remove(cvs_commit_tuple)
2714 del self.cvs_commits[cvs_commit_tuple[1]]
2715 self.attempt_to_commit_symbols([])
2717 if not Ctx().trunk_only:
2718 Ctx()._symbolings_logger.close()
2720 def attempt_to_commit_symbols(self, queued_commits, c_rev=None):
2721 """
2722 This function generates 1 SVNCommit for each symbol in
2723 self.pending_symbols that doesn't have an opening CVSRevision in
2724 either QUEUED_COMMITS or self.cvs_commits.values().
2726 If C_REV is not None, then we first add to self.pending_symbols
2727 any symbols from C_REV that C_REV is the last CVSRevision for.
2728 """
2729 # If we're not doing a trunk-only conversion, get the symbolic
2730 # names that this c_rev is the last *source* CVSRevision for and
2731 # add them to those left over from previous passes through the
2732 # aggregator.
2733 if c_rev and not Ctx().trunk_only:
2734 for sym in self.last_revs_db.get(c_rev.unique_key(), []):
2735 self.pending_symbols[sym] = None
2737 # Make a list of all symbols that still have *source* CVSRevisions
2738 # in the pending commit queue (self.cvs_commits).
2739 open_symbols = {}
2740 for sym in self.pending_symbols.keys():
2741 for cvs_commit in self.cvs_commits.values() + queued_commits:
2742 if cvs_commit.opens_symbolic_name(sym):
2743 open_symbols[sym] = None
2744 break
2746 # Sort the pending symbols so that we will always process the
2747 # symbols in the same order, regardless of the order in which the
2748 # dict hashing algorithm hands them back to us. We do this so
2749 # that our tests will get the same results on all platforms.
2750 sorted_pending_symbols_keys = self.pending_symbols.keys()
2751 sorted_pending_symbols_keys.sort()
2752 for sym in sorted_pending_symbols_keys:
2753 if open_symbols.has_key(sym): # sym is still open--don't close it.
2754 continue
2755 svn_commit = SVNCommit("closing tag/branch '%s'" % sym)
2756 svn_commit.set_symbolic_name(sym)
2757 svn_commit.set_date(self.latest_primary_svn_commit.get_date())
2758 svn_commit.flush()
2759 self.done_symbols.append(sym)
2760 del self.pending_symbols[sym]
2763 class SymbolingsReader:
2764 """Provides an interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file
2765 and the SYMBOL_OFFSETS_DB. Does the heavy lifting of finding and
2766 returning the correct opening and closing Subversion revision
2767 numbers for a given symbolic name."""
2768 def __init__(self):
2769 """Opens the SYMBOL_OPENINGS_CLOSINGS_SORTED for reading, and
2770 reads the offsets database into memory."""
2771 self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
2772 # The offsets_db is really small, and we need to read and write
2773 # from it a fair bit, so suck it into memory
2774 offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
2775 self.offsets = { }
2776 for key in offsets_db.db.keys():
2777 #print " ZOO:", key, offsets_db[key]
2778 self.offsets[key] = offsets_db[key]
2780 def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
2781 """Given SYMBOLIC_NAME and SVN_REVNUM, return a new
2782 SymbolicNameFillingGuide object.
2784 Note that if we encounter an opening rev in this fill, but the
2785 corresponding closing rev takes place later than SVN_REVNUM, the
2786 closing will not be passed to SymbolicNameFillingGuide in this
2787 fill (and will be discarded when encountered in a later fill).
2788 This is perfectly fine, because we can still do a valid fill
2789 without the closing--we always try to fill what we can as soon as
2790 we can."""
2792 openings_closings_map = OpeningsClosingsMap(symbolic_name)
2794 # It's possible to have a branch start with a file that was added
2795 # on a branch
2796 if self.offsets.has_key(symbolic_name):
2797 # set our read offset for self.symbolings to the offset for
2798 # symbolic_name
2799 self.symbolings.seek(self.offsets[symbolic_name])
2801 while 1:
2802 fpos = self.symbolings.tell()
2803 line = self.symbolings.readline().rstrip()
2804 if not line:
2805 break
2806 name, revnum, type, branch_name, cvs_path = line.split(" ", 4)
2807 if branch_name == '*':
2808 branch_name = None
2809 svn_path = Ctx().project.make_path(cvs_path, branch_name)
2810 revnum = int(revnum)
2811 if revnum > svn_revnum or name != symbolic_name:
2812 break
2813 openings_closings_map.register(svn_path, revnum, type)
2815 # Remember the offset of the first line we did *not* consume so that
2816 # the next fill for this symbol can resume reading there.  Only update
2817 # the offset if we actually used anything we read.
2818 if not openings_closings_map.is_empty():
2819 self.offsets[symbolic_name] = fpos
2821 return SymbolicNameFillingGuide(openings_closings_map)
2824 class SvnRevisionRange:
2825 """The range of subversion revision numbers from which a path can be
2826 copied. self.opening_revnum is the number of the earliest such
2827 revision, and self.closing_revnum is one higher than the number of
2828 the last such revision. If self.closing_revnum is None, then no
2829 closings were registered."""
2831 def __init__(self, opening_revnum):
2832 self.opening_revnum = opening_revnum
2833 self.closing_revnum = None
2835 def add_closing(self, closing_revnum):
2836 # When we have a non-trunk default branch, we may have multiple
2837 # closings--only register the first closing we encounter.
2838 if self.closing_revnum is None:
2839 self.closing_revnum = closing_revnum
2841 def __str__(self):
2842 if self.closing_revnum is None:
2843 return '[%d:]' % (self.opening_revnum,)
2844 else:
2845 return '[%d:%d]' % (self.opening_revnum, self.closing_revnum,)
2848 class OpeningsClosingsMap:
2849 """A dictionary of openings and closings for a symbolic name in the
2850 current SVNCommit.
2852 The user should call self.register() for the openings and closings,
2853 then self.get_node_tree() to retrieve the information as a
2854 SymbolicNameFillingGuide."""
2856 def __init__(self, symbolic_name):
2857 """Initialize OpeningsClosingsMap and prepare it for receiving
2858 openings and closings."""
2860 self.name = symbolic_name
2862 # A dictionary of SVN_PATHS to SvnRevisionRange objects.
2863 self.things = { }
2865 def register(self, svn_path, svn_revnum, type):
2866 """Register an opening or closing revision for this symbolic name.
2867 SVN_PATH is the source path that needs to be copied into
2868 self.symbolic_name, and SVN_REVNUM is either the first svn
2869 revision number that we can copy from (our opening), or the last
2870 (not inclusive) svn revision number that we can copy from (our
2871 closing). TYPE indicates whether this path is an opening or a
2872 closing.
2874 The opening for a given SVN_PATH must be passed before the closing
2875 for it to have any effect... any closing encountered before a
2876 corresponding opening will be discarded.
2878 It is not necessary to pass a corresponding closing for every
2879 opening.
2880 """
2881 # Always log an OPENING
2882 if type == OPENING:
2883 self.things[svn_path] = SvnRevisionRange(svn_revnum)
2884 # Only log a closing if we've already registered the opening for that
2885 # path.
2886 elif type == CLOSING and self.things.has_key(svn_path):
2887 self.things[svn_path].add_closing(svn_revnum)
2889 def is_empty(self):
2890 """Return true if we haven't accumulated any openings or closings,
2891 false otherwise."""
2892 return not len(self.things)
2894 def get_things(self):
2895 """Return a list of (svn_path, SvnRevisionRange) tuples for all
2896 svn_paths with registered openings or closings."""
2898 return self.things.items()
2901 class SymbolicNameFillingGuide:
2902 """A node tree representing the source paths to be copied to fill
2903 self.symbolic_name in the current SVNCommit.
2905 self._node_tree is the root of the directory tree, in the form {
2906 path_component : subnode }. Leaf nodes are instances of
2907 SvnRevisionRange. Intermediate (directory) nodes are dictionaries
2908 mapping relative names to subnodes.
2910 By walking self._node_tree and calling self.get_best_revnum() on
2911 each node, the caller can determine what subversion revision number
2912 to copy the path corresponding to that node from. self._node_tree
2913 should be treated as read-only.
2915 The caller can then descend to sub-nodes to see if their "best
2916 revnum" differs from their parents' and if it does, take appropriate
2917 actions to "patch up" the subtrees."""
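# Illustrative node tree (paths hypothetical): registering ranges for
# 'trunk/proj/foo.c' and 'trunk/proj/sub/bar.c' yields
#
#   { 'trunk': { 'proj': { 'foo.c': SvnRevisionRange,
#                          'sub': { 'bar.c': SvnRevisionRange } } } }
#
# where each leaf holds the copyable revision range for that file.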
2919 def __init__(self, openings_closings_map):
2920 """Initializes a SymbolicNameFillingGuide for SYMBOLIC_NAME and
2921 store into it the openings and closings from
2922 OPENINGS_CLOSINGS_MAP."""
2924 self.name = openings_closings_map.name
2926 # The dictionary that holds our node tree as a map { node_key :
2927 # node }.
2928 self._node_tree = { }
2930 for svn_path, svn_revision_range in openings_closings_map.get_things():
2931 (head, tail) = _path_split(svn_path)
2932 self._get_node_for_path(head)[tail] = svn_revision_range
2934 #self.print_node_tree(self._node_tree)
2936 def _get_node_for_path(self, svn_path):
2937 """Return the node key for svn_path, creating new nodes as needed."""
2938 # Walk down the path, one node at a time.
2939 node = self._node_tree
2940 for component in svn_path.split('/'):
2941 if node.has_key(component):
2942 node = node[component]
2943 else:
2944 old_node = node
2945 node = {}
2946 old_node[component] = node
2948 return node
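# Illustrative example: called with 'trunk/proj' (hypothetical path) on
# an empty tree, the walk above creates { 'trunk': { 'proj': {} } } and
# returns the innermost dictionary, into which the caller then stores a
# leaf (an SvnRevisionRange) under the final path component.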
2950 def get_best_revnum(self, node, preferred_revnum):
2951 """Determine the best subversion revision number to use when
2952 copying the source tree beginning at NODE. Returns a tuple
2953 (revnum, score).
2955 PREFERRED_REVNUM is passed to self._best_rev and used to
2956 calculate the best_revnum."""
2957 revnum = SVN_INVALID_REVNUM
2959 # Aggregate openings and closings from the rev tree
2960 svn_revision_ranges = self._list_revnums(node)
2961 openings = [ x.opening_revnum
2962 for x in svn_revision_ranges ]
2963 closings = [ x.closing_revnum
2964 for x in svn_revision_ranges
2965 if x.closing_revnum is not None ]
2967 # Helper function for scoring the lists.
2968 def tally_frequencies(rev_list):
2969 """Takes an array of revisions (REV_LIST), for example:
2971 [21, 18, 6, 49, 39, 24, 24, 24, 24, 24, 24, 24]
2973 and adds up every occurrence of each revision and returns a sorted
2974 array of tuples containing (svn_revnum, count):
2976 [(6, 1), (18, 1), (21, 1), (24, 7), (39, 1), (49, 1)]"""
2978 s = {}
2979 for k in rev_list: # Add up the scores
2980 s[k] = s.get(k, 0) + 1
2981 a = s.items()
2982 a.sort()
2983 return a
2985 # Score the lists
2986 scores = self._score_revisions(tally_frequencies(openings),
2987 tally_frequencies(closings))
2989 revnum, max_score = self._best_rev(scores, preferred_revnum)
2991 if revnum == SVN_INVALID_REVNUM:
2992 raise FatalError("failed to find a revision "
2993 + "to copy from when copying %s" % name)
2994 return revnum, max_score
2996 def _best_rev(self, scores, preferred_rev):
2997 """Return the revision with the highest score from SCORES, a list
2998 returned by _score_revisions(). When the maximum score is shared
2999 by multiple revisions, the oldest revision is selected, unless
3000 PREFERRED_REV is one of the possibilities, in which case, it is
3001 selected."""
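# Worked example (hypothetical scores, assuming SVN_INVALID_REVNUM sorts
# below all valid revision numbers): with scores = [(2, 3), (5, 3), (7, 1)]
# and preferred_rev = 5, revision 2 reaches the maximum score first, but
# revision 5 shares that score and is preferred, so (5, 3) is returned.
# With preferred_rev = None, the oldest maximum wins and (2, 3) is returned.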
3002 max_score = 0
3003 preferred_rev_score = -1
3004 rev = SVN_INVALID_REVNUM
3005 if preferred_rev is None:
3006 # Comparison order of different types is arbitrary. Do not
3007 # expect None to compare less than int values below.
3008 # In Python 2.3 None compares with ints like negative infinity.
3009 # In Python 2.0 None compares with ints like positive infinity.
3010 preferred_rev = SVN_INVALID_REVNUM
3011 for revnum, count in scores:
3012 if count > max_score:
3013 max_score = count
3014 rev = revnum
3015 if revnum <= preferred_rev:
3016 preferred_rev_score = count
3017 if preferred_rev_score == max_score:
3018 rev = preferred_rev
3019 return rev, max_score
3021 def _score_revisions(self, openings, closings):
3022 """Return a list of revisions and scores based on OPENINGS and
3023 CLOSINGS. The returned list looks like:
3025 [(REV1, SCORE1), (REV2, SCORE2), ...]
3027 where the tuples are sorted by revision number. OPENINGS and
3028 CLOSINGS are lists of tuples [(svn_revnum, count), ...] reflecting
3029 the frequency with which svn revision numbers appeared as the
3030 opening_revnum and closing_revnum of file nodes.
3032 Each score indicates that copying the corresponding revision (or
3033 any following revision up to the next revision in the list) of the
3034 object in question would yield that many correct paths at or
3035 underneath the object. There may be other paths underneath it
3036 which are not correct and would need to be deleted or recopied;
3037 those can only be detected by descending and examining their
3038 scores.
3040 If OPENINGS is empty, return the empty list."""
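# Worked example (hypothetical input):
#   openings = [(2, 2), (5, 1)]   # two paths open at r2, one more at r5
#   closings = [(7, 1)]           # one of them closes at r7
# The cumulative opening scores give [(2, 2), (5, 3)]; applying the
# closing appends (7, 2), so the result is [(2, 2), (5, 3), (7, 2)]:
# copying at r2-r4 yields 2 correct paths, r5-r6 yields 3, and r7 or
# later yields 2.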
3041 # First look for easy out.
3042 if not openings:
3043 return []
3045 # No easy out, so wish for lexical closures and calculate the scores :-).
3046 scores = []
3047 opening_score_accum = 0
3048 for (opening_rev, opening_score) in openings:
3049 opening_score_accum = opening_score_accum + opening_score
3050 scores.append((opening_rev, opening_score_accum))
3051 min = 0
3052 for (closing_rev, closing_score) in closings:
3053 done_exact_rev = None
3054 insert_index = None
3055 insert_score = None
3056 for j in range(min, len(scores)):
3057 score_rev, score = scores[j]
3058 if score_rev >= closing_rev:
3059 if not done_exact_rev:
3060 if score_rev > closing_rev:
3061 insert_index = j
3062 insert_score = scores[j-1][1] - closing_score
3063 done_exact_rev = 1
3064 scores[j] = (score_rev, score - closing_score)
3065 else:
3066 min = j + 1
3067 if not done_exact_rev:
3068 scores.append((closing_rev, scores[-1][1] - closing_score))
3069 if insert_index is not None:
3070 scores.insert(insert_index, (closing_rev, insert_score))
3071 return scores
3073 def _list_revnums(self, node):
3074 """Return a list of all the SvnRevisionRanges (including
3075 duplicates) for all leaf nodes at and under NODE."""
3077 if isinstance(node, SvnRevisionRange):
3078 # It is a leaf node.
3079 return [ node ]
3080 else:
3081 # It is an intermediate node.
3082 revnums = []
3083 for key, subnode in node.items():
3084 revnums.extend(self._list_revnums(subnode))
3085 return revnums
3087 def get_sources(self):
3088 """Return the list of sources for this symbolic name.
3090 The Project instance defines what are legitimate sources. Raise
3091 an exception if a change occurred outside of the source
3092 directories."""
3094 return self._get_sub_sources('', self._node_tree)
3096 def _get_sub_sources(self, start_svn_path, start_node):
3097 """Return the list of sources for this symbolic name, starting the
3098 search at path START_SVN_PATH, which is node START_NODE. This is
3099 a helper method, called by get_sources() (see)."""
3101 project = Ctx().project
3102 if isinstance(start_node, SvnRevisionRange):
3103 # This implies that a change was found outside of the
3104 # legitimate sources. This should never happen.
3105 raise FatalError("change detected outside of the source directories")
3106 elif project.is_source(start_svn_path):
3107 # This is a legitimate source. Add it to list.
3108 return [ FillSource(start_svn_path, start_node) ]
3109 else:
3110 # This is a directory that is not a legitimate source. (That's
3111 # OK because it hasn't changed directly.) But directories
3112 # within it have been changed, so we need to search recursively
3113 # to find their enclosing sources.
3114 sources = []
3115 for entry, node in start_node.items():
3116 svn_path = _path_join(start_svn_path, entry)
3117 sources.extend(self._get_sub_sources(svn_path, node))
3119 return sources
3121 def print_node_tree(self, node, name='/', indent_depth=0):
3122 """For debugging purposes. Prints all nodes in TREE that are
3123 rooted at NODE. INDENT_DEPTH is used to indent the output of
3124 recursive calls."""
3125 if not indent_depth:
3126 print "TREE", "=" * 75
3127 if isinstance(node, SvnRevisionRange):
3128 print "TREE:", " " * (indent_depth * 2), name, node
3129 else:
3130 print "TREE:", " " * (indent_depth * 2), name
3131 for key, value in node.items():
3132 self.print_node_tree(value, key, (indent_depth + 1))
3135 class FillSource:
3136 """Representation of a fill source used by the symbol filler in
3137 SVNRepositoryMirror."""
3138 def __init__(self, prefix, node):
3139 """Create an unscored fill source with a prefix and a key."""
3140 self.prefix = prefix
3141 self.node = node
3142 self.score = None
3143 self.revnum = None
3145 def set_score(self, score, revnum):
3146 """Set the SCORE and REVNUM."""
3147 self.score = score
3148 self.revnum = revnum
3150 def __cmp__(self, other):
3151 """Comparison operator used to sort FillSources in descending
3152 score order."""
3153 if self.score is None or other.score is None:
3154 raise TypeError, 'Tried to compare unscored FillSource'
3155 return cmp(other.score, self.score)
3158 class SVNRepositoryMirror:
3159 """Mirror a Subversion Repository as it is constructed, one
3160 SVNCommit at a time. The mirror is skeletal; it does not contain
3161 file contents. The creation of a dumpfile or Subversion repository
3162 is handled by delegates. See self.add_delegate method for how to
3163 set delegates.
3165 The structure of the repository is kept in two databases and one
3166 hash. The revs_db database maps revisions to root node keys, and
3167 the nodes_db database maps node keys to nodes. A node is a hash
3168 from directory names to keys. Both the revs_db and the nodes_db are
3169 stored on disk and each access is expensive.
3171 The nodes_db database only has the keys for old revisions. The
3172 revision that is being constructed is kept in memory in the new_nodes
3173 hash which is cheap to access.
3175 You must invoke _start_commit between SVNCommits.
3177 *** WARNING *** All path arguments to methods in this class CANNOT
3178 have leading or trailing slashes."""
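# Illustrative lookup through the two databases (keys are hypothetical,
# and the node for the youngest revision may live in new_nodes instead):
#
#   root_key = self.revs_db['7']             # root node key for r7
#   root     = self.nodes_db[root_key]       # e.g. { 'trunk': key1, ... }
#   trunk    = self.nodes_db[root['trunk']]  # e.g. { 'proj': key2, ... }
#
# i.e. each directory node maps entry names to the keys of child nodes.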
3181 class SVNRepositoryMirrorPathExistsError(Exception):
3182 """Exception raised if an attempt is made to add a path to the
3183 repository mirror and that path already exists in the youngest
3184 revision of the repository."""
3185 pass
3187 class SVNRepositoryMirrorUnexpectedOperationError(Exception):
3188 """Exception raised if a CVSRevision is found to have an unexpected
3189 operation (OP) value."""
3190 pass
3192 class SVNRepositoryMirrorInvalidFillOperationError(Exception):
3193 """Exception raised if an empty SymbolicNameFillingGuide is returned
3194 during a fill where the branch in question already exists."""
3195 pass
3197 def __init__(self):
3198 """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
3199 self.delegates = [ ]
3201 # This corresponds to the 'revisions' table in a Subversion fs.
3202 self.revs_db = Database(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
3203 Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
3205 # This corresponds to the 'nodes' table in a Subversion fs. (We
3206 # don't need a 'representations' or 'strings' table because we
3207 # only track metadata, not file contents.)
3208 self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
3209 Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
3211 # Start at revision 0 without a root node. It will be created
3212 # by _open_writable_root_node.
3213 self.youngest = 0
3214 self.new_root_key = None
3215 self.new_nodes = { }
3217 if not Ctx().trunk_only:
3218 ###PERF IMPT: Suck this into memory.
3219 self.tags_db = TagsDatabase(DB_OPEN_READ)
3220 self.symbolings_reader = SymbolingsReader()
3222 def _initialize_repository(self, date):
3223 """Initialize the repository by creating the directories for
3224 trunk, tags, and branches. This method should only be called
3225 after all delegates are added to the repository mirror."""
3226 # Make a 'fake' SVNCommit so we can take advantage of the revprops
3227 # magic therein
3228 svn_commit = SVNCommit("Initialization", 1)
3229 svn_commit.set_date(date)
3230 svn_commit.set_log_msg("New repository initialized by cvs2svn.")
3232 self._start_commit(svn_commit)
3233 self._mkdir(Ctx().project.trunk_path)
3234 if not Ctx().trunk_only:
3235 self._mkdir(Ctx().project.branches_path)
3236 self._mkdir(Ctx().project.tags_path)
3238 def _start_commit(self, svn_commit):
3239 """Start a new commit."""
3240 if self.youngest > 0:
3241 self._end_commit()
3243 self.youngest = svn_commit.revnum
3244 self.new_root_key = None
3245 self.new_nodes = { }
3247 self._invoke_delegates('start_commit', svn_commit)
3249 def _end_commit(self):
3250 """Called at the end of each commit. This method copies the newly
3251 created nodes to the on-disk nodes db."""
3252 if self.new_root_key is None:
3253 # No changes were made in this revision, so we make the root node
3254 # of the new revision be the same as the last one.
3255 self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
3256 else:
3257 self.revs_db[str(self.youngest)] = self.new_root_key
3258 # Copy the new nodes to the nodes_db
3259 for key, value in self.new_nodes.items():
3260 self.nodes_db[key] = value
3262 def _get_node(self, key):
3263 """Returns the node contents for KEY which may refer to either
3264 self.nodes_db or self.new_nodes."""
3265 if self.new_nodes.has_key(key):
3266 return self.new_nodes[key]
3267 else:
3268 return self.nodes_db[key]
3270 def _open_readonly_node(self, path, revnum):
3271 """Open a readonly node for PATH at revision REVNUM. Returns the
3272 node key if the path exists, else None."""
3273 # Get the root key
3274 if revnum == self.youngest:
3275 if self.new_root_key is None:
3276 node_key = self.revs_db[str(self.youngest - 1)]
3277 else:
3278 node_key = self.new_root_key
3279 else:
3280 node_key = self.revs_db[str(revnum)]
3282 for component in path.split('/'):
3283 node_contents = self._get_node(node_key)
3284 if not node_contents.has_key(component):
3285 return None
3286 node_key = node_contents[component]
3288 return node_key
3290 def _open_writable_root_node(self):
3291 """Open a writable root node. The current root node is returned
3292 immediately if it is already writable. If not, create a new one by
3293 copying the contents of the root node of the previous version."""
3294 if self.new_root_key is not None:
3295 return self.new_root_key, self.new_nodes[self.new_root_key]
3297 if self.youngest < 2:
3298 new_contents = { }
3299 else:
3300 new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
3301 self.new_root_key = gen_key()
3302 self.new_nodes = { self.new_root_key: new_contents }
3304 return self.new_root_key, new_contents
3306 def _open_writable_node(self, svn_path, create):
3307 """Open a writable node for the path SVN_PATH, creating SVN_PATH
3308 and any missing directories if CREATE is True."""
3309 parent_key, parent_contents = self._open_writable_root_node()
3311 # Walk up the path, one node at a time.
3312 path_so_far = None
3313 components = svn_path.split('/')
3314 for i in range(len(components)):
3315 component = components[i]
3316 this_key = this_contents = None
3317 path_so_far = _path_join(path_so_far, component)
3318 if parent_contents.has_key(component):
3319 # The component exists.
3320 this_key = parent_contents[component]
3321 if self.new_nodes.has_key(this_key):
3322 this_contents = self.new_nodes[this_key]
3323 else:
3324 # Suck the node from the nodes_db, but update the key
3325 this_contents = self.nodes_db[this_key]
3326 this_key = gen_key()
3327 self.new_nodes[this_key] = this_contents
3328 parent_contents[component] = this_key
3329 elif create:
3330 # The component does not exist, so we create it.
3331 this_contents = { }
3332 this_key = gen_key()
3333 self.new_nodes[this_key] = this_contents
3334 parent_contents[component] = this_key
3335 if i < len(components) - 1:
3336 self._invoke_delegates('mkdir', path_so_far)
3337 else:
3338 # The component does not exist and we are not instructed to
3339 # create it, so we give up.
3340 return None, None
3342 parent_key = this_key
3343 parent_contents = this_contents
3345 return this_key, this_contents
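# Illustrative copy-on-write behavior (hypothetical path): opening
# 'trunk/proj' for writing when 'proj' was last touched in an earlier
# revision copies the old 'proj' node out of nodes_db under a fresh key,
# stores the copy in new_nodes, and repoints the parent entry at the new
# key, so earlier revisions keep referring to the unchanged original node.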
3347 def _path_exists(self, path):
3348 """If PATH exists in self.youngest of the svn repository mirror,
3349 return true, else return false.
3351 PATH must not start with '/'."""
3352 return self._open_readonly_node(path, self.youngest) is not None
3354 def _fast_delete_path(self, parent_path, parent_contents, component):
3355 """Delete COMPONENT from the parent direcory PARENT_PATH with the
3356 contents PARENT_CONTENTS. Do nothing if COMPONENT does not exist
3357 in PARENT_CONTENTS."""
3358 if parent_contents.has_key(component):
3359 del parent_contents[component]
3360 self._invoke_delegates('delete_path',
3361 _path_join(parent_path, component))
3363 def _delete_path(self, svn_path, should_prune=False):
3364 """Delete PATH from the tree. If SHOULD_PRUNE is true, then delete
3365 all ancestor directories that are made empty when SVN_PATH is deleted.
3366 In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
3368 NOTE: This function ignores requests to delete the root directory
3369 or any directory for which Ctx().project.is_unremovable() returns
3370 True, either directly or by pruning."""
3372 if svn_path == '' or Ctx().project.is_unremovable(svn_path):
3373 return
3375 (parent_path, entry,) = _path_split(svn_path)
3376 if parent_path:
3377 parent_key, parent_contents = \
3378 self._open_writable_node(parent_path, False)
3379 else:
3380 parent_key, parent_contents = self._open_writable_root_node()
3382 if parent_key is not None:
3383 self._fast_delete_path(parent_path, parent_contents, entry)
3384 # The following recursion makes pruning an O(n^2) operation in the
3385 # worst case (where n is the depth of SVN_PATH), but the worst case
3386 # is probably rare, and the constant cost is pretty low. Another
3387 # drawback is that we issue a delete for each path and not just
3388 # a single delete for the topmost directory pruned.
3389 if should_prune and len(parent_contents) == 0:
3390 self._delete_path(parent_path, True)
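# Illustrative pruning (hypothetical path): deleting
# 'trunk/proj/sub/only_file' with should_prune=True removes the file,
# then the now-empty 'sub', and keeps walking upward until it reaches a
# non-empty directory or a path protected by Ctx().project.is_unremovable().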
3392 def _mkdir(self, path):
3393 """Create PATH in the repository mirror at the youngest revision."""
3394 self._open_writable_node(path, True)
3395 self._invoke_delegates('mkdir', path)
3397 def _change_path(self, cvs_rev):
3398 """Register a change in self.youngest for the CVS_REV's svn_path
3399 in the repository mirror."""
3400 # We do not have to update the nodes because our mirror is only
3401 # concerned with the presence or absence of paths, and a file
3402 # content change does not cause any path changes.
3403 self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, 0))
3405 def _add_path(self, cvs_rev):
3406 """Add the CVS_REV's svn_path to the repository mirror."""
3407 self._open_writable_node(cvs_rev.svn_path, True)
3408 self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, 1))
3410 def _copy_path(self, src_path, dest_path, src_revnum):
3411 """Copy SRC_PATH at subversion revision number SRC_REVNUM to
3412 DEST_PATH. In the youngest revision of the repository, DEST_PATH's
3413 parent *must* exist, but DEST_PATH *cannot* exist.
3415 Return the node key and the contents of the new node at DEST_PATH
3416 as a dictionary."""
3417 # get the contents of the node of our src_path
3418 src_key = self._open_readonly_node(src_path, src_revnum)
3419 src_contents = self._get_node(src_key)
3421 # Get the parent path and the base path of the dest_path
3422 (dest_parent, dest_basename,) = _path_split(dest_path)
3423 dest_parent_key, dest_parent_contents = \
3424 self._open_writable_node(dest_parent, False)
3426 if dest_parent_contents.has_key(dest_basename):
3427 msg = "Attempt to add path '%s' to repository mirror " % dest_path
3428 msg = msg + "when it already exists in the mirror."
3429 raise self.SVNRepositoryMirrorPathExistsError, msg
3431 dest_parent_contents[dest_basename] = src_key
3432 self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
3434 # Yes sir, src_key and src_contents are also the contents of the
3435 # destination. This is a cheap copy, remember! :-)
3436 return src_key, src_contents
3438 def _fill_symbolic_name(self, svn_commit):
3439 """Performs all copies necessary to create as much of the the tag
3440 or branch SVN_COMMIT.symbolic_name as possible given the current
3441 revision of the repository mirror.
3443 The symbolic name is guaranteed to exist in the Subversion
3444 repository by the end of this call, even if there are no paths
3445 under it."""
3446 symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3447 svn_commit.symbolic_name, self.youngest)
3448 # Get the list of sources for the symbolic name.
3449 sources = symbol_fill.get_sources()
3451 if sources:
3452 if self.tags_db.has_key(svn_commit.symbolic_name):
3453 dest_prefix = Ctx().project.get_tag_path(svn_commit.symbolic_name)
3454 else:
3455 dest_prefix = Ctx().project.get_branch_path(svn_commit.symbolic_name)
3457 dest_key = self._open_writable_node(dest_prefix, False)[0]
3458 self._fill(symbol_fill, dest_prefix, dest_key, sources)
3459 else:
3460 # We can only get here for a branch whose first commit is an add
3461 # (as opposed to a copy).
3462 dest_path = Ctx().project.get_branch_path(symbol_fill.name)
3463 if not self._path_exists(dest_path):
3464 # If our symbol_fill was empty, that means that our first
3465 # commit on the branch was to a file added on the branch, and
3466 # that this is our first fill of that branch.
3468 # This case is covered by test 16.
3470 # ...we create the branch by copying trunk from our
3471 # current revision number minus 1
3472 source_path = Ctx().project.trunk_path
3473 entries = self._copy_path(source_path, dest_path,
3474 svn_commit.revnum - 1)[1]
3475 # Now since we've just copied trunk to a branch that's
3476 # *supposed* to be empty, we delete any entries in the
3477 # copied directory.
3478 for entry in entries.keys():
3479 del_path = dest_path + '/' + entry
3480 # Delete but don't prune.
3481 self._delete_path(del_path)
3482 else:
3483 msg = "Error filling branch '" \
3484 + _clean_symbolic_name(symbol_fill.name) + "'.\n"
3485 msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3486 msg = msg + "attempted to create a branch that already exists."
3487 raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3489 def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
3490 path = None, parent_source_prefix = None,
3491 preferred_revnum = None, prune_ok = None):
3492 """Fill the tag or branch at DEST_PREFIX + PATH with items from
3493 SOURCES, and recurse into the child items.
3495 DEST_PREFIX is the prefix of the destination directory, e.g.
3496 '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
3497 FillSource classes that are candidates to be copied to the
3498 destination. DEST_KEY is the key in self.nodes_db to the
3499 destination, or None if the destination does not yet exist.
3501 PATH is the path relative to DEST_PREFIX. If PATH is None, we
3502 are at the top level, e.g. '/tags/my_tag'.
3504 PARENT_SOURCE_PREFIX is the source prefix that was used to copy
3505 the parent directory, and PREFERRED_REVNUM is an int which is the
3506 source revision number that the caller (who may have copied KEY's
3507 parent) used to perform its copy. If PREFERRED_REVNUM is None,
3508 then no revision is preferable to any other (which probably means
3509 that no copies have happened yet).
3511 PRUNE_OK means that a copy has been made in this recursion, and
3512 it's safe to prune directories that are not in
3513 SYMBOL_FILL._node_tree, provided that said directory has
3514 PARENT_SOURCE_PREFIX as its source prefix.
3516 PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
3517 should only be passed in by recursive calls."""
3518 # Calculate scores and revnums for all sources
3519 for source in sources:
3520 src_revnum, score = symbol_fill.get_best_revnum(source.node,
3521 preferred_revnum)
3522 source.set_score(score, src_revnum)
3524 # Sort the sources in descending score order so that we will make
3525 # an eventual copy from the source with the highest score.
3526 sources.sort()
3527 copy_source = sources[0]
3529 src_path = _path_join(copy_source.prefix, path)
3530 dest_path = _path_join(dest_prefix, path)
3532 # Figure out if we shall copy to this destination and delete any
3533 # destination path that is in the way.
3534 do_copy = 0
3535 if dest_key is None:
3536 do_copy = 1
3537 elif prune_ok and (parent_source_prefix != copy_source.prefix or
3538 copy_source.revnum != preferred_revnum):
3539 # We are about to replace the destination, so we need to remove
3540 # it before we perform the copy.
3541 self._delete_path(dest_path)
3542 do_copy = 1
3544 if do_copy:
3545 dest_key, dest_entries = self._copy_path(src_path, dest_path,
3546 copy_source.revnum)
3547 prune_ok = 1
3548 else:
3549 dest_entries = self._get_node(dest_key)
3551 # Create the SRC_ENTRIES hash from SOURCES. The keys are path
3552 # elements and the values are lists of FillSource classes where
3553 # this path element exists.
3554 src_entries = {}
3555 for source in sources:
3556 if isinstance(source.node, SvnRevisionRange):
3557 continue
3558 for entry, node in source.node.items():
3559 if not src_entries.has_key(entry):
3560 src_entries[entry] = []
3561 src_entries[entry].append(FillSource(source.prefix, node))
3563 if prune_ok:
3564 # Delete the entries in DEST_ENTRIES that are not in src_entries.
3565 delete_list = [ ]
3566 for entry in dest_entries.keys():
3567 if not src_entries.has_key(entry):
3568 delete_list.append(entry)
3569 if delete_list:
3570 if not self.new_nodes.has_key(dest_key):
3571 dest_key, dest_entries = self._open_writable_node(dest_path, True)
3572 # Sort the delete list to get "diffable" dumpfiles.
3573 delete_list.sort()
3574 for entry in delete_list:
3575 self._fast_delete_path(dest_path, dest_entries, entry)
3577 # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
3578 src_keys = src_entries.keys()
3579 src_keys.sort()
3580 for src_key in src_keys:
3581 next_dest_key = dest_entries.get(src_key, None)
3582 self._fill(symbol_fill, dest_prefix, next_dest_key,
3583 src_entries[src_key], _path_join(path, src_key),
3584 copy_source.prefix, sources[0].revnum, prune_ok)
3586 def _synchronize_default_branch(self, svn_commit):
3587 """Propagate any changes that happened on a non-trunk default
3588 branch to the trunk of the repository. See
3589 CVSCommit._post_commit() for details on why this is necessary."""
3590 for cvs_rev in svn_commit.cvs_revs:
3591 svn_trunk_path = Ctx().project.make_path(cvs_rev.cvs_path)
3592 if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3593 if self._path_exists(svn_trunk_path):
3594 # Delete the path on trunk...
3595 self._delete_path(svn_trunk_path)
3596 # ...and copy over from branch
3597 self._copy_path(cvs_rev.svn_path, svn_trunk_path,
3598 svn_commit.motivating_revnum)
3599 elif cvs_rev.op == OP_DELETE:
3600 # delete trunk path
3601 self._delete_path(svn_trunk_path)
3602 else:
3603 msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3604 % cvs_rev.op)
3605 raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3607 def commit(self, svn_commit):
3608 """Add an SVNCommit to the SVNRepository, incrementing the
3609 Repository revision number, and changing the repository. Invoke
3610 the delegates' start_commit() method."""
3612 if svn_commit.revnum == 2:
3613 self._initialize_repository(svn_commit.get_date())
3615 self._start_commit(svn_commit)
3617 if svn_commit.symbolic_name:
3618 Log().write(LOG_VERBOSE, "Filling symbolic name:",
3619 _clean_symbolic_name(svn_commit.symbolic_name))
3620 self._fill_symbolic_name(svn_commit)
3621 elif svn_commit.motivating_revnum:
3622 Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3623 % svn_commit.motivating_revnum)
3624 self._synchronize_default_branch(svn_commit)
3625 else: # This actually commits CVSRevisions
3626 if len(svn_commit.cvs_revs) > 1: plural = "s"
3627 else: plural = ""
3628 Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3629 % (len(svn_commit.cvs_revs), plural))
3630 for cvs_rev in svn_commit.cvs_revs:
3631 # See comment in CVSCommit._commit() for what this is all
3632 # about. Note that although asking self._path_exists() is
3633 # somewhat expensive, we only do it if the first two (cheap)
3634 # tests succeed first.
3635 if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3636 and (cvs_rev.rev == "1.1.1.1")
3637 and self._path_exists(cvs_rev.svn_path)):
3638 if cvs_rev.op == OP_ADD:
3639 self._add_path(cvs_rev)
3640 elif cvs_rev.op == OP_CHANGE:
3641 # Fix for Issue #74:
3643 # Here's the scenario. You have file FOO that is imported
3644 # on a non-trunk vendor branch. So in r1.1 and r1.1.1.1,
3645 # the file exists.
3647 # Moving forward in time, FOO is deleted on the default
3648 # branch (r1.1.1.2). cvs2svn determines that this delete
3649 # also needs to happen on trunk, so FOO is deleted on
3650 # trunk.
3652 # Along comes r1.2, whose op is OP_CHANGE (because r1.1 is
3653 # not 'dead', we assume it's a change). However, since
3654 # our trunk file has been deleted, svnadmin blows up--you
3655 # can't change a file that doesn't exist!
3657 # Soooo... we just check the path, and if it doesn't
3658 # exist, we do an add... if the path does exist, it's
3659 # business as usual.
3660 if not self._path_exists(cvs_rev.svn_path):
3661 self._add_path(cvs_rev)
3662 else:
3663 self._change_path(cvs_rev)
3665 if cvs_rev.op == OP_DELETE:
3666 self._delete_path(cvs_rev.svn_path, Ctx().prune)
3668 def cleanup(self):
3669 """Callback for the Cleanup.register in self.__init__."""
3670 self.revs_db = None
3671 self.nodes_db = None
3673 def add_delegate(self, delegate):
3674 """Adds DELEGATE to self.delegates.
3676 For every delegate you add, as soon as SVNRepositoryMirror
3677 performs a repository action method, SVNRepositoryMirror will call
3678 the delegate's corresponding repository action method. Multiple
3679 delegates will be called in the order that they are added. See
3680 SVNRepositoryMirrorDelegate for more information."""
3681 self.delegates.append(delegate)
3683 def _invoke_delegates(self, method, *args):
3684 """Iterate through each of our delegates, in the order that they
3685 were added, and call the delegate's method named METHOD with the
3686 arguments in ARGS."""
3687 for delegate in self.delegates:
3688 getattr(delegate, method)(*args)
3690 def finish(self):
3691 """Calls the delegate finish method."""
3692 self._end_commit()
3693 self._invoke_delegates('finish')
3694 self.cleanup()
3697 class SVNCommitItem:
3698 """A wrapper class for CVSRevision objects upon which
3699 Subversion-related data (such as properties) may be hung."""
3701 def __init__(self, c_rev, make_svn_props):
3702 self.c_rev = c_rev
3703 self.set_cvs_revnum_properties = Ctx().cvs_revnums
3704 self.eol_from_mime_type = Ctx().eol_from_mime_type
3705 self.no_default_eol = Ctx().no_default_eol
3706 self.keywords_off = Ctx().keywords_off
3707 self.mime_mapper = Ctx().mime_mapper
3709 # We begin with only a "CVS revision" property.
3710 self.svn_props = { }
3711 if self.set_cvs_revnum_properties:
3712 self.svn_props['cvs2svn:cvs-rev'] = c_rev.rev
3713 make_svn_props = True
3715 # Set mime-type and eol. These two properties are intertwingled;
3716 # follow the conditionals carefully. See also issue #39.
3717 mime_type = None
3718 eol_style = None
3719 keywords = None
3721 if self.mime_mapper:
3722 mime_type = self.mime_mapper.get_type_from_filename(c_rev.cvs_path)
3724 if not c_rev.mode == 'b':
3725 if not self.no_default_eol:
3726 eol_style = 'native'
3727 elif mime_type and self.eol_from_mime_type:
3728 if mime_type.startswith("text/"):
3729 eol_style = 'native'
3730 else:
3731 eol_style = None
3732 elif mime_type is None:
3733 # file is kb, and no other mimetype specified
3734 mime_type = 'application/octet-stream'
3736 # Set the svn:keywords property, if appropriate. See issue #2.
3737 if not self.keywords_off and (c_rev.mode is None or c_rev.mode == 'kv' or
3738 c_rev.mode == 'kvl'):
3739 keywords = SVN_KEYWORDS_VALUE
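# Illustrative outcomes of the conditionals above, assuming the
# MimeMapper has no entry for the file: a plain text file (mode None)
# gets svn:eol-style=native plus svn:keywords; a file with CVS mode 'b'
# gets svn:mime-type=application/octet-stream and neither eol-style nor
# keywords; with Ctx().no_default_eol set and no mime type, eol-style is
# left unset.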
3741 # Remember if we need to filter the EOLs. We can't use self.svn_props
3742 # because they are only set on the first revision and we need to filter
3743 # all revisions.
3744 self.needs_eol_filter = eol_style == 'native'
3746 # Remember if this file has svn:keywords set
3747 self.has_keywords = keywords is not None
3749 # If asked to fill in the Subversion properties ('svn:' ones), do so.
3750 if make_svn_props:
3751 # Tack on the executableness, if any.
3752 if c_rev.file_executable:
3753 self.svn_props['svn:executable'] = '*'
3755 # Set the svn:keywords property, if appropriate. See issue #2.
3756 if keywords:
3757 self.svn_props['svn:keywords'] = SVN_KEYWORDS_VALUE
3759 if mime_type:
3760 self.svn_props['svn:mime-type'] = mime_type
3762 if eol_style:
3763 self.svn_props['svn:eol-style'] = eol_style
3766 class SVNRepositoryMirrorDelegate:
3767 """Abstract superclass for any delegate to SVNRepositoryMirror.
3768 Subclasses must implement all of the methods below.
3770 For each method, a subclass implements, in its own way, the
3771 Subversion operation implied by the method's name. For example, for
3772 the add_path method, the DumpfileDelegate would write out a
3773 "Node-add:" command to a Subversion dumpfile, the StdoutDelegate
3774 would merely print that the path is being added to the repository,
3775 and the RepositoryDelegate would actually cause the path to be added
3776 to the Subversion repository that it is creating."""
3779 def start_commit(self, svn_commit):
3780 """Perform any actions needed to start SVNCommit SVN_COMMIT;
3781 see subclass implementation for details."""
3782 raise NotImplementedError
3784 def mkdir(self, path):
3785 """PATH is a string; see subclass implementation for details."""
3786 raise NotImplementedError
3788 def add_path(self, s_item):
3789 """S_ITEM is an SVNCommitItem; see subclass implementation for
3790 details."""
3791 raise NotImplementedError
3793 def change_path(self, s_item):
3794 """S_ITEM is an SVNCommitItem; see subclass implementation for
3795 details."""
3796 raise NotImplementedError
3798 def delete_path(self, path):
3799 """PATH is a string; see subclass implementation for
3800 details."""
3801 raise NotImplementedError
3803 def copy_path(self, src_path, dest_path, src_revnum):
3804 """SRC_PATH and DEST_PATH are both strings, and SRC_REVNUM is a
3805 subversion revision number (int); see subclass implementation for
3806 details."""
3807 raise NotImplementedError
3809 def finish(self):
3810 """Perform any cleanup necessary after all revisions have been
3811 committed."""
3812 raise NotImplementedError
3815 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
3816 """Create a Subversion dumpfile."""
3818 def __init__(self, dumpfile_path=None):
3819 """Return a new DumpfileDelegate instance, attached to a dumpfile
3820 DUMPFILE_PATH (Ctx().dumpfile, if None), using Ctx().encoding.
3822 If Ctx().cvs_revnums is true, then set the 'cvs2svn:cvs-revnum'
3823 property on files, when they are changed due to a corresponding
3824 CVS revision.
3826 If Ctx().mime_mapper is not None, then it is a MimeMapper
3827 instance, used to determine whether or not to set the
3828 'svn:mime-type' property on files. But even if Ctx().mime_mapper
3829 is None, files marked with the CVS 'kb' flag will receive a mime
3830 type of "application/octet-stream".
3832 Unless Ctx().no_default_eol is true, set 'svn:eol-style' to
3833 'native' for files not marked with the CVS 'kb' flag, except as
3834 superseded by Ctx().eol_from_mime_type (see below).
3836 If Ctx().eol_from_mime_type is not None, then set 'svn:eol-style'
3837 to 'native' for all files to which Ctx().mime_mapper assigns a
3838 mime type beginning with "text/", and don't set 'svn:eol-style'
3839 for files assigned a type not beginning with "text/"."""
3841 if dumpfile_path:
3842 self.dumpfile_path = dumpfile_path
3843 else:
3844 self.dumpfile_path = Ctx().dumpfile
3846 self.dumpfile = open(self.dumpfile_path, 'wb')
3847 self._write_dumpfile_header(self.dumpfile)
3849 def _write_dumpfile_header(self, dumpfile):
3850 # Initialize the dumpfile with the standard headers.
3852 # Since the CVS repository doesn't have a UUID, and the Subversion
3853 # repository will be created with one anyway, we don't specify a
3854 # UUID in the dumpfile.
3855 dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
3857 def _utf8_path(self, path):
3858 """Return a copy of PATH encoded in UTF-8."""
3859 pieces = string.split(path, '/')
3860 # Convert each path component separately (as they may each use
3861 # different encodings).
3862 for i in range(len(pieces)):
3863 try:
3864 # Log messages can be converted with the 'replace' strategy,
3865 # but we can't afford any lossiness here.
3866 pieces[i] = to_utf8(pieces[i], 'strict')
3867 except UnicodeError:
3868 raise FatalError(
3869 "Unable to convert a path '%s' to internal encoding.\n"
3870 "Consider rerunning with (for example) '--encoding=latin1'."
3871 % (path,))
3872 return string.join(pieces, '/')
3874 def start_commit(self, svn_commit):
3875 """Emit the start of SVN_COMMIT (an SVNCommit)."""
3877 self.revision = svn_commit.revnum
3879 # The start of a new commit typically looks like this:
3881 # Revision-number: 1
3882 # Prop-content-length: 129
3883 # Content-length: 129
3885 # K 7
3886 # svn:log
3887 # V 27
3888 # Log message for revision 1.
3889 # K 10
3890 # svn:author
3891 # V 7
3892 # jrandom
3893 # K 8
3894 # svn:date
3895 # V 27
3896 # 2003-04-22T22:57:58.132837Z
3897 # PROPS-END
3899 # Notice that the length headers count everything -- not just the
3900 # length of the data but also the lengths of the lengths, including
3901 # the 'K ' or 'V ' prefixes.
3903 # The reason there are both Prop-content-length and Content-length
3904 # is that the former includes just props, while the latter includes
3905 # everything. That's the generic header form for any entity in a
3906 # dumpfile. But since revisions only have props, the two lengths
3907 # are always the same for revisions.
3909 # Calculate the total length of the props section.
3910 props = svn_commit.get_revprops()
3911 prop_names = props.keys()
3912 prop_names.sort()
3913 total_len = 10 # len('PROPS-END\n')
3914 for propname in prop_names:
3915 if props[propname] is None:
3916 continue
3917 klen = len(propname)
3918 klen_len = len('K %d' % klen)
3919 vlen = len(props[propname])
3920 vlen_len = len('V %d' % vlen)
3921 # + 4 for the four newlines within a given property's section
3922 total_len = total_len + klen + klen_len + vlen + vlen_len + 4
3924 # Print the revision header and props
3925 self.dumpfile.write('Revision-number: %d\n'
3926 'Prop-content-length: %d\n'
3927 'Content-length: %d\n'
3928 '\n'
3929 % (self.revision, total_len, total_len))
3931 for propname in prop_names:
3932 if props[propname] is None:
3933 continue
3934 self.dumpfile.write('K %d\n'
3935 '%s\n'
3936 'V %d\n'
3937 '%s\n' % (len(propname),
3938 propname,
3939 len(props[propname]),
3940 props[propname]))
3942 self.dumpfile.write('PROPS-END\n')
3943 self.dumpfile.write('\n')
3945 def mkdir(self, path):
3946 """Emit the creation of directory PATH."""
3947 self.dumpfile.write("Node-path: %s\n"
3948 "Node-kind: dir\n"
3949 "Node-action: add\n"
3950 "\n"
3951 "\n" % self._utf8_path(path))
3953 def _add_or_change_path(self, s_item, op):
3954 """Emit the addition or change corresponding to S_ITEM.
3955 OP is either the constant OP_ADD or OP_CHANGE."""
3957 # Validation stuffs
3958 if op == OP_ADD:
3959 action = 'add'
3960 elif op == OP_CHANGE:
3961 action = 'change'
3962 else:
3963 raise FatalError("_add_or_change_path() called with bad op ('%s')"
3964 % (op,))
3966 # Convenience variables
3967 c_rev = s_item.c_rev
3968 svn_props = s_item.svn_props
3970 # The property handling here takes advantage of an undocumented
3971 # but IMHO consistent feature of the Subversion dumpfile-loading
3972 # code. When a node's properties aren't mentioned (that is, the
3973 # "Prop-content-length:" header is absent, no properties are
3974 # listed at all, and there is no "PROPS-END\n" line) then no
3975 # change is made to the node's properties.
3977 # This is consistent with the way dumpfiles behave w.r.t. text
3978 # content changes, so I'm comfortable relying on it. If you
3979 # commit a change to *just* the properties of some node that
3980 # already has text contents from a previous revision, then in the
3981 # dumpfile output for the prop change, no "Text-content-length:"
3982 # nor "Text-content-md5:" header will be present, and the text of
3983 # the file will not be given. But this does not cause the file's
3984 # text to be erased! It simply remains unchanged.
3986 # This works out great for cvs2svn, due to lucky coincidences:
3988 # For files, the only properties we ever set are set in the first
3989 # revision; all other revisions (including on branches) inherit
3990 # from that. After the first revision, we never change file
3991 # properties, therefore, there is no need to remember the full set
3992 # of properties on a given file once we've set it.
3994 # For directories, the only property we set is "svn:ignore", and
3995 # while we may change it after the first revision, we always do so
3996 # based on the contents of a ".cvsignore" file -- in other words,
3997 # CVS is doing the remembering for us, so we still don't have to
3998 # preserve the previous value of the property ourselves.
4000 # Calculate the (sorted-by-name) property string and length, if any.
4001 prop_contents = ''
4002 prop_names = svn_props.keys()
4003 prop_names.sort()
4004 for pname in prop_names:
4005 pval = svn_props[pname]
4006 prop_contents = prop_contents + \
4007 'K %d\n%s\nV %d\n%s\n' \
4008 % (len(pname), pname, len(pval), pval)
4009 if prop_contents:
4010 prop_contents = prop_contents + 'PROPS-END\n'
4011 props_len = len(prop_contents)
4012 else:
4013 props_len = 0
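# Illustrative serialization: for svn_props == {'svn:executable': '*'}
# the block above is 'K 14\nsvn:executable\nV 1\n*\nPROPS-END\n', so
# props_len == 36.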
4015 props_header = ''
4016 if props_len:
4017 props_header = 'Prop-content-length: %d\n' % props_len
4019 # treat .cvsignore as a directory property
4020 dir_path, basename = os.path.split(c_rev.svn_path)
4021 if basename == ".cvsignore":
4022 ignore_vals = generate_ignores(c_rev)
4023 ignore_contents = '\n'.join(ignore_vals)
4024 ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
4025 (len(ignore_contents), ignore_contents))
4026 ignore_contents = ignore_contents + 'PROPS-END\n'
4027 ignore_len = len(ignore_contents)
4029 # write headers, then props
4030 self.dumpfile.write('Node-path: %s\n'
4031 'Node-kind: dir\n'
4032 'Node-action: change\n'
4033 'Prop-content-length: %d\n'
4034 'Content-length: %d\n'
4035 '\n'
4036 '%s'
4037 % (self._utf8_path(dir_path), ignore_len,
4038 ignore_len, ignore_contents))
4040 # If the file has keywords, we must prevent CVS/RCS from expanding
4041 # the keywords because they must be unexpanded in the repository,
4042 # or Subversion will get confused.
4043 pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(
4044 c_rev, suppress_keyword_substitution=s_item.has_keywords)
4046 self.dumpfile.write('Node-path: %s\n'
4047 'Node-kind: file\n'
4048 'Node-action: %s\n'
4049 '%s' # no property header if no props
4050 'Text-content-length: '
4051 % (self._utf8_path(c_rev.svn_path),
4052 action, props_header))
4054 pos = self.dumpfile.tell()
4056 self.dumpfile.write('0000000000000000\n'
4057 'Text-content-md5: 00000000000000000000000000000000\n'
4058 'Content-length: 0000000000000000\n'
4059 '\n')
4061 if prop_contents:
4062 self.dumpfile.write(prop_contents)
4064 # Insert a filter to convert all EOLs to LFs if neccessary
4065 if s_item.needs_eol_filter:
4066 data_reader = LF_EOL_Filter(pipe.stdout)
4067 else:
4068 data_reader = pipe.stdout
4070 # Insert the rev contents, calculating length and checksum as we go.
4071 checksum = md5.new()
4072 length = 0
4073 while True:
4074 buf = data_reader.read(PIPE_READ_SIZE)
4075 if buf == '':
4076 break
4077 checksum.update(buf)
4078 length = length + len(buf)
4079 self.dumpfile.write(buf)
4081 pipe.stdout.close()
4082 error_output = pipe.stderr.read()
4083 exit_status = pipe.wait()
4084 if exit_status:
4085 raise FatalError("The command '%s' failed with exit status: %s\n"
4086 "and the following output:\n"
4087 "%s" % (pipe_cmd, exit_status, error_output))
4089 # Go back to patch up the length and checksum headers:
4090 self.dumpfile.seek(pos, 0)
4091 # We left 16 zeros for the text length; replace them with the real
4092 # length, padded on the left with spaces:
4093 self.dumpfile.write('%16d' % length)
4094 # 16... + 1 newline + len('Text-content-md5: ') == 35
4095 self.dumpfile.seek(pos + 35, 0)
4096 self.dumpfile.write(checksum.hexdigest())
4097 # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
4098 self.dumpfile.seek(pos + 84, 0)
4099 # The content length is the length of property data, text data,
4100 # and any metadata around/inside them.
4101 self.dumpfile.write('%16d' % (length + props_len))
4102 # Jump back to the end of the stream
4103 self.dumpfile.seek(0, 2)
4105 # This record is done (write two newlines -- one to terminate
4106 # contents that weren't themselves newline-terminated, and one to
4107 # provide a blank line for readability).
4108 self.dumpfile.write('\n\n')
4110 def add_path(self, s_item):
4111 """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""
4112 self._add_or_change_path(s_item, OP_ADD)
4114 def change_path(self, s_item):
4115 """Emit the change corresponding to S_ITEM, an SVNCommitItem."""
4116 self._add_or_change_path(s_item, OP_CHANGE)
4118 def delete_path(self, path):
4119 """Emit the deletion of PATH."""
4120 self.dumpfile.write('Node-path: %s\n'
4121 'Node-action: delete\n'
4122 '\n' % self._utf8_path(path))
4124 def copy_path(self, src_path, dest_path, src_revnum):
4125 """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
4126 # We don't need to include "Node-kind:" for copies; the loader
4127 # ignores it anyway and just uses the source kind instead.
4128 self.dumpfile.write('Node-path: %s\n'
4129 'Node-action: add\n'
4130 'Node-copyfrom-rev: %d\n'
4131 'Node-copyfrom-path: /%s\n'
4132 '\n'
4133 % (self._utf8_path(dest_path),
4134 src_revnum,
4135 self._utf8_path(src_path)))
4137 def finish(self):
4138 """Perform any cleanup necessary after all revisions have been
4139 committed."""
4140 self.dumpfile.close()
4143 class RepositoryDelegate(DumpfileDelegate):
4144 """Creates a new Subversion Repository. DumpfileDelegate does all
4145 of the heavy lifting."""
4146 def __init__(self):
4147 self.svnadmin = Ctx().svnadmin
4148 self.target = Ctx().target
4149 if not Ctx().existing_svnrepos:
4150 Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
4151 if not Ctx().fs_type:
4152 # User didn't say what kind of repository (bdb, fsfs, etc).
4153 # We still pass --bdb-txn-nosync. It's a no-op if the default
4154 # repository type doesn't support it, but we definitely want
4155 # it if BDB is the default.
4156 run_command('%s create %s "%s"' % (self.svnadmin,
4157 "--bdb-txn-nosync",
4158 self.target))
4159 elif Ctx().fs_type == 'bdb':
4160 # User explicitly specified bdb.
4162 # Since this is a BDB repository, pass --bdb-txn-nosync,
4163 # because it gives us a 4-5x speed boost (if cvs2svn is
4164 # creating the repository, cvs2svn should be the only program
4165 # accessing the svn repository (until cvs is done, at least)).
4166 # But we'll turn no-sync off in self.finish(), unless
4167 # instructed otherwise.
4168 run_command('%s create %s %s "%s"' % (self.svnadmin,
4169 "--fs-type=bdb",
4170 "--bdb-txn-nosync",
4171 self.target))
4172 else:
4173 # User specified something other than bdb.
4174 run_command('%s create %s "%s"' % (self.svnadmin,
4175 "--fs-type=%s" % Ctx().fs_type,
4176 self.target))
4178 # Since the output of this run is a repository, not a dumpfile,
4179 # the temporary dumpfiles we create should go in the tmpdir.
4180 DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))
4182 # This is 1 if a commit is in progress, otherwise None.
4183 self._commit_in_progress = None
4185 self.dumpfile = open(self.dumpfile_path, 'w+b')
4186 self.loader_pipe = SimplePopen([ self.svnadmin, 'load', '-q',
4187 self.target ], True)
4188 self.loader_pipe.stdout.close()
4189 try:
4190 self._write_dumpfile_header(self.loader_pipe.stdin)
4191 except IOError:
4192 raise FatalError("svnadmin failed with the following output while "
4193 "loading the dumpfile:\n"
4194 + self.loader_pipe.stderr.read())
4196 def _feed_pipe(self):
4197 """Feed the revision stored in the dumpfile to the svnadmin
4198 load pipe."""
4199 self.dumpfile.seek(0)
4200 while 1:
4201 data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
4202 if not len(data):
4203 break
4204 try:
4205 self.loader_pipe.stdin.write(data)
4206 except IOError:
4207 raise FatalError("svnadmin failed with the following output "
4208 "while loading the dumpfile:\n"
4209 + self.loader_pipe.stderr.read())
4211 def start_commit(self, svn_commit):
4212 """Start a new commit. If a commit is already in progress, close
4213 the dumpfile, load it into the svn repository, open a new
4214 dumpfile, and write the header into it."""
4215 if self._commit_in_progress:
4216 self._feed_pipe()
4217 self.dumpfile.seek(0)
4218 self.dumpfile.truncate()
4219 DumpfileDelegate.start_commit(self, svn_commit)
4220 self._commit_in_progress = 1
4222 def finish(self):
4223 """Loads the last commit into the repository."""
4224 self._feed_pipe()
4225 self.dumpfile.close()
4226 self.loader_pipe.stdin.close()
4227 error_output = self.loader_pipe.stderr.read()
4228 exit_status = self.loader_pipe.wait()
4229 if exit_status:
4230 raise FatalError('svnadmin load failed with exit status: %s\n'
4231 'and the following output:\n'
4232 '%s' % (exit_status, error_output,))
4233 os.remove(self.dumpfile_path)
4235 # If this is a BDB repository, and we created the repository, and
4236 # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
4237 # line in the DB_CONFIG file, because txn syncing should be on by
4238 # default in BDB repositories.
4240 # We determine if this is a BDB repository by looking for the
4241 # DB_CONFIG file, which doesn't exist in FSFS, rather than by
4242 # checking Ctx().fs_type. That way this code will Do The Right
4243 # Thing in all circumstances.
4244 db_config = os.path.join(self.target, "db/DB_CONFIG")
4245 if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
4246 and os.path.exists(db_config)):
4247 no_sync = 'set_flags DB_TXN_NOSYNC\n'
4249 contents = open(db_config, 'r').readlines()
4250 index = contents.index(no_sync)
4251 contents[index] = '# ' + no_sync
4252 contents = open(db_config, 'w').writelines(contents)
4255 class StdoutDelegate(SVNRepositoryMirrorDelegate):
4256 """Makes no changes to the disk, but writes out information to
4257 STDOUT about what the SVNRepositoryMirror is doing. Of course, our
4258 print statements will state that we're doing something, when in
4259 reality, we aren't doing anything other than printing out that we're
4260 doing something. Kind of zen, really."""
4261 def __init__(self, total_revs):
4262 self.total_revs = total_revs
4264 def start_commit(self, svn_commit):
4265 """Prints out the Subversion revision number of the commit that is
4266 being started."""
4267 Log().write(LOG_VERBOSE, "=" * 60)
4268 Log().write(LOG_NORMAL, "Starting Subversion r%d / %d" %
4269 (svn_commit.revnum, self.total_revs))
4271 def mkdir(self, path):
4272 """Print a line stating that we are creating directory PATH."""
4273 Log().write(LOG_VERBOSE, " New Directory", path)
4275 def add_path(self, s_item):
4276 """Print a line stating that we are 'adding' s_item.c_rev.svn_path."""
4277 Log().write(LOG_VERBOSE, " Adding", s_item.c_rev.svn_path)
4279 def change_path(self, s_item):
4280 """Print a line stating that we are 'changing' s_item.c_rev.svn_path."""
4281 Log().write(LOG_VERBOSE, " Changing", s_item.c_rev.svn_path)
4283 def delete_path(self, path):
4284 """Print a line stating that we are 'deleting' PATH."""
4285 Log().write(LOG_VERBOSE, " Deleting", path)
4287 def copy_path(self, src_path, dest_path, src_revnum):
4288 """Print a line stating that we are 'copying' revision SRC_REVNUM
4289 of SRC_PATH to DEST_PATH."""
4290 Log().write(LOG_VERBOSE, " Copying revision", src_revnum, "of", src_path)
4291 Log().write(LOG_VERBOSE, " to", dest_path)
4293 def finish(self):
4294 """State that we are done creating our repository."""
4295 Log().write(LOG_VERBOSE, "Finished creating Subversion repository.")
4296 Log().write(LOG_QUIET, "Done.")
4298 # This should be local to pass1,
4299 # but Python 2.0 does not support nested scopes.
4300 OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
4301 def pass1():
4302 Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
4303 cd = CollectData()
4305 def visit_file(baton, dirname, files):
4306 cd = baton
4307 for fname in files:
4308 if fname[-2:] != ',v':
4309 continue
4310 cd.found_valid_file = 1
4311 pathname = os.path.join(dirname, fname)
4312 if dirname[-6:] == OS_SEP_PLUS_ATTIC:
4313 # drop the 'Attic' portion from the pathname for the canonical name.
4314 cd.set_fname(os.path.join(dirname[:-6], fname), pathname)
4315 else:
4316 # If this file also exists in the attic, it's a fatal error
4317 attic_path = os.path.join(dirname, 'Attic', fname)
4318 if os.path.exists(attic_path):
4319 err = "%s: A CVS repository cannot contain both %s and %s" \
4320 % (error_prefix, pathname, attic_path)
4321 sys.stderr.write(err + '\n')
4322 cd.fatal_errors.append(err)
4323 cd.set_fname(pathname, pathname)
4324 Log().write(LOG_NORMAL, pathname)
4325 try:
4326 cvs2svn_rcsparse.parse(open(pathname, 'rb'), cd)
4327 except (cvs2svn_rcsparse.common.RCSParseError, ValueError,
4328 RuntimeError):
4329 err = "%s: '%s' is not a valid ,v file" \
4330 % (error_prefix, pathname)
4331 sys.stderr.write(err + '\n')
4332 cd.fatal_errors.append(err)
4333 except:
4334 Log().write(LOG_WARN,
4335 "Exception occurred while parsing %s" % pathname)
4336 raise
4338 os.path.walk(Ctx().cvsroot, visit_file, cd)
4339 Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')
4341 cd.write_symbol_db()
4343 if len(cd.fatal_errors) > 0:
4344 raise FatalException("Pass 1 complete.\n"
4345 + "=" * 75 + "\n"
4346 + "Error summary:\n"
4347 + "\n".join(cd.fatal_errors) + "\n"
4348 + "Exited due to fatal error(s).\n")
4350 if cd.found_valid_file is None:
4351 raise FatalException(
4352 "\n"
4353 "No RCS files found in your CVS Repository!\n"
4354 "Are you absolutely certain you are pointing cvs2svn\n"
4355 "at a CVS repository?\n"
4356 "\n"
4357 "Exited due to fatal error(s).\n")
4359 StatsKeeper().reset_c_rev_info()
4360 StatsKeeper().archive()
4361 Log().write(LOG_QUIET, "Done")
4363 def pass2():
4364 "Pass 2: clean up the revision information."
4366 symbol_db = SymbolDatabase()
4367 symbol_db.read()
4369 # Convert the list of regexps to a list of strings
4370 excludes = symbol_db.find_excluded_symbols(Ctx().excludes)
4372 error_detected = 0
4374 Log().write(LOG_QUIET, "Checking for blocked exclusions...")
4375 blocked_excludes = symbol_db.find_blocked_excludes(excludes)
4376 if blocked_excludes:
4377 for branch, blockers in blocked_excludes.items():
4378 sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
4379 "excluded because the following symbols depend "
4380 "on it:\n" % (branch))
4381 for blocker in blockers:
4382 sys.stderr.write(" '%s'\n" % (blocker))
4383 sys.stderr.write("\n")
4384 error_detected = 1
4386 Log().write(LOG_QUIET, "Checking for forced tags with commits...")
4387 invalid_forced_tags = [ ]
4388 for forced_tag in Ctx().forced_tags:
4389 if excludes.has_key(forced_tag):
4390 continue
4391 if symbol_db.branch_has_commit(forced_tag):
4392 invalid_forced_tags.append(forced_tag)
4393 if invalid_forced_tags:
4394 sys.stderr.write(error_prefix + ": The following branches cannot be "
4395 "forced to be tags because they have commits:\n")
4396 for tag in invalid_forced_tags:
4397 sys.stderr.write(" '%s'\n" % (tag))
4398 sys.stderr.write("\n")
4399 error_detected = 1
4401 Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
4402 mismatches = symbol_db.find_mismatches(excludes)
4403 def is_not_forced(mismatch):
4404 name = mismatch[0]
4405 return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
4406 mismatches = filter(is_not_forced, mismatches)
4407 if mismatches:
4408 sys.stderr.write(error_prefix + ": The following symbols are tags "
4409 "in some files and branches in others.\nUse "
4410 "--force-tag, --force-branch and/or --exclude to "
4411 "resolve the symbols.\n")
4412 for name, tag_count, branch_count, commit_count in mismatches:
4413 sys.stderr.write(" '%s' is a tag in %d files, a branch in "
4414 "%d files and has commits in %d files.\n"
4415 % (name, tag_count, branch_count, commit_count))
4416 error_detected = 1
4418 # Bail out now if we found errors
4419 if error_detected:
4420 sys.exit(1)
4422 # Create the tags database
4423 tags_db = TagsDatabase(DB_OPEN_NEW)
4424 for tag in symbol_db.tags.keys():
4425 if tag not in Ctx().forced_branches:
4426 tags_db[tag] = None
4427 for tag in Ctx().forced_tags:
4428 tags_db[tag] = None
4430 Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")
4432 # We may have recorded some changes in revisions' timestamp. We need to
4433 # scan for any other files which may have had the same log message and
4434 # occurred at "the same time" and change their timestamps, too.
4436 # read the resync data file
4437 def read_resync(fname):
4438 "Read the .resync file into memory."
4440 ### note that we assume that we can hold the entire resync file in
4441 ### memory. really large repositories with whacky timestamps could
4442 ### bust this assumption. should that ever happen, then it is possible
4443 ### to split the resync file into pieces and make multiple passes,
4444 ### using each piece.
4447 # A digest maps to a sequence of lists which specify a lower and upper
4448 # time bound for matching up the commit. We keep a sequence of these
4449 # because a number of checkins with the same log message (e.g. an empty
4450 # log message) could need to be remapped. We also make them a list
4451 # because we will dynamically expand the lower/upper bound as we find
4452 # commits that fall into a particular msg and time range.
4454 # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
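# Illustrative example, with made-up values rather than data from a real
# run: assuming COMMIT_THRESHOLD were 300 seconds, a resync line of the
# form "3c2a1b00 <digest> 3c2a1c2c" would produce
#   resync['<digest>'] = [[0x3c2a1b00 - 150, 0x3c2a1b00 + 150, 0x3c2a1c2c]]
# i.e. any commit sharing that log-message digest whose original timestamp
# falls inside the window gets remapped to the new time 0x3c2a1c2c.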
4456 resync = { }
4458 for line in fileinput.FileInput(fname):
4459 t1 = int(line[:8], 16)
4460 digest = line[9:DIGEST_END_IDX]
4461 t2 = int(line[DIGEST_END_IDX+1:], 16)
4462 t1_l = t1 - COMMIT_THRESHOLD/2
4463 t1_u = t1 + COMMIT_THRESHOLD/2
4464 if resync.has_key(digest):
4465 resync[digest].append([t1_l, t1_u, t2])
4466 else:
4467 resync[digest] = [ [t1_l, t1_u, t2] ]
4469 # For each digest, sort the resync items in it in increasing order,
4470 # based on the lower time bound.
4471 digests = resync.keys()
4472 for digest in digests:
4473 (resync[digest]).sort()
4475 return resync
4477 resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))
4479 output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
4480 Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)
4482 tweaked_timestamps_db = Database(temp(TWEAKED_TIMESTAMPS_DB), DB_OPEN_NEW)
4483 Cleanup().register(temp(TWEAKED_TIMESTAMPS_DB), pass2)
4485 # process the revisions file, looking for items to clean up
4486 for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
4487 c_rev = CVSRevision(Ctx(), line[:-1])
4489 # Skip this entire revision if it's on an excluded branch
4490 if excludes.has_key(c_rev.branch_name):
4491 continue
4493 new_prev_ts = None
4494 if c_rev.prev_rev is not None:
4495 new_prev_ts = tweaked_timestamps_db.get(
4496 c_rev.unique_key(c_rev.prev_rev), None)
4497 if new_prev_ts:
4498 c_rev.prev_timestamp = new_prev_ts
4500 new_next_ts = None
4501 if c_rev.next_rev is not None:
4502 new_next_ts = tweaked_timestamps_db.get(
4503 c_rev.unique_key(c_rev.next_rev), None)
4504 if new_next_ts:
4505 c_rev.next_timestamp = new_next_ts
4507 # Remove all references to excluded tags and branches
4508 def not_excluded(symbol, excludes=excludes):
4509 return not excludes.has_key(symbol)
4510 c_rev.branches = filter(not_excluded, c_rev.branches)
4511 c_rev.tags = filter(not_excluded, c_rev.tags)
4513 # Convert all branches that are forced to be tags
4514 for forced_tag in Ctx().forced_tags:
4515 if forced_tag in c_rev.branches:
4516 c_rev.branches.remove(forced_tag)
4517 c_rev.tags.append(forced_tag)
4519 # Convert all tags that are forced to be branches
4520 for forced_branch in Ctx().forced_branches:
4521 if forced_branch in c_rev.tags:
4522 c_rev.tags.remove(forced_branch)
4523 c_rev.branches.append(forced_branch)
4525 # see if this is "near" any of the resync records we
4526 # have recorded for this digest [of the log message].
4527 for record in resync.get(c_rev.digest, []):
4528 if record[2] == c_rev.timestamp:
4529 # This means that either c_rev is the same revision that
4530 # caused the resync record to exist, or c_rev is a different
4531 # CVS revision that happens to have the same timestamp. In
4532 # either case, we don't have to do anything, so we...
4533 continue
4535 if record[0] <= c_rev.timestamp <= record[1]:
4536 # bingo! We probably want to remap the time on this c_rev,
4537 # unless the remapping would be useless because the new time
4538 # would fall outside the COMMIT_THRESHOLD window for this
4539 # commit group.
4540 new_timestamp = record[2]
4541 # If the new timestamp is earlier than that of our previous revision
4542 if new_timestamp < c_rev.prev_timestamp:
4543 desc = ("%s: Attempt to set timestamp of revision %s on file %s"
4544 + " to time %s, which is before previous the time of"
4545 + " revision %s (%s):")
4546 Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
4547 c_rev.cvs_path, new_timestamp,
4548 c_rev.prev_rev, c_rev.prev_timestamp))
4549 # If resyncing our rev to c_rev.prev_timestamp + 1 will place
4550 # the timestamp of c_rev within COMMIT_THRESHOLD of the
4551 # attempted resync time, then sync back to c_rev.prev_timestamp
4552 # + 1...
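# Worked example with made-up numbers: if c_rev.prev_timestamp were 1000,
# the attempted new_timestamp 900 and COMMIT_THRESHOLD 300, then
# (1001 - 900) = 101 < 300, so the revision would be resynced to 1001;
# had new_timestamp been 600, (1001 - 600) = 401 >= 300 and the original
# timestamp would be left untouched.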
4553 if ((c_rev.prev_timestamp + 1) - new_timestamp) < COMMIT_THRESHOLD:
4554 new_timestamp = c_rev.prev_timestamp + 1
4555 Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
4556 new_timestamp))
4557 else:
4558 Log().write(LOG_WARN, "%s: Timestamp left untouched" %
4559 warning_prefix)
4560 continue
4562 # If the new timestamp is later than that of our next revision
4563 elif c_rev.next_timestamp and new_timestamp > c_rev.next_timestamp:
4564 desc = ("%s: Attempt to set timestamp of revision %s on file %s"
4565 + " to time %s, which is after time of next"
4566 + " revision %s (%s):")
4567 Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
4568 c_rev.cvs_path, new_timestamp,
4569 c_rev.next_rev, c_rev.next_timestamp))
4570 # If resyncing our rev to c_rev.next_timestamp - 1 will place
4571 # the timestamp of c_rev within COMMIT_THRESHOLD of the
4572 # attempted resync time, then sync forward to c_rev.next_timestamp
4573 # - 1...
4574 if (new_timestamp - (c_rev.next_timestamp - 1)) < COMMIT_THRESHOLD:
4575 new_timestamp = c_rev.next_timestamp - 1
4576 Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
4577 new_timestamp))
4578 else:
4579 Log().write(LOG_WARN, "%s: Timestamp left untouched" %
4580 warning_prefix)
4581 continue
4583 # Fix for Issue #71: Avoid resyncing two consecutive revisions
4584 # to the same timestamp.
4585 elif (new_timestamp == c_rev.prev_timestamp
4586 or new_timestamp == c_rev.next_timestamp):
4587 continue
4589 # Adjust the time range: keep COMMIT_THRESHOLD/2 of slack beyond the
4590 # timestamps of the earliest/latest commit seen so far in this group.
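# For instance (hypothetical numbers): with a record of [850, 1150, t2],
# a commit at timestamp 1200 and COMMIT_THRESHOLD of 300, the window
# widens to [850, max(1150, 1200 + 150)] = [850, 1350].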
4591 record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
4592 record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)
4594 msg = "PASS2 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
4595 % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
4596 new_timestamp - c_rev.timestamp)
4597 Log().write(LOG_VERBOSE, msg)
4599 c_rev.timestamp = new_timestamp
4600 tweaked_timestamps_db[c_rev.unique_key()] = new_timestamp
4602 # stop looking for hits
4603 break
4605 output.write(str(c_rev) + "\n")
4606 Log().write(LOG_QUIET, "Done")
4608 def pass3():
4609 Log().write(LOG_QUIET, "Sorting CVS revisions...")
4610 sort_file(temp(DATAFILE + CLEAN_REVS_SUFFIX),
4611 temp(DATAFILE + SORTED_REVS_SUFFIX))
4612 Cleanup().register(temp(DATAFILE + SORTED_REVS_SUFFIX), pass5)
4613 Log().write(LOG_QUIET, "Done")
4615 def pass4():
4616 """Iterate through sorted revs, storing them in a database.
4617 If we're not doing a trunk-only conversion, generate the
4618 LastSymbolicNameDatabase, which contains the last CVSRevision
4619 that is a source for each tag or branch."""
4621 Log().write(LOG_QUIET,
4622 "Copying CVS revision data from flat file to database...")
4623 cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)
4624 if not Ctx().trunk_only:
4625 Log().write(LOG_QUIET,
4626 "Finding last CVS revisions for all symbolic names...")
4627 last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)
4628 else:
4629 # This is to avoid testing Ctx().trunk_only every time around the loop
4630 class DummyLSNDB:
4631 def noop(*args): pass
4632 log_revision = noop
4633 create_database = noop
4634 last_sym_name_db = DummyLSNDB()
4636 for line in fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX)):
4637 c_rev = CVSRevision(Ctx(), line[:-1])
4638 cvs_revs_db.log_revision(c_rev)
4639 last_sym_name_db.log_revision(c_rev)
4640 StatsKeeper().record_c_rev(c_rev)
4642 last_sym_name_db.create_database()
4643 StatsKeeper().archive()
4644 Log().write(LOG_QUIET, "Done")
4646 def pass5():
4647 """
4648 Generate the SVNCommit <-> CVSRevision mapping
4649 databases. CVSCommit._commit also calls SymbolingsLogger to register
4650 CVSRevisions that represent an opening or closing for a path on a
4651 branch or tag. See SymbolingsLogger for more details.
4652 """
4653 Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")
4655 aggregator = CVSRevisionAggregator()
4656 for line in fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX)):
4657 c_rev = CVSRevision(Ctx(), line[:-1])
4658 if not (Ctx().trunk_only and c_rev.branch_name is not None):
4659 aggregator.process_revision(c_rev)
4660 aggregator.flush()
4662 StatsKeeper().set_svn_rev_count(SVNCommit.revnum - 1)
4663 StatsKeeper().archive()
4664 Log().write(LOG_QUIET, "Done")
4666 def pass6():
4667 Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")
4669 if not Ctx().trunk_only:
4670 sort_file(temp(SYMBOL_OPENINGS_CLOSINGS),
4671 temp(SYMBOL_OPENINGS_CLOSINGS_SORTED))
4672 Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), pass8)
4673 Log().write(LOG_QUIET, "Done")
4675 def pass7():
4676 Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")
4678 def generate_offsets_for_symbolings():
4679 """This function iterates through all the lines in
4680 SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
4681 SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
4682 where SYMBOLIC_NAME is first encountered. This will allow us to
4683 seek to the various offsets in the file and sequentially read only
4684 the openings and closings that we need."""
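# Illustration with hypothetical contents (the symbol names, revision
# numbers and keys below are invented): if the sorted file held the lines
#   BRANCH_A 12 <cvs-rev-key>
#   BRANCH_A 15 <cvs-rev-key>
#   TAG_B 20 <cvs-rev-key>
# offsets_db would map 'BRANCH_A' to offset 0 and 'TAG_B' to the offset of
# the third line, so later passes can seek straight to a symbol's records
# instead of rescanning the whole file.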
4686 ###PERF This is a fine example of a db that can be in-memory and
4687 #just flushed to disk when we're done. Later, it can just be sucked
4688 #back into memory.
4689 offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
4690 Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)
4692 file = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
4693 old_sym = ""
4694 while 1:
4695 fpos = file.tell()
4696 line = file.readline()
4697 if not line:
4698 break
4699 sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
4700 if sym != old_sym:
4701 Log().write(LOG_VERBOSE, " ", sym)
4702 old_sym = sym
4703 offsets_db[sym] = fpos
4705 if not Ctx().trunk_only:
4706 generate_offsets_for_symbolings()
4707 Log().write(LOG_QUIET, "Done.")
4709 def pass8():
4710 svncounter = 2 # Repository initialization is 1.
4711 repos = SVNRepositoryMirror()
4712 persistence_manager = PersistenceManager(DB_OPEN_READ)
4714 if Ctx().target:
4715 if not Ctx().dry_run:
4716 repos.add_delegate(RepositoryDelegate())
4717 Log().write(LOG_QUIET, "Starting Subversion Repository.")
4718 else:
4719 if not Ctx().dry_run:
4720 repos.add_delegate(DumpfileDelegate())
4721 Log().write(LOG_QUIET, "Starting Subversion Dumpfile.")
4723 repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))
4725 while 1:
4726 svn_commit = persistence_manager.get_svn_commit(svncounter)
4727 if not svn_commit:
4728 break
4729 repos.commit(svn_commit)
4730 svncounter += 1
4732 repos.finish()
4734 _passes = [
4735 pass1,
4736 pass2,
4737 pass3,
4738 pass4,
4739 pass5,
4740 pass6,
4741 pass7,
4742 pass8,
4743 ]
4746 class Ctx:
4747 """Session state for this run of cvs2svn. For example, run-time
4748 options are stored here. This class is a Borg, see
4749 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
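# A minimal sketch of what being a Borg means in practice: every Ctx()
# instance shares the same __dict__, so, for example,
#   Ctx().verbose = 1
#   assert Ctx().verbose == 1   # visible through any other Ctx() instance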
4751 __shared_state = { }
4752 def __init__(self):
4753 self.__dict__ = self.__shared_state
4754 if self.__dict__:
4755 return
4756 # Else, initialize to defaults.
4757 self.cvsroot = None
4758 self.target = None
4759 self.dumpfile = DUMPFILE
4760 self.tmpdir = '.'
4761 self.verbose = 0
4762 self.quiet = 0
4763 self.prune = 1
4764 self.existing_svnrepos = 0
4765 self.dump_only = 0
4766 self.dry_run = 0
4767 self.trunk_only = 0
4768 self.trunk_base = "trunk"
4769 self.tags_base = "tags"
4770 self.branches_base = "branches"
4771 self.encoding = ["ascii"]
4772 self.mime_types_file = None
4773 self.mime_mapper = None
4774 self.no_default_eol = 0
4775 self.eol_from_mime_type = 0
4776 self.keywords_off = 0
4777 self.use_cvs = None
4778 self.svnadmin = "svnadmin"
4779 self.username = None
4780 self.print_help = 0
4781 self.skip_cleanup = 0
4782 self.cvs_revnums = 0
4783 self.bdb_txn_nosync = 0
4784 self.fs_type = None
4785 self.forced_branches = []
4786 self.forced_tags = []
4787 self.excludes = []
4788 self.symbol_transforms = []
4790 class MimeMapper:
4791 """A class that provides mappings from file names to MIME types.
4792 Note that we should really be using Python's 'mimetypes' module.
4793 See http://cvs2svn.tigris.org/servlets/ReadMsg?list=dev&msgNo=266
4794 for more."""
4796 def __init__(self):
4797 self.mappings = { }
4799 def set_mime_types_file(self, mime_types_file):
4800 for line in fileinput.input(mime_types_file):
4801 if line.startswith("#"):
4802 continue
4804 # format of a line is something like
4805 # text/plain c h cpp
4806 extensions = line.split()
4807 if len(extensions) < 2:
4808 continue
4809 type = extensions.pop(0)
4810 for ext in extensions:
4811 if self.mappings.has_key(ext) and self.mappings[ext] != type:
4812 sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n"
4813 % (warning_prefix, ext, self.mappings[ext], type))
4814 self.mappings[ext] = type
4817 def get_type_from_filename(self, filename):
4818 basename, extension = os.path.splitext(os.path.basename(filename))
4820 # Extension includes the dot, so strip it (will leave extension
4821 # empty if filename ends with a dot, which is ok):
4822 extension = extension[1:]
4824 # If there is no extension (or the file ends with a period), use
4825 # the base name for mapping. This allows us to set mappings for
4826 # files such as README or Makefile:
4827 if not extension:
4828 extension = basename
4829 return self.mappings.get(extension, None)
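# Usage sketch for the MimeMapper above (the mime.types contents are
# hypothetical): given a file containing the lines "image/png png" and
# "text/plain README", get_type_from_filename('icons/logo.png') returns
# 'image/png', and the extensionless 'README' is looked up by its basename,
# so it maps to 'text/plain'.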
4832 def convert(start_pass, end_pass):
4833 "Convert a CVS repository to an SVN repository."
4835 cleanup = Cleanup()
4836 times = [ None ] * (end_pass + 1)
4837 times[start_pass - 1] = time.time()
4838 StatsKeeper().set_start_time(time.time())
4839 for i in range(start_pass - 1, end_pass):
4840 Log().write(LOG_QUIET, '----- pass %d -----' % (i + 1))
4841 _passes[i]()
4842 times[i + 1] = time.time()
4843 StatsKeeper().log_duration_for_pass(times[i + 1] - times[i], i + 1)
4844 # Dispose of items in Ctx() not intended to live past the end of the pass
4845 # (Identified by exactly one leading underscore)
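# For example (the attribute name is hypothetical), a per-pass cache stored
# as Ctx()._open_db would be deleted here, while double-underscore names and
# the name-mangled '_Ctx__shared_state' survive across passes.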
4846 for attr in dir(Ctx()):
4847 if (len(attr) > 2 and attr[0] == '_' and attr[1] != '_'
4848 and not attr[:6] == "_Ctx__"):
4849 delattr(Ctx(), attr)
4850 if not Ctx().skip_cleanup:
4851 cleanup.cleanup(_passes[i])
4852 StatsKeeper().set_end_time(time.time())
4854 Log().write(LOG_QUIET, StatsKeeper())
4855 if end_pass < 4:
4856 Log().write(LOG_QUIET,
4857 '(These are unaltered CVS repository stats and do not\n'
4858 ' reflect tags or branches excluded via --exclude)\n')
4859 Log().write(LOG_NORMAL, StatsKeeper().timings())
4862 def normalize_ttb_path(opt, path):
4863 """Normalize a path to be used for --trunk, --tags, or --branches.
4865 1. Strip leading, trailing, and duplicated '/'.
4866 2. Verify that the path is not empty.
4868 Return the normalized path.
4870 If the path is invalid, write an error message and exit."""
4872 norm_path = _path_join(*path.split('/'))
4873 if not norm_path:
4874 raise FatalError("cannot pass an empty path to %s." % (opt,))
4875 return norm_path
4878 def verify_paths_disjoint(*paths):
4879 """Verify that all of the paths in the argument list are disjoint.
4881 If any of the paths is nested in another one (i.e., in the sense
4882 that 'a/b/c/d' is nested in 'a/b'), or any two paths are identical,
4883 write an error message and exit."""
4885 paths = [(path.split('/'), path) for path in paths]
4886 # If all overlapping elements are equal, a shorter list is
4887 # considered "less than" a longer one. Therefore if any paths are
4888 # nested, this sort will leave at least one such pair adjacent, in
4889 # the order [nest,nestling].
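# Hypothetical example: for the inputs 'trunk', 'branches' and
# 'branches/1.x', sorting places ['branches'] immediately before
# ['branches', '1.x'], so the prefix comparison below detects the nesting
# and raises FatalError.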
4890 paths.sort()
4891 for i in range(1, len(paths)):
4892 split_path1, path1 = paths[i - 1]
4893 split_path2, path2 = paths[i]
4894 if len(split_path1) <= len(split_path2) \
4895 and split_path2[:len(split_path1)] == split_path1:
4896 raise FatalError("paths %s and %s are not disjoint." % (path1, path2,))
4899 def usage():
4900 print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
4901 % os.path.basename(sys.argv[0])
4902 print ' --help, -h print this usage message and exit with success'
4903 print ' --version print the version number'
4904 print ' -q quiet'
4905 print ' -v verbose'
4906 print ' -s PATH path for SVN repos'
4907 print ' -p START[:END] start at pass START, end at pass END of %d' \
4908 % len(_passes)
4909 print ' If only START is given, run only pass START'
4910 print ' (implicitly enables --skip-cleanup)'
4911 print ' --existing-svnrepos load into existing SVN repository'
4912 print ' --dumpfile=PATH name of intermediate svn dumpfile'
4913 print ' --tmpdir=PATH directory to use for tmp data (defaults to cwd)'
4914 print ' --profile profile with \'hotshot\' (into file cvs2svn.hotshot)'
4915 print ' --dry-run do not create a repository or a dumpfile;'
4916 print ' just print what would happen.'
4917 print ' --use-cvs use CVS instead of RCS \'co\' to extract data'
4918 print ' (only use this if having problems with RCS)'
4919 print ' --svnadmin=PATH path to the svnadmin program'
4920 print ' --trunk-only convert only trunk commits, not tags or branches'
4921 print ' --trunk=PATH path for trunk (default: %s)' \
4922 % Ctx().trunk_base
4923 print ' --branches=PATH path for branches (default: %s)' \
4924 % Ctx().branches_base
4925 print ' --tags=PATH path for tags (default: %s)' \
4926 % Ctx().tags_base
4927 print ' --no-prune don\'t prune empty directories'
4928 print ' --dump-only just produce a dumpfile, don\'t commit to a repos'
4929 print ' --encoding=ENC encoding of log messages in CVS repos'
4930 print ' Multiple of these options may be passed, where they'
4931 print ' will be treated as an ordered list of encodings to'
4932 print ' attempt (with "ascii" as a hardcoded last resort)'
4933 print ' --force-branch=NAME force NAME to be a branch'
4934 print ' --force-tag=NAME force NAME to be a tag'
4935 print ' --exclude=REGEXP exclude branches and tags matching REGEXP'
4936 print ' --symbol-transform=P:S transform symbol names from P to S where P and S'
4937 print ' use Python regexp and reference syntax respectively'
4938 print ' --username=NAME username for cvs2svn-synthesized commits'
4939 print ' --skip-cleanup prevent the deletion of intermediate files'
4940 print ' --bdb-txn-nosync pass --bdb-txn-nosync to "svnadmin create"'
4941 print ' --fs-type=TYPE pass --fs-type=TYPE to "svnadmin create"'
4942 print ' --cvs-revnums record CVS revision numbers as file properties'
4943 print ' --mime-types=FILE specify an apache-style mime.types file for\n' \
4944 ' setting svn:mime-type'
4945 print ' --eol-from-mime-type set svn:eol-style by mime type (only with --mime-types)'
4946 print ' --no-default-eol don\'t set svn:eol-style by CVS defaults'
4947 print ' --keywords-off don\'t set svn:keywords on any files (by default,'
4948 print ' cvs2svn sets svn:keywords on non-binary files to'
4949 print ' "%s")' % SVN_KEYWORDS_VALUE
4951 def main():
4952 # Convenience var, so we don't have to keep instantiating this Borg.
4953 ctx = Ctx()
4955 profiling = None
4956 start_pass = 1
4957 end_pass = len(_passes)
4959 try:
4960 opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
4961 [ "help", "create", "trunk=",
4962 "username=", "existing-svnrepos",
4963 "branches=", "tags=", "encoding=",
4964 "force-branch=", "force-tag=", "exclude=",
4965 "use-cvs", "mime-types=",
4966 "eol-from-mime-type", "no-default-eol",
4967 "trunk-only", "no-prune", "dry-run",
4968 "dump-only", "dumpfile=", "tmpdir=",
4969 "svnadmin=", "skip-cleanup", "cvs-revnums",
4970 "bdb-txn-nosync", "fs-type=",
4971 "version", "profile",
4972 "keywords-off", "symbol-transform="])
4973 except getopt.GetoptError, e:
4974 sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
4975 usage()
4976 sys.exit(1)
4978 for opt, value in opts:
4979 if opt == '--version':
4980 print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
4981 sys.exit(0)
4982 elif opt == '-p':
4983 # Don't cleanup if we're doing incrementals.
4984 ctx.skip_cleanup = 1
4985 if value.find(':') > 0:
4986 start_pass, end_pass = map(int, value.split(':'))
4987 else:
4988 end_pass = start_pass = int(value)
4989 if start_pass > len(_passes) or start_pass < 1:
4990 raise FatalError(
4991 'illegal value (%d) for starting pass. Must be 1 through %d.'
4992 % (int(start_pass), len(_passes),))
4993 if end_pass < start_pass or end_pass > len(_passes):
4994 raise FatalError(
4995 'illegal value (%d) for ending pass. Must be %d through %d.'
4996 % (int(end_pass), int(start_pass), len(_passes),))
4997 elif (opt == '--help') or (opt == '-h'):
4998 ctx.print_help = 1
4999 elif opt == '-v':
5000 Log().log_level = LOG_VERBOSE
5001 ctx.verbose = 1
5002 elif opt == '-q':
5003 Log().log_level = LOG_QUIET
5004 ctx.quiet = 1
5005 elif opt == '-s':
5006 ctx.target = value
5007 elif opt == '--existing-svnrepos':
5008 ctx.existing_svnrepos = 1
5009 elif opt == '--dumpfile':
5010 ctx.dumpfile = value
5011 elif opt == '--tmpdir':
5012 ctx.tmpdir = value
5013 elif opt == '--use-cvs':
5014 ctx.use_cvs = 1
5015 elif opt == '--svnadmin':
5016 ctx.svnadmin = value
5017 elif opt == '--trunk-only':
5018 ctx.trunk_only = 1
5019 elif opt == '--trunk':
5020 ctx.trunk_base = normalize_ttb_path(opt, value)
5021 elif opt == '--branches':
5022 ctx.branches_base = normalize_ttb_path(opt, value)
5023 elif opt == '--tags':
5024 ctx.tags_base = normalize_ttb_path(opt, value)
5025 elif opt == '--no-prune':
5026 ctx.prune = None
5027 elif opt == '--dump-only':
5028 ctx.dump_only = 1
5029 elif opt == '--dry-run':
5030 ctx.dry_run = 1
5031 elif opt == '--encoding':
5032 ctx.encoding.insert(-1, value)
5033 elif opt == '--force-branch':
5034 ctx.forced_branches.append(value)
5035 elif opt == '--force-tag':
5036 ctx.forced_tags.append(value)
5037 elif opt == '--exclude':
5038 try:
5039 ctx.excludes.append(re.compile('^' + value + '$'))
5040 except re.error, e:
5041 raise FatalError("'%s' is not a valid regexp." % (value,))
5042 elif opt == '--mime-types':
5043 ctx.mime_types_file = value
5044 elif opt == '--eol-from-mime-type':
5045 ctx.eol_from_mime_type = 1
5046 elif opt == '--no-default-eol':
5047 ctx.no_default_eol = 1
5048 elif opt == '--keywords-off':
5049 ctx.keywords_off = 1
5050 elif opt == '--username':
5051 ctx.username = value
5052 elif opt == '--skip-cleanup':
5053 ctx.skip_cleanup = 1
5054 elif opt == '--cvs-revnums':
5055 ctx.cvs_revnums = 1
5056 elif opt == '--bdb-txn-nosync':
5057 ctx.bdb_txn_nosync = 1
5058 elif opt == '--fs-type':
5059 ctx.fs_type = value
5060 elif opt == '--create':
5061 sys.stderr.write(warning_prefix +
5062 ': The behaviour produced by the --create option is now the '
5063 'default,\nand passing the option is deprecated.\n')
5064 elif opt == '--profile':
5065 profiling = 1
5066 elif opt == '--symbol-transform':
5067 [pattern, replacement] = value.split(":")
5068 try:
5069 pattern = re.compile(pattern)
5070 except re.error, e:
5071 raise FatalError("'%s' is not a valid regexp." % (pattern,))
5072 ctx.symbol_transforms.append((pattern, replacement,))
5074 if ctx.print_help:
5075 usage()
5076 sys.exit(0)
5078 # Consistency check for options and arguments.
5079 if len(args) == 0:
5080 usage()
5081 sys.exit(1)
5083 if len(args) > 1:
5084 sys.stderr.write(error_prefix +
5085 ": must pass only one CVS repository.\n")
5086 usage()
5087 sys.exit(1)
5089 ctx.cvsroot = args[0]
5091 if not os.path.isdir(ctx.cvsroot):
5092 raise FatalError("the given CVS repository path '%s' is not an "
5093 "existing directory." % ctx.cvsroot)
5095 if ctx.use_cvs:
5096 ctx.cvs_repository = CVSRepositoryViaCVS()
5097 else:
5098 ctx.cvs_repository = CVSRepositoryViaRCS()
5100 if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
5101 raise FatalError("must pass one of '-s' or '--dump-only'.")
5103 def not_both(opt1val, opt1name, opt2val, opt2name):
5104 if opt1val and opt2val:
5105 raise FatalError("cannot pass both '%s' and '%s'."
5106 % (opt1name, opt2name,))
5108 not_both(ctx.target, '-s',
5109 ctx.dump_only, '--dump-only')
5111 not_both(ctx.dump_only, '--dump-only',
5112 ctx.existing_svnrepos, '--existing-svnrepos')
5114 not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
5115 ctx.existing_svnrepos, '--existing-svnrepos')
5117 not_both(ctx.dump_only, '--dump-only',
5118 ctx.bdb_txn_nosync, '--bdb-txn-nosync')
5120 not_both(ctx.quiet, '-q',
5121 ctx.verbose, '-v')
5123 not_both(ctx.fs_type, '--fs-type',
5124 ctx.existing_svnrepos, '--existing-svnrepos')
5126 if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
5127 raise FatalError("cannot pass --bdb-txn-nosync with --fs-type=%s."
5128 % ctx.fs_type)
5130 # Create the default project (using ctx.trunk, ctx.branches, and ctx.tags):
5131 ctx.project = Project(ctx.cvsroot,
5132 ctx.trunk_base, ctx.branches_base, ctx.tags_base)
5134 if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
5135 raise FatalError("the svn-repos-path '%s' is not an "
5136 "existing directory." % ctx.target)
5138 if not ctx.dump_only and not ctx.existing_svnrepos \
5139 and (not ctx.dry_run) and os.path.exists(ctx.target):
5140 raise FatalError("the svn-repos-path '%s' exists.\n"
5141 "Remove it, or pass '--existing-svnrepos'."
5142 % ctx.target)
5144 if ctx.target and not ctx.dry_run:
5145 # Verify that svnadmin can be executed. The 'help' subcommand
5146 # should be harmless.
5147 try:
5148 check_command_runs([ctx.svnadmin, 'help'], 'svnadmin')
5149 except CommandFailedException, e:
5150 raise FatalError(
5151 '%s\n'
5152 'svnadmin could not be executed. Please ensure that it is\n'
5153 'installed and/or use the --svnadmin option.' % (e,))
5155 if ctx.mime_types_file:
5156 ctx.mime_mapper = MimeMapper()
5157 ctx.mime_mapper.set_mime_types_file(ctx.mime_types_file)
5159 # Make sure the tmp directory exists. Note that we don't check if
5160 # it's empty -- we want to be able to use, for example, "." to hold
5161 # tempfiles. But if we *did* want to check if it were empty, we'd do
5162 # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
5163 if not os.path.exists(ctx.tmpdir):
5164 os.mkdir(ctx.tmpdir)
5165 elif not os.path.isdir(ctx.tmpdir):
5166 raise FatalError(
5167 "cvs2svn tried to use '%s' for temporary files, but that path\n"
5168 " exists and is not a directory. Please make it be a directory,\n"
5169 " or specify some other directory for temporary files."
5170 % (ctx.tmpdir,))
5172 # But do lock the tmpdir, to avoid process clash.
5173 try:
5174 os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5175 except OSError, e:
5176 if e.errno == errno.EACCES:
5177 raise FatalError("Permission denied:"
5178 + " No write access to directory '%s'." % ctx.tmpdir)
5179 if e.errno == errno.EEXIST:
5180 raise FatalError(
5181 "cvs2svn is using directory '%s' for temporary files, but\n"
5182 " subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
5183 " cvs2svn process is currently using '%s' as its temporary\n"
5184 " workspace. If you are certain that is not the case,\n"
5185 " then remove the '%s/cvs2svn.lock' subdirectory."
5186 % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir,))
5187 raise
5188 try:
5189 if profiling:
5190 import hotshot
5191 prof = hotshot.Profile('cvs2svn.hotshot')
5192 prof.runcall(convert, start_pass, end_pass)
5193 prof.close()
5194 else:
5195 convert(start_pass, end_pass)
5196 finally:
5197 try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5198 except: pass
5201 if __name__ == '__main__':
5202 try:
5203 main()
5204 except FatalException, e:
5205 sys.stderr.write(str(e))
5206 sys.exit(1)