cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
# The string below is a Subversion keyword anchor.  Once expanded it has
# the form "LastChangedRevision: NNNN " between two dollar signs; the
# slice [22:-2] then drops the 22 leading characters (dollar sign,
# keyword, colon, space) and the trailing space and dollar sign, leaving
# only the revision number.  If the keyword is not expanded, the slice
# is empty and VERSION is just 'r'.
VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 import cvs2svn_rcsparse
  23 import os
  24 import sys
  25 import sha
  26 import re
  27 import time
  28 import fileinput
  29 import fnmatch
  30 import string
  31 import getopt
  32 import stat
  33 import md5
  34 import marshal
  35 import errno
  36 import popen2
  37 import types
  38 import ConfigParser
  39 try:
  40   # Try to get access to a bunch of encodings for use with --encoding.
  41   # See http://cjkpython.i18n.org/ for details.
  42   import iconv_codec
  43 except ImportError:
  44   pass
  45
  46 # Warnings and errors start with these strings.  They are typically
  47 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  48 warning_prefix = "WARNING"
  49 error_prefix = "ERROR"
  50
  51 # Make sure this Python is recent enough.
  52 if sys.hexversion < 0x02020000:
  53   sys.stderr.write("'%s: Python 2.2 or higher required, "
  54                    "see www.python.org.\n" % error_prefix)
  55   sys.exit(1)
  56
  57 # Pretend we have true booleans on older python versions
  58 try:
  59   True
  60 except:
  61   True = 1
  62   False = 0
  63
# Opening pipes was a mess before Python 2.4, because some methods did
# not exist on some platforms, and some behaved differently on others.
# Python 2.4 solved this by adding the subprocess module, but since we
# cannot require such a new version, we cannot use it directly, but
# must implement a simplified Popen using the best means available.
#
# The SimplePopen class only has the following members and methods, all
# behaving as documented in the subprocess.Popen class:
#     - stdin
#     - stdout
#     - stderr
#     - wait
#
# Exactly one of the three SimplePopen definitions below is created at
# import time, depending on what the running Python provides.
try:
  # First try subprocess.Popen...
  import subprocess
  class SimplePopen:
    # CMD is handed to subprocess.Popen unchanged.  The stderr
    # attribute is only set when CAPTURE_STDERR is true; otherwise the
    # child's stderr is not redirected.
    def __init__(self, cmd, capture_stderr):
      if capture_stderr:
        stderr = subprocess.PIPE
      else:
        stderr = None
      self._popen = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE, stderr=stderr)
      self.stdin = self._popen.stdin
      self.stdout = self._popen.stdout
      if capture_stderr:
        self.stderr = self._popen.stderr
      self.wait = self._popen.wait
except ImportError:
  if hasattr(popen2, 'Popen3'):
    # ...then try popen2.Popen3...
    class SimplePopen:
      # As in the subprocess variant, the stderr attribute is only set
      # when CAPTURE_STDERR is true.
      def __init__(self, cmd, capture_stderr):
        self._popen3 = popen2.Popen3(cmd, capture_stderr)
        self.stdin = self._popen3.tochild
        self.stdout = self._popen3.fromchild
        if capture_stderr:
          self.stderr = self._popen3.childerr
        self.wait = self._popen3.wait
  else:
    # ...and if all fails, use popen2.popen3...
    class SimplePopen:
      # This variant ignores CAPTURE_STDERR: stderr is always captured.
      def __init__(self, cmd, capture_stderr):
        # A command given as an argument list is flattened into a
        # single command string before being passed to popen3.
        if type(cmd) != types.StringType:
          cmd = argv_to_command_string(cmd)
        self.stdout, self.stdin, self.stderr = popen2.popen3(cmd, mode='b')
      def wait(self):
        # Close the pipes in turn and return the first true value that a
        # close() call yields, or the result of the last close() if none
        # does.  Because 'or' short-circuits, the remaining pipes are
        # not closed once an earlier close() returns a true value.
        return self.stdout.close() or self.stdin.close() or \
               self.stderr.close()
 113
# DBM module selection
#
# Everything in this section runs at import time and adjusts which DBM
# implementation anydbm will use for the rest of the program.

# 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.  If anydbm picked one
#    of them as its default, print an error and exit with status 1.
import anydbm
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  sys.stderr.write(
    error_prefix
    + ': your installation of Python does not contain a suitable\n'
    + 'DBM module -- cvs2svn cannot continue.\n'
    + 'See http://python.org/doc/current/lib/module-anydbm.html to solve.\n')
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try to prefer gdbm
#    instead.  Unfortunately, gdbm appears not to be trouble free, either.
#    (The old module is recognized by its bsddb attribute lacking a
#    __version__.)  If gdbm cannot be imported, only a warning is
#    printed and the old module stays in use.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found '
        'on your computer has been reported to malfunction on some datasets, '
        'causing KeyError exceptions. You may wish to upgrade your Python to '
        'version 2.3 or later.\n')
  else:
    anydbm._defaultmod = gdbm
 149
# Patterns over dotted revision/branch number strings.
#
# trunk_rev matches exactly two dot-separated numbers (e.g. '1.3').
trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
# branch_tag matches a number whose last two components are '.0.N'
# (e.g. '1.3.0.2').
branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
# vendor_tag matches exactly three dot-separated numbers (e.g. '1.1.1').
vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')

# NOTE(review): presumably the value given to the svn:keywords property;
# confirm at the point of use.
SVN_KEYWORDS_VALUE = 'Author Date Id Revision'

# This really only matches standard '1.1.1.*'-style vendor revisions.
# One could conceivably have a file whose default branch is 1.1.3 or
# whatever, or was that at some point in time, with vendor revisions
# 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
# is the only time this regexp gets used), we'd have no basis for
# assuming that the non-standard vendor branch had ever been the
# default branch anyway, so we don't want this to match them anyway.
#
# NOTE(review): the '+' sits outside the second group, so group(2) holds
# only the final digit of the last component, not the whole number.
# Confirm that no caller relies on group(2).
vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 164
# If this run's output is a repository, then (in the tmpdir) we use
# a dumpfile of this name for repository loads.
#
# If this run's output is a dumpfile, then this is the default name of
# that dumpfile, but in the current directory (unless the user has
# specified a dumpfile path, of course, in which case it will be
# wherever the user said).
DUMPFILE = 'cvs2svn-dump'

# This file appears with different suffixes at different stages of
# processing.  CVS revisions are cleaned and sorted here, for commit
# grouping.  See design-notes.txt for details.
DATAFILE = 'cvs2svn-data'

# This file contains a marshalled copy of all the statistics that we
# gather throughout the various runs of cvs2svn.  The data is stored
# as a marshalled dictionary.
STATISTICS_FILE = 'cvs2svn-statistics'

# This text file contains records (1 per line) that describe svn
# filesystem paths that are the opening and closing source revisions
# for copies to tags and branches.  The format is as follows:
#
# SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
#
# Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
# SVN_REVNUM are the primary and secondary sorting criteria for
# creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
# A sorted version of the above file.
SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'

# This file is a temporary file for storing symbolic_name -> closing
# CVSRevision until the end of our pass where we can look up the
# corresponding SVNRevNum for the closing revs and write these out to
# the SYMBOL_OPENINGS_CLOSINGS.
SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'

# Skeleton version of an svn filesystem.
# (These supersede and will eventually replace the two above.)
# See class SVNRepositoryMirror for how these work.
SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'

# Offsets pointing to the beginning of each SYMBOLIC_NAME in
# SYMBOL_OPENINGS_CLOSINGS_SORTED
SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'

# Maps CVSRevision.unique_key()s to lists of symbolic names, where
# the CVSRevision is the last such that is a source for those symbolic
# names.  For example, if branch B's number is 1.3.0.2 in this CVS
# file, and this file's 1.3 is the latest (by date) revision among
# *all* CVS files that is a source for branch B, then the
# CVSRevision.unique_key() corresponding to this file at 1.3 would
# list at least B in its list.
SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'

# Maps CVSRevision.unique_key() to corresponding line in s-revs.
###PERF Or, we could map to an offset into s-revs, instead of dup'ing
### the s-revs data in this database.
CVS_REVS_DB = 'cvs2svn-cvs-revs.db'

# Lists all symbolic names that are tags.  Keys are strings (symbolic
# names), values are ignorable.
TAGS_DB = 'cvs2svn-tags.db'

# A list of all tags.  Each line consists of the tag name and the number
# of files in which it exists, separated by a space.
TAGS_LIST = 'cvs2svn-tags.txt'

# A list of all branches.  The file is stored as a plain text file
# to make it easy to look at in an editor.  Each line contains the
# branch name, the number of files where the branch is created, the
# commit count, and a list of tags and branches that are defined on
# revisions in the branch.
BRANCHES_LIST = 'cvs2svn-branches.txt'

# These two databases provide a bidirectional mapping between
# CVSRevision.unique_key()s and Subversion revision numbers.
#
# The first maps CVSRevision.unique_key() to a number; the values are
# not unique.
#
# The second maps a number to a list of CVSRevision.unique_key()s.
CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'

# This database maps svn_revnums to tuples of (symbolic_name, date).
#
# The svn_revnums are the revision numbers of all non-primary
# SVNCommits.  No primary SVNCommit has a key in this database.
#
# The date is stored for all commits in this database.
#
# For commits that fill symbolic names, the symbolic_name is stored.
# For commits that are default branch syncs, the symbolic_name is None.
SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'

# This database maps svn_revnums of a default branch synchronization
# commit to the svn_revnum of the primary SVNCommit that motivated it.
#
# (NOTE: Secondary commits that fill branches and tags also have a
# motivating commit, but we do not record it because it is (currently)
# not needed for anything.)
#
# This mapping is used when generating the log message for the commit
# that synchronizes the default branch with trunk.
MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'
 273
# How many bytes to read at a time from a pipe.  128 kiB should be
# large enough to be efficient without wasting too much memory.
PIPE_READ_SIZE = 128 * 1024

# Record the default RCS branches, if any, for CVS filepaths.
#
# The keys are CVS filepaths, relative to the top of the repository
# and with the ",v" stripped off, so they match the cvs paths used in
# Commit.commit().  The values are vendor branch revisions, such as
# '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
# represents the highest vendor branch revision thought to have ever
# been head of the default branch.
#
# The reason we record a specific vendor revision, rather than a
# default branch number, is that there are two cases to handle:
#
# One case is simple.  The RCS file lists a default branch explicitly
# in its header, such as '1.1.1'.  In this case, we know that every
# revision on the vendor branch is to be treated as head of trunk at
# that point in time.
#
# But there's also a degenerate case.  The RCS file does not currently
# have a default branch, yet we can deduce that for some period in the
# past it probably *did* have one.  For example, the file has vendor
# revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
# and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
# case, we should record 1.1.1.96 as the last vendor revision to have
# been the head of the default branch.
DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'

# Records the author and log message for each changeset.
# The keys are author+log digests, the same kind used to identify
# unique revisions in the .revs, etc files.  Each value is a tuple
# of two elements: '(author logmessage)'.
METADATA_DB = "cvs2svn-metadata.db"

# A temporary on-disk hash that maps CVSRevision unique keys to a new
# timestamp for that CVSRevision.  These new timestamps are created in
# pass2, and this hash is used exclusively in pass2.
TWEAKED_TIMESTAMPS_DB = "cvs2svn-fixed-timestamps.db"

# Suffixes appended to a base file name at the different stages of
# processing.
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
RESYNC_SUFFIX = '.resync'

# Sentinel meaning "no valid Subversion revision number".
SVN_INVALID_REVNUM = -1

COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP   = '-'
OP_ADD    = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY    = 'E'

# Index just past a hex-encoded SHA digest (two characters per digest
# byte) that starts at offset 9 of a line.
# NOTE(review): the 9 is presumably an 8-character hex timestamp plus a
# separator at the start of each revs-file line -- confirm.
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'
 339
 340 class FatalException(Exception):
 341   """Exception thrown on a non-recoverable error.
 342
 343   If this exception is thrown by main(), it is caught by the global
 344   layer of the program, its string representation is printed, and the
 345   program is ended with an exit code of 1."""
 346
 347   pass
 348
 349
 350 class FatalError(FatalException):
 351   """A FatalException that prepends error_prefix to the message."""
 352
 353   def __init__(self, msg):
 354     """Use (error_prefix + ': ' + MSG + '\n') as the error message."""
 355
 356     FatalException.__init__(self, '%s: %s\n' % (error_prefix, msg,))
 357
 358
 359 def temp(basename):
 360   """Return a path to BASENAME in Ctx().tmpdir.
 361   This is a convenience function to save horizontal space in source."""
 362   return os.path.join(Ctx().tmpdir, basename)
 363
 364 # Since the unofficial set also includes [/\] we need to translate those
 365 # into ones that don't conflict with Subversion limitations.
 366 def _clean_symbolic_name(name):
 367   """Return symbolic name NAME, translating characters that Subversion
 368   does not allow in a pathname."""
 369   name = name.replace('/','++')
 370   name = name.replace('\\','--')
 371   return name
 372
 373 def _path_join(*components):
 374   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 375   Empty component are skipped."""
 376   return string.join(filter(None, components), '/')
 377
 378 def _path_split(path):
 379   """Split the svn pathname PATH into a pair, (HEAD, TAIL).
 380
 381   This is similar to os.path.split(), but always uses '/' as path
 382   separator.  PATH is an svn path, which should not start with a '/'.
 383   HEAD is everything before the last slash, and TAIL is everything
 384   after.  If PATH ends in a slash, TAIL will be empty.  If there is no
 385   slash in PATH, HEAD will be empty.  If PATH is empty, both HEAD and
 386   TAIL are empty."""
 387
 388   pos = path.rfind('/')
 389   if pos == -1:
 390     return ('', path,)
 391   else:
 392     return (path[:pos], path[pos+1:],)
 393
 394 def to_utf8(value, mode='replace'):
 395   """Encode (as Unicode) VALUE, trying the encodings in Ctx.encoding
 396   as valid source encodings.  Raise UnicodeError on failure of all
 397   source encodings."""
 398   ### FIXME: The 'replace' default mode should be an option,
 399   ### like --encoding is.
 400   for encoding in Ctx().encoding:
 401     try:
 402       return unicode(value, encoding, mode).encode('utf8')
 403     except UnicodeError:
 404       Log().write(LOG_VERBOSE, "Encoding '%s' failed for string '%s'"
 405                   % (encoding, value))
 406   raise UnicodeError
 407
 408 def run_command(command):
 409   if os.system(command):
 410     raise FatalError('Command failed: "%s"' % (command,))
 411
 412
 413 class CommandFailedException(Exception):
 414   """Exception raised if check_command_runs() fails."""
 415
 416   pass
 417
 418
 419 def check_command_runs(cmd, cmdname):
 420   """Check whether the command CMD can be executed without errors.
 421
 422   CMD is a list or string, as accepted by SimplePopen.  CMDNAME is the
 423   name of the command as it should be included in exception error
 424   messages.
 425
 426   This function checks three things: (1) the command can be run
 427   without throwing an OSError; (2) it exits with status=0; (3) it
 428   doesn't output anything to stderr.  If any of these conditions is
 429   not met, raise a CommandFailedException describing the problem."""
 430
 431   try:
 432     pipe = SimplePopen(cmd, True)
 433   except OSError, e:
 434     raise CommandFailedException('error executing %s: %s' % (cmdname, e,))
 435   pipe.stdin.close()
 436   pipe.stdout.read()
 437   errmsg = pipe.stderr.read()
 438   status = pipe.wait()
 439   if status != 0 or errmsg:
 440     msg = 'error executing %s: status %s' % (cmdname, status,)
 441     if errmsg:
 442       msg += ', error output:\n%s' % (errmsg,)
 443     raise CommandFailedException(msg)
 444
 445
 446 class CVSRepository:
 447   """A CVS repository from which data can be extracted."""
 448
 449   def __init__(self, cvs_repos_path):
 450     """CVS_REPOS_PATH is the top of the CVS repository (at least as
 451     far as this run is concerned)."""
 452
 453     if not os.path.isdir(cvs_repos_path):
 454       raise FatalError("The specified CVS repository path '%s' is not an "
 455                        "existing directory." % cvs_repos_path)
 456
 457     self.cvs_repos_path = os.path.normpath(cvs_repos_path)
 458     self.cvs_prefix_re = re.compile(
 459         r'^' + re.escape(self.cvs_repos_path)
 460         + r'(' + re.escape(os.sep) + r'|$)')
 461
 462   def get_cvs_path(self, fname):
 463     """Return the path to FNAME relative to cvs_repos_path, with ',v' removed.
 464
 465     FNAME is a filesystem name that has to be within
 466     self.cvs_repos_path.  Return the filename relative to
 467     self.cvs_repos_path, with ',v' striped off if present, and with
 468     os.sep converted to '/'."""
 469
 470     (tail, n) = self.cvs_prefix_re.subn('', fname, 1)
 471     if n != 1:
 472       raise FatalError(
 473           "get_cvs_path: '%s' is not a sub-path of '%s'"
 474           % (fname, self.cvs_repos_path,))
 475     if tail.endswith(',v'):
 476       tail = tail[:-2]
 477     return string.replace(tail, os.sep, '/')
 478
 479   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 480     """Return a command string, and the pipe created using that
 481     string.  C_REV is a CVSRevision.  If SUPPRESS_KEYWORD_SUBSTITUTION
 482     is True, then suppress the substitution of RCS/CVS keywords in the
 483     output.  The pipe returns the text of that CVS Revision."""
 484     raise NotImplementedError
 485
 486
 487 class CVSRepositoryViaRCS(CVSRepository):
 488   """A CVSRepository accessed via RCS."""
 489
 490   def __init__(self, cvs_repos_path):
 491     CVSRepository.__init__(self, cvs_repos_path)
 492     try:
 493       check_command_runs([ 'co', '-V' ], 'co')
 494     except CommandFailedException, e:
 495       raise FatalError('%s\n'
 496                        'Please check that co is installed and in your PATH\n'
 497                        '(it is a part of the RCS software).' % (e,))
 498
 499   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 500     pipe_cmd = [ 'co', '-q', '-x,v', '-p' + c_rev.rev ]
 501     if suppress_keyword_substitution:
 502       pipe_cmd.append('-kk')
 503     pipe_cmd.append(c_rev.rcs_path())
 504     pipe = SimplePopen(pipe_cmd, True)
 505     pipe.stdin.close()
 506     return pipe_cmd, pipe
 507
 508
class CVSRepositoryViaCVS(CVSRepository):
  """A CVSRepository accessed via CVS."""

  def __init__(self, cvs_repos_path):
    """Locate the enclosing CVS repository root and check that the cvs
    program can be run.

    Sets self.cvs_repository_root, self.cvs_module and
    self.global_arguments, and sets the CVSROOT environment variable of
    this process.  Raise FatalError if no directory containing CVSROOT
    is found at or above CVS_REPOS_PATH, or if cvs cannot be run."""
    CVSRepository.__init__(self, cvs_repos_path)
    # Ascend above the specified root if necessary, to find the
    # cvs_repository_root (a directory containing a CVSROOT directory)
    # and the cvs_module (the path of the conversion root within the
    # cvs repository) NB: cvs_module must be separated by '/' *not* by
    # os.sep .
    def is_cvs_repository_root(path):
      return os.path.isdir(os.path.join(path, 'CVSROOT'))

    self.cvs_repository_root = os.path.abspath(self.cvs_repos_path)
    self.cvs_module = ""
    while not is_cvs_repository_root(self.cvs_repository_root):
      # Step up one directory:
      prev_cvs_repository_root = self.cvs_repository_root
      self.cvs_repository_root, module_component = \
          os.path.split(self.cvs_repository_root)
      if self.cvs_repository_root == prev_cvs_repository_root:
        # Hit the root (of the drive, on Windows) without finding a
        # CVSROOT dir.
        raise FatalError(
            "the path '%s' is not a CVS repository, nor a path "
            "within a CVS repository.  A CVS repository contains "
            "a CVSROOT directory within its root directory."
            % (self.cvs_repos_path,))

      # Prepend the component just stepped over; cvs_module therefore
      # always ends in '/' when it is not empty.
      self.cvs_module = module_component + "/" + self.cvs_module

    # This changes the environment of the whole process, so the cvs
    # child processes started later inherit it.
    os.environ['CVSROOT'] = self.cvs_repository_root

    def cvs_ok(global_arguments):
      check_command_runs(
          [ 'cvs' ] + global_arguments + [ '--version' ], 'cvs')

    # Prefer running cvs with "-q -R"; if that does not run cleanly,
    # fall back to "-q" alone before giving up.
    self.global_arguments = [ "-q", "-R" ]
    try:
      cvs_ok(self.global_arguments)
    except CommandFailedException, e:
      self.global_arguments = [ "-q" ]
      try:
        cvs_ok(self.global_arguments)
      except CommandFailedException, e:
        raise FatalError(
            '%s\n'
            'Please check that cvs is installed and in your PATH.' % (e,))

  def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
    """Start 'cvs co -p' for the CVSRevision C_REV and return the pair
    (argument list, pipe).  The pipe's stdin is already closed.  If
    SUPPRESS_KEYWORD_SUBSTITUTION is true, '-kk' is passed to cvs."""
    pipe_cmd = [ 'cvs' ] + self.global_arguments + \
               [ 'co', '-r' + c_rev.rev, '-p' ]
    if suppress_keyword_substitution:
      pipe_cmd.append('-kk')
    # The file is addressed relative to the repository root found above.
    pipe_cmd.append(self.cvs_module + c_rev.cvs_path)
    pipe = SimplePopen(pipe_cmd, True)
    pipe.stdin.close()
    return pipe_cmd, pipe
 567
 568
 569 def generate_ignores(c_rev):
 570   # Read in props
 571   pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(c_rev)
 572   buf = pipe.stdout.read(PIPE_READ_SIZE)
 573   raw_ignore_val = ""
 574   while buf:
 575     raw_ignore_val = raw_ignore_val + buf
 576     buf = pipe.stdout.read(PIPE_READ_SIZE)
 577   pipe.stdout.close()
 578   error_output = pipe.stderr.read()
 579   exit_status = pipe.wait()
 580   if exit_status:
 581     raise FatalError("The command '%s' failed with exit status: %s\n"
 582                      "and the following output:\n"
 583                      "%s" % (pipe_cmd, exit_status, error_output))
 584
 585   # Tweak props: First, convert any spaces to newlines...
 586   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 587   raw_ignores = raw_ignore_val.split('\n')
 588   ignore_vals = [ ]
 589   for ignore in raw_ignores:
 590     # Reset the list if we encounter a '!'
 591     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 592     if ignore == '!':
 593       ignore_vals = [ ]
 594       continue
 595     # Skip empty lines
 596     if len(ignore) == 0:
 597       continue
 598     ignore_vals.append(ignore)
 599   return ignore_vals
 600
 601 # Return a string that has not been returned by gen_key() before.
 602 gen_key_base = 0L
 603 def gen_key():
 604   global gen_key_base
 605   key = '%x' % gen_key_base
 606   gen_key_base = gen_key_base + 1
 607   return key
 608
 609 # ============================================================================
 610 # This code is copied with a few modifications from:
 611 #   subversion/subversion/bindings/swig/python/svn/core.py
 612
 613 if sys.platform == "win32":
 614   _escape_shell_arg_re = re.compile(r'(\\+)(\"|$)')
 615
 616   def escape_shell_arg(arg):
 617     # The (very strange) parsing rules used by the C runtime library are
 618     # described at:
 619     # http://msdn.microsoft.com/library/en-us/vclang/html/_pluslang_Parsing_C.2b2b_.Command.2d.Line_Arguments.asp
 620
 621     # double up slashes, but only if they are followed by a quote character
 622     arg = re.sub(_escape_shell_arg_re, r'\1\1\2', arg)
 623
 624     # surround by quotes and escape quotes inside
 625     arg = '"' + string.replace(arg, '"', '"^""') + '"'
 626     return arg
 627
 628
 629   def argv_to_command_string(argv):
 630     """Flatten a list of command line arguments into a command string.
 631
 632     The resulting command string is expected to be passed to the system
 633     shell which os functions like popen() and system() invoke internally.
 634     """
 635
 636     # According cmd's usage notes (cmd /?), it parses the command line by
 637     # "seeing if the first character is a quote character and if so, stripping
 638     # the leading character and removing the last quote character."
 639     # So to prevent the argument string from being changed we add an extra set
 640     # of quotes around it here.
 641     return '"' + string.join(map(escape_shell_arg, argv), " ") + '"'
 642
 643 else:
 644   def escape_shell_arg(str):
 645     return "'" + string.replace(str, "'", "'\\''") + "'"
 646
 647   def argv_to_command_string(argv):
 648     """Flatten a list of command line arguments into a command string.
 649
 650     The resulting command string is expected to be passed to the system
 651     shell which os functions like popen() and system() invoke internally.
 652     """
 653
 654     return string.join(map(escape_shell_arg, argv), " ")
 655 # ============================================================================
 656
 657 def format_date(date):
 658   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 659   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 660   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 661
 662 def sort_file(infile, outfile):
 663   # sort the log files
 664
 665   # GNU sort will sort our dates differently (incorrectly!) if our
 666   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 667   # it to 'C'
 668   lc_all_tmp = os.environ.get('LC_ALL', None)
 669   os.environ['LC_ALL'] = 'C'
 670   # The -T option to sort has a nice side effect.  The Win32 sort is
 671   # case insensitive and cannot be used, and since it does not
 672   # understand the -T option and dies if we try to use it, there is
 673   # no risk that we use that sort by accident.
 674   run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 675   if lc_all_tmp is None:
 676     del os.environ['LC_ALL']
 677   else:
 678     os.environ['LC_ALL'] = lc_all_tmp
 679
 680 def match_regexp_list(regexp_list, string):
 681   """Test whether STRING matches any of the compiled regexps in
 682   REGEXP_LIST."""
 683   for regexp in regexp_list:
 684     if regexp.match(string):
 685       return True
 686   return False
 687
 688 class LF_EOL_Filter:
 689   """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
 690   into LFs only."""
 691   def __init__(self, stream):
 692     self.stream = stream
 693     self.carry_cr = False
 694     self.eof = False
 695
 696   def read(self, size):
 697     while True:
 698       buf = self.stream.read(size)
 699       self.eof = len(buf) == 0
 700       if self.carry_cr:
 701         buf = '\r' + buf
 702         self.carry_cr = False
 703       if not self.eof and buf[-1] == '\r':
 704         self.carry_cr = True
 705         buf = buf[:-1]
 706       buf = string.replace(buf, '\r\n', '\n')
 707       buf = string.replace(buf, '\r', '\n')
 708       if len(buf) > 0 or self.eof:
 709         return buf
 710
 711
 712 # These constants represent the log levels that this script supports
 713 LOG_WARN = -1
 714 LOG_QUIET = 0
 715 LOG_NORMAL = 1
 716 LOG_VERBOSE = 2
 717 class Log:
 718   """A Simple logging facility.  Each line will be timestamped is
 719   self.use_timestamps is TRUE.  This class is a Borg, see
 720   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 721   __shared_state = {}
 722   def __init__(self):
 723     self.__dict__ = self.__shared_state
 724     if self.__dict__:
 725       return
 726     self.log_level = LOG_NORMAL
 727     # Set this to true if you want to see timestamps on each line output.
 728     self.use_timestamps = None
 729     self.logger = sys.stdout
 730
 731   def _timestamp(self):
 732     """Output a detailed timestamp at the beginning of each line output."""
 733     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 734
 735   def write(self, log_level, *args):
 736     """This is the public method to use for writing to a file.  Only
 737     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 738     there are multiple ARGS, they will be separated by a space."""
 739     if log_level > self.log_level:
 740       return
 741     if self.use_timestamps:
 742       self._timestamp()
 743     self.logger.write(' '.join(map(str,args)) + "\n")
 744     # Ensure that log output doesn't get out-of-order with respect to
 745     # stderr output.
 746     self.logger.flush()
 747
 748
 749 class Cleanup:
 750   """This singleton class manages any files created by cvs2svn.  When
 751   you first create a file, call Cleanup.register, passing the
 752   filename, and the last pass that you need the file.  After the end
 753   of that pass, your file will be cleaned up after running an optional
 754   callback.  This class is a Borg, see
 755   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 756
 757   __shared_state = {}
 758   def __init__(self):
 759     self.__dict__ = self.__shared_state
 760     if self.__dict__:
 761       return
 762     self._log = {}
 763     self._callbacks = {}
 764
 765   def register(self, file, which_pass, callback=None):
 766     """Register FILE for cleanup at the end of WHICH_PASS, running
 767     function CALLBACK prior to removal.  Registering a given FILE is
 768     idempotent; you may register as many times as you wish, but it
 769     will only be cleaned up once.
 770
 771     Note that if a file is registered multiple times, only the first
 772     callback registered for that file will be called at cleanup
 773     time.  Also note that if you register a database file you must
 774     close the database before cleanup, e.g. using a callback."""
 775     self._log.setdefault(which_pass, {})[file] = 1
 776     if callback and not self._callbacks.has_key(file):
 777       self._callbacks[file] = callback
 778
 779   def cleanup(self, which_pass):
 780     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 781     if not self._log.has_key(which_pass):
 782       return
 783     for file in self._log[which_pass]:
 784       Log().write(LOG_VERBOSE, "Deleting", file)
 785       if self._callbacks.has_key(file):
 786         self._callbacks[file]()
 787       os.unlink(file)
 788
 789
# Always use these constants for opening databases.  They are given
# as the MODE argument of the database classes below, which hand the
# value on to anydbm.open(); 'n' is the flag for a "new" database.
DB_OPEN_READ = 'r'
DB_OPEN_NEW = 'n'
 793
 794
 795 class AbstractDatabase:
 796   """An abstract base class for anydbm-based databases."""
 797
 798   def __init__(self, filename, mode):
 799     """A convenience function for opening an anydbm database."""
 800     # pybsddb3 has a bug which prevents it from working with
 801     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 802     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 803     # for databases protected by lock and transaction support
 804     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 805     #
 806     # Therefore, manually perform the removal (we can do this, because
 807     # we know that for bsddb - but *not* anydbm in general - the database
 808     # consists of one file with the name we specify, rather than several
 809     # based on that name).
 810     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 811       if os.path.isfile(filename):
 812         os.unlink(filename)
 813       mode = 'c'
 814
 815     self.db = anydbm.open(filename, mode)
 816     self.has_key = self.db.has_key
 817     self.__delitem__ = self.db.__delitem__
 818
 819   def get(self, key, default=None):
 820     """bsddb3 doesn't have a get() method, so define one here."""
 821
 822     try:
 823       return self[key]
 824     except KeyError:
 825       return default
 826
 827
class SDatabase(AbstractDatabase):
  """A database that can only store strings.

  Values are passed to and returned from the underlying self.db
  handle unchanged; no serialization step is applied."""

  def __getitem__(self, key):
    """Return the value stored under KEY exactly as self.db holds it."""
    return self.db[key]

  def __setitem__(self, key, value):
    """Store VALUE under KEY as-is."""
    self.db[key] = value
 836
 837
 838 class Database(AbstractDatabase):
 839   """A database that uses the marshal module to store built-in types."""
 840
 841   def __getitem__(self, key):
 842     return marshal.loads(self.db[key])
 843
 844   def __setitem__(self, key, value):
 845     self.db[key] = marshal.dumps(value)
 846
 847
 848 class StatsKeeper:
 849   __shared_state = { }
 850   def __init__(self):
 851     self.__dict__ = self.__shared_state
 852     if self.__dict__:
 853       return
 854     self.filename = temp(STATISTICS_FILE)
 855     Cleanup().register(self.filename, pass8)
 856     # This can get kinda large, so we don't store it in our data dict.
 857     self.repos_files = { }
 858
 859     if os.path.exists(self.filename):
 860       self.unarchive()
 861     else:
 862       self.data = { 'cvs_revs_count' : 0,
 863                     'tags': { },
 864                     'branches' : { },
 865                     'repos_size' : 0,
 866                     'repos_file_count' : 0,
 867                     'svn_rev_count' : None,
 868                     'first_rev_date' : 1L<<32,
 869                     'last_rev_date' : 0,
 870                     'pass_timings' : { },
 871                     'start_time' : 0,
 872                     'end_time' : 0,
 873                     }
 874
 875   def log_duration_for_pass(self, duration, pass_num):
 876     self.data['pass_timings'][pass_num] = duration
 877
 878   def set_start_time(self, start):
 879     self.data['start_time'] = start
 880
 881   def set_end_time(self, end):
 882     self.data['end_time'] = end
 883
 884   def _bump_item(self, key, amount=1):
 885     self.data[key] = self.data[key] + amount
 886
 887   def reset_c_rev_info(self):
 888     self.data['cvs_revs_count'] = 0
 889     self.data['tags'] = { }
 890     self.data['branches'] = { }
 891
 892   def record_c_rev(self, c_rev):
 893     self._bump_item('cvs_revs_count')
 894
 895     for tag in c_rev.tags:
 896       self.data['tags'][tag] = None
 897     for branch in c_rev.branches:
 898       self.data['branches'][branch] = None
 899
 900     if c_rev.timestamp < self.data['first_rev_date']:
 901       self.data['first_rev_date'] = c_rev.timestamp
 902
 903     if c_rev.timestamp > self.data['last_rev_date']:
 904       self.data['last_rev_date'] = c_rev.timestamp
 905
 906     # Only add the size if this is the first time we see the file.
 907     if not self.repos_files.has_key(c_rev.fname):
 908       self._bump_item('repos_size', c_rev.file_size)
 909     self.repos_files[c_rev.fname] = None
 910
 911     self.data['repos_file_count'] = len(self.repos_files)
 912
 913   def set_svn_rev_count(self, count):
 914     self.data['svn_rev_count'] = count
 915
 916   def svn_rev_count(self):
 917     return self.data['svn_rev_count']
 918
 919   def archive(self):
 920     open(self.filename, 'w').write(marshal.dumps(self.data))
 921
 922   def unarchive(self):
 923     self.data = marshal.loads(open(self.filename, 'r').read())
 924
 925   def __str__(self):
 926     svn_revs_str = ""
 927     if self.data['svn_rev_count'] is not None:
 928       svn_revs_str = ('Total SVN Commits:      %10s\n'
 929                       % self.data['svn_rev_count'])
 930
 931     return ('\n'                                \
 932             'cvs2svn Statistics:\n'             \
 933             '------------------\n'              \
 934             'Total CVS Files:        %10i\n'    \
 935             'Total CVS Revisions:    %10i\n'    \
 936             'Total Unique Tags:      %10i\n'    \
 937             'Total Unique Branches:  %10i\n'    \
 938             'CVS Repos Size in KB:   %10i\n'    \
 939             '%s'                                \
 940             'First Revision Date:    %s\n'      \
 941             'Last Revision Date:     %s\n'      \
 942             '------------------'                \
 943             % (self.data['repos_file_count'],
 944                self.data['cvs_revs_count'],
 945                len(self.data['tags']),
 946                len(self.data['branches']),
 947                (self.data['repos_size'] / 1024),
 948                svn_revs_str,
 949                time.ctime(self.data['first_rev_date']),
 950                time.ctime(self.data['last_rev_date']),
 951                ))
 952
 953   def timings(self):
 954     passes = self.data['pass_timings'].keys()
 955     passes.sort()
 956     str = 'Timings:\n------------------\n'
 957
 958     def desc(val):
 959       if val == 1: return "second"
 960       return "seconds"
 961
 962     for pass_num in passes:
 963       duration = int(self.data['pass_timings'][pass_num])
 964       p_str = ('pass %d:%6d %s\n'
 965                % (pass_num, duration, desc(duration)))
 966       str = str + p_str
 967
 968     total = int(self.data['end_time'] - self.data['start_time'])
 969     str = str + ('total: %6d %s' % (total, desc(total)))
 970     return str
 971
 972
 973 class LastSymbolicNameDatabase:
 974   """ Passing every CVSRevision in s-revs to this class will result in
 975   a Database whose key is the last CVS Revision a symbolicname was
 976   seen in, and whose value is a list of all symbolicnames that were
 977   last seen in that revision."""
 978   def __init__(self, mode):
 979     self.symbols = {}
 980     self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
 981     Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
 982
 983   # Once we've gone through all the revs,
 984   # symbols.keys() will be a list of all tags and branches, and
 985   # their corresponding values will be a key into the last CVS revision
 986   # that they were used in.
 987   def log_revision(self, c_rev):
 988     # Gather last CVS Revision for symbolic name info and tag info
 989     for tag in c_rev.tags:
 990       self.symbols[tag] = c_rev.unique_key()
 991     if c_rev.op is not OP_DELETE:
 992       for branch in c_rev.branches:
 993         self.symbols[branch] = c_rev.unique_key()
 994
 995   # Creates an inversion of symbols above--a dictionary of lists (key
 996   # = CVS rev unique_key: val = list of symbols that close in that
 997   # rev.
 998   def create_database(self):
 999     for sym, rev_unique_key in self.symbols.items():
1000       ary = self.symbol_revs_db.get(rev_unique_key, [])
1001       ary.append(sym)
1002       self.symbol_revs_db[rev_unique_key] = ary
1003
1004
class CVSRevisionDatabase:
  """Stores CVSRevision objects in string form and hands them back
  when asked for by their unique_key()."""

  def __init__(self, mode):
    """Open the revisions database in MODE (which has the same meaning
    as the MODE argument to Database or anydbm.open()) and register
    its file for cleanup at the end of pass8."""
    self.cvs_revs_db = SDatabase(temp(CVS_REVS_DB), mode)
    Cleanup().register(temp(CVS_REVS_DB), pass8)

  def log_revision(self, c_rev):
    """Store the CVSRevision C_REV under its unique key."""
    serialized = str(c_rev)
    key = c_rev.unique_key()
    self.cvs_revs_db[key] = serialized

  def get_revision(self, unique_key):
    """Rebuild and return the CVSRevision that was stored under
    UNIQUE_KEY."""
    ctx = Ctx()
    serialized = self.cvs_revs_db[unique_key]
    return CVSRevision(ctx, serialized)
1022
1023
def TagsDatabase(mode):
  """Open, in MODE, the database that records which symbolic names
  are tags, and return it.

  Each key is a tag name.  The value has no meaning, and should be
  set to None.  The database file is registered for cleanup at the
  end of pass8.

  NOTE(review): the backing SDatabase is documented as storing only
  strings -- confirm what callers actually store as the value."""
  tags_db = SDatabase(temp(TAGS_DB), mode)
  Cleanup().register(temp(TAGS_DB), pass8)
  return tags_db
1031
1032
class Project:
  """A project within a CVS repository."""

  def __init__(self, project_cvs_repos_path,
               trunk_path, branches_path, tags_path):
    """Create a new Project record.

    PROJECT_CVS_REPOS_PATH is the main CVS directory for this project
    (within the filesystem).  TRUNK_PATH, BRANCHES_PATH, and TAGS_PATH
    are the full, normalized directory names in svn for the
    corresponding part of the repository.

    Raise FatalError if PROJECT_CVS_REPOS_PATH does not start with the
    path of the CVS repository."""

    self.project_cvs_repos_path = project_cvs_repos_path
    prefix = Ctx().cvs_repository.cvs_repos_path
    if not self.project_cvs_repos_path.startswith(prefix):
      raise FatalError("Project '%s' must start with '%s'"
                       % (self.project_cvs_repos_path, prefix,))
    # The project's main directory as a cvs_path:
    self.project_cvs_path = self.project_cvs_repos_path[len(prefix):]
    if self.project_cvs_path.startswith(os.sep):
      self.project_cvs_path = self.project_cvs_path[1:]
    self.trunk_path = trunk_path
    self.branches_path = branches_path
    self.tags_path = tags_path
    verify_paths_disjoint(self.trunk_path, self.branches_path, self.tags_path)

  def is_source(self, svn_path):
    """Return True iff SVN_PATH is a legitimate source for this project.

    Legitimate paths are self.trunk_path or any directory directly
    under self.branches_path."""

    if svn_path == self.trunk_path:
      return True

    (head, tail,) = _path_split(svn_path)
    if head == self.branches_path:
      return True

    return False

  def is_unremovable(self, svn_path):
    """Return True iff the specified path must not be removed.

    That is the case for the trunk, branches and tags directories
    themselves."""

    return svn_path in [self.trunk_path, self.branches_path, self.tags_path]

  def get_branch_path(self, branch_name):
    """Return the svnpath for the branch named BRANCH_NAME."""

    return _path_join(self.branches_path, _clean_symbolic_name(branch_name))

  def get_tag_path(self, tag_name):
    """Return the svnpath for the tag named TAG_NAME."""

    return _path_join(self.tags_path, _clean_symbolic_name(tag_name))

  def _relative_name(self, cvs_path):
    """Convert CVS_PATH into a name relative to this project's root directory.

    CVS_PATH has to begin (textually) with self.project_cvs_path.
    Remove prefix and optional '/'.  If CVS_PATH is exactly the
    project path, the result is the empty string.

    Raise FatalError if CVS_PATH does not start with the project path."""

    if not cvs_path.startswith(self.project_cvs_path):
      raise FatalError(
          "_relative_name: '%s' is not a sub-path of '%s'"
          % (cvs_path, self.project_cvs_path,))
    prefix_len = len(self.project_cvs_path)
    # Use a slice rather than an index: when CVS_PATH is no longer
    # than the prefix there is no character at PREFIX_LEN, and
    # indexing would raise IndexError.
    if cvs_path[prefix_len:prefix_len + 1] == os.sep:
      prefix_len += 1
    return cvs_path[prefix_len:]

  def make_trunk_path(self, cvs_path):
    """Return the trunk path for CVS_PATH.

    Return the svn path for this file on trunk."""

    return _path_join(self.trunk_path, self._relative_name(cvs_path))

  def make_branch_path(self, branch_name, cvs_path):
    """Return the svn path for CVS_PATH on branch BRANCH_NAME."""

    return _path_join(self.get_branch_path(branch_name),
                      self._relative_name(cvs_path))
1116
1117
class CVSRevision:
  """One revision of one RCS file, as described by a line of a revs
  file."""

  def __init__(self, ctx, *args):
    """Initialize a new CVSRevision with Ctx object CTX, and ARGS.

    If CTX is None, the following members and methods of the
    instantiated CVSRevision class object will be unavailable (or
    simply will not work correctly, if at all):
       cvs_path
       svn_path
       is_default_branch_revision()

    (Note that this class treats CTX as const, because the caller
    likely passed in a Borg instance of a Ctx.  The reason this class
    takes CTX as a parameter, instead of just instantiating a Ctx
    itself, is that this class should be usable outside cvs2svn.)

    If there is one argument in ARGS, it is a string, in the format of
    a line from a revs file.  Do *not* include a trailing newline.

    If there are multiple ARGS, there must be 17 of them,
    comprising a parsed revs line:
       timestamp       -->  (int) date stamp for this cvs revision
       digest          -->  (string) digest of author+logmsg
       prev_timestamp  -->  (int) date stamp for the previous cvs revision
       next_timestamp  -->  (int) date stamp for the next cvs revision
       op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
       prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
       rev             -->  (string) this CVS rev, e.g., "1.3"
       next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
       file_in_attic   -->  (char or None) true if RCS file is in Attic
       file_executable -->  (char or None) true if RCS file has exec bit set.
       file_size       -->  (int) size of the RCS file
       deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
       fname           -->  (string) relative path of file in CVS repos
       mode            -->  (string or None) "kkv", "kb", etc.
       branch_name     -->  (string or None) branch on which this rev occurred
       tags            -->  (list of strings) all tags on this revision
       branches        -->  (list of strings) all branches rooted in this rev

    The two forms of initialization are equivalent.

    Raise TypeError if ARGS has any other length.

    WARNING: Due to the resync process in pass2, prev_timestamp or
    next_timestamp may be incorrect in the c-revs or s-revs files."""

    self._ctx = ctx
    if len(args) == 17:
      (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
       self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
       self.file_executable, self.file_size, self.deltatext_code,
       self.fname,
       self.mode, self.branch_name, self.tags, self.branches) = args
    elif len(args) == 1:
      self._parse_revs_line(args[0])
    else:
      # The counts in the message include CTX.
      raise TypeError('CVSRevision() takes 2 or 18 arguments (%d given)'
                      % (len(args) + 1))
    if ctx is not None:
      self.cvs_path = ctx.cvs_repository.get_cvs_path(self.fname)
      if self.branch_name:
        self.svn_path = ctx.project.make_branch_path(self.branch_name,
                                                     self.cvs_path)
      else:
        self.svn_path = ctx.project.make_trunk_path(self.cvs_path)

  def _parse_revs_line(self, line):
    """Set this object's fields from LINE, a revs-file line in the
    format produced by __str__ (without a trailing newline)."""
    # The first 15 fields are single words; everything after them
    # (tags, branch count, branches, filename) stays in REMAINDER.
    data = line.split(' ', 15)
    (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
     self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
     self.file_executable, self.file_size, self.deltatext_code,
     self.mode, self.branch_name, numtags, remainder) = data

    # Patch up data items which are not simple strings.  The timestamp
    # is written in hex; "*" stands for a missing value.
    self.timestamp = int(self.timestamp, 16)
    if self.prev_timestamp == "*":
      self.prev_timestamp = 0
    else:
      self.prev_timestamp = int(self.prev_timestamp)
    if self.next_timestamp == "*":
      self.next_timestamp = 0
    else:
      self.next_timestamp = int(self.next_timestamp)
    for attr in ('prev_rev', 'next_rev', 'file_in_attic',
                 'file_executable', 'mode', 'branch_name'):
      if getattr(self, attr) == "*":
        setattr(self, attr, None)
    self.file_size = int(self.file_size)

    # REMAINDER is "<tags...> <numbranches> <branches...> <fname>".
    # The filename comes last because it may itself contain spaces.
    numtags = int(numtags)
    tags_and_numbranches_and_remainder = remainder.split(' ', numtags + 1)
    self.tags = tags_and_numbranches_and_remainder[:-2]
    numbranches = int(tags_and_numbranches_and_remainder[-2])
    remainder = tags_and_numbranches_and_remainder[-1]
    branches_and_fname = remainder.split(' ', numbranches)
    self.branches = branches_and_fname[:-1]
    self.fname = branches_and_fname[-1]

  # The 'primary key' of a CVS Revision is the revision number + the
  # filename.  To provide a unique key (say, for a dict), we just glom
  # them together in a string.  By passing in self.prev_rev or
  # self.next_rev, you can get the unique key for their respective
  # CVSRevisions.
  def unique_key(self, revnum="0"):
    """Return the unique key for revision REVNUM of this file.

    With the default REVNUM of "0" the key is for this revision
    itself.  Return None if REVNUM is None."""
    # Compare by value: an identity test against a string literal only
    # works when both happen to be the same object.
    if revnum == "0":
      revnum = self.rev
    elif revnum is None:
      return None
    return revnum + "/" + self.fname

  def __str__(self):
    """Return this revision as a revs-file line (no trailing newline).

    Missing values are written as "*"."""
    return ('%08lx %s %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s'
            % (self.timestamp, self.digest, self.prev_timestamp or "*",
              self.next_timestamp or "*", self.op, (self.prev_rev or "*"),
              self.rev, (self.next_rev or "*"), (self.file_in_attic or "*"),
              (self.file_executable or "*"),
              self.file_size,
              self.deltatext_code, (self.mode or "*"),
              (self.branch_name or "*"),
              len(self.tags), self.tags and " " or "", " ".join(self.tags),
              len(self.branches), self.branches and " " or "",
              " ".join(self.branches),
              self.fname, ))

  # Returns true if this CVSRevision is the opening CVSRevision for
  # NAME (for this RCS file).
  def opens_symbolic_name(self, name):
    """Return 1 if this revision opens the symbolic name NAME, else 0."""
    if name in self.tags:
      return 1
    if name in self.branches:
      # If this c_rev opens a branch and our op is OP_DELETE, then
      # that means that the file that this c_rev belongs to was
      # created on the branch, so for all intents and purposes, this
      # c_rev is *technically* not an opening.  See Issue #62 for more
      # information.
      if self.op != OP_DELETE:
        return 1
    return 0

  def is_default_branch_revision(self):
    """Return 1 if SELF.rev of SELF.cvs_path is a default branch
    revision according to DEFAULT_BRANCHES_DB (see the conditions
    documented there), else return None."""
    val = self._ctx._default_branches_db.get(self.cvs_path, None)
    if val is not None:
      # Split both revision numbers into "branch part" and final
      # component, e.g. "1.1.1.3" -> "1.1.1" and 3.
      val_last_dot = val.rindex(".")
      our_last_dot = self.rev.rindex(".")
      default_branch = val[:val_last_dot]
      our_branch = self.rev[:our_last_dot]
      default_rev_component = int(val[val_last_dot + 1:])
      our_rev_component = int(self.rev[our_last_dot + 1:])
      if (default_branch == our_branch
          and our_rev_component <= default_rev_component):
        return 1
    # else
    return None

  def rcs_path(self):
    """Returns the actual filesystem path to the RCS file of this
    CVSRevision."""
    if self.file_in_attic is None:
      return self.fname
    else:
      # Files in the Attic live in an 'Attic' subdirectory of the
      # directory that self.fname names.
      basepath, filename = os.path.split(self.fname)
      return os.path.join(basepath, 'Attic', filename)

  def filename(self):
    """Return the last path component of self.fname, minus the ',v'"""
    return os.path.split(self.fname)[-1][:-2]
1288
class SymbolDatabase:
  """This database records information on all symbols in the RCS
  files.  It is created in pass 1 and it is used in pass 2."""
  def __init__(self):
    # A hash that maps tag names to commit counts
    self.tags = { }
    # A hash that maps branch names to lists of the format
    # [ create_count, commit_count, blockers ], where blockers
    # is a hash that lists the symbols that depend on the
    # the branch.  The blockers hash is used as a set, so the
    # values are not used.
    self.branches = { }

  def register_tag_creation(self, name):
    """Register the creation of the tag NAME."""
    self.tags[name] = self.tags.get(name, 0) + 1

  def _branch(self, name):
    """Helper function to get a branch node that will create and
    initialize the node if it does not exist."""
    if name not in self.branches:
      self.branches[name] = [ 0, 0, { } ]
    return self.branches[name]

  def register_branch_creation(self, name):
    """Register the creation of the branch NAME."""
    self._branch(name)[0] += 1

  def register_branch_commit(self, name):
    """Register a commit on the branch NAME."""
    self._branch(name)[1] += 1

  def register_branch_blocker(self, name, blocker):
    """Register BLOCKER as a blocker on the branch NAME."""
    self._branch(name)[2][blocker] = None

  def branch_has_commit(self, name):
    """Return a true value if NAME has commits.  Return a false value
    if NAME is not a branch or if it has no commits."""
    return name in self.branches and self.branches[name][1]

  def find_excluded_symbols(self, regexp_list):
    """Returns a hash of all symbols that match the regexps in
    REGEXP_LIST.  The hash is used as a set so the values are
    not used."""
    excludes = { }
    for tag in self.tags:
      if match_regexp_list(regexp_list, tag):
        excludes[tag] = None
    for branch in self.branches:
      if match_regexp_list(regexp_list, branch):
        excludes[branch] = None
    return excludes

  def find_branch_exclude_blockers(self, branch, excludes):
    """Find all blockers of BRANCH, excluding the ones in the hash
    EXCLUDES.  The result is empty unless BRANCH itself is in
    EXCLUDES.  The returned hash is used as a set."""
    blockers = { }
    if branch in excludes:
      for blocker in self.branches[branch][2]:
        if blocker not in excludes:
          blockers[blocker] = None
    return blockers

  def find_blocked_excludes(self, excludes):
    """Find all branches in EXCLUDES that have blocking symbols that
    are not themselves excluded.  Return a hash that maps branch names
    to a hash of blockers.  The hash of blockers is used as a set so
    the values are not used."""
    blocked_branches = { }
    for branch in self.branches:
      blockers = self.find_branch_exclude_blockers(branch, excludes)
      if blockers:
        blocked_branches[branch] = blockers
    return blocked_branches

  def find_mismatches(self, excludes=None):
    """Find all symbols that are defined as both tags and branches,
    excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
    the symbol name, tag count, branch count and commit count."""
    if excludes is None:
      excludes = { }
    mismatches = [ ]
    for branch in self.branches:
      if branch not in excludes and branch in self.tags:
        mismatches.append((branch,                    # name
                           self.tags[branch],         # tag count
                           self.branches[branch][0],  # branch count
                           self.branches[branch][1])) # commit count
    return mismatches

  def read(self):
    """Read the symbol database from files.

    The tags file has one "NAME COUNT" pair per line.  Each line of
    the branches file holds a branch name, its create count, its
    commit count and then any number of blocker names."""
    f = open(temp(TAGS_LIST))
    try:
      for line in f:
        tag, count = line.split()
        self.tags[tag] = int(count)
    finally:
      f.close()

    f = open(temp(BRANCHES_LIST))
    try:
      for line in f:
        words = line.split()
        blockers = { }
        for blocker in words[3:]:
          blockers[blocker] = None
        self.branches[words[0]] = [ int(words[1]), int(words[2]), blockers ]
    finally:
      f.close()

  def write(self):
    """Store the symbol database to files, in the format understood
    by read().  Both files are registered for cleanup at the end of
    pass2."""
    f = open(temp(TAGS_LIST), "w")
    try:
      Cleanup().register(temp(TAGS_LIST), pass2)
      for tag, count in self.tags.items():
        f.write("%s %d\n" % (tag, count))
    finally:
      # Close explicitly so the data is flushed to disk right away.
      f.close()

    f = open(temp(BRANCHES_LIST), "w")
    try:
      Cleanup().register(temp(BRANCHES_LIST), pass2)
      for branch, info in self.branches.items():
        f.write("%s %d %d" % (branch, info[0], info[1]))
        if info[2]:
          f.write(" ")
          f.write(" ".join(info[2].keys()))
        f.write("\n")
    finally:
      f.close()
1415
1416 class CollectData(cvs2svn_rcsparse.Sink):
  def __init__(self):
    """Open the files and databases that data collection writes to,
    registering each of them for cleanup, and reset the counters."""
    # The revs and resync data files; both are registered for cleanup
    # at the end of pass2.
    self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
    self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
    # A freshly created string database, cleaned up at the end of pass5.
    self.default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
                                         DB_OPEN_NEW)
    Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
    # A freshly created marshal database, cleaned up at the end of pass8.
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
    Cleanup().register(temp(METADATA_DB), pass8)
    self.fatal_errors = []
    self.num_files = 0
    # In-memory record of the tag and branch symbols.
    self.symbol_db = SymbolDatabase()

    # 1 if we've collected data for at least one file, None otherwise.
    self.found_valid_file = None

    # See set_fname() for initializations of other variables.
1435
1436   def set_fname(self, canonical_name, filename):
1437     """Prepare to receive data for FILENAME.  FILENAME is the absolute
1438     filesystem path to the file in question, and CANONICAL_NAME is
1439     FILENAME with the 'Attic' component removed (if the file is indeed
1440     in the Attic) ."""
1441     self.fname = canonical_name
1442
1443     # We calculate and save some file metadata here, where we can do
1444     # it only once per file, instead of waiting until later where we
1445     # would have to do the same calculations once per CVS *revision*.
1446
1447     self.cvs_path = Ctx().cvs_repository.get_cvs_path(self.fname)
1448
1449     # If the paths are not the same, then that means that the
1450     # canonical_name has had the 'Attic' component stripped out.
1451     self.file_in_attic = None
1452     if canonical_name != filename:
1453       self.file_in_attic = 1
1454
1455     file_stat = os.stat(filename)
1456     # The size of our file in bytes
1457     self.file_size = file_stat[stat.ST_SIZE]
1458
1459     # Whether or not the executable bit is set.
1460     self.file_executable = None
1461     if file_stat[0] & stat.S_IXUSR:
1462       self.file_executable = 1
1463
1464     # revision -> [timestamp, author, old-timestamp]
1465     self.rev_data = { }
1466
1467     # Maps revision number (key) to the revision number of the
1468     # previous revision along this line of development.
1469     #
1470     # For the first revision R on a branch, we consider the revision
1471     # from which R sprouted to be the 'previous'.
1472     #
1473     # Note that this revision can't be determined arithmetically (due
1474     # to cvsadmin -o, which is why this is necessary).
1475     #
1476     # If the key has no previous revision, then store None as key's
1477     # value.
1478     self.prev_rev = { }
1479
1480     # This dict is essentially self.prev_rev with the values mapped in
1481     # the other direction, so following key -> value will yield you
1482     # the next revision number.
1483     #
1484     # Unlike self.prev_rev, if the key has no next revision, then the
1485     # key is not present.
1486     self.next_rev = { }
1487
1488     # Track the state of each revision so that in set_revision_info,
1489     # we can determine if our op is an add/change/delete.  We can do
1490     # this because in set_revision_info, we'll have all of the
1491     # revisions for a file at our fingertips, and we need to examine
1492     # the state of our prev_rev to determine if we're an add or a
1493     # change--without the state of the prev_rev, we are unable to
1494     # distinguish between an add and a change.
1495     self.rev_state = { }
1496
1497     # Hash mapping branch numbers, like '1.7.2', to branch names,
1498     # like 'Release_1_0_dev'.
1499     self.branch_names = { }
1500
1501     # RCS flags (used for keyword expansion).
1502     self.mode = None
1503
1504     # Hash mapping revision numbers, like '1.7', to lists of names
1505     # indicating which branches sprout from that revision, like
1506     # ['Release_1_0_dev', 'experimental_driver', ...].
1507     self.branchlist = { }
1508
1509     # Like self.branchlist, but the values are lists of tag names that
1510     # apply to the key revision.
1511     self.taglist = { }
1512
1513     # If set, this is an RCS branch number -- rcsparse calls this the
1514     # "principal branch", but CVS and RCS refer to it as the "default
1515     # branch", so that's what we call it, even though the rcsparse API
1516     # setter method is still 'set_principal_branch'.
1517     self.default_branch = None
1518
1519     # If the RCS file doesn't have a default branch anymore, but does
1520     # have vendor revisions, then we make an educated guess that those
1521     # revisions *were* the head of the default branch up until the
1522     # commit of 1.2, at which point the file's default branch became
1523     # trunk.  This records the date at which 1.2 was committed.
1524     self.first_non_vendor_revision_date = None
1525
1526     # A list of all symbols defined for the current file.  Used to
1527     # prevent multiple definitions of a symbol, something which can
1528     # easily happen when --symbol-transform is used.
1529     self.defined_symbols = { }
1530
1531   def set_principal_branch(self, branch):
1532     self.default_branch = branch
1533
1534   def set_expansion(self, mode):
1535     self.mode = mode
1536
1537   def set_branch_name(self, branch_number, name):
1538     """Record that BRANCH_NUMBER is the branch number for branch NAME,
1539     and that NAME sprouts from BRANCH_NUMBER .
1540     BRANCH_NUMBER is an RCS branch number with an odd number of components,
1541     for example '1.7.2' (never '1.7.0.2')."""
1542     if not self.branch_names.has_key(branch_number):
1543       self.branch_names[branch_number] = name
1544       # The branchlist is keyed on the revision number from which the
1545       # branch sprouts, so strip off the odd final component.
1546       sprout_rev = branch_number[:branch_number.rfind(".")]
1547       self.branchlist.setdefault(sprout_rev, []).append(name)
1548       self.symbol_db.register_branch_creation(name)
1549     else:
1550       sys.stderr.write("%s: in '%s':\n"
1551                        "   branch '%s' already has name '%s',\n"
1552                        "   cannot also have name '%s', ignoring the latter\n"
1553                        % (warning_prefix, self.fname, branch_number,
1554                           self.branch_names[branch_number], name))
1555
1556   def rev_to_branch_name(self, revision):
1557     """Return the name of the branch on which REVISION lies.
1558     REVISION is a non-branch revision number with an even number of,
1559     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1560     For the convenience of callers, REVISION can also be a trunk
1561     revision such as '1.2', in which case just return None."""
1562     if trunk_rev.match(revision):
1563       return None
1564     return self.branch_names.get(revision[:revision.rindex(".")])
1565
1566   def add_cvs_branch(self, revision, branch_name):
1567     """Record the root revision and branch revision for BRANCH_NAME,
1568     based on REVISION.  REVISION is a CVS branch number having an even
1569     number of components where the second-to-last is '0'.  For
1570     example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
1571     from 1.7 and has branch number 1.7.2."""
1572     last_dot = revision.rfind(".")
1573     branch_rev = revision[:last_dot]
1574     last2_dot = branch_rev.rfind(".")
1575     branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
1576     self.set_branch_name(branch_rev, branch_name)
1577
1578   def define_tag(self, name, revision):
1579     """Record a bidirectional mapping between symbolic NAME and REVISION.
1580     REVISION is an unprocessed revision number from the RCS file's
1581     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
1582     This function will determine what kind of symbolic name it is by
1583     inspection, and record it in the right places."""
1584     for (pattern, replacement) in Ctx().symbol_transforms:
1585       newname = pattern.sub(replacement, name)
1586       if newname != name:
1587         Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
1588                     % (name, newname))
1589         name = newname
1590     if self.defined_symbols.has_key(name):
1591       err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
1592                 % (error_prefix, name, self.fname)
1593       sys.stderr.write(err + "\n")
1594       self.fatal_errors.append(err)
1595     self.defined_symbols[name] = None
1596     if branch_tag.match(revision):
1597       self.add_cvs_branch(revision, name)
1598     elif vendor_tag.match(revision):
1599       self.set_branch_name(revision, name)
1600     else:
1601       self.taglist.setdefault(revision, []).append(name)
1602       self.symbol_db.register_tag_creation(name)
1603
1604   def define_revision(self, revision, timestamp, author, state,
1605                       branches, next):
1606
1607     # Record the state of our revision for later calculations
1608     self.rev_state[revision] = state
1609
1610     # store the rev_data as a list in case we have to jigger the timestamp
1611     self.rev_data[revision] = [int(timestamp), author, None]
1612
1613     # When on trunk, the RCS 'next' revision number points to what
1614     # humans might consider to be the 'previous' revision number.  For
1615     # example, 1.3's RCS 'next' is 1.2.
1616     #
1617     # However, on a branch, the RCS 'next' revision number really does
1618     # point to what humans would consider to be the 'next' revision
1619     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
1620     #
1621     # In other words, in RCS, 'next' always means "where to find the next
1622     # deltatext that you need this revision to retrieve.
1623     #
1624     # That said, we don't *want* RCS's behavior here, so we determine
1625     # whether we're on trunk or a branch and set self.prev_rev
1626     # accordingly.
1627     #
1628     # One last thing.  Note that if REVISION is a branch revision,
1629     # instead of mapping REVISION to NEXT, we instead map NEXT to
1630     # REVISION.  Since we loop over all revisions in the file before
1631     # doing anything with the data we gather here, this 'reverse
1632     # assignment' effectively does the following:
1633     #
1634     # 1. Gives us no 'prev' value for REVISION (in this
1635     # iteration... it may have been set in a previous iteration)
1636     #
1637     # 2. Sets the 'prev' value for the revision with number NEXT to
1638     # REVISION.  So when we come around to the branch revision whose
1639     # revision value is NEXT, its 'prev' and 'prev_rev' are already
1640     # set.
1641     if trunk_rev.match(revision):
1642       self.prev_rev[revision] = next
1643       self.next_rev[next] = revision
1644     elif next:
1645       self.prev_rev[next] = revision
1646       self.next_rev[revision] = next
1647
1648     for b in branches:
1649       self.prev_rev[b] = revision
1650
1651     # Ratchet up the highest vendor head revision, if necessary.
1652     if self.default_branch:
1653       default_branch_root = self.default_branch + "."
1654       if ((revision.find(default_branch_root) == 0)
1655           and (default_branch_root.count('.') == revision.count('.'))):
1656         # This revision is on the default branch, so record that it is
1657         # the new highest default branch head revision.
1658         self.default_branches_db[self.cvs_path] = revision
1659     else:
1660       # No default branch, so make an educated guess.
1661       if revision == '1.2':
1662         # This is probably the time when the file stopped having a
1663         # default branch, so make a note of it.
1664         self.first_non_vendor_revision_date = timestamp
1665       else:
1666         m = vendor_revision.match(revision)
1667         if m and ((not self.first_non_vendor_revision_date)
1668                   or (timestamp < self.first_non_vendor_revision_date)):
1669           # We're looking at a vendor revision, and it wasn't
1670           # committed after this file lost its default branch, so bump
1671           # the maximum trunk vendor revision in the permanent record.
1672           self.default_branches_db[self.cvs_path] = revision
1673
1674     if not trunk_rev.match(revision):
1675       # Check for unlabeled branches, record them.  We tried to collect
1676       # all branch names when we parsed the symbolic name header
1677       # earlier, of course, but that didn't catch unlabeled branches.
1678       # If a branch is unlabeled, this is our first encounter with it,
1679       # so we have to record its data now.
1680       branch_number = revision[:revision.rindex(".")]
1681       if not self.branch_names.has_key(branch_number):
1682         branch_name = "unlabeled-" + branch_number
1683         self.set_branch_name(branch_number, branch_name)
1684
1685       # Register the commit on this non-trunk branch
1686       branch_name = self.branch_names[branch_number]
1687       self.symbol_db.register_branch_commit(branch_name)
1688
1689   def tree_completed(self):
1690     "The revision tree has been parsed.  Analyze it for consistency."
1691
1692     # Our algorithm depends upon the timestamps on the revisions occuring
1693     # monotonically over time.  That is, we want to see rev 1.34 occur in
1694     # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
1695     # sorting), and then tried to insert 1.34, we'd be screwed.
1696
1697     # to perform the analysis, we'll simply visit all of the 'previous'
1698     # links that we have recorded and validate that the timestamp on the
1699     # previous revision is before the specified revision
1700
1701     # if we have to resync some nodes, then we restart the scan. just keep
1702     # looping as long as we need to restart.
1703     while 1:
1704       for current, prev in self.prev_rev.items():
1705         if not prev:
1706           # no previous revision exists (i.e. the initial revision)
1707           continue
1708         t_c = self.rev_data[current][0]
1709         t_p = self.rev_data[prev][0]
1710         if t_p >= t_c:
1711           # the previous revision occurred later than the current revision.
1712           # shove the previous revision back in time (and any before it that
1713           # may need to shift).
1714
1715           # We sync backwards and not forwards because any given CVS
1716           # Revision has only one previous revision.  However, a CVS
1717           # Revision can *be* a previous revision for many other
1718           # revisions (e.g., a revision that is the source of multiple
1719           # branches).  This becomes relevant when we do the secondary
1720           # synchronization in pass 2--we can make certain that we
1721           # don't resync a revision earlier than it's previous
1722           # revision, but it would be non-trivial to make sure that we
1723           # don't resync revision R *after* any revisions that have R
1724           # as a previous revision.
1725           while t_p >= t_c:
1726             self.rev_data[prev][0] = t_c - 1    # new timestamp
1727             self.rev_data[prev][2] = t_p        # old timestamp
1728             delta = t_c - 1 - t_p
1729             msg =  "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
1730                   % (self.cvs_path, prev, time.ctime(t_p), delta)
1731             Log().write(LOG_VERBOSE, msg)
1732             if (delta > COMMIT_THRESHOLD
1733                 or delta < (COMMIT_THRESHOLD * -1)):
1734               str = "%s: Significant timestamp change for '%s' (%d seconds)"
1735               Log().write(LOG_WARN,
1736                           str % (warning_prefix, self.cvs_path, delta))
1737             current = prev
1738             prev = self.prev_rev[current]
1739             if not prev:
1740               break
1741             t_c = t_c - 1               # self.rev_data[current][0]
1742             t_p = self.rev_data[prev][0]
1743
1744           # break from the for-loop
1745           break
1746       else:
1747         # finished the for-loop (no resyncing was performed)
1748         return
1749
1750   def set_revision_info(self, revision, log, text):
1751     timestamp, author, old_ts = self.rev_data[revision]
1752     digest = sha.new(log + '\0' + author).hexdigest()
1753     if old_ts:
1754       # the timestamp on this revision was changed. log it for later
1755       # resynchronization of other files's revisions that occurred
1756       # for this time and log message.
1757       self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
1758
1759     # "...Give back one kadam to honor the Hebrew God whose Ark this is."
1760     #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
1761     #
1762     # If revision 1.1 appears to have been created via 'cvs add'
1763     # instead of 'cvs import', then this file probably never had a
1764     # default branch, so retroactively remove its record in the
1765     # default branches db.  The test is that the log message CVS uses
1766     # for 1.1 in imports is "Initial revision\n" with no period.
1767     if revision == '1.1' and log != 'Initial revision\n':
1768       try:
1769         del self.default_branches_db[self.cvs_path]
1770       except KeyError:
1771         pass
1772
1773     # Get the timestamps of the previous and next revisions
1774     prev_rev = self.prev_rev[revision]
1775     prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])
1776
1777     next_rev = self.next_rev.get(revision)
1778     next_timestamp, ign, ign = self.rev_data.get(next_rev, [0, None, None])
1779
1780     # How to tell if a CVSRevision is an add, a change, or a deletion:
1781     #
1782     # It's a delete if RCS state is 'dead'
1783     #
1784     # It's an add if RCS state is 'Exp.' and
1785     #      - we either have no previous revision
1786     #        or
1787     #      - we have a previous revision whose state is 'dead'
1788     #
1789     # Anything else is a change.
1790     if self.rev_state[revision] == 'dead':
1791       op = OP_DELETE
1792     elif ((self.prev_rev.get(revision, None) is None)
1793           or (self.rev_state[self.prev_rev[revision]] == 'dead')):
1794       op = OP_ADD
1795     else:
1796       op = OP_CHANGE
1797
1798     def is_branch_revision(rev):
1799       """Return True if this revision is not a trunk revision,
1800       else return False."""
1801       if rev.count('.') >= 3:
1802         return True
1803       return False
1804
1805     def is_same_line_of_development(rev1, rev2):
1806       """Return True if rev1 and rev2 are on the same line of
1807       development (i.e., both on trunk, or both on the same branch);
1808       return False otherwise.  Either rev1 or rev2 can be None, in
1809       which case automatically return False."""
1810       if rev1 is None or rev2 is None:
1811         return False
1812       if rev1.count('.') == 1 and rev2.count('.') == 1:
1813         return True
1814       if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
1815         return True
1816       return False
1817
1818     # There can be an odd situation where the tip revision of a branch
1819     # is alive, but every predecessor on the branch is in state 'dead',
1820     # yet the revision from which the branch sprouts is alive.  (This
1821     # is sort of a mirror image of the more common case of adding a
1822     # file on a branch, in which the first revision on the branch is
1823     # alive while the revision from which it sprouts is dead.)
1824     #
1825     # In this odd situation, we must mark the first live revision on
1826     # the branch as an OP_CHANGE instead of an OP_ADD, because it
1827     # reflects, however indirectly, a change w.r.t. the source
1828     # revision from which the branch sprouts.
1829     #
1830     # This is issue #89.
1831     cur_num = revision
1832     if is_branch_revision(revision) and self.rev_state[revision] != 'dead':
1833       while 1:
1834         prev_num = self.prev_rev.get(cur_num, None)
1835         if not cur_num or not prev_num:
1836           break
1837         if (not is_same_line_of_development(cur_num, prev_num)
1838             and self.rev_state[cur_num] == 'dead'
1839             and self.rev_state[prev_num] != 'dead'):
1840           op = OP_CHANGE
1841         cur_num = self.prev_rev.get(cur_num, None)
1842
1843     if text:
1844       deltatext_code = DELTATEXT_NONEMPTY
1845     else:
1846       deltatext_code = DELTATEXT_EMPTY
1847
1848     c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp,
1849                         next_timestamp, op,
1850                         prev_rev, revision, next_rev,
1851                         self.file_in_attic, self.file_executable,
1852                         self.file_size,
1853                         deltatext_code, self.fname,
1854                         self.mode, self.rev_to_branch_name(revision),
1855                         self.taglist.get(revision, []),
1856                         self.branchlist.get(revision, []))
1857     self.revs.write(str(c_rev) + "\n")
1858     StatsKeeper().record_c_rev(c_rev)
1859
1860     if not self.metadata_db.has_key(digest):
1861       self.metadata_db[digest] = (author, log)
1862
1863   def parse_completed(self):
1864     # Walk through all branches and tags and register them with
1865     # their parent branch in the symbol database.
1866     for revision, symbols in self.taglist.items() + self.branchlist.items():
1867       for symbol in symbols:
1868         name = self.rev_to_branch_name(revision)
1869         if name is not None:
1870           self.symbol_db.register_branch_blocker(name, symbol)
1871
1872     self.num_files = self.num_files + 1
1873
1874   def write_symbol_db(self):
1875     self.symbol_db.write()
1876
class SymbolingsLogger:
  """Maintain the file that lists symbol openings and closings.

  The data is used later on to work out the valid SVNRevision ranges
  from which a file can be copied when a branch or tag is created in
  Subversion.  To that end an "opening" and a "closing" is recorded
  for each file copied onto a branch or tag:

    - The "opening" is the CVSRevision from which a given branch/tag
      sprouts on a path.

    - The "closing" for that branch/tag and path is the next
      CVSRevision on the same line of development as the opening.

  For example, on file 'foo.c', branch BEE has branch number 1.2.2 and
  therefore sprouts from revision 1.2.  So 1.2 is the opening for BEE
  on path 'foo.c', and 1.3 is the closing for BEE on path 'foo.c'.
  There may be many revisions chronologically between 1.2 and 1.3 (on
  branches of 'foo.c', perhaps even on BEE itself), but 1.3 is the
  next revision *on the same line* as 1.2, which is what makes it the
  closing revision for the symbolic names that 1.2 opens.

  The point of all this is to make branch and tag creation as
  efficient as possible by minimizing the number of copies and deletes
  per creation.  Say revisions 1.2 and 1.3 of foo.c correspond to
  revisions 17 and 30 in Subversion: when creating branch BEE there is
  then some motivation to copy from one of 17-30.  If another file,
  'bar.c', has opening and closing CVSRevisions for BEE corresponding
  to Subversion revisions 24 and 39, the ideal thing is to copy the
  branch from somewhere between 24 and 29, inclusive.
  """
  def __init__(self):
    # The final openings/closings file.
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
    # Closings are parked here first, because their Subversion
    # revision numbers are not known yet when they are encountered.
    self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
    Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)

    # Maps a *source* cvs_path to the list of (uncleaned) symbolic
    # names for which an opening was noted on that path; see
    # _note_default_branch_opening() and log_default_branch_closing().
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Log the openings found in C_REV and, if C_REV.next_rev is not
    None, note a closing for each of them.

    The openings are written with SVN_REVNUM (and skipped altogether
    when C_REV is a delete).  The revision number of a closing is not
    known yet, so closings only go to the temporary closings file and
    are resolved in close()."""
    symbols = c_rev.tags + c_rev.branches
    for name in symbols:
      self._note_default_branch_opening(c_rev, name)
      if c_rev.op != OP_DELETE:
        self._log(name, svn_revnum,
                  c_rev.cvs_path, c_rev.branch_name, OPENING)

      if c_rev.next_rev is None:
        continue
      # The next revision on this line of development closes the
      # symbol for this source revision.
      closing_key = c_rev.unique_key(c_rev.next_rev)
      self.closings.write('%s %s\n' % (name, closing_key))

  def _log(self, name, svn_revnum, cvs_path, branch_name, type):
    """Write one line to the symbol openings/closings file, stating
    that SVN_REVNUM of CVS_PATH on BRANCH_NAME is the opening or the
    closing (TYPE) of the symbolic name NAME.

    A false BRANCH_NAME is written as '*'.  TYPE should only be one of
    the global constants OPENING or CLOSING."""
    # The revision number is zero-padded to 8 digits, so the field
    # keeps a fixed width up to 99,999,999 SVN revisions.
    branch_field = branch_name or '*'
    line = ('%s %.8d %s %s %s\n'
            % (name, svn_revnum, type, branch_field, cvs_path))
    self.symbolings.write(line)

  def close(self):
    """Go through the temporary closings file, look up the svn_revnum
    of each closing CVSRevision, write a proper line for it to the
    symbolings file, and close the symbolings file."""
    # Needed to get from a rev_key to its c_rev.
    cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    for line in fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP)):
      fields = line.rstrip().split(" ", 1)
      (name, rev_key) = fields
      svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

      c_rev = cvs_revs_db.get_revision(rev_key)
      self._log(name, svn_revnum, c_rev.cvs_path, c_rev.branch_name, CLOSING)

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Remember that SYMBOLIC_NAME has an opening on C_REV.cvs_path.

    The name is appended to the list kept for that path in
    self.open_paths_with_default_branches.  No check is made here on
    whether C_REV is actually a default branch revision."""
    opened = self.open_paths_with_default_branches.setdefault(
        c_rev.cvs_path, [])
    opened.append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If names were noted for C_REV.cvs_path in
    self.open_paths_with_default_branches, log each of them as a
    closing with SVN_REVNUM as the closing revision number, and then
    forget about them."""
    path = c_rev.cvs_path
    if path not in self.open_paths_with_default_branches:
      return
    for name in self.open_paths_with_default_branches[path]:
      self._log(name, svn_revnum, path, None, CLOSING)
    # These openings have been dealt with.
    del self.open_paths_with_default_branches[path]
1988
1989
class PersistenceManager:
  """The PersistenceManager allows us to effectively store SVNCommits
  to disk and retrieve them later using only their subversion revision
  number as the key.  It also returns the subversion revision number
  for a given CVSRevision's unique key.

  All information pertinent to each SVNCommit is stored in a series of
  on-disk databases so that SVNCommits can be retrieved on-demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager will initialize a new set of on-disk
  databases and be fully-featured.
  In 'read' mode, PersistenceManager will open existing on-disk databases
  and the set_* methods will raise RuntimeError."""
  def __init__(self, mode):
    """Open the databases in MODE (DB_OPEN_NEW or DB_OPEN_READ).

    Raise RuntimeError if MODE is neither of the two.  The metadata
    and CVS revision databases are always opened for reading."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
    # str(svn_revnum) -> list of unique keys of the CVSRevisions
    # committed in that revision.
    self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
    Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
    # CVSRevision unique key -> svn_revnum.
    self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
    Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
    # str(svn_revnum) -> (symbolic name, date).
    self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
    Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
    # digest -> (author, log message).
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      # These two only exist for conversions that include symbols.
      self.tags_db = TagsDatabase(DB_OPEN_READ)
      # str(svn_revnum) -> str(motivating svn_revnum).
      self.motivating_revnums = SDatabase(temp(MOTIVATING_REVNUMS), mode)
      Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def _check_writable(self):
    """Raise RuntimeError if this manager was opened with
    DB_OPEN_READ.  Every set_* method calls this first."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    The commit's author and log message are taken from the metadata
    of its first CVSRevision.  Unless this is a trunk-only conversion,
    the symbolic name, date and motivating revnum recorded for
    SVN_REVNUM (if any) are restored as well.

    This method can throw SVNCommitInternalInconsistencyError.
    """
    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)
    c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
    if c_rev_keys is None:
      return None

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      # Set the author and log message for this commit by using
      # CVSRevision metadata, but only if we haven't done so already.
      if digest is None:
        digest = c_rev.digest
        author, log_msg = self.svn_commit_metadata[digest]
        svn_commit.set_author(author)
        svn_commit.set_log_msg(log_msg)

    # If we're doing a trunk-only conversion, we don't need to do any
    # more work (and the databases used below were never opened).
    if Ctx().trunk_only:
      return svn_commit

    name, date = self._get_name_and_date(svn_revnum)
    if name:
      svn_commit.set_symbolic_name(name)
      svn_commit.set_date(date)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
    if motivating_revnum:
      svn_commit.set_motivating_revnum(int(motivating_revnum))
      svn_commit.set_date(date)

    if len(svn_commit.cvs_revs) and name:
      raise SVNCommit.SVNCommitInternalInconsistencyError(
          "An SVNCommit cannot have cvs_revisions *and* a corresponding\n"
          "symbolic name ('%s') to fill."
          % (_clean_symbolic_name(name),))

    return svn_commit

  def set_cvs_revs(self, svn_revnum, cvs_revs):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS.

    Raise RuntimeError if this manager is read-only."""
    self._check_writable()
    # Compute each unique key once and reuse it for the log output and
    # for both directions of the mapping.
    keys = [c_rev.unique_key() for c_rev in cvs_revs]
    for key in keys:
      Log().write(LOG_VERBOSE, " ", key)
    self.svn2cvs_db[str(svn_revnum)] = keys
    for key in keys:
      self.cvs2svn_db[key] = svn_revnum

  def set_name_and_date(self, svn_revnum, name, date):
    """Associate symbolic name NAME and DATE with SVN_REVNUM, and
    note SVN_REVNUM as the revision in which NAME was last filled.

    NAME is allowed to be None.

    Raise RuntimeError if this manager is read-only."""
    self._check_writable()
    self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
    self.last_filled[name] = svn_revnum

  def _get_name_and_date(self, svn_revnum):
    """Return a tuple containing the symbolic name and date associated
    with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
    associated with it."""
    return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))

  def set_motivating_revnum(self, svn_revnum, motivating_revnum):
    """Store MOTIVATING_REVNUM as the value of SVN_REVNUM.

    Raise RuntimeError if this manager is read-only."""
    self._check_writable()
    self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
2118
2119
2120 class CVSCommit:
2121   """Each instance of this class contains a number of CVS Revisions
2122   that correspond to one or more Subversion Commits.  After all CVS
2123   Revisions are added to the grouping, calling process_revisions will
2124   generate a Subversion Commit (or Commits) for the set of CVS
2125   Revisions in the grouping."""
2126
2127   def __init__(self, digest, author, log):
2128     self.digest = digest
2129     self.author = author
2130     self.log = log
2131
2132     # Symbolic names for which the last source revision has already
2133     # been seen and for which the CVSRevisionAggregator has already
2134     # generated a fill SVNCommit.  See self.process_revisions().
2135     self.done_symbols = [ ]
2136
2137     self.files = { }
2138     # Lists of CVSRevisions
2139     self.changes = [ ]
2140     self.deletes = [ ]
2141
2142     # Start out with a t_min higher than any incoming time T, and a
2143     # t_max lower than any incoming T.  This way the first T will
2144     # push t_min down to T, and t_max up to T, naturally (without any
2145     # special-casing), and successive times will then ratchet them
2146     # outward as appropriate.
2147     self.t_min = 1L<<32
2148     self.t_max = 0
2149
2150     # This will be set to the SVNCommit that occurs in self._commit.
2151     self.motivating_commit = None
2152
2153     # This is a list of all non-primary commits motivated by the main
2154     # commit.  We gather these so that we can set their dates to the
2155     # same date as the primary commit.
2156     self.secondary_commits = [ ]
2157
2158     # State for handling default branches.
2159     #
2160     # Here is a tempting, but ultimately nugatory, bit of logic, which
2161     # I share with you so you may appreciate the less attractive, but
2162     # refreshingly non-nugatory, logic which follows it:
2163     #
2164     # If some of the commits in this txn happened on a non-trunk
2165     # default branch, then those files will have to be copied into
2166     # trunk manually after being changed on the branch (because the
2167     # RCS "default branch" appears as head, i.e., trunk, in practice).
2168     # As long as those copies don't overwrite any trunk paths that
2169     # were also changed in this commit, then we can do the copies in
2170     # the same revision, because they won't cover changes that don't
2171     # appear anywhere/anywhen else.  However, if some of the trunk dst
2172     # paths *did* change in this commit, then immediately copying the
2173     # branch changes would lose those trunk mods forever.  So in this
2174     # case, we need to do at least that copy in its own revision.  And
2175     # for simplicity's sake, if we're creating the new revision for
2176     # even one file, then we just do all such copies together in the
2177     # new revision.
2178     #
2179     # Doesn't that sound nice?
2180     #
2181     # Unfortunately, Subversion doesn't support copies with sources
2182     # in the current txn.  All copies must be based in committed
2183     # revisions.  Therefore, we generate the above-described new
2184     # revision unconditionally.
2185     #
2186     # This is a list of c_revs, and a c_rev is appended for each
2187     # default branch commit that will need to be copied to trunk (or
2188     # deleted from trunk) in some generated revision following the
2189     # "regular" revision.
2190     self.default_branch_cvs_revisions = [ ]
2191
2192   def __cmp__(self, other):
2193     # Commits should be sorted by t_max.  If both self and other have
2194     # the same t_max, break the tie using t_min, and lastly, digest
2195     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
2196             or cmp(self.digest, other.digest))
2197
2198   def has_file(self, fname):
2199     return self.files.has_key(fname)
2200
2201   def revisions(self):
2202     return self.changes + self.deletes
2203
2204   def opens_symbolic_name(self, name):
2205     """Returns true if any CVSRevision in this commit is on a tag or a
2206     branch or is the origin of a tag or branch."""
2207     for c_rev in self.revisions():
2208       if c_rev.opens_symbolic_name(name):
2209         return 1
2210     return 0
2211
2212   def add_revision(self, c_rev):
2213     # Record the time range of this commit.
2214     #
2215     # ### ISSUE: It's possible, though unlikely, that the time range
2216     # of a commit could get gradually expanded to be arbitrarily
2217     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
2218     # problem, and anyway deciding where to break it up would be a
2219     # judgement call.  For now, we just print a warning in commit() if
2220     # this happens.
2221     if c_rev.timestamp < self.t_min:
2222       self.t_min = c_rev.timestamp
2223     if c_rev.timestamp > self.t_max:
2224       self.t_max = c_rev.timestamp
2225
2226     if c_rev.op == OP_DELETE:
2227       self.deletes.append(c_rev)
2228     else:
2229       # OP_CHANGE or OP_ADD
2230       self.changes.append(c_rev)
2231
2232     self.files[c_rev.fname] = 1
2233
2234   def _pre_commit(self):
2235     """Generates any SVNCommits that must exist before the main
2236     commit."""
2237
2238     # There may be multiple c_revs in this commit that would cause
2239     # branch B to be filled, but we only want to fill B once.  On the
2240     # other hand, there might be multiple branches committed on in
2241     # this commit.  Whatever the case, we should count exactly one
2242     # commit per branch, because we only fill a branch once per
2243     # CVSCommit.  This list tracks which branches we've already
2244     # counted.
2245     accounted_for_sym_names = [ ]
2246
2247     def fill_needed(c_rev, pm):
2248       """Return 1 if this is the first commit on a new branch (for
2249       this file) and we need to fill the branch; else return 0
2250       (meaning that some other file's first commit on the branch has
2251       already done the fill for us).
2252
2253       If C_REV.op is OP_ADD, only return 1 if the branch that this
2254       commit is on has no last filled revision.
2255
2256       PM is a PersistenceManager to query.
2257       """
2258
2259       # Different '.' counts indicate that c_rev is now on a different
2260       # line of development (and may need a fill)
2261       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
2262         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
2263         # It should be the case that when we have a file F that
2264         # is added on branch B (thus, F on trunk is in state
2265         # 'dead'), we generate an SVNCommit to fill B iff the branch
2266         # has never been filled before.
2267         #
2268         # If this c_rev.op == OP_ADD, *and* the branch has never
2269         # been filled before, then fill it now.  Otherwise, no need to
2270         # fill it.
2271         if c_rev.op == OP_ADD:
2272           if pm.last_filled.get(c_rev.branch_name, None) is None:
2273             return 1
2274         elif c_rev.op == OP_CHANGE:
2275           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
2276             return 1
2277         elif c_rev.op == OP_DELETE:
2278           if pm.last_filled.get(c_rev.branch_name, None) is None:
2279             return 1
2280       return 0
2281
2282     for c_rev in self.changes + self.deletes:
2283       # If a commit is on a branch, we must ensure that the branch
2284       # path being committed exists (in HEAD of the Subversion
2285       # repository).  If it doesn't exist, we will need to fill the
2286       # branch.  After the fill, the path on which we're committing
2287       # will exist.
2288       if c_rev.branch_name \
2289           and c_rev.branch_name not in accounted_for_sym_names \
2290           and c_rev.branch_name not in self.done_symbols \
2291           and fill_needed(c_rev, Ctx()._persistence_manager):
2292         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
2293                                % c_rev.branch_name)
2294         svn_commit.set_symbolic_name(c_rev.branch_name)
2295         self.secondary_commits.append(svn_commit)
2296         accounted_for_sym_names.append(c_rev.branch_name)
2297
2298   def _commit(self):
2299     """Generates the primary SVNCommit that corresponds to this
2300     CVSCommit."""
2301     # Generate an SVNCommit unconditionally.  Even if the only change
2302     # in this CVSCommit is a deletion of an already-deleted file (that
2303     # is, a CVS revision in state 'dead' whose predecessor was also in
2304     # state 'dead'), the conversion will still generate a Subversion
2305     # revision containing the log message for the second dead
2306     # revision, because we don't want to lose that information.
2307     svn_commit = SVNCommit("commit")
2308     self.motivating_commit = svn_commit
2309
2310     for c_rev in self.changes:
2311       svn_commit.add_revision(c_rev)
2312       # Only make a change if we need to.  When 1.1.1.1 has an empty
2313       # deltatext, the explanation is almost always that we're looking
2314       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
2315       # such imports, CVS creates an RCS file where 1.1 has the
2316       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
2317       # content as 1.1.  There's no reason to reflect this non-change
2318       # in the repository, so we want to do nothing in this case.  (If
2319       # we were really paranoid, we could make sure 1.1's log message
2320       # is the CVS-generated "Initial revision\n", but I think the
2321       # conditions below are strict enough.)
2322       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
2323               and (c_rev.rev == "1.1.1.1")):
2324         if c_rev.is_default_branch_revision():
2325           self.default_branch_cvs_revisions.append(c_rev)
2326
2327     for c_rev in self.deletes:
2328       # When a file is added on a branch, CVS not only adds the file
2329       # on the branch, but generates a trunk revision (typically
2330       # 1.1) for that file in state 'dead'.  We only want to add
2331       # this revision if the log message is not the standard cvs
2332       # fabricated log message.
2333       if c_rev.prev_rev is None:
2334         # c_rev.branches may be empty if the originating branch
2335         # has been excluded.
2336         if not c_rev.branches:
2337           continue
2338         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
2339                              % (c_rev.filename(),
2340                                 c_rev.branches[0]))
2341         author, log_msg = \
2342             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
2343         if log_msg == cvs_generated_msg:
2344           continue
2345
2346       svn_commit.add_revision(c_rev)
2347       if c_rev.is_default_branch_revision():
2348         self.default_branch_cvs_revisions.append(c_rev)
2349
2350     # There is a slight chance that we didn't actually register any
2351     # CVSRevisions with our SVNCommit (see loop over self.deletes
2352     # above), so if we have no CVSRevisions, we don't flush the
2353     # svn_commit to disk and roll back our revnum.
2354     if len(svn_commit.cvs_revs) > 0:
2355       svn_commit.flush()
2356     else:
2357       # We will not be flushing this SVNCommit, so rollback the
2358       # SVNCommit revision counter.
2359       SVNCommit.revnum = SVNCommit.revnum - 1
2360
2361     if not Ctx().trunk_only:
2362       for c_rev in self.revisions():
2363         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
2364
2365   def _post_commit(self):
2366     """Generates any SVNCommits that we can perform now that _commit
2367     has happened.  That is, handle non-trunk default branches.
2368     Sometimes an RCS file has a non-trunk default branch, so a commit
2369     on that default branch would be visible in a default CVS checkout
2370     of HEAD.  If we don't copy that commit over to Subversion's trunk,
2371     then there will be no Subversion tree which corresponds to that
2372     CVS checkout.  Of course, in order to copy the path over, we may
2373     first need to delete the existing trunk there.  """
2374
2375     # Only generate a commit if we have default branch revs
2376     if len(self.default_branch_cvs_revisions):
2377       # Generate an SVNCommit for all of our default branch c_revs.
2378       svn_commit = SVNCommit("post-commit default branch(es)")
2379       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
2380       for c_rev in self.default_branch_cvs_revisions:
2381         svn_commit.add_revision(c_rev)
2382         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
2383                                                             svn_commit.revnum)
2384       self.secondary_commits.append(svn_commit)
2385
2386   def process_revisions(self, done_symbols):
2387     """Process all the CVSRevisions that this instance has, creating
2388     one or more SVNCommits in the process.  Generate fill SVNCommits
2389     only for symbols not in DONE_SYMBOLS (avoids unnecessary
2390     fills).
2391
2392     Return the primary SVNCommit that corresponds to this CVSCommit.
2393     The returned SVNCommit is the commit that motivated any other
2394     SVNCommits generated in this CVSCommit."""
2395     self.done_symbols = done_symbols
2396     seconds = self.t_max - self.t_min + 1
2397
2398     Log().write(LOG_VERBOSE, '-' * 60)
2399     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2400     if seconds == 1:
2401       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
2402                   % time.ctime(self.t_max))
2403     else:
2404       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
2405       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
2406                   % (time.ctime(self.t_max), seconds))
2407
2408     if seconds > COMMIT_THRESHOLD + 1:
2409       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2410                   % (warning_prefix, COMMIT_THRESHOLD))
2411
2412     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2413       self._commit()
2414       return self.motivating_commit
2415
2416     self._pre_commit()
2417     self._commit()
2418     self._post_commit()
2419
2420     for svn_commit in self.secondary_commits:
2421       svn_commit.set_date(self.motivating_commit.get_date())
2422       svn_commit.flush()
2423
2424     return self.motivating_commit
2425
2426
class SVNCommit:
  """One commit to the Subversion repository.  An SVNCommit is of one
  of three kinds:

  1. It commits one or more CVSRevisions (and cannot fill a symbolic
     name).

  2. It creates or fills a symbolic name (and cannot commit
     CVSRevisions).

  3. It updates trunk to reflect the contents of a particular branch
     (this is how RCS default branches are handled)."""

  # Revision number that the next newly created SVNCommit will get.
  # Counting starts at 2 because SVNRepositoryMirror uses the first
  # commit to create trunk, tags, and branches.
  revnum = 2

  class SVNCommitInternalInconsistencyError(Exception):
    """Raised when an impossible state is met in the SVNCommit
    databases."""

  def __init__(self, description="", revnum=None, cvs_revs=None):
    """Create an SVNCommit.  DESCRIPTION is used for debugging only.
    If REVNUM is given, the SVNCommit corresponds to that revision
    number; otherwise it takes the class-wide counter SVNCommit.revnum,
    which is then advanced by one.  CVS_REVS, if given, must be the
    exact set of CVSRevisions for REVNUM.

    Passing CVS_REVS without REVNUM is an error (it is not checked
    here).  Passing REVNUM without CVS_REVS is fine: the revisions can
    then be added one at a time with add_revision()."""
    self._description = description
    self.cvs_revs = cvs_revs or []

    if not revnum:
      # No number requested: consume the next one from the class-wide
      # counter.
      self.revnum = SVNCommit.revnum
      SVNCommit.revnum = SVNCommit.revnum + 1
    else:
      self.revnum = revnum

    # Revprop metadata.  The values below are placeholders; by the
    # time they are used, at least the log and the date should have
    # been replaced.
    #
    # These are private because they must be handed out encoded in
    # UTF8 while callers are free to set them in another encoding.
    # They are therefore set through accessor methods and read, as a
    # dictionary, through self.get_revprops().
    self._author = Ctx().username
    self._log_msg = "This log message means an SVNCommit was used too soon."
    self._max_date = 0  # Latest date seen so far.

    # The (uncleaned) symbolic name filled by this SVNCommit, if any.
    self.symbolic_name = None

    # For a default branch synchronization commit, the subversion
    # revision number of the *primary* commit in which the default
    # branch changes really happened; None for every other commit.
    #
    # Several synchronization commits may point at the same
    # motivating revision number, and one synchronization commit may
    # hold CVSRevisions from several different default branches.
    self.motivating_revnum = None

    # True only when this commit fills a symbolic name that is a tag;
    # None in every other case.
    self.is_tag = None

  def set_symbolic_name(self, symbolic_name):
    "Record SYMBOLIC_NAME as the symbolic name this commit fills."
    self.symbolic_name = symbolic_name

  def set_motivating_revnum(self, revnum):
    "Record REVNUM as this commit's motivating revision number."
    self.motivating_revnum = revnum

  def set_author(self, author):
    """Make AUTHOR (a locally-encoded string) the author of this
    SVNCommit.  No other method sets the author."""
    self._author = author

  def set_log_msg(self, msg):
    """Make MSG (a locally-encoded string) the log message of this
    SVNCommit.  No other method sets the log message."""
    self._log_msg = msg

  def set_date(self, date):
    """Make DATE (an integer) the date of this SVNCommit.

    self.add_revision() keeps the date up to date from the
    CVSRevisions it is given, so calling this may be unnecessary, and
    a value set here can be overwritten by a later add_revision()."""
    self._max_date = date

  def get_date(self):
    """Return the date of this SVNCommit as an integer."""
    return self._max_date

  def get_revprops(self):
    """Return a dictionary with the 'svn:author', 'svn:log' and
    'svn:date' revprops of this SVNCommit.

    Author and log message are converted with to_utf8().  If that
    raises UnicodeError, warnings are logged and the unconverted
    values are returned instead."""
    date = format_date(self._max_date)
    try:
      author = self._author
      if author is not None:
        author = to_utf8(author)
      log_msg = to_utf8(self.get_log_msg())
    except UnicodeError:
      Log().write(LOG_WARN, '%s: problem encoding author or log message:'
                  % warning_prefix)
      Log().write(LOG_WARN, "  author: '%s'" % self._author)
      Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
      Log().write(LOG_WARN, "  date:   '%s'" % date)
      Log().write(LOG_WARN,
                  "(subversion rev %s)  Related files:" % self.revnum)
      for c_rev in self.cvs_revs:
        Log().write(LOG_WARN, " ", c_rev.fname)

      Log().write(LOG_WARN, "Consider rerunning with (for example)",
                  "'--encoding=latin1'.\n")
      # Handing back the original data (in its unknown encoding) beats
      # both quitting and recording nothing at all.
      return { 'svn:author' : self._author,
               'svn:log'    : self.get_log_msg(),
               'svn:date'   : date }

    return { 'svn:author' : author,
             'svn:log'    : log_msg,
             'svn:date'   : date }

  def add_revision(self, cvs_rev):
    """Append CVS_REV to self.cvs_revs and move this commit's date
    forward to CVS_REV's timestamp if that is later."""
    self.cvs_revs.append(cvs_rev)
    stamp = cvs_rev.timestamp
    if stamp > self._max_date:
      self._max_date = stamp

  def _is_primary_commit(self):
    """Return true if this is a primary SVNCommit (one with neither a
    symbolic name nor a motivating revnum), false otherwise."""
    return (not self.symbolic_name) and (not self.motivating_revnum)

  def flush(self):
    """Log the creation of this revision and store its cvs_revs, and
    where applicable its motivating revnum, name and date, through the
    persistence manager."""
    Log().write(LOG_NORMAL, "Creating Subversion r%d (%s)"
                % (self.revnum, self._description))
    persistence_manager = Ctx()._persistence_manager
    persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)

    if self.motivating_revnum is not None:
      persistence_manager.set_motivating_revnum(self.revnum,
                                                self.motivating_revnum)

    # Only non-primary commits have their date and/or symbolic_name
    # stored.
    if not self._is_primary_commit():
      persistence_manager.set_name_and_date(
          self.revnum, self.symbolic_name, self._max_date)

  def __str__(self):
    """Return a human-readable description of this SVNCommit.  The
    text is not meant to be parsed by machines (though nobody will
    stop you from trying!)"""
    if self.symbolic_name:
      name_line = ("   symbolic name: "
                   + _clean_symbolic_name(self.symbolic_name) + "\n")
    else:
      name_line = "   NO symbolic name\n"

    pieces = [ "SVNCommit #: " + str(self.revnum) + "\n",
               name_line,
               "   debug description: " + self._description + "\n",
               "   cvs_revs:\n" ]
    for c_rev in self.cvs_revs:
      pieces.append("     " + c_rev.unique_key() + "\n")
    return "".join(pieces)

  def get_log_msg(self):
    """Return the real log message for a primary commit, or the
    matching manufactured log message for a secondary commit."""
    if self.symbolic_name is not None:
      return self._log_msg_for_symbolic_name_commit()
    if self.motivating_revnum is not None:
      return self._log_msg_for_default_branch_commit()
    return self._log_msg

  def _log_msg_for_symbolic_name_commit(self):
    """Return the log message for a manufactured commit that fills
    self.symbolic_name.  The message speaks of a tag if self.is_tag is
    true and of a branch otherwise."""
    if self.is_tag:
      kind = 'tag'
    else:
      kind = 'branch'

    # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
    cleaned_symbolic_name = _clean_symbolic_name(self.symbolic_name)
    if len(cleaned_symbolic_name) < 13:
      separator = ' '
    else:
      separator = '\n'

    return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
           % (kind, separator, cleaned_symbolic_name)

  def _log_msg_for_default_branch_commit(self):
    """Return the log message for a manufactured commit that
    synchronizes a non-trunk default branch with trunk."""
    template = ('This commit was generated by cvs2svn to compensate for '
                'changes in r%d,\n'
                'which included commits to RCS files with non-trunk default '
                'branches.\n')
    return template % self.motivating_revnum
2633
class CVSRevisionAggregator:
  """This class groups CVSRevisions into CVSCommits that represent
  at least one SVNCommit.

  Revisions are handed in one at a time through process_revision();
  flush() commits whatever is still being accumulated."""
  def __init__(self):
    """Open the databases that are read during aggregation and install
    the symbolings logger, persistence manager and default branches
    database on Ctx()."""
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB),
                                   DB_OPEN_READ)

    # A map { key : CVSCommit } of CVS commits currently being
    # accumulated.  If the CVSCommit is still open to further
    # CVSRevisions, then key is CVSRevision.digest.  If not (because
    # an inbound commit wanted to affect a file that was already
    # within the CVSCommit), then key is CVSRevision.digest plus some
    # number of appended '-'.
    self.cvs_commits = {}

    # A map { symbol : None } of symbolic names for which the last
    # source CVSRevision has already been processed but which haven't
    # been closed yet.
    self.pending_symbols = {}

    # A list of closed symbols.  That is, we've already encountered
    # the last CVSRevision that is a source for that symbol, the final
    # fill for this symbol has been done, and we never need to fill it
    # again.
    self.done_symbols = [ ]

    # This variable holds the most recently created primary svn_commit
    # object.  CVSRevisionAggregator maintains this variable merely
    # for its date, so that it can set dates for the SVNCommits
    # created in self._attempt_to_commit_symbols().
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
                                           DB_OPEN_READ)

  def _extract_ready_commits(self, timestamp):
    """Remove from self.cvs_commits, and return as a list, every
    active commit whose t_max plus COMMIT_THRESHOLD lies before
    TIMESTAMP."""

    ready_queue = [ ]
    # Walk an explicit snapshot of the items, because expired entries
    # are deleted from the dictionary as we go.
    for digest_key, cvs_commit in list(self.cvs_commits.items()):
      if cvs_commit.t_max + COMMIT_THRESHOLD < timestamp:
        ready_queue.append(cvs_commit)
        del self.cvs_commits[digest_key]
    return ready_queue

  def process_revision(self, c_rev):
    """Add the CVSRevision C_REV to the CVSCommit it belongs to
    (creating that CVSCommit if necessary), after processing every
    accumulated CVSCommit that C_REV's timestamp has caused to
    expire."""
    # Each time we read a new line, scan the accumulating commits to
    # see if any are ready for processing.
    ready_queue = self._extract_ready_commits(c_rev.timestamp)

    # Again iterate over a snapshot: entries are re-keyed inside the
    # loop.
    for digest_key, cvs_commit in list(self.cvs_commits.items()):
      # If the inbound commit is on the same file as a pending commit,
      # close the pending commit to further changes.  Don't flush it though,
      # as there may be other pending commits dated before this one.
      # ### ISSUE: the has_file() check below is not optimal.
      # It does fix the dataloss bug where revisions would get lost
      # if checked in too quickly, but it can also break apart the
      # commits.  The correct fix would require tracking the dependencies
      # between change sets and committing them in proper order.
      if cvs_commit.has_file(c_rev.fname):
        unused_id = digest_key + '-'
        # Find a string that is not already a key in the
        # self.cvs_commits dict
        while unused_id in self.cvs_commits:
          unused_id = unused_id + '-'
        self.cvs_commits[unused_id] = cvs_commit
        del self.cvs_commits[digest_key]

    # Add this item into the set of still-available commits.
    if c_rev.digest in self.cvs_commits:
      cvs_commit = self.cvs_commits[c_rev.digest]
    else:
      author, log = self.metadata_db[c_rev.digest]
      cvs_commit = CVSCommit(c_rev.digest, author, log)
      self.cvs_commits[c_rev.digest] = cvs_commit
    cvs_commit.add_revision(c_rev)

    if ready_queue:
      # Any elements in the ready_queue at this point need to be
      # processed, because this latest rev couldn't possibly be part
      # of any of them.  Sort them into time-order, then process 'em.
      ready_queue.sort()

      while ready_queue:
        cvs_commit = ready_queue.pop(0)
        self.latest_primary_svn_commit = \
            cvs_commit.process_revisions(self.done_symbols)
        self._add_pending_symbols(c_rev)
        self._attempt_to_commit_symbols(ready_queue)
    else:
      # Make sure we _add_pending_symbols() for this c_rev and
      # _attempt_to_commit_symbols(), even if no commits are ready.
      self._add_pending_symbols(c_rev)
      self._attempt_to_commit_symbols(ready_queue)

  def flush(self):
    """Commit anything left in self.cvs_commits.  Then inform the
    SymbolingsLogger that all commits are done."""

    # Pair each commit with its key (commit first, so that sorting
    # orders by commit) so the entry can be deleted once processed.
    ready_queue = [(v, k) for k, v in self.cvs_commits.items()]

    ready_queue.sort()
    while ready_queue:
      (cvs_commit, key) = ready_queue.pop(0)
      self.latest_primary_svn_commit = \
          cvs_commit.process_revisions(self.done_symbols)
      del self.cvs_commits[key]
      self._attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def _add_pending_symbols(self, c_rev):
    """Add to self.pending_symbols any symbols from C_REV for which
    C_REV is the last CVSRevision.

    If we're not doing a trunk-only conversion, get the symbolic names
    that this c_rev is the last *source* CVSRevision for and add them
    to those left over from previous passes through the aggregator."""

    if not Ctx().trunk_only:
      for sym in self.last_revs_db.get(c_rev.unique_key(), []):
        self.pending_symbols[sym] = None

  def _attempt_to_commit_symbols(self, queued_commits):
    """Generate one SVNCommit for each symbol in self.pending_symbols
    that doesn't have an opening CVSRevision in either QUEUED_COMMITS
    (a list of CVSCommits) or self.cvs_commits.values()."""

    # This method runs for every inbound revision, so don't do any
    # work at all when no symbol is waiting to be closed.
    if not self.pending_symbols:
      return

    # The commits that could still open a symbol are the same for
    # every pending symbol, so build the list once instead of once per
    # symbol.  list() also keeps this working where values() does not
    # return a list.
    unprocessed_commits = list(self.cvs_commits.values()) + queued_commits

    # Make a list of all symbols from self.pending_symbols that do not
    # have *source* CVSRevisions in the pending commit queue
    # (self.cvs_commits) or in queued_commits:
    closeable_symbols = []
    for sym in self.pending_symbols:
      for cvs_commit in unprocessed_commits:
        if cvs_commit.opens_symbolic_name(sym):
          break
      else:
        closeable_symbols.append(sym)

    # Sort the closeable symbols so that we will always process the
    # symbols in the same order, regardless of the order in which the
    # dict hashing algorithm hands them back to us.  We do this so
    # that our tests will get the same results on all platforms.
    closeable_symbols.sort()
    for sym in closeable_symbols:
      svn_commit = SVNCommit("closing tag/branch '%s'" % sym)
      svn_commit.set_symbolic_name(sym)
      svn_commit.set_date(self.latest_primary_svn_commit.get_date())
      svn_commit.flush()
      self.done_symbols.append(sym)
      del self.pending_symbols[sym]
2792
2793
class SymbolingsReader:
  """Interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  This class does the heavy lifting of finding and
  returning the right opening and closing Subversion revision numbers
  for a given symbolic name."""
  def __init__(self):
    """Open SYMBOL_OPENINGS_CLOSINGS_SORTED for reading and load the
    offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets database is really small and is read and written a
    # fair bit, so keep a copy of it in a plain dictionary.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    offsets = { }
    for key in offsets_db.db.keys():
      offsets[key] = offsets_db[key]
    self.offsets = offsets

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Return a new SymbolicNameFillingGuide for SYMBOLIC_NAME, built
    from the openings and closings recorded for it up to and including
    SVN_REVNUM.

    If an opening rev is met in this fill whose closing rev comes
    later than SVN_REVNUM, that closing is not passed to the
    SymbolicNameFillingGuide in this fill (and it is discarded when it
    turns up in a later fill).  That is perfectly fine: a valid fill
    is still possible without the closing, and we always fill what we
    can as soon as we can."""

    openings_closings_map = OpeningsClosingsMap(symbolic_name)

    # A branch may start with a file that was added on a branch, in
    # which case no offset is recorded for the symbol and there is
    # nothing to read.
    if symbolic_name not in self.offsets:
      return SymbolicNameFillingGuide(openings_closings_map)

    # Position the read marker of self.symbolings at the offset
    # recorded for symbolic_name.
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      # Remember where the line about to be read starts.
      fpos = self.symbolings.tell()
      line = self.symbolings.readline().rstrip()
      if not line:
        break
      fields = line.split(" ", 4)
      name, revnum, kind, branch_name, cvs_path = fields
      if branch_name == '*':
        svn_path = Ctx().project.make_trunk_path(cvs_path)
      else:
        svn_path = Ctx().project.make_branch_path(branch_name, cvs_path)
      revnum = int(revnum)
      if revnum > svn_revnum or name != symbolic_name:
        break
      openings_closings_map.register(svn_path, revnum, kind)

    # If anything that was read got used, record the start of the last
    # line read as the new offset for symbolic_name.
    if not openings_closings_map.is_empty():
      self.offsets[symbolic_name] = fpos

    return SymbolicNameFillingGuide(openings_closings_map)
2854
2855
class SvnRevisionRange:
  """A range of subversion revision numbers from which a path can be
  copied.  self.opening_revnum is the number of the earliest such
  revision and self.closing_revnum is one more than the number of the
  last one.  self.closing_revnum stays None as long as no closing has
  been registered."""

  def __init__(self, opening_revnum):
    """Start a range that opens at OPENING_REVNUM and has no closing
    yet."""
    self.closing_revnum = None
    self.opening_revnum = opening_revnum

  def add_closing(self, closing_revnum):
    """Record CLOSING_REVNUM as the closing of this range unless a
    closing has already been recorded."""
    # With a non-trunk default branch there may be several closings;
    # only the first one encountered counts.
    if self.closing_revnum is not None:
      return
    self.closing_revnum = closing_revnum

  def __str__(self):
    """Return '[OPENING:CLOSING]', or '[OPENING:]' when there is no
    closing."""
    if self.closing_revnum is not None:
      return '[%d:%d]' % (self.opening_revnum, self.closing_revnum)
    return '[%d:]' % (self.opening_revnum,)
2878
2879
class OpeningsClosingsMap:
  """Accumulates the openings and closings of one symbolic name for
  the current SVNCommit.

  The user calls self.register() for each opening and closing; the
  accumulated ranges are then read back with self.get_things() (this
  is what SymbolicNameFillingGuide does when it is constructed from
  an OpeningsClosingsMap)."""

  def __init__(self, symbolic_name):
    """Create an empty map for SYMBOLIC_NAME, ready to receive
    openings and closings."""

    # Maps each svn_path to its SvnRevisionRange.
    self.things = { }

    self.name = symbolic_name

  def register(self, svn_path, svn_revnum, type):
    """Record an opening or a closing revision for this symbolic name.

    SVN_PATH is the source path that needs to be copied into
    self.name.  SVN_REVNUM is either the first svn revision number
    that can be copied from (an opening), or the last (not inclusive)
    svn revision number that can be copied from (a closing).  TYPE
    (OPENING or CLOSING) says which of the two is being registered;
    any other value is ignored.

    An opening always starts a fresh range for SVN_PATH.  A closing
    only has an effect if the opening for the same SVN_PATH was
    registered before it; otherwise it is discarded.

    An opening does not need to be followed by a closing."""
    if type == OPENING:
      self.things[svn_path] = SvnRevisionRange(svn_revnum)
      return

    if type != CLOSING:
      return

    # Closings without a previously registered opening are dropped.
    if svn_path in self.things:
      self.things[svn_path].add_closing(svn_revnum)

  def is_empty(self):
    """Return true if nothing has been accumulated yet, false
    otherwise."""
    return len(self.things) == 0

  def get_things(self):
    """Return the (svn_path, SvnRevisionRange) pairs of all svn_paths
    that have a registered opening."""

    return self.things.items()
2931
2932
class SymbolicNameFillingGuide:
  """A node tree representing the source paths to be copied to fill
  self.name in the current SVNCommit.

  self._node_tree is the root of the directory tree, in the form {
  path_component : subnode }.  Leaf nodes are instances of
  SvnRevisionRange.  Intermediate (directory) nodes are dictionaries
  mapping relative names to subnodes.

  By walking self._node_tree and calling self.get_best_revnum() on
  each node, the caller can determine what subversion revision number
  to copy the path corresponding to that node from.  self._node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""

  def __init__(self, openings_closings_map):
    """Initialize a SymbolicNameFillingGuide for the symbolic name of
    OPENINGS_CLOSINGS_MAP (its 'name' attribute) and store into it the
    openings and closings returned by its get_things() method."""

    self.name = openings_closings_map.name

    # The dictionary that holds our node tree as a map { node_key :
    # node }.
    self._node_tree = { }

    for svn_path, svn_revision_range in openings_closings_map.get_things():
      (head, tail) = _path_split(svn_path)
      self._get_node_for_path(head)[tail] = svn_revision_range

    # For debugging, the finished tree can be dumped with:
    #   self.print_node_tree(self._node_tree)

  def _get_node_for_path(self, svn_path):
    """Return the (dictionary) node for the directory SVN_PATH,
    creating it and any missing intermediate nodes as needed."""
    # Walk down the path, one node at a time.
    node = self._node_tree
    for component in svn_path.split('/'):
      if component not in node:
        node[component] = {}
      node = node[component]

    return node

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE.

    Return a tuple (REVNUM, SCORE), where REVNUM is the chosen
    subversion revision number and SCORE is the number of
    SvnRevisionRanges at or under NODE that include REVNUM.

    PREFERRED_REVNUM is passed to best_rev and used to calculate the
    best revnum; it may be None if there is no preference.

    Raise FatalError if no revision can be found."""

    def score_revisions(svn_revision_ranges):
      """Return a list of revisions and scores based on
      SVN_REVISION_RANGES.  The returned list looks like:

         [(REV1 SCORE1), (REV2 SCORE2), ...]

      where the tuples are sorted by revision number.
      SVN_REVISION_RANGES is a list of SvnRevisionRange objects.

      For each svn revision that appears as either an opening_revnum
      or closing_revnum for one of the svn_revision_ranges, output a
      tuple indicating how many of the SvnRevisionRanges include that
      svn_revision in its range.  A score thus indicates that copying
      the corresponding revision (or any following revision up to the
      next revision in the list) of the object in question would yield
      that many correct paths at or underneath the object.  There may
      be other paths underneath it which are not correct and would
      need to be deleted or recopied; those can only be detected by
      descending and examining their scores.

      If SVN_REVISION_RANGES is empty, return the empty list."""
      openings = [ x.opening_revnum
                   for x in svn_revision_ranges ]
      closings = [ x.closing_revnum
                   for x in svn_revision_ranges
                   if x.closing_revnum is not None ]

      # First look for easy out.
      if not openings:
        return []

      # Create a list with both openings (which increment the total)
      # and closings (which decrement the total):
      things = [(rev,1) for rev in openings] + [(rev,-1) for rev in closings]
      # Sort by revision number:
      things.sort()
      # Initialize output list with zeroth element of things.  This
      # element must exist, because it was already verified that
      # openings is not empty.
      scores = [ things[0] ]
      total = scores[-1][1]
      for (rev, change) in things[1:]:
        total += change
        if rev == scores[-1][0]:
          # Same revision as last entry; modify last entry:
          scores[-1] = (rev, total)
        else:
          # Previously-unseen revision; create new entry:
          scores.append((rev, total))
      return scores

    def best_rev(scores, preferred_rev):
      """Return a tuple (REV, MAX_SCORE) holding the revision with the
      highest score from SCORES, a list returned by score_revisions(),
      together with that score.  When the maximum score is shared by
      multiple revisions, the oldest revision is selected, unless
      PREFERRED_REV is one of the possibilities, in which case, it is
      selected.  REV is SVN_INVALID_REVNUM if SCORES is empty."""
      max_score = 0
      preferred_rev_score = -1
      rev = SVN_INVALID_REVNUM
      if preferred_rev is None:
        # Comparison order of different types is arbitrary.  Do not
        # expect None to compare less than int values below.
        preferred_rev = SVN_INVALID_REVNUM
      for revnum, count in scores:
        if count > max_score:
          max_score = count
          rev = revnum
        # SCORES is sorted by revision, so after the loop this holds
        # the score of the last entry at or before PREFERRED_REV,
        # i.e. the score that applies to PREFERRED_REV itself.
        if revnum <= preferred_rev:
          preferred_rev_score = count
      if preferred_rev_score == max_score:
        rev = preferred_rev
      return rev, max_score

    # Aggregate openings and closings from the rev tree
    svn_revision_ranges = self._list_revnums(node)

    # Score the lists
    scores = score_revisions(svn_revision_ranges)

    revnum, max_score = best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      # Report the symbolic name of this guide (self.name); there is
      # no local called 'name' in this scope.
      raise FatalError("failed to find a revision "
                       + "to copy from when copying %s" % self.name)
    return revnum, max_score

  def _list_revnums(self, node):
    """Return a list of all the SvnRevisionRanges (including
    duplicates) for all leaf nodes at and under NODE."""

    if isinstance(node, SvnRevisionRange):
      # It is a leaf node.
      return [ node ]

    # It is an intermediate node.
    revnums = []
    for subnode in node.values():
      revnums.extend(self._list_revnums(subnode))
    return revnums

  def get_sources(self):
    """Return the list of sources (FillSource instances) for this
    symbolic name.

    The Project instance defines what are legitimate sources.  Raise
    FatalError if a change occurred outside of the source
    directories."""

    return self._get_sub_sources('', self._node_tree)

  def _get_sub_sources(self, start_svn_path, start_node):
    """Return the list of sources for this symbolic name, starting the
    search at path START_SVN_PATH, which is node START_NODE.  This is
    a helper method, called by get_sources() (see)."""

    if isinstance(start_node, SvnRevisionRange):
      # This implies that a change was found outside of the
      # legitimate sources.  This should never happen, so fail with an
      # explicit message rather than with a bare 'raise', which has no
      # active exception to re-raise here.
      raise FatalError("a change to '%s' was found outside of the "
                       "legitimate sources when filling %s"
                       % (start_svn_path, self.name))

    if Ctx().project.is_source(start_svn_path):
      # This is a legitimate source.  Add it to list.
      return [ FillSource(start_svn_path, start_node) ]

    # This is a directory that is not a legitimate source.  (That's
    # OK because it hasn't changed directly.)  But directories
    # within it have been changed, so we need to search recursively
    # to find their enclosing sources.
    sources = []
    for entry, node in start_node.items():
      svn_path = _path_join(start_svn_path, entry)
      sources.extend(self._get_sub_sources(svn_path, node))

    return sources

  def print_node_tree(self, node, name='/', indent_depth=0):
    """For debugging purposes.  Prints all nodes in TREE that are
    rooted at NODE.  NAME is the label printed for NODE itself, and
    INDENT_DEPTH is used to indent the output of recursive calls."""
    # Written through sys.stdout.write so that the text is exactly
    # what 'print' with space-separated items would have produced.
    write = sys.stdout.write
    if not indent_depth:
      write("TREE %s\n" % ("=" * 75))
    padding = " " * (indent_depth * 2)
    if isinstance(node, SvnRevisionRange):
      write("TREE: %s %s %s\n" % (padding, name, node))
    else:
      write("TREE: %s %s\n" % (padding, name))
      for key, value in node.items():
        self.print_node_tree(value, key, (indent_depth + 1))
3137
3138
class FillSource:
  """A candidate source for the symbol filler in SVNRepositoryMirror:
  a source PREFIX together with its NODE, plus the score and revision
  number assigned to it later through set_score()."""
  def __init__(self, prefix, node):
    """Create a fill source from PREFIX and NODE that has neither a
    score nor a revnum yet."""
    self.score = None
    self.revnum = None
    self.prefix = prefix
    self.node = node

  def set_score(self, score, revnum):
    """Store SCORE and REVNUM on this source."""
    self.revnum = revnum
    self.score = score

  def __cmp__(self, other):
    """Order FillSources by descending score.  Both sides must have
    been scored already; otherwise TypeError is raised."""
    if self.score is not None and other.score is not None:
      # Operands are swapped on purpose to get descending order.
      return cmp(other.score, self.score)
    raise TypeError('Tried to compare unscored FillSource')
3160
3161
3162 class SVNRepositoryMirror:
3163   """Mirror a Subversion Repository as it is constructed, one
3164   SVNCommit at a time.  The mirror is skeletal; it does not contain
3165   file contents.  The creation of a dumpfile or Subversion repository
3166   is handled by delegates.  See self.add_delegate method for how to
3167   set delegates.
3168
3169   The structure of the repository is kept in two databases and one
3170   hash.  The revs_db database maps revisions to root node keys, and
3171   the nodes_db database maps node keys to nodes.  A node is a hash
3172   from directory names to keys.  Both the revs_db and the nodes_db are
3173   stored on disk and each access is expensive.
3174
  The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
  hash which is cheap to access.
3178
3179   You must invoke _start_commit between SVNCommits.
3180
3181   *** WARNING *** All path arguments to methods in this class CANNOT
3182       have leading or trailing slashes.
3183   """
3184
3185   class SVNRepositoryMirrorPathExistsError(Exception):
3186     """Exception raised if an attempt is made to add a path to the
3187     repository mirror and that path already exists in the youngest
3188     revision of the repository."""
3189     pass
3190
3191   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
3192     """Exception raised if a CVSRevision is found to have an unexpected
3193     operation (OP) value."""
3194     pass
3195
3196   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
3197     """Exception raised if an empty SymbolicNameFillingGuide is returned
3198     during a fill where the branch in question already exists."""
3199     pass
3200
3201   def __init__(self):
3202     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
3203     self.delegates = [ ]
3204
3205     # This corresponds to the 'revisions' table in a Subversion fs.
3206     self.revs_db = SDatabase(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
3207     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
3208
3209     # This corresponds to the 'nodes' table in a Subversion fs.  (We
3210     # don't need a 'representations' or 'strings' table because we
3211     # only track metadata, not file contents.)
3212     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
3213     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
3214
3215     # Start at revision 0 without a root node.  It will be created
3216     # by _open_writable_root_node.
3217     self.youngest = 0
3218     self.new_root_key = None
3219     self.new_nodes = { }
3220
3221     if not Ctx().trunk_only:
3222       ###PERF IMPT: Suck this into memory.
3223       self.tags_db = TagsDatabase(DB_OPEN_READ)
3224       self.symbolings_reader = SymbolingsReader()
3225
3226   def _initialize_repository(self, date):
3227     """Initialize the repository by creating the directories for
3228     trunk, tags, and branches.  This method should only be called
3229     after all delegates are added to the repository mirror."""
3230     # Make a 'fake' SVNCommit so we can take advantage of the revprops
3231     # magic therein
3232     svn_commit = SVNCommit("Initialization", 1)
3233     svn_commit.set_date(date)
3234     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
3235
3236     self._start_commit(svn_commit)
3237     self._mkdir(Ctx().project.trunk_path)
3238     if not Ctx().trunk_only:
3239       self._mkdir(Ctx().project.branches_path)
3240       self._mkdir(Ctx().project.tags_path)
3241
3242   def _start_commit(self, svn_commit):
3243     """Start a new commit."""
3244     if self.youngest > 0:
3245       self._end_commit()
3246
3247     self.youngest = svn_commit.revnum
3248     self.new_root_key = None
3249     self.new_nodes = { }
3250
3251     self._invoke_delegates('start_commit', svn_commit)
3252
3253   def _end_commit(self):
3254     """Called at the end of each commit.  This method copies the newly
3255     created nodes to the on-disk nodes db."""
3256     if self.new_root_key is None:
3257       # No changes were made in this revision, so we make the root node
3258       # of the new revision be the same as the last one.
3259       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
3260     else:
3261       self.revs_db[str(self.youngest)] = self.new_root_key
3262       # Copy the new nodes to the nodes_db
3263       for key, value in self.new_nodes.items():
3264         self.nodes_db[key] = value
3265
3266   def _get_node(self, key):
3267     """Returns the node contents for KEY which may refer to either
3268     self.nodes_db or self.new_nodes."""
3269     if self.new_nodes.has_key(key):
3270       return self.new_nodes[key]
3271     else:
3272       return self.nodes_db[key]
3273
3274   def _open_readonly_node(self, path, revnum):
3275     """Open a readonly node for PATH at revision REVNUM.  Returns the
3276     node key and node contents if the path exists, else (None, None)."""
3277     # Get the root key
3278     if revnum == self.youngest:
3279       if self.new_root_key is None:
3280         node_key = self.revs_db[str(self.youngest - 1)]
3281       else:
3282         node_key = self.new_root_key
3283     else:
3284       node_key = self.revs_db[str(revnum)]
3285
3286     for component in path.split('/'):
3287       node_contents = self._get_node(node_key)
3288       node_key = node_contents.get(component, None)
3289       if node_key is None:
3290         return None
3291
3292     return node_key
3293
3294   def _open_writable_root_node(self):
3295     """Open a writable root node.  The current root node is returned
3296     immeditely if it is already writable.  If not, create a new one by
3297     copying the contents of the root node of the previous version."""
3298     if self.new_root_key is not None:
3299       return self.new_root_key, self.new_nodes[self.new_root_key]
3300
3301     if self.youngest < 2:
3302       new_contents = { }
3303     else:
3304       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
3305     self.new_root_key = gen_key()
3306     self.new_nodes = { self.new_root_key: new_contents }
3307
3308     return self.new_root_key, new_contents
3309
3310   def _open_writable_node(self, svn_path, create):
3311     """Open a writable node for the path SVN_PATH, creating SVN_PATH
3312     and any missing directories if CREATE is True."""
3313     parent_key, parent_contents = self._open_writable_root_node()
3314
3315     # Walk up the path, one node at a time.
3316     path_so_far = None
3317     components = svn_path.split('/')
3318     for i in range(len(components)):
3319       component = components[i]
3320       path_so_far = _path_join(path_so_far, component)
3321       this_key = parent_contents.get(component, None)
3322       if this_key is not None:
3323         # The component exists.
3324         this_contents = self.new_nodes.get(this_key, None)
3325         if this_contents is None:
3326           # Suck the node from the nodes_db, but update the key
3327           this_contents = self.nodes_db[this_key]
3328           this_key = gen_key()
3329           self.new_nodes[this_key] = this_contents
3330           parent_contents[component] = this_key
3331       elif create:
3332         # The component does not exists, so we create it.
3333         this_contents = { }
3334         this_key = gen_key()
3335         self.new_nodes[this_key] = this_contents
3336         parent_contents[component] = this_key
3337         if i < len(components) - 1:
3338           self._invoke_delegates('mkdir', path_so_far)
3339       else:
3340         # The component does not exists and we are not instructed to
3341         # create it, so we give up.
3342         return None, None
3343
3344       parent_key = this_key
3345       parent_contents = this_contents
3346
3347     return this_key, this_contents
3348
3349   def _path_exists(self, path):
3350     """If PATH exists in self.youngest of the svn repository mirror,
3351     return true, else return None.
3352
3353     PATH must not start with '/'."""
3354     return self._open_readonly_node(path, self.youngest) is not None
3355
3356   def _fast_delete_path(self, parent_path, parent_contents, component):
3357     """Delete COMPONENT from the parent direcory PARENT_PATH with the
3358     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
3359     in PARENT_CONTENTS."""
3360     if parent_contents.has_key(component):
3361       del parent_contents[component]
3362       self._invoke_delegates('delete_path',
3363                              _path_join(parent_path, component))
3364
3365   def _delete_path(self, svn_path, should_prune=False):
3366     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
3367     all ancestor directories that are made empty when SVN_PATH is deleted.
3368     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
3369
3370     NOTE: This function ignores requests to delete the root directory
3371     or any directory for which Ctx().project.is_unremovable() returns
3372     True, either directly or by pruning."""
3373
3374     if svn_path == '' or Ctx().project.is_unremovable(svn_path):
3375       return
3376
3377     (parent_path, entry,) = _path_split(svn_path)
3378     if parent_path:
3379       parent_key, parent_contents = \
3380           self._open_writable_node(parent_path, False)
3381     else:
3382       parent_key, parent_contents = self._open_writable_root_node()
3383
3384     if parent_key is not None:
3385       self._fast_delete_path(parent_path, parent_contents, entry)
3386       # The following recursion makes pruning an O(n^2) operation in the
3387       # worst case (where n is the depth of SVN_PATH), but the worst case
3388       # is probably rare, and the constant cost is pretty low.  Another
3389       # drawback is that we issue a delete for each path and not just
3390       # a single delete for the topmost directory pruned.
3391       if should_prune and len(parent_contents) == 0:
3392         self._delete_path(parent_path, True)
3393
3394   def _mkdir(self, path):
3395     """Create PATH in the repository mirror at the youngest revision."""
3396     self._open_writable_node(path, True)
3397     self._invoke_delegates('mkdir', path)
3398
3399   def _change_path(self, cvs_rev):
3400     """Register a change in self.youngest for the CVS_REV's svn_path
3401     in the repository mirror."""
3402     # We do not have to update the nodes because our mirror is only
3403     # concerned with the presence or absence of paths, and a file
3404     # content change does not cause any path changes.
3405     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, False))
3406
3407   def _add_path(self, cvs_rev):
3408     """Add the CVS_REV's svn_path to the repository mirror."""
3409     self._open_writable_node(cvs_rev.svn_path, True)
3410     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, True))
3411
3412   def _copy_path(self, src_path, dest_path, src_revnum):
3413     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
3414     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
3415     parent *must* exist, but DEST_PATH *cannot* exist.
3416
3417     Return the node key and the contents of the new node at DEST_PATH
3418     as a dictionary."""
3419     # get the contents of the node of our src_path
3420     src_key = self._open_readonly_node(src_path, src_revnum)
3421     src_contents = self._get_node(src_key)
3422
3423     # Get the parent path and the base path of the dest_path
3424     (dest_parent, dest_basename,) = _path_split(dest_path)
3425     dest_parent_key, dest_parent_contents = \
3426                    self._open_writable_node(dest_parent, False)
3427
3428     if dest_parent_contents.has_key(dest_basename):
3429       msg = "Attempt to add path '%s' to repository mirror " % dest_path
3430       msg = msg + "when it already exists in the mirror."
3431       raise self.SVNRepositoryMirrorPathExistsError, msg
3432
3433     dest_parent_contents[dest_basename] = src_key
3434     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
3435
3436     # Yes sir, src_key and src_contents are also the contents of the
3437     # destination.  This is a cheap copy, remember!  :-)
3438     return src_key, src_contents
3439
3440   def _fill_symbolic_name(self, svn_commit):
3441     """Performs all copies necessary to create as much of the the tag
3442     or branch SVN_COMMIT.symbolic_name as possible given the current
3443     revision of the repository mirror.
3444
3445     The symbolic name is guaranteed to exist in the Subversion
3446     repository by the end of this call, even if there are no paths
3447     under it."""
3448     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3449         svn_commit.symbolic_name, self.youngest)
3450     # Get the list of sources for the symbolic name.
3451     sources = symbol_fill.get_sources()
3452
3453     if sources:
3454       if self.tags_db.has_key(svn_commit.symbolic_name):
3455         dest_prefix = Ctx().project.get_tag_path(svn_commit.symbolic_name)
3456       else:
3457         dest_prefix = Ctx().project.get_branch_path(svn_commit.symbolic_name)
3458
3459       dest_key = self._open_writable_node(dest_prefix, False)[0]
3460       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3461     else:
3462       # We can only get here for a branch whose first commit is an add
3463       # (as opposed to a copy).
3464       dest_path = Ctx().project.get_branch_path(symbol_fill.name)
3465       if not self._path_exists(dest_path):
3466         # If our symbol_fill was empty, that means that our first
3467         # commit on the branch was to a file added on the branch, and
3468         # that this is our first fill of that branch.
3469         #
3470         # This case is covered by test 16.
3471         #
3472         # ...we create the branch by copying trunk from the our
3473         # current revision number minus 1
3474         source_path = Ctx().project.trunk_path
3475         entries = self._copy_path(source_path, dest_path,
3476                                   svn_commit.revnum - 1)[1]
3477         # Now since we've just copied trunk to a branch that's
3478         # *supposed* to be empty, we delete any entries in the
3479         # copied directory.
3480         for entry in entries:
3481           del_path = dest_path + '/' + entry
3482           # Delete but don't prune.
3483           self._delete_path(del_path)
3484       else:
3485         msg = "Error filling branch '" \
3486               + _clean_symbolic_name(symbol_fill.name) + "'.\n"
3487         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3488         msg = msg + "attempted to create a branch that already exists."
3489         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3490
3491   def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
3492             path = None, parent_source_prefix = None,
3493             preferred_revnum = None, prune_ok = None):
3494     """Fill the tag or branch at DEST_PREFIX + PATH with items from
3495     SOURCES, and recurse into the child items.
3496
3497     DEST_PREFIX is the prefix of the destination directory, e.g.
3498     '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
3499     FillSource classes that are candidates to be copied to the
3500     destination.  DEST_KEY is the key in self.nodes_db to the
3501     destination, or None if the destination does not yet exist.
3502
3503     PATH is the path relative to DEST_PREFIX.  If PATH is None, we
3504     are at the top level, e.g. '/tags/my_tag'.
3505
3506     PARENT_SOURCE_PREFIX is the source prefix that was used to copy
3507     the parent directory, and PREFERRED_REVNUM is an int which is the
3508     source revision number that the caller (who may have copied KEY's
3509     parent) used to perform its copy.  If PREFERRED_REVNUM is None,
3510     then no revision is preferable to any other (which probably means
3511     that no copies have happened yet).
3512
3513     PRUNE_OK means that a copy has been made in this recursion, and
3514     it's safe to prune directories that are not in
3515     SYMBOL_FILL._node_tree, provided that said directory has a source
3516     prefix of one of the PARENT_SOURCE_PREFIX.
3517
3518     PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
3519     should only be passed in by recursive calls."""
3520     # Calculate scores and revnums for all sources
3521     for source in sources:
3522       src_revnum, score = symbol_fill.get_best_revnum(source.node,
3523                                                       preferred_revnum)
3524       source.set_score(score, src_revnum)
3525
3526     # Sort the sources in descending score order so that we will make
3527     # a eventual copy from the source with the highest score.
3528     sources.sort()
3529     copy_source = sources[0]
3530
3531     src_path = _path_join(copy_source.prefix, path)
3532     dest_path = _path_join(dest_prefix, path)
3533
3534     # Figure out if we shall copy to this destination and delete any
3535     # destination path that is in the way.
3536     do_copy = 0
3537     if dest_key is None:
3538       do_copy = 1
3539     elif prune_ok and (parent_source_prefix != copy_source.prefix or
3540                        copy_source.revnum != preferred_revnum):
3541       # We are about to replace the destination, so we need to remove
3542       # it before we perform the copy.
3543       self._delete_path(dest_path)
3544       do_copy = 1
3545
3546     if do_copy:
3547       dest_key, dest_entries = self._copy_path(src_path, dest_path,
3548                                                copy_source.revnum)
3549       prune_ok = 1
3550     else:
3551       dest_entries = self._get_node(dest_key)
3552
3553     # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
3554     # elements and the values are lists of FillSource classes where
3555     # this path element exists.
3556     src_entries = {}
3557     for source in sources:
3558       if isinstance(source.node, SvnRevisionRange):
3559         continue
3560       for entry, node in source.node.items():
3561         src_entries.setdefault(entry, []).append(
3562             FillSource(source.prefix, node))
3563
3564     if prune_ok:
3565       # Delete the entries in DEST_ENTRIES that are not in src_entries.
3566       delete_list = [ ]
3567       for entry in dest_entries:
3568         if not src_entries.has_key(entry):
3569           delete_list.append(entry)
3570       if delete_list:
3571         if not self.new_nodes.has_key(dest_key):
3572           dest_key, dest_entries = self._open_writable_node(dest_path, True)
3573         # Sort the delete list to get "diffable" dumpfiles.
3574         delete_list.sort()
3575         for entry in delete_list:
3576           self._fast_delete_path(dest_path, dest_entries, entry)
3577
3578     # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
3579     src_keys = src_entries.keys()
3580     src_keys.sort()
3581     for src_key in src_keys:
3582       next_dest_key = dest_entries.get(src_key, None)
3583       self._fill(symbol_fill, dest_prefix, next_dest_key,
3584                  src_entries[src_key], _path_join(path, src_key),
3585                  copy_source.prefix, sources[0].revnum, prune_ok)
3586
3587   def _synchronize_default_branch(self, svn_commit):
3588     """Propagate any changes that happened on a non-trunk default
3589     branch to the trunk of the repository.  See
3590     CVSCommit._post_commit() for details on why this is necessary."""
3591     for cvs_rev in svn_commit.cvs_revs:
3592       svn_trunk_path = Ctx().project.make_trunk_path(cvs_rev.cvs_path)
3593       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3594         if self._path_exists(svn_trunk_path):
3595           # Delete the path on trunk...
3596           self._delete_path(svn_trunk_path)
3597         # ...and copy over from branch
3598         self._copy_path(cvs_rev.svn_path, svn_trunk_path,
3599                         svn_commit.motivating_revnum)
3600       elif cvs_rev.op == OP_DELETE:
3601         # delete trunk path
3602         self._delete_path(svn_trunk_path)
3603       else:
3604         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3605                % cvs_rev.op)
3606         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3607
3608   def commit(self, svn_commit):
3609     """Add an SVNCommit to the SVNRepository, incrementing the
3610     Repository revision number, and changing the repository.  Invoke
3611     the delegates' _start_commit() method."""
3612
3613     if svn_commit.revnum == 2:
3614       self._initialize_repository(svn_commit.get_date())
3615
3616     self._start_commit(svn_commit)
3617
3618     if svn_commit.symbolic_name:
3619       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3620                   _clean_symbolic_name(svn_commit.symbolic_name))
3621       self._fill_symbolic_name(svn_commit)
3622     elif svn_commit.motivating_revnum:
3623       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3624                   % svn_commit.motivating_revnum)
3625       self._synchronize_default_branch(svn_commit)
3626     else: # This actually commits CVSRevisions
3627       if len(svn_commit.cvs_revs) > 1: plural = "s"
3628       else: plural = ""
3629       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3630                   % (len(svn_commit.cvs_revs), plural))
3631       for cvs_rev in svn_commit.cvs_revs:
3632         # See comment in CVSCommit._commit() for what this is all
3633         # about.  Note that although asking self._path_exists() is
3634         # somewhat expensive, we only do it if the first two (cheap)
3635         # tests succeed first.
3636         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3637                 and (cvs_rev.rev == "1.1.1.1")
3638                 and self._path_exists(cvs_rev.svn_path)):
3639           if cvs_rev.op == OP_ADD:
3640             self._add_path(cvs_rev)
3641           elif cvs_rev.op == OP_CHANGE:
3642             # Fix for Issue #74:
3643             #
3644             # Here's the scenario.  You have file FOO that is imported
3645             # on a non-trunk vendor branch.  So in r1.1 and r1.1.1.1,
3646             # the file exists.
3647             #
3648             # Moving forward in time, FOO is deleted on the default
3649             # branch (r1.1.1.2).  cvs2svn determines that this delete
3650             # also needs to happen on trunk, so FOO is deleted on
3651             # trunk.
3652             #
3653             # Along come r1.2, whose op is OP_CHANGE (because r1.1 is
3654             # not 'dead', we assume it's a change).  However, since
3655             # our trunk file has been deleted, svnadmin blows up--you
3656             # can't change a file that doesn't exist!
3657             #
3658             # Soooo... we just check the path, and if it doesn't
3659             # exist, we do an add... if the path does exist, it's
3660             # business as usual.
3661             if not self._path_exists(cvs_rev.svn_path):
3662               self._add_path(cvs_rev)
3663             else:
3664               self._change_path(cvs_rev)
3665
3666         if cvs_rev.op == OP_DELETE:
3667           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3668
3669   def cleanup(self):
3670     """Callback for the Cleanup.register in self.__init__."""
3671     self.revs_db = None
3672     self.nodes_db = None
3673
3674   def add_delegate(self, delegate):
3675     """Adds DELEGATE to self.delegates.
3676
3677     For every delegate you add, as soon as SVNRepositoryMirror
3678     performs a repository action method, SVNRepositoryMirror will call
3679     the delegate's corresponding repository action method.  Multiple
3680     delegates will be called in the order that they are added.  See
3681     SVNRepositoryMirrorDelegate for more information."""
3682     self.delegates.append(delegate)
3683
3684   def _invoke_delegates(self, method, *args):
3685     """Iterate through each of our delegates, in the order that they
3686     were added, and call the delegate's method named METHOD with the
3687     arguments in ARGS."""
3688     for delegate in self.delegates:
3689       getattr(delegate, method)(*args)
3690
3691   def finish(self):
3692     """Calls the delegate finish method."""
3693     self._end_commit()
3694     self._invoke_delegates('finish')
3695     self.cleanup()
3696
3697
class SVNCommitItem:
  """Pairs a CVSRevision with the Subversion-specific data (such as
  properties) that goes with it."""

  def __init__(self, c_rev, svn_props_changed):
    """Wrap C_REV and work out its Subversion properties.

    SVN_PROPS_CHANGED says whether the svn: properties are known to
    have changed since the last revision.

    Each SVNPropertySetter in Ctx().svn_property_setters is given the
    new item so that it can fill in self.svn_props; two of the
    resulting properties are then read back to set
    self.needs_eol_filter and self.has_keywords."""

    self.c_rev = c_rev

    # True if the properties differ from the previous revision's,
    # i.e. if they have to be written to the dumpfile.
    self.svn_props_changed = svn_props_changed

    # Map { property name : value }.  A value of None means that the
    # property should not be set.
    self.svn_props = { }

    for setter in Ctx().svn_property_setters:
      setter.set_properties(self)

    # EOLs need filtering exactly when an svn:eol-style value is set.
    # (self.svn_props could be consulted directly instead, since it
    # is initialized for each revision.)
    eol_style = self.svn_props.get('svn:eol-style')
    self.needs_eol_filter = eol_style is not None

    keywords = self.svn_props.get('svn:keywords')
    self.has_keywords = keywords is not None
3729
3730
class SVNPropertySetter:
  """Interface for objects that know how to set properties on an
  SVNCommitItem.  This base class is abstract."""

  def set_properties(self, s_item):
    """Set whatever properties this setter can determine for S_ITEM.

    Subclasses must override this; the base implementation raises
    NotImplementedError."""

    raise NotImplementedError()
3738
3739
class SVNRepositoryMirrorDelegate:
  """Abstract base class for the delegates of SVNRepositoryMirror.
  A subclass has to implement every method listed here.

  Each method stands for one Subversion operation, and each subclass
  carries that operation out in its own way.  Taking add_path as the
  example: DumpfileDelegate writes a "Node-action: add" record to a
  Subversion dumpfile, StdoutDelegate just reports that the path is
  being added, and RepositoryDelegate causes the path to be added to
  the Subversion repository it is building.
  """

  def start_commit(self, svn_commit):
    """Begin the SVNCommit SVN_COMMIT; what that involves is up to
    the subclass."""
    raise NotImplementedError()

  def mkdir(self, path):
    """Create the directory PATH (a string); details are up to the
    subclass."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """Add the file described by S_ITEM (an SVNCommitItem); details
    are up to the subclass."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """Change the file described by S_ITEM (an SVNCommitItem);
    details are up to the subclass."""
    raise NotImplementedError()

  def delete_path(self, path):
    """Delete PATH (a string); details are up to the subclass."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """Copy SRC_PATH as of SRC_REVNUM to DEST_PATH.  Both paths are
    strings and SRC_REVNUM is a Subversion revision number (int);
    details are up to the subclass."""
    raise NotImplementedError()

  def finish(self):
    """Do whatever cleanup is needed once all revisions have been
    committed."""
    raise NotImplementedError()
3787
3788
class DumpfileDelegate(SVNRepositoryMirrorDelegate):
  """Delegate that records every repository action as a record in a
  Subversion dumpfile."""

  def __init__(self, dumpfile_path=None):
    """Create the dumpfile and write its header.

    DUMPFILE_PATH is the file to write; if it is None (or empty),
    Ctx().dumpfile is used instead."""
    self.dumpfile_path = dumpfile_path or Ctx().dumpfile
    self.dumpfile = open(self.dumpfile_path, 'wb')
    self._write_dumpfile_header(self.dumpfile)

  def _write_dumpfile_header(self, dumpfile):
    """Write the standard dumpfile preamble to the file object
    DUMPFILE.

    No UUID is specified in the dumpfile: the CVS repository doesn't
    have one, and the Subversion repository will be created with one
    anyway."""
    dumpfile.write('SVN-fs-dump-format-version: 2\n\n')

  def _utf8_path(self, path):
    """Return a copy of PATH encoded in UTF-8.

    Raise FatalError if any component of PATH cannot be converted."""
    converted = []
    # The components are converted one at a time because each of them
    # may use a different encoding.
    for component in path.split('/'):
      try:
        # Unlike log messages, which may be converted with the
        # 'replace' strategy, paths cannot tolerate any lossiness.
        converted.append(to_utf8(component, 'strict'))
      except UnicodeError:
        raise FatalError(
            "Unable to convert a path '%s' to internal encoding.\n"
            "Consider rerunning with (for example) '--encoding=latin1'."
            % (path,))
    return '/'.join(converted)

  def _string_for_prop(self, name, value):
    """Return the property NAME=VALUE serialized the way a dumpfile
    wants it: a length-prefixed key followed by a length-prefixed
    value."""

    fields = (len(name), name, len(value), value)
    return 'K %d\n%s\nV %d\n%s\n' % fields

  def start_commit(self, svn_commit):
    """Write the revision record that opens SVN_COMMIT (an
    SVNCommit), and remember its revision number in self.revision."""

    self.revision = svn_commit.revnum

    # A revision record typically looks like this:
    #
    #   Revision-number: 1
    #   Prop-content-length: 129
    #   Content-length: 129
    #
    #   K 7
    #   svn:log
    #   V 27
    #   Log message for revision 1.
    #   K 10
    #   svn:author
    #   V 7
    #   jrandom
    #   K 8
    #   svn:date
    #   V 27
    #   2003-04-22T22:57:58.132837Z
    #   PROPS-END
    #
    # The length headers cover the whole property block: the data
    # itself plus the 'K n' / 'V n' lines that describe it.
    #
    # Prop-content-length counts only the props and Content-length
    # counts everything; that pair of headers is the generic form for
    # any dumpfile entity.  A revision has nothing but props, so for
    # revisions the two numbers are always equal.

    # Serialize the revision properties, sorted by name, leaving out
    # the ones whose value is None.
    revprops = svn_commit.get_revprops()
    names = revprops.keys()
    names.sort()
    records = [self._string_for_prop(name, revprops[name])
               for name in names
               if revprops[name] is not None]

    props_block = ''.join(records) + 'PROPS-END\n'
    props_len = len(props_block)

    # Revision header first, then the props, then a blank line.
    header = ('Revision-number: %d\n'
              'Prop-content-length: %d\n'
              'Content-length: %d\n'
              '\n'
              % (self.revision, props_len, props_len))
    self.dumpfile.write(header)
    self.dumpfile.write(props_block)
    self.dumpfile.write('\n')

  def mkdir(self, path):
    """Write the record that creates the directory PATH."""
    node_path = self._utf8_path(path)
    self.dumpfile.write("Node-path: %s\n"
                        "Node-kind: dir\n"
                        "Node-action: add\n"
                        "\n"
                        "\n" % node_path)

  def _node_props(self, s_item):
    """Return the tuple (PROPS_HEADER, PROP_CONTENTS) for S_ITEM.

    Both strings are empty when S_ITEM's properties did not change.
    Otherwise PROP_CONTENTS is the property block, sorted by name and
    without the properties whose value is None, and PROPS_HEADER is
    the matching 'Prop-content-length:' header line."""
    if not s_item.svn_props_changed:
      return '', ''

    svn_props = s_item.svn_props
    names = svn_props.keys()
    names.sort()

    pieces = []
    for name in names:
      value = svn_props[name]
      if value is not None:
        pieces.append(self._string_for_prop(name, value))
    pieces.append('PROPS-END\n')

    contents = ''.join(pieces)
    return 'Prop-content-length: %d\n' % len(contents), contents

  def _write_svn_ignore(self, dir_path, c_rev):
    """Write a property change that sets svn:ignore on the directory
    DIR_PATH to the patterns that generate_ignores() produces for the
    .cvsignore revision C_REV."""
    patterns = '\n'.join(generate_ignores(c_rev))
    props = 'K 10\nsvn:ignore\nV %d\n%s\n' % (len(patterns), patterns)
    props = props + 'PROPS-END\n'
    props_len = len(props)

    # Headers first, then the props themselves.
    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: dir\n'
                        'Node-action: change\n'
                        'Prop-content-length: %d\n'
                        'Content-length: %d\n'
                        '\n'
                        '%s'
                        % (self._utf8_path(dir_path), props_len,
                           props_len, props))

  def _copy_contents(self, reader):
    """Copy everything READER yields into the dumpfile.  Return the
    tuple (LENGTH, CHECKSUM): the number of bytes copied and an md5
    object fed with those bytes."""
    digest = md5.new()
    size = 0
    chunk = reader.read(PIPE_READ_SIZE)
    while chunk != '':
      digest.update(chunk)
      size = size + len(chunk)
      self.dumpfile.write(chunk)
      chunk = reader.read(PIPE_READ_SIZE)
    return size, digest

  def _add_or_change_path(self, s_item, op):
    """Write the file record for S_ITEM (an SVNCommitItem).

    OP must be the constant OP_ADD or OP_CHANGE and selects the node
    action; any other value raises FatalError.  The file contents are
    read from a checkout pipe, so their length and checksum are only
    known afterwards: placeholder headers are written first and
    patched once the contents have been copied.  FatalError is also
    raised if the checkout command exits with a non-zero status."""

    # Validate OP and translate it into the dumpfile's vocabulary.
    if op == OP_ADD:
      action = 'add'
    elif op == OP_CHANGE:
      action = 'change'
    else:
      raise FatalError("_add_or_change_path() called with bad op ('%s')"
                       % (op,))

    c_rev = s_item.c_rev

    # The property handling relies on an undocumented but consistent
    # feature of the Subversion dumpfile loader: when a node record
    # doesn't mention properties at all (no "Prop-content-length:"
    # header, no property list, no "PROPS-END\n" line), the node's
    # properties are left unchanged.
    #
    # Text content behaves the same way.  A record that changes only
    # the properties of a node carries no "Text-content-length:" or
    # "Text-content-md5:" header and no text, yet the file's text is
    # not erased; it simply stays as it was.
    #
    # This suits cvs2svn well:
    #
    # For files, properties are only ever set in the first revision
    # and every later revision (branches included) inherits them, so
    # the full property set of a file never has to be remembered.
    #
    # For directories, the only property set is "svn:ignore".  It may
    # change after the first revision, but always based on the
    # contents of a ".cvsignore" file, so CVS does the remembering
    # and the previous value needn't be preserved here either.
    props_header, prop_contents = self._node_props(s_item)

    # A .cvsignore file doubles as a property of its directory.
    dir_path, basename = os.path.split(c_rev.svn_path)
    if basename == ".cvsignore":
      self._write_svn_ignore(dir_path, c_rev)

    # If the file has keywords, CVS/RCS must not expand them: they
    # have to be unexpanded in the repository, or Subversion will get
    # confused.
    pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(
        c_rev, suppress_keyword_substitution=s_item.has_keywords)

    node_header = ('Node-path: %s\n'
                   'Node-kind: file\n'
                   'Node-action: %s\n'
                   '%s'  # no property header if no props
                   'Text-content-length: '
                   % (self._utf8_path(c_rev.svn_path),
                      action, props_header))
    self.dumpfile.write(node_header)

    # Remember where the placeholders start so that they can be
    # overwritten with the real values later on.
    pos = self.dumpfile.tell()

    text_len_slot = '0000000000000000\n'
    md5_label = 'Text-content-md5: '
    md5_slot = '00000000000000000000000000000000\n'
    content_len_label = 'Content-length: '
    content_len_slot = '0000000000000000\n'
    self.dumpfile.write(text_len_slot
                        + md5_label + md5_slot
                        + content_len_label + content_len_slot
                        + '\n')

    # 16 digits + 1 newline + len('Text-content-md5: ') == 35
    md5_pos = pos + len(text_len_slot) + len(md5_label)
    # 35... + 32 bytes of checksum + 1 newline
    #       + len('Content-length: ') == 84
    content_len_pos = md5_pos + len(md5_slot) + len(content_len_label)

    if prop_contents:
      self.dumpfile.write(prop_contents)

    # Route the data through a filter that converts all EOLs to LFs
    # if necessary.
    if s_item.needs_eol_filter:
      data_reader = LF_EOL_Filter(pipe.stdout)
    else:
      data_reader = pipe.stdout

    # Copy the revision contents, tallying length and checksum on the
    # way.
    length, checksum = self._copy_contents(data_reader)

    pipe.stdout.close()
    error_output = pipe.stderr.read()
    exit_status = pipe.wait()
    if exit_status:
      raise FatalError("The command '%s' failed with exit status: %s\n"
                       "and the following output:\n"
                       "%s" % (pipe_cmd, exit_status, error_output))

    # Go back and patch the placeholders.  Each 16-zero slot receives
    # its number padded on the left with spaces.
    self.dumpfile.seek(pos, 0)
    self.dumpfile.write('%16d' % length)
    self.dumpfile.seek(md5_pos, 0)
    self.dumpfile.write(checksum.hexdigest())
    self.dumpfile.seek(content_len_pos, 0)
    # The content length covers the property data, the text data and
    # any metadata around or inside them.
    self.dumpfile.write('%16d' % (length + len(prop_contents)))
    # Return to the end of the stream.
    self.dumpfile.seek(0, 2)

    # End the record with two newlines: one terminates contents that
    # were not newline-terminated themselves, the other leaves a
    # blank line for readability.
    self.dumpfile.write('\n\n')

  def add_path(self, s_item):
    """Write the record that adds the file described by S_ITEM, an
    SVNCommitItem."""
    self._add_or_change_path(s_item, OP_ADD)

  def change_path(self, s_item):
    """Write the record that changes the file described by S_ITEM,
    an SVNCommitItem."""
    self._add_or_change_path(s_item, OP_CHANGE)

  def delete_path(self, path):
    """Write the record that deletes PATH."""
    node_path = self._utf8_path(path)
    self.dumpfile.write('Node-path: %s\n'
                        'Node-action: delete\n'
                        '\n' % node_path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Write the record that copies SRC_PATH as of revision
    SRC_REVNUM to DEST_PATH."""
    # No "Node-kind:" header is needed for copies; the loader ignores
    # it anyway and uses the kind of the source instead.
    destination = self._utf8_path(dest_path)
    source = self._utf8_path(src_path)
    self.dumpfile.write('Node-path: %s\n'
                        'Node-action: add\n'
                        'Node-copyfrom-rev: %d\n'
                        'Node-copyfrom-path: /%s\n'
                        '\n'
                        % (destination, src_revnum, source))

  def finish(self):
    """Close the dumpfile once all revisions have been committed."""
    self.dumpfile.close()
4082
4083
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting: each revision is written to a temporary
  dumpfile, which is then fed to an 'svnadmin load' process."""

  def __init__(self):
    """Create the target repository (unless Ctx().existing_svnrepos
    is set), open the temporary dumpfile, and start the 'svnadmin
    load' process that the revisions will be piped into.

    Raise FatalError if the dumpfile header cannot be written to the
    loader."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      if not Ctx().fs_type:
        # User didn't say what kind repository (bdb, fsfs, etc).
        # We still pass --bdb-txn-nosync.  It's a no-op if the default
        # repository type doesn't support it, but we definitely want
        # it if BDB is the default.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--bdb-txn-nosync",
                                           self.target))
      elif Ctx().fs_type == 'bdb':
        # User explicitly specified bdb.
        #
        # Since this is a BDB repository, pass --bdb-txn-nosync,
        # because it gives us a 4-5x speed boost (if cvs2svn is
        # creating the repository, cvs2svn should be the only program
        # accessing the svn repository (until cvs is done, at least)).
        # But we'll turn no-sync off in self.finish(), unless
        # instructed otherwise.
        run_command('%s create %s %s "%s"' % (self.svnadmin,
                                              "--fs-type=bdb",
                                              "--bdb-txn-nosync",
                                              self.target))
      else:
        # User specified something other than bdb.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--fs-type=%s" % Ctx().fs_type,
                                           self.target))

    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.
    DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    # DumpfileDelegate.__init__ opened the dumpfile write-only, but
    # _feed_pipe() has to read it back, so it is reopened for reading
    # and writing.  Close the first handle explicitly before doing
    # so: otherwise it is only closed whenever it happens to be
    # garbage collected, and its buffered header is flushed into the
    # reopened file at that moment.  The loader receives the header
    # directly (see below), so the temporary file starts out empty.
    self.dumpfile.close()
    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = SimplePopen([ self.svnadmin, 'load', '-q',
                                     self.target ], True)
    self.loader_pipe.stdout.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.stdin)
    except IOError:
      raise FatalError("svnadmin failed with the following output while "
                       "loading the dumpfile:\n"
                       + self.loader_pipe.stderr.read())

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe.  Raise FatalError if writing to the pipe fails."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not len(data):
        break
      try:
        self.loader_pipe.stdin.write(data)
      except IOError:
        raise FatalError("svnadmin failed with the following output "
                         "while loading the dumpfile:\n"
                         + self.loader_pipe.stderr.read())

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, its
    dumpfile contents are first loaded into the svn repository.  The
    dumpfile is then emptied and the revision record for SVN_COMMIT
    is written into it."""
    if self._commit_in_progress:
      self._feed_pipe()
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Load the last commit into the repository, shut down the
    loader, and remove the temporary dumpfile.

    Raise FatalError if 'svnadmin load' exits with a non-zero
    status."""
    self._feed_pipe()
    self.dumpfile.close()
    self.loader_pipe.stdin.close()
    error_output = self.loader_pipe.stderr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      raise FatalError('svnadmin load failed with exit status: %s\n'
                       'and the following output:\n'
                       '%s' % (exit_status, error_output,))
    os.remove(self.dumpfile_path)

    # If this is a BDB repository, and we created the repository, and
    # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
    # line in the DB_CONFIG file, because txn syncing should be on by
    # default in BDB repositories.
    #
    # We determine if this is a BDB repository by looking for the
    # DB_CONFIG file, which doesn't exist in FSFS, rather than by
    # checking Ctx().fs_type.  That way this code will Do The Right
    # Thing in all circumstances.
    db_config = os.path.join(self.target, "db/DB_CONFIG")
    if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
        and os.path.exists(db_config)):
      no_sync = 'set_flags DB_TXN_NOSYNC\n'

      config_file = open(db_config, 'r')
      try:
        contents = config_file.readlines()
      finally:
        config_file.close()

      try:
        index = contents.index(no_sync)
      except ValueError:
        # The flag is not set in this DB_CONFIG, so syncing is
        # already on and there is nothing to comment out.  This must
        # not abort a conversion that has otherwise succeeded.
        index = None

      if index is not None:
        contents[index] = '# ' + no_sync
        config_file = open(db_config, 'w')
        try:
          config_file.writelines(contents)
        finally:
          config_file.close()
4194
4195
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """Changes nothing on disk; it only reports, through Log(), what
  the SVNRepositoryMirror is doing.  The messages claim that
  something is being done, although all this class ever does is
  print that it is being done.  Kind of zen, really."""

  def __init__(self, total_revs):
    """TOTAL_REVS is the number shown after the slash in the
    progress line printed by start_commit()."""
    self.total_revs = total_revs

  def _verbose(self, *words):
    """Write WORDS to the log at LOG_VERBOSE level."""
    Log().write(LOG_VERBOSE, *words)

  def start_commit(self, svn_commit):
    """Announce the Subversion revision number of the commit that is
    about to start."""
    self._verbose("=" * 60)
    progress = ("Starting Subversion r%d / %d"
                % (svn_commit.revnum, self.total_revs))
    Log().write(LOG_NORMAL, progress)

  def mkdir(self, path):
    """Report that directory PATH is being created."""
    self._verbose("  New Directory", path)

  def add_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'added'."""
    self._verbose("  Adding", s_item.c_rev.svn_path)

  def change_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'changed'."""
    self._verbose("  Changing", s_item.c_rev.svn_path)

  def delete_path(self, path):
    """Report that PATH is being 'deleted'."""
    self._verbose("  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report that revision SRC_REVNUM of SRC_PATH is being 'copied'
    to DEST_PATH."""
    self._verbose("  Copying revision", src_revnum, "of", src_path)
    self._verbose("                to", dest_path)

  def finish(self):
    """Report that the repository has been created."""
    self._verbose("Finished creating Subversion repository.")
    Log().write(LOG_QUIET, "Done.")
4238
def pass1():
  """Pass 1: walk the project's CVS repository and parse every ',v'
  file found there into a CollectData instance.

  Files that cannot be parsed, and files that exist both inside and
  outside an 'Attic' directory, are recorded as fatal errors.  After
  the walk the symbol database is written; FatalException is then
  raised if any fatal error was recorded or if no ',v' file was
  found at all."""
  OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    """os.path.walk() callback: parse each ',v' file among FILES (the
    entries of directory DIRNAME).  BATON is the CollectData
    instance."""
    cd = baton
    for fname in files:
      if fname[-2:] != ',v':
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname[-6:] == OS_SEP_PLUS_ATTIC:
        # drop the 'Attic' portion from the pathname for the canonical name.
        cd.set_fname(os.path.join(dirname[:-6], fname), pathname)
      else:
        # If this file also exists in the attic, it's a fatal error
        attic_path = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_path):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, pathname, attic_path)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        cd.set_fname(pathname, pathname)
      Log().write(LOG_NORMAL, pathname)
      try:
        rcs_file = open(pathname, 'rb')
        try:
          cvs2svn_rcsparse.parse(rcs_file, cd)
        finally:
          # Close the file whatever the parser did with it, rather
          # than leaving one open handle per ',v' file to the
          # garbage collector.
          rcs_file.close()
      except (cvs2svn_rcsparse.common.RCSParseError, ValueError,
              RuntimeError):
        err = "%s: '%s' is not a valid ,v file" \
              % (error_prefix, pathname)
        sys.stderr.write(err + '\n')
        cd.fatal_errors.append(err)
      except:
        # Anything else is unexpected: note which file was being
        # parsed, then let the exception propagate.
        Log().write(LOG_WARN,
                    "Exception occurred while parsing %s" % pathname)
        raise

  os.path.walk(Ctx().project.project_cvs_repos_path, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if len(cd.fatal_errors) > 0:
    raise FatalException("Pass 1 complete.\n"
                         + "=" * 75 + "\n"
                         + "Error summary:\n"
                         + "\n".join(cd.fatal_errors) + "\n"
                         + "Exited due to fatal error(s).\n")

  if cd.found_valid_file is None:
    raise FatalException(
        "\n"
        "No RCS files found in your CVS Repository!\n"
        "Are you absolutely certain you are pointing cvs2svn\n"
        "at a CVS repository?\n"
        "\n"
        "Exited due to fatal error(s).\n")

  StatsKeeper().reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4301
def pass2():
  """Pass 2: clean up the revision information.

  This pass works in two stages:

  1. Check the symbol-related options against the symbol database:
     exclusions that are blocked by dependent symbols, forced tags
     that have commits, and symbols that are tags in some files but
     branches in others (unless they are forced one way or the other).
     Every problem found is reported on stderr; if there was any, the
     program exits with status 1 before anything is written.

  2. Create the tags database, then copy the revisions file to the
     clean-revisions file, one CVSRevision per line.  Along the way,
     revisions on excluded branches are dropped, references to
     excluded symbols are removed, forced tags/branches are converted,
     and timestamps are resynchronized according to the resync file.
     Every timestamp that is changed is recorded in the
     tweaked-timestamps database so that the neighbors of a revision
     can pick up its new timestamp."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Expand the exclusion regexps into the concrete symbols they match.
  # The result is queried with has_key() below, i.e. it is used as a
  # mapping keyed by symbol name.
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  # Set as soon as any of the three checks below reports a problem; all
  # checks are run before bailing out so the user sees every error.
  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    # An excluded symbol is not converted at all, so it cannot conflict.
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    # A mismatch is acceptable if the user forced the symbol either way.
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database: every tag that is not forced to be a
  # branch, plus every symbol that is forced to be a tag.
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags:
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    """Read the .resync file FNAME into memory and return it as a
    dictionary mapping digest -> list of [lower, upper, new_time]."""

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list
    # because we will dynamically expand the lower/upper bound as we find
    # commits that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    for line in fileinput.FileInput(fname):
      # Line layout: 8 hex digits (old time), separator, digest,
      # separator, new time in hex.
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      resync.setdefault(digest, []).append([t1_l, t1_u, t2])

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    for val in resync.values():
      val.sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  tweaked_timestamps_db = Database(temp(TWEAKED_TIMESTAMPS_DB), DB_OPEN_NEW)
  Cleanup().register(temp(TWEAKED_TIMESTAMPS_DB), pass2)

  # Predicate used to drop references to excluded tags and branches.
  # It does not depend on the revision being processed, so it is
  # defined once instead of once per input line.
  def not_excluded(symbol, excludes=excludes):
    return not excludes.has_key(symbol)

  # process the revisions file, looking for items to clean up
  for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
    c_rev = CVSRevision(Ctx(), line[:-1])

    # Skip this entire revision if it's on an excluded branch
    if excludes.has_key(c_rev.branch_name):
      continue

    # If a neighboring revision has already had its timestamp tweaked
    # during this pass, use the tweaked value for the checks below.
    new_prev_ts = None
    if c_rev.prev_rev is not None:
      new_prev_ts = tweaked_timestamps_db.get(
        c_rev.unique_key(c_rev.prev_rev), None)
    if new_prev_ts:
      c_rev.prev_timestamp = new_prev_ts

    new_next_ts = None
    if c_rev.next_rev is not None:
      new_next_ts = tweaked_timestamps_db.get(
        c_rev.unique_key(c_rev.next_rev), None)
    if new_next_ts:
      c_rev.next_timestamp = new_next_ts

    # Remove all references to excluded tags and branches
    c_rev.branches = filter(not_excluded, c_rev.branches)
    c_rev.tags = filter(not_excluded, c_rev.tags)

    # Convert all branches that are forced to be tags
    for forced_tag in Ctx().forced_tags:
      if forced_tag in c_rev.branches:
        c_rev.branches.remove(forced_tag)
        c_rev.tags.append(forced_tag)

    # Convert all tags that are forced to be branches
    for forced_branch in Ctx().forced_branches:
      if forced_branch in c_rev.tags:
        c_rev.tags.remove(forced_branch)
        c_rev.branches.append(forced_branch)

    # see if this is "near" any of the resync records we
    # have recorded for this digest [of the log message].
    for record in resync.get(c_rev.digest, []):
      if record[2] == c_rev.timestamp:
        # This means that either c_rev is the same revision that
        # caused the resync record to exist, or c_rev is a different
        # CVS revision that happens to have the same timestamp.  In
        # either case, we don't have to do anything, so we...
        continue

      if record[0] <= c_rev.timestamp <= record[1]:
        # bingo!  We probably want to remap the time on this c_rev,
        # unless the remapping would be useless because the new time
        # would fall outside the COMMIT_THRESHOLD window for this
        # commit group.
        new_timestamp = record[2]
        # If the new timestamp is earlier than that of our previous revision
        if new_timestamp < c_rev.prev_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is before previous the time of"
                  + " revision %s (%s):")
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, new_timestamp,
                                        c_rev.prev_rev, c_rev.prev_timestamp))
          # If resyncing our rev to c_rev.prev_timestamp + 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted resync time, then sync back to c_rev.prev_timestamp
          # + 1...
          if ((c_rev.prev_timestamp + 1) - new_timestamp) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.prev_timestamp + 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          else:
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)
            continue

        # If the new timestamp is later than that of our next revision
        elif c_rev.next_timestamp and new_timestamp > c_rev.next_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is after time of next"
                  + " revision %s (%s):")
          # Report the *next* revision here: the message is about the
          # conflict with the revision that follows this one.
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, new_timestamp,
                                        c_rev.next_rev, c_rev.next_timestamp))
          # If resyncing our rev to c_rev.next_timestamp - 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted resync time, then sync forward to c_rev.next_timestamp
          # - 1...
          if (new_timestamp - (c_rev.next_timestamp - 1)) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.next_timestamp - 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          else:
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)
            continue

        # Fix for Issue #71: Avoid resyncing two consecutive revisions
        # to the same timestamp.
        elif (new_timestamp == c_rev.prev_timestamp
              or new_timestamp == c_rev.next_timestamp):
          continue

        # adjust the time range. we want the COMMIT_THRESHOLD from the
        # bounds of the earlier/latest commit in this group.
        record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
        record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

        msg = "PASS2 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
              % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                 new_timestamp - c_rev.timestamp)
        Log().write(LOG_VERBOSE, msg)

        c_rev.timestamp = new_timestamp
        tweaked_timestamps_db[c_rev.unique_key()] = new_timestamp

        # stop looking for hits
        break

    output.write(str(c_rev) + "\n")

  # Make sure everything is flushed to disk before pass 3 sorts the file.
  output.close()
  Log().write(LOG_QUIET, "Done")
4542
def pass3():
  """Pass 3: sort the clean-revisions file into the sorted-revisions
  file, and register the sorted file for cleanup."""
  Log().write(LOG_QUIET, "Sorting CVS revisions...")
  clean_revs_path = temp(DATAFILE + CLEAN_REVS_SUFFIX)
  sorted_revs_path = temp(DATAFILE + SORTED_REVS_SUFFIX)
  sort_file(clean_revs_path, sorted_revs_path)
  # pass5 is the pass handed to Cleanup() together with the sorted file.
  Cleanup().register(temp(DATAFILE + SORTED_REVS_SUFFIX), pass5)
  Log().write(LOG_QUIET, "Done")
4549
def pass4():
  """Pass 4: load the sorted revisions into a database.

  Each line of the sorted-revisions file is parsed into a CVSRevision
  and logged to a new CVSRevisionDatabase and to the StatsKeeper.
  Unless this is a trunk-only conversion, each revision is also logged
  to a new LastSymbolicNameDatabase, which contains the last
  CVSRevision that is a source for each tag or branch.
  """
  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)

  if Ctx().trunk_only:
    # Substitute a do-nothing stand-in so that the loop below does not
    # have to test Ctx().trunk_only for every revision.
    class NullLastSymbolicNameDB:
      def log_revision(self, *args):
        pass
      def create_database(self, *args):
        pass
    last_sym_name_db = NullLastSymbolicNameDB()
  else:
    Log().write(LOG_QUIET,
        "Finding last CVS revisions for all symbolic names...")
    last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for line in sorted_revs:
    # Strip the trailing newline before parsing.
    c_rev = CVSRevision(Ctx(), line[:-1])
    cvs_revs_db.log_revision(c_rev)
    last_sym_name_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  last_sym_name_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4580
def pass5():
  """
  Pass 5: generate the SVNCommit <-> CVSRevision mapping databases.

  Every revision in the sorted-revisions file is fed to a
  CVSRevisionAggregator, except that in a trunk-only conversion the
  revisions that have a branch name are skipped.  CVSCommit._commit
  also calls SymbolingsLogger to register CVSRevisions that represent
  an opening or closing for a path on a branch or tag.  See
  SymbolingsLogger for more details.
  """
  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for line in sorted_revs:
    c_rev = CVSRevision(Ctx(), line[:-1])
    if Ctx().trunk_only and c_rev.branch_name is not None:
      # Branch revision in a trunk-only conversion: ignore it.
      continue
    aggregator.process_revision(c_rev)
  aggregator.flush()

  stats = StatsKeeper()
  stats.set_svn_rev_count(SVNCommit.revnum - 1)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4600
def pass6():
  """Pass 6: sort the symbol openings/closings file.

  Nothing is sorted for a trunk-only conversion."""
  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  trunk_only = Ctx().trunk_only
  if not trunk_only:
    unsorted_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    sorted_path = temp(SYMBOL_OPENINGS_CLOSINGS_SORTED)
    sort_file(unsorted_path, sorted_path)
    # pass8 is the pass handed to Cleanup() together with the sorted file.
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), pass8)

  Log().write(LOG_QUIET, "Done")
4609
def pass7():
  """Pass 7: record where each symbolic name starts in the sorted
  openings/closings file.

  Nothing is generated for a trunk-only conversion."""
  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      # readline() is used rather than iterating over the file so that
      # tell() reports the exact offset of the line about to be read.
      while 1:
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only the symbol name is needed; the three-way unpacking is
        # kept so that a line with too few fields still raises
        # ValueError.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          # First line for this symbol: remember where it starts.
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      # Release the file handle even if a line could not be parsed.
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4643
def pass8():
  """Pass 8: replay the SVN commits into the output.

  The commits are fetched from the PersistenceManager in numerical
  order, starting with revision 2, until one is not found, and each is
  committed to an SVNRepositoryMirror.  Depending on Ctx().target the
  mirror gets a RepositoryDelegate or a DumpfileDelegate (neither in a
  dry run), and always a StdoutDelegate."""
  repos = SVNRepositoryMirror()
  persistence_manager = PersistenceManager(DB_OPEN_READ)

  if Ctx().target:
    delegate_class = RepositoryDelegate
    start_message = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    start_message = "Starting Subversion Dumpfile."

  if not Ctx().dry_run:
    repos.add_delegate(delegate_class())
  Log().write(LOG_QUIET, start_message)

  repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  # Repository initialization is 1, so the first real commit is 2.
  svncounter = 2
  svn_commit = persistence_manager.get_svn_commit(svncounter)
  while svn_commit:
    repos.commit(svn_commit)
    svncounter += 1
    svn_commit = persistence_manager.get_svn_commit(svncounter)

  repos.finish()
4668
# The conversion passes in execution order; pass number N is stored at
# index N - 1.
_passes = [pass1, pass2, pass3, pass4, pass5, pass6, pass7, pass8]
4679
4680
class Ctx:
  """Session state for this run of cvs2svn, such as the run-time
  options.

  This class is a Borg (see
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531):
  every instance uses the same attribute dictionary, so Ctx() may be
  instantiated anywhere to reach the one shared state.
  """
  __shared_state = { }

  def __init__(self):
    self.__dict__ = self.__shared_state
    if not self.__dict__:
      # The shared state is still empty, so this is the first
      # instantiation: fill in the defaults.

      # Output and working files.
      self.target = None
      self.dumpfile = DUMPFILE
      self.tmpdir = '.'

      # Verbosity.
      self.verbose = 0
      self.quiet = 0

      # Conversion behavior.
      self.prune = 1
      self.existing_svnrepos = 0
      self.dump_only = 0
      self.dry_run = 0
      self.trunk_only = 0

      # Layout of the resulting repository.
      self.trunk_base = "trunk"
      self.tags_base = "tags"
      self.branches_base = "branches"

      self.encoding = ["ascii"]

      # File property handling.
      self.mime_types_file = None
      self.auto_props_file = None
      self.auto_props_ignore_case = False
      self.no_default_eol = 0
      self.eol_from_mime_type = 0
      self.keywords_off = 0

      # External tools and miscellaneous options.
      self.use_cvs = None
      self.svnadmin = "svnadmin"
      self.username = None
      self.print_help = 0
      self.skip_cleanup = 0
      self.bdb_txn_nosync = 0
      self.fs_type = None

      # Symbol handling.
      self.forced_branches = []
      self.forced_tags = []
      self.excludes = []
      self.symbol_transforms = []

      self.svn_property_setters = []
4724
4725
class CVSRevisionNumberSetter(SVNPropertySetter):
  """Record the CVS revision number in the cvs2svn:cvs-rev property."""

  def set_properties(self, s_item):
    cvs_rev_number = s_item.c_rev.rev
    s_item.svn_props['cvs2svn:cvs-rev'] = cvs_rev_number
    # Flag the item's properties as changed.
    s_item.svn_props_changed = True
4732
4733
class MimeMapper(SVNPropertySetter):
  """Set svn:mime-type from a table that maps file name extensions (or
  whole base names) to MIME types."""

  def __init__(self, mime_types_file):
    """Load the mappings from MIME_TYPES_FILE.

    Each useful line looks something like

        text/plain c h cpp

    i.e. a MIME type followed by one or more extensions.  Lines that
    start with '#' and lines with fewer than two fields are ignored.
    If an extension is listed for two different types, a warning is
    written to stderr and the later type wins."""
    self.mappings = { }

    for line in fileinput.input(mime_types_file):
      if line.startswith("#"):
        continue

      fields = line.split()
      if len(fields) < 2:
        continue

      mime_type = fields[0]
      for ext in fields[1:]:
        previous = self.mappings.get(ext, None)
        if previous is not None and previous != mime_type:
          sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n"
                           % (warning_prefix, ext, previous, mime_type))
        self.mappings[ext] = mime_type

  def set_properties(self, s_item):
    filename = os.path.basename(s_item.c_rev.cvs_path)
    basename, extension = os.path.splitext(filename)

    # splitext() leaves the dot on the extension, so drop it.  If that
    # leaves nothing (no extension, or a file name ending in a period),
    # look up the base name instead; this allows mappings for files
    # such as README or Makefile.
    lookup_key = extension[1:] or basename

    mime_type = self.mappings.get(lookup_key, None)
    if mime_type is None:
      return
    s_item.svn_props['svn:mime-type'] = mime_type
4774
4775
class AutoPropsPropertySetter(SVNPropertySetter):
  """Set arbitrary svn properties based on an auto-props configuration.

  This class supports either case-sensitive or case-insensitive
  pattern matching, selected by the IGNORE_CASE constructor argument.
  The 'correct' behavior is not quite clear, because subversion itself
  does an inconsistent job of handling case in auto-props patterns; see
  http://subversion.tigris.org/issues/show_bug.cgi?id=2036."""

  class Pattern:
    """Describes the properties to be set for files matching a pattern."""
    def __init__(self, pattern, propdict):
      # A glob-like pattern:
      self.pattern = pattern
      # A dictionary of properties that should be set:
      self.propdict = propdict

    def match(self, basename):
      """Does the file with the specified basename match pattern?"""
      # NOTE(review): fnmatch.fnmatch() normalizes case with
      # os.path.normcase(), so on platforms where that folds case the
      # match ignores case even when ignore_case is false -- confirm
      # whether fnmatch.fnmatchcase() is wanted here.
      return fnmatch.fnmatch(basename, self.pattern)

  def __init__(self, configfilename, ignore_case):
    """Read the auto-props patterns from the file CONFIGFILENAME, which
    is parsed with ConfigParser.  Only sections whose name (after case
    transformation) is 'auto-props' are used.  If IGNORE_CASE is true,
    section names, patterns and file names are all lower-cased before
    they are compared."""
    config = ConfigParser.ConfigParser()
    if ignore_case:
      self.transform_case = self.squash_case
    else:
      # Stop ConfigParser from transforming the option names (i.e. the
      # patterns) itself.
      config.optionxform = self.preserve_case
      self.transform_case = self.preserve_case

    configfile = open(configfilename)
    try:
      config.readfp(configfile)
    finally:
      # Do not leave the handle open once the file has been parsed.
      configfile.close()

    self.patterns = []
    for section in config.sections():
      if self.transform_case(section) == 'auto-props':
        for (pattern, value) in config.items(section):
          if value:
            self._add_pattern(pattern, value)

  def squash_case(self, s):
    """Return S converted to lower case."""
    return s.lower()

  def preserve_case(self, s):
    """Return S unchanged."""
    return s

  def _add_pattern(self, pattern, value):
    """Register PATTERN with the properties described by VALUE.

    VALUE is a ';'-separated list of 'name=value' or bare 'name'
    entries; a bare name is stored with the value None.  Whitespace
    around an entry is ignored and empty entries (e.g. those produced
    by a trailing ';') are skipped, so that they cannot produce a
    property with an empty or space-padded name."""
    propdict = {}
    for prop in value.split(';'):
      prop = prop.strip()
      if not prop:
        continue
      s = prop.split('=', 1)
      if len(s) == 1:
        propdict[s[0]] = None
      else:
        propdict[s[0]] = s[1]
    self.patterns.append(
        self.Pattern(self.transform_case(pattern), propdict))

  def get_propdict(self, path):
    """Return a dictionary of the properties that the patterns assign
    to the file at PATH; only the base name of PATH is matched.

    If several matching patterns set the same property to different
    values, the value from the first matching pattern is kept and a
    warning is logged."""
    basename = self.transform_case(os.path.basename(path))
    propdict = {}
    for pattern in self.patterns:
      if pattern.match(basename):
        for (key,value) in pattern.propdict.items():
          if propdict.has_key(key):
            if propdict[key] != value:
              Log().write(
                  LOG_WARN,
                  "Contradictory values set for property '%s' for file %s."
                  % (key, path,))
          else:
            propdict[key] = value

    return propdict

  def set_properties(self, s_item):
    """Apply the auto-props for S_ITEM's file to S_ITEM.svn_props.

    A property that is already present is never overwritten; if the
    existing value differs from the auto-props value, a warning is
    logged instead."""
    propdict = self.get_propdict(s_item.c_rev.cvs_path)
    for (k,v) in propdict.items():
      if s_item.svn_props.has_key(k):
        if s_item.svn_props[k] != v:
          Log().write(
              LOG_WARN,
              "Property '%s' already set for file %s."
              % (k, s_item.c_rev.cvs_path,))
      else:
        s_item.svn_props[k] = v
4860
4861
class BinaryFileDefaultMimeTypeSetter(SVNPropertySetter):
  """Give binary files the mime type application/octet-stream unless a
  mime type has already been set."""

  def set_properties(self, s_item):
    if s_item.svn_props.has_key('svn:mime-type'):
      # A mime type has already been set; leave it alone.
      return
    if s_item.c_rev.mode == 'b':
      s_item.svn_props['svn:mime-type'] = 'application/octet-stream'
4869
4870
class BinaryFileEOLStyleSetter(SVNPropertySetter):
  """Set svn:eol-style to None for files whose mode is 'b' (binary)."""

  def set_properties(self, s_item):
    is_binary = (s_item.c_rev.mode == 'b')
    if not is_binary:
      return
    s_item.svn_props['svn:eol-style'] = None
4877
4878
class EOLStyleFromMimeTypeSetter(SVNPropertySetter):
  """Derive svn:eol-style from svn:mime-type if it is not already set.

  The mime type must therefore have been set before this setter runs.
  A mime type starting with "text/" gives 'native'; any other mime
  type gives None.  Without a mime type nothing is set.  See also
  issue #39."""

  def set_properties(self, s_item):
    props = s_item.svn_props
    if props.has_key('svn:eol-style'):
      return

    mime_type = props.get('svn:mime-type', None)
    if mime_type is None:
      return

    if mime_type.startswith("text/"):
      eol_style = 'native'
    else:
      eol_style = None
    props['svn:eol-style'] = eol_style
4892
4893
class DefaultEOLStyleSetter(SVNPropertySetter):
  """Supply a fallback svn:eol-style for items that have none yet."""

  def __init__(self, value):
    """Remember VALUE as the eol-style to fall back on."""

    self.value = value

  def set_properties(self, s_item):
    props = s_item.svn_props
    if props.has_key('svn:eol-style'):
      # Already decided by an earlier setter.
      return
    props['svn:eol-style'] = self.value
4905
4906
class KeywordsPropertySetter(SVNPropertySetter):
  """Set svn:keywords for files whose mode is None, 'kv' or 'kvl',
  unless the property is already set.  See issue #2."""

  def __init__(self, value):
    """VALUE is the value given to svn:keywords whenever it is set."""

    self.value = value

  def set_properties(self, s_item):
    if s_item.svn_props.has_key('svn:keywords'):
      return
    if s_item.c_rev.mode in (None, 'kv', 'kvl'):
      s_item.svn_props['svn:keywords'] = self.value
4921
4922
class ExecutablePropertySetter(SVNPropertySetter):
  """Set svn:executable to '*' when c_rev.file_executable is true."""

  def set_properties(self, s_item):
    if not s_item.c_rev.file_executable:
      return
    s_item.svn_props['svn:executable'] = '*'
4929
4930
def convert(start_pass, end_pass):
  """Convert a CVS repository to an SVN repository.

  Run the passes numbered START_PASS through END_PASS (1-based,
  inclusive), timing each one, and finally log the statistics and
  timings."""

  cleanup = Cleanup()
  # times[N] is the time at which pass N finished; times[start_pass - 1]
  # is the time at which the first requested pass started.
  times = [ None ] * (end_pass + 1)
  times[start_pass - 1] = time.time()
  StatsKeeper().set_start_time(time.time())

  for pass_num in range(start_pass, end_pass + 1):
    Log().write(LOG_QUIET, '----- pass %d -----' % pass_num)
    run_pass = _passes[pass_num - 1]
    run_pass()
    times[pass_num] = time.time()
    elapsed = times[pass_num] - times[pass_num - 1]
    StatsKeeper().log_duration_for_pass(elapsed, pass_num)

    # Dispose of items in Ctx() not intended to live past the end of the
    # pass.  They are identified by exactly one leading underscore; the
    # name-mangled private attributes of Ctx itself are left alone.
    for attr in dir(Ctx()):
      if len(attr) <= 2:
        continue
      if (attr.startswith('_') and attr[1] != '_'
          and not attr.startswith("_Ctx__")):
        delattr(Ctx(), attr)

    if not Ctx().skip_cleanup:
      cleanup.cleanup(run_pass)
    StatsKeeper().set_end_time(time.time())

  Log().write(LOG_QUIET, StatsKeeper())
  if end_pass < 4:
    Log().write(LOG_QUIET,
                '(These are unaltered CVS repository stats and do not\n'
                ' reflect tags or branches excluded via --exclude)\n')
  Log().write(LOG_NORMAL, StatsKeeper().timings())
4959
4960
def normalize_ttb_path(opt, path):
  """Return PATH normalized for use with --trunk, --tags, or --branches.

  PATH is split at '/' and its components are rejoined with
  _path_join(), with the aim of stripping leading, trailing, and
  duplicated '/'.  OPT is the name of the option that PATH came from;
  it is only used in the error message.

  Raise FatalError if the normalized path is empty."""

  components = path.split('/')
  norm_path = _path_join(*components)
  if norm_path:
    return norm_path
  raise FatalError("cannot pass an empty path to %s." % (opt,))
4975
4976
def verify_paths_disjoint(*paths):
  """Check that no two of the PATHS overlap.

  Two paths overlap if they are identical or if one is nested in the
  other (in the sense that 'a/b/c/d' is nested in 'a/b').  Raise
  FatalError for the first overlapping pair found; otherwise return
  None."""

  # Compare the paths component-wise: pair each one with its list of
  # components.  When all overlapping elements are equal, a shorter
  # list sorts before a longer one, so after sorting, any nesting shows
  # up as at least one adjacent pair in the order [nest, nestling].
  keyed = [(path.split('/'), path) for path in paths]
  keyed.sort()
  for earlier, later in zip(keyed[:-1], keyed[1:]):
    outer_parts, outer = earlier
    inner_parts, inner = later
    # The slice can only equal outer_parts if inner_parts is at least
    # as long and starts with the same components.
    if inner_parts[:len(outer_parts)] == outer_parts:
      raise FatalError("paths %s and %s are not disjoint." % (outer, inner,))
4996
4997
4998 def usage():
4999   print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
5000         % os.path.basename(sys.argv[0])
5001   print '  --help, -h           print this usage message and exit with success'
5002   print '  --version            print the version number'
5003   print '  -q                   quiet'
5004   print '  -v                   verbose'
5005   print '  -s PATH              path for SVN repos'
5006   print '  -p START[:END]       start at pass START, end at pass END of %d' \
5007         % len(_passes)
5008   print '                       If only START is given, run only pass START'
5009   print '                       (implicitly enables --skip-cleanup)'
5010   print '  --existing-svnrepos  load into existing SVN repository'
5011   print '  --dumpfile=PATH      name of intermediate svn dumpfile'
5012   print '  --tmpdir=PATH        directory to use for tmp data (default to cwd)'
5013   print '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)'
5014   print '  --dry-run            do not create a repository or a dumpfile;'
5015   print '                       just print what would happen.'
5016   print '  --use-cvs            use CVS instead of RCS \'co\' to extract data'
5017   print '                       (only use this if having problems with RCS)'
5018   print '  --svnadmin=PATH      path to the svnadmin program'
5019   print '  --trunk-only         convert only trunk commits, not tags nor branches'
5020   print '  --trunk=PATH         path for trunk (default: %s)'    \
5021         % Ctx().trunk_base
5022   print '  --branches=PATH      path for branches (default: %s)' \
5023         % Ctx().branches_base
5024   print '  --tags=PATH          path for tags (default: %s)'     \
5025         % Ctx().tags_base
5026   print '  --no-prune           don\'t prune empty directories'
5027   print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
5028   print '  --encoding=ENC       encoding of paths and log messages in CVS repos'
5029   print '                       Multiple of these options may be passed, where they'
5030   print '                       will be treated as an ordered list of encodings to'
5031   print '                       attempt (with "ascii" as a hardcoded last resort)'
5032   print '  --force-branch=NAME  force NAME to be a branch'
5033   print '  --force-tag=NAME     force NAME to be a tag'
5034   print '  --exclude=REGEXP     exclude branches and tags matching REGEXP'
5035   print '  --symbol-transform=P:S transform symbol names from P to S where P and S'
5036   print '                       use Python regexp and reference syntax respectively'
5037   print '  --username=NAME      username for cvs2svn-synthesized commits'
5038   print '  --skip-cleanup       prevent the deletion of intermediate files'
5039   print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
5040   print '  --fs-type=TYPE       pass --fs-type=TYPE to "svnadmin create"'
5041   print '  --cvs-revnums        record CVS revision numbers as file properties'
5042   print '  --auto-props=FILE    set file properties from the auto-props section'
5043   print '                       of a file in svn config format'
5044   print '  --auto-props-ignore-case Ignore case when matching auto-props patterns'
5045   print '  --mime-types=FILE    specify an apache-style mime.types file for'
5046   print '                       setting svn:mime-type'
5047   print '  --eol-from-mime-type set svn:eol-style from mime type if known'
5048   print '  --no-default-eol     don\'t set svn:eol-style to \'native\' for'
5049   print '                       non-binary files with undetermined mime types'
5050   print '  --keywords-off       don\'t set svn:keywords on any files (by default,'
5051   print '                       cvs2svn sets svn:keywords on non-binary files to'
5052   print '                       "%s")' % SVN_KEYWORDS_VALUE
5053
5054 def main():
5055   # Convenience var, so we don't have to keep instantiating this Borg.
5056   ctx = Ctx()
5057
5058   profiling = None
5059   start_pass = 1
5060   end_pass = len(_passes)
5061
5062   try:
5063     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
5064                                [ "help", "create", "trunk=",
5065                                  "username=", "existing-svnrepos",
5066                                  "branches=", "tags=", "encoding=",
5067                                  "force-branch=", "force-tag=", "exclude=",
5068                                  "use-cvs", "mime-types=",
5069                                  "auto-props=", "auto-props-ignore-case",
5070                                  "eol-from-mime-type", "no-default-eol",
5071                                  "trunk-only", "no-prune", "dry-run",
5072                                  "dump-only", "dumpfile=", "tmpdir=",
5073                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
5074                                  "bdb-txn-nosync", "fs-type=",
5075                                  "version", "profile",
5076                                  "keywords-off", "symbol-transform="])
5077   except getopt.GetoptError, e:
5078     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
5079     usage()
5080     sys.exit(1)
5081
5082   for opt, value in opts:
5083     if opt == '--version':
5084         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
5085         sys.exit(0)
5086     elif opt == '-p':
5087       # Don't cleanup if we're doing incrementals.
5088       ctx.skip_cleanup = 1
5089       if value.find(':') > 0:
5090         start_pass, end_pass = map(int, value.split(':'))
5091       else:
5092         end_pass = start_pass = int(value)
5093       if start_pass > len(_passes) or start_pass < 1:
5094         raise FatalError(
5095             'illegal value (%d) for starting pass.  Must be 1 through %d.'
5096             % (int(start_pass), len(_passes),))
5097       if end_pass < start_pass or end_pass > len(_passes):
5098         raise FatalError(
5099             'illegal value (%d) for ending pass.  Must be %d through %d.'
5100             % (int(end_pass), int(start_pass), len(_passes),))
5101     elif (opt == '--help') or (opt == '-h'):
5102       ctx.print_help = 1
5103     elif opt == '-v':
5104       Log().log_level = LOG_VERBOSE
5105       ctx.verbose = 1
5106     elif opt == '-q':
5107       Log().log_level = LOG_QUIET
5108       ctx.quiet = 1
5109     elif opt == '-s':
5110       ctx.target = value
5111     elif opt == '--existing-svnrepos':
5112       ctx.existing_svnrepos = 1
5113     elif opt == '--dumpfile':
5114       ctx.dumpfile = value
5115     elif opt == '--tmpdir':
5116       ctx.tmpdir = value
5117     elif opt == '--use-cvs':
5118       ctx.use_cvs = 1
5119     elif opt == '--svnadmin':
5120       ctx.svnadmin = value
5121     elif opt == '--trunk-only':
5122       ctx.trunk_only = 1
5123     elif opt == '--trunk':
5124       ctx.trunk_base = normalize_ttb_path(opt, value)
5125     elif opt == '--branches':
5126       ctx.branches_base = normalize_ttb_path(opt, value)
5127     elif opt == '--tags':
5128       ctx.tags_base = normalize_ttb_path(opt, value)
5129     elif opt == '--no-prune':
5130       ctx.prune = None
5131     elif opt == '--dump-only':
5132       ctx.dump_only = 1
5133     elif opt == '--dry-run':
5134       ctx.dry_run = 1
5135     elif opt == '--encoding':
5136       ctx.encoding.insert(-1, value)
5137     elif opt == '--force-branch':
5138       ctx.forced_branches.append(value)
5139     elif opt == '--force-tag':
5140       ctx.forced_tags.append(value)
5141     elif opt == '--exclude':
5142       try:
5143         ctx.excludes.append(re.compile('^' + value + '$'))
5144       except re.error, e:
5145         raise FatalError("'%s' is not a valid regexp." % (value,))
5146     elif opt == '--mime-types':
5147       ctx.mime_types_file = value
5148     elif opt == '--auto-props':
5149       ctx.auto_props_file = value
5150     elif opt == '--auto-props-ignore-case':
5151       ctx.auto_props_ignore_case = True
5152     elif opt == '--eol-from-mime-type':
5153       ctx.eol_from_mime_type = 1
5154     elif opt == '--no-default-eol':
5155       ctx.no_default_eol = 1
5156     elif opt == '--keywords-off':
5157       ctx.keywords_off = 1
5158     elif opt == '--username':
5159       ctx.username = value
5160     elif opt == '--skip-cleanup':
5161       ctx.skip_cleanup = 1
5162     elif opt == '--cvs-revnums':
5163       ctx.svn_property_setters.append(CVSRevisionNumberSetter())
5164     elif opt == '--bdb-txn-nosync':
5165       ctx.bdb_txn_nosync = 1
5166     elif opt == '--fs-type':
5167       ctx.fs_type = value
5168     elif opt == '--create':
5169       sys.stderr.write(warning_prefix +
5170           ': The behaviour produced by the --create option is now the '
5171           'default,\nand passing the option is deprecated.\n')
5172     elif opt == '--profile':
5173       profiling = 1
5174     elif opt == '--symbol-transform':
5175       [pattern, replacement] = value.split(":")
5176       try:
5177         pattern = re.compile(pattern)
5178       except re.error, e:
5179         raise FatalError("'%s' is not a valid regexp." % (pattern,))
5180       ctx.symbol_transforms.append((pattern, replacement,))
5181
5182   if ctx.print_help:
5183     usage()
5184     sys.exit(0)
5185
5186   # Consistency check for options and arguments.
5187   if len(args) == 0:
5188     usage()
5189     sys.exit(1)
5190
5191   if len(args) > 1:
5192     sys.stderr.write(error_prefix +
5193                      ": must pass only one CVS repository.\n")
5194     usage()
5195     sys.exit(1)
5196
5197   cvsroot = args[0]
5198
5199   if ctx.use_cvs:
5200     ctx.cvs_repository = CVSRepositoryViaCVS(cvsroot)
5201   else:
5202     ctx.cvs_repository = CVSRepositoryViaRCS(cvsroot)
5203
5204   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
5205     raise FatalError("must pass one of '-s' or '--dump-only'.")
5206
5207   def not_both(opt1val, opt1name, opt2val, opt2name):
5208     if opt1val and opt2val:
5209       raise FatalError("cannot pass both '%s' and '%s'."
5210                        % (opt1name, opt2name,))
5211
5212   not_both(ctx.target, '-s',
5213            ctx.dump_only, '--dump-only')
5214
5215   not_both(ctx.dump_only, '--dump-only',
5216            ctx.existing_svnrepos, '--existing-svnrepos')
5217
5218   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
5219            ctx.existing_svnrepos, '--existing-svnrepos')
5220
5221   not_both(ctx.dump_only, '--dump-only',
5222            ctx.bdb_txn_nosync, '--bdb-txn-nosync')
5223
5224   not_both(ctx.quiet, '-q',
5225            ctx.verbose, '-v')
5226
5227   not_both(ctx.fs_type, '--fs-type',
5228            ctx.existing_svnrepos, '--existing-svnrepos')
5229
5230   if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
5231     raise FatalError("cannot pass --bdb-txn-nosync with --fs-type=%s."
5232                      % ctx.fs_type)
5233
5234   # Create the default project (using ctx.trunk, ctx.branches, and ctx.tags):
5235   ctx.project = Project(ctx.cvs_repository.cvs_repos_path,
5236                         ctx.trunk_base, ctx.branches_base, ctx.tags_base)
5237
5238   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
5239     raise FatalError("the svn-repos-path '%s' is not an "
5240                      "existing directory." % ctx.target)
5241
5242   if not ctx.dump_only and not ctx.existing_svnrepos \
5243      and (not ctx.dry_run) and os.path.exists(ctx.target):
5244     raise FatalError("the svn-repos-path '%s' exists.\n"
5245                      "Remove it, or pass '--existing-svnrepos'."
5246                      % ctx.target)
5247
5248   if ctx.target and not ctx.dry_run:
5249     # Verify that svnadmin can be executed.  The 'help' subcommand
5250     # should be harmless.
5251     try:
5252       check_command_runs([ctx.svnadmin, 'help'], 'svnadmin')
5253     except CommandFailedException, e:
5254       raise FatalError(
5255           '%s\n'
5256           'svnadmin could not be executed.  Please ensure that it is\n'
5257           'installed and/or use the --svnadmin option.' % (e,))
5258
5259   if ctx.mime_types_file:
5260     ctx.svn_property_setters.append(MimeMapper(ctx.mime_types_file))
5261
5262   if ctx.auto_props_file:
5263     ctx.svn_property_setters.append(AutoPropsPropertySetter(
5264         ctx.auto_props_file, ctx.auto_props_ignore_case))
5265
5266   ctx.svn_property_setters.append(BinaryFileDefaultMimeTypeSetter())
5267   ctx.svn_property_setters.append(BinaryFileEOLStyleSetter())
5268
5269   if ctx.eol_from_mime_type:
5270     ctx.svn_property_setters.append(EOLStyleFromMimeTypeSetter())
5271
5272   if ctx.no_default_eol:
5273     ctx.svn_property_setters.append(DefaultEOLStyleSetter(None))
5274   else:
5275     ctx.svn_property_setters.append(DefaultEOLStyleSetter('native'))
5276
5277   if not ctx.keywords_off:
5278     ctx.svn_property_setters.append(
5279         KeywordsPropertySetter(SVN_KEYWORDS_VALUE))
5280
5281   ctx.svn_property_setters.append(ExecutablePropertySetter())
5282
5283   # Make sure the tmp directory exists.  Note that we don't check if
5284   # it's empty -- we want to be able to use, for example, "." to hold
5285   # tempfiles.  But if we *did* want check if it were empty, we'd do
5286   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
5287   if not os.path.exists(ctx.tmpdir):
5288     os.mkdir(ctx.tmpdir)
5289   elif not os.path.isdir(ctx.tmpdir):
5290     raise FatalError(
5291         "cvs2svn tried to use '%s' for temporary files, but that path\n"
5292         "  exists and is not a directory.  Please make it be a directory,\n"
5293         "  or specify some other directory for temporary files."
5294         % (ctx.tmpdir,))
5295
5296   # But do lock the tmpdir, to avoid process clash.
5297   try:
5298     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5299   except OSError, e:
5300     if e.errno == errno.EACCES:
5301       raise FatalError("Permission denied:"
5302                        + " No write access to directory '%s'." % ctx.tmpdir)
5303     if e.errno == errno.EEXIST:
5304       raise FatalError(
5305           "cvs2svn is using directory '%s' for temporary files, but\n"
5306           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
5307           "  cvs2svn process is currently using '%s' as its temporary\n"
5308           "  workspace.  If you are certain that is not the case,\n"
5309           "  then remove the '%s/cvs2svn.lock' subdirectory."
5310           % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir,))
5311     raise
5312   try:
5313     if profiling:
5314       import hotshot
5315       prof = hotshot.Profile('cvs2svn.hotshot')
5316       prof.runcall(convert, start_pass, end_pass)
5317       prof.close()
5318     else:
5319       convert(start_pass, end_pass)
5320   finally:
5321     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5322     except: pass
5323
5324
5325 if __name__ == '__main__':
5326   try:
5327     main()
5328   except FatalException, e:
5329     sys.stderr.write(str(e))
5330     sys.exit(1)
5331
5332