cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
  20 VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 import cvs2svn_rcsparse
  23 import os
  24 import sys
  25 import sha
  26 import re
  27 import time
  28 import fileinput
  29 import string
  30 import getopt
  31 import stat
  32 import md5
  33 import marshal
  34 import errno
  35 import popen2
  36 import types
  37 try:
  38   # Try to get access to a bunch of encodings for use with --encoding.
  39   # See http://cjkpython.i18n.org/ for details.
  40   import iconv_codec
  41 except ImportError:
  42   pass
  43
  44 # Warnings and errors start with these strings.  They are typically
  45 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  46 warning_prefix = "WARNING"
  47 error_prefix = "ERROR"
  48
  49 # Make sure this Python is recent enough.
  50 if sys.hexversion < 0x2000000:
  51   sys.stderr.write("'%s: Python 2.0 or higher required, "
  52                    "see www.python.org.\n" % error_prefix)
  53   sys.exit(1)
  54
  55 # Pretend we have true booleans on older python versions
  56 try:
  57   True
  58 except:
  59   True = 1
  60   False = 0
  61
  62 # Opening pipes was a mess before Python 2.4, because some methods did
  63 # not exist on some platforms, and some behaved differenly on other.
  64 # Python 2.4 solved this by adding the subprocess module, but since we
  65 # cannot require such a new version, we cannot use it directly, but
  66 # must implement a simplified Popen using the best means neccessary.
  67 #
  68 # The SimplePopen class only has the following members and methods, all
  69 # behaving as documented in the subprocess.Popen class:
  70 #     - stdin
  71 #     - stdout
  72 #     - stderr
  73 #     - wait
  74 try:
  75   # First try subprocess.Popen...
  76   import subprocess
  77   class SimplePopen:
  78     def __init__(self, cmd, capture_stderr):
  79       if capture_stderr:
  80         stderr = subprocess.PIPE
  81       else:
  82         stderr = None
  83       self._popen = subprocess.Popen(cmd, stdin=subprocess.PIPE,
  84                                     stdout=subprocess.PIPE, stderr=stderr)
  85       self.stdin = self._popen.stdin
  86       self.stdout = self._popen.stdout
  87       if capture_stderr:
  88         self.stderr = self._popen.stderr
  89       self.wait = self._popen.wait
  90 except ImportError:
  91   if hasattr(popen2, 'Popen3'):
  92     # ...then try popen2.Popen3...
  93     class SimplePopen:
  94       def __init__(self, cmd, capture_stderr):
  95         self._popen3 = popen2.Popen3(cmd, capture_stderr)
  96         self.stdin = self._popen3.tochild
  97         self.stdout = self._popen3.fromchild
  98         if capture_stderr:
  99           self.stderr = self._popen3.childerr
 100         self.wait = self._popen3.wait
 101   else:
 102     # ...and if all fails, use popen2.popen3...
 103     class SimplePopen:
 104       def __init__(self, cmd, capture_stderr):
 105         if type(cmd) != types.StringType:
 106           cmd = argv_to_command_string(cmd)
 107         self.stdout, self.stdin, self.stderr = popen2.popen3(cmd, mode='b')
 108       def wait(self):
 109         return self.stdout.close() or self.stdin.close() or \
 110                self.stderr.close()
 111
 112 # DBM module selection
 113
 114 # 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
 115 #    so that the dbhash module used by anydbm will use bsddb3.
 116 try:
 117   import bsddb3
 118   sys.modules['bsddb'] = sys.modules['bsddb3']
 119 except ImportError:
 120   pass
 121
 122 # 2. These DBM modules are not good for cvs2svn.
 123 import anydbm
 124 if (anydbm._defaultmod.__name__ == 'dumbdbm'
 125     or anydbm._defaultmod.__name__ == 'dbm'):
 126   sys.stderr.write(
 127     error_prefix
 128     + ': your installation of Python does not contain a suitable\n'
 129     + 'DBM module -- cvs2svn cannot continue.\n'
 130     + 'See http://python.org/doc/current/lib/module-anydbm.html to solve.\n')
 131   sys.exit(1)
 132
 133 # 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
 134 #    Unfortunately, gdbm appears not to be trouble free, either.
 135 if hasattr(anydbm._defaultmod, 'bsddb') \
 136     and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
 137   try:
 138     gdbm = __import__('gdbm')
 139   except ImportError:
 140     sys.stderr.write(warning_prefix +
 141         ': The version of the bsddb module found '
 142         'on your computer has been reported to malfunction on some datasets, '
 143         'causing KeyError exceptions. You may wish to upgrade your Python to '
 144         'version 2.3 or later.\n')
 145   else:
 146     anydbm._defaultmod = gdbm
 147
 148 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
 149 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
 150 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
 151
 152 SVN_KEYWORDS_VALUE = 'Author Date Id Revision'
 153
 154 # This really only matches standard '1.1.1.*'-style vendor revisions.
 155 # One could conceivably have a file whose default branch is 1.1.3 or
 156 # whatever, or was that at some point in time, with vendor revisions
 157 # 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
 158 # is the only time this regexp gets used), we'd have no basis for
 159 # assuming that the non-standard vendor branch had ever been the
 160 # default branch anyway, so we don't want this to match them anyway.
 161 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 162
 163 # If this run's output is a repository, then (in the tmpdir) we use
 164 # a dumpfile of this name for repository loads.
 165 #
 166 # If this run's output is a dumpfile, then this is default name of
 167 # that dumpfile, but in the current directory (unless the user has
 168 # specified a dumpfile path, of course, in which case it will be
 169 # wherever the user said).
 170 DUMPFILE = 'cvs2svn-dump'
 171
 172 # This file appears with different suffixes at different stages of
 173 # processing.  CVS revisions are cleaned and sorted here, for commit
 174 # grouping.  See design-notes.txt for details.
 175 DATAFILE = 'cvs2svn-data'
 176
 177 # This file contains a marshalled copy of all the statistics that we
# gather throughout the various runs of cvs2svn.  The data is stored as a
# marshalled dictionary.
 180 STATISTICS_FILE = 'cvs2svn-statistics'
 181
 182 # This text file contains records (1 per line) that describe svn
 183 # filesystem paths that are the opening and closing source revisions
 184 # for copies to tags and branches.  The format is as follows:
 185 #
 186 # SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
 187 #
 188 # Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
 189 # SVN_REVNUM are the primary and secondary sorting criteria for
 190 # creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
 191 SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
 192 # A sorted version of the above file.
 193 SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'
 194
 195 # This file is a temporary file for storing symbolic_name -> closing
 196 # CVSRevision until the end of our pass where we can look up the
 197 # corresponding SVNRevNum for the closing revs and write these out to
 198 # the SYMBOL_OPENINGS_CLOSINGS.
 199 SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'
 200
 201 # Skeleton version of an svn filesystem.
 202 # (These supersede and will eventually replace the two above.)
 203 # See class SVNRepositoryMirror for how these work.
 204 SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
 205 SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'
 206
 207 # Offsets pointing to the beginning of each SYMBOLIC_NAME in
 208 # SYMBOL_OPENINGS_CLOSINGS_SORTED
 209 SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'
 210
 211 # Maps CVSRevision.unique_key()s to lists of symbolic names, where
 212 # the CVSRevision is the last such that is a source for those symbolic
 213 # names.  For example, if branch B's number is 1.3.0.2 in this CVS
 214 # file, and this file's 1.3 is the latest (by date) revision among
 215 # *all* CVS files that is a source for branch B, then the
 216 # CVSRevision.unique_key() corresponding to this file at 1.3 would
 217 # list at least B in its list.
 218 SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'
 219
 220 # Maps CVSRevision.unique_key() to corresponding line in s-revs.
 221 ###PERF Or, we could map to an offset into s-revs, instead of dup'ing
 222 ### the s-revs data in this database.
 223 CVS_REVS_DB = 'cvs2svn-cvs-revs.db'
 224
 225 # Lists all symbolic names that are tags.  Keys are strings (symbolic
 226 # names), values are ignorable.
 227 TAGS_DB = 'cvs2svn-tags.db'
 228
# A list of all tags.  Each line consists of the tag name and the number
 230 # of files in which it exists, separated by a space.
 231 TAGS_LIST = 'cvs2svn-tags.txt'
 232
 233 # A list of all branches.  The file is stored as a plain text file
 234 # to make it easy to look at in an editor.  Each line contains the
 235 # branch name, the number of files where the branch is created, the
 236 # commit count, and a list of tags and branches that are defined on
 237 # revisions in the branch.
 238 BRANCHES_LIST = 'cvs2svn-branches.txt'
 239
 240 # These two databases provide a bidirectional mapping between
 241 # CVSRevision.unique_key()s and Subversion revision numbers.
 242 #
 243 # The first maps CVSRevision.unique_key() to a number; the values are
 244 # not unique.
 245 #
 246 # The second maps a number to a list of CVSRevision.unique_key()s.
 247 CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
 248 SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'
 249
 250 # This database maps svn_revnums to tuples of (symbolic_name, date).
 251 #
 252 # The svn_revnums are the revision numbers of all non-primary
 253 # SVNCommits.  No primary SVNCommit has a key in this database.
 254 #
 255 # The date is stored for all commits in this database.
 256 #
 257 # For commits that fill symbolic names, the symbolic_name is stored.
# For commits that are default branch syncs, the symbolic_name is None.
 259 SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'
 260
 261 # This database maps svn_revnums of a default branch synchronization
 262 # commit to the svn_revnum of the primary SVNCommit that motivated it.
 263 #
 264 # (NOTE: Secondary commits that fill branches and tags also have a
 265 # motivating commit, but we do not record it because it is (currently)
 266 # not needed for anything.)
 267 #
 268 # This mapping is used when generating the log message for the commit
 269 # that synchronizes the default branch with trunk.
 270 MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'
 271
 272 # How many bytes to read at a time from a pipe.  128 kiB should be
 273 # large enough to be efficient without wasting too much memory.
 274 PIPE_READ_SIZE = 128 * 1024
 275
 276 # Record the default RCS branches, if any, for CVS filepaths.
 277 #
 278 # The keys are CVS filepaths, relative to the top of the repository
 279 # and with the ",v" stripped off, so they match the cvs paths used in
 280 # Commit.commit().  The values are vendor branch revisions, such as
 281 # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
 282 # represents the highest vendor branch revision thought to have ever
 283 # been head of the default branch.
 284 #
 285 # The reason we record a specific vendor revision, rather than a
 286 # default branch number, is that there are two cases to handle:
 287 #
 288 # One case is simple.  The RCS file lists a default branch explicitly
 289 # in its header, such as '1.1.1'.  In this case, we know that every
 290 # revision on the vendor branch is to be treated as head of trunk at
 291 # that point in time.
 292 #
 293 # But there's also a degenerate case.  The RCS file does not currently
 294 # have a default branch, yet we can deduce that for some period in the
 295 # past it probably *did* have one.  For example, the file has vendor
 296 # revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
 297 # and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
 298 # case, we should record 1.1.1.96 as the last vendor revision to have
 299 # been the head of the default branch.
 300 DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'
 301
 302 # Records the author and log message for each changeset.
 303 # The keys are author+log digests, the same kind used to identify
 304 # unique revisions in the .revs, etc files.  Each value is a tuple
 305 # of two elements: '(author logmessage)'.
 306 METADATA_DB = "cvs2svn-metadata.db"
 307
 308 # A temporary on-disk hash that maps CVSRevision unique keys to a new
 309 # timestamp for that CVSRevision.  These new timestamps are created in
 310 # pass2, and this hash is used exclusively in pass2.
 311 TWEAKED_TIMESTAMPS_DB = "cvs2svn-fixed-timestamps.db"
 312
 313 REVS_SUFFIX = '.revs'
 314 CLEAN_REVS_SUFFIX = '.c-revs'
 315 SORTED_REVS_SUFFIX = '.s-revs'
 316 RESYNC_SUFFIX = '.resync'
 317
 318 SVN_INVALID_REVNUM = -1
 319
 320 COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs
 321
 322 # Things that can happen to a file.
 323 OP_NOOP   = '-'
 324 OP_ADD    = 'A'
 325 OP_DELETE = 'D'
 326 OP_CHANGE = 'C'
 327
 328 # A deltatext either does or doesn't represent some change.
 329 DELTATEXT_NONEMPTY = 'N'
 330 DELTATEXT_EMPTY    = 'E'
 331
 332 DIGEST_END_IDX = 9 + (sha.digestsize * 2)
 333
 334 # Constants used in SYMBOL_OPENINGS_CLOSINGS
 335 OPENING = 'O'
 336 CLOSING = 'C'
 337
 338 class FatalException(Exception):
 339   """Exception thrown on a non-recoverable error.
 340
 341   If this exception is thrown by main(), it is caught by the global
 342   layer of the program, its string representation is printed, and the
 343   program is ended with an exit code of 1."""
 344
 345   pass
 346
 347
 348 class FatalError(FatalException):
 349   """A FatalException that prepends error_prefix to the message."""
 350
 351   def __init__(self, msg):
 352     """Use (error_prefix + ': ' + MSG + '\n') as the error message."""
 353
 354     FatalException.__init__(self, '%s: %s\n' % (error_prefix, msg,))
 355
 356
 357 def temp(basename):
 358   """Return a path to BASENAME in Ctx().tmpdir.
 359   This is a convenience function to save horizontal space in source."""
 360   return os.path.join(Ctx().tmpdir, basename)
 361
 362 # Since the unofficial set also includes [/\] we need to translate those
 363 # into ones that don't conflict with Subversion limitations.
 364 def _clean_symbolic_name(name):
 365   """Return symbolic name NAME, translating characters that Subversion
 366   does not allow in a pathname."""
 367   name = name.replace('/','++')
 368   name = name.replace('\\','--')
 369   return name
 370
 371 def _path_join(*components):
 372   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 373   Empty component are skipped."""
 374   return string.join(filter(None, components), '/')
 375
 376 def _path_split(path):
 377   """Split the svn pathname PATH into a pair, (HEAD, TAIL).
 378
 379   This is similar to os.path.split(), but always uses '/' as path
 380   separator.  PATH is an svn path, which should not start with a '/'.
 381   HEAD is everything before the last slash, and TAIL is everything
 382   after.  If PATH ends in a slash, TAIL will be empty.  If there is no
 383   slash in PATH, HEAD will be empty.  If PATH is empty, both HEAD and
 384   TAIL are empty."""
 385
 386   pos = path.rfind('/')
 387   if pos == -1:
 388     return ('', path,)
 389   else:
 390     return (path[:pos], path[pos+1:],)
 391
 392 def to_utf8(value, mode='replace'):
 393   """Encode (as Unicode) VALUE, trying the encodings in Ctx.encoding
 394   as valid source encodings.  Raise UnicodeError on failure of all
 395   source encodings."""
 396   ### FIXME: The 'replace' default mode should be an option,
 397   ### like --encoding is.
 398   for encoding in Ctx().encoding:
 399     try:
 400       return unicode(value, encoding, mode).encode('utf8')
 401     except UnicodeError:
 402       Log().write(LOG_VERBOSE, "Encoding '%s' failed for string '%s'"
 403                   % (encoding, value))
 404   raise UnicodeError
 405
 406 def run_command(command):
 407   if os.system(command):
 408     raise FatalError('Command failed: "%s"' % (command,))
 409
 410
 411 class CommandFailedException(Exception):
 412   """Exception raised if check_command_runs() fails."""
 413
 414   pass
 415
 416
 417 def check_command_runs(cmd, cmdname):
 418   """Check whether the command CMD can be executed without errors.
 419
 420   CMD is a list or string, as accepted by SimplePopen.  CMDNAME is the
 421   name of the command as it should be included in exception error
 422   messages.
 423
 424   This function checks three things: (1) the command can be run
 425   without throwing an OSError; (2) it exits with status=0; (3) it
 426   doesn't output anything to stderr.  If any of these conditions is
 427   not met, raise a CommandFailedException describing the problem."""
 428
 429   try:
 430     pipe = SimplePopen(cmd, True)
 431   except OSError, e:
 432     raise CommandFailedException('error executing %s: %s' % (cmdname, e,))
 433   pipe.stdin.close()
 434   pipe.stdout.read()
 435   errmsg = pipe.stderr.read()
 436   status = pipe.wait()
 437   if status != 0 or errmsg:
 438     msg = 'error executing %s: status %s' % (cmdname, status,)
 439     if errmsg:
 440       msg += ', error output:\n%s' % (errmsg,)
 441     raise CommandFailedException(msg)
 442
 443
 444 class CVSRepository:
 445   """A CVS repository from which data can be extracted."""
 446
 447   def __init__(self, cvs_repos_path):
 448     """CVS_REPOS_PATH is the top of the CVS repository (at least as
 449     far as this run is concerned)."""
 450
 451     if not os.path.isdir(cvs_repos_path):
 452       raise FatalError("The specified CVS repository path '%s' is not an "
 453                        "existing directory." % cvs_repos_path)
 454
 455     self.cvs_repos_path = os.path.normpath(cvs_repos_path)
 456
 457   def get_cvs_path(self, fname):
 458     """Return the path to FNAME relative to cvs_repos_path, with ',v' removed.
 459
 460     FNAME is a filesystem name that has to begin (textually) with
 461     self.cvs_repos_path and end with ',v'.  Those parts will be
 462     stripped off and os.sep will be converted to '/'."""
 463
 464     if not fname.startswith(self.cvs_repos_path):
 465       raise FatalError(
 466           "get_cvs_path: '%s' is not a sub-path of '%s'"
 467           % (fname, self.cvs_repos_path,))
 468     if not fname.endswith(',v'):
 469       raise FatalError("get_cvs_path: '%s' does not end with ',v'"
 470                        % (fname,))
 471     l = len(self.cvs_repos_path)
 472     if fname[l] == os.sep:
 473       l += 1
 474     return string.replace(fname[l:-2], os.sep, '/')
 475
 476   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 477     """Return a command string, and the pipe created using that
 478     string.  C_REV is a CVSRevision.  If SUPPRESS_KEYWORD_SUBSTITUTION
 479     is True, then suppress the substitution of RCS/CVS keywords in the
 480     output.  The pipe returns the text of that CVS Revision."""
 481     raise NotImplementedError
 482
 483
 484 class CVSRepositoryViaRCS(CVSRepository):
 485   """A CVSRepository accessed via RCS."""
 486
 487   def __init__(self, cvs_repos_path):
 488     CVSRepository.__init__(self, cvs_repos_path)
 489     try:
 490       check_command_runs([ 'co', '-V' ], 'co')
 491     except CommandFailedException, e:
 492       raise FatalError('%s\n'
 493                        'Please check that co is installed and in your PATH\n'
 494                        '(it is a part of the RCS software).' % (e,))
 495
 496   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 497     pipe_cmd = [ 'co', '-q', '-x,v', '-p' + c_rev.rev ]
 498     if suppress_keyword_substitution:
 499       pipe_cmd.append('-kk')
 500     pipe_cmd.append(c_rev.rcs_path())
 501     pipe = SimplePopen(pipe_cmd, True)
 502     pipe.stdin.close()
 503     return pipe_cmd, pipe
 504
 505
 506 class CVSRepositoryViaCVS(CVSRepository):
 507   """A CVSRepository accessed via CVS."""
 508
 509   def __init__(self, cvs_repos_path):
 510     CVSRepository.__init__(self, cvs_repos_path)
 511     # Ascend above the specified root if necessary, to find the
 512     # cvs_repository_root (a directory containing a CVSROOT directory)
 513     # and the cvs_module (the path of the conversion root within the
 514     # cvs repository) NB: cvs_module must be seperated by '/' *not* by
 515     # os.sep .
 516     def is_cvs_repository_root(path):
 517       return os.path.isdir(os.path.join(path, 'CVSROOT'))
 518
 519     self.cvs_repository_root = os.path.abspath(self.cvs_repos_path)
 520     self.cvs_module = ""
 521     while not is_cvs_repository_root(self.cvs_repository_root):
 522       # Step up one directory:
 523       prev_cvs_repository_root = self.cvs_repository_root
 524       self.cvs_repository_root, module_component = \
 525           os.path.split(self.cvs_repository_root)
 526       if self.cvs_repository_root == prev_cvs_repository_root:
 527         # Hit the root (of the drive, on Windows) without finding a
 528         # CVSROOT dir.
 529         raise FatalError(
 530             "the path '%s' is not a CVS repository, nor a path "
 531             "within a CVS repository.  A CVS repository contains "
 532             "a CVSROOT directory within its root directory."
 533             % (self.cvs_repos_path,))
 534
 535       self.cvs_module = module_component + "/" + self.cvs_module
 536
 537     os.environ['CVSROOT'] = self.cvs_repository_root
 538
 539     def cvs_ok(global_arguments):
 540       check_command_runs(
 541           [ 'cvs' ] + global_arguments + [ '--version' ], 'cvs')
 542
 543     self.global_arguments = [ "-q", "-R" ]
 544     try:
 545       cvs_ok(self.global_arguments)
 546     except CommandFailedException, e:
 547       self.global_arguments = [ "-q" ]
 548       try:
 549         cvs_ok(self.global_arguments)
 550       except CommandFailedException, e:
 551         raise FatalError(
 552             '%s\n'
 553             'Please check that cvs is installed and in your PATH.' % (e,))
 554
 555   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 556     pipe_cmd = [ 'cvs' ] + self.global_arguments + \
 557                [ 'co', '-r' + c_rev.rev, '-p' ]
 558     if suppress_keyword_substitution:
 559       pipe_cmd.append('-kk')
 560     pipe_cmd.append(self.cvs_module + c_rev.cvs_path)
 561     pipe = SimplePopen(pipe_cmd, True)
 562     pipe.stdin.close()
 563     return pipe_cmd, pipe
 564
 565
 566 def generate_ignores(c_rev):
 567   # Read in props
 568   pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(c_rev)
 569   buf = pipe.stdout.read(PIPE_READ_SIZE)
 570   raw_ignore_val = ""
 571   while buf:
 572     raw_ignore_val = raw_ignore_val + buf
 573     buf = pipe.stdout.read(PIPE_READ_SIZE)
 574   pipe.stdout.close()
 575   error_output = pipe.stderr.read()
 576   exit_status = pipe.wait()
 577   if exit_status:
 578     raise FatalError("The command '%s' failed with exit status: %s\n"
 579                      "and the following output:\n"
 580                      "%s" % (pipe_cmd, exit_status, error_output))
 581
 582   # Tweak props: First, convert any spaces to newlines...
 583   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 584   raw_ignores = raw_ignore_val.split('\n')
 585   ignore_vals = [ ]
 586   for ignore in raw_ignores:
 587     # Reset the list if we encounter a '!'
 588     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 589     if ignore == '!':
 590       ignore_vals = [ ]
 591       continue
 592     # Skip empty lines
 593     if len(ignore) == 0:
 594       continue
 595     ignore_vals.append(ignore)
 596   return ignore_vals
 597
 598 # Return a string that has not been returned by gen_key() before.
 599 gen_key_base = 0L
 600 def gen_key():
 601   global gen_key_base
 602   key = '%x' % gen_key_base
 603   gen_key_base = gen_key_base + 1
 604   return key
 605
 606 # ============================================================================
 607 # This code is copied with a few modifications from:
 608 #   subversion/subversion/bindings/swig/python/svn/core.py
 609
 610 if sys.platform == "win32":
 611   _escape_shell_arg_re = re.compile(r'(\\+)(\"|$)')
 612
 613   def escape_shell_arg(arg):
 614     # The (very strange) parsing rules used by the C runtime library are
 615     # described at:
 616     # http://msdn.microsoft.com/library/en-us/vclang/html/_pluslang_Parsing_C.2b2b_.Command.2d.Line_Arguments.asp
 617
 618     # double up slashes, but only if they are followed by a quote character
 619     arg = re.sub(_escape_shell_arg_re, r'\1\1\2', arg)
 620
 621     # surround by quotes and escape quotes inside
 622     arg = '"' + string.replace(arg, '"', '"^""') + '"'
 623     return arg
 624
 625
 626   def argv_to_command_string(argv):
 627     """Flatten a list of command line arguments into a command string.
 628
 629     The resulting command string is expected to be passed to the system
 630     shell which os functions like popen() and system() invoke internally.
 631     """
 632
 633     # According cmd's usage notes (cmd /?), it parses the command line by
 634     # "seeing if the first character is a quote character and if so, stripping
 635     # the leading character and removing the last quote character."
 636     # So to prevent the argument string from being changed we add an extra set
 637     # of quotes around it here.
 638     return '"' + string.join(map(escape_shell_arg, argv), " ") + '"'
 639
 640 else:
 641   def escape_shell_arg(str):
 642     return "'" + string.replace(str, "'", "'\\''") + "'"
 643
 644   def argv_to_command_string(argv):
 645     """Flatten a list of command line arguments into a command string.
 646
 647     The resulting command string is expected to be passed to the system
 648     shell which os functions like popen() and system() invoke internally.
 649     """
 650
 651     return string.join(map(escape_shell_arg, argv), " ")
 652 # ============================================================================
 653
 654 def format_date(date):
 655   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 656   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 657   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 658
 659 def sort_file(infile, outfile):
 660   # sort the log files
 661
 662   # GNU sort will sort our dates differently (incorrectly!) if our
 663   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 664   # it to 'C'
 665   lc_all_tmp = os.environ.get('LC_ALL', None)
 666   os.environ['LC_ALL'] = 'C'
 667   # The -T option to sort has a nice side effect.  The Win32 sort is
 668   # case insensitive and cannot be used, and since it does not
 669   # understand the -T option and dies if we try to use it, there is
 670   # no risk that we use that sort by accident.
 671   run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 672   if lc_all_tmp is None:
 673     del os.environ['LC_ALL']
 674   else:
 675     os.environ['LC_ALL'] = lc_all_tmp
 676
 677 def match_regexp_list(regexp_list, string):
 678   """Test whether STRING matches any of the compiled regexps in
 679   REGEXP_LIST."""
 680   for regexp in regexp_list:
 681     if regexp.match(string):
 682       return True
 683   return False
 684
 685 class LF_EOL_Filter:
 686   """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
 687   into LFs only."""
 688   def __init__(self, stream):
 689     self.stream = stream
 690     self.carry_cr = False
 691     self.eof = False
 692
 693   def read(self, size):
 694     while True:
 695       buf = self.stream.read(size)
 696       self.eof = len(buf) == 0
 697       if self.carry_cr:
 698         buf = '\r' + buf
 699         self.carry_cr = False
 700       if not self.eof and buf[-1] == '\r':
 701         self.carry_cr = True
 702         buf = buf[:-1]
 703       buf = string.replace(buf, '\r\n', '\n')
 704       buf = string.replace(buf, '\r', '\n')
 705       if len(buf) > 0 or self.eof:
 706         return buf
 707
 708
 709 # These constants represent the log levels that this script supports
 710 LOG_WARN = -1
 711 LOG_QUIET = 0
 712 LOG_NORMAL = 1
 713 LOG_VERBOSE = 2
 714 class Log:
 715   """A Simple logging facility.  Each line will be timestamped is
 716   self.use_timestamps is TRUE.  This class is a Borg, see
 717   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 718   __shared_state = {}
 719   def __init__(self):
 720     self.__dict__ = self.__shared_state
 721     if self.__dict__:
 722       return
 723     self.log_level = LOG_NORMAL
 724     # Set this to true if you want to see timestamps on each line output.
 725     self.use_timestamps = None
 726     self.logger = sys.stdout
 727
 728   def _timestamp(self):
 729     """Output a detailed timestamp at the beginning of each line output."""
 730     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 731
 732   def write(self, log_level, *args):
 733     """This is the public method to use for writing to a file.  Only
 734     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 735     there are multiple ARGS, they will be separated by a space."""
 736     if log_level > self.log_level:
 737       return
 738     if self.use_timestamps:
 739       self._timestamp()
 740     self.logger.write(' '.join(map(str,args)) + "\n")
 741     # Ensure that log output doesn't get out-of-order with respect to
 742     # stderr output.
 743     self.logger.flush()
 744
 745
 746 class Cleanup:
 747   """This singleton class manages any files created by cvs2svn.  When
 748   you first create a file, call Cleanup.register, passing the
 749   filename, and the last pass that you need the file.  After the end
 750   of that pass, your file will be cleaned up after running an optional
 751   callback.  This class is a Borg, see
 752   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 753
 754   __shared_state = {}
 755   def __init__(self):
 756     self.__dict__ = self.__shared_state
 757     if self.__dict__:
 758       return
 759     self._log = {}
 760     self._callbacks = {}
 761
 762   def register(self, file, which_pass, callback=None):
 763     """Register FILE for cleanup at the end of WHICH_PASS, running
 764     function CALLBACK prior to removal.  Registering a given FILE is
 765     idempotent; you may register as many times as you wish, but it
 766     will only be cleaned up once.
 767
 768     Note that if a file is registered multiple times, only the first
 769     callback registered for that file will be called at cleanup
 770     time.  Also note that if you register a database file you must
 771     close the database before cleanup, e.g. using a callback."""
 772     self._log.setdefault(which_pass, {})[file] = 1
 773     if callback and not self._callbacks.has_key(file):
 774       self._callbacks[file] = callback
 775
 776   def cleanup(self, which_pass):
 777     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 778     if not self._log.has_key(which_pass):
 779       return
 780     for file in self._log[which_pass].keys():
 781       Log().write(LOG_VERBOSE, "Deleting", file)
 782       if self._callbacks.has_key(file):
 783         self._callbacks[file]()
 784       os.unlink(file)
 785
 786
# Always use these constants for opening databases.  They are the
# MODE values handed to the database classes below, which pass them
# on to anydbm.open().
DB_OPEN_READ = 'r'
DB_OPEN_NEW = 'n'  # "new"; see AbstractDatabase.__init__ for a caveat.
 790
 791
 792 class AbstractDatabase:
 793   """An abstract base class for anydbm-based databases."""
 794
 795   def __init__(self, filename, mode):
 796     """A convenience function for opening an anydbm database."""
 797     # pybsddb3 has a bug which prevents it from working with
 798     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 799     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 800     # for databases protected by lock and transaction support
 801     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 802     #
 803     # Therefore, manually perform the removal (we can do this, because
 804     # we know that for bsddb - but *not* anydbm in general - the database
 805     # consists of one file with the name we specify, rather than several
 806     # based on that name).
 807     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 808       if os.path.isfile(filename):
 809         os.unlink(filename)
 810       mode = 'c'
 811
 812     self.db = anydbm.open(filename, mode)
 813     self.has_key = self.db.has_key
 814     self.__delitem__ = self.db.__delitem__
 815
 816   def get(self, key, default=None):
 817     """bsddb3 doesn't have a get() method, so define one here."""
 818
 819     try:
 820       return self[key]
 821     except KeyError:
 822       return default
 823
 824
class SDatabase(AbstractDatabase):
  """A database that can only store strings."""

  # Values are passed to and from the underlying database (self.db)
  # unchanged, with no serialization step.

  def __getitem__(self, key):
    """Return the value stored under KEY in the underlying database."""
    return self.db[key]

  def __setitem__(self, key, value):
    """Store VALUE under KEY in the underlying database, as is."""
    self.db[key] = value
 833
 834
class Database(AbstractDatabase):
  """A database that uses the marshal module to store built-in types."""

  # Values are serialized with marshal.dumps() on the way in and
  # restored with marshal.loads() on the way out.

  def __getitem__(self, key):
    """Return the unmarshalled value stored under KEY."""
    return marshal.loads(self.db[key])

  def __setitem__(self, key, value):
    """Marshal VALUE and store the result under KEY."""
    self.db[key] = marshal.dumps(value)
 843
 844
 845 class StatsKeeper:
 846   __shared_state = { }
 847   def __init__(self):
 848     self.__dict__ = self.__shared_state
 849     if self.__dict__:
 850       return
 851     self.filename = temp(STATISTICS_FILE)
 852     Cleanup().register(self.filename, pass8)
 853     # This can get kinda large, so we don't store it in our data dict.
 854     self.repos_files = { }
 855
 856     if os.path.exists(self.filename):
 857       self.unarchive()
 858     else:
 859       self.data = { 'cvs_revs_count' : 0,
 860                     'tags': { },
 861                     'branches' : { },
 862                     'repos_size' : 0,
 863                     'repos_file_count' : 0,
 864                     'svn_rev_count' : None,
 865                     'first_rev_date' : 1L<<32,
 866                     'last_rev_date' : 0,
 867                     'pass_timings' : { },
 868                     'start_time' : 0,
 869                     'end_time' : 0,
 870                     }
 871
 872   def log_duration_for_pass(self, duration, pass_num):
 873     self.data['pass_timings'][pass_num] = duration
 874
 875   def set_start_time(self, start):
 876     self.data['start_time'] = start
 877
 878   def set_end_time(self, end):
 879     self.data['end_time'] = end
 880
 881   def _bump_item(self, key, amount=1):
 882     self.data[key] = self.data[key] + amount
 883
 884   def reset_c_rev_info(self):
 885     self.data['cvs_revs_count'] = 0
 886     self.data['tags'] = { }
 887     self.data['branches'] = { }
 888
 889   def record_c_rev(self, c_rev):
 890     self._bump_item('cvs_revs_count')
 891
 892     for tag in c_rev.tags:
 893       self.data['tags'][tag] = None
 894     for branch in c_rev.branches:
 895       self.data['branches'][branch] = None
 896
 897     if c_rev.timestamp < self.data['first_rev_date']:
 898       self.data['first_rev_date'] = c_rev.timestamp
 899
 900     if c_rev.timestamp > self.data['last_rev_date']:
 901       self.data['last_rev_date'] = c_rev.timestamp
 902
 903     # Only add the size if this is the first time we see the file.
 904     if not self.repos_files.has_key(c_rev.fname):
 905       self._bump_item('repos_size', c_rev.file_size)
 906     self.repos_files[c_rev.fname] = None
 907
 908     self.data['repos_file_count'] = len(self.repos_files)
 909
 910   def set_svn_rev_count(self, count):
 911     self.data['svn_rev_count'] = count
 912
 913   def svn_rev_count(self):
 914     return self.data['svn_rev_count']
 915
 916   def archive(self):
 917     open(self.filename, 'w').write(marshal.dumps(self.data))
 918
 919   def unarchive(self):
 920     self.data = marshal.loads(open(self.filename, 'r').read())
 921
 922   def __str__(self):
 923     svn_revs_str = ""
 924     if self.data['svn_rev_count'] is not None:
 925       svn_revs_str = ('Total SVN Commits:      %10s\n'
 926                       % self.data['svn_rev_count'])
 927
 928     return ('\n'                                \
 929             'cvs2svn Statistics:\n'             \
 930             '------------------\n'              \
 931             'Total CVS Files:        %10i\n'    \
 932             'Total CVS Revisions:    %10i\n'    \
 933             'Total Unique Tags:      %10i\n'    \
 934             'Total Unique Branches:  %10i\n'    \
 935             'CVS Repos Size in KB:   %10i\n'    \
 936             '%s'                                \
 937             'First Revision Date:    %s\n'      \
 938             'Last Revision Date:     %s\n'      \
 939             '------------------'                \
 940             % (self.data['repos_file_count'],
 941                self.data['cvs_revs_count'],
 942                len(self.data['tags']),
 943                len(self.data['branches']),
 944                (self.data['repos_size'] / 1024),
 945                svn_revs_str,
 946                time.ctime(self.data['first_rev_date']),
 947                time.ctime(self.data['last_rev_date']),
 948                ))
 949
 950   def timings(self):
 951     passes = self.data['pass_timings'].keys()
 952     passes.sort()
 953     str = 'Timings:\n------------------\n'
 954
 955     def desc(val):
 956       if val == 1: return "second"
 957       return "seconds"
 958
 959     for pass_num in passes:
 960       duration = int(self.data['pass_timings'][pass_num])
 961       p_str = ('pass %d:%6d %s\n'
 962                % (pass_num, duration, desc(duration)))
 963       str = str + p_str
 964
 965     total = int(self.data['end_time'] - self.data['start_time'])
 966     str = str + ('total: %6d %s' % (total, desc(total)))
 967     return str
 968
 969
 970 class LastSymbolicNameDatabase:
 971   """ Passing every CVSRevision in s-revs to this class will result in
 972   a Database whose key is the last CVS Revision a symbolicname was
 973   seen in, and whose value is a list of all symbolicnames that were
 974   last seen in that revision."""
 975   def __init__(self, mode):
 976     self.symbols = {}
 977     self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
 978     Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
 979
 980   # Once we've gone through all the revs,
 981   # symbols.keys() will be a list of all tags and branches, and
 982   # their corresponding values will be a key into the last CVS revision
 983   # that they were used in.
 984   def log_revision(self, c_rev):
 985     # Gather last CVS Revision for symbolic name info and tag info
 986     for tag in c_rev.tags:
 987       self.symbols[tag] = c_rev.unique_key()
 988     if c_rev.op is not OP_DELETE:
 989       for branch in c_rev.branches:
 990         self.symbols[branch] = c_rev.unique_key()
 991
 992   # Creates an inversion of symbols above--a dictionary of lists (key
 993   # = CVS rev unique_key: val = list of symbols that close in that
 994   # rev.
 995   def create_database(self):
 996     for sym, rev_unique_key in self.symbols.items():
 997       ary = self.symbol_revs_db.get(rev_unique_key, [])
 998       ary.append(sym)
 999       self.symbol_revs_db[rev_unique_key] = ary
1000
1001
class CVSRevisionDatabase:
  """Persistent store of CVSRevision objects, indexed by their
  unique_key()."""

  def __init__(self, mode):
    """Open the revisions database in MODE (as accepted by Database
    or anydbm.open()) and register its file for cleanup."""
    db_file = temp(CVS_REVS_DB)
    self.cvs_revs_db = SDatabase(db_file, mode)
    Cleanup().register(db_file, pass8)

  def log_revision(self, c_rev):
    """Store the CVSRevision C_REV, in its string form, under its
    unique_key()."""
    serialized = str(c_rev)
    self.cvs_revs_db[c_rev.unique_key()] = serialized

  def get_revision(self, unique_key):
    """Rebuild and return the CVSRevision that was stored under
    UNIQUE_KEY."""
    ctx = Ctx()
    return CVSRevision(ctx, self.cvs_revs_db[unique_key])
1019
1020
def TagsDatabase(mode):
  """Open, in MODE, the database recording which symbolic names are
  tags, and return it.

  Each key is a tag name.  The value has no meaning, and should be
  set to None.  The database file is registered for cleanup after
  pass8."""
  tags_db_file = temp(TAGS_DB)
  tags_db = SDatabase(tags_db_file, mode)
  Cleanup().register(tags_db_file, pass8)
  return tags_db
1028
1029
class Project:
  """A project within a CVS repository."""

  def __init__(self, project_cvs_repos_path,
               trunk_path, branches_path, tags_path):
    """Create a new Project record.

    PROJECT_CVS_REPOS_PATH is the main CVS directory for this project
    (within the filesystem).  TRUNK_PATH, BRANCHES_PATH, and TAGS_PATH
    are the full, normalized directory names in svn for the
    corresponding part of the repository.

    Raise FatalError if PROJECT_CVS_REPOS_PATH does not start with the
    path of the CVS repository."""

    self.project_cvs_repos_path = project_cvs_repos_path
    prefix = Ctx().cvs_repository.cvs_repos_path
    if not self.project_cvs_repos_path.startswith(prefix):
      raise FatalError("Project '%s' must start with '%s'"
                       % (self.project_cvs_repos_path, prefix,))
    # The project's main directory as a cvs_path:
    self.project_cvs_path = self.project_cvs_repos_path[len(prefix):]
    if self.project_cvs_path.startswith(os.sep):
      self.project_cvs_path = self.project_cvs_path[1:]
    self.trunk_path = trunk_path
    self.branches_path = branches_path
    self.tags_path = tags_path
    verify_paths_disjoint(self.trunk_path, self.branches_path, self.tags_path)

  def is_source(self, svn_path):
    """Return True iff SVN_PATH is a legitimate source for this project.

    Legitimate paths are self.trunk_path or any directory directly
    under self.branches_path."""

    if svn_path == self.trunk_path:
      return True

    # Only the parent directory matters here.
    head = _path_split(svn_path)[0]
    if head == self.branches_path:
      return True

    return False

  def is_unremovable(self, svn_path):
    """Return True iff the specified path must not be removed."""

    return svn_path in [self.trunk_path, self.branches_path, self.tags_path]

  def relative_name(self, fname):
    """Return the path to FNAME relative to project_cvs_repos_path,
    with ',v' removed.

    FNAME is a filesystem name that has to begin (textually) with
    self.project_cvs_repos_path and end with ',v'.  Remove both prefix
    and suffix, and convert os.sep into '/'.

    Raise FatalError if FNAME lacks the prefix or the suffix."""

    prefix = self.project_cvs_repos_path
    if not fname.startswith(prefix):
      raise FatalError(
          "relative_name: '%s' is not a sub-path of '%s'"
          % (fname, prefix,))
    if not fname.endswith(',v'):
      raise FatalError("relative_name: '%s' does not end with ',v'"
                       % (fname,))
    start = len(prefix)
    # Skip the separator that follows the prefix, if there is one.
    # Slicing (rather than indexing) cannot raise IndexError when
    # nothing follows the prefix.
    if fname[start:start + 1] == os.sep:
      start += 1
    return fname[start:-2].replace(os.sep, '/')

  def get_branch_path(self, branch_name):
    """Return the svnpath for the branch named BRANCH_NAME."""

    return _path_join(self.branches_path, _clean_symbolic_name(branch_name))

  def get_tag_path(self, tag_name):
    """Return the svnpath for the tag named TAG_NAME."""

    return _path_join(self.tags_path, _clean_symbolic_name(tag_name))

  def make_trunk_path(self, path):
    """Return the trunk path for PATH.

    PATH is a relative name (relative to project_cvs_repos_path).
    Return the svn path for this file on trunk."""

    return _path_join(self.trunk_path, path)

  def make_branch_path(self, branch_name, path):
    """Return the branch path for PATH on the branch with name BRANCH_NAME.

    PATH is a relative name (relative to project_cvs_repos_path).
    Return the svn path for this file on the specified branch."""

    return _path_join(self.get_branch_path(branch_name), path)
1121
1122
class CVSRevision:
  """One revision of one RCS file, together with the metadata needed
  to convert it.  An instance can be built from its individual fields
  or from the single-line string form produced by __str__()."""

  def __init__(self, ctx, *args):
    """Initialize a new CVSRevision with Ctx object CTX, and ARGS.

    If CTX is None, the following members and methods of the
    instantiated CVSRevision class object will be unavailable (or
    simply will not work correctly, if at all):
       cvs_path
       svn_path
       is_default_branch_revision()

    (Note that this class treats CTX as const, because the caller
    likely passed in a Borg instance of a Ctx.  The reason this class
    takes CTX as a parameter, instead of just instantiating a Ctx
    itself, is that this class should be usable outside cvs2svn.)

    If there is one argument in ARGS, it is a string, in the format of
    a line from a revs file.  Do *not* include a trailing newline.

    If there are multiple ARGS, there must be 17 of them,
    comprising a parsed revs line:
       timestamp       -->  (int) date stamp for this cvs revision
       digest          -->  (string) digest of author+logmsg
       prev_timestamp  -->  (int) date stamp for the previous cvs revision
       next_timestamp  -->  (int) date stamp for the next cvs revision
       op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
       prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
       rev             -->  (string) this CVS rev, e.g., "1.3"
       next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
       file_in_attic   -->  (char or None) true if RCS file is in Attic
       file_executable -->  (char or None) true if RCS file has exec bit set.
       file_size       -->  (int) size of the RCS file
       deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
       fname           -->  (string) relative path of file in CVS repos
       mode            -->  (string or None) "kkv", "kb", etc.
       branch_name     -->  (string or None) branch on which this rev occurred
       tags            -->  (list of strings) all tags on this revision
       branches        -->  (list of strings) all branches rooted in this rev

    The two forms of initialization are equivalent.

    Raise TypeError if ARGS has any other length.

    WARNING: Due to the resync process in pass2, prev_timestamp or
    next_timestamp may be incorrect in the c-revs or s-revs files."""

    self._ctx = ctx
    if len(args) == 17:
      (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
       self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
       self.file_executable, self.file_size, self.deltatext_code,
       self.fname,
       self.mode, self.branch_name, self.tags, self.branches) = args
    elif len(args) == 1:
      # The first 15 fields are separated by single spaces; what
      # follows (tags, branches and the filename) is of variable
      # length and is picked apart further down.
      data = args[0].split(' ', 15)
      (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
       self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
       self.file_executable, self.file_size, self.deltatext_code,
       self.mode, self.branch_name, numtags, remainder) = data
      # Patch up data items which are not simple strings.  A "*"
      # stands for a missing value (see __str__).
      self.timestamp = int(self.timestamp, 16)
      if self.prev_timestamp == "*":
        self.prev_timestamp = 0
      else:
        self.prev_timestamp = int(self.prev_timestamp)
      if self.next_timestamp == "*":
        self.next_timestamp = 0
      else:
        self.next_timestamp = int(self.next_timestamp)
      if self.prev_rev == "*":
        self.prev_rev = None
      if self.next_rev == "*":
        self.next_rev = None
      if self.file_in_attic == "*":
        self.file_in_attic = None
      if self.file_executable == "*":
        self.file_executable = None
      self.file_size = int(self.file_size)
      if self.mode == "*":
        self.mode = None
      if self.branch_name == "*":
        self.branch_name = None
      # REMAINDER is "<tag>... <numbranches> <branch>... <fname>".
      # Each split is limited to a known number of fields, so spaces
      # inside the filename (which comes last) are preserved.
      numtags = int(numtags)
      tags_and_numbranches_and_remainder = remainder.split(' ', numtags + 1)
      self.tags = tags_and_numbranches_and_remainder[:-2]
      numbranches = int(tags_and_numbranches_and_remainder[-2])
      remainder = tags_and_numbranches_and_remainder[-1]
      branches_and_fname = remainder.split(' ', numbranches)
      self.branches = branches_and_fname[:-1]
      self.fname = branches_and_fname[-1]
    else:
      raise TypeError('CVSRevision() takes 2 or 18 arguments (%d given)'
                      % (len(args) + 1))
    if ctx is not None:
      self.cvs_path = ctx.cvs_repository.get_cvs_path(self.fname)
      rel_name = ctx.project.relative_name(self.fname)
      if self.branch_name:
        self.svn_path = ctx.project.make_branch_path(
            self.branch_name, rel_name)
      else:
        self.svn_path = ctx.project.make_trunk_path(rel_name)

  # The 'primary key' of a CVS Revision is the revision number + the
  # filename.  To provide a unique key (say, for a dict), we just glom
  # them together in a string.  By passing in self.prev_rev or
  # self.next_rev, you can get the unique key for their respective
  # CVSRevisions.
  def unique_key(self, revnum="0"):
    """Return the unique key of revision REVNUM of this file, REVNUM
    defaulting to this revision.  Return None if REVNUM is None."""
    # "0" is the 'use self.rev' marker.  It is compared by value: an
    # identity test against a string literal only works by accident
    # of string interning.
    if revnum == "0":
      revnum = self.rev
    elif revnum is None:
      return None
    return revnum + "/" + self.fname

  def __str__(self):
    """Return this revision as a revs-file line (without a trailing
    newline), in the format that __init__ accepts as its single
    string argument.  Missing values are written as "*"."""
    return ('%08lx %s %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s'
            % (self.timestamp, self.digest, self.prev_timestamp or "*",
              self.next_timestamp or "*", self.op, (self.prev_rev or "*"),
              self.rev, (self.next_rev or "*"), (self.file_in_attic or "*"),
              (self.file_executable or "*"),
              self.file_size,
              self.deltatext_code, (self.mode or "*"),
              (self.branch_name or "*"),
              len(self.tags), self.tags and " " or "", " ".join(self.tags),
              len(self.branches), self.branches and " " or "",
              " ".join(self.branches),
              self.fname, ))

  # Returns true if this CVSRevision is the opening CVSRevision for
  # NAME (for this RCS file).
  def opens_symbolic_name(self, name):
    """Return 1 if this revision opens the symbolic name NAME, else 0."""
    if name in self.tags:
      return 1
    if name in self.branches:
      # If this c_rev opens a branch and our op is OP_DELETE, then
      # that means that the file that this c_rev belongs to was
      # created on the branch, so for all intents and purposes, this
      # c_rev is *technically* not an opening.  See Issue #62 for more
      # information.
      if self.op != OP_DELETE:
        return 1
    return 0

  def is_default_branch_revision(self):
    """Return 1 if SELF.rev of SELF.cvs_path is a default branch
    revision according to DEFAULT_BRANCHES_DB (see the conditions
    documented there), else return None."""
    val = self._ctx._default_branches_db.get(self.cvs_path, None)
    if val is not None:
      # Split both revision numbers into "branch" and "last component".
      val_last_dot = val.rindex(".")
      our_last_dot = self.rev.rindex(".")
      default_branch = val[:val_last_dot]
      our_branch = self.rev[:our_last_dot]
      default_rev_component = int(val[val_last_dot + 1:])
      our_rev_component = int(self.rev[our_last_dot + 1:])
      if (default_branch == our_branch
          and our_rev_component <= default_rev_component):
        return 1
    # else
    return None

  def rcs_path(self):
    """Returns the actual filesystem path to the RCS file of this
    CVSRevision."""
    if self.file_in_attic is None:
      return self.fname
    else:
      basepath, filename = os.path.split(self.fname)
      return os.path.join(basepath, 'Attic', filename)

  def filename(self):
    "Return the last path component of self.fname, minus the ',v'"
    return os.path.split(self.fname)[-1][:-2]
1294
class SymbolDatabase:
  """This database records information on all symbols in the RCS
  files.  It is created in pass 1 and it is used in pass 2."""
  def __init__(self):
    # A hash that maps tag names to commit counts
    self.tags = { }
    # A hash that maps branch names to lists of the format
    # [ create_count, commit_count, blockers ], where blockers
    # is a hash that lists the symbols that depend on the
    # the branch.  The blockers hash is used as a set, so the
    # values are not used.
    self.branches = { }

  def register_tag_creation(self, name):
    """Register the creation of the tag NAME."""
    self.tags[name] = self.tags.get(name, 0) + 1

  def _branch(self, name):
    """Helper function to get a branch node that will create and
    initialize the node if it does not exist."""
    if name not in self.branches:
      self.branches[name] = [ 0, 0, { } ]
    return self.branches[name]

  def register_branch_creation(self, name):
    """Register the creation of the branch NAME."""
    self._branch(name)[0] += 1

  def register_branch_commit(self, name):
    """Register a commit on the branch NAME."""
    self._branch(name)[1] += 1

  def register_branch_blocker(self, name, blocker):
    """Register BLOCKER as a blocker on the branch NAME."""
    self._branch(name)[2][blocker] = None

  def branch_has_commit(self, name):
    """Return non-zero if NAME has commits.  Returns a false value if
    NAME is not a branch or if it has no commits."""
    return name in self.branches and self.branches[name][1]

  def find_excluded_symbols(self, regexp_list):
    """Returns a hash of all symbols that match the regexps in
    REGEXP_LIST.  The hash is used as a set so the values are
    not used."""
    excludes = { }
    for tag in self.tags.keys():
      if match_regexp_list(regexp_list, tag):
        excludes[tag] = None
    for branch in self.branches.keys():
      if match_regexp_list(regexp_list, branch):
        excludes[branch] = None
    return excludes

  def find_branch_exclude_blockers(self, branch, excludes):
    """Return a hash (used as a set) of the blockers of BRANCH that
    are not themselves in the hash EXCLUDES.  If BRANCH is not in
    EXCLUDES, the result is always empty."""
    blockers = { }
    if branch in excludes:
      for blocker in self.branches[branch][2]:
        if blocker not in excludes:
          blockers[blocker] = None
    return blockers

  def find_blocked_excludes(self, excludes):
    """Find all branches in EXCLUDES that have blocking symbols that
    are not themselves excluded.  Return a hash that maps branch names
    to a hash of blockers.  The hash of blockers is used as a set so
    the values are not used."""
    blocked_branches = { }
    for branch in self.branches.keys():
      blockers = self.find_branch_exclude_blockers(branch, excludes)
      if blockers:
        blocked_branches[branch] = blockers
    return blocked_branches

  def find_mismatches(self, excludes=None):
    """Find all symbols that are defined as both tags and branches,
    excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
    the symbol name, tag count, branch count and commit count."""
    if excludes is None:
      excludes = { }
    mismatches = [ ]
    for branch in self.branches.keys():
      if branch not in excludes and branch in self.tags:
        mismatches.append((branch,                    # name
                           self.tags[branch],         # tag count
                           self.branches[branch][0],  # branch count
                           self.branches[branch][1])) # commit count
    return mismatches

  def read(self):
    """Read the symbol database from files."""
    # Each line of the tags file is "<tag> <count>".
    f = open(temp(TAGS_LIST))
    try:
      for line in f:
        tag, count = line.split()
        self.tags[tag] = int(count)
    finally:
      f.close()

    # Each line of the branches file is
    # "<branch> <create_count> <commit_count> [<blocker> ...]".
    f = open(temp(BRANCHES_LIST))
    try:
      for line in f:
        words = line.split()
        blockers = { }
        for blocker in words[3:]:
          blockers[blocker] = None
        self.branches[words[0]] = [ int(words[1]), int(words[2]), blockers ]
    finally:
      f.close()

  def write(self):
    """Store the symbol database to files."""
    # The files are closed explicitly so that the data is on disk when
    # this method returns, without relying on garbage collection.
    f = open(temp(TAGS_LIST), "w")
    try:
      Cleanup().register(temp(TAGS_LIST), pass2)
      for tag, count in self.tags.items():
        f.write("%s %d\n" % (tag, count))
    finally:
      f.close()

    f = open(temp(BRANCHES_LIST), "w")
    try:
      Cleanup().register(temp(BRANCHES_LIST), pass2)
      for branch, info in self.branches.items():
        f.write("%s %d %d" % (branch, info[0], info[1]))
        if info[2]:
          f.write(" ")
          f.write(" ".join(info[2].keys()))
        f.write("\n")
    finally:
      f.close()
1421
1422 class CollectData(cvs2svn_rcsparse.Sink):
1423   def __init__(self):
1424     self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
1425     Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
1426     self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
1427     Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
1428     self.default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
1429                                          DB_OPEN_NEW)
1430     Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
1431     self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
1432     Cleanup().register(temp(METADATA_DB), pass8)
1433     self.fatal_errors = []
1434     self.num_files = 0
1435     self.symbol_db = SymbolDatabase()
1436
1437     # 1 if we've collected data for at least one file, None otherwise.
1438     self.found_valid_file = None
1439
1440     # See set_fname() for initializations of other variables.
1441
1442   def set_fname(self, canonical_name, filename):
1443     """Prepare to receive data for FILENAME.  FILENAME is the absolute
1444     filesystem path to the file in question, and CANONICAL_NAME is
1445     FILENAME with the 'Attic' component removed (if the file is indeed
1446     in the Attic) ."""
1447     self.fname = canonical_name
1448
1449     # We calculate and save some file metadata here, where we can do
1450     # it only once per file, instead of waiting until later where we
1451     # would have to do the same calculations once per CVS *revision*.
1452
1453     self.cvs_path = Ctx().cvs_repository.get_cvs_path(self.fname)
1454
1455     # If the paths are not the same, then that means that the
1456     # canonical_name has had the 'Attic' component stripped out.
1457     self.file_in_attic = None
1458     if canonical_name != filename:
1459       self.file_in_attic = 1
1460
1461     file_stat = os.stat(filename)
1462     # The size of our file in bytes
1463     self.file_size = file_stat[stat.ST_SIZE]
1464
1465     # Whether or not the executable bit is set.
1466     self.file_executable = None
1467     if file_stat[0] & stat.S_IXUSR:
1468       self.file_executable = 1
1469
1470     # revision -> [timestamp, author, old-timestamp]
1471     self.rev_data = { }
1472
1473     # Maps revision number (key) to the revision number of the
1474     # previous revision along this line of development.
1475     #
1476     # For the first revision R on a branch, we consider the revision
1477     # from which R sprouted to be the 'previous'.
1478     #
1479     # Note that this revision can't be determined arithmetically (due
1480     # to cvsadmin -o, which is why this is necessary).
1481     #
1482     # If the key has no previous revision, then store None as key's
1483     # value.
1484     self.prev_rev = { }
1485
1486     # This dict is essentially self.prev_rev with the values mapped in
1487     # the other direction, so following key -> value will yield you
1488     # the next revision number.
1489     #
1490     # Unlike self.prev_rev, if the key has no next revision, then the
1491     # key is not present.
1492     self.next_rev = { }
1493
1494     # Track the state of each revision so that in set_revision_info,
1495     # we can determine if our op is an add/change/delete.  We can do
1496     # this because in set_revision_info, we'll have all of the
1497     # revisions for a file at our fingertips, and we need to examine
1498     # the state of our prev_rev to determine if we're an add or a
1499     # change--without the state of the prev_rev, we are unable to
1500     # distinguish between an add and a change.
1501     self.rev_state = { }
1502
1503     # Hash mapping branch numbers, like '1.7.2', to branch names,
1504     # like 'Release_1_0_dev'.
1505     self.branch_names = { }
1506
1507     # RCS flags (used for keyword expansion).
1508     self.mode = None
1509
1510     # Hash mapping revision numbers, like '1.7', to lists of names
1511     # indicating which branches sprout from that revision, like
1512     # ['Release_1_0_dev', 'experimental_driver', ...].
1513     self.branchlist = { }
1514
1515     # Like self.branchlist, but the values are lists of tag names that
1516     # apply to the key revision.
1517     self.taglist = { }
1518
1519     # If set, this is an RCS branch number -- rcsparse calls this the
1520     # "principal branch", but CVS and RCS refer to it as the "default
1521     # branch", so that's what we call it, even though the rcsparse API
1522     # setter method is still 'set_principal_branch'.
1523     self.default_branch = None
1524
1525     # If the RCS file doesn't have a default branch anymore, but does
1526     # have vendor revisions, then we make an educated guess that those
1527     # revisions *were* the head of the default branch up until the
1528     # commit of 1.2, at which point the file's default branch became
1529     # trunk.  This records the date at which 1.2 was committed.
1530     self.first_non_vendor_revision_date = None
1531
1532     # A list of all symbols defined for the current file.  Used to
1533     # prevent multiple definitions of a symbol, something which can
1534     # easily happen when --symbol-transform is used.
1535     self.defined_symbols = { }
1536
  def set_principal_branch(self, branch):
    """Record BRANCH as the default branch of the current file.

    The parser API calls this the "principal branch"; the value is
    stored in self.default_branch."""
    self.default_branch = branch
1539
  def set_expansion(self, mode):
    """Record MODE, the RCS flags used for keyword expansion, in
    self.mode."""
    self.mode = mode
1542
1543   def set_branch_name(self, branch_number, name):
1544     """Record that BRANCH_NUMBER is the branch number for branch NAME,
1545     and that NAME sprouts from BRANCH_NUMBER .
1546     BRANCH_NUMBER is an RCS branch number with an odd number of components,
1547     for example '1.7.2' (never '1.7.0.2')."""
1548     if not self.branch_names.has_key(branch_number):
1549       self.branch_names[branch_number] = name
1550       # The branchlist is keyed on the revision number from which the
1551       # branch sprouts, so strip off the odd final component.
1552       sprout_rev = branch_number[:branch_number.rfind(".")]
1553       self.branchlist.setdefault(sprout_rev, []).append(name)
1554       self.symbol_db.register_branch_creation(name)
1555     else:
1556       sys.stderr.write("%s: in '%s':\n"
1557                        "   branch '%s' already has name '%s',\n"
1558                        "   cannot also have name '%s', ignoring the latter\n"
1559                        % (warning_prefix, self.fname, branch_number,
1560                           self.branch_names[branch_number], name))
1561
1562   def rev_to_branch_name(self, revision):
1563     """Return the name of the branch on which REVISION lies.
1564     REVISION is a non-branch revision number with an even number of,
1565     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1566     For the convenience of callers, REVISION can also be a trunk
1567     revision such as '1.2', in which case just return None."""
1568     if trunk_rev.match(revision):
1569       return None
1570     return self.branch_names.get(revision[:revision.rindex(".")])
1571
1572   def add_cvs_branch(self, revision, branch_name):
1573     """Record the root revision and branch revision for BRANCH_NAME,
1574     based on REVISION.  REVISION is a CVS branch number having an even
1575     number of components where the second-to-last is '0'.  For
1576     example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
1577     from 1.7 and has branch number 1.7.2."""
1578     last_dot = revision.rfind(".")
1579     branch_rev = revision[:last_dot]
1580     last2_dot = branch_rev.rfind(".")
1581     branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
1582     self.set_branch_name(branch_rev, branch_name)
1583
1584   def define_tag(self, name, revision):
1585     """Record a bidirectional mapping between symbolic NAME and REVISION.
1586     REVISION is an unprocessed revision number from the RCS file's
1587     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
1588     This function will determine what kind of symbolic name it is by
1589     inspection, and record it in the right places."""
1590     for (pattern, replacement) in Ctx().symbol_transforms:
1591       newname = pattern.sub(replacement, name)
1592       if newname != name:
1593         Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
1594                     % (name, newname))
1595         name = newname
1596     if self.defined_symbols.has_key(name):
1597       err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
1598                 % (error_prefix, name, self.fname)
1599       sys.stderr.write(err + "\n")
1600       self.fatal_errors.append(err)
1601     self.defined_symbols[name] = None
1602     if branch_tag.match(revision):
1603       self.add_cvs_branch(revision, name)
1604     elif vendor_tag.match(revision):
1605       self.set_branch_name(revision, name)
1606     else:
1607       self.taglist.setdefault(revision, []).append(name)
1608       self.symbol_db.register_tag_creation(name)
1609
  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    """Record the metadata of one revision of the current file.

    REVISION is the revision number, TIMESTAMP its commit time (stored
    as an int in self.rev_data), AUTHOR its author and STATE its RCS
    state.  BRANCHES is a sequence of revision numbers, each of which
    gets REVISION recorded as its previous revision.  NEXT is the RCS
    'next' revision number, or a false value if there is none.

    This updates self.rev_state, self.rev_data, self.prev_rev and
    self.next_rev, may update self.default_branches_db and
    self.first_non_vendor_revision_date, and, for non-trunk revisions,
    registers the branch (naming it 'unlabeled-<number>' if it has no
    name yet) and the commit on it with self.symbol_db."""

    # Record the state of our revision for later calculations
    self.rev_state[revision] = state

    # store the rev_data as a list in case we have to jigger the timestamp
    # (the third slot holds the old timestamp once a resync happens)
    self.rev_data[revision] = [int(timestamp), author, None]

    # When on trunk, the RCS 'next' revision number points to what
    # humans might consider to be the 'previous' revision number.  For
    # example, 1.3's RCS 'next' is 1.2.
    #
    # However, on a branch, the RCS 'next' revision number really does
    # point to what humans would consider to be the 'next' revision
    # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
    #
    # In other words, in RCS, 'next' always means "where to find the next
    # deltatext that you need this revision to retrieve".
    #
    # That said, we don't *want* RCS's behavior here, so we determine
    # whether we're on trunk or a branch and set self.prev_rev
    # accordingly.
    #
    # One last thing.  Note that if REVISION is a branch revision,
    # instead of mapping REVISION to NEXT, we instead map NEXT to
    # REVISION.  Since we loop over all revisions in the file before
    # doing anything with the data we gather here, this 'reverse
    # assignment' effectively does the following:
    #
    # 1. Gives us no 'prev' value for REVISION (in this
    # iteration... it may have been set in a previous iteration)
    #
    # 2. Sets the 'prev' value for the revision with number NEXT to
    # REVISION.  So when we come around to the branch revision whose
    # revision value is NEXT, its 'prev' and 'prev_rev' are already
    # set.
    #
    # NOTE(review): on trunk the next_rev assignment is unconditional,
    # so a trunk revision without a NEXT stores a None key in
    # self.next_rev -- looks harmless, but confirm.
    if trunk_rev.match(revision):
      self.prev_rev[revision] = next
      self.next_rev[next] = revision
    elif next:
      self.prev_rev[next] = revision
      self.next_rev[revision] = next

    # Every branch sprouting here has REVISION as its predecessor.
    for b in branches:
      self.prev_rev[b] = revision

    # Ratchet up the highest vendor head revision, if necessary.
    if self.default_branch:
      default_branch_root = self.default_branch + "."
      # REVISION is on the default branch if it starts with the
      # default branch number and has exactly one more component.
      if ((revision.find(default_branch_root) == 0)
          and (default_branch_root.count('.') == revision.count('.'))):
        # This revision is on the default branch, so record that it is
        # the new highest default branch head revision.
        self.default_branches_db[self.cvs_path] = revision
    else:
      # No default branch, so make an educated guess.
      if revision == '1.2':
        # This is probably the time when the file stopped having a
        # default branch, so make a note of it.
        self.first_non_vendor_revision_date = timestamp
      else:
        m = vendor_revision.match(revision)
        if m and ((not self.first_non_vendor_revision_date)
                  or (timestamp < self.first_non_vendor_revision_date)):
          # We're looking at a vendor revision, and it wasn't
          # committed after this file lost its default branch, so bump
          # the maximum trunk vendor revision in the permanent record.
          self.default_branches_db[self.cvs_path] = revision

    if not trunk_rev.match(revision):
      # Check for unlabeled branches, record them.  We tried to collect
      # all branch names when we parsed the symbolic name header
      # earlier, of course, but that didn't catch unlabeled branches.
      # If a branch is unlabeled, this is our first encounter with it,
      # so we have to record its data now.
      branch_number = revision[:revision.rindex(".")]
      if not self.branch_names.has_key(branch_number):
        branch_name = "unlabeled-" + branch_number
        self.set_branch_name(branch_number, branch_name)

      # Register the commit on this non-trunk branch
      branch_name = self.branch_names[branch_number]
      self.symbol_db.register_branch_commit(branch_name)
1694
1695   def tree_completed(self):
1696     "The revision tree has been parsed.  Analyze it for consistency."
1697
1698     # Our algorithm depends upon the timestamps on the revisions occuring
1699     # monotonically over time.  That is, we want to see rev 1.34 occur in
1700     # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
1701     # sorting), and then tried to insert 1.34, we'd be screwed.
1702
1703     # to perform the analysis, we'll simply visit all of the 'previous'
1704     # links that we have recorded and validate that the timestamp on the
1705     # previous revision is before the specified revision
1706
1707     # if we have to resync some nodes, then we restart the scan. just keep
1708     # looping as long as we need to restart.
1709     while 1:
1710       for current, prev in self.prev_rev.items():
1711         if not prev:
1712           # no previous revision exists (i.e. the initial revision)
1713           continue
1714         t_c = self.rev_data[current][0]
1715         t_p = self.rev_data[prev][0]
1716         if t_p >= t_c:
1717           # the previous revision occurred later than the current revision.
1718           # shove the previous revision back in time (and any before it that
1719           # may need to shift).
1720
1721           # We sync backwards and not forwards because any given CVS
1722           # Revision has only one previous revision.  However, a CVS
1723           # Revision can *be* a previous revision for many other
1724           # revisions (e.g., a revision that is the source of multiple
1725           # branches).  This becomes relevant when we do the secondary
1726           # synchronization in pass 2--we can make certain that we
1727           # don't resync a revision earlier than it's previous
1728           # revision, but it would be non-trivial to make sure that we
1729           # don't resync revision R *after* any revisions that have R
1730           # as a previous revision.
1731           while t_p >= t_c:
1732             self.rev_data[prev][0] = t_c - 1    # new timestamp
1733             self.rev_data[prev][2] = t_p        # old timestamp
1734             delta = t_c - 1 - t_p
1735             msg =  "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
1736                   % (self.cvs_path, prev, time.ctime(t_p), delta)
1737             Log().write(LOG_VERBOSE, msg)
1738             if (delta > COMMIT_THRESHOLD
1739                 or delta < (COMMIT_THRESHOLD * -1)):
1740               str = "%s: Significant timestamp change for '%s' (%d seconds)"
1741               Log().write(LOG_WARN,
1742                           str % (warning_prefix, self.cvs_path, delta))
1743             current = prev
1744             prev = self.prev_rev[current]
1745             if not prev:
1746               break
1747             t_c = t_c - 1               # self.rev_data[current][0]
1748             t_p = self.rev_data[prev][0]
1749
1750           # break from the for-loop
1751           break
1752       else:
1753         # finished the for-loop (no resyncing was performed)
1754         return
1755
  def set_revision_info(self, revision, log, text):
    """Write out the CVSRevision record for REVISION.

    LOG is the log message of REVISION.  TEXT is only tested for
    emptiness here, to choose between DELTATEXT_NONEMPTY and
    DELTATEXT_EMPTY.

    The revision's digest is computed from LOG and the author stored
    in self.rev_data.  If the revision's timestamp was changed (an old
    timestamp is recorded in self.rev_data), a line is written to
    self.resync.  The operation (add, change or delete) is derived
    from the recorded revision states, a CVSRevision is constructed
    and its string form written to self.revs, and (author, LOG) is
    stored in self.metadata_db under the digest unless an entry
    already exists."""
    timestamp, author, old_ts = self.rev_data[revision]
    digest = sha.new(log + '\0' + author).hexdigest()
    if old_ts:
      # the timestamp on this revision was changed. log it for later
      # resynchronization of other files' revisions that occurred
      # for this time and log message.
      self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))

    # "...Give back one kadam to honor the Hebrew God whose Ark this is."
    #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
    #
    # If revision 1.1 appears to have been created via 'cvs add'
    # instead of 'cvs import', then this file probably never had a
    # default branch, so retroactively remove its record in the
    # default branches db.  The test is that the log message CVS uses
    # for 1.1 in imports is "Initial revision\n" with no period.
    if revision == '1.1' and log != 'Initial revision\n':
      try:
        del self.default_branches_db[self.cvs_path]
      except KeyError:
        pass

    # Get the timestamps of the previous and next revisions
    # (0 is used when there is no such revision).
    prev_rev = self.prev_rev[revision]
    prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])

    next_rev = self.next_rev.get(revision)
    next_timestamp, ign, ign = self.rev_data.get(next_rev, [0, None, None])

    # How to tell if a CVSRevision is an add, a change, or a deletion:
    #
    # It's a delete if RCS state is 'dead'
    #
    # It's an add if RCS state is 'Exp.' and
    #      - we either have no previous revision
    #        or
    #      - we have a previous revision whose state is 'dead'
    #
    # Anything else is a change.
    if self.rev_state[revision] == 'dead':
      op = OP_DELETE
    elif ((self.prev_rev.get(revision, None) is None)
          or (self.rev_state[self.prev_rev[revision]] == 'dead')):
      op = OP_ADD
    else:
      op = OP_CHANGE

    def is_branch_revision(rev):
      """Return True if this revision is not a trunk revision,
      else return False."""
      # A revision number with three or more dots lies on a branch.
      if rev.count('.') >= 3:
        return True
      return False

    def is_same_line_of_development(rev1, rev2):
      """Return True if rev1 and rev2 are on the same line of
      development (i.e., both on trunk, or both on the same branch);
      return False otherwise.  Either rev1 or rev2 can be None, in
      which case automatically return False."""
      if rev1 is None or rev2 is None:
        return False
      # Two-component revision numbers are both on trunk.
      if rev1.count('.') == 1 and rev2.count('.') == 1:
        return True
      # Otherwise they must share everything but the last component.
      if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
        return True
      return False

    # There can be an odd situation where the tip revision of a branch
    # is alive, but every predecessor on the branch is in state 'dead',
    # yet the revision from which the branch sprouts is alive.  (This
    # is sort of a mirror image of the more common case of adding a
    # file on a branch, in which the first revision on the branch is
    # alive while the revision from which it sprouts is dead.)
    #
    # In this odd situation, we must mark the first live revision on
    # the branch as an OP_CHANGE instead of an OP_ADD, because it
    # reflects, however indirectly, a change w.r.t. the source
    # revision from which the branch sprouts.
    #
    # This is issue #89.
    cur_num = revision
    if is_branch_revision(revision) and self.rev_state[revision] != 'dead':
      # Walk back through the chain of previous revisions until it
      # runs out.
      while 1:
        prev_num = self.prev_rev.get(cur_num, None)
        if not cur_num or not prev_num:
          break
        if (not is_same_line_of_development(cur_num, prev_num)
            and self.rev_state[cur_num] == 'dead'
            and self.rev_state[prev_num] != 'dead'):
          op = OP_CHANGE
        cur_num = self.prev_rev.get(cur_num, None)

    if text:
      deltatext_code = DELTATEXT_NONEMPTY
    else:
      deltatext_code = DELTATEXT_EMPTY

    c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp,
                        next_timestamp, op,
                        prev_rev, revision, next_rev,
                        self.file_in_attic, self.file_executable,
                        self.file_size,
                        deltatext_code, self.fname,
                        self.mode, self.rev_to_branch_name(revision),
                        self.taglist.get(revision, []),
                        self.branchlist.get(revision, []))
    self.revs.write(str(c_rev) + "\n")
    StatsKeeper().record_c_rev(c_rev)

    # Only the first (author, log) pair seen for a digest is stored.
    if not self.metadata_db.has_key(digest):
      self.metadata_db[digest] = (author, log)
1868
1869   def parse_completed(self):
1870     # Walk through all branches and tags and register them with
1871     # their parent branch in the symbol database.
1872     for revision, symbols in self.taglist.items() + self.branchlist.items():
1873       for symbol in symbols:
1874         name = self.rev_to_branch_name(revision)
1875         if name is not None:
1876           self.symbol_db.register_branch_blocker(name, symbol)
1877
1878     self.num_files = self.num_files + 1
1879
  def write_symbol_db(self):
    """Write out the symbol database by calling self.symbol_db.write()."""
    self.symbol_db.write()
1882
class SymbolingsLogger:
  """Manage the file that contains lines for symbol openings and
  closings.

  This data will later be used to determine valid SVNRevision ranges
  from which a file can be copied when creating a branch or tag in
  Subversion.  Do this by finding "Openings" and "Closings" for each
  file copied onto a branch or tag.

  An "Opening" is the CVSRevision from which a given branch/tag
  sprouts on a path.

  The "Closing" for that branch/tag and path is the next CVSRevision
  on the same line of development as the opening.

  For example, on file 'foo.c', branch BEE has branch number 1.2.2 and
  obviously sprouts from revision 1.2.  Therefore, 1.2 is the opening
  for BEE on path 'foo.c', and 1.3 is the closing for BEE on path
  'foo.c'.  Note that there may be many revisions chronologically
  between 1.2 and 1.3, for example, revisions on branches of 'foo.c',
  perhaps even including on branch BEE itself.  But 1.3 is the next
  revision *on the same line* as 1.2, that is why it is the closing
  revision for those symbolic names of which 1.2 is the opening.

  The reason for doing all this hullabaloo is to make branch and tag
  creation as efficient as possible by minimizing the number of copies
  and deletes per creation.  For example, revisions 1.2 and 1.3 of
  foo.c might correspond to revisions 17 and 30 in Subversion.  That
  means that when creating branch BEE, there is some motivation to do
  the copy from one of 17-30.  Now if there were another file,
  'bar.c', whose opening and closing CVSRevisions for BEE corresponded
  to revisions 24 and 39 in Subversion, we would know that the ideal
  thing would be to copy the branch from somewhere between 24 and 29,
  inclusive.
  """
  def __init__(self):
    """Open the symbolings file and the temporary closings file for
    writing, and register both with Cleanup()."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
    self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
    Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)

    # The keys of this dictionary are *source* cvs_paths for which
    # we've encountered an 'opening' on the default branch.  The
    # values are lists of the (uncleaned) symbolic names that this
    # path has opened.
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Log any openings found in C_REV, and if C_REV.next_rev is not
    None, a closing.  The opening uses SVN_REVNUM, but the closing (if
    any) will have its revnum determined later.

    No opening is logged if C_REV is a deletion (OP_DELETE)."""
    for name in c_rev.tags + c_rev.branches:
      self._note_default_branch_opening(c_rev, name)
      if c_rev.op != OP_DELETE:
        self._log(name, svn_revnum,
                  c_rev.cvs_path, c_rev.branch_name, OPENING)

      # If our c_rev has a next_rev, then that's the closing rev for
      # this source revision.  Log it to closings for later processing
      # since we don't know the svn_revnum yet.
      if c_rev.next_rev is not None:
        self.closings.write('%s %s\n' %
                            (name, c_rev.unique_key(c_rev.next_rev)))

  def _log(self, name, svn_revnum, cvs_path, branch_name, type):
    """Write out a single line to the symbol_openings_closings file
    representing that SVN_REVNUM of CVS_PATH on BRANCH_NAME is either
    the opening or closing (TYPE) of NAME (a symbolic name).

    A false BRANCH_NAME (e.g. None) is written as '*'.

    TYPE should only be one of the following global constants:
    OPENING or CLOSING."""
    # The revnum is zero-padded to at least 8 digits, which keeps the
    # field a fixed width for up to 99,999,999 SVN revs.  That
    # *should* be enough; larger numbers are still written in full.
    self.symbolings.write(
        '%s %.8d %s %s %s\n'
        % (name, svn_revnum, type, branch_name or '*', cvs_path))

  def close(self):
    """Iterate through the closings file, look up the svn_revnum for
    each closing CVSRevision, and write a proper line out to the
    symbolings file.  Both files are closed afterwards."""
    # Use this to get the c_rev of our rev_key
    cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    closings_reader = fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP))
    try:
      for line in closings_reader:
        (name, rev_key) = line.rstrip().split(" ", 1)
        svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

        c_rev = cvs_revs_db.get_revision(rev_key)
        self._log(name, svn_revnum,
                  c_rev.cvs_path, c_rev.branch_name, CLOSING)
    finally:
      # Release the handle on the temporary file explicitly instead of
      # relying on garbage collection of the reader.
      closings_reader.close()

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Record SYMBOLIC_NAME as a symbol opened by C_REV.cvs_path, so
    that log_default_branch_closing() can later log a closing for it.

    Note that no check is made here that C_REV really is a default
    branch revision; every name passed in is recorded."""
    self.open_paths_with_default_branches.setdefault(
        c_rev.cvs_path, []).append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If self.open_paths_with_default_branches contains
    C_REV.cvs_path, then log each name in
    self.open_paths_with_default_branches[C_REV.cvs_path] as a closing
    with SVN_REVNUM as the closing revision number, and forget the
    path."""
    path = c_rev.cvs_path
    if self.open_paths_with_default_branches.has_key(path):
      # log each symbol as a closing
      for name in self.open_paths_with_default_branches[path]:
        self._log(name, svn_revnum, path, None, CLOSING)
      # Remove them from the openings list as we're done with them.
      del self.open_paths_with_default_branches[path]
1994
1995
class PersistenceManager:
  """The PersistenceManager allows us to effectively store SVNCommits
  to disk and retrieve them later using only their subversion revision
  number as the key.  It also returns the subversion revision number
  for a given CVSRevision's unique key.

  All information pertinent to each SVNCommit is stored in a series of
  on-disk databases so that SVNCommits can be retrieved on-demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager will initialize a new set of on-disk
  databases and be fully-featured.
  In 'read' mode, PersistenceManager will open existing on-disk databases
  and the set_* methods will be unavailable."""
  def __init__(self, mode):
    """Open the on-disk databases in MODE.

    Raises RuntimeError if MODE is neither DB_OPEN_NEW nor
    DB_OPEN_READ.  The tags and motivating-revnum databases are only
    opened when Ctx().trunk_only is false."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
    self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
    Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
    self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
    Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
    self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
    Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
    # The metadata and CVS revision databases are always opened for
    # reading, whatever MODE is.
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      self.tags_db = TagsDatabase(DB_OPEN_READ)
      self.motivating_revnums = SDatabase(temp(MOTIVATING_REVNUMS), mode)
      Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def _check_writable(self):
    """Raise RuntimeError if this manager was opened in 'read' mode."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    In a trunk-only conversion only the CVS revisions, author and log
    message are restored; otherwise the symbolic name, date and
    motivating revnum are restored as well.

    This method can throw SVNCommitInternalInconsistencyError.
    """
    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)
    c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
    if c_rev_keys is None:
      return None

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      # Set the author and log message for this commit by using
      # CVSRevision metadata, but only if haven't done so already.
      if digest is None:
        digest = c_rev.digest
        author, log_msg = self.svn_commit_metadata[digest]
        svn_commit.set_author(author)
        svn_commit.set_log_msg(log_msg)

    # If we're doing a trunk-only conversion, we don't need to do any more
    # work.
    if Ctx().trunk_only:
      return svn_commit

    name, date = self._get_name_and_date(svn_revnum)
    if name:
      svn_commit.set_symbolic_name(name)
      svn_commit.set_date(date)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
    if motivating_revnum:
      svn_commit.set_motivating_revnum(int(motivating_revnum))
      svn_commit.set_date(date)

    # A commit either carries CVS revisions or fills a symbolic name,
    # never both.
    if len(svn_commit.cvs_revs) and name:
      raise SVNCommit.SVNCommitInternalInconsistencyError(
          "An SVNCommit cannot have cvs_revisions *and* a corresponding\n"
          "symbolic name ('%s') to fill."
          % (_clean_symbolic_name(name),))

    return svn_commit

  def set_cvs_revs(self, svn_revnum, cvs_revs):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS.

    Raises RuntimeError in 'read' mode."""
    self._check_writable()
    # Compute each revision's key once and reuse it for the log
    # output and for both directions of the mapping.
    keys = [c_rev.unique_key() for c_rev in cvs_revs]
    for key in keys:
      Log().write(LOG_VERBOSE, " ", key)
    self.svn2cvs_db[str(svn_revnum)] = keys
    for key in keys:
      self.cvs2svn_db[key] = svn_revnum

  def set_name_and_date(self, svn_revnum, name, date):
    """Associate symbolic name NAME and DATE with SVN_REVNUM.

    NAME is allowed to be None.

    Raises RuntimeError in 'read' mode."""
    self._check_writable()
    self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
    # NOTE(review): this also records an entry under the key None
    # when NAME is None -- confirm that readers of last_filled
    # tolerate that.
    self.last_filled[name] = svn_revnum

  def _get_name_and_date(self, svn_revnum):
    """Return a tuple containing the symbolic name and date associated
    with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
    associated with it."""
    return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))

  def set_motivating_revnum(self, svn_revnum, motivating_revnum):
    """Store MOTIVATING_REVNUM as the value of SVN_REVNUM.

    Raises RuntimeError in 'read' mode."""
    self._check_writable()
    self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
2124
2125
2126 class CVSCommit:
2127   """Each instance of this class contains a number of CVS Revisions
2128   that correspond to one or more Subversion Commits.  After all CVS
2129   Revisions are added to the grouping, calling process_revisions will
2130   generate a Subversion Commit (or Commits) for the set of CVS
2131   Revisions in the grouping."""
2132
  def __init__(self, digest, author, log):
    """Store DIGEST, AUTHOR and LOG, and start out with no revisions,
    an empty time range and no associated commits."""
    self.digest = digest
    self.author = author
    self.log = log

    # Symbolic names for which the last source revision has already
    # been seen and for which the CVSRevisionAggregator has already
    # generated a fill SVNCommit.  See self.process_revisions().
    self.done_symbols = [ ]

    # Keyed by file name; see has_file().
    self.files = { }
    # Lists of CVSRevisions
    self.changes = [ ]
    self.deletes = [ ]

    # Start out with a t_min higher than any incoming time T, and a
    # t_max lower than any incoming T.  This way the first T will
    # push t_min down to T, and t_max up to T, naturally (without any
    # special-casing), and successive times will then ratchet them
    # outward as appropriate.
    self.t_min = 1L<<32
    self.t_max = 0

    # This will be set to the SVNCommit that occurs in self._commit.
    self.motivating_commit = None

    # This is a list of all non-primary commits motivated by the main
    # commit.  We gather these so that we can set their dates to the
    # same date as the primary commit.
    self.secondary_commits = [ ]

    # State for handling default branches.
    #
    # Here is a tempting, but ultimately nugatory, bit of logic, which
    # I share with you so you may appreciate the less attractive, but
    # refreshingly non-nugatory, logic which follows it:
    #
    # If some of the commits in this txn happened on a non-trunk
    # default branch, then those files will have to be copied into
    # trunk manually after being changed on the branch (because the
    # RCS "default branch" appears as head, i.e., trunk, in practice).
    # As long as those copies don't overwrite any trunk paths that
    # were also changed in this commit, then we can do the copies in
    # the same revision, because they won't cover changes that don't
    # appear anywhere/anywhen else.  However, if some of the trunk dst
    # paths *did* change in this commit, then immediately copying the
    # branch changes would lose those trunk mods forever.  So in this
    # case, we need to do at least that copy in its own revision.  And
    # for simplicity's sake, if we're creating the new revision for
    # even one file, then we just do all such copies together in the
    # new revision.
    #
    # Doesn't that sound nice?
    #
    # Unfortunately, Subversion doesn't support copies with sources
    # in the current txn.  All copies must be based in committed
    # revisions.  Therefore, we generate the above-described new
    # revision unconditionally.
    #
    # This is a list of c_revs, and a c_rev is appended for each
    # default branch commit that will need to be copied to trunk (or
    # deleted from trunk) in some generated revision following the
    # "regular" revision.
    self.default_branch_cvs_revisions = [ ]
2197
2198   def __cmp__(self, other):
2199     # Commits should be sorted by t_max.  If both self and other have
2200     # the same t_max, break the tie using t_min, and lastly, digest
2201     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
2202             or cmp(self.digest, other.digest))
2203
2204   def has_file(self, fname):
2205     return self.files.has_key(fname)
2206
2207   def revisions(self):
2208     return self.changes + self.deletes
2209
2210   def opens_symbolic_name(self, name):
2211     """Returns true if any CVSRevision in this commit is on a tag or a
2212     branch or is the origin of a tag or branch."""
2213     for c_rev in self.revisions():
2214       if c_rev.opens_symbolic_name(name):
2215         return 1
2216     return 0
2217
2218   def add_revision(self, c_rev):
2219     # Record the time range of this commit.
2220     #
2221     # ### ISSUE: It's possible, though unlikely, that the time range
2222     # of a commit could get gradually expanded to be arbitrarily
2223     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
2224     # problem, and anyway deciding where to break it up would be a
2225     # judgement call.  For now, we just print a warning in commit() if
2226     # this happens.
2227     if c_rev.timestamp < self.t_min:
2228       self.t_min = c_rev.timestamp
2229     if c_rev.timestamp > self.t_max:
2230       self.t_max = c_rev.timestamp
2231
2232     if c_rev.op == OP_DELETE:
2233       self.deletes.append(c_rev)
2234     else:
2235       # OP_CHANGE or OP_ADD
2236       self.changes.append(c_rev)
2237
2238     self.files[c_rev.fname] = 1
2239
2240   def _pre_commit(self):
2241     """Generates any SVNCommits that must exist before the main
2242     commit."""
2243
2244     # There may be multiple c_revs in this commit that would cause
2245     # branch B to be filled, but we only want to fill B once.  On the
2246     # other hand, there might be multiple branches committed on in
2247     # this commit.  Whatever the case, we should count exactly one
2248     # commit per branch, because we only fill a branch once per
2249     # CVSCommit.  This list tracks which branches we've already
2250     # counted.
2251     accounted_for_sym_names = [ ]
2252
2253     def fill_needed(c_rev, pm):
2254       """Return 1 if this is the first commit on a new branch (for
2255       this file) and we need to fill the branch; else return 0
2256       (meaning that some other file's first commit on the branch has
2257       already done the fill for us).
2258
2259       If C_REV.op is OP_ADD, only return 1 if the branch that this
2260       commit is on has no last filled revision.
2261
2262       PM is a PersistenceManager to query.
2263       """
2264
2265       # Different '.' counts indicate that c_rev is now on a different
2266       # line of development (and may need a fill)
2267       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
2268         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
2269         # It should be the case that when we have a file F that
2270         # is added on branch B (thus, F on trunk is in state
2271         # 'dead'), we generate an SVNCommit to fill B iff the branch
2272         # has never been filled before.
2273         #
2274         # If this c_rev.op == OP_ADD, *and* the branch has never
2275         # been filled before, then fill it now.  Otherwise, no need to
2276         # fill it.
2277         if c_rev.op == OP_ADD:
2278           if pm.last_filled.get(c_rev.branch_name, None) is None:
2279             return 1
2280         elif c_rev.op == OP_CHANGE:
2281           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
2282             return 1
2283         elif c_rev.op == OP_DELETE:
2284           if pm.last_filled.get(c_rev.branch_name, None) is None:
2285             return 1
2286       return 0
2287
2288     for c_rev in self.changes + self.deletes:
2289       # If a commit is on a branch, we must ensure that the branch
2290       # path being committed exists (in HEAD of the Subversion
2291       # repository).  If it doesn't exist, we will need to fill the
2292       # branch.  After the fill, the path on which we're committing
2293       # will exist.
2294       if c_rev.branch_name \
2295           and c_rev.branch_name not in accounted_for_sym_names \
2296           and c_rev.branch_name not in self.done_symbols \
2297           and fill_needed(c_rev, Ctx()._persistence_manager):
2298         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
2299                                % c_rev.branch_name)
2300         svn_commit.set_symbolic_name(c_rev.branch_name)
2301         self.secondary_commits.append(svn_commit)
2302         accounted_for_sym_names.append(c_rev.branch_name)
2303
2304   def _commit(self):
2305     """Generates the primary SVNCommit that corresponds to this
2306     CVSCommit."""
2307     # Generate an SVNCommit unconditionally.  Even if the only change
2308     # in this CVSCommit is a deletion of an already-deleted file (that
2309     # is, a CVS revision in state 'dead' whose predecessor was also in
2310     # state 'dead'), the conversion will still generate a Subversion
2311     # revision containing the log message for the second dead
2312     # revision, because we don't want to lose that information.
2313     svn_commit = SVNCommit("commit")
2314     self.motivating_commit = svn_commit
2315
2316     for c_rev in self.changes:
2317       svn_commit.add_revision(c_rev)
2318       # Only make a change if we need to.  When 1.1.1.1 has an empty
2319       # deltatext, the explanation is almost always that we're looking
2320       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
2321       # such imports, CVS creates an RCS file where 1.1 has the
2322       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
2323       # content as 1.1.  There's no reason to reflect this non-change
2324       # in the repository, so we want to do nothing in this case.  (If
2325       # we were really paranoid, we could make sure 1.1's log message
2326       # is the CVS-generated "Initial revision\n", but I think the
2327       # conditions below are strict enough.)
2328       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
2329               and (c_rev.rev == "1.1.1.1")):
2330         if c_rev.is_default_branch_revision():
2331           self.default_branch_cvs_revisions.append(c_rev)
2332
2333     for c_rev in self.deletes:
2334       # When a file is added on a branch, CVS not only adds the file
2335       # on the branch, but generates a trunk revision (typically
2336       # 1.1) for that file in state 'dead'.  We only want to add
2337       # this revision if the log message is not the standard cvs
2338       # fabricated log message.
2339       if c_rev.prev_rev is None:
2340         # c_rev.branches may be empty if the originating branch
2341         # has been excluded.
2342         if not c_rev.branches:
2343           continue
2344         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
2345                              % (c_rev.filename(),
2346                                 c_rev.branches[0]))
2347         author, log_msg = \
2348             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
2349         if log_msg == cvs_generated_msg:
2350           continue
2351
2352       svn_commit.add_revision(c_rev)
2353       if c_rev.is_default_branch_revision():
2354         self.default_branch_cvs_revisions.append(c_rev)
2355
2356     # There is a slight chance that we didn't actually register any
2357     # CVSRevisions with our SVNCommit (see loop over self.deletes
2358     # above), so if we have no CVSRevisions, we don't flush the
2359     # svn_commit to disk and roll back our revnum.
2360     if len(svn_commit.cvs_revs) > 0:
2361       svn_commit.flush()
2362     else:
2363       # We will not be flushing this SVNCommit, so rollback the
2364       # SVNCommit revision counter.
2365       SVNCommit.revnum = SVNCommit.revnum - 1
2366
2367     if not Ctx().trunk_only:
2368       for c_rev in self.revisions():
2369         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
2370
2371   def _post_commit(self):
2372     """Generates any SVNCommits that we can perform now that _commit
2373     has happened.  That is, handle non-trunk default branches.
2374     Sometimes an RCS file has a non-trunk default branch, so a commit
2375     on that default branch would be visible in a default CVS checkout
2376     of HEAD.  If we don't copy that commit over to Subversion's trunk,
2377     then there will be no Subversion tree which corresponds to that
2378     CVS checkout.  Of course, in order to copy the path over, we may
2379     first need to delete the existing trunk there.  """
2380
2381     # Only generate a commit if we have default branch revs
2382     if len(self.default_branch_cvs_revisions):
2383       # Generate an SVNCommit for all of our default branch c_revs.
2384       svn_commit = SVNCommit("post-commit default branch(es)")
2385       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
2386       for c_rev in self.default_branch_cvs_revisions:
2387         svn_commit.add_revision(c_rev)
2388         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
2389                                                             svn_commit.revnum)
2390       self.secondary_commits.append(svn_commit)
2391
2392   def process_revisions(self, done_symbols):
2393     """Process all the CVSRevisions that this instance has, creating
2394     one or more SVNCommits in the process.  Generate fill SVNCommits
2395     only for symbols not in DONE_SYMBOLS (avoids unnecessary
2396     fills).
2397
2398     Return the primary SVNCommit that corresponds to this CVSCommit.
2399     The returned SVNCommit is the commit that motivated any other
2400     SVNCommits generated in this CVSCommit."""
2401     self.done_symbols = done_symbols
2402     seconds = self.t_max - self.t_min + 1
2403
2404     Log().write(LOG_VERBOSE, '-' * 60)
2405     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2406     if seconds == 1:
2407       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
2408                   % time.ctime(self.t_max))
2409     else:
2410       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
2411       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
2412                   % (time.ctime(self.t_max), seconds))
2413
2414     if seconds > COMMIT_THRESHOLD + 1:
2415       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2416                   % (warning_prefix, COMMIT_THRESHOLD))
2417
2418     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2419       self._commit()
2420       return self.motivating_commit
2421
2422     self._pre_commit()
2423     self._commit()
2424     self._post_commit()
2425
2426     for svn_commit in self.secondary_commits:
2427       svn_commit.set_date(self.motivating_commit.get_date())
2428       svn_commit.flush()
2429
2430     return self.motivating_commit
2431
2432
class SVNCommit:
  """One commit to the Subversion repository.

  There are three types of SVNCommits:

  1. Commits one or more CVSRevisions (cannot fill a symbolic name).

  2. Creates or fills a symbolic name (cannot commit CVSRevisions).

  3. Updates trunk to reflect the contents of a particular branch
     (this is to handle RCS default branches)."""

  # The revision number to assign to the next new SVNCommit.
  # We start at 2 because SVNRepositoryMirror uses the first commit
  # to create trunk, tags, and branches.
  revnum = 2

  class SVNCommitInternalInconsistencyError(Exception):
    """Exception raised if we encounter an impossible state in the
    SVNCommit Databases."""
    pass

  def __init__(self, description="", revnum=None, cvs_revs=None):
    """Instantiate an SVNCommit.  DESCRIPTION is for debugging only.

    If REVNUM is given, the SVNCommit corresponds to that revision
    number; otherwise it takes the next number from the class-wide
    counter SVNCommit.revnum, which is then incremented.  If CVS_REVS
    is given, it must be the exact set of CVSRevisions for REVNUM.

    It is an error to pass CVS_REVS without REVNUM, but you may pass
    REVNUM without CVS_REVS, and then add a revision at a time by
    invoking add_revision()."""
    self._description = description

    # Revprop metadata for this commit.
    #
    # The initial values are placeholders.  At least the log and the
    # date should be different by the time these are used.
    #
    # They are private because their values should be returned
    # encoded in UTF8, but callers aren't required to set them in
    # UTF8.  Therefore, accessor methods are used to set them, and
    # self.get_revprops() is used to get them, in dictionary form.
    self._author = Ctx().username
    self._log_msg = "This log message means an SVNCommit was used too soon."
    self._max_date = 0  # Latest date seen so far.

    self.cvs_revs = cvs_revs or []

    if not revnum:
      # No explicit number: take the next one from the counter.
      revnum = SVNCommit.revnum
      SVNCommit.revnum = revnum + 1
    self.revnum = revnum

    # The (uncleaned) symbolic name that is filled in this SVNCommit,
    # if any.
    self.symbolic_name = None

    # If this commit is a default branch synchronization, this is the
    # subversion revision number of the *primary* commit where the
    # default branch changes actually happened.  It is None otherwise.
    #
    # It is possible for multiple synchronization commits to refer to
    # the same motivating commit revision number, and it is possible
    # for a single synchronization commit to contain CVSRevisions on
    # multiple different default branches.
    self.motivating_revnum = None

    # is_tag is true only if this commit is a fill of a symbolic name
    # that is a tag, None in all other cases.
    self.is_tag = None

  def set_symbolic_name(self, symbolic_name):
    "Set self.symbolic_name to SYMBOLIC_NAME."
    self.symbolic_name = symbolic_name

  def set_motivating_revnum(self, revnum):
    "Set self.motivating_revnum to REVNUM."
    self.motivating_revnum = revnum

  def set_author(self, author):
    """Set this SVNCommit's author to AUTHOR (a locally-encoded string).
    This is the only way to set an SVNCommit's author."""
    self._author = author

  def set_log_msg(self, msg):
    """Set this SVNCommit's log message to MSG (a locally-encoded string).
    This is the only way to set an SVNCommit's log message."""
    self._log_msg = msg

  def set_date(self, date):
    """Set this SVNCommit's date to DATE (an integer).

    Note that self.add_revision() updates the date automatically
    based on a CVSRevision; so you may not need to call this at all,
    and even if you do, the value may be overwritten by a later call
    to self.add_revision()."""
    self._max_date = date

  def get_date(self):
    """Return this SVNCommit's date as an integer."""
    return self._max_date

  def _warn_about_encoding_problem(self, date):
    """Log a warning saying that the author or the log message of
    this SVNCommit could not be converted to UTF8.  DATE is the
    already formatted date of the commit."""
    Log().write(LOG_WARN, '%s: problem encoding author or log message:'
                % warning_prefix)
    Log().write(LOG_WARN, "  author: '%s'" % self._author)
    Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
    Log().write(LOG_WARN, "  date:   '%s'" % date)
    Log().write(LOG_WARN,
                "(subversion rev %s)  Related files:" % self.revnum)
    for c_rev in self.cvs_revs:
      Log().write(LOG_WARN, " ", c_rev.fname)

    Log().write(LOG_WARN, "Consider rerunning with (for example)",
                "'--encoding=latin1'.\n")

  def get_revprops(self):
    """Return the Subversion revprops for this SVNCommit, as a
    dictionary with the keys 'svn:author', 'svn:log' and 'svn:date'.

    The author (unless it is None) and the log message are converted
    with to_utf8().  If that raises UnicodeError, a warning is logged
    and the unconverted values are returned instead."""
    date = format_date(self._max_date)
    try:
      if self._author is None:
        utf8_author = None
      else:
        utf8_author = to_utf8(self._author)
      utf8_log = to_utf8(self.get_log_msg())
    except UnicodeError:
      self._warn_about_encoding_problem(date)
      # It's better to fall back to the original (unknown encoding)
      # data than to either 1) quit or 2) record nothing at all.
      return { 'svn:author' : self._author,
               'svn:log'    : self.get_log_msg(),
               'svn:date'   : date }

    return { 'svn:author' : utf8_author,
             'svn:log'    : utf8_log,
             'svn:date'   : date }

  def add_revision(self, cvs_rev):
    """Append CVS_REV to self.cvs_revs, and move this SVNCommit's
    date forward to CVS_REV's timestamp if that is later."""
    self.cvs_revs.append(cvs_rev)
    self._max_date = max(self._max_date, cvs_rev.timestamp)

  def _is_primary_commit(self):
    """Return true if this is a primary SVNCommit, false otherwise.
    A primary commit has neither a symbolic name nor a motivating
    revnum."""
    return not self.symbolic_name and not self.motivating_revnum

  def flush(self):
    """Log the creation of this revision and store this SVNCommit's
    data through the persistence manager: always its CVSRevisions,
    its motivating revnum if it has one, and, for non-primary
    commits, its symbolic name and date."""
    Log().write(LOG_NORMAL, "Creating Subversion r%d (%s)"
                % (self.revnum, self._description))
    Ctx()._persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)

    if self.motivating_revnum is not None:
      Ctx()._persistence_manager.set_motivating_revnum(
          self.revnum, self.motivating_revnum)

    if self._is_primary_commit():
      return
    # Not a primary commit, so store our date and/or our
    # symbolic_name.
    Ctx()._persistence_manager.set_name_and_date(
        self.revnum, self.symbolic_name, self._max_date)

  def __str__(self):
    """Return a human-readable description of this SVNCommit.  The
    description is not intended to be machine-parseable (although
    we're not going to stop you if you try!)"""
    lines = [ "SVNCommit #: " + str(self.revnum) + "\n" ]
    if not self.symbolic_name:
      lines.append("   NO symbolic name\n")
    else:
      lines.append("   symbolic name: "
                   + _clean_symbolic_name(self.symbolic_name) + "\n")
    lines.append("   debug description: " + self._description + "\n")
    lines.append("   cvs_revs:\n")
    for c_rev in self.cvs_revs:
      lines.append("     " + c_rev.unique_key() + "\n")
    return "".join(lines)

  def get_log_msg(self):
    """Return the actual log message for a primary commit, and the
    appropriate manufactured log message for a secondary commit."""
    if self.symbolic_name is not None:
      return self._log_msg_for_symbolic_name_commit()
    if self.motivating_revnum is not None:
      return self._log_msg_for_default_branch_commit()
    return self._log_msg

  def _log_msg_for_symbolic_name_commit(self):
    """Create a log message for a manufactured commit that fills
    self.symbolic_name.  If self.is_tag is true, write the log message
    as though for a tag, else write it as though for a branch."""
    if self.is_tag:
      kind = 'tag'
    else:
      kind = 'branch'

    # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
    # Long names are put on a line of their own.
    cleaned_name = _clean_symbolic_name(self.symbolic_name)
    if len(cleaned_name) < 13:
      separator = ' '
    else:
      separator = '\n'

    return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
           % (kind, separator, cleaned_name)

  def _log_msg_for_default_branch_commit(self):
    """Create a log message for a manufactured commit that
    synchronizes a non-trunk default branch with trunk."""
    template = ('This commit was generated by cvs2svn to compensate for '
                'changes in r%d,\n'
                'which included commits to RCS files with non-trunk default '
                'branches.\n')
    return template % self.motivating_revnum
2639
class CVSRevisionAggregator:
  """This class groups CVSRevisions into CVSCommits that represent
  at least one SVNCommit."""
  def __init__(self):
    """Open the metadata database (and, unless this is a trunk-only
    conversion, the database of last CVSRevisions per symbol), and
    install a SymbolingsLogger, a PersistenceManager and the default
    branches database on Ctx()."""
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB),
                                   DB_OPEN_READ)

    # Pending CVSCommits.  A commit that is still open is stored under
    # its digest; a commit that has been closed to further changes is
    # stored under its digest followed by one or more '-' characters.
    self.cvs_commits = {}

    # Symbols (the keys; the values are unused) whose last source
    # CVSRevision has been seen but which have not been closed yet.
    self.pending_symbols = {}

    # A list of symbols for which we've already encountered the last
    # CVSRevision that is a source for that symbol.  That is, the
    # final fill for this symbol has been done, and we never need to
    # fill it again.
    self.done_symbols = [ ]

    # This variable holds the most recently created primary svn_commit
    # object.  CVSRevisionAggregator maintains this variable merely
    # for its date, so that it can set dates for the SVNCommits
    # created in self.attempt_to_commit_symbols().
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
                                           DB_OPEN_READ)


  def process_revision(self, c_rev):
    """Add the CVSRevision C_REV to the pending CVSCommit that has the
    same digest, creating that CVSCommit if necessary.

    Before that, every pending CVSCommit whose t_max lies more than
    COMMIT_THRESHOLD before C_REV's timestamp is taken out of the
    pending set; those commits are processed, in sorted order, once
    C_REV has been added.  After each processed commit (or once, if
    no commit was ready) an attempt is made to close symbols."""
    # Each time we read a new line, we scan the commits we've
    # accumulated so far to see if any are ready for processing now.
    ready_queue = [ ]
    # The dictionary is modified inside the loop, so iterate over an
    # explicit snapshot of its items.
    for digest_key, cvs_commit in list(self.cvs_commits.items()):
      if cvs_commit.t_max + COMMIT_THRESHOLD < c_rev.timestamp:
        ready_queue.append(cvs_commit)
        del self.cvs_commits[digest_key]
        continue
      # If the inbound commit is on the same file as a pending commit,
      # close the pending commit to further changes.  Don't flush it though,
      # as there may be other pending commits dated before this one.
      # ### ISSUE: the has_file() check below is not optimal.
      # It does fix the dataloss bug where revisions would get lost
      # if checked in too quickly, but it can also break apart the
      # commits.  The correct fix would require tracking the dependencies
      # between change sets and committing them in proper order.
      if cvs_commit.has_file(c_rev.fname):
        unused_id = digest_key + '-'
        # Find a string that is not already a key in the
        # self.cvs_commits dict
        while self.cvs_commits.has_key(unused_id):
          unused_id = unused_id + '-'
        self.cvs_commits[unused_id] = cvs_commit
        del self.cvs_commits[digest_key]

    # Add this item into the set of still-available commits.
    if self.cvs_commits.has_key(c_rev.digest):
      cvs_commit = self.cvs_commits[c_rev.digest]
    else:
      author, log = self.metadata_db[c_rev.digest]
      cvs_commit = CVSCommit(c_rev.digest, author, log)
      self.cvs_commits[c_rev.digest] = cvs_commit
    cvs_commit.add_revision(c_rev)

    # If there are any elements in the ready_queue at this point, they
    # need to be processed, because this latest rev couldn't possibly
    # be part of any of them.  Sort them into time-order, then process
    # 'em.
    ready_queue.sort()

    # Make sure we attempt_to_commit_symbols for this c_rev, even if no
    # commits are ready.
    if not ready_queue:
      self.attempt_to_commit_symbols(ready_queue, c_rev)

    # Each commit is taken off the queue before symbols are closed, so
    # that only the commits still waiting count as queued.
    for cvs_commit in ready_queue[:]:
      self.latest_primary_svn_commit \
          = cvs_commit.process_revisions(self.done_symbols)
      ready_queue.remove(cvs_commit)
      self.attempt_to_commit_symbols(ready_queue, c_rev)

  def flush(self):
    """Commit anything left in self.cvs_commits.  Then inform the
    SymbolingsLogger that all commits are done."""

    # Pair every commit with its key so that the key is still known
    # after sorting (the tuples sort by commit first).
    ready_queue = [ ]
    for digest_key, cvs_commit in self.cvs_commits.items():
      ready_queue.append((cvs_commit, digest_key))
    ready_queue.sort()

    for cvs_commit, digest_key in ready_queue:
      self.latest_primary_svn_commit = \
        cvs_commit.process_revisions(self.done_symbols)
      # Commits that have not been processed yet are still in
      # self.cvs_commits, which attempt_to_commit_symbols() consults,
      # so no separate queue has to be passed (or maintained).
      del self.cvs_commits[digest_key]
      self.attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def attempt_to_commit_symbols(self, queued_commits, c_rev=None):
    """
    This function generates 1 SVNCommit for each symbol in
    self.pending_symbols that doesn't have an opening CVSRevision in
    either QUEUED_COMMITS or self.cvs_commits.values().

    If C_REV is not None, then we first add to self.pending_symbols
    any symbols from C_REV that C_REV is the last CVSRevision for.

    Each generated SVNCommit gets the date of
    self.latest_primary_svn_commit; the symbol is then moved from
    self.pending_symbols to self.done_symbols.
    """
    # If we're not doing a trunk-only conversion, get the symbolic
    # names that this c_rev is the last *source* CVSRevision for and
    # add them to those left over from previous passes through the
    # aggregator.
    if c_rev and not Ctx().trunk_only:
      for sym in self.last_revs_db.get(c_rev.unique_key(), []):
        self.pending_symbols[sym] = None

    # All commits that have not been processed yet.  This does not
    # depend on the symbol, so build the list only once.
    unprocessed_commits = (list(self.cvs_commits.values())
                           + list(queued_commits))

    # Sort the pending symbols so that we will always process the
    # symbols in the same order, regardless of the order in which the
    # dict hashing algorithm hands them back to us.  We do this so
    # that our tests will get the same results on all platforms.
    sorted_pending_symbols = list(self.pending_symbols.keys())
    sorted_pending_symbols.sort()

    # Find the symbols that no longer have a *source* CVSRevision in
    # any unprocessed commit.  All symbols are checked before any of
    # them is closed.
    closable_symbols = [ ]
    for sym in sorted_pending_symbols:
      for cvs_commit in unprocessed_commits:
        if cvs_commit.opens_symbolic_name(sym):
          # sym is still open--don't close it.
          break
      else:
        closable_symbols.append(sym)

    for sym in closable_symbols:
      svn_commit = SVNCommit("closing tag/branch '%s'" % sym)
      svn_commit.set_symbolic_name(sym)
      svn_commit.set_date(self.latest_primary_svn_commit.get_date())
      svn_commit.flush()
      self.done_symbols.append(sym)
      del self.pending_symbols[sym]
2781
2782
class SymbolingsReader:
  """Provides an interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file
  and the SYMBOL_OFFSETS_DB.  Does the heavy lifting of finding and
  returning the correct opening and closing Subversion revision
  numbers for a given symbolic name."""
  def __init__(self):
    """Open the SYMBOL_OPENINGS_CLOSINGS_SORTED file for reading, and
    read the offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets_db is really small, and we need to read and write
    # from it a fair bit, so suck it into memory
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    offsets = { }
    for symbol in offsets_db.db.keys():
      offsets[symbol] = offsets_db[symbol]
    self.offsets = offsets

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Given SYMBOLIC_NAME and SVN_REVNUM, return a new
    SymbolicNameFillingGuide object.

    The guide is built from the openings and closings recorded for
    SYMBOLIC_NAME, read from the symbol's stored offset up to (and
    including) revision SVN_REVNUM.

    Note that if we encounter an opening rev in this fill, but the
    corresponding closing rev takes place later than SVN_REVNUM, the
    closing will not be passed to SymbolicNameFillingGuide in this
    fill (and will be discarded when encountered in a later fill).
    This is perfectly fine, because we can still do a valid fill
    without the closing--we always try to fill what we can as soon as
    we can."""
    openings_closings_map = OpeningsClosingsMap(symbolic_name)

    # A symbol may have no offset at all: it's possible to have a
    # branch start with a file that was added on a branch.
    if not self.offsets.has_key(symbolic_name):
      return SymbolicNameFillingGuide(openings_closings_map)

    # Start reading where the entries for symbolic_name begin (or
    # where the previous fill of this symbol stopped).
    symbolings = self.symbolings
    symbolings.seek(self.offsets[symbolic_name])

    while 1:
      # Remember where the line about to be read starts.
      line_start = symbolings.tell()
      line = symbolings.readline().rstrip()
      if not line:
        break
      # Only the first four spaces separate fields, so the path may
      # itself contain spaces.
      name, revnum_field, kind, branch_name, cvs_path = line.split(" ", 4)
      if branch_name == '*':
        svn_path = Ctx().project.make_trunk_path(cvs_path)
      else:
        svn_path = Ctx().project.make_branch_path(branch_name, cvs_path)
      entry_revnum = int(revnum_field)
      # Stop at the first entry that is too new for this fill or that
      # belongs to a different symbol.
      if entry_revnum > svn_revnum or name != symbolic_name:
        break
      openings_closings_map.register(svn_path, entry_revnum, kind)

    # If we used anything we read, the next fill of this symbol should
    # start at the beginning of the line we read last.
    if not openings_closings_map.is_empty():
      self.offsets[symbolic_name] = line_start

    return SymbolicNameFillingGuide(openings_closings_map)
2843
2844
class SvnRevisionRange:
  """The range of subversion revision numbers from which a path can be
  copied.  self.opening_revnum is the number of the earliest such
  revision, and self.closing_revnum is one higher than the number of
  the last such revision.  If self.closing_revnum is None, then no
  closings were registered."""

  def __init__(self, opening_revnum):
    """Start a range that opens at OPENING_REVNUM and has no closing
    yet."""
    self.closing_revnum = None
    self.opening_revnum = opening_revnum

  def add_closing(self, closing_revnum):
    """Record CLOSING_REVNUM as the closing of this range, unless a
    closing has already been recorded."""
    if self.closing_revnum is not None:
      # When we have a non-trunk default branch, we may have multiple
      # closings--only the first closing we encounter is registered.
      return
    self.closing_revnum = closing_revnum

  def __str__(self):
    """Return the range as '[OPENING:CLOSING]', or as '[OPENING:]'
    if there is no closing."""
    if self.closing_revnum is not None:
      return '[%d:%d]' % (self.opening_revnum, self.closing_revnum,)
    return '[%d:]' % (self.opening_revnum,)
2867
2868
class OpeningsClosingsMap:
  """Accumulates the openings and closings of one symbolic name for
  the current SVNCommit.

  Feed the openings and closings to self.register(); the accumulated
  (svn_path, SvnRevisionRange) pairs are then available from
  self.get_things(), which is what SymbolicNameFillingGuide reads
  when it is constructed from this map."""

  def __init__(self, symbolic_name):
    """Create an empty map for SYMBOLIC_NAME."""

    # Maps each SVN_PATH to its SvnRevisionRange.
    self.things = { }

    self.name = symbolic_name

  def register(self, svn_path, svn_revnum, type):
    """Record an opening or a closing revision for this symbolic name.

    SVN_PATH is the source path that needs to be copied into the
    symbolic name.  TYPE says whether SVN_REVNUM is an opening (the
    first svn revision number that can be copied from) or a closing
    (the last svn revision number, not inclusive, that can be copied
    from).

    An opening always starts a fresh SvnRevisionRange for SVN_PATH,
    replacing any range registered earlier.  A closing only has an
    effect if an opening for the same SVN_PATH was registered before
    it; otherwise it is silently discarded.  Any other TYPE is
    ignored.

    Openings do not need a matching closing."""
    if type == OPENING:
      self.things[svn_path] = SvnRevisionRange(svn_revnum)
      return

    if type == CLOSING:
      # A closing without a previously registered opening is dropped.
      if self.things.has_key(svn_path):
        self.things[svn_path].add_closing(svn_revnum)

  def is_empty(self):
    """Return true if no opening has been registered yet, false
    otherwise."""
    return len(self.things) == 0

  def get_things(self):
    """Return a list of (svn_path, SvnRevisionRange) tuples, one for
    every svn_path that has a registered opening."""

    path_range_pairs = self.things.items()
    return path_range_pairs
2920
2921
2922 class SymbolicNameFillingGuide:
2923   """A node tree representing the source paths to be copied to fill
2924   self.symbolic_name in the current SVNCommit.
2925
2926   self._node_tree is the root of the directory tree, in the form {
2927   path_component : subnode }.  Leaf nodes are instances of
2928   SvnRevisionRange.  Intermediate (directory) nodes are dictionaries
2929   mapping relative names to subnodes.
2930
2931   By walking self._node_tree and calling self.get_best_revnum() on
2932   each node, the caller can determine what subversion revision number
2933   to copy the path corresponding to that node from.  self._node_tree
2934   should be treated as read-only.
2935
2936   The caller can then descend to sub-nodes to see if their "best
2937   revnum" differs from their parents' and if it does, take appropriate
2938   actions to "patch up" the subtrees."""
2939
2940   def __init__(self, openings_closings_map):
2941     """Initializes a SymbolicNameFillingGuide for SYMBOLIC_NAME and
2942     store into it the openings and closings from
2943     OPENINGS_CLOSINGS_MAP."""
2944
2945     self.name = openings_closings_map.name
2946
2947     # The dictionary that holds our node tree as a map { node_key :
2948     # node }.
2949     self._node_tree = { }
2950
2951     for svn_path, svn_revision_range in openings_closings_map.get_things():
2952       (head, tail) = _path_split(svn_path)
2953       self._get_node_for_path(head)[tail] = svn_revision_range
2954
2955     #self.print_node_tree(self._node_tree)
2956
2957   def _get_node_for_path(self, svn_path):
2958     """Return the node key for svn_path, creating new nodes as needed."""
2959     # Walk down the path, one node at a time.
2960     node = self._node_tree
2961     for component in svn_path.split('/'):
2962       if node.has_key(component):
2963         node = node[component]
2964       else:
2965         old_node = node
2966         node = {}
2967         old_node[component] = node
2968
2969     return node
2970
2971   def get_best_revnum(self, node, preferred_revnum):
2972     """Determine the best subversion revision number to use when
2973     copying the source tree beginning at NODE.  Returns a
2974     subversion revision number.
2975
2976     PREFERRED_REVNUM is passed to best_rev and used to calculate the
2977     best_revnum."""
2978
2979     def score_revisions(svn_revision_ranges):
2980       """Return a list of revisions and scores based on
2981       SVN_REVISION_RANGES.  The returned list looks like:
2982
2983          [(REV1 SCORE1), (REV2 SCORE2), ...]
2984
2985       where the tuples are sorted by revision number.
2986       SVN_REVISION_RANGES is a list of SvnRevisionRange objects.
2987
2988       For each svn revision that appears as either an opening_revnum
2989       or closing_revnum for one of the svn_revision_ranges, output a
2990       tuple indicating how many of the SvnRevisionRanges include that
2991       svn_revision in its range.  A score thus indicates that copying
2992       the corresponding revision (or any following revision up to the
2993       next revision in the list) of the object in question would yield
2994       that many correct paths at or underneath the object.  There may
2995       be other paths underneath it which are not correct and would
2996       need to be deleted or recopied; those can only be detected by
2997       descending and examining their scores.
2998
2999       If OPENINGS is empty, return the empty list."""
3000       openings = [ x.opening_revnum
3001                    for x in svn_revision_ranges ]
3002       closings = [ x.closing_revnum
3003                    for x in svn_revision_ranges
3004                    if x.closing_revnum is not None ]
3005
3006       # First look for easy out.
3007       if not openings:
3008         return []
3009
3010       # Create a list with both openings (which increment the total)
3011       # and closings (which decrement the total):
3012       things = [(rev,1) for rev in openings] + [(rev,-1) for rev in closings]
3013       # Sort by revision number:
3014       things.sort()
3015       # Initialize output list with zeroth element of things.  This
3016       # element must exist, because it was already verified that
3017       # openings is not empty.
3018       scores = [ things[0] ]
3019       total = scores[-1][1]
3020       for (rev, change) in things[1:]:
3021         total += change
3022         if rev == scores[-1][0]:
3023           # Same revision as last entry; modify last entry:
3024           scores[-1] = (rev, total)
3025         else:
3026           # Previously-unseen revision; create new entry:
3027           scores.append((rev, total))
3028       return scores
3029
3030     def best_rev(scores, preferred_rev):
3031       """Return the revision with the highest score from SCORES, a list
3032       returned by score_revisions().  When the maximum score is shared
3033       by multiple revisions, the oldest revision is selected, unless
3034       PREFERRED_REV is one of the possibilities, in which case, it is
3035       selected."""
3036       max_score = 0
3037       preferred_rev_score = -1
3038       rev = SVN_INVALID_REVNUM
3039       if preferred_rev is None:
3040         # Comparison order of different types is arbitrary. Do not
3041         # expect None to compare less than int values below.
3042         # In Python 2.3 None compares with ints like negative infinity.
3043         # In Python 2.0 None compares with ints like positive infinity.
3044         preferred_rev = SVN_INVALID_REVNUM
3045       for revnum, count in scores:
3046         if count > max_score:
3047           max_score = count
3048           rev = revnum
3049         if revnum <= preferred_rev:
3050           preferred_rev_score = count
3051       if preferred_rev_score == max_score:
3052         rev = preferred_rev
3053       return rev, max_score
3054
3055     # Aggregate openings and closings from the rev tree
3056     svn_revision_ranges = self._list_revnums(node)
3057
3058     # Score the lists
3059     scores = score_revisions(svn_revision_ranges)
3060
3061     revnum, max_score = best_rev(scores, preferred_revnum)
3062
3063     if revnum == SVN_INVALID_REVNUM:
3064       raise FatalError("failed to find a revision "
3065                        + "to copy from when copying %s" % name)
3066     return revnum, max_score
3067
3068   def _list_revnums(self, node):
3069     """Return a list of all the SvnRevisionRanges (including
3070     duplicates) for all leaf nodes at and under NODE."""
3071
3072     if isinstance(node, SvnRevisionRange):
3073       # It is a leaf node.
3074       return [ node ]
3075     else:
3076       # It is an intermediate node.
3077       revnums = []
3078       for key, subnode in node.items():
3079         revnums.extend(self._list_revnums(subnode))
3080       return revnums
3081
3082   def get_sources(self):
3083     """Return the list of sources for this symbolic name.
3084
3085     The Project instance defines what are legitimate sources.  Raise
3086     an exception if a change occurred outside of the source
3087     directories."""
3088
3089     return self._get_sub_sources('', self._node_tree)
3090
3091   def _get_sub_sources(self, start_svn_path, start_node):
3092     """Return the list of sources for this symbolic name, starting the
3093     search at path START_SVN_PATH, which is node START_NODE.  This is
3094     a helper method, called by get_sources() (see)."""
3095
3096     project = Ctx().project
3097     if isinstance(start_node, SvnRevisionRange):
3098       # This implies that a change was found outside of the
3099       # legitimate sources.  This should never happen.
3100       raise
3101     elif project.is_source(start_svn_path):
3102       # This is a legitimate source.  Add it to list.
3103       return [ FillSource(start_svn_path, start_node) ]
3104     else:
3105       # This is a directory that is not a legitimate source.  (That's
3106       # OK because it hasn't changed directly.)  But directories
3107       # within it have been changed, so we need to search recursively
3108       # to find their enclosing sources.
3109       sources = []
3110       for entry, node in start_node.items():
3111         svn_path = _path_join(start_svn_path, entry)
3112         sources.extend(self._get_sub_sources(svn_path, node))
3113
3114     return sources
3115
3116   def print_node_tree(self, node, name='/', indent_depth=0):
3117     """For debugging purposes.  Prints all nodes in TREE that are
3118     rooted at NODE.  INDENT_DEPTH is used to indent the output of
3119     recursive calls."""
3120     if not indent_depth:
3121       print "TREE", "=" * 75
3122     if isinstance(node, SvnRevisionRange):
3123       print "TREE:", " " * (indent_depth * 2), name, node
3124     else:
3125       print "TREE:", " " * (indent_depth * 2), name
3126       for key, value in node.items():
3127         self.print_node_tree(value, key, (indent_depth + 1))
3128
3129
class FillSource:
  """A candidate copy source used by the symbol filler in
  SVNRepositoryMirror."""
  def __init__(self, prefix, node):
    """Create a fill source from PREFIX and NODE.  The source starts
    out unscored: its score and revnum are None until set_score() is
    called."""
    self.prefix = prefix
    self.node = node
    self.score = self.revnum = None

  def set_score(self, score, revnum):
    """Remember SCORE and REVNUM for this source."""
    self.score, self.revnum = score, revnum

  def __cmp__(self, other):
    """Order FillSources by descending score, so that sorting a list
    of them puts the highest score first.  Raise TypeError if either
    side has not been scored."""
    for source in (self, other):
      if source.score is None:
        raise TypeError('Tried to compare unscored FillSource')
    # Arguments are swapped on purpose to get the descending order.
    return cmp(other.score, self.score)
3151
3152
3153 class SVNRepositoryMirror:
3154   """Mirror a Subversion Repository as it is constructed, one
3155   SVNCommit at a time.  The mirror is skeletal; it does not contain
3156   file contents.  The creation of a dumpfile or Subversion repository
3157   is handled by delegates.  See self.add_delegate method for how to
3158   set delegates.
3159
3160   The structure of the repository is kept in two databases and one
3161   hash.  The revs_db database maps revisions to root node keys, and
3162   the nodes_db database maps node keys to nodes.  A node is a hash
3163   from directory names to keys.  Both the revs_db and the nodes_db are
3164   stored on disk and each access is expensive.
3165
3166   The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
3168   hash which is cheap to access.
3169
3170   You must invoke _start_commit between SVNCommits.
3171
3172   *** WARNING *** All path arguments to methods in this class CANNOT
3173       have leading or trailing slashes.
3174   """
3175
3176   class SVNRepositoryMirrorPathExistsError(Exception):
3177     """Exception raised if an attempt is made to add a path to the
3178     repository mirror and that path already exists in the youngest
3179     revision of the repository."""
3180     pass
3181
3182   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
3183     """Exception raised if a CVSRevision is found to have an unexpected
3184     operation (OP) value."""
3185     pass
3186
3187   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
3188     """Exception raised if an empty SymbolicNameFillingGuide is returned
3189     during a fill where the branch in question already exists."""
3190     pass
3191
3192   def __init__(self):
3193     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
3194     self.delegates = [ ]
3195
3196     # This corresponds to the 'revisions' table in a Subversion fs.
3197     self.revs_db = SDatabase(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
3198     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
3199
3200     # This corresponds to the 'nodes' table in a Subversion fs.  (We
3201     # don't need a 'representations' or 'strings' table because we
3202     # only track metadata, not file contents.)
3203     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
3204     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
3205
3206     # Start at revision 0 without a root node.  It will be created
3207     # by _open_writable_root_node.
3208     self.youngest = 0
3209     self.new_root_key = None
3210     self.new_nodes = { }
3211
3212     if not Ctx().trunk_only:
3213       ###PERF IMPT: Suck this into memory.
3214       self.tags_db = TagsDatabase(DB_OPEN_READ)
3215       self.symbolings_reader = SymbolingsReader()
3216
3217   def _initialize_repository(self, date):
3218     """Initialize the repository by creating the directories for
3219     trunk, tags, and branches.  This method should only be called
3220     after all delegates are added to the repository mirror."""
3221     # Make a 'fake' SVNCommit so we can take advantage of the revprops
3222     # magic therein
3223     svn_commit = SVNCommit("Initialization", 1)
3224     svn_commit.set_date(date)
3225     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
3226
3227     self._start_commit(svn_commit)
3228     self._mkdir(Ctx().project.trunk_path)
3229     if not Ctx().trunk_only:
3230       self._mkdir(Ctx().project.branches_path)
3231       self._mkdir(Ctx().project.tags_path)
3232
3233   def _start_commit(self, svn_commit):
3234     """Start a new commit."""
3235     if self.youngest > 0:
3236       self._end_commit()
3237
3238     self.youngest = svn_commit.revnum
3239     self.new_root_key = None
3240     self.new_nodes = { }
3241
3242     self._invoke_delegates('start_commit', svn_commit)
3243
3244   def _end_commit(self):
3245     """Called at the end of each commit.  This method copies the newly
3246     created nodes to the on-disk nodes db."""
3247     if self.new_root_key is None:
3248       # No changes were made in this revision, so we make the root node
3249       # of the new revision be the same as the last one.
3250       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
3251     else:
3252       self.revs_db[str(self.youngest)] = self.new_root_key
3253       # Copy the new nodes to the nodes_db
3254       for key, value in self.new_nodes.items():
3255         self.nodes_db[key] = value
3256
3257   def _get_node(self, key):
3258     """Returns the node contents for KEY which may refer to either
3259     self.nodes_db or self.new_nodes."""
3260     if self.new_nodes.has_key(key):
3261       return self.new_nodes[key]
3262     else:
3263       return self.nodes_db[key]
3264
3265   def _open_readonly_node(self, path, revnum):
3266     """Open a readonly node for PATH at revision REVNUM.  Returns the
3267     node key and node contents if the path exists, else (None, None)."""
3268     # Get the root key
3269     if revnum == self.youngest:
3270       if self.new_root_key is None:
3271         node_key = self.revs_db[str(self.youngest - 1)]
3272       else:
3273         node_key = self.new_root_key
3274     else:
3275       node_key = self.revs_db[str(revnum)]
3276
3277     for component in path.split('/'):
3278       node_contents = self._get_node(node_key)
3279       node_key = node_contents.get(component, None)
3280       if node_key is None:
3281         return None
3282
3283     return node_key
3284
3285   def _open_writable_root_node(self):
3286     """Open a writable root node.  The current root node is returned
3287     immeditely if it is already writable.  If not, create a new one by
3288     copying the contents of the root node of the previous version."""
3289     if self.new_root_key is not None:
3290       return self.new_root_key, self.new_nodes[self.new_root_key]
3291
3292     if self.youngest < 2:
3293       new_contents = { }
3294     else:
3295       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
3296     self.new_root_key = gen_key()
3297     self.new_nodes = { self.new_root_key: new_contents }
3298
3299     return self.new_root_key, new_contents
3300
3301   def _open_writable_node(self, svn_path, create):
3302     """Open a writable node for the path SVN_PATH, creating SVN_PATH
3303     and any missing directories if CREATE is True."""
3304     parent_key, parent_contents = self._open_writable_root_node()
3305
3306     # Walk up the path, one node at a time.
3307     path_so_far = None
3308     components = svn_path.split('/')
3309     for i in range(len(components)):
3310       component = components[i]
3311       path_so_far = _path_join(path_so_far, component)
3312       this_key = parent_contents.get(component, None)
3313       if this_key is not None:
3314         # The component exists.
3315         this_contents = self.new_nodes.get(this_key, None)
3316         if this_contents is None:
3317           # Suck the node from the nodes_db, but update the key
3318           this_contents = self.nodes_db[this_key]
3319           this_key = gen_key()
3320           self.new_nodes[this_key] = this_contents
3321           parent_contents[component] = this_key
3322       elif create:
3323         # The component does not exists, so we create it.
3324         this_contents = { }
3325         this_key = gen_key()
3326         self.new_nodes[this_key] = this_contents
3327         parent_contents[component] = this_key
3328         if i < len(components) - 1:
3329           self._invoke_delegates('mkdir', path_so_far)
3330       else:
3331         # The component does not exists and we are not instructed to
3332         # create it, so we give up.
3333         return None, None
3334
3335       parent_key = this_key
3336       parent_contents = this_contents
3337
3338     return this_key, this_contents
3339
3340   def _path_exists(self, path):
3341     """If PATH exists in self.youngest of the svn repository mirror,
3342     return true, else return None.
3343
3344     PATH must not start with '/'."""
3345     return self._open_readonly_node(path, self.youngest) is not None
3346
3347   def _fast_delete_path(self, parent_path, parent_contents, component):
3348     """Delete COMPONENT from the parent direcory PARENT_PATH with the
3349     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
3350     in PARENT_CONTENTS."""
3351     if parent_contents.has_key(component):
3352       del parent_contents[component]
3353       self._invoke_delegates('delete_path',
3354                              _path_join(parent_path, component))
3355
3356   def _delete_path(self, svn_path, should_prune=False):
3357     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
3358     all ancestor directories that are made empty when SVN_PATH is deleted.
3359     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
3360
3361     NOTE: This function ignores requests to delete the root directory
3362     or any directory for which Ctx().project.is_unremovable() returns
3363     True, either directly or by pruning."""
3364
3365     if svn_path == '' or Ctx().project.is_unremovable(svn_path):
3366       return
3367
3368     (parent_path, entry,) = _path_split(svn_path)
3369     if parent_path:
3370       parent_key, parent_contents = \
3371           self._open_writable_node(parent_path, False)
3372     else:
3373       parent_key, parent_contents = self._open_writable_root_node()
3374
3375     if parent_key is not None:
3376       self._fast_delete_path(parent_path, parent_contents, entry)
3377       # The following recursion makes pruning an O(n^2) operation in the
3378       # worst case (where n is the depth of SVN_PATH), but the worst case
3379       # is probably rare, and the constant cost is pretty low.  Another
3380       # drawback is that we issue a delete for each path and not just
3381       # a single delete for the topmost directory pruned.
3382       if should_prune and len(parent_contents) == 0:
3383         self._delete_path(parent_path, True)
3384
3385   def _mkdir(self, path):
3386     """Create PATH in the repository mirror at the youngest revision."""
3387     self._open_writable_node(path, True)
3388     self._invoke_delegates('mkdir', path)
3389
3390   def _change_path(self, cvs_rev):
3391     """Register a change in self.youngest for the CVS_REV's svn_path
3392     in the repository mirror."""
3393     # We do not have to update the nodes because our mirror is only
3394     # concerned with the presence or absence of paths, and a file
3395     # content change does not cause any path changes.
3396     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, False))
3397
3398   def _add_path(self, cvs_rev):
3399     """Add the CVS_REV's svn_path to the repository mirror."""
3400     self._open_writable_node(cvs_rev.svn_path, True)
3401     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, True))
3402
3403   def _copy_path(self, src_path, dest_path, src_revnum):
3404     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
3405     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
3406     parent *must* exist, but DEST_PATH *cannot* exist.
3407
3408     Return the node key and the contents of the new node at DEST_PATH
3409     as a dictionary."""
3410     # get the contents of the node of our src_path
3411     src_key = self._open_readonly_node(src_path, src_revnum)
3412     src_contents = self._get_node(src_key)
3413
3414     # Get the parent path and the base path of the dest_path
3415     (dest_parent, dest_basename,) = _path_split(dest_path)
3416     dest_parent_key, dest_parent_contents = \
3417                    self._open_writable_node(dest_parent, False)
3418
3419     if dest_parent_contents.has_key(dest_basename):
3420       msg = "Attempt to add path '%s' to repository mirror " % dest_path
3421       msg = msg + "when it already exists in the mirror."
3422       raise self.SVNRepositoryMirrorPathExistsError, msg
3423
3424     dest_parent_contents[dest_basename] = src_key
3425     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
3426
3427     # Yes sir, src_key and src_contents are also the contents of the
3428     # destination.  This is a cheap copy, remember!  :-)
3429     return src_key, src_contents
3430
3431   def _fill_symbolic_name(self, svn_commit):
3432     """Performs all copies necessary to create as much of the the tag
3433     or branch SVN_COMMIT.symbolic_name as possible given the current
3434     revision of the repository mirror.
3435
3436     The symbolic name is guaranteed to exist in the Subversion
3437     repository by the end of this call, even if there are no paths
3438     under it."""
3439     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3440         svn_commit.symbolic_name, self.youngest)
3441     # Get the list of sources for the symbolic name.
3442     sources = symbol_fill.get_sources()
3443
3444     if sources:
3445       if self.tags_db.has_key(svn_commit.symbolic_name):
3446         dest_prefix = Ctx().project.get_tag_path(svn_commit.symbolic_name)
3447       else:
3448         dest_prefix = Ctx().project.get_branch_path(svn_commit.symbolic_name)
3449
3450       dest_key = self._open_writable_node(dest_prefix, False)[0]
3451       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3452     else:
3453       # We can only get here for a branch whose first commit is an add
3454       # (as opposed to a copy).
3455       dest_path = Ctx().project.get_branch_path(symbol_fill.name)
3456       if not self._path_exists(dest_path):
3457         # If our symbol_fill was empty, that means that our first
3458         # commit on the branch was to a file added on the branch, and
3459         # that this is our first fill of that branch.
3460         #
3461         # This case is covered by test 16.
3462         #
3463         # ...we create the branch by copying trunk from the our
3464         # current revision number minus 1
3465         source_path = Ctx().project.trunk_path
3466         entries = self._copy_path(source_path, dest_path,
3467                                   svn_commit.revnum - 1)[1]
3468         # Now since we've just copied trunk to a branch that's
3469         # *supposed* to be empty, we delete any entries in the
3470         # copied directory.
3471         for entry in entries.keys():
3472           del_path = dest_path + '/' + entry
3473           # Delete but don't prune.
3474           self._delete_path(del_path)
3475       else:
3476         msg = "Error filling branch '" \
3477               + _clean_symbolic_name(symbol_fill.name) + "'.\n"
3478         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3479         msg = msg + "attempted to create a branch that already exists."
3480         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3481
  def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
            path = None, parent_source_prefix = None,
            preferred_revnum = None, prune_ok = None):
    """Fill the tag or branch at DEST_PREFIX + PATH with items from
    SOURCES, and recurse into the child items.

    SYMBOL_FILL is the SymbolicNameFillingGuide that is used to score
    the sources.

    DEST_PREFIX is the prefix of the destination directory, e.g.
    'tags/my_tag' or 'branches/my_branch', and SOURCES is a list of
    FillSource objects that are candidates to be copied to the
    destination.  SOURCES must not be empty; its elements are scored
    and the list is sorted in place.  DEST_KEY is the node key of the
    destination, or None if the destination does not yet exist.

    PATH is the path relative to DEST_PREFIX.  If PATH is None, we
    are at the top level, e.g. 'tags/my_tag'.

    PARENT_SOURCE_PREFIX is the source prefix that was used to copy
    the parent directory, and PREFERRED_REVNUM is an int which is the
    source revision number that the caller (who may have copied the
    parent of this destination) used to perform its copy.  If
    PREFERRED_REVNUM is None, then no revision is preferable to any
    other (which probably means that no copies have happened yet).

    PRUNE_OK means that a copy has been made in this recursion.  When
    it is true, an existing destination is replaced if the best source
    differs from the parent's copy in prefix or revision number, and
    destination entries that do not appear in any of the sources are
    deleted.

    PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
    should only be passed in by recursive calls."""
    # Calculate scores and revnums for all sources
    for source in sources:
      src_revnum, score = symbol_fill.get_best_revnum(source.node,
                                                      preferred_revnum)
      source.set_score(score, src_revnum)

    # Sort the sources in descending score order (see
    # FillSource.__cmp__) so that the eventual copy is made from the
    # source with the highest score.
    sources.sort()
    copy_source = sources[0]

    src_path = _path_join(copy_source.prefix, path)
    dest_path = _path_join(dest_prefix, path)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    do_copy = 0
    if dest_key is None:
      # The destination does not exist yet, so it has to be copied.
      do_copy = 1
    elif prune_ok and (parent_source_prefix != copy_source.prefix or
                       copy_source.revnum != preferred_revnum):
      # The destination exists, but it came along with a copy of a
      # parent that used another source prefix or revision than the
      # best one for this path.  We are about to replace the
      # destination, so we need to remove it before we perform the
      # copy.
      self._delete_path(dest_path)
      do_copy = 1

    if do_copy:
      dest_key, dest_entries = self._copy_path(src_path, dest_path,
                                               copy_source.revnum)
      # A copy has been made, so from here on down it is safe to
      # prune.
      prune_ok = 1
    else:
      dest_entries = self._get_node(dest_key)

    # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
    # elements and the values are lists of FillSource objects, one for
    # each source in which this path element exists.
    src_entries = {}
    for source in sources:
      # A leaf node (an SvnRevisionRange) has no entries to descend
      # into.
      if isinstance(source.node, SvnRevisionRange):
        continue
      for entry, node in source.node.items():
        src_entries.setdefault(entry, []).append(
            FillSource(source.prefix, node))

    if prune_ok:
      # Delete the entries in DEST_ENTRIES that are not in src_entries.
      delete_list = [ ]
      for entry in dest_entries.keys():
        if not src_entries.has_key(entry):
          delete_list.append(entry)
      if delete_list:
        # The destination node has to be one of this revision's new
        # nodes before its entries may be modified.
        if not self.new_nodes.has_key(dest_key):
          dest_key, dest_entries = self._open_writable_node(dest_path, True)
        # Sort the delete list to get "diffable" dumpfiles.
        delete_list.sort()
        for entry in delete_list:
          self._fast_delete_path(dest_path, dest_entries, entry)

    # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
    # The source prefix and revision number that were chosen for this
    # path become the parent source prefix and the preferred revision
    # number of the children.
    src_keys = src_entries.keys()
    src_keys.sort()
    for src_key in src_keys:
      next_dest_key = dest_entries.get(src_key, None)
      self._fill(symbol_fill, dest_prefix, next_dest_key,
                 src_entries[src_key], _path_join(path, src_key),
                 copy_source.prefix, sources[0].revnum, prune_ok)
3577
3578   def _synchronize_default_branch(self, svn_commit):
3579     """Propagate any changes that happened on a non-trunk default
3580     branch to the trunk of the repository.  See
3581     CVSCommit._post_commit() for details on why this is necessary."""
3582     for cvs_rev in svn_commit.cvs_revs:
3583       svn_trunk_path = Ctx().project.make_trunk_path(
3584           Ctx().project.relative_name(cvs_rev.fname))
3585       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3586         if self._path_exists(svn_trunk_path):
3587           # Delete the path on trunk...
3588           self._delete_path(svn_trunk_path)
3589         # ...and copy over from branch
3590         self._copy_path(cvs_rev.svn_path, svn_trunk_path,
3591                         svn_commit.motivating_revnum)
3592       elif cvs_rev.op == OP_DELETE:
3593         # delete trunk path
3594         self._delete_path(svn_trunk_path)
3595       else:
3596         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3597                % cvs_rev.op)
3598         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3599
3600   def commit(self, svn_commit):
3601     """Add an SVNCommit to the SVNRepository, incrementing the
3602     Repository revision number, and changing the repository.  Invoke
3603     the delegates' _start_commit() method."""
3604
3605     if svn_commit.revnum == 2:
3606       self._initialize_repository(svn_commit.get_date())
3607
3608     self._start_commit(svn_commit)
3609
3610     if svn_commit.symbolic_name:
3611       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3612                   _clean_symbolic_name(svn_commit.symbolic_name))
3613       self._fill_symbolic_name(svn_commit)
3614     elif svn_commit.motivating_revnum:
3615       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3616                   % svn_commit.motivating_revnum)
3617       self._synchronize_default_branch(svn_commit)
3618     else: # This actually commits CVSRevisions
3619       if len(svn_commit.cvs_revs) > 1: plural = "s"
3620       else: plural = ""
3621       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3622                   % (len(svn_commit.cvs_revs), plural))
3623       for cvs_rev in svn_commit.cvs_revs:
3624         # See comment in CVSCommit._commit() for what this is all
3625         # about.  Note that although asking self._path_exists() is
3626         # somewhat expensive, we only do it if the first two (cheap)
3627         # tests succeed first.
3628         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3629                 and (cvs_rev.rev == "1.1.1.1")
3630                 and self._path_exists(cvs_rev.svn_path)):
3631           if cvs_rev.op == OP_ADD:
3632             self._add_path(cvs_rev)
3633           elif cvs_rev.op == OP_CHANGE:
3634             # Fix for Issue #74:
3635             #
3636             # Here's the scenario.  You have file FOO that is imported
3637             # on a non-trunk vendor branch.  So in r1.1 and r1.1.1.1,
3638             # the file exists.
3639             #
3640             # Moving forward in time, FOO is deleted on the default
3641             # branch (r1.1.1.2).  cvs2svn determines that this delete
3642             # also needs to happen on trunk, so FOO is deleted on
3643             # trunk.
3644             #
3645             # Along come r1.2, whose op is OP_CHANGE (because r1.1 is
3646             # not 'dead', we assume it's a change).  However, since
3647             # our trunk file has been deleted, svnadmin blows up--you
3648             # can't change a file that doesn't exist!
3649             #
3650             # Soooo... we just check the path, and if it doesn't
3651             # exist, we do an add... if the path does exist, it's
3652             # business as usual.
3653             if not self._path_exists(cvs_rev.svn_path):
3654               self._add_path(cvs_rev)
3655             else:
3656               self._change_path(cvs_rev)
3657
3658         if cvs_rev.op == OP_DELETE:
3659           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3660
3661   def cleanup(self):
3662     """Callback for the Cleanup.register in self.__init__."""
3663     self.revs_db = None
3664     self.nodes_db = None
3665
3666   def add_delegate(self, delegate):
3667     """Adds DELEGATE to self.delegates.
3668
3669     For every delegate you add, as soon as SVNRepositoryMirror
3670     performs a repository action method, SVNRepositoryMirror will call
3671     the delegate's corresponding repository action method.  Multiple
3672     delegates will be called in the order that they are added.  See
3673     SVNRepositoryMirrorDelegate for more information."""
3674     self.delegates.append(delegate)
3675
3676   def _invoke_delegates(self, method, *args):
3677     """Iterate through each of our delegates, in the order that they
3678     were added, and call the delegate's method named METHOD with the
3679     arguments in ARGS."""
3680     for delegate in self.delegates:
3681       getattr(delegate, method)(*args)
3682
  def finish(self):
    """Wrap up the mirror: call self._end_commit(), then call the
    'finish' method of every delegate (in the order the delegates were
    added), and finally run self.cleanup()."""
    self._end_commit()
    self._invoke_delegates('finish')
    self.cleanup()
3688
3689
class SVNCommitItem:
  """Wraps a CVSRevision object so that Subversion-related data (such
  as properties) can be attached to it."""

  def __init__(self, c_rev, svn_props_changed):
    """Wrap C_REV and work out the properties for this file.

    SVN_PROPS_CHANGED tells whether the svn: properties are known to
    have changed since the last revision, i.e. whether they have to
    be written to the dumpfile.

    Every SVNPropertySetter in Ctx().svn_property_setters gets a
    chance to set properties on this item; afterwards a couple of the
    properties are read back out for our own purposes."""

    self.c_rev = c_rev
    self.svn_props_changed = svn_props_changed

    # Map { key : value } of the properties for this item.  A VALUE
    # of None means that the property should not be set.
    self.svn_props = { }

    for setter in Ctx().svn_property_setters:
      setter.set_properties(self)

    props = self.svn_props

    # EOLs need filtering exactly when an svn:eol-style value was set.
    # (self.svn_props is initialized for each revision, so it could
    # also be consulted directly later on.)
    self.needs_eol_filter = props.get('svn:eol-style') is not None

    # Likewise, the file has keywords when svn:keywords was set.
    self.has_keywords = props.get('svn:keywords') is not None
3721
3722
class SVNPropertySetter:
  """Abstract base for objects that set properties on an
  SVNCommitItem."""

  def set_properties(self, s_item):
    """Set whichever properties can be determined for S_ITEM.
    Subclasses must override this; the base version always raises
    NotImplementedError."""

    raise NotImplementedError()
3730
3731
class SVNRepositoryMirrorDelegate:
  """Abstract superclass for any delegate to SVNRepositoryMirror.
  Subclasses must implement all of the methods below; every method
  here simply raises NotImplementedError.

  Each method stands for the Subversion operation that its name
  implies, and a subclass carries it out in its own way.  Taking
  add_path as an example: the DumpfileDelegate writes a "Node-add:"
  command to a Subversion dumpfile, the StdoutDelegate merely prints
  that the path is being added to the repository, and the
  RepositoryDelegate actually causes the path to be added to the
  Subversion repository that it is creating.
  """

  def start_commit(self, svn_commit):
    """Do whatever is needed to start SVNCommit SVN_COMMIT; see
    subclass implementation for details."""
    raise NotImplementedError()

  def mkdir(self, path):
    """PATH is a string; see subclass implementation for details."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """S_ITEM is an SVNCommitItem; see subclass implementation for
    details."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """S_ITEM is an SVNCommitItem; see subclass implementation for
    details."""
    raise NotImplementedError()

  def delete_path(self, path):
    """PATH is a string; see subclass implementation for details."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """SRC_PATH and DEST_PATH are both strings, and SRC_REVNUM is a
    subversion revision number (int); see subclass implementation for
    details."""
    raise NotImplementedError()

  def finish(self):
    """Do any cleanup needed once all revisions have been
    committed."""
    raise NotImplementedError()
3779
3780
3781 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
3782   """Create a Subversion dumpfile."""
3783
3784   def __init__(self, dumpfile_path=None):
3785     """Return a new DumpfileDelegate instance, attached to a dumpfile
3786     DUMPFILE_PATH (Ctx().dumpfile, if None), using Ctx().encoding."""
3787     if dumpfile_path:
3788       self.dumpfile_path = dumpfile_path
3789     else:
3790       self.dumpfile_path = Ctx().dumpfile
3791
3792     self.dumpfile = open(self.dumpfile_path, 'wb')
3793     self._write_dumpfile_header(self.dumpfile)
3794
3795   def _write_dumpfile_header(self, dumpfile):
3796     # Initialize the dumpfile with the standard headers.
3797     #
3798     # Since the CVS repository doesn't have a UUID, and the Subversion
3799     # repository will be created with one anyway, we don't specify a
3800     # UUID in the dumpflie
3801     dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
3802
3803   def _utf8_path(self, path):
3804     """Return a copy of PATH encoded in UTF-8."""
3805     pieces = string.split(path, '/')
3806     # Convert each path component separately (as they may each use
3807     # different encodings).
3808     for i in range(len(pieces)):
3809       try:
3810         # Log messages can be converted with the 'replace' strategy,
3811         # but we can't afford any lossiness here.
3812         pieces[i] = to_utf8(pieces[i], 'strict')
3813       except UnicodeError:
3814         raise FatalError(
3815             "Unable to convert a path '%s' to internal encoding.\n"
3816             "Consider rerunning with (for example) '--encoding=latin1'."
3817             % (path,))
3818     return string.join(pieces, '/')
3819
3820   def _string_for_prop(self, name, value):
3821     """Return a property in the form needed for the dumpfile."""
3822
3823     return 'K %d\n%s\nV %d\n%s\n' % (len(name), name, len(value), value)
3824
3825   def start_commit(self, svn_commit):
3826     """Emit the start of SVN_COMMIT (an SVNCommit)."""
3827
3828     self.revision = svn_commit.revnum
3829
3830     # The start of a new commit typically looks like this:
3831     #
3832     #   Revision-number: 1
3833     #   Prop-content-length: 129
3834     #   Content-length: 129
3835     #
3836     #   K 7
3837     #   svn:log
3838     #   V 27
3839     #   Log message for revision 1.
3840     #   K 10
3841     #   svn:author
3842     #   V 7
3843     #   jrandom
3844     #   K 8
3845     #   svn:date
3846     #   V 27
3847     #   2003-04-22T22:57:58.132837Z
3848     #   PROPS-END
3849     #
3850     # Notice that the length headers count everything -- not just the
3851     # length of the data but also the lengths of the lengths, including
3852     # the 'K ' or 'V ' prefixes.
3853     #
3854     # The reason there are both Prop-content-length and Content-length
3855     # is that the former includes just props, while the latter includes
3856     # everything.  That's the generic header form for any entity in a
3857     # dumpfile.  But since revisions only have props, the two lengths
3858     # are always the same for revisions.
3859
3860     # Calculate the output needed for the property definitions.
3861     props = svn_commit.get_revprops()
3862     prop_names = props.keys()
3863     prop_names.sort()
3864     prop_strings = []
3865     for propname in prop_names:
3866       if props[propname] is not None:
3867         prop_strings.append(self._string_for_prop(propname, props[propname]))
3868
3869     all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n'
3870     total_len = len(all_prop_strings)
3871
3872     # Print the revision header and props
3873     self.dumpfile.write('Revision-number: %d\n'
3874                         'Prop-content-length: %d\n'
3875                         'Content-length: %d\n'
3876                         '\n'
3877                         % (self.revision, total_len, total_len))
3878
3879     self.dumpfile.write(all_prop_strings)
3880     self.dumpfile.write('\n')
3881
3882   def mkdir(self, path):
3883     """Emit the creation of directory PATH."""
3884     self.dumpfile.write("Node-path: %s\n"
3885                         "Node-kind: dir\n"
3886                         "Node-action: add\n"
3887                         "\n"
3888                         "\n" % self._utf8_path(path))
3889
  def _add_or_change_path(self, s_item, op):
    """Emit the addition or change corresponding to S_ITEM.
    OP is either the constant OP_ADD or OP_CHANGE.

    If the file's basename is ".cvsignore", a property change for its
    directory (setting svn:ignore) is emitted before the file node.
    The file contents are read from the pipe returned by
    Ctx().cvs_repository.get_co_pipe() and streamed into the dumpfile;
    the length and checksum headers are patched in afterwards.

    Raise FatalError if OP is neither OP_ADD nor OP_CHANGE, or if the
    checkout command exits with a non-zero status."""

    # Validate OP and map it to the dumpfile's Node-action value.
    if op == OP_ADD:
      action = 'add'
    elif op == OP_CHANGE:
      action = 'change'
    else:
      raise FatalError("_add_or_change_path() called with bad op ('%s')"
                       % (op,))

    # Convenience variables
    c_rev = s_item.c_rev

    # The property handling here takes advantage of an undocumented
    # but IMHO consistent feature of the Subversion dumpfile-loading
    # code.  When a node's properties aren't mentioned (that is, the
    # "Prop-content-length:" header is absent, no properties are
    # listed at all, and there is no "PROPS-END\n" line) then no
    # change is made to the node's properties.
    #
    # This is consistent with the way dumpfiles behave w.r.t. text
    # content changes, so I'm comfortable relying on it.  If you
    # commit a change to *just* the properties of some node that
    # already has text contents from a previous revision, then in the
    # dumpfile output for the prop change, no "Text-content-length:"
    # nor "Text-content-md5:" header will be present, and the text of
    # the file will not be given.  But this does not cause the file's
    # text to be erased!  It simply remains unchanged.
    #
    # This works out great for cvs2svn, due to lucky coincidences:
    #
    # For files, the only properties we ever set are set in the first
    # revision; all other revisions (including on branches) inherit
    # from that.  After the first revision, we never change file
    # properties, therefore, there is no need to remember the full set
    # of properties on a given file once we've set it.
    #
    # For directories, the only property we set is "svn:ignore", and
    # while we may change it after the first revision, we always do so
    # based on the contents of a ".cvsignore" file -- in other words,
    # CVS is doing the remembering for us, so we still don't have to
    # preserve the previous value of the property ourselves.

    # Calculate the (sorted-by-name) property string and length, if any.
    if s_item.svn_props_changed:
      svn_props = s_item.svn_props
      prop_contents = ''
      prop_names = svn_props.keys()
      prop_names.sort()
      for pname in prop_names:
        pvalue = svn_props[pname]
        if pvalue is not None:
          prop_contents += self._string_for_prop(pname, pvalue)
      prop_contents += 'PROPS-END\n'
      props_header = 'Prop-content-length: %d\n' % len(prop_contents)
    else:
      # Leaving both empty omits the property section entirely, which
      # (see above) leaves the node's properties untouched.
      prop_contents = ''
      props_header = ''

    # Treat .cvsignore as a directory property: emit a separate
    # property-change record for the containing directory.
    dir_path, basename = os.path.split(c_rev.svn_path)
    if basename == ".cvsignore":
      ignore_vals = generate_ignores(c_rev)
      ignore_contents = '\n'.join(ignore_vals)
      ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
                         (len(ignore_contents), ignore_contents))
      ignore_contents = ignore_contents + 'PROPS-END\n'
      ignore_len = len(ignore_contents)

      # write headers, then props
      self.dumpfile.write('Node-path: %s\n'
                          'Node-kind: dir\n'
                          'Node-action: change\n'
                          'Prop-content-length: %d\n'
                          'Content-length: %d\n'
                          '\n'
                          '%s'
                          % (self._utf8_path(dir_path), ignore_len,
                             ignore_len, ignore_contents))

    # If the file has keywords, we must prevent CVS/RCS from expanding
    # the keywords because they must be unexpanded in the repository,
    # or Subversion will get confused.
    pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(
        c_rev, suppress_keyword_substitution=s_item.has_keywords)

    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: file\n'
                        'Node-action: %s\n'
                        '%s'  # no property header if no props
                        'Text-content-length: '
                        % (self._utf8_path(c_rev.svn_path),
                           action, props_header))

    # Remember where the text length starts, so that the placeholder
    # values written next can be overwritten once the real values are
    # known.
    pos = self.dumpfile.tell()

    # Fixed-width placeholders for the text length, the checksum and
    # the content length; they are patched up below via seek().
    self.dumpfile.write('0000000000000000\n'
                        'Text-content-md5: 00000000000000000000000000000000\n'
                        'Content-length: 0000000000000000\n'
                        '\n')

    if prop_contents:
      self.dumpfile.write(prop_contents)

    # Insert a filter to convert all EOLs to LFs if necessary
    if s_item.needs_eol_filter:
      data_reader = LF_EOL_Filter(pipe.stdout)
    else:
      data_reader = pipe.stdout

    # Insert the rev contents, calculating length and checksum as we go.
    checksum = md5.new()
    length = 0
    while True:
      buf = data_reader.read(PIPE_READ_SIZE)
      if buf == '':
        break
      checksum.update(buf)
      length = length + len(buf)
      self.dumpfile.write(buf)

    # All of the output has been consumed; collect any error output
    # and the exit status of the checkout command.
    pipe.stdout.close()
    error_output = pipe.stderr.read()
    exit_status = pipe.wait()
    if exit_status:
      raise FatalError("The command '%s' failed with exit status: %s\n"
                       "and the following output:\n"
                       "%s" % (pipe_cmd, exit_status, error_output))

    # Go back to patch up the length and checksum headers:
    self.dumpfile.seek(pos, 0)
    # We left 16 zeros for the text length; replace them with the real
    # length, padded on the left with spaces:
    self.dumpfile.write('%16d' % length)
    # 16... + 1 newline + len('Text-content-md5: ') == 35
    self.dumpfile.seek(pos + 35, 0)
    self.dumpfile.write(checksum.hexdigest())
    # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
    self.dumpfile.seek(pos + 84, 0)
    # The content length is the length of the property data plus the
    # length of the text data, including any metadata inside them.
    self.dumpfile.write('%16d' % (length + len(prop_contents)))
    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

    # This record is done.  Write two newlines: one to terminate
    # contents that weren't themselves newline-terminated, and one to
    # provide a blank line for readability.
    self.dumpfile.write('\n\n')
4042
  def add_path(self, s_item):
    """Emit the addition corresponding to S_ITEM, an SVNCommitItem.
    This is _add_or_change_path() called with OP_ADD."""
    self._add_or_change_path(s_item, OP_ADD)
4046
  def change_path(self, s_item):
    """Emit the change corresponding to S_ITEM, an SVNCommitItem.
    This is _add_or_change_path() called with OP_CHANGE."""
    self._add_or_change_path(s_item, OP_CHANGE)
4050
4051   def delete_path(self, path):
4052     """Emit the deletion of PATH."""
4053     self.dumpfile.write('Node-path: %s\n'
4054                         'Node-action: delete\n'
4055                         '\n' % self._utf8_path(path))
4056
4057   def copy_path(self, src_path, dest_path, src_revnum):
4058     """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
4059     # We don't need to include "Node-kind:" for copies; the loader
4060     # ignores it anyway and just uses the source kind instead.
4061     self.dumpfile.write('Node-path: %s\n'
4062                         'Node-action: add\n'
4063                         'Node-copyfrom-rev: %d\n'
4064                         'Node-copyfrom-path: /%s\n'
4065                         '\n'
4066                         % (self._utf8_path(dest_path),
4067                            src_revnum,
4068                            self._utf8_path(src_path)))
4069
4070   def finish(self):
4071     """Perform any cleanup necessary after all revisions have been
4072     committed."""
4073     self.dumpfile.close()
4074
4075
4076 class RepositoryDelegate(DumpfileDelegate):
4077   """Creates a new Subversion Repository.  DumpfileDelegate does all
4078   of the heavy lifting."""
4079   def __init__(self):
4080     self.svnadmin = Ctx().svnadmin
4081     self.target = Ctx().target
4082     if not Ctx().existing_svnrepos:
4083       Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
4084       if not Ctx().fs_type:
4085         # User didn't say what kind repository (bdb, fsfs, etc).
4086         # We still pass --bdb-txn-nosync.  It's a no-op if the default
4087         # repository type doesn't support it, but we definitely want
4088         # it if BDB is the default.
4089         run_command('%s create %s "%s"' % (self.svnadmin,
4090                                            "--bdb-txn-nosync",
4091                                            self.target))
4092       elif Ctx().fs_type == 'bdb':
4093         # User explicitly specified bdb.
4094         #
4095         # Since this is a BDB repository, pass --bdb-txn-nosync,
4096         # because it gives us a 4-5x speed boost (if cvs2svn is
4097         # creating the repository, cvs2svn should be the only program
4098         # accessing the svn repository (until cvs is done, at least)).
4099         # But we'll turn no-sync off in self.finish(), unless
4100         # instructed otherwise.
4101         run_command('%s create %s %s "%s"' % (self.svnadmin,
4102                                               "--fs-type=bdb",
4103                                               "--bdb-txn-nosync",
4104                                               self.target))
4105       else:
4106         # User specified something other than bdb.
4107         run_command('%s create %s "%s"' % (self.svnadmin,
4108                                            "--fs-type=%s" % Ctx().fs_type,
4109                                            self.target))
4110
4111     # Since the output of this run is a repository, not a dumpfile,
4112     # the temporary dumpfiles we create should go in the tmpdir.
4113     DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))
4114
4115     # This is 1 if a commit is in progress, otherwise None.
4116     self._commit_in_progress = None
4117
4118     self.dumpfile = open(self.dumpfile_path, 'w+b')
4119     self.loader_pipe = SimplePopen([ self.svnadmin, 'load', '-q',
4120                                      self.target ], True)
4121     self.loader_pipe.stdout.close()
4122     try:
4123       self._write_dumpfile_header(self.loader_pipe.stdin)
4124     except IOError:
4125       raise FatalError("svnadmin failed with the following output while "
4126                        "loading the dumpfile:\n"
4127                        + self.loader_pipe.stderr.read())
4128
4129   def _feed_pipe(self):
4130     """Feed the revision stored in the dumpfile to the svnadmin
4131     load pipe."""
4132     self.dumpfile.seek(0)
4133     while 1:
4134       data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
4135       if not len(data):
4136         break
4137       try:
4138         self.loader_pipe.stdin.write(data)
4139       except IOError:
4140         raise FatalError("svnadmin failed with the following output "
4141                          "while loading the dumpfile:\n"
4142                          + self.loader_pipe.stderr.read())
4143
4144   def start_commit(self, svn_commit):
4145     """Start a new commit.  If a commit is already in progress, close
4146     the dumpfile, load it into the svn repository, open a new
4147     dumpfile, and write the header into it."""
4148     if self._commit_in_progress:
4149       self._feed_pipe()
4150     self.dumpfile.seek(0)
4151     self.dumpfile.truncate()
4152     DumpfileDelegate.start_commit(self, svn_commit)
4153     self._commit_in_progress = 1
4154
4155   def finish(self):
4156     """Loads the last commit into the repository."""
4157     self._feed_pipe()
4158     self.dumpfile.close()
4159     self.loader_pipe.stdin.close()
4160     error_output = self.loader_pipe.stderr.read()
4161     exit_status = self.loader_pipe.wait()
4162     if exit_status:
4163       raise FatalError('svnadmin load failed with exit status: %s\n'
4164                        'and the following output:\n'
4165                        '%s' % (exit_status, error_output,))
4166     os.remove(self.dumpfile_path)
4167
4168     # If this is a BDB repository, and we created the repository, and
4169     # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
4170     # line in the DB_CONFIG file, because txn syncing should be on by
4171     # default in BDB repositories.
4172     #
4173     # We determine if this is a BDB repository by looking for the
4174     # DB_CONFIG file, which doesn't exist in FSFS, rather than by
4175     # checking Ctx().fs_type.  That way this code will Do The Right
4176     # Thing in all circumstances.
4177     db_config = os.path.join(self.target, "db/DB_CONFIG")
4178     if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
4179         and os.path.exists(db_config)):
4180       no_sync = 'set_flags DB_TXN_NOSYNC\n'
4181
4182       contents = open(db_config, 'r').readlines()
4183       index = contents.index(no_sync)
4184       contents[index] = '# ' + no_sync
4185       contents = open(db_config, 'w').writelines(contents)
4186
4187
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """Makes no changes to the disk, but reports through Log() what the
  SVNRepositoryMirror is doing.  Of course, our messages will state
  that we're doing something, when in reality, we aren't doing
  anything other than printing out that we're doing something.  Kind
  of zen, really."""

  def __init__(self, total_revs):
    """TOTAL_REVS is the number shown after the slash in the
    'Starting Subversion rN / M' progress line."""
    self.total_revs = total_revs

  def _verbose(self, *args):
    """Write ARGS to the log at level LOG_VERBOSE."""
    Log().write(LOG_VERBOSE, *args)

  def start_commit(self, svn_commit):
    """Report the Subversion revision number of the commit that is
    being started."""
    self._verbose("=" * 60)
    progress = ("Starting Subversion r%d / %d" %
                (svn_commit.revnum, self.total_revs))
    Log().write(LOG_NORMAL, progress)

  def mkdir(self, path):
    """Report that directory PATH is being created."""
    self._verbose("  New Directory", path)

  def add_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'added'."""
    self._verbose("  Adding", s_item.c_rev.svn_path)

  def change_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'changed'."""
    self._verbose("  Changing", s_item.c_rev.svn_path)

  def delete_path(self, path):
    """Report that PATH is being 'deleted'."""
    self._verbose("  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report that revision SRC_REVNUM of SRC_PATH is being 'copied'
    to DEST_PATH."""
    self._verbose("  Copying revision", src_revnum, "of", src_path)
    self._verbose("                to", dest_path)

  def finish(self):
    """Report that the repository has been created."""
    self._verbose("Finished creating Subversion repository.")
    Log().write(LOG_QUIET, "Done.")
4230
# This should be a local to pass1,
# but Python 2.0 does not support nested scopes.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
def pass1():
  """Pass 1: parse every ',v' file below the project's CVS repository
  path into a CollectData instance, then write the symbol database.

  Files that cannot be parsed, and files that exist both inside and
  outside of an 'Attic' directory, are reported on stderr and
  collected as fatal errors.  Raise FatalException after the walk if
  any fatal error was collected or if no ',v' file was found."""
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    # Callback for os.path.walk().  BATON is the CollectData instance;
    # it is passed in explicitly because the enclosing scope is not
    # visible from here on Python 2.0.
    cd = baton
    for fname in files:
      if fname[-2:] != ',v':
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname[-6:] == OS_SEP_PLUS_ATTIC:
        # drop the 'Attic' portion from the pathname for the canonical name.
        cd.set_fname(os.path.join(dirname[:-6], fname), pathname)
      else:
        # If this file also exists in the attic, it's a fatal error
        attic_path = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_path):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, pathname, attic_path)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        cd.set_fname(pathname, pathname)
      Log().write(LOG_NORMAL, pathname)
      # Close each RCS file explicitly once it has been parsed, so
      # that open handles do not pile up while walking a large
      # repository.  (try/except and try/finally are nested because
      # old Pythons do not allow them to be combined.)
      rcs_file = None
      try:
        try:
          rcs_file = open(pathname, 'rb')
          cvs2svn_rcsparse.parse(rcs_file, cd)
        except (cvs2svn_rcsparse.common.RCSParseError, ValueError,
                RuntimeError):
          err = "%s: '%s' is not a valid ,v file" \
                % (error_prefix, pathname)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        except:
          Log().write(LOG_WARN,
                      "Exception occurred while parsing %s" % pathname)
          raise
      finally:
        if rcs_file is not None:
          rcs_file.close()

  os.path.walk(Ctx().project.project_cvs_repos_path, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if len(cd.fatal_errors) > 0:
    raise FatalException("Pass 1 complete.\n"
                         + "=" * 75 + "\n"
                         + "Error summary:\n"
                         + "\n".join(cd.fatal_errors) + "\n"
                         + "Exited due to fatal error(s).\n")

  if cd.found_valid_file is None:
    raise FatalException(
        "\n"
        "No RCS files found in your CVS Repository!\n"
        "Are you absolutely certain you are pointing cvs2svn\n"
        "at a CVS repository?\n"
        "\n"
        "Exited due to fatal error(s).\n")

  StatsKeeper().reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4295
def pass2():
  """Pass 2: clean up the revision information.

  First validate the symbol-related options against the symbol
  database: excluded branches that other symbols depend on, forced
  tags that have commits, and symbols that are tags in some files but
  branches in others.  If any of these checks fails, the problems are
  reported on stderr and the program exits with status 1.

  Otherwise create the tags database, then copy the revisions file to
  the clean revisions file, one CVSRevision per line, while:

    - dropping revisions that are on excluded branches,
    - removing references to excluded symbols,
    - converting branches/tags listed in Ctx().forced_tags and
      Ctx().forced_branches, and
    - resynchronizing timestamps according to the resync file."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Expand the exclusion regexps into the matching symbol names.  The
  # result is only ever queried with has_key() below.
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    # An excluded symbol is dropped altogether, so it cannot conflict.
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    "Return true unless the mismatched symbol was explicitly forced."
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database: every known tag that is not forced to be
  # a branch, plus every symbol that is forced to be a tag.
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags.keys():
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    "Read the .resync file into memory."

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list
    # because we will dynamically expand the lower/upper bound as we find
    # commits that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    # Each line is parsed as: 8 hex digits (old time), a separator
    # character, the digest (up to DIGEST_END_IDX), a separator
    # character, and the new time in hex.
    for line in fileinput.FileInput(fname):
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      resync.setdefault(digest, []).append([t1_l, t1_u, t2])

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    for val in resync.values():
      val.sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  # Maps the unique key of each revision whose timestamp we changed to
  # its new timestamp, so that neighbouring revisions read later can
  # pick up the adjusted prev/next timestamps.
  tweaked_timestamps_db = Database(temp(TWEAKED_TIMESTAMPS_DB), DB_OPEN_NEW)
  Cleanup().register(temp(TWEAKED_TIMESTAMPS_DB), pass2)

  # process the revisions file, looking for items to clean up
  for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
    c_rev = CVSRevision(Ctx(), line[:-1])

    # Skip this entire revision if it's on an excluded branch
    if excludes.has_key(c_rev.branch_name):
      continue

    # If the previous revision's timestamp was tweaked, use the new one.
    new_prev_ts = None
    if c_rev.prev_rev is not None:
      new_prev_ts = tweaked_timestamps_db.get(
        c_rev.unique_key(c_rev.prev_rev), None)
    if new_prev_ts:
      c_rev.prev_timestamp = new_prev_ts

    # Likewise for the next revision.
    new_next_ts = None
    if c_rev.next_rev is not None:
      new_next_ts = tweaked_timestamps_db.get(
        c_rev.unique_key(c_rev.next_rev), None)
    if new_next_ts:
      c_rev.next_timestamp = new_next_ts

    # Remove all references to excluded tags and branches
    def not_excluded(symbol, excludes=excludes):
      return not excludes.has_key(symbol)
    c_rev.branches = filter(not_excluded, c_rev.branches)
    c_rev.tags = filter(not_excluded, c_rev.tags)

    # Convert all branches that are forced to be tags
    for forced_tag in Ctx().forced_tags:
      if forced_tag in c_rev.branches:
        c_rev.branches.remove(forced_tag)
        c_rev.tags.append(forced_tag)

    # Convert all tags that are forced to be branches
    for forced_branch in Ctx().forced_branches:
      if forced_branch in c_rev.tags:
        c_rev.tags.remove(forced_branch)
        c_rev.branches.append(forced_branch)

    # see if this is "near" any of the resync records we
    # have recorded for this digest [of the log message].
    for record in resync.get(c_rev.digest, []):
      if record[2] == c_rev.timestamp:
        # This means that either c_rev is the same revision that
        # caused the resync record to exist, or c_rev is a different
        # CVS revision that happens to have the same timestamp.  In
        # either case, we don't have to do anything, so we...
        continue

      if record[0] <= c_rev.timestamp <= record[1]:
        # bingo!  We probably want to remap the time on this c_rev,
        # unless the remapping would be useless because the new time
        # would fall outside the COMMIT_THRESHOLD window for this
        # commit group.
        new_timestamp = record[2]
        # If the new timestamp is earlier than that of our previous revision
        if new_timestamp < c_rev.prev_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is before previous the time of"
                  + " revision %s (%s):")
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, new_timestamp,
                                        c_rev.prev_rev, c_rev.prev_timestamp))
          # If resyncing our rev to c_rev.prev_timestamp + 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted resync time, then sync back to c_rev.prev_timestamp
          # + 1...
          if ((c_rev.prev_timestamp + 1) - new_timestamp) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.prev_timestamp + 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          else:
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)
            continue

        # If the new timestamp is later than that of our next revision
        elif c_rev.next_timestamp and new_timestamp > c_rev.next_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is after time of next"
                  + " revision %s (%s):")
          # Report the *next* revision here: it is the one whose
          # timestamp (c_rev.next_timestamp) is being compared against.
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, new_timestamp,
                                        c_rev.next_rev, c_rev.next_timestamp))
          # If resyncing our rev to c_rev.next_timestamp - 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted resync time, then sync forward to c_rev.next_timestamp
          # - 1...
          if (new_timestamp - (c_rev.next_timestamp - 1)) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.next_timestamp - 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          else:
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)
            continue

        # Fix for Issue #71: Avoid resyncing two consecutive revisions
        # to the same timestamp.
        elif (new_timestamp == c_rev.prev_timestamp
              or new_timestamp == c_rev.next_timestamp):
          continue

        # adjust the time range. we want the COMMIT_THRESHOLD from the
        # bounds of the earlier/latest commit in this group.
        record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
        record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

        msg = "PASS2 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
              % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                 new_timestamp - c_rev.timestamp)
        Log().write(LOG_VERBOSE, msg)

        c_rev.timestamp = new_timestamp
        tweaked_timestamps_db[c_rev.unique_key()] = new_timestamp

        # stop looking for hits
        break

    output.write(str(c_rev) + "\n")

  # Close explicitly so that everything is flushed to disk before the
  # next pass reads the file, rather than relying on garbage collection.
  output.close()
  Log().write(LOG_QUIET, "Done")
4536
def pass3():
  """Pass 3: sort the clean revisions file into the sorted revisions
  file, and register the sorted file for cleanup."""

  Log().write(LOG_QUIET, "Sorting CVS revisions...")

  clean_revs_path = temp(DATAFILE + CLEAN_REVS_SUFFIX)
  sorted_revs_path = temp(DATAFILE + SORTED_REVS_SUFFIX)
  sort_file(clean_revs_path, sorted_revs_path)

  # The sorted file is registered against pass5.
  Cleanup().register(temp(DATAFILE + SORTED_REVS_SUFFIX), pass5)

  Log().write(LOG_QUIET, "Done")
4543
def pass4():
  """Pass 4: load the sorted revisions into the CVSRevisionDatabase.

  Every line of the sorted revisions file is turned into a
  CVSRevision, logged to the revision database and recorded in the
  statistics.  Unless this is a trunk-only conversion, each revision
  is also logged to a LastSymbolicNameDatabase, which contains the
  last CVSRevision that is a source for each tag or branch."""

  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)

  if Ctx().trunk_only:
    # Use a do-nothing stand-in so that the loop below does not have to
    # test Ctx().trunk_only for every revision.
    class NullLastSymbolicNameDB:
      def log_revision(*args):
        pass
      def create_database(*args):
        pass
    last_sym_name_db = NullLastSymbolicNameDB()
  else:
    Log().write(LOG_QUIET,
        "Finding last CVS revisions for all symbolic names...")
    last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for rev_line in sorted_revs:
    # Strip the trailing newline before parsing.
    c_rev = CVSRevision(Ctx(), rev_line[:-1])
    cvs_revs_db.log_revision(c_rev)
    last_sym_name_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  last_sym_name_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4574
def pass5():
  """Pass 5: generate the SVNCommit <-> CVSRevision mapping databases.

  Each revision in the sorted revisions file is fed to a
  CVSRevisionAggregator; in a trunk-only conversion, revisions that
  are on a branch are skipped.  CVSCommit._commit also calls
  SymbolingsLogger to register CVSRevisions that represent an opening
  or closing for a path on a branch or tag.  See SymbolingsLogger for
  more details."""

  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for rev_line in sorted_revs:
    c_rev = CVSRevision(Ctx(), rev_line[:-1])
    if Ctx().trunk_only and c_rev.branch_name is not None:
      # Branch revisions are ignored when converting only trunk.
      continue
    aggregator.process_revision(c_rev)
  aggregator.flush()

  stats = StatsKeeper()
  stats.set_svn_rev_count(SVNCommit.revnum - 1)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4594
def pass6():
  """Pass 6: sort the symbol openings/closings file.

  Nothing is sorted for a trunk-only conversion."""

  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  trunk_only = Ctx().trunk_only
  if not trunk_only:
    unsorted_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    sorted_path = temp(SYMBOL_OPENINGS_CLOSINGS_SORTED)
    sort_file(unsorted_path, sorted_path)
    # The sorted file is registered against pass8.
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), pass8)

  Log().write(LOG_QUIET, "Done")
4603
def pass7():
  """Pass 7: determine the file offsets for all symbolic names.

  Unless this is a trunk-only conversion, build the symbol offsets
  database from the sorted symbol openings/closings file."""

  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      # readline() (rather than iterating over the file) is used so
      # that tell() reports the exact offset of the line about to be
      # read.
      while 1:
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only the symbol name is needed, but unpacking all three
        # fields also rejects lines that do not have them.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          # First line for this symbol: remember where it starts.
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      # Do not leak the file handle, whether or not parsing succeeded.
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4637
def pass8():
  """Pass 8: replay the SVN commits into the repository mirror.

  The output delegate is a RepositoryDelegate when Ctx().target is
  set and a DumpfileDelegate otherwise; neither is added for a dry
  run.  Commits are fetched from the PersistenceManager by increasing
  revision number until one is missing."""

  svncounter = 2 # Repository initialization is 1.
  repos = SVNRepositoryMirror()
  persistence_manager = PersistenceManager(DB_OPEN_READ)

  if Ctx().target:
    delegate_class = RepositoryDelegate
    start_message = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    start_message = "Starting Subversion Dumpfile."

  if not Ctx().dry_run:
    repos.add_delegate(delegate_class())
  Log().write(LOG_QUIET, start_message)

  repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  svn_commit = persistence_manager.get_svn_commit(svncounter)
  while svn_commit:
    repos.commit(svn_commit)
    svncounter += 1
    svn_commit = persistence_manager.get_svn_commit(svncounter)

  repos.finish()
4662
# The conversion passes, in the order in which they are run.  Pass
# number N (counting from 1) is _passes[N - 1]; len(_passes) is the
# number of the last pass.
_passes = [
  pass1,
  pass2,
  pass3,
  pass4,
  pass5,
  pass6,
  pass7,
  pass8,
  ]
4673
4674
class Ctx:
  """Session state for this run of cvs2svn, such as the run-time
  options.

  This class is a Borg (see
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531):
  all instances share a single attribute dictionary, so a value
  stored through one Ctx() is visible through every other one."""

  __shared_state = { }

  def __init__(self):
    self.__dict__ = self.__shared_state
    if not self.__dict__:
      # The shared state is still empty, so this is the first
      # instantiation: initialize everything to its default.

      # Output destination and working files.
      self.target = None
      self.dumpfile = DUMPFILE
      self.tmpdir = '.'

      # Verbosity.
      self.verbose = 0
      self.quiet = 0

      # Conversion behaviour.
      self.prune = 1
      self.existing_svnrepos = 0
      self.dump_only = 0
      self.dry_run = 0
      self.trunk_only = 0

      # Repository layout.
      self.trunk_base = "trunk"
      self.tags_base = "tags"
      self.branches_base = "branches"

      # Encodings to try, in order.
      self.encoding = ["ascii"]

      # Property handling.
      self.mime_types_file = None
      self.no_default_eol = 0
      self.eol_from_mime_type = 0
      self.keywords_off = 0

      # External tools and miscellaneous options.
      self.use_cvs = None
      self.svnadmin = "svnadmin"
      self.username = None
      self.print_help = 0
      self.skip_cleanup = 0
      self.bdb_txn_nosync = 0
      self.fs_type = None

      # Symbol handling.
      self.forced_branches = []
      self.forced_tags = []
      self.excludes = []
      self.symbol_transforms = []

      # SVNPropertySetter instances.
      self.svn_property_setters = []
4716
4717
class CVSRevisionNumberSetter(SVNPropertySetter):
  """Record the CVS revision number in the cvs2svn:cvs-rev property."""

  def set_properties(self, s_item):
    """Copy S_ITEM's CVS revision number into its properties and flag
    the properties as changed."""

    cvs_rev_number = s_item.c_rev.rev
    s_item.svn_props['cvs2svn:cvs-rev'] = cvs_rev_number
    s_item.svn_props_changed = True
4724
4725
class MimeMapper(SVNPropertySetter):
  """A class that provides mappings from file names to MIME types."""

  def __init__(self, mime_types_file):
    """Read the mappings from MIME_TYPES_FILE.

    Each non-comment line holds a MIME type followed by the
    whitespace-separated extensions that map to it.  If an extension
    is listed for two different types, a warning is written to stderr
    and the later type wins."""

    self.mappings = { }

    # Use a private FileInput instance, like the rest of this file,
    # instead of the module-level fileinput.input(): the latter keeps
    # global state and fails if another fileinput.input() is active.
    mime_file = fileinput.FileInput(mime_types_file)
    try:
      for line in mime_file:
        if line.startswith("#"):
          continue

        # format of a line is something like
        # text/plain c h cpp
        extensions = line.split()
        if len(extensions) < 2:
          # Blank line, or a type without any extensions.
          continue
        mime_type = extensions.pop(0)
        for ext in extensions:
          if self.mappings.has_key(ext) and self.mappings[ext] != mime_type:
            sys.stderr.write(
                "%s: ambiguous MIME mapping for *.%s (%s or %s)\n"
                % (warning_prefix, ext, self.mappings[ext], mime_type))
          self.mappings[ext] = mime_type
    finally:
      mime_file.close()

  def set_properties(self, s_item):
    """Set svn:mime-type on S_ITEM if its file name has a known
    mapping; otherwise leave S_ITEM's properties alone."""

    basename, extension = os.path.splitext(
        os.path.basename(s_item.c_rev.cvs_path)
        )

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename

    mime_type = self.mappings.get(extension, None)
    if mime_type is not None:
      s_item.svn_props['svn:mime-type'] = mime_type
4766
4767
class BinaryFileDefaultMimeTypeSetter(SVNPropertySetter):
  """Give binary files a fallback mime type when none has been set."""

  def set_properties(self, s_item):
    """Set svn:mime-type to 'application/octet-stream' if S_ITEM has
    no mime type yet and its CVS mode is 'b'."""

    if s_item.svn_props.has_key('svn:mime-type'):
      # A mime type is already known; leave it alone.
      return
    if s_item.c_rev.mode == 'b':
      s_item.svn_props['svn:mime-type'] = 'application/octet-stream'
4775
4776
class BinaryFileEOLStyleSetter(SVNPropertySetter):
  """Force svn:eol-style to None for files whose CVS mode is 'b'."""

  def set_properties(self, s_item):
    """Store None as S_ITEM's svn:eol-style if it is a binary file."""

    is_binary = (s_item.c_rev.mode == 'b')
    if is_binary:
      s_item.svn_props['svn:eol-style'] = None
4783
4784
class EOLStyleFromMimeTypeSetter(SVNPropertySetter):
  """Derive svn:eol-style from svn:mime-type when it is still unset.

  The mime type must already have been set by an earlier property
  setter for this one to have any effect.  See also issue #39."""

  def set_properties(self, s_item):
    """Set svn:eol-style to 'native' for "text/" mime types and to
    None for any other known mime type.  Do nothing if the eol-style
    is already set or the mime type is unknown."""

    props = s_item.svn_props
    if props.has_key('svn:eol-style'):
      return

    mime_type = props.get('svn:mime-type', None)
    if mime_type is None:
      return

    if mime_type.startswith("text/"):
      eol_style = 'native'
    else:
      eol_style = None
    props['svn:eol-style'] = eol_style
4798
4799
class DefaultEOLStyleSetter(SVNPropertySetter):
  """Supply a default svn:eol-style for items that do not have one."""

  def __init__(self, value):
    """Remember VALUE as the default eol-style."""

    self.value = value

  def set_properties(self, s_item):
    """Set S_ITEM's svn:eol-style to the default value, unless an
    eol-style has already been set."""

    if s_item.svn_props.has_key('svn:eol-style'):
      return
    s_item.svn_props['svn:eol-style'] = self.value
4811
4812
class KeywordsPropertySetter(SVNPropertySetter):
  """Set svn:keywords depending on the file's CVS mode.  See
  issue #2."""

  def __init__(self, value):
    """Remember VALUE, the svn:keywords value to use when the
    property is set."""

    self.value = value

  def set_properties(self, s_item):
    """Set S_ITEM's svn:keywords if it is not set yet and the CVS
    mode is None, 'kv' or 'kvl'."""

    if s_item.svn_props.has_key('svn:keywords'):
      return
    if s_item.c_rev.mode in (None, 'kv', 'kvl'):
      s_item.svn_props['svn:keywords'] = self.value
4827
4828
class ExecutablePropertySetter(SVNPropertySetter):
  """Set svn:executable according to c_rev.file_executable."""

  def set_properties(self, s_item):
    """Set S_ITEM's svn:executable to '*' if its CVS revision is
    marked as executable."""

    executable = s_item.c_rev.file_executable
    if executable:
      s_item.svn_props['svn:executable'] = '*'
4835
4836
def convert(start_pass, end_pass):
  """Convert a CVS repository to an SVN repository.

  Run passes START_PASS through END_PASS (both inclusive, counted
  from 1).  After each pass, record its duration, discard the
  pass-local attributes of Ctx(), and clean up temporary files unless
  Ctx().skip_cleanup is set.  Finally log the statistics and timings."""

  cleanup = Cleanup()

  # timestamps[k] is the time at which pass number k finished;
  # timestamps[start_pass - 1] is the time at which the run started.
  timestamps = [ None ] * (end_pass + 1)
  timestamps[start_pass - 1] = time.time()
  StatsKeeper().set_start_time(time.time())

  def is_pass_local(attr):
    "Return true if ATTR starts with exactly one underscore."
    if len(attr) <= 2:
      return 0
    if attr[0] != '_' or attr[1] == '_':
      return 0
    # Name-mangled private class attributes must survive.
    return attr[:6] != "_Ctx__"

  for pass_number in range(start_pass, end_pass + 1):
    Log().write(LOG_QUIET, '----- pass %d -----' % pass_number)
    pass_function = _passes[pass_number - 1]
    pass_function()

    timestamps[pass_number] = time.time()
    elapsed = timestamps[pass_number] - timestamps[pass_number - 1]
    StatsKeeper().log_duration_for_pass(elapsed, pass_number)

    # Dispose of items in Ctx() not intended to live past the end of
    # the pass.
    for attr in dir(Ctx()):
      if is_pass_local(attr):
        delattr(Ctx(), attr)

    if not Ctx().skip_cleanup:
      cleanup.cleanup(pass_function)
    StatsKeeper().set_end_time(time.time())

  Log().write(LOG_QUIET, StatsKeeper())
  if end_pass < 4:
    Log().write(LOG_QUIET,
                '(These are unaltered CVS repository stats and do not\n'
                ' reflect tags or branches excluded via --exclude)\n')
  Log().write(LOG_NORMAL, StatsKeeper().timings())
4865
4866
def normalize_ttb_path(opt, path):
  """Normalize PATH, the value given for option OPT (one of --trunk,
  --tags, or --branches).

  PATH is split on '/' and its components are rejoined with
  _path_join().  Return the normalized path; raise FatalError, naming
  OPT, if the result is empty."""

  components = path.split('/')
  norm_path = _path_join(*components)
  if norm_path:
    return norm_path
  raise FatalError("cannot pass an empty path to %s." % (opt,))
4881
4882
def verify_paths_disjoint(*paths):
  """Check that no path in PATHS equals, or lies inside, another one.

  Nesting is meant in the sense that 'a/b/c/d' is nested in 'a/b'.
  Raise FatalError naming an offending pair if the paths are not
  disjoint; otherwise return None."""

  # Pair every path with its list of components, so that sorting
  # compares component by component rather than character by character.
  decorated = []
  for path in paths:
    decorated.append((path.split('/'), path))

  # A list that is a prefix of another sorts before it, and nothing
  # unrelated can sort in between.  So if any paths are nested, at
  # least one such pair ends up adjacent, in the order [nest, nestling].
  decorated.sort()

  neighbours = zip(decorated[:-1], decorated[1:])
  for (outer_parts, outer), (inner_parts, inner) in neighbours:
    # The slice can only equal OUTER_PARTS if INNER_PARTS is at least
    # as long, so no separate length test is required.
    if inner_parts[:len(outer_parts)] == outer_parts:
      raise FatalError("paths %s and %s are not disjoint." % (outer, inner,))
4902
4903
4904 def usage():
4905   print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
4906         % os.path.basename(sys.argv[0])
4907   print '  --help, -h           print this usage message and exit with success'
4908   print '  --version            print the version number'
4909   print '  -q                   quiet'
4910   print '  -v                   verbose'
4911   print '  -s PATH              path for SVN repos'
4912   print '  -p START[:END]       start at pass START, end at pass END of %d' \
4913         % len(_passes)
4914   print '                       If only START is given, run only pass START'
4915   print '                       (implicitly enables --skip-cleanup)'
4916   print '  --existing-svnrepos  load into existing SVN repository'
4917   print '  --dumpfile=PATH      name of intermediate svn dumpfile'
4918   print '  --tmpdir=PATH        directory to use for tmp data (default to cwd)'
4919   print '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)'
4920   print '  --dry-run            do not create a repository or a dumpfile;'
4921   print '                       just print what would happen.'
4922   print '  --use-cvs            use CVS instead of RCS \'co\' to extract data'
4923   print '                       (only use this if having problems with RCS)'
4924   print '  --svnadmin=PATH      path to the svnadmin program'
4925   print '  --trunk-only         convert only trunk commits, not tags nor branches'
4926   print '  --trunk=PATH         path for trunk (default: %s)'    \
4927         % Ctx().trunk_base
4928   print '  --branches=PATH      path for branches (default: %s)' \
4929         % Ctx().branches_base
4930   print '  --tags=PATH          path for tags (default: %s)'     \
4931         % Ctx().tags_base
4932   print '  --no-prune           don\'t prune empty directories'
4933   print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
4934   print '  --encoding=ENC       encoding of paths and log messages in CVS repos'
4935   print '                       Multiple of these options may be passed, where they'
4936   print '                       will be treated as an ordered list of encodings to'
4937   print '                       attempt (with "ascii" as a hardcoded last resort)'
4938   print '  --force-branch=NAME  force NAME to be a branch'
4939   print '  --force-tag=NAME     force NAME to be a tag'
4940   print '  --exclude=REGEXP     exclude branches and tags matching REGEXP'
4941   print '  --symbol-transform=P:S transform symbol names from P to S where P and S'
4942   print '                       use Python regexp and reference syntax respectively'
4943   print '  --username=NAME      username for cvs2svn-synthesized commits'
4944   print '  --skip-cleanup       prevent the deletion of intermediate files'
4945   print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
4946   print '  --fs-type=TYPE       pass --fs-type=TYPE to "svnadmin create"'
4947   print '  --cvs-revnums        record CVS revision numbers as file properties'
4948   print '  --mime-types=FILE    specify an apache-style mime.types file for'
4949   print '                       setting svn:mime-type'
4950   print '  --eol-from-mime-type set svn:eol-style from mime type if known'
4951   print '  --no-default-eol     don\'t set svn:eol-style to \'native\' for'
4952   print '                       non-binary files with undetermined mime types'
4953   print '  --keywords-off       don\'t set svn:keywords on any files (by default,'
4954   print '                       cvs2svn sets svn:keywords on non-binary files to'
4955   print '                       "%s")' % SVN_KEYWORDS_VALUE
4956
4957 def main():
4958   # Convenience var, so we don't have to keep instantiating this Borg.
4959   ctx = Ctx()
4960
4961   profiling = None
4962   start_pass = 1
4963   end_pass = len(_passes)
4964
4965   try:
4966     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
4967                                [ "help", "create", "trunk=",
4968                                  "username=", "existing-svnrepos",
4969                                  "branches=", "tags=", "encoding=",
4970                                  "force-branch=", "force-tag=", "exclude=",
4971                                  "use-cvs", "mime-types=",
4972                                  "eol-from-mime-type", "no-default-eol",
4973                                  "trunk-only", "no-prune", "dry-run",
4974                                  "dump-only", "dumpfile=", "tmpdir=",
4975                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
4976                                  "bdb-txn-nosync", "fs-type=",
4977                                  "version", "profile",
4978                                  "keywords-off", "symbol-transform="])
4979   except getopt.GetoptError, e:
4980     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
4981     usage()
4982     sys.exit(1)
4983
4984   for opt, value in opts:
4985     if opt == '--version':
4986         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
4987         sys.exit(0)
4988     elif opt == '-p':
4989       # Don't cleanup if we're doing incrementals.
4990       ctx.skip_cleanup = 1
4991       if value.find(':') > 0:
4992         start_pass, end_pass = map(int, value.split(':'))
4993       else:
4994         end_pass = start_pass = int(value)
4995       if start_pass > len(_passes) or start_pass < 1:
4996         raise FatalError(
4997             'illegal value (%d) for starting pass.  Must be 1 through %d.'
4998             % (int(start_pass), len(_passes),))
4999       if end_pass < start_pass or end_pass > len(_passes):
5000         raise FatalError(
5001             'illegal value (%d) for ending pass.  Must be %d through %d.'
5002             % (int(end_pass), int(start_pass), len(_passes),))
5003     elif (opt == '--help') or (opt == '-h'):
5004       ctx.print_help = 1
5005     elif opt == '-v':
5006       Log().log_level = LOG_VERBOSE
5007       ctx.verbose = 1
5008     elif opt == '-q':
5009       Log().log_level = LOG_QUIET
5010       ctx.quiet = 1
5011     elif opt == '-s':
5012       ctx.target = value
5013     elif opt == '--existing-svnrepos':
5014       ctx.existing_svnrepos = 1
5015     elif opt == '--dumpfile':
5016       ctx.dumpfile = value
5017     elif opt == '--tmpdir':
5018       ctx.tmpdir = value
5019     elif opt == '--use-cvs':
5020       ctx.use_cvs = 1
5021     elif opt == '--svnadmin':
5022       ctx.svnadmin = value
5023     elif opt == '--trunk-only':
5024       ctx.trunk_only = 1
5025     elif opt == '--trunk':
5026       ctx.trunk_base = normalize_ttb_path(opt, value)
5027     elif opt == '--branches':
5028       ctx.branches_base = normalize_ttb_path(opt, value)
5029     elif opt == '--tags':
5030       ctx.tags_base = normalize_ttb_path(opt, value)
5031     elif opt == '--no-prune':
5032       ctx.prune = None
5033     elif opt == '--dump-only':
5034       ctx.dump_only = 1
5035     elif opt == '--dry-run':
5036       ctx.dry_run = 1
5037     elif opt == '--encoding':
5038       ctx.encoding.insert(-1, value)
5039     elif opt == '--force-branch':
5040       ctx.forced_branches.append(value)
5041     elif opt == '--force-tag':
5042       ctx.forced_tags.append(value)
5043     elif opt == '--exclude':
5044       try:
5045         ctx.excludes.append(re.compile('^' + value + '$'))
5046       except re.error, e:
5047         raise FatalError("'%s' is not a valid regexp." % (value,))
5048     elif opt == '--mime-types':
5049       ctx.mime_types_file = value
5050     elif opt == '--eol-from-mime-type':
5051       ctx.eol_from_mime_type = 1
5052     elif opt == '--no-default-eol':
5053       ctx.no_default_eol = 1
5054     elif opt == '--keywords-off':
5055       ctx.keywords_off = 1
5056     elif opt == '--username':
5057       ctx.username = value
5058     elif opt == '--skip-cleanup':
5059       ctx.skip_cleanup = 1
5060     elif opt == '--cvs-revnums':
5061       ctx.svn_property_setters.append(CVSRevisionNumberSetter())
5062     elif opt == '--bdb-txn-nosync':
5063       ctx.bdb_txn_nosync = 1
5064     elif opt == '--fs-type':
5065       ctx.fs_type = value
5066     elif opt == '--create':
5067       sys.stderr.write(warning_prefix +
5068           ': The behaviour produced by the --create option is now the '
5069           'default,\nand passing the option is deprecated.\n')
5070     elif opt == '--profile':
5071       profiling = 1
5072     elif opt == '--symbol-transform':
5073       [pattern, replacement] = value.split(":")
5074       try:
5075         pattern = re.compile(pattern)
5076       except re.error, e:
5077         raise FatalError("'%s' is not a valid regexp." % (pattern,))
5078       ctx.symbol_transforms.append((pattern, replacement,))
5079
5080   if ctx.print_help:
5081     usage()
5082     sys.exit(0)
5083
5084   # Consistency check for options and arguments.
5085   if len(args) == 0:
5086     usage()
5087     sys.exit(1)
5088
5089   if len(args) > 1:
5090     sys.stderr.write(error_prefix +
5091                      ": must pass only one CVS repository.\n")
5092     usage()
5093     sys.exit(1)
5094
5095   cvsroot = args[0]
5096
5097   if ctx.use_cvs:
5098     ctx.cvs_repository = CVSRepositoryViaCVS(cvsroot)
5099   else:
5100     ctx.cvs_repository = CVSRepositoryViaRCS(cvsroot)
5101
5102   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
5103     raise FatalError("must pass one of '-s' or '--dump-only'.")
5104
def not_both(opt1val, opt1name, opt2val, opt2name):
  """Reject a pair of mutually exclusive command-line options.

  OPT1VAL and OPT2VAL are the values recorded for the two options;
  OPT1NAME and OPT2NAME are the option spellings used in the error
  message.  Raise FatalError if both values are true; otherwise do
  nothing."""
  if not (opt1val and opt2val):
    # At most one of the two options was given; nothing to complain about.
    return
  message = "cannot pass both '%s' and '%s'." % (opt1name, opt2name)
  raise FatalError(message)
5109
5110   not_both(ctx.target, '-s',
5111            ctx.dump_only, '--dump-only')
5112
5113   not_both(ctx.dump_only, '--dump-only',
5114            ctx.existing_svnrepos, '--existing-svnrepos')
5115
5116   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
5117            ctx.existing_svnrepos, '--existing-svnrepos')
5118
5119   not_both(ctx.dump_only, '--dump-only',
5120            ctx.bdb_txn_nosync, '--bdb-txn-nosync')
5121
5122   not_both(ctx.quiet, '-q',
5123            ctx.verbose, '-v')
5124
5125   not_both(ctx.fs_type, '--fs-type',
5126            ctx.existing_svnrepos, '--existing-svnrepos')
5127
5128   if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
5129     raise FatalError("cannot pass --bdb-txn-nosync with --fs-type=%s."
5130                      % ctx.fs_type)
5131
5132   # Create the default project (using ctx.trunk, ctx.branches, and ctx.tags):
5133   ctx.project = Project(ctx.cvs_repository.cvs_repos_path,
5134                         ctx.trunk_base, ctx.branches_base, ctx.tags_base)
5135
5136   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
5137     raise FatalError("the svn-repos-path '%s' is not an "
5138                      "existing directory." % ctx.target)
5139
5140   if not ctx.dump_only and not ctx.existing_svnrepos \
5141      and (not ctx.dry_run) and os.path.exists(ctx.target):
5142     raise FatalError("the svn-repos-path '%s' exists.\n"
5143                      "Remove it, or pass '--existing-svnrepos'."
5144                      % ctx.target)
5145
5146   if ctx.target and not ctx.dry_run:
5147     # Verify that svnadmin can be executed.  The 'help' subcommand
5148     # should be harmless.
5149     try:
5150       check_command_runs([ctx.svnadmin, 'help'], 'svnadmin')
5151     except CommandFailedException, e:
5152       raise FatalError(
5153           '%s\n'
5154           'svnadmin could not be executed.  Please ensure that it is\n'
5155           'installed and/or use the --svnadmin option.' % (e,))
5156
5157   if ctx.mime_types_file:
5158     ctx.svn_property_setters.append(MimeMapper(ctx.mime_types_file))
5159
5160   ctx.svn_property_setters.append(BinaryFileDefaultMimeTypeSetter())
5161   ctx.svn_property_setters.append(BinaryFileEOLStyleSetter())
5162
5163   if ctx.eol_from_mime_type:
5164     ctx.svn_property_setters.append(EOLStyleFromMimeTypeSetter())
5165
5166   if ctx.no_default_eol:
5167     ctx.svn_property_setters.append(DefaultEOLStyleSetter(None))
5168   else:
5169     ctx.svn_property_setters.append(DefaultEOLStyleSetter('native'))
5170
5171   if not ctx.keywords_off:
5172     ctx.svn_property_setters.append(
5173         KeywordsPropertySetter(SVN_KEYWORDS_VALUE))
5174
5175   ctx.svn_property_setters.append(ExecutablePropertySetter())
5176
5177   # Make sure the tmp directory exists.  Note that we don't check if
5178   # it's empty -- we want to be able to use, for example, "." to hold
5179   # tempfiles.  But if we *did* want check if it were empty, we'd do
5180   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
5181   if not os.path.exists(ctx.tmpdir):
5182     os.mkdir(ctx.tmpdir)
5183   elif not os.path.isdir(ctx.tmpdir):
5184     raise FatalError(
5185         "cvs2svn tried to use '%s' for temporary files, but that path\n"
5186         "  exists and is not a directory.  Please make it be a directory,\n"
5187         "  or specify some other directory for temporary files."
5188         % (ctx.tmpdir,))
5189
5190   # But do lock the tmpdir, to avoid process clash.
5191   try:
5192     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5193   except OSError, e:
5194     if e.errno == errno.EACCES:
5195       raise FatalError("Permission denied:"
5196                        + " No write access to directory '%s'." % ctx.tmpdir)
5197     if e.errno == errno.EEXIST:
5198       raise FatalError(
5199           "cvs2svn is using directory '%s' for temporary files, but\n"
5200           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
5201           "  cvs2svn process is currently using '%s' as its temporary\n"
5202           "  workspace.  If you are certain that is not the case,\n"
5203           "  then remove the '%s/cvs2svn.lock' subdirectory."
5204           % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir,))
5205     raise
5206   try:
5207     if profiling:
5208       import hotshot
5209       prof = hotshot.Profile('cvs2svn.hotshot')
5210       prof.runcall(convert, start_pass, end_pass)
5211       prof.close()
5212     else:
5213       convert(start_pass, end_pass)
5214   finally:
5215     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5216     except: pass
5217
5218
# Script entry point: run main(), and turn a FatalException into its
# message on stderr followed by exit status 1.
if __name__ == '__main__':
  try:
    main()
  except FatalException:
    # Pick up the exception being handled from sys.exc_info() instead
    # of binding it in the except clause.
    fatal = sys.exc_info()[1]
    sys.stderr.write(str(fatal))
    sys.exit(1)
5224     sys.exit(1)
5225
5226