cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
  20 VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 import cvs2svn_rcsparse
  23 import os
  24 import sys
  25 import sha
  26 import re
  27 import time
  28 import fileinput
  29 import fnmatch
  30 import string
  31 import getopt
  32 import stat
  33 import md5
  34 import marshal
  35 import errno
  36 import popen2
  37 import types
  38 import ConfigParser
  39 import UserDict
  40 try:
  41   # Try to get access to a bunch of encodings for use with --encoding.
  42   # See http://cjkpython.i18n.org/ for details.
  43   import iconv_codec
  44 except ImportError:
  45   pass
  46
  47 # Warnings and errors start with these strings.  They are typically
  48 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  49 warning_prefix = "WARNING"
  50 error_prefix = "ERROR"
  51
  52 # Make sure this Python is recent enough.
  53 if sys.hexversion < 0x02020000:
  54   sys.stderr.write("'%s: Python 2.2 or higher required, "
  55                    "see www.python.org.\n" % error_prefix)
  56   sys.exit(1)
  57
  58 # Pretend we have true booleans on older python versions
  59 try:
  60   True
  61 except:
  62   True = 1
  63   False = 0
  64
# Opening pipes was a mess before Python 2.4, because some methods did
# not exist on some platforms, and some behaved differently on others.
# Python 2.4 solved this by adding the subprocess module, but since we
# cannot require such a new version, we cannot use it directly, but
# must implement a simplified Popen using the best means available.
  70 #
  71 # The SimplePopen class only has the following members and methods, all
  72 # behaving as documented in the subprocess.Popen class:
  73 #     - stdin
  74 #     - stdout
  75 #     - stderr
  76 #     - wait
  77 try:
  78   # First try subprocess.Popen...
  79   import subprocess
  80   class SimplePopen:
  81     def __init__(self, cmd, capture_stderr):
  82       if capture_stderr:
  83         stderr = subprocess.PIPE
  84       else:
  85         stderr = None
  86       self._popen = subprocess.Popen(cmd, stdin=subprocess.PIPE,
  87                                     stdout=subprocess.PIPE, stderr=stderr)
  88       self.stdin = self._popen.stdin
  89       self.stdout = self._popen.stdout
  90       if capture_stderr:
  91         self.stderr = self._popen.stderr
  92       self.wait = self._popen.wait
  93 except ImportError:
  94   if hasattr(popen2, 'Popen3'):
  95     # ...then try popen2.Popen3...
  96     class SimplePopen:
  97       def __init__(self, cmd, capture_stderr):
  98         self._popen3 = popen2.Popen3(cmd, capture_stderr)
  99         self.stdin = self._popen3.tochild
 100         self.stdout = self._popen3.fromchild
 101         if capture_stderr:
 102           self.stderr = self._popen3.childerr
 103         self.wait = self._popen3.wait
 104   else:
 105     # ...and if all fails, use popen2.popen3...
 106     class SimplePopen:
 107       def __init__(self, cmd, capture_stderr):
 108         if type(cmd) != types.StringType:
 109           cmd = argv_to_command_string(cmd)
 110         self.stdout, self.stdin, self.stderr = popen2.popen3(cmd, mode='b')
 111       def wait(self):
 112         return self.stdout.close() or self.stdin.close() or \
 113                self.stderr.close()
 114
# DBM module selection
#
# The three steps below run at import time and decide which DBM
# implementation anydbm will use for the rest of the program.

# 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.  If anydbm's default
#    module is one of them, print an error and exit with status 1.
import anydbm
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  sys.stderr.write(
    error_prefix
    + ': your installation of Python does not contain a suitable\n'
    + 'DBM module -- cvs2svn cannot continue.\n'
    + 'See http://python.org/doc/current/lib/module-anydbm.html to solve.\n')
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try to use gdbm instead.
#    Unfortunately, gdbm appears not to be trouble free, either.
#    (A bsddb module without a __version__ attribute is taken to be the
#    old one.  If gdbm cannot be imported we only warn and carry on with
#    the module already selected.)
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found '
        'on your computer has been reported to malfunction on some datasets, '
        'causing KeyError exceptions. You may wish to upgrade your Python to '
        'version 2.3 or later.\n')
  else:
    anydbm._defaultmod = gdbm
 150
# Matches a number with exactly two dot-separated components, e.g. '1.7'.
trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
# Matches a number that ends in '.0.N', e.g. '1.3.0.2'.
branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
# Matches a number with exactly three dot-separated components, e.g. '1.1.1'.
vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')

# NOTE(review): presumably the value given to the svn:keywords property;
# confirm against the code that uses it.
SVN_KEYWORDS_VALUE = 'Author Date Id Revision'

# This really only matches standard '1.1.1.*'-style vendor revisions.
# One could conceivably have a file whose default branch is 1.1.3 or
# whatever, or was that at some point in time, with vendor revisions
# 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
# is the only time this regexp gets used), we'd have no basis for
# assuming that the non-standard vendor branch had ever been the
# default branch anyway, so we don't want this to match them anyway.
#
# Note that the '+' is outside the second group, so group 2 captures
# only the last digit of the final component, not the whole number.
vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 165
 166 # If this run's output is a repository, then (in the tmpdir) we use
 167 # a dumpfile of this name for repository loads.
 168 #
 169 # If this run's output is a dumpfile, then this is default name of
 170 # that dumpfile, but in the current directory (unless the user has
 171 # specified a dumpfile path, of course, in which case it will be
 172 # wherever the user said).
 173 DUMPFILE = 'cvs2svn-dump'
 174
 175 # This file appears with different suffixes at different stages of
 176 # processing.  CVS revisions are cleaned and sorted here, for commit
 177 # grouping.  See design-notes.txt for details.
 178 DATAFILE = 'cvs2svn-data'
 179
 180 # This file contains a marshalled copy of all the statistics that we
# gather throughout the various runs of cvs2svn.  The data is stored as
# a marshalled dictionary.
 183 STATISTICS_FILE = 'cvs2svn-statistics'
 184
 185 # This text file contains records (1 per line) that describe svn
 186 # filesystem paths that are the opening and closing source revisions
 187 # for copies to tags and branches.  The format is as follows:
 188 #
 189 # SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
 190 #
 191 # Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
 192 # SVN_REVNUM are the primary and secondary sorting criteria for
 193 # creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
 194 SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
 195 # A sorted version of the above file.
 196 SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'
 197
 198 # This file is a temporary file for storing symbolic_name -> closing
 199 # CVSRevision until the end of our pass where we can look up the
 200 # corresponding SVNRevNum for the closing revs and write these out to
 201 # the SYMBOL_OPENINGS_CLOSINGS.
 202 SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'
 203
 204 # Skeleton version of an svn filesystem.
 205 # (These supersede and will eventually replace the two above.)
 206 # See class SVNRepositoryMirror for how these work.
 207 SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
 208 SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'
 209
 210 # Offsets pointing to the beginning of each SYMBOLIC_NAME in
 211 # SYMBOL_OPENINGS_CLOSINGS_SORTED
 212 SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'
 213
 214 # Maps CVSRevision.unique_key()s to lists of symbolic names, where
 215 # the CVSRevision is the last such that is a source for those symbolic
 216 # names.  For example, if branch B's number is 1.3.0.2 in this CVS
 217 # file, and this file's 1.3 is the latest (by date) revision among
 218 # *all* CVS files that is a source for branch B, then the
 219 # CVSRevision.unique_key() corresponding to this file at 1.3 would
 220 # list at least B in its list.
 221 SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'
 222
 223 # Maps CVSRevision.unique_key() to corresponding line in s-revs.
 224 ###PERF Or, we could map to an offset into s-revs, instead of dup'ing
 225 ### the s-revs data in this database.
 226 CVS_REVS_DB = 'cvs2svn-cvs-revs.db'
 227
 228 # Lists all symbolic names that are tags.  Keys are strings (symbolic
 229 # names), values are ignorable.
 230 TAGS_DB = 'cvs2svn-tags.db'
 231
# A list of all tags.  Each line consists of the tag name and the number
 233 # of files in which it exists, separated by a space.
 234 TAGS_LIST = 'cvs2svn-tags.txt'
 235
 236 # A list of all branches.  The file is stored as a plain text file
 237 # to make it easy to look at in an editor.  Each line contains the
 238 # branch name, the number of files where the branch is created, the
 239 # commit count, and a list of tags and branches that are defined on
 240 # revisions in the branch.
 241 BRANCHES_LIST = 'cvs2svn-branches.txt'
 242
 243 # These two databases provide a bidirectional mapping between
 244 # CVSRevision.unique_key()s and Subversion revision numbers.
 245 #
 246 # The first maps CVSRevision.unique_key() to a number; the values are
 247 # not unique.
 248 #
 249 # The second maps a number to a list of CVSRevision.unique_key()s.
 250 CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
 251 SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'
 252
 253 # This database maps svn_revnums to tuples of (symbolic_name, date).
 254 #
 255 # The svn_revnums are the revision numbers of all non-primary
 256 # SVNCommits.  No primary SVNCommit has a key in this database.
 257 #
 258 # The date is stored for all commits in this database.
 259 #
 260 # For commits that fill symbolic names, the symbolic_name is stored.
# For commits that are default branch syncs, the symbolic_name is None.
 262 SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'
 263
 264 # This database maps svn_revnums of a default branch synchronization
 265 # commit to the svn_revnum of the primary SVNCommit that motivated it.
 266 #
 267 # (NOTE: Secondary commits that fill branches and tags also have a
 268 # motivating commit, but we do not record it because it is (currently)
 269 # not needed for anything.)
 270 #
 271 # This mapping is used when generating the log message for the commit
 272 # that synchronizes the default branch with trunk.
 273 MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'
 274
 275 # How many bytes to read at a time from a pipe.  128 kiB should be
 276 # large enough to be efficient without wasting too much memory.
 277 PIPE_READ_SIZE = 128 * 1024
 278
 279 # Record the default RCS branches, if any, for CVS filepaths.
 280 #
 281 # The keys are CVS filepaths, relative to the top of the repository
 282 # and with the ",v" stripped off, so they match the cvs paths used in
 283 # Commit.commit().  The values are vendor branch revisions, such as
 284 # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
 285 # represents the highest vendor branch revision thought to have ever
 286 # been head of the default branch.
 287 #
 288 # The reason we record a specific vendor revision, rather than a
 289 # default branch number, is that there are two cases to handle:
 290 #
 291 # One case is simple.  The RCS file lists a default branch explicitly
 292 # in its header, such as '1.1.1'.  In this case, we know that every
 293 # revision on the vendor branch is to be treated as head of trunk at
 294 # that point in time.
 295 #
 296 # But there's also a degenerate case.  The RCS file does not currently
 297 # have a default branch, yet we can deduce that for some period in the
 298 # past it probably *did* have one.  For example, the file has vendor
 299 # revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
 300 # and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
 301 # case, we should record 1.1.1.96 as the last vendor revision to have
 302 # been the head of the default branch.
 303 DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'
 304
 305 # Records the author and log message for each changeset.
 306 # The keys are author+log digests, the same kind used to identify
 307 # unique revisions in the .revs, etc files.  Each value is a tuple
 308 # of two elements: '(author logmessage)'.
 309 METADATA_DB = "cvs2svn-metadata.db"
 310
 311 # A temporary on-disk hash that maps CVSRevision unique keys to a new
 312 # timestamp for that CVSRevision.  These new timestamps are created in
 313 # pass2, and this hash is used exclusively in pass2.
 314 TWEAKED_TIMESTAMPS_DB = "cvs2svn-fixed-timestamps.db"
 315
 316 REVS_SUFFIX = '.revs'
 317 CLEAN_REVS_SUFFIX = '.c-revs'
 318 SORTED_REVS_SUFFIX = '.s-revs'
 319 RESYNC_SUFFIX = '.resync'
 320
# NOTE(review): presumably a sentinel meaning "no Subversion revision";
# confirm against its users.
SVN_INVALID_REVNUM = -1

COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP   = '-'
OP_ADD    = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY    = 'E'

# sha.digestsize * 2 is the length of a hex-encoded SHA digest.
# NOTE(review): the 9 is presumably the width of a fixed-size field (plus
# separator) that precedes the digest in each record; confirm against the
# code that writes and slices those records.
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'
 340
 341 class FatalException(Exception):
 342   """Exception thrown on a non-recoverable error.
 343
 344   If this exception is thrown by main(), it is caught by the global
 345   layer of the program, its string representation is printed, and the
 346   program is ended with an exit code of 1."""
 347
 348   pass
 349
 350
 351 class FatalError(FatalException):
 352   """A FatalException that prepends error_prefix to the message."""
 353
 354   def __init__(self, msg):
 355     """Use (error_prefix + ': ' + MSG + '\n') as the error message."""
 356
 357     FatalException.__init__(self, '%s: %s\n' % (error_prefix, msg,))
 358
 359
 360 def temp(basename):
 361   """Return a path to BASENAME in Ctx().tmpdir.
 362   This is a convenience function to save horizontal space in source."""
 363   return os.path.join(Ctx().tmpdir, basename)
 364
 365 # Since the unofficial set also includes [/\] we need to translate those
 366 # into ones that don't conflict with Subversion limitations.
 367 def _clean_symbolic_name(name):
 368   """Return symbolic name NAME, translating characters that Subversion
 369   does not allow in a pathname."""
 370   name = name.replace('/','++')
 371   name = name.replace('\\','--')
 372   return name
 373
 374 def _path_join(*components):
 375   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 376   Empty component are skipped."""
 377   return string.join(filter(None, components), '/')
 378
 379 def _path_split(path):
 380   """Split the svn pathname PATH into a pair, (HEAD, TAIL).
 381
 382   This is similar to os.path.split(), but always uses '/' as path
 383   separator.  PATH is an svn path, which should not start with a '/'.
 384   HEAD is everything before the last slash, and TAIL is everything
 385   after.  If PATH ends in a slash, TAIL will be empty.  If there is no
 386   slash in PATH, HEAD will be empty.  If PATH is empty, both HEAD and
 387   TAIL are empty."""
 388
 389   pos = path.rfind('/')
 390   if pos == -1:
 391     return ('', path,)
 392   else:
 393     return (path[:pos], path[pos+1:],)
 394
 395 def to_utf8(value, mode='replace'):
 396   """Encode (as Unicode) VALUE, trying the encodings in Ctx.encoding
 397   as valid source encodings.  Raise UnicodeError on failure of all
 398   source encodings."""
 399   ### FIXME: The 'replace' default mode should be an option,
 400   ### like --encoding is.
 401   for encoding in Ctx().encoding:
 402     try:
 403       return unicode(value, encoding, mode).encode('utf8')
 404     except UnicodeError:
 405       Log().write(LOG_VERBOSE, "Encoding '%s' failed for string '%s'"
 406                   % (encoding, value))
 407   raise UnicodeError
 408
 409 def run_command(command):
 410   if os.system(command):
 411     raise FatalError('Command failed: "%s"' % (command,))
 412
 413
 414 class CommandFailedException(Exception):
 415   """Exception raised if check_command_runs() fails."""
 416
 417   pass
 418
 419
 420 def check_command_runs(cmd, cmdname):
 421   """Check whether the command CMD can be executed without errors.
 422
 423   CMD is a list or string, as accepted by SimplePopen.  CMDNAME is the
 424   name of the command as it should be included in exception error
 425   messages.
 426
 427   This function checks three things: (1) the command can be run
 428   without throwing an OSError; (2) it exits with status=0; (3) it
 429   doesn't output anything to stderr.  If any of these conditions is
 430   not met, raise a CommandFailedException describing the problem."""
 431
 432   try:
 433     pipe = SimplePopen(cmd, True)
 434   except OSError, e:
 435     raise CommandFailedException('error executing %s: %s' % (cmdname, e,))
 436   pipe.stdin.close()
 437   pipe.stdout.read()
 438   errmsg = pipe.stderr.read()
 439   status = pipe.wait()
 440   if status != 0 or errmsg:
 441     msg = 'error executing %s: status %s' % (cmdname, status,)
 442     if errmsg:
 443       msg += ', error output:\n%s' % (errmsg,)
 444     raise CommandFailedException(msg)
 445
 446
 447 class CVSRepository:
 448   """A CVS repository from which data can be extracted."""
 449
 450   def __init__(self, cvs_repos_path):
 451     """CVS_REPOS_PATH is the top of the CVS repository (at least as
 452     far as this run is concerned)."""
 453
 454     if not os.path.isdir(cvs_repos_path):
 455       raise FatalError("The specified CVS repository path '%s' is not an "
 456                        "existing directory." % cvs_repos_path)
 457
 458     self.cvs_repos_path = os.path.normpath(cvs_repos_path)
 459     self.cvs_prefix_re = re.compile(
 460         r'^' + re.escape(self.cvs_repos_path)
 461         + r'(' + re.escape(os.sep) + r'|$)')
 462
 463   def get_cvs_path(self, fname):
 464     """Return the path to FNAME relative to cvs_repos_path, with ',v' removed.
 465
 466     FNAME is a filesystem name that has to be within
 467     self.cvs_repos_path.  Return the filename relative to
 468     self.cvs_repos_path, with ',v' striped off if present, and with
 469     os.sep converted to '/'."""
 470
 471     (tail, n) = self.cvs_prefix_re.subn('', fname, 1)
 472     if n != 1:
 473       raise FatalError(
 474           "get_cvs_path: '%s' is not a sub-path of '%s'"
 475           % (fname, self.cvs_repos_path,))
 476     if tail.endswith(',v'):
 477       tail = tail[:-2]
 478     return string.replace(tail, os.sep, '/')
 479
 480   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 481     """Return a command string, and the pipe created using that
 482     string.  C_REV is a CVSRevision.  If SUPPRESS_KEYWORD_SUBSTITUTION
 483     is True, then suppress the substitution of RCS/CVS keywords in the
 484     output.  The pipe returns the text of that CVS Revision."""
 485     raise NotImplementedError
 486
 487
 488 class CVSRepositoryViaRCS(CVSRepository):
 489   """A CVSRepository accessed via RCS."""
 490
 491   def __init__(self, cvs_repos_path):
 492     CVSRepository.__init__(self, cvs_repos_path)
 493     try:
 494       check_command_runs([ 'co', '-V' ], 'co')
 495     except CommandFailedException, e:
 496       raise FatalError('%s\n'
 497                        'Please check that co is installed and in your PATH\n'
 498                        '(it is a part of the RCS software).' % (e,))
 499
 500   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 501     pipe_cmd = [ 'co', '-q', '-x,v', '-p' + c_rev.rev ]
 502     if suppress_keyword_substitution:
 503       pipe_cmd.append('-kk')
 504     pipe_cmd.append(c_rev.rcs_path())
 505     pipe = SimplePopen(pipe_cmd, True)
 506     pipe.stdin.close()
 507     return pipe_cmd, pipe
 508
 509
 510 class CVSRepositoryViaCVS(CVSRepository):
 511   """A CVSRepository accessed via CVS."""
 512
 513   def __init__(self, cvs_repos_path):
 514     CVSRepository.__init__(self, cvs_repos_path)
 515     # Ascend above the specified root if necessary, to find the
 516     # cvs_repository_root (a directory containing a CVSROOT directory)
 517     # and the cvs_module (the path of the conversion root within the
 518     # cvs repository) NB: cvs_module must be seperated by '/' *not* by
 519     # os.sep .
 520     def is_cvs_repository_root(path):
 521       return os.path.isdir(os.path.join(path, 'CVSROOT'))
 522
 523     self.cvs_repository_root = os.path.abspath(self.cvs_repos_path)
 524     self.cvs_module = ""
 525     while not is_cvs_repository_root(self.cvs_repository_root):
 526       # Step up one directory:
 527       prev_cvs_repository_root = self.cvs_repository_root
 528       self.cvs_repository_root, module_component = \
 529           os.path.split(self.cvs_repository_root)
 530       if self.cvs_repository_root == prev_cvs_repository_root:
 531         # Hit the root (of the drive, on Windows) without finding a
 532         # CVSROOT dir.
 533         raise FatalError(
 534             "the path '%s' is not a CVS repository, nor a path "
 535             "within a CVS repository.  A CVS repository contains "
 536             "a CVSROOT directory within its root directory."
 537             % (self.cvs_repos_path,))
 538
 539       self.cvs_module = module_component + "/" + self.cvs_module
 540
 541     os.environ['CVSROOT'] = self.cvs_repository_root
 542
 543     def cvs_ok(global_arguments):
 544       check_command_runs(
 545           [ 'cvs' ] + global_arguments + [ '--version' ], 'cvs')
 546
 547     self.global_arguments = [ "-q", "-R" ]
 548     try:
 549       cvs_ok(self.global_arguments)
 550     except CommandFailedException, e:
 551       self.global_arguments = [ "-q" ]
 552       try:
 553         cvs_ok(self.global_arguments)
 554       except CommandFailedException, e:
 555         raise FatalError(
 556             '%s\n'
 557             'Please check that cvs is installed and in your PATH.' % (e,))
 558
 559   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 560     pipe_cmd = [ 'cvs' ] + self.global_arguments + \
 561                [ 'co', '-r' + c_rev.rev, '-p' ]
 562     if suppress_keyword_substitution:
 563       pipe_cmd.append('-kk')
 564     pipe_cmd.append(self.cvs_module + c_rev.cvs_path)
 565     pipe = SimplePopen(pipe_cmd, True)
 566     pipe.stdin.close()
 567     return pipe_cmd, pipe
 568
 569
 570 def generate_ignores(c_rev):
 571   # Read in props
 572   pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(c_rev)
 573   buf = pipe.stdout.read(PIPE_READ_SIZE)
 574   raw_ignore_val = ""
 575   while buf:
 576     raw_ignore_val = raw_ignore_val + buf
 577     buf = pipe.stdout.read(PIPE_READ_SIZE)
 578   pipe.stdout.close()
 579   error_output = pipe.stderr.read()
 580   exit_status = pipe.wait()
 581   if exit_status:
 582     raise FatalError("The command '%s' failed with exit status: %s\n"
 583                      "and the following output:\n"
 584                      "%s" % (pipe_cmd, exit_status, error_output))
 585
 586   # Tweak props: First, convert any spaces to newlines...
 587   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 588   raw_ignores = raw_ignore_val.split('\n')
 589   ignore_vals = [ ]
 590   for ignore in raw_ignores:
 591     # Reset the list if we encounter a '!'
 592     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 593     if ignore == '!':
 594       ignore_vals = [ ]
 595       continue
 596     # Skip empty lines
 597     if len(ignore) == 0:
 598       continue
 599     ignore_vals.append(ignore)
 600   return ignore_vals
 601
 602 # Return a string that has not been returned by gen_key() before.
 603 gen_key_base = 0L
 604 def gen_key():
 605   global gen_key_base
 606   key = '%x' % gen_key_base
 607   gen_key_base = gen_key_base + 1
 608   return key
 609
 610 # ============================================================================
 611 # This code is copied with a few modifications from:
 612 #   subversion/subversion/bindings/swig/python/svn/core.py
 613
 614 if sys.platform == "win32":
 615   _escape_shell_arg_re = re.compile(r'(\\+)(\"|$)')
 616
 617   def escape_shell_arg(arg):
 618     # The (very strange) parsing rules used by the C runtime library are
 619     # described at:
 620     # http://msdn.microsoft.com/library/en-us/vclang/html/_pluslang_Parsing_C.2b2b_.Command.2d.Line_Arguments.asp
 621
 622     # double up slashes, but only if they are followed by a quote character
 623     arg = re.sub(_escape_shell_arg_re, r'\1\1\2', arg)
 624
 625     # surround by quotes and escape quotes inside
 626     arg = '"' + string.replace(arg, '"', '"^""') + '"'
 627     return arg
 628
 629
 630   def argv_to_command_string(argv):
 631     """Flatten a list of command line arguments into a command string.
 632
 633     The resulting command string is expected to be passed to the system
 634     shell which os functions like popen() and system() invoke internally.
 635     """
 636
 637     # According cmd's usage notes (cmd /?), it parses the command line by
 638     # "seeing if the first character is a quote character and if so, stripping
 639     # the leading character and removing the last quote character."
 640     # So to prevent the argument string from being changed we add an extra set
 641     # of quotes around it here.
 642     return '"' + string.join(map(escape_shell_arg, argv), " ") + '"'
 643
 644 else:
 645   def escape_shell_arg(str):
 646     return "'" + string.replace(str, "'", "'\\''") + "'"
 647
 648   def argv_to_command_string(argv):
 649     """Flatten a list of command line arguments into a command string.
 650
 651     The resulting command string is expected to be passed to the system
 652     shell which os functions like popen() and system() invoke internally.
 653     """
 654
 655     return string.join(map(escape_shell_arg, argv), " ")
 656 # ============================================================================
 657
 658 def format_date(date):
 659   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 660   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 661   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 662
 663 def sort_file(infile, outfile):
 664   # sort the log files
 665
 666   # GNU sort will sort our dates differently (incorrectly!) if our
 667   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 668   # it to 'C'
 669   lc_all_tmp = os.environ.get('LC_ALL', None)
 670   os.environ['LC_ALL'] = 'C'
 671   # The -T option to sort has a nice side effect.  The Win32 sort is
 672   # case insensitive and cannot be used, and since it does not
 673   # understand the -T option and dies if we try to use it, there is
 674   # no risk that we use that sort by accident.
 675   run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 676   if lc_all_tmp is None:
 677     del os.environ['LC_ALL']
 678   else:
 679     os.environ['LC_ALL'] = lc_all_tmp
 680
 681 def match_regexp_list(regexp_list, string):
 682   """Test whether STRING matches any of the compiled regexps in
 683   REGEXP_LIST."""
 684   for regexp in regexp_list:
 685     if regexp.match(string):
 686       return True
 687   return False
 688
 689 class LF_EOL_Filter:
 690   """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
 691   into LFs only."""
 692   def __init__(self, stream):
 693     self.stream = stream
 694     self.carry_cr = False
 695     self.eof = False
 696
 697   def read(self, size):
 698     while True:
 699       buf = self.stream.read(size)
 700       self.eof = len(buf) == 0
 701       if self.carry_cr:
 702         buf = '\r' + buf
 703         self.carry_cr = False
 704       if not self.eof and buf[-1] == '\r':
 705         self.carry_cr = True
 706         buf = buf[:-1]
 707       buf = string.replace(buf, '\r\n', '\n')
 708       buf = string.replace(buf, '\r', '\n')
 709       if len(buf) > 0 or self.eof:
 710         return buf
 711
 712
 713 # These constants represent the log levels that this script supports
 714 LOG_WARN = -1
 715 LOG_QUIET = 0
 716 LOG_NORMAL = 1
 717 LOG_VERBOSE = 2
 718 class Log:
 719   """A Simple logging facility.  Each line will be timestamped is
 720   self.use_timestamps is TRUE.  This class is a Borg, see
 721   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 722   __shared_state = {}
 723   def __init__(self):
 724     self.__dict__ = self.__shared_state
 725     if self.__dict__:
 726       return
 727     self.log_level = LOG_NORMAL
 728     # Set this to true if you want to see timestamps on each line output.
 729     self.use_timestamps = None
 730     self.logger = sys.stdout
 731
 732   def _timestamp(self):
 733     """Output a detailed timestamp at the beginning of each line output."""
 734     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 735
 736   def write(self, log_level, *args):
 737     """This is the public method to use for writing to a file.  Only
 738     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 739     there are multiple ARGS, they will be separated by a space."""
 740     if log_level > self.log_level:
 741       return
 742     if self.use_timestamps:
 743       self._timestamp()
 744     self.logger.write(' '.join(map(str,args)) + "\n")
 745     # Ensure that log output doesn't get out-of-order with respect to
 746     # stderr output.
 747     self.logger.flush()
 748
 749
 750 class Cleanup:
 751   """This singleton class manages any files created by cvs2svn.  When
 752   you first create a file, call Cleanup.register, passing the
 753   filename, and the last pass that you need the file.  After the end
 754   of that pass, your file will be cleaned up after running an optional
 755   callback.  This class is a Borg, see
 756   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 757
 758   __shared_state = {}
 759   def __init__(self):
 760     self.__dict__ = self.__shared_state
 761     if self.__dict__:
 762       return
 763     self._log = {}
 764     self._callbacks = {}
 765
 766   def register(self, file, which_pass, callback=None):
 767     """Register FILE for cleanup at the end of WHICH_PASS, running
 768     function CALLBACK prior to removal.  Registering a given FILE is
 769     idempotent; you may register as many times as you wish, but it
 770     will only be cleaned up once.
 771
 772     Note that if a file is registered multiple times, only the first
 773     callback registered for that file will be called at cleanup
 774     time.  Also note that if you register a database file you must
 775     close the database before cleanup, e.g. using a callback."""
 776     self._log.setdefault(which_pass, {})[file] = 1
 777     if callback and not self._callbacks.has_key(file):
 778       self._callbacks[file] = callback
 779
 780   def cleanup(self, which_pass):
 781     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 782     if not self._log.has_key(which_pass):
 783       return
 784     for file in self._log[which_pass]:
 785       Log().write(LOG_VERBOSE, "Deleting", file)
 786       if self._callbacks.has_key(file):
 787         self._callbacks[file]()
 788       os.unlink(file)
 789
 790
# Always use these constants for opening databases.  They are the
# MODE values that end up as the flag argument of anydbm.open().
# 'r' opens an existing database read-only:
DB_OPEN_READ = 'r'
# 'n' always starts with a new, empty database:
DB_OPEN_NEW = 'n'
 794
 795
class AbstractDatabase(UserDict.DictMixin):
  """An abstract base class for anydbm-based databases.

  The underlying dbm object is kept in self.db.  This class takes care
  of the key-oriented mapping methods only; reading and writing of
  values (__getitem__ and __setitem__) is left to derived classes."""

  def __init__(self, filename, mode):
    """Open the anydbm database FILENAME and keep it in self.db.

    MODE is passed on to anydbm.open(), except that 'n' is handled
    specially when the default dbm module is dbhash (see below)."""
    # pybsddb3 has a bug which prevents it from working with
    # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
    # causes the DB_TRUNCATE flag to be passed, which is disallowed
    # for databases protected by lock and transaction support
    # (bsddb databases use locking from bsddb version 4.2.4 onwards).
    #
    # Therefore, manually perform the removal (we can do this, because
    # we know that for bsddb - but *not* anydbm in general - the database
    # consists of one file with the name we specify, rather than several
    # based on that name).
    if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
      if os.path.isfile(filename):
        os.unlink(filename)
      # With the old file gone, 'c' (create if missing) yields the
      # same empty database that 'n' would have.
      mode = 'c'

    self.db = anydbm.open(filename, mode)

    # Import implementations for many mapping interface methods.
    # Note that we specifically do not do this for any method which handles
    # *values*, because our derived classes may define __getitem__ and
    # __setitem__ to override the storage of values, and grabbing methods
    # directly from the dbm object would bypass this.
    for meth_name in ('__delitem__', 'keys',
        '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
      # Methods which the dbm object does not provide are skipped.
      meth_ref = getattr(self.db, meth_name, None)
      if meth_ref:
        setattr(self, meth_name, meth_ref)

  def __delitem__(self, key):
    """Delete KEY from the underlying database.

    gdbm does not define a __delitem__ we can assign, so this
    class-level implementation is what remains when __init__ found
    no such attribute on the dbm object."""
    del self.db[key]
 832
 833
class SDatabase(AbstractDatabase):
  """A database that can only store strings.

  Values are handed to and fetched from the underlying dbm object
  (self.db) unchanged."""

  def __getitem__(self, key):
    """Return the value stored under KEY, exactly as self.db has it."""
    return self.db[key]

  def __setitem__(self, key, value):
    """Store VALUE under KEY in self.db without any conversion."""
    self.db[key] = value
 842
 843
class Database(AbstractDatabase):
  """A database that uses the marshal module to store built-in types.

  Values are serialized with marshal.dumps() before they are stored in
  the underlying dbm object (self.db), and restored with
  marshal.loads() when they are read back."""

  def __getitem__(self, key):
    """Return the unmarshalled value stored under KEY."""
    return marshal.loads(self.db[key])

  def __setitem__(self, key, value):
    """Store the marshalled form of VALUE under KEY."""
    self.db[key] = marshal.dumps(value)
 852
 853
 854 class StatsKeeper:
 855   __shared_state = { }
 856   def __init__(self):
 857     self.__dict__ = self.__shared_state
 858     if self.__dict__:
 859       return
 860     self.filename = temp(STATISTICS_FILE)
 861     Cleanup().register(self.filename, pass8)
 862     # This can get kinda large, so we don't store it in our data dict.
 863     self.repos_files = { }
 864
 865     if os.path.exists(self.filename):
 866       self.unarchive()
 867     else:
 868       self.data = { 'cvs_revs_count' : 0,
 869                     'tags': { },
 870                     'branches' : { },
 871                     'repos_size' : 0,
 872                     'repos_file_count' : 0,
 873                     'svn_rev_count' : None,
 874                     'first_rev_date' : 1L<<32,
 875                     'last_rev_date' : 0,
 876                     'pass_timings' : { },
 877                     'start_time' : 0,
 878                     'end_time' : 0,
 879                     }
 880
 881   def log_duration_for_pass(self, duration, pass_num):
 882     self.data['pass_timings'][pass_num] = duration
 883
 884   def set_start_time(self, start):
 885     self.data['start_time'] = start
 886
 887   def set_end_time(self, end):
 888     self.data['end_time'] = end
 889
 890   def _bump_item(self, key, amount=1):
 891     self.data[key] = self.data[key] + amount
 892
 893   def reset_c_rev_info(self):
 894     self.data['cvs_revs_count'] = 0
 895     self.data['tags'] = { }
 896     self.data['branches'] = { }
 897
 898   def record_c_rev(self, c_rev):
 899     self._bump_item('cvs_revs_count')
 900
 901     for tag in c_rev.tags:
 902       self.data['tags'][tag] = None
 903     for branch in c_rev.branches:
 904       self.data['branches'][branch] = None
 905
 906     if c_rev.timestamp < self.data['first_rev_date']:
 907       self.data['first_rev_date'] = c_rev.timestamp
 908
 909     if c_rev.timestamp > self.data['last_rev_date']:
 910       self.data['last_rev_date'] = c_rev.timestamp
 911
 912     # Only add the size if this is the first time we see the file.
 913     if not self.repos_files.has_key(c_rev.fname):
 914       self._bump_item('repos_size', c_rev.file_size)
 915     self.repos_files[c_rev.fname] = None
 916
 917     self.data['repos_file_count'] = len(self.repos_files)
 918
 919   def set_svn_rev_count(self, count):
 920     self.data['svn_rev_count'] = count
 921
 922   def svn_rev_count(self):
 923     return self.data['svn_rev_count']
 924
 925   def archive(self):
 926     open(self.filename, 'w').write(marshal.dumps(self.data))
 927
 928   def unarchive(self):
 929     self.data = marshal.loads(open(self.filename, 'r').read())
 930
 931   def __str__(self):
 932     svn_revs_str = ""
 933     if self.data['svn_rev_count'] is not None:
 934       svn_revs_str = ('Total SVN Commits:      %10s\n'
 935                       % self.data['svn_rev_count'])
 936
 937     return ('\n'                                \
 938             'cvs2svn Statistics:\n'             \
 939             '------------------\n'              \
 940             'Total CVS Files:        %10i\n'    \
 941             'Total CVS Revisions:    %10i\n'    \
 942             'Total Unique Tags:      %10i\n'    \
 943             'Total Unique Branches:  %10i\n'    \
 944             'CVS Repos Size in KB:   %10i\n'    \
 945             '%s'                                \
 946             'First Revision Date:    %s\n'      \
 947             'Last Revision Date:     %s\n'      \
 948             '------------------'                \
 949             % (self.data['repos_file_count'],
 950                self.data['cvs_revs_count'],
 951                len(self.data['tags']),
 952                len(self.data['branches']),
 953                (self.data['repos_size'] / 1024),
 954                svn_revs_str,
 955                time.ctime(self.data['first_rev_date']),
 956                time.ctime(self.data['last_rev_date']),
 957                ))
 958
 959   def timings(self):
 960     passes = self.data['pass_timings'].keys()
 961     passes.sort()
 962     str = 'Timings:\n------------------\n'
 963
 964     def desc(val):
 965       if val == 1: return "second"
 966       return "seconds"
 967
 968     for pass_num in passes:
 969       duration = int(self.data['pass_timings'][pass_num])
 970       p_str = ('pass %d:%6d %s\n'
 971                % (pass_num, duration, desc(duration)))
 972       str = str + p_str
 973
 974     total = int(self.data['end_time'] - self.data['start_time'])
 975     str = str + ('total: %6d %s' % (total, desc(total)))
 976     return str
 977
 978
 979 class LastSymbolicNameDatabase:
 980   """ Passing every CVSRevision in s-revs to this class will result in
 981   a Database whose key is the last CVS Revision a symbolicname was
 982   seen in, and whose value is a list of all symbolicnames that were
 983   last seen in that revision."""
 984   def __init__(self, mode):
 985     self.symbols = {}
 986     self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
 987     Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
 988
 989   # Once we've gone through all the revs,
 990   # symbols.keys() will be a list of all tags and branches, and
 991   # their corresponding values will be a key into the last CVS revision
 992   # that they were used in.
 993   def log_revision(self, c_rev):
 994     # Gather last CVS Revision for symbolic name info and tag info
 995     for tag in c_rev.tags:
 996       self.symbols[tag] = c_rev.unique_key()
 997     if c_rev.op is not OP_DELETE:
 998       for branch in c_rev.branches:
 999         self.symbols[branch] = c_rev.unique_key()
1000
1001   # Creates an inversion of symbols above--a dictionary of lists (key
1002   # = CVS rev unique_key: val = list of symbols that close in that
1003   # rev.
1004   def create_database(self):
1005     for sym, rev_unique_key in self.symbols.items():
1006       ary = self.symbol_revs_db.get(rev_unique_key, [])
1007       ary.append(sym)
1008       self.symbol_revs_db[rev_unique_key] = ary
1009
1010
class CVSRevisionDatabase:
  """Keeps CVSRevision objects, in their string form, in a database
  from which they can be fetched again by unique_key()."""

  def __init__(self, mode):
    """Open the underlying database in MODE (which has the same
    meaning as the MODE argument to Database or anydbm.open()), and
    register the database file for cleanup."""
    self.cvs_revs_db = SDatabase(temp(CVS_REVS_DB), mode)
    Cleanup().register(temp(CVS_REVS_DB), pass8)

  def log_revision(self, c_rev):
    """Store the CVSRevision C_REV under its unique_key()."""
    serialized = str(c_rev)
    self.cvs_revs_db[c_rev.unique_key()] = serialized

  def get_revision(self, unique_key):
    """Rebuild and return the CVSRevision that was stored under
    UNIQUE_KEY."""
    ctx = Ctx()
    line = self.cvs_revs_db[unique_key]
    return CVSRevision(ctx, line)
1028
1029
def TagsDatabase(mode):
  """A Database to store which symbolic names are tags.

  Opens the database file temp(TAGS_DB) in MODE as an SDatabase,
  registers that file for cleanup after pass8, and returns the
  database object.

  Each key is a tag name.
  The value has no meaning, and should be set to None.
  NOTE(review): SDatabase describes itself as storing only strings,
  so None may not be storable as-is -- confirm against the callers."""
  db = SDatabase(temp(TAGS_DB), mode)
  Cleanup().register(temp(TAGS_DB), pass8)
  return db
1037
1038
class Project:
  """A project within a CVS repository."""

  def __init__(self, project_cvs_repos_path,
               trunk_path, branches_path, tags_path):
    """Create a new Project record.

    PROJECT_CVS_REPOS_PATH is the main CVS directory for this project
    (within the filesystem).  TRUNK_PATH, BRANCHES_PATH, and TAGS_PATH
    are the full, normalized directory names in svn for the
    corresponding part of the repository.

    Raise FatalError if PROJECT_CVS_REPOS_PATH does not start with
    the path of the CVS repository."""

    self.project_cvs_repos_path = project_cvs_repos_path
    prefix = Ctx().cvs_repository.cvs_repos_path
    if not self.project_cvs_repos_path.startswith(prefix):
      raise FatalError("Project '%s' must start with '%s'"
                       % (self.project_cvs_repos_path, prefix,))
    # The project's main directory as a cvs_path:
    self.project_cvs_path = self.project_cvs_repos_path[len(prefix):]
    if self.project_cvs_path.startswith(os.sep):
      self.project_cvs_path = self.project_cvs_path[1:]
    self.trunk_path = trunk_path
    self.branches_path = branches_path
    self.tags_path = tags_path
    verify_paths_disjoint(self.trunk_path, self.branches_path, self.tags_path)

  def is_source(self, svn_path):
    """Return True iff SVN_PATH is a legitimate source for this project.

    Legitimate paths are self.trunk_path or any directory directly
    under self.branches_path."""

    if svn_path == self.trunk_path:
      return True

    (head, tail,) = _path_split(svn_path)
    if head == self.branches_path:
      return True

    return False

  def is_unremovable(self, svn_path):
    """Return True iff the specified path must not be removed."""

    return svn_path in [self.trunk_path, self.branches_path, self.tags_path]

  def get_branch_path(self, branch_name):
    """Return the svnpath for the branch named BRANCH_NAME."""

    return _path_join(self.branches_path, _clean_symbolic_name(branch_name))

  def get_tag_path(self, tag_name):
    """Return the svnpath for the tag named TAG_NAME."""

    return _path_join(self.tags_path, _clean_symbolic_name(tag_name))

  def _relative_name(self, cvs_path):
    """Convert CVS_PATH into a name relative to this project's root directory.

    CVS_PATH has to begin (textually) with self.project_cvs_path.
    Remove prefix and optional path separator.  If CVS_PATH is the
    project's root directory itself, the result is the empty string.

    Raise FatalError if CVS_PATH does not begin with the prefix."""

    if not cvs_path.startswith(self.project_cvs_path):
      raise FatalError(
          "_relative_name: '%s' is not a sub-path of '%s'"
          % (cvs_path, self.project_cvs_path,))
    l = len(self.project_cvs_path)
    # Use a slice rather than an index: when CVS_PATH is exactly the
    # project path there is no character at position l, and indexing
    # would raise an IndexError.
    if cvs_path[l:l + 1] == os.sep:
      l += 1
    return cvs_path[l:]

  def make_trunk_path(self, cvs_path):
    """Return the trunk path for CVS_PATH.

    Return the svn path for this file on trunk."""

    return _path_join(self.trunk_path, self._relative_name(cvs_path))

  def make_branch_path(self, branch_name, cvs_path):
    """Return the svn path for CVS_PATH on branch BRANCH_NAME."""

    return _path_join(self.get_branch_path(branch_name),
                      self._relative_name(cvs_path))
1122
1123
class CVSRevision:
  """One revision of one RCS file, as described by a line of a revs
  file (see __init__ for the fields)."""

  def __init__(self, ctx, *args):
    """Initialize a new CVSRevision with Ctx object CTX, and ARGS.

    If CTX is None, the following members and methods of the
    instantiated CVSRevision class object will be unavailable (or
    simply will not work correctly, if at all):
       cvs_path
       svn_path
       is_default_branch_revision()

    (Note that this class treats CTX as const, because the caller
    likely passed in a Borg instance of a Ctx.  The reason this class
    takes CTX as a parameter, instead of just instantiating a Ctx
    itself, is that this class should be usable outside cvs2svn.)

    If there is one argument in ARGS, it is a string, in the format of
    a line from a revs file.  Do *not* include a trailing newline.

    If there are multiple ARGS, there must be 17 of them,
    comprising a parsed revs line:
       timestamp       -->  (int) date stamp for this cvs revision
       digest          -->  (string) digest of author+logmsg
       prev_timestamp  -->  (int) date stamp for the previous cvs revision
       next_timestamp  -->  (int) date stamp for the next cvs revision
       op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
       prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
       rev             -->  (string) this CVS rev, e.g., "1.3"
       next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
       file_in_attic   -->  (char or None) true if RCS file is in Attic
       file_executable -->  (char or None) true if RCS file has exec bit set.
       file_size       -->  (int) size of the RCS file
       deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
       fname           -->  (string) relative path of file in CVS repos
       mode            -->  (string or None) "kkv", "kb", etc.
       branch_name     -->  (string or None) branch on which this rev occurred
       tags            -->  (list of strings) all tags on this revision
       branches        -->  (list of strings) all branches rooted in this rev

    The two forms of initialization are equivalent.

    Raise TypeError if ARGS has any other length.

    WARNING: Due to the resync process in pass2, prev_timestamp or
    next_timestamp may be incorrect in the c-revs or s-revs files."""

    self._ctx = ctx
    if len(args) == 17:
      (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
       self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
       self.file_executable, self.file_size, self.deltatext_code,
       self.fname,
       self.mode, self.branch_name, self.tags, self.branches) = args
    elif len(args) == 1:
      # The first 15 fields are simple space-separated words; the
      # 16th piece holds the tags, the branch count, the branches and
      # the file name (the only field that may itself contain spaces,
      # which is why it comes last).
      data = args[0].split(' ', 15)
      (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
       self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
       self.file_executable, self.file_size, self.deltatext_code,
       self.mode, self.branch_name, numtags, remainder) = data
      # Patch up data items which are not simple strings.  The
      # timestamp is written in hexadecimal by __str__; "*" stands
      # for a value that is missing.
      self.timestamp = int(self.timestamp, 16)
      if self.prev_timestamp == "*":
        self.prev_timestamp = 0
      else:
        self.prev_timestamp = int(self.prev_timestamp)
      if self.next_timestamp == "*":
        self.next_timestamp = 0
      else:
        self.next_timestamp = int(self.next_timestamp)
      if self.prev_rev == "*":
        self.prev_rev = None
      if self.next_rev == "*":
        self.next_rev = None
      if self.file_in_attic == "*":
        self.file_in_attic = None
      if self.file_executable == "*":
        self.file_executable = None
      self.file_size = int(self.file_size)
      if self.mode == "*":
        self.mode = None
      if self.branch_name == "*":
        self.branch_name = None
      numtags = int(numtags)
      tags_and_numbranches_and_remainder = remainder.split(' ', numtags + 1)
      self.tags = tags_and_numbranches_and_remainder[:-2]
      numbranches = int(tags_and_numbranches_and_remainder[-2])
      remainder = tags_and_numbranches_and_remainder[-1]
      branches_and_fname = remainder.split(' ', numbranches)
      self.branches = branches_and_fname[:-1]
      self.fname = branches_and_fname[-1]
    else:
      raise TypeError('CVSRevision() takes 2 or 18 arguments (%d given)'
                      % (len(args) + 1))
    if ctx is not None:
      self.cvs_path = ctx.cvs_repository.get_cvs_path(self.fname)
      if self.branch_name:
        self.svn_path = ctx.project.make_branch_path(self.branch_name,
                                                     self.cvs_path)
      else:
        self.svn_path = ctx.project.make_trunk_path(self.cvs_path)

  def unique_key(self, revnum="0"):
    """Return a string that uniquely identifies a revision of this file.

    The 'primary key' of a CVS Revision is the revision number + the
    filename.  To provide a unique key (say, for a dict), we just glom
    them together in a string.  By passing in self.prev_rev or
    self.next_rev, you can get the unique key for their respective
    CVSRevisions.

    Without REVNUM (or with the default "0"), the key for self.rev is
    returned.  If REVNUM is None, None is returned."""
    # Compare with ==: an identity test against a string literal only
    # works by accident of string interning.
    if revnum == "0":
      revnum = self.rev
    elif revnum is None:
      return None
    return revnum + "/" + self.fname

  def __str__(self):
    """Return this revision in the format of a line from a revs file
    (without trailing newline), as accepted by __init__."""
    return ('%08lx %s %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s'
            % (self.timestamp, self.digest, self.prev_timestamp or "*",
              self.next_timestamp or "*", self.op, (self.prev_rev or "*"),
              self.rev, (self.next_rev or "*"), (self.file_in_attic or "*"),
              (self.file_executable or "*"),
              self.file_size,
              self.deltatext_code, (self.mode or "*"),
              (self.branch_name or "*"),
              len(self.tags), self.tags and " " or "", " ".join(self.tags),
              len(self.branches), self.branches and " " or "",
              " ".join(self.branches),
              self.fname, ))

  def opens_symbolic_name(self, name):
    """Return 1 if this CVSRevision is the opening CVSRevision for
    NAME (for this RCS file), else return 0."""
    if name in self.tags:
      return 1
    if name in self.branches:
      # If this c_rev opens a branch and our op is OP_DELETE, then
      # that means that the file that this c_rev belongs to was
      # created on the branch, so for all intents and purposes, this
      # c_rev is *technically* not an opening.  See Issue #62 for more
      # information.
      if self.op != OP_DELETE:
        return 1
    return 0

  def is_default_branch_revision(self):
    """Return 1 if SELF.rev of SELF.cvs_path is a default branch
    revision according to DEFAULT_BRANCHES_DB (see the conditions
    documented there), else return None."""
    val = self._ctx._default_branches_db.get(self.cvs_path, None)
    if val is not None:
      # Split both revision numbers into the branch part and the
      # final component, e.g. "1.1.1.3" --> "1.1.1" and 3.
      val_last_dot = val.rindex(".")
      our_last_dot = self.rev.rindex(".")
      default_branch = val[:val_last_dot]
      our_branch = self.rev[:our_last_dot]
      default_rev_component = int(val[val_last_dot + 1:])
      our_rev_component = int(self.rev[our_last_dot + 1:])
      if (default_branch == our_branch
          and our_rev_component <= default_rev_component):
        return 1
    # else
    return None

  def rcs_path(self):
    """Returns the actual filesystem path to the RCS file of this
    CVSRevision."""
    if self.file_in_attic is None:
      return self.fname
    else:
      basepath, filename = os.path.split(self.fname)
      return os.path.join(basepath, 'Attic', filename)

  def filename(self):
    "Return the last path component of self.fname, minus the ',v'"
    return os.path.split(self.fname)[-1][:-2]
1294
class SymbolDatabase:
  """This database records information on all symbols in the RCS
  files.  It is created in pass 1 and it is used in pass 2."""
  def __init__(self):
    # A hash that maps tag names to commit counts
    self.tags = { }
    # A hash that maps branch names to lists of the format
    # [ create_count, commit_count, blockers ], where blockers
    # is a hash that lists the symbols that depend on the
    # branch.  The blockers hash is used as a set, so the
    # values are not used.
    self.branches = { }

  def register_tag_creation(self, name):
    """Register the creation of the tag NAME."""
    self.tags[name] = self.tags.get(name, 0) + 1

  def _branch(self, name):
    """Helper function to get a branch node that will create and
    initialize the node if it does not exist."""
    if name not in self.branches:
      self.branches[name] = [ 0, 0, { } ]
    return self.branches[name]

  def register_branch_creation(self, name):
    """Register the creation of the branch NAME."""
    self._branch(name)[0] += 1

  def register_branch_commit(self, name):
    """Register a commit on the branch NAME."""
    self._branch(name)[1] += 1

  def register_branch_blocker(self, name, blocker):
    """Register BLOCKER as a blocker on the branch NAME."""
    self._branch(name)[2][blocker] = None

  def branch_has_commit(self, name):
    """Return non-zero if NAME has commits.  Returns a false value if
    NAME is not a branch or if it has no commits."""
    return name in self.branches and self.branches[name][1]

  def find_excluded_symbols(self, regexp_list):
    """Returns a hash of all symbols that match the regexps in
    REGEXP_LIST.  The hash is used as a set so the values are
    not used."""
    excludes = { }
    for tag in self.tags:
      if match_regexp_list(regexp_list, tag):
        excludes[tag] = None
    for branch in self.branches:
      if match_regexp_list(regexp_list, branch):
        excludes[branch] = None
    return excludes

  def find_branch_exclude_blockers(self, branch, excludes):
    """Find all blockers of BRANCH, excluding the ones in the hash
    EXCLUDES.  The result is a hash used as a set; it is empty if
    BRANCH itself is not in EXCLUDES."""
    blockers = { }
    if branch in excludes:
      for blocker in self.branches[branch][2]:
        if blocker not in excludes:
          blockers[blocker] = None
    return blockers

  def find_blocked_excludes(self, excludes):
    """Find all branches in EXCLUDES that have blocking symbols that
    are not themselves excluded.  Return a hash that maps branch names
    to a hash of blockers.  The hash of blockers is used as a set so
    the values are not used."""
    blocked_branches = { }
    for branch in self.branches:
      blockers = self.find_branch_exclude_blockers(branch, excludes)
      if blockers:
        blocked_branches[branch] = blockers
    return blocked_branches

  def find_mismatches(self, excludes=None):
    """Find all symbols that are defined as both tags and branches,
    excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
    the symbol name, tag count, branch count and commit count."""
    if excludes is None:
      excludes = { }
    mismatches = [ ]
    for branch in self.branches:
      if branch not in excludes and branch in self.tags:
        mismatches.append((branch,                    # name
                           self.tags[branch],         # tag count
                           self.branches[branch][0],  # branch count
                           self.branches[branch][1])) # commit count
    return mismatches

  def read(self):
    """Read the symbol database from files.

    The tags file has one 'NAME COUNT' line per tag.  The branches
    file has one line per branch: the name, the creation count, the
    commit count, and then any blockers, all separated by
    whitespace."""
    f = open(temp(TAGS_LIST))
    try:
      for line in f:
        tag, count = line.split()
        self.tags[tag] = int(count)
    finally:
      f.close()

    f = open(temp(BRANCHES_LIST))
    try:
      for line in f:
        words = line.split()
        blockers = { }
        for blocker in words[3:]:
          blockers[blocker] = None
        self.branches[words[0]] = [ int(words[1]), int(words[2]), blockers ]
    finally:
      f.close()

  def write(self):
    """Store the symbol database to files, in the format that read()
    expects, and register both files for cleanup after pass2."""
    # Both files are closed explicitly, so that their contents are
    # certain to be on disk before anybody reads them back.
    f = open(temp(TAGS_LIST), "w")
    try:
      Cleanup().register(temp(TAGS_LIST), pass2)
      for tag, count in self.tags.items():
        f.write("%s %d\n" % (tag, count))
    finally:
      f.close()

    f = open(temp(BRANCHES_LIST), "w")
    try:
      Cleanup().register(temp(BRANCHES_LIST), pass2)
      for branch, info in self.branches.items():
        f.write("%s %d %d" % (branch, info[0], info[1]))
        if info[2]:
          f.write(" ")
          f.write(" ".join(info[2].keys()))
        f.write("\n")
    finally:
      f.close()
1421
1422 class CollectData(cvs2svn_rcsparse.Sink):
  def __init__(self):
    """Open the output files and databases that the collected data is
    written to, and register each of them for cleanup."""
    # The revs and resync files are opened for writing; both are
    # registered for removal after pass2.
    self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
    self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
    # Both databases are created from scratch (DB_OPEN_NEW).
    self.default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
                                         DB_OPEN_NEW)
    Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
    Cleanup().register(temp(METADATA_DB), pass8)
    # NOTE(review): presumably a list of error messages gathered while
    # parsing, and a count of the files processed -- confirm against
    # the code that fills them in.
    self.fatal_errors = []
    self.num_files = 0
    self.symbol_db = SymbolDatabase()

    # 1 if we've collected data for at least one file, None otherwise.
    self.found_valid_file = None

    # See set_fname() for initializations of other variables.
1441
1442   def set_fname(self, canonical_name, filename):
1443     """Prepare to receive data for FILENAME.  FILENAME is the absolute
1444     filesystem path to the file in question, and CANONICAL_NAME is
1445     FILENAME with the 'Attic' component removed (if the file is indeed
1446     in the Attic) ."""
1447     self.fname = canonical_name
1448
1449     # We calculate and save some file metadata here, where we can do
1450     # it only once per file, instead of waiting until later where we
1451     # would have to do the same calculations once per CVS *revision*.
1452
1453     self.cvs_path = Ctx().cvs_repository.get_cvs_path(self.fname)
1454
1455     # If the paths are not the same, then that means that the
1456     # canonical_name has had the 'Attic' component stripped out.
1457     self.file_in_attic = None
1458     if canonical_name != filename:
1459       self.file_in_attic = 1
1460
1461     file_stat = os.stat(filename)
1462     # The size of our file in bytes
1463     self.file_size = file_stat[stat.ST_SIZE]
1464
1465     # Whether or not the executable bit is set.
1466     self.file_executable = None
1467     if file_stat[0] & stat.S_IXUSR:
1468       self.file_executable = 1
1469
1470     # revision -> [timestamp, author, old-timestamp]
1471     self.rev_data = { }
1472
1473     # Maps revision number (key) to the revision number of the
1474     # previous revision along this line of development.
1475     #
1476     # For the first revision R on a branch, we consider the revision
1477     # from which R sprouted to be the 'previous'.
1478     #
1479     # Note that this revision can't be determined arithmetically (due
1480     # to cvsadmin -o, which is why this is necessary).
1481     #
1482     # If the key has no previous revision, then store None as key's
1483     # value.
1484     self.prev_rev = { }
1485
1486     # This dict is essentially self.prev_rev with the values mapped in
1487     # the other direction, so following key -> value will yield you
1488     # the next revision number.
1489     #
1490     # Unlike self.prev_rev, if the key has no next revision, then the
1491     # key is not present.
1492     self.next_rev = { }
1493
1494     # Track the state of each revision so that in set_revision_info,
1495     # we can determine if our op is an add/change/delete.  We can do
1496     # this because in set_revision_info, we'll have all of the
1497     # revisions for a file at our fingertips, and we need to examine
1498     # the state of our prev_rev to determine if we're an add or a
1499     # change--without the state of the prev_rev, we are unable to
1500     # distinguish between an add and a change.
1501     self.rev_state = { }
1502
1503     # Hash mapping branch numbers, like '1.7.2', to branch names,
1504     # like 'Release_1_0_dev'.
1505     self.branch_names = { }
1506
1507     # RCS flags (used for keyword expansion).
1508     self.mode = None
1509
1510     # Hash mapping revision numbers, like '1.7', to lists of names
1511     # indicating which branches sprout from that revision, like
1512     # ['Release_1_0_dev', 'experimental_driver', ...].
1513     self.branchlist = { }
1514
1515     # Like self.branchlist, but the values are lists of tag names that
1516     # apply to the key revision.
1517     self.taglist = { }
1518
1519     # If set, this is an RCS branch number -- rcsparse calls this the
1520     # "principal branch", but CVS and RCS refer to it as the "default
1521     # branch", so that's what we call it, even though the rcsparse API
1522     # setter method is still 'set_principal_branch'.
1523     self.default_branch = None
1524
1525     # If the RCS file doesn't have a default branch anymore, but does
1526     # have vendor revisions, then we make an educated guess that those
1527     # revisions *were* the head of the default branch up until the
1528     # commit of 1.2, at which point the file's default branch became
1529     # trunk.  This records the date at which 1.2 was committed.
1530     self.first_non_vendor_revision_date = None
1531
1532     # A list of all symbols defined for the current file.  Used to
1533     # prevent multiple definitions of a symbol, something which can
1534     # easily happen when --symbol-transform is used.
1535     self.defined_symbols = { }
1536
1537   def set_principal_branch(self, branch):
1538     self.default_branch = branch
1539
1540   def set_expansion(self, mode):
1541     self.mode = mode
1542
1543   def set_branch_name(self, branch_number, name):
1544     """Record that BRANCH_NUMBER is the branch number for branch NAME,
1545     and that NAME sprouts from BRANCH_NUMBER .
1546     BRANCH_NUMBER is an RCS branch number with an odd number of components,
1547     for example '1.7.2' (never '1.7.0.2')."""
1548     if not self.branch_names.has_key(branch_number):
1549       self.branch_names[branch_number] = name
1550       # The branchlist is keyed on the revision number from which the
1551       # branch sprouts, so strip off the odd final component.
1552       sprout_rev = branch_number[:branch_number.rfind(".")]
1553       self.branchlist.setdefault(sprout_rev, []).append(name)
1554       self.symbol_db.register_branch_creation(name)
1555     else:
1556       sys.stderr.write("%s: in '%s':\n"
1557                        "   branch '%s' already has name '%s',\n"
1558                        "   cannot also have name '%s', ignoring the latter\n"
1559                        % (warning_prefix, self.fname, branch_number,
1560                           self.branch_names[branch_number], name))
1561
1562   def rev_to_branch_name(self, revision):
1563     """Return the name of the branch on which REVISION lies.
1564     REVISION is a non-branch revision number with an even number of,
1565     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1566     For the convenience of callers, REVISION can also be a trunk
1567     revision such as '1.2', in which case just return None."""
1568     if trunk_rev.match(revision):
1569       return None
1570     return self.branch_names.get(revision[:revision.rindex(".")])
1571
1572   def add_cvs_branch(self, revision, branch_name):
1573     """Record the root revision and branch revision for BRANCH_NAME,
1574     based on REVISION.  REVISION is a CVS branch number having an even
1575     number of components where the second-to-last is '0'.  For
1576     example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
1577     from 1.7 and has branch number 1.7.2."""
1578     last_dot = revision.rfind(".")
1579     branch_rev = revision[:last_dot]
1580     last2_dot = branch_rev.rfind(".")
1581     branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
1582     self.set_branch_name(branch_rev, branch_name)
1583
1584   def define_tag(self, name, revision):
1585     """Record a bidirectional mapping between symbolic NAME and REVISION.
1586     REVISION is an unprocessed revision number from the RCS file's
1587     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
1588     This function will determine what kind of symbolic name it is by
1589     inspection, and record it in the right places."""
1590     for (pattern, replacement) in Ctx().symbol_transforms:
1591       newname = pattern.sub(replacement, name)
1592       if newname != name:
1593         Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
1594                     % (name, newname))
1595         name = newname
1596     if self.defined_symbols.has_key(name):
1597       err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
1598                 % (error_prefix, name, self.fname)
1599       sys.stderr.write(err + "\n")
1600       self.fatal_errors.append(err)
1601     self.defined_symbols[name] = None
1602     if branch_tag.match(revision):
1603       self.add_cvs_branch(revision, name)
1604     elif vendor_tag.match(revision):
1605       self.set_branch_name(revision, name)
1606     else:
1607       self.taglist.setdefault(revision, []).append(name)
1608       self.symbol_db.register_tag_creation(name)
1609
1610   def define_revision(self, revision, timestamp, author, state,
1611                       branches, next):
1612
1613     # Record the state of our revision for later calculations
1614     self.rev_state[revision] = state
1615
1616     # store the rev_data as a list in case we have to jigger the timestamp
1617     self.rev_data[revision] = [int(timestamp), author, None]
1618
1619     # When on trunk, the RCS 'next' revision number points to what
1620     # humans might consider to be the 'previous' revision number.  For
1621     # example, 1.3's RCS 'next' is 1.2.
1622     #
1623     # However, on a branch, the RCS 'next' revision number really does
1624     # point to what humans would consider to be the 'next' revision
1625     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
1626     #
1627     # In other words, in RCS, 'next' always means "where to find the next
1628     # deltatext that you need this revision to retrieve.
1629     #
1630     # That said, we don't *want* RCS's behavior here, so we determine
1631     # whether we're on trunk or a branch and set self.prev_rev
1632     # accordingly.
1633     #
1634     # One last thing.  Note that if REVISION is a branch revision,
1635     # instead of mapping REVISION to NEXT, we instead map NEXT to
1636     # REVISION.  Since we loop over all revisions in the file before
1637     # doing anything with the data we gather here, this 'reverse
1638     # assignment' effectively does the following:
1639     #
1640     # 1. Gives us no 'prev' value for REVISION (in this
1641     # iteration... it may have been set in a previous iteration)
1642     #
1643     # 2. Sets the 'prev' value for the revision with number NEXT to
1644     # REVISION.  So when we come around to the branch revision whose
1645     # revision value is NEXT, its 'prev' and 'prev_rev' are already
1646     # set.
1647     if trunk_rev.match(revision):
1648       self.prev_rev[revision] = next
1649       self.next_rev[next] = revision
1650     elif next:
1651       self.prev_rev[next] = revision
1652       self.next_rev[revision] = next
1653
1654     for b in branches:
1655       self.prev_rev[b] = revision
1656
1657     # Ratchet up the highest vendor head revision, if necessary.
1658     if self.default_branch:
1659       default_branch_root = self.default_branch + "."
1660       if ((revision.find(default_branch_root) == 0)
1661           and (default_branch_root.count('.') == revision.count('.'))):
1662         # This revision is on the default branch, so record that it is
1663         # the new highest default branch head revision.
1664         self.default_branches_db[self.cvs_path] = revision
1665     else:
1666       # No default branch, so make an educated guess.
1667       if revision == '1.2':
1668         # This is probably the time when the file stopped having a
1669         # default branch, so make a note of it.
1670         self.first_non_vendor_revision_date = timestamp
1671       else:
1672         m = vendor_revision.match(revision)
1673         if m and ((not self.first_non_vendor_revision_date)
1674                   or (timestamp < self.first_non_vendor_revision_date)):
1675           # We're looking at a vendor revision, and it wasn't
1676           # committed after this file lost its default branch, so bump
1677           # the maximum trunk vendor revision in the permanent record.
1678           self.default_branches_db[self.cvs_path] = revision
1679
1680     if not trunk_rev.match(revision):
1681       # Check for unlabeled branches, record them.  We tried to collect
1682       # all branch names when we parsed the symbolic name header
1683       # earlier, of course, but that didn't catch unlabeled branches.
1684       # If a branch is unlabeled, this is our first encounter with it,
1685       # so we have to record its data now.
1686       branch_number = revision[:revision.rindex(".")]
1687       if not self.branch_names.has_key(branch_number):
1688         branch_name = "unlabeled-" + branch_number
1689         self.set_branch_name(branch_number, branch_name)
1690
1691       # Register the commit on this non-trunk branch
1692       branch_name = self.branch_names[branch_number]
1693       self.symbol_db.register_branch_commit(branch_name)
1694
1695   def tree_completed(self):
1696     "The revision tree has been parsed.  Analyze it for consistency."
1697
1698     # Our algorithm depends upon the timestamps on the revisions occuring
1699     # monotonically over time.  That is, we want to see rev 1.34 occur in
1700     # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
1701     # sorting), and then tried to insert 1.34, we'd be screwed.
1702
1703     # to perform the analysis, we'll simply visit all of the 'previous'
1704     # links that we have recorded and validate that the timestamp on the
1705     # previous revision is before the specified revision
1706
1707     # if we have to resync some nodes, then we restart the scan. just keep
1708     # looping as long as we need to restart.
1709     while 1:
1710       for current, prev in self.prev_rev.items():
1711         if not prev:
1712           # no previous revision exists (i.e. the initial revision)
1713           continue
1714         t_c = self.rev_data[current][0]
1715         t_p = self.rev_data[prev][0]
1716         if t_p >= t_c:
1717           # the previous revision occurred later than the current revision.
1718           # shove the previous revision back in time (and any before it that
1719           # may need to shift).
1720
1721           # We sync backwards and not forwards because any given CVS
1722           # Revision has only one previous revision.  However, a CVS
1723           # Revision can *be* a previous revision for many other
1724           # revisions (e.g., a revision that is the source of multiple
1725           # branches).  This becomes relevant when we do the secondary
1726           # synchronization in pass 2--we can make certain that we
1727           # don't resync a revision earlier than it's previous
1728           # revision, but it would be non-trivial to make sure that we
1729           # don't resync revision R *after* any revisions that have R
1730           # as a previous revision.
1731           while t_p >= t_c:
1732             self.rev_data[prev][0] = t_c - 1    # new timestamp
1733             self.rev_data[prev][2] = t_p        # old timestamp
1734             delta = t_c - 1 - t_p
1735             msg =  "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
1736                   % (self.cvs_path, prev, time.ctime(t_p), delta)
1737             Log().write(LOG_VERBOSE, msg)
1738             if (delta > COMMIT_THRESHOLD
1739                 or delta < (COMMIT_THRESHOLD * -1)):
1740               str = "%s: Significant timestamp change for '%s' (%d seconds)"
1741               Log().write(LOG_WARN,
1742                           str % (warning_prefix, self.cvs_path, delta))
1743             current = prev
1744             prev = self.prev_rev[current]
1745             if not prev:
1746               break
1747             t_c = t_c - 1               # self.rev_data[current][0]
1748             t_p = self.rev_data[prev][0]
1749
1750           # break from the for-loop
1751           break
1752       else:
1753         # finished the for-loop (no resyncing was performed)
1754         return
1755
1756   def set_revision_info(self, revision, log, text):
1757     timestamp, author, old_ts = self.rev_data[revision]
1758     digest = sha.new(log + '\0' + author).hexdigest()
1759     if old_ts:
1760       # the timestamp on this revision was changed. log it for later
1761       # resynchronization of other files's revisions that occurred
1762       # for this time and log message.
1763       self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
1764
1765     # "...Give back one kadam to honor the Hebrew God whose Ark this is."
1766     #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
1767     #
1768     # If revision 1.1 appears to have been created via 'cvs add'
1769     # instead of 'cvs import', then this file probably never had a
1770     # default branch, so retroactively remove its record in the
1771     # default branches db.  The test is that the log message CVS uses
1772     # for 1.1 in imports is "Initial revision\n" with no period.
1773     if revision == '1.1' and log != 'Initial revision\n':
1774       try:
1775         del self.default_branches_db[self.cvs_path]
1776       except KeyError:
1777         pass
1778
1779     # Get the timestamps of the previous and next revisions
1780     prev_rev = self.prev_rev[revision]
1781     prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])
1782
1783     next_rev = self.next_rev.get(revision)
1784     next_timestamp, ign, ign = self.rev_data.get(next_rev, [0, None, None])
1785
1786     # How to tell if a CVSRevision is an add, a change, or a deletion:
1787     #
1788     # It's a delete if RCS state is 'dead'
1789     #
1790     # It's an add if RCS state is 'Exp.' and
1791     #      - we either have no previous revision
1792     #        or
1793     #      - we have a previous revision whose state is 'dead'
1794     #
1795     # Anything else is a change.
1796     if self.rev_state[revision] == 'dead':
1797       op = OP_DELETE
1798     elif ((self.prev_rev.get(revision, None) is None)
1799           or (self.rev_state[self.prev_rev[revision]] == 'dead')):
1800       op = OP_ADD
1801     else:
1802       op = OP_CHANGE
1803
1804     def is_branch_revision(rev):
1805       """Return True if this revision is not a trunk revision,
1806       else return False."""
1807       if rev.count('.') >= 3:
1808         return True
1809       return False
1810
1811     def is_same_line_of_development(rev1, rev2):
1812       """Return True if rev1 and rev2 are on the same line of
1813       development (i.e., both on trunk, or both on the same branch);
1814       return False otherwise.  Either rev1 or rev2 can be None, in
1815       which case automatically return False."""
1816       if rev1 is None or rev2 is None:
1817         return False
1818       if rev1.count('.') == 1 and rev2.count('.') == 1:
1819         return True
1820       if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
1821         return True
1822       return False
1823
1824     # There can be an odd situation where the tip revision of a branch
1825     # is alive, but every predecessor on the branch is in state 'dead',
1826     # yet the revision from which the branch sprouts is alive.  (This
1827     # is sort of a mirror image of the more common case of adding a
1828     # file on a branch, in which the first revision on the branch is
1829     # alive while the revision from which it sprouts is dead.)
1830     #
1831     # In this odd situation, we must mark the first live revision on
1832     # the branch as an OP_CHANGE instead of an OP_ADD, because it
1833     # reflects, however indirectly, a change w.r.t. the source
1834     # revision from which the branch sprouts.
1835     #
1836     # This is issue #89.
1837     cur_num = revision
1838     if is_branch_revision(revision) and self.rev_state[revision] != 'dead':
1839       while 1:
1840         prev_num = self.prev_rev.get(cur_num, None)
1841         if not cur_num or not prev_num:
1842           break
1843         if (not is_same_line_of_development(cur_num, prev_num)
1844             and self.rev_state[cur_num] == 'dead'
1845             and self.rev_state[prev_num] != 'dead'):
1846           op = OP_CHANGE
1847         cur_num = self.prev_rev.get(cur_num, None)
1848
1849     if text:
1850       deltatext_code = DELTATEXT_NONEMPTY
1851     else:
1852       deltatext_code = DELTATEXT_EMPTY
1853
1854     c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp,
1855                         next_timestamp, op,
1856                         prev_rev, revision, next_rev,
1857                         self.file_in_attic, self.file_executable,
1858                         self.file_size,
1859                         deltatext_code, self.fname,
1860                         self.mode, self.rev_to_branch_name(revision),
1861                         self.taglist.get(revision, []),
1862                         self.branchlist.get(revision, []))
1863     self.revs.write(str(c_rev) + "\n")
1864     StatsKeeper().record_c_rev(c_rev)
1865
1866     if not self.metadata_db.has_key(digest):
1867       self.metadata_db[digest] = (author, log)
1868
1869   def parse_completed(self):
1870     # Walk through all branches and tags and register them with
1871     # their parent branch in the symbol database.
1872     for revision, symbols in self.taglist.items() + self.branchlist.items():
1873       for symbol in symbols:
1874         name = self.rev_to_branch_name(revision)
1875         if name is not None:
1876           self.symbol_db.register_branch_blocker(name, symbol)
1877
1878     self.num_files = self.num_files + 1
1879
1880   def write_symbol_db(self):
1881     self.symbol_db.write()
1882
class SymbolingsLogger:
  """Maintain the file of symbol "openings" and "closings".

  The file is used later to work out the range of Subversion
  revisions from which each file can be copied when a branch or tag
  is created in Subversion.

  An "Opening" is the CVSRevision from which a given branch/tag
  sprouts on a path.

  The "Closing" for that branch/tag and path is the next CVSRevision
  on the same line of development as the opening.

  For example, on file 'foo.c', branch BEE has branch number 1.2.2
  and therefore sprouts from revision 1.2.  So 1.2 is the opening for
  BEE on path 'foo.c', and 1.3 is the closing for BEE on path
  'foo.c'.  Many revisions may lie chronologically between 1.2 and
  1.3 -- revisions on branches of 'foo.c', possibly on BEE itself --
  but 1.3 is the next revision *on the same line* as 1.2, and that
  makes it the closing revision for every symbolic name opened by
  1.2.

  The point of all this is to make branch and tag creation cheap by
  minimizing the number of copies and deletes per creation.  Suppose
  revisions 1.2 and 1.3 of foo.c correspond to Subversion revisions
  17 and 30; then branch BEE is best copied from one of 17-30.  If
  another file, 'bar.c', has opening and closing CVSRevisions for BEE
  that correspond to Subversion revisions 24 and 39, the ideal source
  for the branch copy lies between 24 and 29, inclusive.
  """
  def __init__(self):
    # The finished openings/closings log.
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
    # Closings are parked here until their Subversion revision
    # numbers are known; see close().
    self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
    Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)

    # Maps a *source* cvs_path to the list of (uncleaned) symbolic
    # names opened on that path and not yet closed through
    # log_default_branch_closing().
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Log an opening for every symbol sprouting from C_REV and, if
    C_REV.next_rev is not None, a pending closing as well.

    Openings are logged with SVN_REVNUM, and only if C_REV is not a
    delete.  The revision number of a closing is determined later, in
    close()."""
    for symbol in c_rev.tags + c_rev.branches:
      self._note_default_branch_opening(c_rev, symbol)
      if c_rev.op != OP_DELETE:
        self._log(symbol, svn_revnum,
                  c_rev.cvs_path, c_rev.branch_name, OPENING)

      if c_rev.next_rev is None:
        continue
      # The revision following C_REV closes this source revision.
      # Its svn_revnum is not known yet, so park the closing in the
      # closings file for later processing.
      self.closings.write('%s %s\n' %
                          (symbol, c_rev.unique_key(c_rev.next_rev)))

  def _log(self, name, svn_revnum, cvs_path, branch_name, type):
    """Append one line to the symbol_openings_closings file, stating
    that SVN_REVNUM of CVS_PATH on BRANCH_NAME is the opening or
    closing (TYPE) of the symbolic name NAME.

    A false BRANCH_NAME (e.g. None) is written as '*'.

    TYPE should only be one of the following global constants:
    OPENING or CLOSING."""
    # Revision numbers are zero-padded to 8 digits, which leaves room
    # for 99,999,999 SVN revs.  That *should* be enough.
    branch_field = branch_name or '*'
    entry = '%s %.8d %s %s %s\n' % (name, svn_revnum, type,
                                    branch_field, cvs_path)
    self.symbolings.write(entry)

  def close(self):
    """Resolve the parked closings and finish the symbolings file.

    For every line of the closings file, look up the svn_revnum of
    the closing CVSRevision, write the corresponding CLOSING line to
    the symbolings file, and finally close that file."""
    # Needed to turn each revision key back into a CVSRevision.
    cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    for entry in fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP)):
      (symbol, rev_key) = entry.rstrip().split(" ", 1)
      svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

      closing_rev = cvs_revs_db.get_revision(rev_key)
      self._log(symbol, svn_revnum, closing_rev.cvs_path,
                closing_rev.branch_name, CLOSING)

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Remember that SYMBOLIC_NAME has been opened on C_REV.cvs_path.

    The name is recorded unconditionally; no check is made here that
    C_REV actually is a default branch revision."""
    opened = self.open_paths_with_default_branches.setdefault(
        c_rev.cvs_path, [])
    opened.append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If any symbolic names are recorded as open on C_REV.cvs_path,
    log each of them as a closing with SVN_REVNUM as the closing
    revision number, then forget them."""
    path = c_rev.cvs_path
    if path not in self.open_paths_with_default_branches:
      return
    for symbol in self.open_paths_with_default_branches[path]:
      self._log(symbol, svn_revnum, path, None, CLOSING)
    # These openings have been dealt with; drop them.
    del self.open_paths_with_default_branches[path]
1994
1995
class PersistenceManager:
  """The PersistenceManager allows us to effectively store SVNCommits
  to disk and retrieve them later using only their subversion revision
  number as the key.  It also returns the subversion revision number
  for a given CVSRevision's unique key.

  All information pertinent to each SVNCommit is stored in a series of
  on-disk databases so that SVNCommits can be retrieved on-demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager will initialize a new set of on-disk
  databases and be fully-featured.
  In 'read' mode, PersistenceManager will open existing on-disk databases
  and the set_* methods raise RuntimeError."""
  def __init__(self, mode):
    """Open the databases in MODE.

    Raise RuntimeError if MODE is neither DB_OPEN_NEW nor
    DB_OPEN_READ.  The metadata and CVS revision databases are always
    opened read-only.  The tags and motivating-revnum databases are
    only opened when the conversion is not trunk-only."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
    self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
    Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
    self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
    Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
    self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
    Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      self.tags_db = TagsDatabase(DB_OPEN_READ)
      self.motivating_revnums = SDatabase(temp(MOTIVATING_REVNUMS), mode)
      Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def _require_writable(self):
    """Raise RuntimeError if this manager was opened read-only."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    The author and log message are taken from the metadata of the
    first CVSRevision of the commit.  Unless the conversion is
    trunk-only, the symbolic name, date and motivating revnum stored
    for SVN_REVNUM are restored as well.

    This method can throw SVNCommitInternalInconsistencyError.
    """
    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)
    c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
    if c_rev_keys is None:
      return None

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      # Set the author and log message for this commit by using
      # CVSRevision metadata, but only if haven't done so already.
      if digest is None:
        digest = c_rev.digest
        author, log_msg = self.svn_commit_metadata[digest]
        svn_commit.set_author(author)
        svn_commit.set_log_msg(log_msg)

    # If we're doing a trunk-only conversion, we don't need to do any more
    # work.
    if Ctx().trunk_only:
      return svn_commit

    name, date = self._get_name_and_date(svn_revnum)
    if name:
      svn_commit.set_symbolic_name(name)
      svn_commit.set_date(date)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
    if motivating_revnum:
      svn_commit.set_motivating_revnum(int(motivating_revnum))
      svn_commit.set_date(date)

    # A commit either carries CVS revisions or fills a symbolic name,
    # never both.
    if len(svn_commit.cvs_revs) and name:
      raise SVNCommit.SVNCommitInternalInconsistencyError(
          "An SVNCommit cannot have cvs_revisions *and* a corresponding\n"
          "symbolic name ('%s') to fill."
          % (_clean_symbolic_name(name),))

    return svn_commit

  def set_cvs_revs(self, svn_revnum, cvs_revs):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS.

    Raise RuntimeError if the manager is read-only."""
    self._require_writable()
    # Compute each unique key once; it is needed for the log, for the
    # forward mapping and for the reverse mapping.
    keys = [c_rev.unique_key() for c_rev in cvs_revs]
    for key in keys:
      Log().write(LOG_VERBOSE, " ", key)
    self.svn2cvs_db[str(svn_revnum)] = keys
    for key in keys:
      self.cvs2svn_db[key] = svn_revnum

  def set_name_and_date(self, svn_revnum, name, date):
    """Associate symbolic name NAME and DATE with SVN_REVNUM, and
    note SVN_REVNUM as the revision in which NAME was last filled.

    NAME is allowed to be None.

    Raise RuntimeError if the manager is read-only."""
    self._require_writable()
    self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
    self.last_filled[name] = svn_revnum

  def _get_name_and_date(self, svn_revnum):
    """Return a tuple containing the symbolic name and date associated
    with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
    associated with it."""
    return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))

  def set_motivating_revnum(self, svn_revnum, motivating_revnum):
    """Store MOTIVATING_REVNUM as the value of SVN_REVNUM.

    Raise RuntimeError if the manager is read-only."""
    self._require_writable()
    self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
2124
2125
2126 class CVSCommit:
2127   """Each instance of this class contains a number of CVS Revisions
2128   that correspond to one or more Subversion Commits.  After all CVS
2129   Revisions are added to the grouping, calling process_revisions will
2130   generate a Subversion Commit (or Commits) for the set of CVS
2131   Revisions in the grouping."""
2132
  def __init__(self, digest, author, log):
    """Store DIGEST, AUTHOR and LOG, and set up empty bookkeeping
    for the CVSRevisions that will be added to this commit."""
    self.digest = digest
    self.author = author
    self.log = log

    # Symbolic names for which the last source revision has already
    # been seen and for which the CVSRevisionAggregator has already
    # generated a fill SVNCommit.  See self.process_revisions().
    self.done_symbols = [ ]

    # Keyed by file name; has_file() tests membership in this dict.
    self.files = { }
    # Lists of CVSRevisions; revisions() returns the changes followed
    # by the deletes.
    self.changes = [ ]
    self.deletes = [ ]

    # Start out with a t_min higher than any incoming time T, and a
    # t_max lower than any incoming T.  This way the first T will
    # push t_min down to T, and t_max up to T, naturally (without any
    # special-casing), and successive times will then ratchet them
    # outward as appropriate.
    self.t_min = 1L<<32
    self.t_max = 0

    # This will be set to the SVNCommit that occurs in self._commit.
    self.motivating_commit = None

    # This is a list of all non-primary commits motivated by the main
    # commit.  We gather these so that we can set their dates to the
    # same date as the primary commit.
    self.secondary_commits = [ ]

    # State for handling default branches.
    #
    # Here is a tempting, but ultimately nugatory, bit of logic, which
    # I share with you so you may appreciate the less attractive, but
    # refreshingly non-nugatory, logic which follows it:
    #
    # If some of the commits in this txn happened on a non-trunk
    # default branch, then those files will have to be copied into
    # trunk manually after being changed on the branch (because the
    # RCS "default branch" appears as head, i.e., trunk, in practice).
    # As long as those copies don't overwrite any trunk paths that
    # were also changed in this commit, then we can do the copies in
    # the same revision, because they won't cover changes that don't
    # appear anywhere/anywhen else.  However, if some of the trunk dst
    # paths *did* change in this commit, then immediately copying the
    # branch changes would lose those trunk mods forever.  So in this
    # case, we need to do at least that copy in its own revision.  And
    # for simplicity's sake, if we're creating the new revision for
    # even one file, then we just do all such copies together in the
    # new revision.
    #
    # Doesn't that sound nice?
    #
    # Unfortunately, Subversion doesn't support copies with sources
    # in the current txn.  All copies must be based in committed
    # revisions.  Therefore, we generate the above-described new
    # revision unconditionally.
    #
    # This is a list of c_revs, and a c_rev is appended for each
    # default branch commit that will need to be copied to trunk (or
    # deleted from trunk) in some generated revision following the
    # "regular" revision.
    self.default_branch_cvs_revisions = [ ]
2197
2198   def __cmp__(self, other):
2199     # Commits should be sorted by t_max.  If both self and other have
2200     # the same t_max, break the tie using t_min, and lastly, digest
2201     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
2202             or cmp(self.digest, other.digest))
2203
2204   def has_file(self, fname):
2205     return self.files.has_key(fname)
2206
2207   def revisions(self):
2208     return self.changes + self.deletes
2209
2210   def opens_symbolic_name(self, name):
2211     """Returns true if any CVSRevision in this commit is on a tag or a
2212     branch or is the origin of a tag or branch."""
2213     for c_rev in self.revisions():
2214       if c_rev.opens_symbolic_name(name):
2215         return 1
2216     return 0
2217
2218   def add_revision(self, c_rev):
2219     # Record the time range of this commit.
2220     #
2221     # ### ISSUE: It's possible, though unlikely, that the time range
2222     # of a commit could get gradually expanded to be arbitrarily
2223     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
2224     # problem, and anyway deciding where to break it up would be a
2225     # judgement call.  For now, we just print a warning in commit() if
2226     # this happens.
2227     if c_rev.timestamp < self.t_min:
2228       self.t_min = c_rev.timestamp
2229     if c_rev.timestamp > self.t_max:
2230       self.t_max = c_rev.timestamp
2231
2232     if c_rev.op == OP_DELETE:
2233       self.deletes.append(c_rev)
2234     else:
2235       # OP_CHANGE or OP_ADD
2236       self.changes.append(c_rev)
2237
2238     self.files[c_rev.fname] = 1
2239
2240   def _pre_commit(self):
2241     """Generates any SVNCommits that must exist before the main
2242     commit."""
2243
2244     # There may be multiple c_revs in this commit that would cause
2245     # branch B to be filled, but we only want to fill B once.  On the
2246     # other hand, there might be multiple branches committed on in
2247     # this commit.  Whatever the case, we should count exactly one
2248     # commit per branch, because we only fill a branch once per
2249     # CVSCommit.  This list tracks which branches we've already
2250     # counted.
2251     accounted_for_sym_names = [ ]
2252
2253     def fill_needed(c_rev, pm):
2254       """Return 1 if this is the first commit on a new branch (for
2255       this file) and we need to fill the branch; else return 0
2256       (meaning that some other file's first commit on the branch has
2257       already done the fill for us).
2258
2259       If C_REV.op is OP_ADD, only return 1 if the branch that this
2260       commit is on has no last filled revision.
2261
2262       PM is a PersistenceManager to query.
2263       """
2264
2265       # Different '.' counts indicate that c_rev is now on a different
2266       # line of development (and may need a fill)
2267       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
2268         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
2269         # It should be the case that when we have a file F that
2270         # is added on branch B (thus, F on trunk is in state
2271         # 'dead'), we generate an SVNCommit to fill B iff the branch
2272         # has never been filled before.
2273         #
2274         # If this c_rev.op == OP_ADD, *and* the branch has never
2275         # been filled before, then fill it now.  Otherwise, no need to
2276         # fill it.
2277         if c_rev.op == OP_ADD:
2278           if pm.last_filled.get(c_rev.branch_name, None) is None:
2279             return 1
2280         elif c_rev.op == OP_CHANGE:
2281           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
2282             return 1
2283         elif c_rev.op == OP_DELETE:
2284           if pm.last_filled.get(c_rev.branch_name, None) is None:
2285             return 1
2286       return 0
2287
2288     for c_rev in self.changes + self.deletes:
2289       # If a commit is on a branch, we must ensure that the branch
2290       # path being committed exists (in HEAD of the Subversion
2291       # repository).  If it doesn't exist, we will need to fill the
2292       # branch.  After the fill, the path on which we're committing
2293       # will exist.
2294       if c_rev.branch_name \
2295           and c_rev.branch_name not in accounted_for_sym_names \
2296           and c_rev.branch_name not in self.done_symbols \
2297           and fill_needed(c_rev, Ctx()._persistence_manager):
2298         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
2299                                % c_rev.branch_name)
2300         svn_commit.set_symbolic_name(c_rev.branch_name)
2301         self.secondary_commits.append(svn_commit)
2302         accounted_for_sym_names.append(c_rev.branch_name)
2303
2304   def _commit(self):
2305     """Generates the primary SVNCommit that corresponds to this
2306     CVSCommit."""
2307     # Generate an SVNCommit unconditionally.  Even if the only change
2308     # in this CVSCommit is a deletion of an already-deleted file (that
2309     # is, a CVS revision in state 'dead' whose predecessor was also in
2310     # state 'dead'), the conversion will still generate a Subversion
2311     # revision containing the log message for the second dead
2312     # revision, because we don't want to lose that information.
2313     svn_commit = SVNCommit("commit")
2314     self.motivating_commit = svn_commit
2315
2316     for c_rev in self.changes:
2317       svn_commit.add_revision(c_rev)
2318       # Only make a change if we need to.  When 1.1.1.1 has an empty
2319       # deltatext, the explanation is almost always that we're looking
2320       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
2321       # such imports, CVS creates an RCS file where 1.1 has the
2322       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
2323       # content as 1.1.  There's no reason to reflect this non-change
2324       # in the repository, so we want to do nothing in this case.  (If
2325       # we were really paranoid, we could make sure 1.1's log message
2326       # is the CVS-generated "Initial revision\n", but I think the
2327       # conditions below are strict enough.)
2328       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
2329               and (c_rev.rev == "1.1.1.1")):
2330         if c_rev.is_default_branch_revision():
2331           self.default_branch_cvs_revisions.append(c_rev)
2332
2333     for c_rev in self.deletes:
2334       # When a file is added on a branch, CVS not only adds the file
2335       # on the branch, but generates a trunk revision (typically
2336       # 1.1) for that file in state 'dead'.  We only want to add
2337       # this revision if the log message is not the standard cvs
2338       # fabricated log message.
2339       if c_rev.prev_rev is None:
2340         # c_rev.branches may be empty if the originating branch
2341         # has been excluded.
2342         if not c_rev.branches:
2343           continue
2344         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
2345                              % (c_rev.filename(),
2346                                 c_rev.branches[0]))
2347         author, log_msg = \
2348             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
2349         if log_msg == cvs_generated_msg:
2350           continue
2351
2352       svn_commit.add_revision(c_rev)
2353       if c_rev.is_default_branch_revision():
2354         self.default_branch_cvs_revisions.append(c_rev)
2355
2356     # There is a slight chance that we didn't actually register any
2357     # CVSRevisions with our SVNCommit (see loop over self.deletes
2358     # above), so if we have no CVSRevisions, we don't flush the
2359     # svn_commit to disk and roll back our revnum.
2360     if len(svn_commit.cvs_revs) > 0:
2361       svn_commit.flush()
2362     else:
2363       # We will not be flushing this SVNCommit, so rollback the
2364       # SVNCommit revision counter.
2365       SVNCommit.revnum = SVNCommit.revnum - 1
2366
2367     if not Ctx().trunk_only:
2368       for c_rev in self.revisions():
2369         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
2370
2371   def _post_commit(self):
2372     """Generates any SVNCommits that we can perform now that _commit
2373     has happened.  That is, handle non-trunk default branches.
2374     Sometimes an RCS file has a non-trunk default branch, so a commit
2375     on that default branch would be visible in a default CVS checkout
2376     of HEAD.  If we don't copy that commit over to Subversion's trunk,
2377     then there will be no Subversion tree which corresponds to that
2378     CVS checkout.  Of course, in order to copy the path over, we may
2379     first need to delete the existing trunk there.  """
2380
2381     # Only generate a commit if we have default branch revs
2382     if len(self.default_branch_cvs_revisions):
2383       # Generate an SVNCommit for all of our default branch c_revs.
2384       svn_commit = SVNCommit("post-commit default branch(es)")
2385       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
2386       for c_rev in self.default_branch_cvs_revisions:
2387         svn_commit.add_revision(c_rev)
2388         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
2389                                                             svn_commit.revnum)
2390       self.secondary_commits.append(svn_commit)
2391
2392   def process_revisions(self, done_symbols):
2393     """Process all the CVSRevisions that this instance has, creating
2394     one or more SVNCommits in the process.  Generate fill SVNCommits
2395     only for symbols not in DONE_SYMBOLS (avoids unnecessary
2396     fills).
2397
2398     Return the primary SVNCommit that corresponds to this CVSCommit.
2399     The returned SVNCommit is the commit that motivated any other
2400     SVNCommits generated in this CVSCommit."""
2401     self.done_symbols = done_symbols
2402     seconds = self.t_max - self.t_min + 1
2403
2404     Log().write(LOG_VERBOSE, '-' * 60)
2405     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2406     if seconds == 1:
2407       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
2408                   % time.ctime(self.t_max))
2409     else:
2410       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
2411       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
2412                   % (time.ctime(self.t_max), seconds))
2413
2414     if seconds > COMMIT_THRESHOLD + 1:
2415       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2416                   % (warning_prefix, COMMIT_THRESHOLD))
2417
2418     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2419       self._commit()
2420       return self.motivating_commit
2421
2422     self._pre_commit()
2423     self._commit()
2424     self._post_commit()
2425
2426     for svn_commit in self.secondary_commits:
2427       svn_commit.set_date(self.motivating_commit.get_date())
2428       svn_commit.flush()
2429
2430     return self.motivating_commit
2431
2432
class SVNCommit:
  """One commit to the Subversion repository.

  An SVNCommit is of one of three kinds:

  1. It commits one or more CVSRevisions (and cannot fill a symbolic
     name).

  2. It creates or fills a symbolic name (and cannot commit
     CVSRevisions).

  3. It updates trunk to reflect the contents of a particular branch
     (this is how RCS default branches are handled)."""

  # Revision number that the next newly created SVNCommit will get.
  # Counting starts at 2 because SVNRepositoryMirror uses the first
  # commit to create trunk, tags, and branches.
  revnum = 2

  class SVNCommitInternalInconsistencyError(Exception):
    """Raised when an impossible state is encountered in the SVNCommit
    databases."""
    pass

  def __init__(self, description="", revnum=None, cvs_revs=None):
    """Create an SVNCommit.

    DESCRIPTION is used for debugging only.  If REVNUM is given, the
    SVNCommit corresponds to that revision number; otherwise it takes
    the current value of the class-wide SVNCommit.revnum counter,
    which is then incremented.  If CVS_REVS is given, it must be the
    exact set of CVSRevisions for REVNUM.

    It is an error to pass CVS_REVS without REVNUM, but REVNUM may be
    passed without CVS_REVS, the revisions then being added one at a
    time with add_revision()."""
    self._description = description

    # Revprop metadata for this commit.
    #
    # The values assigned here are placeholders; at least the log and
    # the date should have changed by the time they are used.
    #
    # The attributes are private because they have to be handed out
    # encoded in UTF8 while callers are free to set them in another
    # encoding.  They are therefore set through accessor methods and
    # read, in dictionary form, through self.get_revprops().
    self._author = Ctx().username
    self._log_msg = "This log message means an SVNCommit was used too soon."
    self._max_date = 0  # Latest date seen so far.

    self.cvs_revs = cvs_revs or []

    if not revnum:
      # No number was requested, so claim the next one.
      revnum = SVNCommit.revnum
      SVNCommit.revnum = revnum + 1
    self.revnum = revnum

    # The (uncleaned) symbolic name filled by this SVNCommit, if any.
    self.symbolic_name = None

    # For a default branch synchronization commit, the Subversion
    # revision number of the *primary* commit in which the default
    # branch changes actually happened; None for any other commit.
    #
    # Several synchronization commits may refer to the same
    # motivating revision number, and a single synchronization commit
    # may contain CVSRevisions on several different default branches.
    self.motivating_revnum = None

    # True only if this commit fills a symbolic name that is a tag;
    # None in all other cases.
    self.is_tag = None

  def set_symbolic_name(self, symbolic_name):
    "Store SYMBOLIC_NAME in self.symbolic_name."
    self.symbolic_name = symbolic_name

  def set_motivating_revnum(self, revnum):
    "Store REVNUM in self.motivating_revnum."
    self.motivating_revnum = revnum

  def set_author(self, author):
    """Make AUTHOR (a locally-encoded string) the author of this
    SVNCommit.  There is no other way to set the author."""
    self._author = author

  def set_log_msg(self, msg):
    """Make MSG (a locally-encoded string) the log message of this
    SVNCommit.  There is no other way to set the log message."""
    self._log_msg = msg

  def set_date(self, date):
    """Make DATE (an integer) the date of this SVNCommit.

    self.add_revision() keeps the date up to date on its own, based
    on the CVSRevisions it is given, so calling this may not be
    necessary at all; and a value set here may still be overwritten
    by a later call to self.add_revision()."""
    self._max_date = date

  def get_date(self):
    """Return the date of this SVNCommit as an integer."""
    return self._max_date

  def get_revprops(self):
    """Return the Subversion revprops of this SVNCommit as a
    dictionary with the keys 'svn:author', 'svn:log' and 'svn:date'.

    Author and log message are converted with to_utf8().  If that
    raises UnicodeError, warnings are logged and the unconverted
    values are returned instead."""
    date = format_date(self._max_date)
    try:
      if self._author is None:
        utf8_author = None
      else:
        utf8_author = to_utf8(self._author)
      utf8_log = to_utf8(self.get_log_msg())
    except UnicodeError:
      Log().write(LOG_WARN, '%s: problem encoding author or log message:'
                  % warning_prefix)
      Log().write(LOG_WARN, "  author: '%s'" % self._author)
      Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
      Log().write(LOG_WARN, "  date:   '%s'" % date)
      Log().write(LOG_WARN,
                  "(subversion rev %s)  Related files:" % self.revnum)
      for c_rev in self.cvs_revs:
        Log().write(LOG_WARN, " ", c_rev.fname)

      Log().write(LOG_WARN, "Consider rerunning with (for example)",
                  "'--encoding=latin1'.\n")
      # Falling back to the original data (of unknown encoding) beats
      # both quitting and recording nothing at all.
      return { 'svn:author' : self._author,
               'svn:log'    : self.get_log_msg(),
               'svn:date'   : date }

    return { 'svn:author' : utf8_author,
             'svn:log'    : utf8_log,
             'svn:date'   : date }

  def add_revision(self, cvs_rev):
    """Append CVS_REV to self.cvs_revs and move this commit's date
    forward to CVS_REV.timestamp if that is later."""
    self.cvs_revs.append(cvs_rev)
    self._max_date = max(self._max_date, cvs_rev.timestamp)

  def _is_primary_commit(self):
    """Return true if this is a primary SVNCommit, i.e. if it has
    neither a symbolic name nor a motivating revnum; false otherwise."""
    if self.symbolic_name or self.motivating_revnum:
      return not 1
    return not 0

  def flush(self):
    """Log the creation of this revision and record this SVNCommit
    with the persistence manager: always its CVSRevisions, its
    motivating revnum if it has one, and, for a non-primary commit,
    its symbolic name and date."""
    Log().write(LOG_NORMAL, "Creating Subversion r%d (%s)"
                % (self.revnum, self._description))
    pm = Ctx()._persistence_manager
    pm.set_cvs_revs(self.revnum, self.cvs_revs)

    if self.motivating_revnum is not None:
      pm.set_motivating_revnum(self.revnum, self.motivating_revnum)

    # A non-primary commit needs its date and/or its symbolic_name
    # stored as well.
    if not self._is_primary_commit():
      pm.set_name_and_date(self.revnum, self.symbolic_name, self._max_date)

  def __str__(self):
    """Return a human-readable description of this SVNCommit.  The
    description is not meant to be machine-parseable (although nobody
    is going to stop you if you try!)"""

    if self.symbolic_name:
      name_line = ("   symbolic name: "
                   + _clean_symbolic_name(self.symbolic_name))
    else:
      name_line = "   NO symbolic name"

    lines = [ "SVNCommit #: " + str(self.revnum),
              name_line,
              "   debug description: " + self._description,
              "   cvs_revs:" ]
    for c_rev in self.cvs_revs:
      lines.append("     " + c_rev.unique_key())

    # Every line, including the last one, is newline-terminated.
    return "\n".join(lines) + "\n"

  def get_log_msg(self):
    """Return the log message of this commit: the actual message for
    a primary commit, the appropriate manufactured message for a
    secondary commit."""
    if self.symbolic_name is not None:
      return self._log_msg_for_symbolic_name_commit()
    if self.motivating_revnum is not None:
      return self._log_msg_for_default_branch_commit()
    return self._log_msg

  def _log_msg_for_symbolic_name_commit(self):
    """Return the log message for a manufactured commit that fills
    self.symbolic_name.  The message speaks of a tag if self.is_tag is
    true and of a branch otherwise."""
    if self.is_tag:
      kind = 'tag'
    else:
      kind = 'branch'

    # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
    # Instead, names of 13 or more characters go on a line of their
    # own.
    cleaned_symbolic_name = _clean_symbolic_name(self.symbolic_name)
    if len(cleaned_symbolic_name) >= 13:
      separator = '\n'
    else:
      separator = ' '

    return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
           % (kind, separator, cleaned_symbolic_name)

  def _log_msg_for_default_branch_commit(self):
    """Return the log message for a manufactured commit that
    synchronizes a non-trunk default branch with trunk."""
    template = ('This commit was generated by cvs2svn to compensate for '
                'changes in r%d,\n'
                'which included commits to RCS files with non-trunk default '
                'branches.\n')
    return template % self.motivating_revnum
2639
class CVSRevisionAggregator:
  """Groups CVSRevisions into CVSCommits, each of which represents at
  least one SVNCommit."""

  def __init__(self):
    """Open the databases that are read during aggregation, set up
    empty bookkeeping, and install a new SymbolingsLogger, a new
    PersistenceManager and the default branches database on Ctx()."""
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB),
                                   DB_OPEN_READ)

    # A map { key : CVSCommit } of the CVS commits that are currently
    # being accumulated.  While a CVSCommit still accepts further
    # CVSRevisions, its key is CVSRevision.digest.  Once it no longer
    # does (because an inbound commit wanted to affect a file that was
    # already part of it), its key is CVSRevision.digest followed by
    # some number of '-'.
    self.cvs_commits = {}

    # A map { symbol : None } of the symbolic names whose last source
    # CVSRevision has been processed already but which have not been
    # closed yet.
    self.pending_symbols = {}

    # The closed symbols: the last CVSRevision that is a source for
    # the symbol has been seen, the final fill has been done, and the
    # symbol never needs to be filled again.
    self.done_symbols = [ ]

    # The most recently created primary svn_commit object.  It is kept
    # merely for its date, which is given to the SVNCommits created in
    # self._attempt_to_commit_symbols().
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
                                           DB_OPEN_READ)

  def _extract_ready_commits(self, timestamp):
    """Remove from self.cvs_commits, and return as a list, every
    active commit that has expired by TIMESTAMP, i.e. whose t_max plus
    COMMIT_THRESHOLD lies before TIMESTAMP."""

    expired = [ ]
    for key, commit in self.cvs_commits.items():
      if commit.t_max + COMMIT_THRESHOLD >= timestamp:
        # Still within reach of revisions carrying this timestamp.
        continue
      expired.append(commit)
      del self.cvs_commits[key]
    return expired

  def process_revision(self, c_rev):
    """Feed the CVSRevision C_REV into the aggregator.

    C_REV is added to the open CVSCommit with the same digest, or to
    a new one.  CVSCommits that have expired by C_REV's timestamp are
    processed in sorted order, and symbols that have become closeable
    are committed."""
    # Every incoming revision is an occasion to check whether any of
    # the accumulating commits are ready for processing.
    ready_queue = self._extract_ready_commits(c_rev.timestamp)

    for digest_key, pending in self.cvs_commits.items():
      # A pending commit that already contains the file of the inbound
      # revision is closed to further changes.  It is not flushed,
      # though, since other pending commits may be dated before it.
      # ### ISSUE: the has_file() check below is not optimal.
      # It does fix the dataloss bug where revisions would get lost
      # if checked in too quickly, but it can also break apart the
      # commits.  The correct fix would require tracking the
      # dependencies between change sets and committing them in proper
      # order.
      if not pending.has_file(c_rev.fname):
        continue
      # Append '-'s until the string is not already a key in the
      # self.cvs_commits dict.
      closed_key = digest_key + '-'
      while closed_key in self.cvs_commits:
        closed_key = closed_key + '-'
      self.cvs_commits[closed_key] = pending
      del self.cvs_commits[digest_key]

    # File the revision under the still-open commit with its digest,
    # creating that commit first if there is none.
    target = self.cvs_commits.get(c_rev.digest)
    if target is None:
      author, log = self.metadata_db[c_rev.digest]
      target = CVSCommit(c_rev.digest, author, log)
      self.cvs_commits[c_rev.digest] = target
    target.add_revision(c_rev)

    if not ready_queue:
      # Even with no commit ready, the symbols of this c_rev have to
      # be added and an attempt to commit symbols has to be made.
      self._add_pending_symbols(c_rev)
      self._attempt_to_commit_symbols(ready_queue)
      return

    # Whatever is in the ready_queue has to be processed now, because
    # this latest rev cannot possibly belong to any of it.  Do so in
    # time order.
    ready_queue.sort()
    while ready_queue:
      ready = ready_queue.pop(0)
      self.latest_primary_svn_commit = \
          ready.process_revisions(self.done_symbols)
      self._add_pending_symbols(c_rev)
      self._attempt_to_commit_symbols(ready_queue)

  def flush(self):
    """Commit whatever is left in self.cvs_commits, then tell the
    SymbolingsLogger that all commits are done."""

    ready_queue = [ (commit, key)
                    for key, commit in self.cvs_commits.items() ]
    ready_queue.sort()

    while ready_queue:
      (cvs_commit, key) = ready_queue.pop(0)
      self.latest_primary_svn_commit = \
          cvs_commit.process_revisions(self.done_symbols)
      del self.cvs_commits[key]
      self._attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def _add_pending_symbols(self, c_rev):
    """Add to self.pending_symbols every symbol for which C_REV is the
    last CVSRevision.

    Unless this is a trunk-only conversion, the symbolic names that
    C_REV is the last *source* CVSRevision for are looked up and added
    to those left over from earlier passes through the aggregator."""

    if Ctx().trunk_only:
      return
    for sym in self.last_revs_db.get(c_rev.unique_key(), []):
      self.pending_symbols[sym] = None

  def _attempt_to_commit_symbols(self, queued_commits):
    """Generate one SVNCommit for each symbol in self.pending_symbols
    that has no opening CVSRevision in either QUEUED_COMMITS or
    self.cvs_commits.values()."""

    def still_has_source(sym):
      """Return 1 if a commit in the pending commit queue
      (self.cvs_commits) or in queued_commits contains a *source*
      CVSRevision for SYM, else 0."""
      for cvs_commit in self.cvs_commits.values() + queued_commits:
        if cvs_commit.opens_symbolic_name(sym):
          return 1
      return 0

    closeable_symbols = [ sym for sym in self.pending_symbols
                          if not still_has_source(sym) ]

    # Sorting makes the order in which the symbols are processed
    # independent of the order in which the dict hashing algorithm
    # hands them out, so that the tests give the same results on all
    # platforms.
    closeable_symbols.sort()
    for sym in closeable_symbols:
      closing_commit = SVNCommit("closing tag/branch '%s'" % sym)
      closing_commit.set_symbolic_name(sym)
      closing_commit.set_date(self.latest_primary_svn_commit.get_date())
      closing_commit.flush()
      self.done_symbols.append(sym)
      del self.pending_symbols[sym]
2798
2799
class SymbolingsReader:
  """An interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  It does the heavy lifting of finding and
  returning the correct opening and closing Subversion revision
  numbers for a given symbolic name."""

  def __init__(self):
    """Open SYMBOL_OPENINGS_CLOSINGS_SORTED for reading and load the
    offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets_db is really small and gets read and written a fair
    # bit, so an in-memory copy of it is used.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    self.offsets = { }
    for symbol in offsets_db:
      self.offsets[symbol] = offsets_db[symbol]

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Return a new SymbolicNameFillingGuide for SYMBOLIC_NAME, built
    from its openings and closings up to and including SVN_REVNUM.

    If an opening rev is encountered in this fill while the
    corresponding closing rev takes place later than SVN_REVNUM, the
    closing is not passed to the SymbolicNameFillingGuide in this fill
    (and is discarded when it is encountered in a later fill).  That
    is perfectly fine: a valid fill is still possible without the
    closing, and the policy is to fill whatever can be filled as soon
    as possible."""

    openings_closings_map = OpeningsClosingsMap(symbolic_name)

    # A branch may start with a file that was added on a branch, in
    # which case there is no offset for it and nothing to read.
    if symbolic_name not in self.offsets:
      return SymbolicNameFillingGuide(openings_closings_map)

    # Start reading self.symbolings at the offset recorded for
    # symbolic_name.
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      # Remember where this line starts, so that the next fill can
      # resume from the first line that was not used.
      fpos = self.symbolings.tell()
      line = self.symbolings.readline().rstrip()
      if not line:
        break
      name, revnum, kind, branch_name, cvs_path = line.split(" ", 4)
      if branch_name == '*':
        svn_path = Ctx().project.make_trunk_path(cvs_path)
      else:
        svn_path = Ctx().project.make_branch_path(branch_name, cvs_path)
      revnum = int(revnum)
      if revnum > svn_revnum or name != symbolic_name:
        break
      openings_closings_map.register(svn_path, revnum, kind)

    # If anything that was read got used, move the offset to the
    # beginning of the line read last.
    if not openings_closings_map.is_empty():
      self.offsets[symbolic_name] = fpos

    return SymbolicNameFillingGuide(openings_closings_map)
2860
2861
class SvnRevisionRange:
  """The range of Subversion revision numbers from which a path can
  be copied.

  self.opening_revnum is the number of the earliest such revision and
  self.closing_revnum is one higher than the number of the last such
  revision.  A self.closing_revnum of None means that no closing has
  been registered."""

  def __init__(self, opening_revnum):
    """Start a range that opens at OPENING_REVNUM and has no closing
    yet."""
    self.closing_revnum = None
    self.opening_revnum = opening_revnum

  def add_closing(self, closing_revnum):
    """Register CLOSING_REVNUM as the closing of this range, unless a
    closing has been registered already."""
    # With a non-trunk default branch there may be several closings;
    # only the first one encountered counts.
    if self.closing_revnum is not None:
      return
    self.closing_revnum = closing_revnum

  def __str__(self):
    """Return the range as '[OPENING:CLOSING]', or as '[OPENING:]' if
    there is no closing."""
    if self.closing_revnum is not None:
      return '[%d:%d]' % (self.opening_revnum, self.closing_revnum,)
    return '[%d:]' % (self.opening_revnum,)
2884
2885
class OpeningsClosingsMap:
  """Accumulates the openings and closings of one symbolic name for
  the current SVNCommit.

  Feed the openings and closings in with self.register(); read the
  accumulated (svn_path, SvnRevisionRange) pairs back with
  self.get_things()."""

  def __init__(self, symbolic_name):
    """Create an empty map for SYMBOLIC_NAME, ready to receive
    openings and closings."""

    # Maps each svn_path to its SvnRevisionRange object.
    self.things = { }

    self.name = symbolic_name

  def register(self, svn_path, svn_revnum, type):
    """Register an opening or a closing revision for this symbolic name.

    SVN_PATH is the source path that needs to be copied into
    self.name.  SVN_REVNUM is either the first svn revision number
    that can be copied from (an opening), or the last (not inclusive)
    svn revision number that can be copied from (a closing).  TYPE
    tells which of the two it is (OPENING or CLOSING).

    An opening always starts a fresh range for SVN_PATH.  A closing
    only has an effect if the opening for the same SVN_PATH was
    registered before it; otherwise it is discarded.  Openings do not
    need a matching closing."""
    if type == OPENING:
      self.things[svn_path] = SvnRevisionRange(svn_revnum)
    elif type == CLOSING:
      # A closing without a previously registered opening is dropped.
      if svn_path in self.things:
        self.things[svn_path].add_closing(svn_revnum)

  def is_empty(self):
    """Return true if nothing has been accumulated yet, false
    otherwise."""
    return len(self.things) == 0

  def get_things(self):
    """Return the (svn_path, SvnRevisionRange) pairs of all svn_paths
    that have a registered opening."""

    pairs = self.things.items()
    return pairs
2937
2938
class SymbolicNameFillingGuide:
  """A node tree representing the source paths to be copied to fill
  self.name in the current SVNCommit.

  self._node_tree is the root of the directory tree, in the form {
  path_component : subnode }.  Leaf nodes are instances of
  SvnRevisionRange.  Intermediate (directory) nodes are dictionaries
  mapping relative names to subnodes.

  By walking self._node_tree and calling self.get_best_revnum() on
  each node, the caller can determine what subversion revision number
  to copy the path corresponding to that node from.  self._node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""

  def __init__(self, openings_closings_map):
    """Initialize a SymbolicNameFillingGuide for the symbolic name of
    OPENINGS_CLOSINGS_MAP and store into it the openings and closings
    from OPENINGS_CLOSINGS_MAP."""

    self.name = openings_closings_map.name

    # The root of our node tree, a map { path_component : subnode }.
    self._node_tree = { }

    for svn_path, svn_revision_range in openings_closings_map.get_things():
      (head, tail) = _path_split(svn_path)
      self._get_node_for_path(head)[tail] = svn_revision_range

    #self.print_node_tree(self._node_tree)

  def _get_node_for_path(self, svn_path):
    """Return the directory node for SVN_PATH, creating new
    (empty) directory nodes along the way as needed."""
    # Walk down the path, one node at a time.
    node = self._node_tree
    for component in svn_path.split('/'):
      # setdefault() returns the existing subnode, or inserts and
      # returns a fresh empty one.
      node = node.setdefault(component, {})

    return node

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE.

    Return a tuple (REVNUM, SCORE), where REVNUM is the chosen
    subversion revision number and SCORE is the number of
    SvnRevisionRanges at or under NODE that include REVNUM.

    PREFERRED_REVNUM is passed to best_rev and used to calculate the
    best_revnum.  It may be None.

    Raise FatalError if no revision to copy from can be found."""

    def score_revisions(svn_revision_ranges):
      """Return a list of revisions and scores based on
      SVN_REVISION_RANGES.  The returned list looks like:

         [(REV1 SCORE1), (REV2 SCORE2), ...]

      where the tuples are sorted by revision number.
      SVN_REVISION_RANGES is a list of SvnRevisionRange objects.

      For each svn revision that appears as either an opening_revnum
      or closing_revnum for one of the svn_revision_ranges, output a
      tuple indicating how many of the SvnRevisionRanges include that
      svn_revision in its range.  A score thus indicates that copying
      the corresponding revision (or any following revision up to the
      next revision in the list) of the object in question would yield
      that many correct paths at or underneath the object.  There may
      be other paths underneath it which are not correct and would
      need to be deleted or recopied; those can only be detected by
      descending and examining their scores.

      If SVN_REVISION_RANGES is empty, return the empty list."""
      openings = [ x.opening_revnum
                   for x in svn_revision_ranges ]
      closings = [ x.closing_revnum
                   for x in svn_revision_ranges
                   if x.closing_revnum is not None ]

      # First look for easy out.
      if not openings:
        return []

      # Create a list with both openings (which increment the total)
      # and closings (which decrement the total):
      things = [(rev,1) for rev in openings] + [(rev,-1) for rev in closings]
      # Sort by revision number:
      things.sort()
      # Initialize output list with zeroth element of things.  This
      # element must exist, because it was already verified that
      # openings is not empty.
      scores = [ things[0] ]
      total = scores[-1][1]
      for (rev, change) in things[1:]:
        total += change
        if rev == scores[-1][0]:
          # Same revision as last entry; modify last entry:
          scores[-1] = (rev, total)
        else:
          # Previously-unseen revision; create new entry:
          scores.append((rev, total))
      return scores

    def best_rev(scores, preferred_rev):
      """Return the revision with the highest score from SCORES, a list
      returned by score_revisions(), together with that score.  When
      the maximum score is shared by multiple revisions, the oldest
      revision is selected, unless PREFERRED_REV is one of the
      possibilities, in which case, it is selected."""
      max_score = 0
      preferred_rev_score = -1
      rev = SVN_INVALID_REVNUM
      if preferred_rev is None:
        # Comparison order of different types is arbitrary.  Do not
        # expect None to compare less than int values below.
        preferred_rev = SVN_INVALID_REVNUM
      for revnum, count in scores:
        if count > max_score:
          max_score = count
          rev = revnum
        if revnum <= preferred_rev:
          # The score of the last entry at or before PREFERRED_REV is
          # the score that PREFERRED_REV itself would get.
          preferred_rev_score = count
      if preferred_rev_score == max_score:
        rev = preferred_rev
      return rev, max_score

    # Aggregate openings and closings from the rev tree
    svn_revision_ranges = self._list_revnums(node)

    # Score the lists
    scores = score_revisions(svn_revision_ranges)

    revnum, max_score = best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      raise FatalError("failed to find a revision "
                       "to copy from when copying %s" % self.name)
    return revnum, max_score

  def _list_revnums(self, node):
    """Return a list of all the SvnRevisionRanges (including
    duplicates) for all leaf nodes at and under NODE."""

    if isinstance(node, SvnRevisionRange):
      # It is a leaf node.
      return [ node ]

    # It is an intermediate node; collect from every subnode.
    revnums = []
    for subnode in node.values():
      revnums.extend(self._list_revnums(subnode))
    return revnums

  def get_sources(self):
    """Return the list of sources (FillSource instances) for this
    symbolic name.

    The Project instance defines what are legitimate sources.  Raise
    FatalError if a change occurred outside of the source
    directories."""

    return self._get_sub_sources('', self._node_tree)

  def _get_sub_sources(self, start_svn_path, start_node):
    """Return the list of sources for this symbolic name, starting the
    search at path START_SVN_PATH, which is node START_NODE.  This is
    a helper method, called by get_sources() (see)."""

    project = Ctx().project
    if isinstance(start_node, SvnRevisionRange):
      # This implies that a change was found outside of the
      # legitimate sources.  This should never happen.
      raise FatalError("found a change at '%s', which is outside of the "
                       "legitimate sources, when filling %s"
                       % (start_svn_path, self.name))
    elif project.is_source(start_svn_path):
      # This is a legitimate source.  Add it to list.
      return [ FillSource(start_svn_path, start_node) ]
    else:
      # This is a directory that is not a legitimate source.  (That's
      # OK because it hasn't changed directly.)  But directories
      # within it have been changed, so we need to search recursively
      # to find their enclosing sources.
      sources = []
      for entry, node in start_node.items():
        svn_path = _path_join(start_svn_path, entry)
        sources.extend(self._get_sub_sources(svn_path, node))

    return sources

  def print_node_tree(self, node, name='/', indent_depth=0):
    """For debugging purposes.  Prints all nodes in TREE that are
    rooted at NODE.  NAME is the label printed for NODE, and
    INDENT_DEPTH is used to indent the output of recursive calls."""
    write = sys.stdout.write
    if not indent_depth:
      write("TREE %s\n" % ("=" * 75,))
    indent = " " * (indent_depth * 2)
    if isinstance(node, SvnRevisionRange):
      write("TREE: %s %s %s\n" % (indent, name, node))
    else:
      write("TREE: %s %s\n" % (indent, name))
      for key, value in node.items():
        self.print_node_tree(value, key, (indent_depth + 1))
3143
3144
class FillSource:
  """Representation of a fill source used by the symbol filler in
  SVNRepositoryMirror.

  self.prefix is the source path prefix and self.node is the node of
  the filling guide's tree that corresponds to it.  self.score and
  self.revnum are None until set_score() is called."""
  def __init__(self, prefix, node):
    """Create an unscored fill source with a prefix and a node."""
    self.prefix = prefix
    self.node = node
    self.score = None
    self.revnum = None

  def set_score(self, score, revnum):
    """Set the SCORE and REVNUM."""
    self.score = score
    self.revnum = revnum

  def __cmp__(self, other):
    """Comparison operator used to sort FillSources in descending
    score order.  Raise TypeError if either side is still unscored."""
    if self.score is None or other.score is None:
      raise TypeError('Tried to compare unscored FillSource')
    # The operands are swapped on purpose: a higher score sorts first.
    return cmp(other.score, self.score)
3166
3167
3168 class SVNRepositoryMirror:
3169   """Mirror a Subversion Repository as it is constructed, one
3170   SVNCommit at a time.  The mirror is skeletal; it does not contain
3171   file contents.  The creation of a dumpfile or Subversion repository
3172   is handled by delegates.  See self.add_delegate method for how to
3173   set delegates.
3174
3175   The structure of the repository is kept in two databases and one
3176   hash.  The revs_db database maps revisions to root node keys, and
3177   the nodes_db database maps node keys to nodes.  A node is a hash
3178   from directory names to keys.  Both the revs_db and the nodes_db are
3179   stored on disk and each access is expensive.
3180
  The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
  hash which is cheap to access.
3184
3185   You must invoke _start_commit between SVNCommits.
3186
3187   *** WARNING *** All path arguments to methods in this class CANNOT
3188       have leading or trailing slashes.
3189   """
3190
3191   class SVNRepositoryMirrorPathExistsError(Exception):
3192     """Exception raised if an attempt is made to add a path to the
3193     repository mirror and that path already exists in the youngest
3194     revision of the repository."""
3195     pass
3196
3197   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
3198     """Exception raised if a CVSRevision is found to have an unexpected
3199     operation (OP) value."""
3200     pass
3201
3202   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
3203     """Exception raised if an empty SymbolicNameFillingGuide is returned
3204     during a fill where the branch in question already exists."""
3205     pass
3206
3207   def __init__(self):
3208     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
3209     self.delegates = [ ]
3210
3211     # This corresponds to the 'revisions' table in a Subversion fs.
3212     self.revs_db = SDatabase(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
3213     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
3214
3215     # This corresponds to the 'nodes' table in a Subversion fs.  (We
3216     # don't need a 'representations' or 'strings' table because we
3217     # only track metadata, not file contents.)
3218     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
3219     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
3220
3221     # Start at revision 0 without a root node.  It will be created
3222     # by _open_writable_root_node.
3223     self.youngest = 0
3224     self.new_root_key = None
3225     self.new_nodes = { }
3226
3227     if not Ctx().trunk_only:
3228       ###PERF IMPT: Suck this into memory.
3229       self.tags_db = TagsDatabase(DB_OPEN_READ)
3230       self.symbolings_reader = SymbolingsReader()
3231
3232   def _initialize_repository(self, date):
3233     """Initialize the repository by creating the directories for
3234     trunk, tags, and branches.  This method should only be called
3235     after all delegates are added to the repository mirror."""
3236     # Make a 'fake' SVNCommit so we can take advantage of the revprops
3237     # magic therein
3238     svn_commit = SVNCommit("Initialization", 1)
3239     svn_commit.set_date(date)
3240     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
3241
3242     self._start_commit(svn_commit)
3243     self._mkdir(Ctx().project.trunk_path)
3244     if not Ctx().trunk_only:
3245       self._mkdir(Ctx().project.branches_path)
3246       self._mkdir(Ctx().project.tags_path)
3247
3248   def _start_commit(self, svn_commit):
3249     """Start a new commit."""
3250     if self.youngest > 0:
3251       self._end_commit()
3252
3253     self.youngest = svn_commit.revnum
3254     self.new_root_key = None
3255     self.new_nodes = { }
3256
3257     self._invoke_delegates('start_commit', svn_commit)
3258
3259   def _end_commit(self):
3260     """Called at the end of each commit.  This method copies the newly
3261     created nodes to the on-disk nodes db."""
3262     if self.new_root_key is None:
3263       # No changes were made in this revision, so we make the root node
3264       # of the new revision be the same as the last one.
3265       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
3266     else:
3267       self.revs_db[str(self.youngest)] = self.new_root_key
3268       # Copy the new nodes to the nodes_db
3269       for key, value in self.new_nodes.items():
3270         self.nodes_db[key] = value
3271
3272   def _get_node(self, key):
3273     """Returns the node contents for KEY which may refer to either
3274     self.nodes_db or self.new_nodes."""
3275     if self.new_nodes.has_key(key):
3276       return self.new_nodes[key]
3277     else:
3278       return self.nodes_db[key]
3279
3280   def _open_readonly_node(self, path, revnum):
3281     """Open a readonly node for PATH at revision REVNUM.  Returns the
3282     node key and node contents if the path exists, else (None, None)."""
3283     # Get the root key
3284     if revnum == self.youngest:
3285       if self.new_root_key is None:
3286         node_key = self.revs_db[str(self.youngest - 1)]
3287       else:
3288         node_key = self.new_root_key
3289     else:
3290       node_key = self.revs_db[str(revnum)]
3291
3292     for component in path.split('/'):
3293       node_contents = self._get_node(node_key)
3294       node_key = node_contents.get(component, None)
3295       if node_key is None:
3296         return None
3297
3298     return node_key
3299
3300   def _open_writable_root_node(self):
3301     """Open a writable root node.  The current root node is returned
3302     immeditely if it is already writable.  If not, create a new one by
3303     copying the contents of the root node of the previous version."""
3304     if self.new_root_key is not None:
3305       return self.new_root_key, self.new_nodes[self.new_root_key]
3306
3307     if self.youngest < 2:
3308       new_contents = { }
3309     else:
3310       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
3311     self.new_root_key = gen_key()
3312     self.new_nodes = { self.new_root_key: new_contents }
3313
3314     return self.new_root_key, new_contents
3315
3316   def _open_writable_node(self, svn_path, create):
3317     """Open a writable node for the path SVN_PATH, creating SVN_PATH
3318     and any missing directories if CREATE is True."""
3319     parent_key, parent_contents = self._open_writable_root_node()
3320
3321     # Walk up the path, one node at a time.
3322     path_so_far = None
3323     components = svn_path.split('/')
3324     for i in range(len(components)):
3325       component = components[i]
3326       path_so_far = _path_join(path_so_far, component)
3327       this_key = parent_contents.get(component, None)
3328       if this_key is not None:
3329         # The component exists.
3330         this_contents = self.new_nodes.get(this_key, None)
3331         if this_contents is None:
3332           # Suck the node from the nodes_db, but update the key
3333           this_contents = self.nodes_db[this_key]
3334           this_key = gen_key()
3335           self.new_nodes[this_key] = this_contents
3336           parent_contents[component] = this_key
3337       elif create:
3338         # The component does not exists, so we create it.
3339         this_contents = { }
3340         this_key = gen_key()
3341         self.new_nodes[this_key] = this_contents
3342         parent_contents[component] = this_key
3343         if i < len(components) - 1:
3344           self._invoke_delegates('mkdir', path_so_far)
3345       else:
3346         # The component does not exists and we are not instructed to
3347         # create it, so we give up.
3348         return None, None
3349
3350       parent_key = this_key
3351       parent_contents = this_contents
3352
3353     return this_key, this_contents
3354
3355   def _path_exists(self, path):
3356     """If PATH exists in self.youngest of the svn repository mirror,
3357     return true, else return None.
3358
3359     PATH must not start with '/'."""
3360     return self._open_readonly_node(path, self.youngest) is not None
3361
3362   def _fast_delete_path(self, parent_path, parent_contents, component):
3363     """Delete COMPONENT from the parent direcory PARENT_PATH with the
3364     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
3365     in PARENT_CONTENTS."""
3366     if parent_contents.has_key(component):
3367       del parent_contents[component]
3368       self._invoke_delegates('delete_path',
3369                              _path_join(parent_path, component))
3370
3371   def _delete_path(self, svn_path, should_prune=False):
3372     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
3373     all ancestor directories that are made empty when SVN_PATH is deleted.
3374     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
3375
3376     NOTE: This function ignores requests to delete the root directory
3377     or any directory for which Ctx().project.is_unremovable() returns
3378     True, either directly or by pruning."""
3379
3380     if svn_path == '' or Ctx().project.is_unremovable(svn_path):
3381       return
3382
3383     (parent_path, entry,) = _path_split(svn_path)
3384     if parent_path:
3385       parent_key, parent_contents = \
3386           self._open_writable_node(parent_path, False)
3387     else:
3388       parent_key, parent_contents = self._open_writable_root_node()
3389
3390     if parent_key is not None:
3391       self._fast_delete_path(parent_path, parent_contents, entry)
3392       # The following recursion makes pruning an O(n^2) operation in the
3393       # worst case (where n is the depth of SVN_PATH), but the worst case
3394       # is probably rare, and the constant cost is pretty low.  Another
3395       # drawback is that we issue a delete for each path and not just
3396       # a single delete for the topmost directory pruned.
3397       if should_prune and len(parent_contents) == 0:
3398         self._delete_path(parent_path, True)
3399
3400   def _mkdir(self, path):
3401     """Create PATH in the repository mirror at the youngest revision."""
3402     self._open_writable_node(path, True)
3403     self._invoke_delegates('mkdir', path)
3404
3405   def _change_path(self, cvs_rev):
3406     """Register a change in self.youngest for the CVS_REV's svn_path
3407     in the repository mirror."""
3408     # We do not have to update the nodes because our mirror is only
3409     # concerned with the presence or absence of paths, and a file
3410     # content change does not cause any path changes.
3411     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, False))
3412
3413   def _add_path(self, cvs_rev):
3414     """Add the CVS_REV's svn_path to the repository mirror."""
3415     self._open_writable_node(cvs_rev.svn_path, True)
3416     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, True))
3417
3418   def _copy_path(self, src_path, dest_path, src_revnum):
3419     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
3420     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
3421     parent *must* exist, but DEST_PATH *cannot* exist.
3422
3423     Return the node key and the contents of the new node at DEST_PATH
3424     as a dictionary."""
3425     # get the contents of the node of our src_path
3426     src_key = self._open_readonly_node(src_path, src_revnum)
3427     src_contents = self._get_node(src_key)
3428
3429     # Get the parent path and the base path of the dest_path
3430     (dest_parent, dest_basename,) = _path_split(dest_path)
3431     dest_parent_key, dest_parent_contents = \
3432                    self._open_writable_node(dest_parent, False)
3433
3434     if dest_parent_contents.has_key(dest_basename):
3435       msg = "Attempt to add path '%s' to repository mirror " % dest_path
3436       msg = msg + "when it already exists in the mirror."
3437       raise self.SVNRepositoryMirrorPathExistsError, msg
3438
3439     dest_parent_contents[dest_basename] = src_key
3440     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
3441
3442     # Yes sir, src_key and src_contents are also the contents of the
3443     # destination.  This is a cheap copy, remember!  :-)
3444     return src_key, src_contents
3445
3446   def _fill_symbolic_name(self, svn_commit):
3447     """Performs all copies necessary to create as much of the the tag
3448     or branch SVN_COMMIT.symbolic_name as possible given the current
3449     revision of the repository mirror.
3450
3451     The symbolic name is guaranteed to exist in the Subversion
3452     repository by the end of this call, even if there are no paths
3453     under it."""
3454     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3455         svn_commit.symbolic_name, self.youngest)
3456     # Get the list of sources for the symbolic name.
3457     sources = symbol_fill.get_sources()
3458
3459     if sources:
3460       if self.tags_db.has_key(svn_commit.symbolic_name):
3461         dest_prefix = Ctx().project.get_tag_path(svn_commit.symbolic_name)
3462       else:
3463         dest_prefix = Ctx().project.get_branch_path(svn_commit.symbolic_name)
3464
3465       dest_key = self._open_writable_node(dest_prefix, False)[0]
3466       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3467     else:
3468       # We can only get here for a branch whose first commit is an add
3469       # (as opposed to a copy).
3470       dest_path = Ctx().project.get_branch_path(symbol_fill.name)
3471       if not self._path_exists(dest_path):
3472         # If our symbol_fill was empty, that means that our first
3473         # commit on the branch was to a file added on the branch, and
3474         # that this is our first fill of that branch.
3475         #
3476         # This case is covered by test 16.
3477         #
3478         # ...we create the branch by copying trunk from the our
3479         # current revision number minus 1
3480         source_path = Ctx().project.trunk_path
3481         entries = self._copy_path(source_path, dest_path,
3482                                   svn_commit.revnum - 1)[1]
3483         # Now since we've just copied trunk to a branch that's
3484         # *supposed* to be empty, we delete any entries in the
3485         # copied directory.
3486         for entry in entries:
3487           del_path = dest_path + '/' + entry
3488           # Delete but don't prune.
3489           self._delete_path(del_path)
3490       else:
3491         msg = "Error filling branch '" \
3492               + _clean_symbolic_name(symbol_fill.name) + "'.\n"
3493         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3494         msg = msg + "attempted to create a branch that already exists."
3495         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3496
  def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
            path = None, parent_source_prefix = None,
            preferred_revnum = None, prune_ok = None):
    """Fill the tag or branch at DEST_PREFIX + PATH with items from
    SOURCES, and recurse into the child items.

    SYMBOL_FILL is the SymbolicNameFillingGuide that is used to score
    the sources.

    DEST_PREFIX is the prefix of the destination directory, e.g.
    '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
    FillSource classes that are candidates to be copied to the
    destination.  DEST_KEY is the key in self.nodes_db to the
    destination, or None if the destination does not yet exist.

    PATH is the path relative to DEST_PREFIX.  If PATH is None, we
    are at the top level, e.g. '/tags/my_tag'.

    PARENT_SOURCE_PREFIX is the source prefix that was used to copy
    the parent directory, and PREFERRED_REVNUM is an int which is the
    source revision number that the caller (who may have copied KEY's
    parent) used to perform its copy.  If PREFERRED_REVNUM is None,
    then no revision is preferable to any other (which probably means
    that no copies have happened yet).

    PRUNE_OK means that a copy has been made in this recursion, and
    it's safe to prune directories that are not in
    SYMBOL_FILL._node_tree, provided that said directory has a source
    prefix of one of the PARENT_SOURCE_PREFIX.

    PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
    should only be passed in by recursive calls.

    Note that SOURCES is scored and sorted in place."""
    # Calculate scores and revnums for all sources
    for source in sources:
      src_revnum, score = symbol_fill.get_best_revnum(source.node,
                                                      preferred_revnum)
      source.set_score(score, src_revnum)

    # Sort the sources in descending score order so that we will make
    # an eventual copy from the source with the highest score.
    # (FillSource.__cmp__ defines that descending order.)
    sources.sort()
    copy_source = sources[0]

    src_path = _path_join(copy_source.prefix, path)
    dest_path = _path_join(dest_prefix, path)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    do_copy = 0
    if dest_key is None:
      # The destination does not exist yet, so it has to be copied.
      do_copy = 1
    elif prune_ok and (parent_source_prefix != copy_source.prefix or
                       copy_source.revnum != preferred_revnum):
      # The destination exists, but the best source for it differs in
      # prefix or revision from what the caller used for the parent.
      # We are about to replace the destination, so we need to remove
      # it before we perform the copy.
      self._delete_path(dest_path)
      do_copy = 1

    if do_copy:
      dest_key, dest_entries = self._copy_path(src_path, dest_path,
                                               copy_source.revnum)
      # From here on a copy has been made, so pruning is allowed for
      # this node and for the recursive calls below.
      prune_ok = 1
    else:
      dest_entries = self._get_node(dest_key)

    # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
    # elements and the values are lists of FillSource classes where
    # this path element exists.
    src_entries = {}
    for source in sources:
      if isinstance(source.node, SvnRevisionRange):
        # A leaf node has no child entries to contribute.
        continue
      for entry, node in source.node.items():
        src_entries.setdefault(entry, []).append(
            FillSource(source.prefix, node))

    if prune_ok:
      # Delete the entries in DEST_ENTRIES that are not in src_entries.
      delete_list = [ ]
      for entry in dest_entries:
        if not src_entries.has_key(entry):
          delete_list.append(entry)
      if delete_list:
        # The destination node must be writable (i.e., be in
        # self.new_nodes) before entries are removed from it.
        if not self.new_nodes.has_key(dest_key):
          dest_key, dest_entries = self._open_writable_node(dest_path, True)
        # Sort the delete list to get "diffable" dumpfiles.
        delete_list.sort()
        for entry in delete_list:
          self._fast_delete_path(dest_path, dest_entries, entry)

    # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
    src_keys = src_entries.keys()
    src_keys.sort()
    for src_key in src_keys:
      # next_dest_key is None if the child does not exist in the
      # destination yet.
      next_dest_key = dest_entries.get(src_key, None)
      # sources[0] is copy_source, so its prefix and revnum are what
      # the children see as their parent's source.
      self._fill(symbol_fill, dest_prefix, next_dest_key,
                 src_entries[src_key], _path_join(path, src_key),
                 copy_source.prefix, sources[0].revnum, prune_ok)
3592
3593   def _synchronize_default_branch(self, svn_commit):
3594     """Propagate any changes that happened on a non-trunk default
3595     branch to the trunk of the repository.  See
3596     CVSCommit._post_commit() for details on why this is necessary."""
3597     for cvs_rev in svn_commit.cvs_revs:
3598       svn_trunk_path = Ctx().project.make_trunk_path(cvs_rev.cvs_path)
3599       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3600         if self._path_exists(svn_trunk_path):
3601           # Delete the path on trunk...
3602           self._delete_path(svn_trunk_path)
3603         # ...and copy over from branch
3604         self._copy_path(cvs_rev.svn_path, svn_trunk_path,
3605                         svn_commit.motivating_revnum)
3606       elif cvs_rev.op == OP_DELETE:
3607         # delete trunk path
3608         self._delete_path(svn_trunk_path)
3609       else:
3610         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3611                % cvs_rev.op)
3612         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3613
3614   def commit(self, svn_commit):
3615     """Add an SVNCommit to the SVNRepository, incrementing the
3616     Repository revision number, and changing the repository.  Invoke
3617     the delegates' _start_commit() method."""
3618
3619     if svn_commit.revnum == 2:
3620       self._initialize_repository(svn_commit.get_date())
3621
3622     self._start_commit(svn_commit)
3623
3624     if svn_commit.symbolic_name:
3625       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3626                   _clean_symbolic_name(svn_commit.symbolic_name))
3627       self._fill_symbolic_name(svn_commit)
3628     elif svn_commit.motivating_revnum:
3629       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3630                   % svn_commit.motivating_revnum)
3631       self._synchronize_default_branch(svn_commit)
3632     else: # This actually commits CVSRevisions
3633       if len(svn_commit.cvs_revs) > 1: plural = "s"
3634       else: plural = ""
3635       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3636                   % (len(svn_commit.cvs_revs), plural))
3637       for cvs_rev in svn_commit.cvs_revs:
3638         # See comment in CVSCommit._commit() for what this is all
3639         # about.  Note that although asking self._path_exists() is
3640         # somewhat expensive, we only do it if the first two (cheap)
3641         # tests succeed first.
3642         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3643                 and (cvs_rev.rev == "1.1.1.1")
3644                 and self._path_exists(cvs_rev.svn_path)):
3645           if cvs_rev.op == OP_ADD:
3646             self._add_path(cvs_rev)
3647           elif cvs_rev.op == OP_CHANGE:
3648             # Fix for Issue #74:
3649             #
3650             # Here's the scenario.  You have file FOO that is imported
3651             # on a non-trunk vendor branch.  So in r1.1 and r1.1.1.1,
3652             # the file exists.
3653             #
3654             # Moving forward in time, FOO is deleted on the default
3655             # branch (r1.1.1.2).  cvs2svn determines that this delete
3656             # also needs to happen on trunk, so FOO is deleted on
3657             # trunk.
3658             #
3659             # Along come r1.2, whose op is OP_CHANGE (because r1.1 is
3660             # not 'dead', we assume it's a change).  However, since
3661             # our trunk file has been deleted, svnadmin blows up--you
3662             # can't change a file that doesn't exist!
3663             #
3664             # Soooo... we just check the path, and if it doesn't
3665             # exist, we do an add... if the path does exist, it's
3666             # business as usual.
3667             if not self._path_exists(cvs_rev.svn_path):
3668               self._add_path(cvs_rev)
3669             else:
3670               self._change_path(cvs_rev)
3671
3672         if cvs_rev.op == OP_DELETE:
3673           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3674
3675   def cleanup(self):
3676     """Callback for the Cleanup.register in self.__init__."""
3677     self.revs_db = None
3678     self.nodes_db = None
3679
3680   def add_delegate(self, delegate):
3681     """Adds DELEGATE to self.delegates.
3682
3683     For every delegate you add, as soon as SVNRepositoryMirror
3684     performs a repository action method, SVNRepositoryMirror will call
3685     the delegate's corresponding repository action method.  Multiple
3686     delegates will be called in the order that they are added.  See
3687     SVNRepositoryMirrorDelegate for more information."""
3688     self.delegates.append(delegate)
3689
3690   def _invoke_delegates(self, method, *args):
3691     """Iterate through each of our delegates, in the order that they
3692     were added, and call the delegate's method named METHOD with the
3693     arguments in ARGS."""
3694     for delegate in self.delegates:
3695       getattr(delegate, method)(*args)
3696
3697   def finish(self):
3698     """Calls the delegate finish method."""
3699     self._end_commit()
3700     self._invoke_delegates('finish')
3701     self.cleanup()
3702
3703
class SVNCommitItem:
  """Pairs a CVSRevision with the Subversion-specific data (such as
  properties) that belongs to it."""

  def __init__(self, c_rev, svn_props_changed):
    """Wrap C_REV and compute its Subversion properties.

    SVN_PROPS_CHANGED tells whether the svn: properties are known to
    have changed since the last revision.

    Every SVNPropertySetter in Ctx().svn_property_setters is given the
    chance to set properties on this item; afterwards a couple of the
    resulting properties are read back for our own purposes."""

    self.c_rev = c_rev

    # True if the svn properties changed for this file, i.e., if they
    # have to be written to the dumpfile.
    self.svn_props_changed = svn_props_changed

    # Map { key : value } of this item's properties.  A VALUE of None
    # means that the property should not be set.
    self.svn_props = { }

    for setter in Ctx().svn_property_setters:
      setter.set_properties(self)

    # EOLs need filtering exactly when an svn:eol-style value was set.
    # (self.svn_props could be consulted directly instead, since it is
    # initialized for each revision.)
    eol_style = self.svn_props.get('svn:eol-style', None)
    self.needs_eol_filter = eol_style is not None

    # Likewise, the file has keywords when svn:keywords was set.
    keywords = self.svn_props.get('svn:keywords', None)
    self.has_keywords = keywords is not None
3735
3736
class SVNPropertySetter:
  """Interface for objects that set properties on an SVNCommitItem.
  This class is abstract; set_properties() must be overridden."""

  def set_properties(self, s_item):
    """Set whatever properties can be determined for S_ITEM.

    This base implementation always raises NotImplementedError."""

    raise NotImplementedError()
3744
3745
class SVNRepositoryMirrorDelegate:
  """Abstract base class of all SVNRepositoryMirror delegates.

  Every method below raises NotImplementedError, so a subclass must
  override all of them.  Each override carries out, in its own way,
  the Subversion operation that the method is named after.  Taking
  add_path as the example: the DumpfileDelegate writes a node record
  for the added path to a Subversion dumpfile, the StdoutDelegate
  merely prints that the path is being added to the repository, and
  the RepositoryDelegate causes the path to really be added to the
  Subversion repository that it is creating.
  """

  def start_commit(self, svn_commit):
    """Do whatever is needed to begin the SVNCommit SVN_COMMIT; the
    details are up to the subclass."""
    raise NotImplementedError()

  def mkdir(self, path):
    """Handle the creation of the directory PATH (a string); the
    details are up to the subclass."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """Handle the addition of S_ITEM (an SVNCommitItem); the details
    are up to the subclass."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """Handle the change of S_ITEM (an SVNCommitItem); the details are
    up to the subclass."""
    raise NotImplementedError()

  def delete_path(self, path):
    """Handle the deletion of PATH (a string); the details are up to
    the subclass."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """Handle the copy of SRC_PATH to DEST_PATH (both strings), where
    SRC_REVNUM is a subversion revision number (int); the details are
    up to the subclass."""
    raise NotImplementedError()

  def finish(self):
    """Do any cleanup that is needed once all revisions have been
    committed."""
    raise NotImplementedError()
3793
3794
class DumpfileDelegate(SVNRepositoryMirrorDelegate):
  """Delegate that records every repository action in a Subversion
  dumpfile."""

  def __init__(self, dumpfile_path=None):
    """Open the dumpfile for writing and emit the dumpfile header.

    The dumpfile is created at DUMPFILE_PATH; if DUMPFILE_PATH is None
    (or otherwise false), Ctx().dumpfile is used instead."""
    self.dumpfile_path = dumpfile_path or Ctx().dumpfile
    self.dumpfile = open(self.dumpfile_path, 'wb')
    self._write_dumpfile_header(self.dumpfile)

  def _write_dumpfile_header(self, dumpfile):
    """Write the standard dumpfile header to the file-like object
    DUMPFILE."""
    # No UUID is written to the dumpfile: the CVS repository doesn't
    # have one, and the Subversion repository will be created with
    # one anyway.
    dumpfile.write('SVN-fs-dump-format-version: 2\n\n')

  def _utf8_path(self, path):
    """Return a copy of PATH in which every '/'-separated component
    has been converted to UTF-8.  Raise FatalError if a component
    cannot be converted."""
    utf8_pieces = []
    # The components are converted one at a time, as they may each
    # use a different encoding.
    for piece in path.split('/'):
      try:
        # Lossy conversion (the 'replace' strategy) is acceptable for
        # log messages, but not for paths; hence 'strict'.
        utf8_pieces.append(to_utf8(piece, 'strict'))
      except UnicodeError:
        raise FatalError(
            "Unable to convert a path '%s' to internal encoding.\n"
            "Consider rerunning with (for example) '--encoding=latin1'."
            % (path,))
    return '/'.join(utf8_pieces)

  def _string_for_prop(self, name, value):
    """Return the dumpfile representation of the property NAME with
    value VALUE: a 'K <length>' line, the name, a 'V <length>' line
    and the value."""

    key_part = 'K %d\n%s\n' % (len(name), name)
    value_part = 'V %d\n%s\n' % (len(value), value)
    return key_part + value_part

  def start_commit(self, svn_commit):
    """Write the revision record that begins SVN_COMMIT (an
    SVNCommit) and remember its revision number in self.revision."""

    self.revision = svn_commit.revnum

    # A revision record typically looks like this:
    #
    #   Revision-number: 1
    #   Prop-content-length: 129
    #   Content-length: 129
    #
    #   K 7
    #   svn:log
    #   V 27
    #   Log message for revision 1.
    #   K 10
    #   svn:author
    #   V 7
    #   jrandom
    #   K 8
    #   svn:date
    #   V 27
    #   2003-04-22T22:57:58.132837Z
    #   PROPS-END
    #
    # The length headers count everything: the data itself, the
    # lengths of the lengths, and the 'K ' and 'V ' prefixes.
    #
    # Prop-content-length covers just the props whereas Content-length
    # covers everything; that is the generic header form for any
    # entity in a dumpfile.  Revisions have nothing but props, so for
    # them the two lengths are always equal.

    # Serialize the revision properties, sorted by name; properties
    # whose value is None are left out.
    props = svn_commit.get_revprops()
    prop_names = props.keys()
    prop_names.sort()
    prop_strings = [self._string_for_prop(name, props[name])
                    for name in prop_names
                    if props[name] is not None]

    all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n'
    total_len = len(all_prop_strings)

    # Emit the revision header, then the props, then a blank line.
    revision_header = ('Revision-number: %d\n'
                       'Prop-content-length: %d\n'
                       'Content-length: %d\n'
                       '\n'
                       % (self.revision, total_len, total_len))
    self.dumpfile.write(revision_header)
    self.dumpfile.write(all_prop_strings)
    self.dumpfile.write('\n')

  def mkdir(self, path):
    """Write the node record that creates the directory PATH."""
    node_record = ("Node-path: %s\n"
                   "Node-kind: dir\n"
                   "Node-action: add\n"
                   "\n"
                   "\n" % self._utf8_path(path))
    self.dumpfile.write(node_record)

  def _add_or_change_path(self, s_item, op):
    """Write the node record for the addition or change of S_ITEM (an
    SVNCommitItem), including the file contents read from the CVS
    repository.

    OP must be one of the constants OP_ADD or OP_CHANGE; anything else
    raises FatalError.  FatalError is also raised if the command that
    produces the file contents exits with a non-zero status."""

    # Translate OP into the value of the Node-action header.
    if op == OP_ADD:
      action = 'add'
    elif op == OP_CHANGE:
      action = 'change'
    else:
      raise FatalError("_add_or_change_path() called with bad op ('%s')"
                       % (op,))

    c_rev = s_item.c_rev

    # The property handling below relies on an undocumented but
    # consistent feature of the Subversion dumpfile loader: if a node
    # record doesn't mention properties at all (no
    # "Prop-content-length:" header, no properties, no "PROPS-END\n"
    # line), the node's properties are left unchanged.
    #
    # Text contents behave the same way: a record that changes *just*
    # the properties of a node carries no "Text-content-length:" or
    # "Text-content-md5:" header and no text, yet the file's text is
    # not erased; it simply remains unchanged.
    #
    # This suits cvs2svn well, thanks to two lucky coincidences:
    #
    # For files, properties are only ever set in the first revision
    # and inherited by all later revisions (including on branches).
    # File properties never change afterwards, so there is no need to
    # remember the full set of properties of a file once it is set.
    #
    # For directories, the only property set is "svn:ignore".  It may
    # change after the first revision, but always based on the
    # contents of a ".cvsignore" file, so CVS does the remembering
    # and the previous value need not be preserved here either.

    # Build the property block (sorted by name) and its header, if the
    # properties have to be written at all.
    prop_contents = ''
    props_header = ''
    if s_item.svn_props_changed:
      svn_props = s_item.svn_props
      prop_names = svn_props.keys()
      prop_names.sort()
      prop_strings = [self._string_for_prop(pname, svn_props[pname])
                      for pname in prop_names
                      if svn_props[pname] is not None]
      prop_contents = ''.join(prop_strings) + 'PROPS-END\n'
      props_header = 'Prop-content-length: %d\n' % len(prop_contents)

    # A .cvsignore file additionally becomes the svn:ignore property
    # of the directory that contains it.
    dir_path, basename = os.path.split(c_rev.svn_path)
    if basename == ".cvsignore":
      ignore_value = '\n'.join(generate_ignores(c_rev))
      ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\nPROPS-END\n'
                         % (len(ignore_value), ignore_value))
      ignore_len = len(ignore_contents)

      # Headers first, then the props.
      dir_record = ('Node-path: %s\n'
                    'Node-kind: dir\n'
                    'Node-action: change\n'
                    'Prop-content-length: %d\n'
                    'Content-length: %d\n'
                    '\n'
                    '%s'
                    % (self._utf8_path(dir_path), ignore_len,
                       ignore_len, ignore_contents))
      self.dumpfile.write(dir_record)

    # For a file with keywords, CVS/RCS must be prevented from
    # expanding them: they have to be unexpanded in the repository,
    # or Subversion will get confused.
    pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(
        c_rev, suppress_keyword_substitution=s_item.has_keywords)

    node_header = ('Node-path: %s\n'
                   'Node-kind: file\n'
                   'Node-action: %s\n'
                   '%s'  # no property header if no props
                   'Text-content-length: '
                   % (self._utf8_path(c_rev.svn_path),
                      action, props_header))
    self.dumpfile.write(node_header)

    # The text length and checksum are not known until the contents
    # have been read, so remember where they belong and write
    # fixed-width placeholders for now.
    pos = self.dumpfile.tell()

    self.dumpfile.write('0000000000000000\n'
                        'Text-content-md5: 00000000000000000000000000000000\n'
                        'Content-length: 0000000000000000\n'
                        '\n')

    if prop_contents:
      self.dumpfile.write(prop_contents)

    # Convert all EOLs to LFs on the fly if necessary.
    if s_item.needs_eol_filter:
      data_reader = LF_EOL_Filter(pipe.stdout)
    else:
      data_reader = pipe.stdout

    # Copy the revision contents into the dumpfile, accumulating the
    # length and the checksum along the way.
    checksum = md5.new()
    length = 0
    buf = data_reader.read(PIPE_READ_SIZE)
    while buf != '':
      checksum.update(buf)
      length += len(buf)
      self.dumpfile.write(buf)
      buf = data_reader.read(PIPE_READ_SIZE)

    pipe.stdout.close()
    error_output = pipe.stderr.read()
    exit_status = pipe.wait()
    if exit_status:
      raise FatalError("The command '%s' failed with exit status: %s\n"
                       "and the following output:\n"
                       "%s" % (pipe_cmd, exit_status, error_output))

    # Now go back and overwrite the placeholders.
    #
    # The text length replaces the first 16 zeros, padded on the left
    # with spaces:
    self.dumpfile.seek(pos, 0)
    self.dumpfile.write('%16d' % length)
    # The checksum starts after those 16 bytes + 1 newline +
    # len('Text-content-md5: '), i.e., at offset 35:
    self.dumpfile.seek(pos + 35, 0)
    self.dumpfile.write(checksum.hexdigest())
    # The content length starts after those 35 bytes + 32 bytes of
    # checksum + 1 newline + len('Content-length: '), i.e., at offset
    # 84.  It covers the property data as well as the text data.
    self.dumpfile.seek(pos + 84, 0)
    self.dumpfile.write('%16d' % (length + len(prop_contents)))
    # Return to the end of the stream.
    self.dumpfile.seek(0, 2)

    # Finish the record with two newlines: one to terminate contents
    # that were not newline-terminated themselves, and one to provide
    # a blank line for readability.
    self.dumpfile.write('\n\n')

  def add_path(self, s_item):
    """Write the node record for the addition of S_ITEM, an
    SVNCommitItem."""
    self._add_or_change_path(s_item, OP_ADD)

  def change_path(self, s_item):
    """Write the node record for the change of S_ITEM, an
    SVNCommitItem."""
    self._add_or_change_path(s_item, OP_CHANGE)

  def delete_path(self, path):
    """Write the node record for the deletion of PATH."""
    node_record = ('Node-path: %s\n'
                   'Node-action: delete\n'
                   '\n' % self._utf8_path(path))
    self.dumpfile.write(node_record)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Write the node record for the copy of SRC_PATH at revision
    SRC_REVNUM to DEST_PATH."""
    # "Node-kind:" is left out on purpose: for copies the loader
    # ignores it anyway and uses the kind of the source instead.
    node_record = ('Node-path: %s\n'
                   'Node-action: add\n'
                   'Node-copyfrom-rev: %d\n'
                   'Node-copyfrom-path: /%s\n'
                   '\n'
                   % (self._utf8_path(dest_path),
                      src_revnum,
                      self._utf8_path(src_path)))
    self.dumpfile.write(node_record)

  def finish(self):
    """Close the dumpfile once all revisions have been committed."""
    self.dumpfile.close()
4088
4089
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting: each revision is first written to a temporary
  dumpfile, which is then fed to an 'svnadmin load' process."""

  def __init__(self):
    """Create the target repository (unless Ctx().existing_svnrepos is
    set), open the temporary dumpfile, and start the 'svnadmin load'
    process that the revisions will be fed to.

    Raise FatalError if the dumpfile header cannot be written to the
    loader process."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      if not Ctx().fs_type:
        # User didn't say what kind repository (bdb, fsfs, etc).
        # We still pass --bdb-txn-nosync.  It's a no-op if the default
        # repository type doesn't support it, but we definitely want
        # it if BDB is the default.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--bdb-txn-nosync",
                                           self.target))
      elif Ctx().fs_type == 'bdb':
        # User explicitly specified bdb.
        #
        # Since this is a BDB repository, pass --bdb-txn-nosync,
        # because it gives us a 4-5x speed boost (if cvs2svn is
        # creating the repository, cvs2svn should be the only program
        # accessing the svn repository (until cvs is done, at least)).
        # But we'll turn no-sync off in self.finish(), unless
        # instructed otherwise.
        run_command('%s create %s %s "%s"' % (self.svnadmin,
                                              "--fs-type=bdb",
                                              "--bdb-txn-nosync",
                                              self.target))
      else:
        # User specified something other than bdb.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--fs-type=%s" % Ctx().fs_type,
                                           self.target))

    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.
    DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    # The base class opened the dumpfile write-only and wrote the
    # dumpfile header into it.  We need a handle that can also be read
    # back (see _feed_pipe()), so the file is reopened in 'w+b' mode.
    # The old handle is closed first: otherwise it would be leaked,
    # and its still-buffered header could be flushed into the
    # reopened file whenever the old handle finally got closed.
    # Reopening in 'w+b' mode truncates the file, so the temporary
    # dumpfile starts out empty; the header goes directly to the
    # loader pipe below.
    self.dumpfile.close()
    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = SimplePopen([ self.svnadmin, 'load', '-q',
                                     self.target ], True)
    self.loader_pipe.stdout.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.stdin)
    except IOError:
      raise FatalError("svnadmin failed with the following output while "
                       "loading the dumpfile:\n"
                       + self.loader_pipe.stderr.read())

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe.  Raise FatalError if writing to the pipe fails."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not len(data):
        break
      try:
        self.loader_pipe.stdin.write(data)
      except IOError:
        raise FatalError("svnadmin failed with the following output "
                         "while loading the dumpfile:\n"
                         + self.loader_pipe.stderr.read())

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, load
    the revision stored in the dumpfile into the svn repository
    first.  Then empty the dumpfile and write the start of SVN_COMMIT
    into it."""
    if self._commit_in_progress:
      self._feed_pipe()
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Load the last commit into the repository, shut down the
    'svnadmin load' process and remove the temporary dumpfile.

    Raise FatalError if 'svnadmin load' exits with a non-zero
    status."""
    self._feed_pipe()
    self.dumpfile.close()
    self.loader_pipe.stdin.close()
    error_output = self.loader_pipe.stderr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      raise FatalError('svnadmin load failed with exit status: %s\n'
                       'and the following output:\n'
                       '%s' % (exit_status, error_output,))
    os.remove(self.dumpfile_path)

    # If this is a BDB repository, and we created the repository, and
    # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
    # line in the DB_CONFIG file, because txn syncing should be on by
    # default in BDB repositories.
    #
    # We determine if this is a BDB repository by looking for the
    # DB_CONFIG file, which doesn't exist in FSFS, rather than by
    # checking Ctx().fs_type.  That way this code will Do The Right
    # Thing in all circumstances.
    db_config = os.path.join(self.target, "db/DB_CONFIG")
    if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
        and os.path.exists(db_config)):
      self._comment_out_txn_nosync(db_config)

  def _comment_out_txn_nosync(self, db_config):
    """Comment out the 'set_flags DB_TXN_NOSYNC' line in the file
    DB_CONFIG.  If the file contains no such line, log a warning and
    leave the file untouched."""
    no_sync = 'set_flags DB_TXN_NOSYNC\n'

    # Both file handles are closed explicitly, so that the rewritten
    # configuration is guaranteed to be on disk when we return.
    config_file = open(db_config, 'r')
    try:
      contents = config_file.readlines()
    finally:
      config_file.close()

    try:
      index = contents.index(no_sync)
    except ValueError:
      # Nothing to comment out.  The repository has been loaded
      # successfully by now, so this is not worth aborting for.
      Log().write(LOG_WARN, "No '%s' line found in '%s'; leaving the "
                  "file unchanged." % (no_sync.strip(), db_config))
      return

    contents[index] = '# ' + no_sync
    config_file = open(db_config, 'w')
    try:
      config_file.writelines(contents)
    finally:
      config_file.close()
4200
4201
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """Delegate that changes nothing on disk; it only reports, through
  Log(), what the SVNRepositoryMirror is doing.  So whenever it says
  that something is being done, all that is really being done is the
  saying.  Kind of zen, really."""

  def __init__(self, total_revs):
    """TOTAL_REVS is the number that start_commit() reports after the
    slash in 'Starting Subversion rN / TOTAL_REVS'."""
    self.total_revs = total_revs

  def start_commit(self, svn_commit):
    """Report the Subversion revision number of the commit that is
    being started, preceded by a separator line."""
    separator = "=" * 60
    Log().write(LOG_VERBOSE, separator)
    Log().write(LOG_NORMAL, "Starting Subversion r%d / %d" %
                (svn_commit.revnum, self.total_revs))

  def mkdir(self, path):
    """Report that the directory PATH is being created."""
    Log().write(LOG_VERBOSE, "  New Directory", path)

  def add_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'added'."""
    added_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Adding", added_path)

  def change_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'changed'."""
    changed_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Changing", changed_path)

  def delete_path(self, path):
    """Report that PATH is being 'deleted'."""
    Log().write(LOG_VERBOSE, "  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report, on two lines, that revision SRC_REVNUM of SRC_PATH is
    being 'copied' to DEST_PATH."""
    Log().write(LOG_VERBOSE, "  Copying revision", src_revnum, "of", src_path)
    Log().write(LOG_VERBOSE, "                to", dest_path)

  def finish(self):
    """Report that the creation of the repository is complete."""
    Log().write(LOG_VERBOSE, "Finished creating Subversion repository.")
    Log().write(LOG_QUIET, "Done.")
4244
def pass1():
  """Pass 1: walk the project's CVS repository and parse every ',v'
  file found in it, feeding the results to a CollectData instance.

  Errors concerning individual files (a file that exists both in and
  out of the Attic, or a file that is not a valid ,v file) are written
  to stderr and collected.  After the walk, the symbol database is
  written; then FatalException is raised if any such errors were
  collected or if no ',v' file was found at all."""
  OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    """os.path.walk() callback: parse the ',v' files among FILES (the
    names of the entries of the directory DIRNAME).  BATON is the
    CollectData instance."""
    cd = baton
    attic_len = len(OS_SEP_PLUS_ATTIC)
    for fname in files:
      if fname[-2:] != ',v':
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname[-attic_len:] == OS_SEP_PLUS_ATTIC:
        # drop the 'Attic' portion from the pathname for the canonical name.
        cd.set_fname(os.path.join(dirname[:-attic_len], fname), pathname)
      else:
        # If this file also exists in the attic, it's a fatal error
        attic_path = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_path):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, pathname, attic_path)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        cd.set_fname(pathname, pathname)
      Log().write(LOG_NORMAL, pathname)

      # Close each RCS file explicitly as soon as it has been parsed,
      # rather than leaving that to the garbage collector: a
      # repository can contain a very large number of files.  The
      # open() stays inside the inner try block so that a failure to
      # open the file is still logged before being re-raised.
      rcs_file = None
      try:
        try:
          rcs_file = open(pathname, 'rb')
          cvs2svn_rcsparse.parse(rcs_file, cd)
        except (cvs2svn_rcsparse.common.RCSParseError, ValueError,
                RuntimeError):
          err = "%s: '%s' is not a valid ,v file" \
                % (error_prefix, pathname)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        except:
          Log().write(LOG_WARN,
                      "Exception occurred while parsing %s" % pathname)
          raise
      finally:
        if rcs_file is not None:
          rcs_file.close()

  os.path.walk(Ctx().project.project_cvs_repos_path, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if len(cd.fatal_errors) > 0:
    raise FatalException("Pass 1 complete.\n"
                         + "=" * 75 + "\n"
                         + "Error summary:\n"
                         + "\n".join(cd.fatal_errors) + "\n"
                         + "Exited due to fatal error(s).\n")

  if cd.found_valid_file is None:
    raise FatalException(
        "\n"
        "No RCS files found in your CVS Repository!\n"
        "Are you absolutely certain you are pointing cvs2svn\n"
        "at a CVS repository?\n"
        "\n"
        "Exited due to fatal error(s).\n")

  StatsKeeper().reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4307
def pass2():
  """Pass 2: clean up the revision information.

  First validate the symbol-related options against the symbol
  database: excluded branches that other symbols depend on, forced
  tags that have commits, and symbols that are tags in some files but
  branches in others.  Every problem found is reported on stderr; if
  there was at least one, exit with status 1.

  Otherwise create the tags database, then copy the revisions file to
  the clean-revisions file one revision at a time.  Revisions on
  excluded branches are dropped, references to excluded symbols are
  removed, forced tags/branches are converted, and timestamps are
  adjusted according to the records read from the resync data file."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Convert the list of regexps to a list of strings
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    # A mismatch is resolved if the user forced the symbol either way.
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags:
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    "Read the .resync file into memory."

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list
    # because we will dynamically expand the lower/upper bound as we find
    # commits that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    for line in fileinput.FileInput(fname):
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      resync.setdefault(digest, []).append([t1_l, t1_u, t2])

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    for val in resync.values():
      val.sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  tweaked_timestamps_db = Database(temp(TWEAKED_TIMESTAMPS_DB), DB_OPEN_NEW)
  Cleanup().register(temp(TWEAKED_TIMESTAMPS_DB), pass2)

  # Predicate used to strip excluded symbols from each revision.  It
  # does not depend on the revision, so it is defined once here.
  def not_excluded(symbol, excludes=excludes):
    return not excludes.has_key(symbol)

  # process the revisions file, looking for items to clean up
  for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
    c_rev = CVSRevision(Ctx(), line[:-1])

    # Skip this entire revision if it's on an excluded branch
    if excludes.has_key(c_rev.branch_name):
      continue

    # If a neighbouring revision already had its timestamp tweaked,
    # use the tweaked value when comparing against it below.
    new_prev_ts = None
    if c_rev.prev_rev is not None:
      new_prev_ts = tweaked_timestamps_db.get(
        c_rev.unique_key(c_rev.prev_rev), None)
    if new_prev_ts:
      c_rev.prev_timestamp = new_prev_ts

    new_next_ts = None
    if c_rev.next_rev is not None:
      new_next_ts = tweaked_timestamps_db.get(
        c_rev.unique_key(c_rev.next_rev), None)
    if new_next_ts:
      c_rev.next_timestamp = new_next_ts

    # Remove all references to excluded tags and branches
    c_rev.branches = filter(not_excluded, c_rev.branches)
    c_rev.tags = filter(not_excluded, c_rev.tags)

    # Convert all branches that are forced to be tags
    for forced_tag in Ctx().forced_tags:
      if forced_tag in c_rev.branches:
        c_rev.branches.remove(forced_tag)
        c_rev.tags.append(forced_tag)

    # Convert all tags that are forced to be branches
    for forced_branch in Ctx().forced_branches:
      if forced_branch in c_rev.tags:
        c_rev.tags.remove(forced_branch)
        c_rev.branches.append(forced_branch)

    # see if this is "near" any of the resync records we
    # have recorded for this digest [of the log message].
    for record in resync.get(c_rev.digest, []):
      if record[2] == c_rev.timestamp:
        # This means that either c_rev is the same revision that
        # caused the resync record to exist, or c_rev is a different
        # CVS revision that happens to have the same timestamp.  In
        # either case, we don't have to do anything, so we...
        continue

      if record[0] <= c_rev.timestamp <= record[1]:
        # bingo!  We probably want to remap the time on this c_rev,
        # unless the remapping would be useless because the new time
        # would fall outside the COMMIT_THRESHOLD window for this
        # commit group.
        new_timestamp = record[2]
        # If the new timestamp is earlier than that of our previous revision
        if new_timestamp < c_rev.prev_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is before the time of previous"
                  + " revision %s (%s):")
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, new_timestamp,
                                        c_rev.prev_rev, c_rev.prev_timestamp))
          # If resyncing our rev to c_rev.prev_timestamp + 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted resync time, then sync back to c_rev.prev_timestamp
          # + 1...
          if ((c_rev.prev_timestamp + 1) - new_timestamp) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.prev_timestamp + 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          else:
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)
            continue

        # If the new timestamp is later than that of our next revision
        elif c_rev.next_timestamp and new_timestamp > c_rev.next_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is after time of next"
                  + " revision %s (%s):")
          # Report the *next* revision here: it is the one whose
          # timestamp the attempted resync would overtake.
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, new_timestamp,
                                        c_rev.next_rev, c_rev.next_timestamp))
          # If resyncing our rev to c_rev.next_timestamp - 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted resync time, then sync forward to c_rev.next_timestamp
          # - 1...
          if (new_timestamp - (c_rev.next_timestamp - 1)) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.next_timestamp - 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          else:
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)
            continue

        # Fix for Issue #71: Avoid resyncing two consecutive revisions
        # to the same timestamp.
        elif (new_timestamp == c_rev.prev_timestamp
              or new_timestamp == c_rev.next_timestamp):
          continue

        # adjust the time range. we want the COMMIT_THRESHOLD from the
        # bounds of the earlier/latest commit in this group.
        record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
        record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

        msg = "PASS2 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
              % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                 new_timestamp - c_rev.timestamp)
        Log().write(LOG_VERBOSE, msg)

        c_rev.timestamp = new_timestamp
        tweaked_timestamps_db[c_rev.unique_key()] = new_timestamp

        # stop looking for hits
        break

    output.write(str(c_rev) + "\n")

  # Flush the cleaned revisions to disk explicitly rather than relying
  # on garbage collection; the next pass reads this file.
  output.close()
  Log().write(LOG_QUIET, "Done")
4548
def pass3():
  """Pass 3: sort the clean-revisions file into the sorted-revisions
  file, and register the sorted file for cleanup after pass 5."""

  Log().write(LOG_QUIET, "Sorting CVS revisions...")

  clean_revs_path = temp(DATAFILE + CLEAN_REVS_SUFFIX)
  sorted_revs_path = temp(DATAFILE + SORTED_REVS_SUFFIX)

  sort_file(clean_revs_path, sorted_revs_path)
  Cleanup().register(sorted_revs_path, pass5)

  Log().write(LOG_QUIET, "Done")
4555
def pass4():
  """Pass 4: load the sorted revisions into a CVSRevisionDatabase.

  Unless this is a trunk-only conversion, also build the
  LastSymbolicNameDatabase, which contains the last CVSRevision that
  is a source for each tag or branch.  Every revision is also
  recorded with the StatsKeeper.
  """
  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)

  if Ctx().trunk_only:
    # Use a do-nothing stand-in so that the loop below does not have
    # to test Ctx().trunk_only for every revision.
    class NullLastSymbolicNameDatabase:
      def log_revision(*args):
        pass
      def create_database(*args):
        pass
    last_sym_name_db = NullLastSymbolicNameDatabase()
  else:
    Log().write(LOG_QUIET,
        "Finding last CVS revisions for all symbolic names...")
    last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for rev_line in sorted_revs:
    # Each line is one serialized revision; drop the trailing newline.
    c_rev = CVSRevision(Ctx(), rev_line[:-1])
    cvs_revs_db.log_revision(c_rev)
    last_sym_name_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  last_sym_name_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4586
def pass5():
  """Pass 5: generate the SVNCommit <-> CVSRevision mapping databases.

  Each sorted revision is handed to a CVSRevisionAggregator; in a
  trunk-only conversion, revisions that are on a branch are skipped.
  CVSCommit._commit also calls SymbolingsLogger to register
  CVSRevisions that represent an opening or closing for a path on a
  branch or tag.  See SymbolingsLogger for more details.
  """
  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for rev_line in sorted_revs:
    c_rev = CVSRevision(Ctx(), rev_line[:-1])
    if Ctx().trunk_only and c_rev.branch_name is not None:
      # Branch revisions are ignored in a trunk-only conversion.
      continue
    aggregator.process_revision(c_rev)
  aggregator.flush()

  # SVNCommit.revnum is one past the last revision number handed out.
  StatsKeeper().set_svn_rev_count(SVNCommit.revnum - 1)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4606
def pass6():
  """Pass 6: sort the symbol openings/closings file.

  Nothing is sorted in a trunk-only conversion.  The sorted file is
  registered for cleanup after pass 8."""

  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  if not Ctx().trunk_only:
    unsorted_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    sorted_path = temp(SYMBOL_OPENINGS_CLOSINGS_SORTED)
    sort_file(unsorted_path, sorted_path)
    Cleanup().register(sorted_path, pass8)

  Log().write(LOG_QUIET, "Done")
4615
def pass7():
  """Pass 7: record the file offset at which each symbolic name first
  appears in the sorted symbol openings/closings file.

  Nothing is generated in a trunk-only conversion."""

  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      # Use tell()/readline() rather than iterating over the file, so
      # that the offset recorded is exactly where the line starts.
      while 1:
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only the symbol name is used, but unpack all three fields so
        # that a malformed line is still detected.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      # Release the file handle even if a line cannot be parsed.
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4649
def pass8():
  """Pass 8: replay the stored SVNCommits through an
  SVNRepositoryMirror.

  A RepositoryDelegate is attached if Ctx().target is set, otherwise
  a DumpfileDelegate; neither is attached in a dry run.  A
  StdoutDelegate is always attached."""

  repos = SVNRepositoryMirror()
  persistence_manager = PersistenceManager(DB_OPEN_READ)

  if Ctx().target:
    delegate_class = RepositoryDelegate
    start_message = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    start_message = "Starting Subversion Dumpfile."

  if not Ctx().dry_run:
    repos.add_delegate(delegate_class())
  Log().write(LOG_QUIET, start_message)

  repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  # Start at 2 because repository initialization is revision 1.
  svn_revnum = 2
  svn_commit = persistence_manager.get_svn_commit(svn_revnum)
  while svn_commit:
    repos.commit(svn_commit)
    svn_revnum += 1
    svn_commit = persistence_manager.get_svn_commit(svn_revnum)

  repos.finish()
4674
# The conversion passes in execution order.  Pass number N (1-based,
# as given to the -p option) is _passes[N - 1].
_passes = [pass1, pass2, pass3, pass4, pass5, pass6, pass7, pass8]
4685
4686
class Ctx:
  """Session state for this run of cvs2svn (run-time options and the
  like).

  This class is a Borg: all instances share a single attribute
  dictionary, so Ctx() may be instantiated anywhere to reach the same
  state.  See
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531.
  """
  __shared_state = { }
  def __init__(self):
    state = self.__shared_state
    self.__dict__ = state
    if state:
      # An earlier instance has already installed the defaults.
      return

    # First instantiation: install the default settings.

    # Output and working files.
    self.target = None
    self.dumpfile = DUMPFILE
    self.tmpdir = '.'
    self.existing_svnrepos = 0
    self.dump_only = 0
    self.dry_run = 0
    self.skip_cleanup = 0

    # Verbosity and help.
    self.verbose = 0
    self.quiet = 0
    self.print_help = 0

    # Repository layout.
    self.trunk_only = 0
    self.trunk_base = "trunk"
    self.tags_base = "tags"
    self.branches_base = "branches"
    self.prune = 1

    # Symbol handling.
    self.forced_branches = []
    self.forced_tags = []
    self.excludes = []
    self.symbol_transforms = []

    # File contents and properties.
    self.encoding = ["ascii"]
    self.mime_types_file = None
    self.auto_props_file = None
    self.auto_props_ignore_case = False
    self.no_default_eol = 0
    self.eol_from_mime_type = 0
    self.keywords_off = 0
    self.svn_property_setters = []

    # External programs and commit metadata.
    self.use_cvs = None
    self.svnadmin = "svnadmin"
    self.username = None
    self.bdb_txn_nosync = 0
    self.fs_type = None
4730
4731
class CVSRevisionNumberSetter(SVNPropertySetter):
  """Record the CVS revision number in the cvs2svn:cvs-rev property."""

  def set_properties(self, s_item):
    cvs_revision = s_item.c_rev.rev
    s_item.svn_props['cvs2svn:cvs-rev'] = cvs_revision
    # Flag the item's properties as changed.
    s_item.svn_props_changed = True
4738
4739
class MimeMapper(SVNPropertySetter):
  """A class that provides mappings from file names to MIME types.

  The mappings are read from an apache-style mime.types file and are
  keyed on the file extension (without the dot)."""

  def __init__(self, mime_types_file):
    """Read the extension -> MIME type mappings from MIME_TYPES_FILE.

    Lines starting with '#' and lines with fewer than two fields are
    ignored.  If an extension is mapped to two different types, a
    warning is written to stderr and the later mapping wins."""

    self.mappings = { }

    # Use a private FileInput instance, as the rest of this file does,
    # rather than the module-global state behind fileinput.input().
    for line in fileinput.FileInput(mime_types_file):
      if line.startswith("#"):
        continue

      # format of a line is something like
      # text/plain c h cpp
      fields = line.split()
      if len(fields) < 2:
        continue
      mime_type = fields[0]
      for ext in fields[1:]:
        if self.mappings.has_key(ext) and self.mappings[ext] != mime_type:
          sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n"
                           % (warning_prefix, ext, self.mappings[ext],
                              mime_type))
        self.mappings[ext] = mime_type

  def set_properties(self, s_item):
    """Set svn:mime-type on S_ITEM if its file name has a known mapping."""

    basename, extension = os.path.splitext(
        os.path.basename(s_item.c_rev.cvs_path)
        )

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename

    mime_type = self.mappings.get(extension, None)
    if mime_type is not None:
      s_item.svn_props['svn:mime-type'] = mime_type
4780
4781
class AutoPropsPropertySetter(SVNPropertySetter):
  """Set arbitrary svn properties based on an auto-props configuration.

  This class supports both case-sensitive and case-insensitive
  pattern matching.  The 'correct' behavior is not quite clear,
  because subversion itself does an inconsistent job of handling case
  in auto-props patterns; see
  http://subversion.tigris.org/issues/show_bug.cgi?id=2036."""

  class Pattern:
    """Describes the properties to be set for files matching a pattern."""
    def __init__(self, pattern, propdict):
      # A glob-like pattern:
      self.pattern = pattern
      # A dictionary of properties that should be set:
      self.propdict = propdict

    def match(self, basename):
      """Does the file with the specified basename match pattern?"""
      return fnmatch.fnmatch(basename, self.pattern)

  def __init__(self, configfilename, ignore_case):
    """Read the auto-props patterns from the file CONFIGFILENAME.

    If IGNORE_CASE is true, section names, patterns and file names are
    all lower-cased before being compared; otherwise case is
    preserved throughout."""

    config = ConfigParser.ConfigParser()
    if ignore_case:
      self.transform_case = self.squash_case
    else:
      # Stop ConfigParser from transforming option names (which are
      # the patterns here).
      config.optionxform = self.preserve_case
      self.transform_case = self.preserve_case

    config_file = open(configfilename)
    try:
      config.readfp(config_file)
    finally:
      # Do not leave the configuration file open.
      config_file.close()

    self.patterns = []
    for section in config.sections():
      if self.transform_case(section) == 'auto-props':
        for (pattern, value) in config.items(section):
          if value:
            self._add_pattern(pattern, value)

  def squash_case(self, s):
    """Return S converted to lower case."""
    return s.lower()

  def preserve_case(self, s):
    """Return S unchanged."""
    return s

  def _add_pattern(self, pattern, value):
    """Register PATTERN with the properties described by VALUE.

    VALUE is a ';'-separated list of NAME=VALUE or bare NAME entries;
    a bare NAME is stored with the value None."""

    props = value.split(';')
    propdict = {}
    for prop in props:
      s = prop.split('=', 1)
      if len(s) == 1:
        propdict[s[0]] = None
      else:
        propdict[s[0]] = s[1]
    self.patterns.append(
        self.Pattern(self.transform_case(pattern), propdict))

  def get_propdict(self, path):
    """Return the dictionary of properties to set for the file PATH.

    The properties of all patterns matching the basename of PATH are
    combined.  If two patterns give different values for the same
    property, the first value is kept and a warning is logged."""

    basename = self.transform_case(os.path.basename(path))
    propdict = {}
    for pattern in self.patterns:
      if pattern.match(basename):
        for (key,value) in pattern.propdict.items():
          if propdict.has_key(key):
            if propdict[key] != value:
              Log().write(
                  LOG_WARN,
                  "Contradictory values set for property '%s' for file %s."
                  % (key, path,))
          else:
            propdict[key] = value

    return propdict

  def set_properties(self, s_item):
    """Apply the auto-props for S_ITEM's file, without overriding any
    property that is already set on S_ITEM (a warning is logged if the
    existing value differs)."""

    propdict = self.get_propdict(s_item.c_rev.cvs_path)
    for (k,v) in propdict.items():
      if s_item.svn_props.has_key(k):
        if s_item.svn_props[k] != v:
          Log().write(
              LOG_WARN,
              "Property '%s' already set for file %s."
              % (k, s_item.c_rev.cvs_path,))
      else:
        s_item.svn_props[k] = v
4866
4867
class BinaryFileDefaultMimeTypeSetter(SVNPropertySetter):
  """Give binary files a fallback mime type when none has been set."""

  def set_properties(self, s_item):
    if s_item.svn_props.has_key('svn:mime-type'):
      # A mime type entry already exists; leave it alone.
      return
    if s_item.c_rev.mode == 'b':
      s_item.svn_props['svn:mime-type'] = 'application/octet-stream'
4875
4876
class BinaryFileEOLStyleSetter(SVNPropertySetter):
  """For binary files, set the svn:eol-style entry to None."""

  def set_properties(self, s_item):
    is_binary = (s_item.c_rev.mode == 'b')
    if is_binary:
      s_item.svn_props['svn:eol-style'] = None
4883
4884
class EOLStyleFromMimeTypeSetter(SVNPropertySetter):
  """Derive the eol-style from the mime type, if not already known.

  The mime type must have been set before this setter runs; "text/"
  types get 'native' and all other types get None.  See also
  issue #39."""

  def set_properties(self, s_item):
    if s_item.svn_props.has_key('svn:eol-style'):
      # An eol-style entry already exists; leave it alone.
      return

    mime_type = s_item.svn_props.get('svn:mime-type', None)
    if mime_type is None:
      # Without a mime type there is nothing to go by.
      return

    if mime_type.startswith("text/"):
      eol_style = 'native'
    else:
      eol_style = None
    s_item.svn_props['svn:eol-style'] = eol_style
4898
4899
class DefaultEOLStyleSetter(SVNPropertySetter):
  """Apply a default eol-style to items that do not have one yet."""

  def __init__(self, value):
    """Remember VALUE as the default eol-style."""

    self.value = value

  def set_properties(self, s_item):
    if s_item.svn_props.has_key('svn:eol-style'):
      # An eol-style entry already exists; leave it alone.
      return
    s_item.svn_props['svn:eol-style'] = self.value
4911
4912
class KeywordsPropertySetter(SVNPropertySetter):
  """Set svn:keywords according to the file's mode.  See issue #2."""

  def __init__(self, value):
    """VALUE is what svn:keywords is set to, when it is set at all."""

    self.value = value

  def set_properties(self, s_item):
    if s_item.svn_props.has_key('svn:keywords'):
      # A keywords entry already exists; leave it alone.
      return
    # Only these modes get the property.
    if s_item.c_rev.mode in [None, 'kv', 'kvl']:
      s_item.svn_props['svn:keywords'] = self.value
4927
4928
class ExecutablePropertySetter(SVNPropertySetter):
  """Set svn:executable on items whose c_rev.file_executable is true."""

  def set_properties(self, s_item):
    if not s_item.c_rev.file_executable:
      return
    s_item.svn_props['svn:executable'] = '*'
4935
4936
def convert(start_pass, end_pass):
  """Convert a CVS repository to an SVN repository.

  Run passes START_PASS through END_PASS (1-based, inclusive),
  recording the duration of each one, then log the collected
  statistics and timings."""

  cleanup = Cleanup()

  # finish_times[N] is the time at which pass N finished; the slot
  # before the first pass to be run holds the overall starting time.
  finish_times = [ None ] * (end_pass + 1)
  finish_times[start_pass - 1] = time.time()
  StatsKeeper().set_start_time(time.time())

  for pass_num in range(start_pass, end_pass + 1):
    Log().write(LOG_QUIET, '----- pass %d -----' % pass_num)
    current_pass = _passes[pass_num - 1]
    current_pass()
    finish_times[pass_num] = time.time()
    duration = finish_times[pass_num] - finish_times[pass_num - 1]
    StatsKeeper().log_duration_for_pass(duration, pass_num)

    # Dispose of items in Ctx() not intended to live past the end of
    # the pass.  These are identified by exactly one leading
    # underscore; the name-mangled "_Ctx__" attributes are kept.
    for attr in dir(Ctx()):
      if len(attr) <= 2 or attr[0] != '_' or attr[1] == '_':
        continue
      if attr[:6] == "_Ctx__":
        continue
      delattr(Ctx(), attr)

    if not Ctx().skip_cleanup:
      cleanup.cleanup(current_pass)
    StatsKeeper().set_end_time(time.time())

  Log().write(LOG_QUIET, StatsKeeper())
  if end_pass < 4:
    Log().write(LOG_QUIET,
                '(These are unaltered CVS repository stats and do not\n'
                ' reflect tags or branches excluded via --exclude)\n')
  Log().write(LOG_NORMAL, StatsKeeper().timings())
4965
4966
def normalize_ttb_path(opt, path):
  """Normalize PATH, the argument of option OPT (--trunk, --tags, or
  --branches).

  PATH is split on '/' and its components are rejoined with
  _path_join, which is expected to strip leading, trailing, and
  duplicated '/'.

  Return the normalized path.  Raise FatalError if the normalized
  path is empty."""

  components = path.split('/')
  norm_path = _path_join(*components)
  if norm_path:
    return norm_path
  raise FatalError("cannot pass an empty path to %s." % (opt,))
4981
4982
def verify_paths_disjoint(*paths):
  """Check that no two of the PATHS overlap.

  Two paths overlap if they are identical or if one is nested in the
  other (in the sense that 'a/b/c/d' is nested in 'a/b').  Raise
  FatalError for the first overlapping pair found; otherwise return
  None."""

  # Pair each path with its list of components, so that sorting
  # compares component by component.
  decorated = []
  for path in paths:
    decorated.append((path.split('/'), path))

  # A list that is a prefix of another sorts before it, so if any
  # paths overlap, at least one overlapping pair ends up adjacent, in
  # the order [nest, nestling].
  decorated.sort()

  for (outer_parts, outer), (inner_parts, inner) \
          in zip(decorated[:-1], decorated[1:]):
    # The slice can only equal outer_parts if inner_parts is at least
    # as long, so no separate length check is needed.
    if inner_parts[:len(outer_parts)] == outer_parts:
      raise FatalError("paths %s and %s are not disjoint." % (outer, inner,))
5002
5003
5004 def usage():
5005   print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
5006         % os.path.basename(sys.argv[0])
5007   print '  --help, -h           print this usage message and exit with success'
5008   print '  --version            print the version number'
5009   print '  -q                   quiet'
5010   print '  -v                   verbose'
5011   print '  -s PATH              path for SVN repos'
5012   print '  -p START[:END]       start at pass START, end at pass END of %d' \
5013         % len(_passes)
5014   print '                       If only START is given, run only pass START'
5015   print '                       (implicitly enables --skip-cleanup)'
5016   print '  --existing-svnrepos  load into existing SVN repository'
5017   print '  --dumpfile=PATH      name of intermediate svn dumpfile'
5018   print '  --tmpdir=PATH        directory to use for tmp data (default to cwd)'
5019   print '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)'
5020   print '  --dry-run            do not create a repository or a dumpfile;'
5021   print '                       just print what would happen.'
5022   print '  --use-cvs            use CVS instead of RCS \'co\' to extract data'
5023   print '                       (only use this if having problems with RCS)'
5024   print '  --svnadmin=PATH      path to the svnadmin program'
5025   print '  --trunk-only         convert only trunk commits, not tags nor branches'
5026   print '  --trunk=PATH         path for trunk (default: %s)'    \
5027         % Ctx().trunk_base
5028   print '  --branches=PATH      path for branches (default: %s)' \
5029         % Ctx().branches_base
5030   print '  --tags=PATH          path for tags (default: %s)'     \
5031         % Ctx().tags_base
5032   print '  --no-prune           don\'t prune empty directories'
5033   print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
5034   print '  --encoding=ENC       encoding of paths and log messages in CVS repos'
5035   print '                       Multiple of these options may be passed, where they'
5036   print '                       will be treated as an ordered list of encodings to'
5037   print '                       attempt (with "ascii" as a hardcoded last resort)'
5038   print '  --force-branch=NAME  force NAME to be a branch'
5039   print '  --force-tag=NAME     force NAME to be a tag'
5040   print '  --exclude=REGEXP     exclude branches and tags matching REGEXP'
5041   print '  --symbol-transform=P:S transform symbol names from P to S where P and S'
5042   print '                       use Python regexp and reference syntax respectively'
5043   print '  --username=NAME      username for cvs2svn-synthesized commits'
5044   print '  --skip-cleanup       prevent the deletion of intermediate files'
5045   print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
5046   print '  --fs-type=TYPE       pass --fs-type=TYPE to "svnadmin create"'
5047   print '  --cvs-revnums        record CVS revision numbers as file properties'
5048   print '  --auto-props=FILE    set file properties from the auto-props section'
5049   print '                       of a file in svn config format'
5050   print '  --auto-props-ignore-case Ignore case when matching auto-props patterns'
5051   print '  --mime-types=FILE    specify an apache-style mime.types file for'
5052   print '                       setting svn:mime-type'
5053   print '  --eol-from-mime-type set svn:eol-style from mime type if known'
5054   print '  --no-default-eol     don\'t set svn:eol-style to \'native\' for'
5055   print '                       non-binary files with undetermined mime types'
5056   print '  --keywords-off       don\'t set svn:keywords on any files (by default,'
5057   print '                       cvs2svn sets svn:keywords on non-binary files to'
5058   print '                       "%s")' % SVN_KEYWORDS_VALUE
5059
5060 def main():
5061   # Convenience var, so we don't have to keep instantiating this Borg.
5062   ctx = Ctx()
5063
5064   profiling = None
5065   start_pass = 1
5066   end_pass = len(_passes)
5067
5068   try:
5069     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
5070                                [ "help", "create", "trunk=",
5071                                  "username=", "existing-svnrepos",
5072                                  "branches=", "tags=", "encoding=",
5073                                  "force-branch=", "force-tag=", "exclude=",
5074                                  "use-cvs", "mime-types=",
5075                                  "auto-props=", "auto-props-ignore-case",
5076                                  "eol-from-mime-type", "no-default-eol",
5077                                  "trunk-only", "no-prune", "dry-run",
5078                                  "dump-only", "dumpfile=", "tmpdir=",
5079                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
5080                                  "bdb-txn-nosync", "fs-type=",
5081                                  "version", "profile",
5082                                  "keywords-off", "symbol-transform="])
5083   except getopt.GetoptError, e:
5084     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
5085     usage()
5086     sys.exit(1)
5087
5088   for opt, value in opts:
5089     if opt == '--version':
5090         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
5091         sys.exit(0)
5092     elif opt == '-p':
5093       # Don't cleanup if we're doing incrementals.
5094       ctx.skip_cleanup = 1
5095       if value.find(':') > 0:
5096         start_pass, end_pass = map(int, value.split(':'))
5097       else:
5098         end_pass = start_pass = int(value)
5099       if start_pass > len(_passes) or start_pass < 1:
5100         raise FatalError(
5101             'illegal value (%d) for starting pass.  Must be 1 through %d.'
5102             % (int(start_pass), len(_passes),))
5103       if end_pass < start_pass or end_pass > len(_passes):
5104         raise FatalError(
5105             'illegal value (%d) for ending pass.  Must be %d through %d.'
5106             % (int(end_pass), int(start_pass), len(_passes),))
5107     elif (opt == '--help') or (opt == '-h'):
5108       ctx.print_help = 1
5109     elif opt == '-v':
5110       Log().log_level = LOG_VERBOSE
5111       ctx.verbose = 1
5112     elif opt == '-q':
5113       Log().log_level = LOG_QUIET
5114       ctx.quiet = 1
5115     elif opt == '-s':
5116       ctx.target = value
5117     elif opt == '--existing-svnrepos':
5118       ctx.existing_svnrepos = 1
5119     elif opt == '--dumpfile':
5120       ctx.dumpfile = value
5121     elif opt == '--tmpdir':
5122       ctx.tmpdir = value
5123     elif opt == '--use-cvs':
5124       ctx.use_cvs = 1
5125     elif opt == '--svnadmin':
5126       ctx.svnadmin = value
5127     elif opt == '--trunk-only':
5128       ctx.trunk_only = 1
5129     elif opt == '--trunk':
5130       ctx.trunk_base = normalize_ttb_path(opt, value)
5131     elif opt == '--branches':
5132       ctx.branches_base = normalize_ttb_path(opt, value)
5133     elif opt == '--tags':
5134       ctx.tags_base = normalize_ttb_path(opt, value)
5135     elif opt == '--no-prune':
5136       ctx.prune = None
5137     elif opt == '--dump-only':
5138       ctx.dump_only = 1
5139     elif opt == '--dry-run':
5140       ctx.dry_run = 1
5141     elif opt == '--encoding':
5142       ctx.encoding.insert(-1, value)
5143     elif opt == '--force-branch':
5144       ctx.forced_branches.append(value)
5145     elif opt == '--force-tag':
5146       ctx.forced_tags.append(value)
5147     elif opt == '--exclude':
5148       try:
5149         ctx.excludes.append(re.compile('^' + value + '$'))
5150       except re.error, e:
5151         raise FatalError("'%s' is not a valid regexp." % (value,))
5152     elif opt == '--mime-types':
5153       ctx.mime_types_file = value
5154     elif opt == '--auto-props':
5155       ctx.auto_props_file = value
5156     elif opt == '--auto-props-ignore-case':
5157       ctx.auto_props_ignore_case = True
5158     elif opt == '--eol-from-mime-type':
5159       ctx.eol_from_mime_type = 1
5160     elif opt == '--no-default-eol':
5161       ctx.no_default_eol = 1
5162     elif opt == '--keywords-off':
5163       ctx.keywords_off = 1
5164     elif opt == '--username':
5165       ctx.username = value
5166     elif opt == '--skip-cleanup':
5167       ctx.skip_cleanup = 1
5168     elif opt == '--cvs-revnums':
5169       ctx.svn_property_setters.append(CVSRevisionNumberSetter())
5170     elif opt == '--bdb-txn-nosync':
5171       ctx.bdb_txn_nosync = 1
5172     elif opt == '--fs-type':
5173       ctx.fs_type = value
5174     elif opt == '--create':
5175       sys.stderr.write(warning_prefix +
5176           ': The behaviour produced by the --create option is now the '
5177           'default,\nand passing the option is deprecated.\n')
5178     elif opt == '--profile':
5179       profiling = 1
5180     elif opt == '--symbol-transform':
5181       [pattern, replacement] = value.split(":")
5182       try:
5183         pattern = re.compile(pattern)
5184       except re.error, e:
5185         raise FatalError("'%s' is not a valid regexp." % (pattern,))
5186       ctx.symbol_transforms.append((pattern, replacement,))
5187
5188   if ctx.print_help:
5189     usage()
5190     sys.exit(0)
5191
5192   # Consistency check for options and arguments.
5193   if len(args) == 0:
5194     usage()
5195     sys.exit(1)
5196
5197   if len(args) > 1:
5198     sys.stderr.write(error_prefix +
5199                      ": must pass only one CVS repository.\n")
5200     usage()
5201     sys.exit(1)
5202
5203   cvsroot = args[0]
5204
5205   if ctx.use_cvs:
5206     ctx.cvs_repository = CVSRepositoryViaCVS(cvsroot)
5207   else:
5208     ctx.cvs_repository = CVSRepositoryViaRCS(cvsroot)
5209
5210   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
5211     raise FatalError("must pass one of '-s' or '--dump-only'.")
5212
5213   def not_both(opt1val, opt1name, opt2val, opt2name):
5214     if opt1val and opt2val:
5215       raise FatalError("cannot pass both '%s' and '%s'."
5216                        % (opt1name, opt2name,))
5217
5218   not_both(ctx.target, '-s',
5219            ctx.dump_only, '--dump-only')
5220
5221   not_both(ctx.dump_only, '--dump-only',
5222            ctx.existing_svnrepos, '--existing-svnrepos')
5223
5224   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
5225            ctx.existing_svnrepos, '--existing-svnrepos')
5226
5227   not_both(ctx.dump_only, '--dump-only',
5228            ctx.bdb_txn_nosync, '--bdb-txn-nosync')
5229
5230   not_both(ctx.quiet, '-q',
5231            ctx.verbose, '-v')
5232
5233   not_both(ctx.fs_type, '--fs-type',
5234            ctx.existing_svnrepos, '--existing-svnrepos')
5235
5236   if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
5237     raise FatalError("cannot pass --bdb-txn-nosync with --fs-type=%s."
5238                      % ctx.fs_type)
5239
5240   # Create the default project (using ctx.trunk, ctx.branches, and ctx.tags):
5241   ctx.project = Project(ctx.cvs_repository.cvs_repos_path,
5242                         ctx.trunk_base, ctx.branches_base, ctx.tags_base)
5243
5244   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
5245     raise FatalError("the svn-repos-path '%s' is not an "
5246                      "existing directory." % ctx.target)
5247
5248   if not ctx.dump_only and not ctx.existing_svnrepos \
5249      and (not ctx.dry_run) and os.path.exists(ctx.target):
5250     raise FatalError("the svn-repos-path '%s' exists.\n"
5251                      "Remove it, or pass '--existing-svnrepos'."
5252                      % ctx.target)
5253
5254   if ctx.target and not ctx.dry_run:
5255     # Verify that svnadmin can be executed.  The 'help' subcommand
5256     # should be harmless.
5257     try:
5258       check_command_runs([ctx.svnadmin, 'help'], 'svnadmin')
5259     except CommandFailedException, e:
5260       raise FatalError(
5261           '%s\n'
5262           'svnadmin could not be executed.  Please ensure that it is\n'
5263           'installed and/or use the --svnadmin option.' % (e,))
5264
5265   if ctx.mime_types_file:
5266     ctx.svn_property_setters.append(MimeMapper(ctx.mime_types_file))
5267
5268   if ctx.auto_props_file:
5269     ctx.svn_property_setters.append(AutoPropsPropertySetter(
5270         ctx.auto_props_file, ctx.auto_props_ignore_case))
5271
5272   ctx.svn_property_setters.append(BinaryFileDefaultMimeTypeSetter())
5273   ctx.svn_property_setters.append(BinaryFileEOLStyleSetter())
5274
5275   if ctx.eol_from_mime_type:
5276     ctx.svn_property_setters.append(EOLStyleFromMimeTypeSetter())
5277
5278   if ctx.no_default_eol:
5279     ctx.svn_property_setters.append(DefaultEOLStyleSetter(None))
5280   else:
5281     ctx.svn_property_setters.append(DefaultEOLStyleSetter('native'))
5282
5283   if not ctx.keywords_off:
5284     ctx.svn_property_setters.append(
5285         KeywordsPropertySetter(SVN_KEYWORDS_VALUE))
5286
5287   ctx.svn_property_setters.append(ExecutablePropertySetter())
5288
5289   # Make sure the tmp directory exists.  Note that we don't check if
5290   # it's empty -- we want to be able to use, for example, "." to hold
5291   # tempfiles.  But if we *did* want check if it were empty, we'd do
5292   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
5293   if not os.path.exists(ctx.tmpdir):
5294     os.mkdir(ctx.tmpdir)
5295   elif not os.path.isdir(ctx.tmpdir):
5296     raise FatalError(
5297         "cvs2svn tried to use '%s' for temporary files, but that path\n"
5298         "  exists and is not a directory.  Please make it be a directory,\n"
5299         "  or specify some other directory for temporary files."
5300         % (ctx.tmpdir,))
5301
5302   # But do lock the tmpdir, to avoid process clash.
5303   try:
5304     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5305   except OSError, e:
5306     if e.errno == errno.EACCES:
5307       raise FatalError("Permission denied:"
5308                        + " No write access to directory '%s'." % ctx.tmpdir)
5309     if e.errno == errno.EEXIST:
5310       raise FatalError(
5311           "cvs2svn is using directory '%s' for temporary files, but\n"
5312           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
5313           "  cvs2svn process is currently using '%s' as its temporary\n"
5314           "  workspace.  If you are certain that is not the case,\n"
5315           "  then remove the '%s/cvs2svn.lock' subdirectory."
5316           % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir,))
5317     raise
5318   try:
5319     if profiling:
5320       import hotshot
5321       prof = hotshot.Profile('cvs2svn.hotshot')
5322       prof.runcall(convert, start_pass, end_pass)
5323       prof.close()
5324     else:
5325       convert(start_pass, end_pass)
5326   finally:
5327     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5328     except: pass
5329
5330
5331 if __name__ == '__main__':
5332   try:
5333     main()
5334   except FatalException, e:
5335     sys.stderr.write(str(e))
5336     sys.exit(1)
5337
5338