1 #!/usr/bin/env python
2 # (Be in -*- python -*- mode.)
4 # cvs2svn: ...
6 # ====================================================================
7 # Copyright (c) 2000-2004 CollabNet. All rights reserved.
9 # This software is licensed as described in the file COPYING, which
10 # you should have received as part of this distribution. The terms
11 # are also available at http://subversion.tigris.org/license-1.html.
12 # If newer versions of this license are posted there, you may use a
13 # newer version instead, at your option.
15 # This software consists of voluntary contributions made by many
16 # individuals. For exact contribution history, see the revision
17 # history and logs, available at http://cvs2svn.tigris.org/.
18 # ====================================================================
20 VERSION = 'r' + "$LastChangedRevision$"[22:-2]
22 from __future__ import generators
24 import cvs2svn_rcsparse
25 import os
26 import sys
27 import sha
28 import re
29 import time
30 import fileinput
31 import fnmatch
32 import getopt
33 import stat
34 import md5
35 import marshal
36 import errno
37 import popen2
38 import types
39 import ConfigParser
40 try:
41 # Try to get access to a bunch of encodings for use with --encoding.
42 # See http://cjkpython.i18n.org/ for details.
43 import iconv_codec
44 except ImportError:
45 pass
47 # Warnings and errors start with these strings. They are typically
48 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
49 warning_prefix = "WARNING"
50 error_prefix = "ERROR"
52 # Make sure this Python is recent enough.
53 if sys.hexversion < 0x02020000:
54 sys.stderr.write("%s: Python 2.2 or higher required, "
55 "see www.python.org.\n" % error_prefix)
56 sys.exit(1)
58 # Pretend we have true booleans on older python versions
59 try:
60 True
61 except:
62 True = 1
63 False = 0
65 # Opening pipes was a mess before Python 2.4, because some methods did
66 # not exist on some platforms, and some behaved differently on others.
67 # Python 2.4 solved this by adding the subprocess module, but since we
68 # cannot require such a new version, we cannot use it directly, but
69 # must implement a simplified Popen using the best means available.
71 # The SimplePopen class only has the following members and methods, all
72 # behaving as documented in the subprocess.Popen class:
73 # - stdin
74 # - stdout
75 # - stderr
76 # - wait
77 try:
78 # First try subprocess.Popen...
79 import subprocess
80 class SimplePopen:
81 def __init__(self, cmd, capture_stderr):
82 if capture_stderr:
83 stderr = subprocess.PIPE
84 else:
85 stderr = None
86 self._popen = subprocess.Popen(cmd, stdin=subprocess.PIPE,
87 stdout=subprocess.PIPE, stderr=stderr)
88 self.stdin = self._popen.stdin
89 self.stdout = self._popen.stdout
90 if capture_stderr:
91 self.stderr = self._popen.stderr
92 self.wait = self._popen.wait
93 except ImportError:
94 if hasattr(popen2, 'Popen3'):
95 # ...then try popen2.Popen3...
96 class SimplePopen:
97 def __init__(self, cmd, capture_stderr):
98 self._popen3 = popen2.Popen3(cmd, capture_stderr)
99 self.stdin = self._popen3.tochild
100 self.stdout = self._popen3.fromchild
101 if capture_stderr:
102 self.stderr = self._popen3.childerr
103 self.wait = self._popen3.wait
104 else:
105 # ...and if all fails, use popen2.popen3...
106 class SimplePopen:
107 def __init__(self, cmd, capture_stderr):
108 if type(cmd) != types.StringType:
109 cmd = argv_to_command_string(cmd)
110 self.stdout, self.stdin, self.stderr = popen2.popen3(cmd, mode='b')
111 def wait(self):
112 return self.stdout.close() or self.stdin.close() or \
113 self.stderr.close()
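# Illustrative usage of SimplePopen (a sketch, not code cvs2svn itself runs;
# assumes a POSIX 'cat' binary is on the PATH):
#
#   pipe = SimplePopen(['cat'], True)
#   pipe.stdin.write('hello\n')
#   pipe.stdin.close()
#   data = pipe.stdout.read()      # -> 'hello\n'
#   status = pipe.wait()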
115 # DBM module selection
117 # 1. If we have bsddb3, it is probably newer than bsddb. Fake bsddb = bsddb3,
118 # so that the dbhash module used by anydbm will use bsddb3.
119 try:
120 import bsddb3
121 sys.modules['bsddb'] = sys.modules['bsddb3']
122 except ImportError:
123 pass
125 # 2. These DBM modules are not good for cvs2svn.
126 import anydbm
127 if (anydbm._defaultmod.__name__ == 'dumbdbm'
128 or anydbm._defaultmod.__name__ == 'dbm'):
129 sys.stderr.write(
130 error_prefix
131 + ': your installation of Python does not contain a suitable\n'
132 + 'DBM module -- cvs2svn cannot continue.\n'
133 + 'See http://python.org/doc/current/lib/module-anydbm.html to solve.\n')
134 sys.exit(1)
136 # 3. If we are using the old bsddb185 module, then prefer gdbm instead.
137 # Unfortunately, gdbm appears not to be trouble free, either.
138 if hasattr(anydbm._defaultmod, 'bsddb') \
139 and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
140 try:
141 gdbm = __import__('gdbm')
142 except ImportError:
143 sys.stderr.write(warning_prefix +
144 ': The version of the bsddb module found '
145 'on your computer has been reported to malfunction on some datasets, '
146 'causing KeyError exceptions. You may wish to upgrade your Python to '
147 'version 2.3 or later.\n')
148 else:
149 anydbm._defaultmod = gdbm
151 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
152 cvs_branch_tag = re.compile('^((?:[0-9]+\\.[0-9]+\\.)+)0\\.([0-9]+)$')
153 rcs_branch_tag = re.compile('^(?:[0-9]+\\.[0-9]+\\.)+[0-9]+$')
155 SVN_KEYWORDS_VALUE = 'Author Date Id Revision'
157 # This really only matches standard '1.1.1.*'-style vendor revisions.
158 # One could conceivably have a file whose default branch is 1.1.3 or
159 # whatever, or was that at some point in time, with vendor revisions
160 # 1.1.3.1, 1.1.3.2, etc. But with the default branch gone now (which
161 # is the only time this regexp gets used), we'd have no basis for
162 # assuming that the non-standard vendor branch had ever been the
163 # default branch anyway, so we don't want this to match them anyway.
164 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
166 # If this run's output is a repository, then (in the tmpdir) we use
167 # a dumpfile of this name for repository loads.
169 # If this run's output is a dumpfile, then this is the default name of
170 # that dumpfile, but in the current directory (unless the user has
171 # specified a dumpfile path, of course, in which case it will be
172 # wherever the user said).
173 DUMPFILE = 'cvs2svn-dump'
175 # This file appears with different suffixes at different stages of
176 # processing. CVS revisions are cleaned and sorted here, for commit
177 # grouping. See design-notes.txt for details.
178 DATAFILE = 'cvs2svn-data'
180 # This file contains a marshalled copy of all the statistics that we
181 # gather throughout the various runs of cvs2svn. The data is stored as a
182 # marshalled dictionary.
183 STATISTICS_FILE = 'cvs2svn-statistics'
185 # This text file contains records (1 per line) that describe svn
186 # filesystem paths that are the opening and closing source revisions
187 # for copies to tags and branches. The format is as follows:
189 # SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
191 # Where type is either OPENING or CLOSING. The SYMBOL_NAME and
192 # SVN_REVNUM are the primary and secondary sorting criteria for
193 # creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
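# An illustrative record (hypothetical symbol and path, using the
# single-letter OPENING/CLOSING codes defined further below) might look like:
#
#   MY_BRANCH 42 O trunk/src/foo.c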
194 SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
195 # A sorted version of the above file.
196 SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'
198 # This file is a temporary file for storing symbolic_name -> closing
199 # CVSRevision until the end of our pass where we can look up the
200 # corresponding SVNRevNum for the closing revs and write these out to
201 # the SYMBOL_OPENINGS_CLOSINGS.
202 SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'
204 # Skeleton version of an svn filesystem.
205 # (These supersede and will eventually replace the two above.)
206 # See class SVNRepositoryMirror for how these work.
207 SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
208 SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'
210 # Offsets pointing to the beginning of each SYMBOLIC_NAME in
211 # SYMBOL_OPENINGS_CLOSINGS_SORTED
212 SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'
214 # Maps CVSRevision.unique_key()s to lists of symbolic names, where
215 # the CVSRevision is the last such that is a source for those symbolic
216 # names. For example, if branch B's number is 1.3.0.2 in this CVS
217 # file, and this file's 1.3 is the latest (by date) revision among
218 # *all* CVS files that is a source for branch B, then the
219 # CVSRevision.unique_key() corresponding to this file at 1.3 would
220 # list at least B in its list.
221 SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'
223 # Maps CVSRevision.unique_key() to corresponding line in s-revs.
224 ###PERF Or, we could map to an offset into s-revs, instead of dup'ing
225 ### the s-revs data in this database.
226 CVS_REVS_DB = 'cvs2svn-cvs-revs.db'
228 # Lists all symbolic names that are tags. Keys are strings (symbolic
229 # names), values are ignorable.
230 TAGS_DB = 'cvs2svn-tags.db'
232 # A list of all tags. Each line consists of the tag name and the number
233 # of files in which it exists, separated by a space.
234 TAGS_LIST = 'cvs2svn-tags.txt'
236 # A list of all branches. The file is stored as a plain text file
237 # to make it easy to look at in an editor. Each line contains the
238 # branch name, the number of files where the branch is created, the
239 # commit count, and a list of tags and branches that are defined on
240 # revisions in the branch.
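# An illustrative line (hypothetical branch name, counts and blockers),
# in the format written by SymbolDatabase.write() below:
#
#   RELEASE_1_BRANCH 17 5 RELEASE_1_0 RELEASE_1_1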
241 BRANCHES_LIST = 'cvs2svn-branches.txt'
243 # These two databases provide a bidirectional mapping between
244 # CVSRevision.unique_key()s and Subversion revision numbers.
246 # The first maps CVSRevision.unique_key() to a number; the values are
247 # not unique.
249 # The second maps Subversion revision numbers to tuples (c_rev_keys,
250 # motivating_revnum, symbolic_name, date).
252 # c_rev_keys is a list of CVSRevision.unique_key()s.
254 # If the SVNCommit is a default branch synchronization,
255 # motivating_revnum is the svn_revnum of the primary SVNCommit that
256 # motivated it; otherwise it is None. (NOTE: Secondary commits that
257 # fill branches and tags also have a motivating commit, but we do not
258 # record it because it is (currently) not needed for anything.)
259 # motivating_revnum is used when generating the log message for the
260 # commit that synchronizes the default branch with trunk.
262 # symbolic_name is the symbolic name associated with the commit (if it
263 # filled a symbolic name) or None otherwise.
265 # date is the date of the commit.
266 CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
267 SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'
269 # How many bytes to read at a time from a pipe. 128 kiB should be
270 # large enough to be efficient without wasting too much memory.
271 PIPE_READ_SIZE = 128 * 1024
273 # Record the default RCS branches, if any, for CVS filepaths.
275 # The keys are CVS filepaths, relative to the top of the repository
276 # and with the ",v" stripped off, so they match the cvs paths used in
277 # Commit.commit(). The values are vendor branch revisions, such as
278 # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'. The vendor branch revision
279 # represents the highest vendor branch revision thought to have ever
280 # been head of the default branch.
282 # The reason we record a specific vendor revision, rather than a
283 # default branch number, is that there are two cases to handle:
285 # One case is simple. The RCS file lists a default branch explicitly
286 # in its header, such as '1.1.1'. In this case, we know that every
287 # revision on the vendor branch is to be treated as head of trunk at
288 # that point in time.
290 # But there's also a degenerate case. The RCS file does not currently
291 # have a default branch, yet we can deduce that for some period in the
292 # past it probably *did* have one. For example, the file has vendor
293 # revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
294 # and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2. In this
295 # case, we should record 1.1.1.96 as the last vendor revision to have
296 # been the head of the default branch.
297 DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'
299 # Records the author and log message for each changeset.
300 # The keys are author+log digests, the same kind used to identify
301 # unique revisions in the .revs, etc files. Each value is a tuple
302 # of two elements: '(author logmessage)'.
303 METADATA_DB = "cvs2svn-metadata.db"
305 # A temporary on-disk hash that maps CVSRevision unique keys to a new
306 # timestamp for that CVSRevision. These new timestamps are created in
307 # pass2, and this hash is used exclusively in pass2.
308 TWEAKED_TIMESTAMPS_DB = "cvs2svn-fixed-timestamps.db"
310 REVS_SUFFIX = '.revs'
311 CLEAN_REVS_SUFFIX = '.c-revs'
312 SORTED_REVS_SUFFIX = '.s-revs'
313 RESYNC_SUFFIX = '.resync'
315 SVN_INVALID_REVNUM = -1
317 COMMIT_THRESHOLD = 5 * 60 # flush a commit if a 5 minute gap occurs
319 # Things that can happen to a file.
320 OP_NOOP = '-'
321 OP_ADD = 'A'
322 OP_DELETE = 'D'
323 OP_CHANGE = 'C'
325 # A deltatext either does or doesn't represent some change.
326 DELTATEXT_NONEMPTY = 'N'
327 DELTATEXT_EMPTY = 'E'
329 DIGEST_END_IDX = 9 + (sha.digestsize * 2)
331 # Constants used in SYMBOL_OPENINGS_CLOSINGS
332 OPENING = 'O'
333 CLOSING = 'C'
336 class FatalException(Exception):
337 """Exception thrown on a non-recoverable error.
339 If this exception is thrown by main(), it is caught by the global
340 layer of the program, its string representation is printed, and the
341 program is ended with an exit code of 1."""
343 pass
346 class FatalError(FatalException):
347 """A FatalException that prepends error_prefix to the message."""
349 def __init__(self, msg):
350 """Use (error_prefix + ': ' + MSG + '\n') as the error message."""
352 FatalException.__init__(self, '%s: %s\n' % (error_prefix, msg,))
355 def temp(basename):
356 """Return a path to BASENAME in Ctx().tmpdir.
357 This is a convenience function to save horizontal space in source."""
359 return os.path.join(Ctx().tmpdir, basename)
362 # Since CVS symbolic names can (unofficially) contain [/\], we translate
363 # those characters into ones that Subversion allows in pathnames.
364 def _clean_symbolic_name(name):
365 """Return symbolic name NAME, translating characters that Subversion
366 does not allow in a pathname."""
368 name = name.replace('/','++')
369 name = name.replace('\\','--')
370 return name
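# Illustrative sketch of the translation performed by _clean_symbolic_name:
#
#   _clean_symbolic_name('releases/1.0')  -> 'releases++1.0'
#   _clean_symbolic_name('win\\nt')       -> 'win--nt'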
373 def _path_join(*components):
374 """Join two or more pathname COMPONENTS, inserting '/' as needed.
375 Empty components are skipped."""
377 return '/'.join(filter(None, components))
380 def _path_split(path):
381 """Split the svn pathname PATH into a pair, (HEAD, TAIL).
383 This is similar to os.path.split(), but always uses '/' as path
384 separator. PATH is an svn path, which should not start with a '/'.
385 HEAD is everything before the last slash, and TAIL is everything
386 after. If PATH ends in a slash, TAIL will be empty. If there is no
387 slash in PATH, HEAD will be empty. If PATH is empty, both HEAD and
388 TAIL are empty."""
390 pos = path.rfind('/')
391 if pos == -1:
392 return ('', path,)
393 else:
394 return (path[:pos], path[pos+1:],)
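# Illustrative results of the two path helpers above (a sketch):
#
#   _path_join('trunk', 'src', 'foo.c')  -> 'trunk/src/foo.c'
#   _path_join('', 'foo.c')              -> 'foo.c'
#   _path_split('trunk/src/foo.c')       -> ('trunk/src', 'foo.c')
#   _path_split('foo.c')                 -> ('', 'foo.c')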
397 def to_utf8(value, mode='replace'):
398 """Return VALUE re-encoded as UTF-8, trying the encodings in Ctx.encoding
399 as valid source encodings. Raise UnicodeError on failure of all
400 source encodings."""
402 ### FIXME: The 'replace' default mode should be an option,
403 ### like --encoding is.
404 for encoding in Ctx().encoding:
405 try:
406 return unicode(value, encoding, mode).encode('utf8')
407 except UnicodeError:
408 Log().write(LOG_VERBOSE, "Encoding '%s' failed for string '%s'"
409 % (encoding, value))
410 raise UnicodeError
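# Illustrative sketch: with Ctx().encoding set (via --encoding) to, say,
# ['ascii', 'latin_1'], a Latin-1 input falls through to the second encoding:
#
#   to_utf8('na\xefve')   -> 'na\xc3\xafve'   (UTF-8 for u'na\xefve')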
413 ctrl_characters_regexp = re.compile('[\\\x00-\\\x1f\\\x7f]')
415 def verify_filename_legal(filename):
416 """Verify that FILENAME does not include any control characters. If
417 it does, raise a FatalError."""
419 m = ctrl_characters_regexp.search(filename)
420 if m:
421 raise FatalError(
422 "Character %r in filename %r is not supported by subversion."
423 % (m.group(), filename,))
426 def run_command(command):
427 if os.system(command):
428 raise FatalError('Command failed: "%s"' % (command,))
431 class CommandFailedException(Exception):
432 """Exception raised if check_command_runs() fails."""
434 pass
437 def check_command_runs(cmd, cmdname):
438 """Check whether the command CMD can be executed without errors.
440 CMD is a list or string, as accepted by SimplePopen. CMDNAME is the
441 name of the command as it should be included in exception error
442 messages.
444 This function checks three things: (1) the command can be run
445 without throwing an OSError; (2) it exits with status=0; (3) it
446 doesn't output anything to stderr. If any of these conditions is
447 not met, raise a CommandFailedException describing the problem."""
449 try:
450 pipe = SimplePopen(cmd, True)
451 except OSError, e:
452 raise CommandFailedException('error executing %s: %s' % (cmdname, e,))
453 pipe.stdin.close()
454 pipe.stdout.read()
455 errmsg = pipe.stderr.read()
456 status = pipe.wait()
457 if status or errmsg:
458 msg = 'error executing %s: status %s' % (cmdname, status,)
459 if errmsg:
460 msg += ', error output:\n%s' % (errmsg,)
461 raise CommandFailedException(msg)
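# Illustrative usage (a sketch, mirroring how CVSRepositoryViaRCS below
# verifies that RCS's 'co' binary is available):
#
#   try:
#     check_command_runs(['co', '-V'], 'co')
#   except CommandFailedException, e:
#     raise FatalError(str(e))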
464 class CVSRepository:
465 """A CVS repository from which data can be extracted."""
467 def __init__(self, cvs_repos_path):
468 """CVS_REPOS_PATH is the top of the CVS repository (at least as
469 far as this run is concerned)."""
471 if not os.path.isdir(cvs_repos_path):
472 raise FatalError("The specified CVS repository path '%s' is not an "
473 "existing directory." % cvs_repos_path)
475 self.cvs_repos_path = os.path.normpath(cvs_repos_path)
476 self.cvs_prefix_re = re.compile(
477 r'^' + re.escape(self.cvs_repos_path)
478 + r'(' + re.escape(os.sep) + r'|$)')
480 def get_cvs_path(self, fname):
481 """Return the path to FNAME relative to cvs_repos_path, with ',v' removed.
483 FNAME is a filesystem name that has to be within
484 self.cvs_repos_path. Return the filename relative to
485 self.cvs_repos_path, with ',v' stripped off if present, and with
486 os.sep converted to '/'."""
488 (tail, n) = self.cvs_prefix_re.subn('', fname, 1)
489 if n != 1:
490 raise FatalError(
491 "get_cvs_path: '%s' is not a sub-path of '%s'"
492 % (fname, self.cvs_repos_path,))
493 if tail.endswith(',v'):
494 tail = tail[:-2]
495 return tail.replace(os.sep, '/')
497 def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
498 """Return a command string, and a pipe from which the file
499 contents of C_REV can be read. C_REV is a CVSRevision. If
500 SUPPRESS_KEYWORD_SUBSTITUTION is True, then suppress the
501 substitution of RCS/CVS keywords in the output. Standard output
502 of the pipe returns the text of that CVS Revision.
504 The command string that is returned is provided for use in error
505 messages; it is not escaped in such a way that it could
506 necessarily be executed."""
508 raise NotImplementedError
511 class CVSRepositoryViaRCS(CVSRepository):
512 """A CVSRepository accessed via RCS."""
514 def __init__(self, cvs_repos_path):
515 CVSRepository.__init__(self, cvs_repos_path)
516 try:
517 check_command_runs([ 'co', '-V' ], 'co')
518 except CommandFailedException, e:
519 raise FatalError('%s\n'
520 'Please check that co is installed and in your PATH\n'
521 '(it is a part of the RCS software).' % (e,))
523 def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
524 pipe_cmd = [ 'co', '-q', '-x,v', '-p' + c_rev.rev ]
525 if suppress_keyword_substitution:
526 pipe_cmd.append('-kk')
527 pipe_cmd.append(c_rev.rcs_path())
528 pipe = SimplePopen(pipe_cmd, True)
529 pipe.stdin.close()
530 return ' '.join(pipe_cmd), pipe
533 class CVSRepositoryViaCVS(CVSRepository):
534 """A CVSRepository accessed via CVS."""
536 def __init__(self, cvs_repos_path):
537 CVSRepository.__init__(self, cvs_repos_path)
538 # Ascend above the specified root if necessary, to find the
539 # cvs_repository_root (a directory containing a CVSROOT directory)
540 # and the cvs_module (the path of the conversion root within the
541 # cvs repository) NB: cvs_module must be separated by '/' *not* by
542 # os.sep .
543 def is_cvs_repository_root(path):
544 return os.path.isdir(os.path.join(path, 'CVSROOT'))
546 self.cvs_repository_root = os.path.abspath(self.cvs_repos_path)
547 self.cvs_module = ""
548 while not is_cvs_repository_root(self.cvs_repository_root):
549 # Step up one directory:
550 prev_cvs_repository_root = self.cvs_repository_root
551 self.cvs_repository_root, module_component = \
552 os.path.split(self.cvs_repository_root)
553 if self.cvs_repository_root == prev_cvs_repository_root:
554 # Hit the root (of the drive, on Windows) without finding a
555 # CVSROOT dir.
556 raise FatalError(
557 "the path '%s' is not a CVS repository, nor a path "
558 "within a CVS repository. A CVS repository contains "
559 "a CVSROOT directory within its root directory."
560 % (self.cvs_repos_path,))
562 self.cvs_module = module_component + "/" + self.cvs_module
564 os.environ['CVSROOT'] = self.cvs_repository_root
566 def cvs_ok(global_arguments):
567 check_command_runs(
568 [ 'cvs' ] + global_arguments + [ '--version' ], 'cvs')
570 self.global_arguments = [ "-q", "-R" ]
571 try:
572 cvs_ok(self.global_arguments)
573 except CommandFailedException, e:
574 self.global_arguments = [ "-q" ]
575 try:
576 cvs_ok(self.global_arguments)
577 except CommandFailedException, e:
578 raise FatalError(
579 '%s\n'
580 'Please check that cvs is installed and in your PATH.' % (e,))
582 def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
583 pipe_cmd = [ 'cvs' ] + self.global_arguments + \
584 [ 'co', '-r' + c_rev.rev, '-p' ]
585 if suppress_keyword_substitution:
586 pipe_cmd.append('-kk')
587 pipe_cmd.append(self.cvs_module + c_rev.cvs_path)
588 pipe = SimplePopen(pipe_cmd, True)
589 pipe.stdin.close()
590 return ' '.join(pipe_cmd), pipe
593 def generate_ignores(c_rev):
594 # Read in props
595 pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(c_rev)
596 buf = pipe.stdout.read(PIPE_READ_SIZE)
597 raw_ignore_val = ""
598 while buf:
599 raw_ignore_val += buf
600 buf = pipe.stdout.read(PIPE_READ_SIZE)
601 pipe.stdout.close()
602 error_output = pipe.stderr.read()
603 exit_status = pipe.wait()
604 if exit_status:
605 raise FatalError("The command '%s' failed with exit status: %s\n"
606 "and the following output:\n"
607 "%s" % (pipe_cmd, exit_status, error_output))
609 # Tweak props: First, convert any spaces to newlines...
610 raw_ignore_val = '\n'.join(raw_ignore_val.split())
611 raw_ignores = raw_ignore_val.split('\n')
612 ignore_vals = [ ]
613 for ignore in raw_ignores:
614 # Reset the list if we encounter a '!'
615 # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
616 if ignore == '!':
617 ignore_vals = [ ]
618 continue
619 # Skip empty lines
620 if len(ignore) == 0:
621 continue
622 ignore_vals.append(ignore)
623 return ignore_vals
626 class KeyGenerator:
627 """Generate a series of unique strings."""
629 def __init__(self):
630 self.key_base = 0L
632 def gen_key(self):
633 """Generate and return a previously-unused key."""
635 key = '%x' % self.key_base
636 self.key_base += 1
638 return key
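# Illustrative sketch of the keys produced (unpadded lowercase hex):
#
#   kg = KeyGenerator()
#   kg.gen_key()   -> '0'
#   kg.gen_key()   -> '1'
#   ...            # after ten calls the keys continue 'a', 'b', ...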
641 # ============================================================================
642 # This code is copied with a few modifications from:
643 # subversion/subversion/bindings/swig/python/svn/core.py
645 if sys.platform == "win32":
646 _escape_shell_arg_re = re.compile(r'(\\+)(\"|$)')
648 def escape_shell_arg(arg):
649 # The (very strange) parsing rules used by the C runtime library are
650 # described at:
651 # http://msdn.microsoft.com/library/en-us/vclang/html/_pluslang_Parsing_C.2b2b_.Command.2d.Line_Arguments.asp
653 # double up slashes, but only if they are followed by a quote character
654 arg = re.sub(_escape_shell_arg_re, r'\1\1\2', arg)
656 # surround by quotes and escape quotes inside
657 arg = '"' + arg.replace('"', '"^""') + '"'
658 return arg
661 def argv_to_command_string(argv):
662 """Flatten a list of command line arguments into a command string.
664 The resulting command string is expected to be passed to the system
665 shell which os functions like popen() and system() invoke internally."""
668 # According to cmd's usage notes (cmd /?), it parses the command line by
669 # "seeing if the first character is a quote character and if so, stripping
670 # the leading character and removing the last quote character."
671 # So to prevent the argument string from being changed we add an extra set
672 # of quotes around it here.
673 return '"' + ' '.join(map(escape_shell_arg, argv)) + '"'
675 else:
676 def escape_shell_arg(arg):
677 return "'" + arg.replace("'", "'\\''") + "'"
679 def argv_to_command_string(argv):
680 """Flatten a list of command line arguments into a command string.
682 The resulting command string is expected to be passed to the system
683 shell which os functions like popen() and system() invoke internally."""
686 return ' '.join(map(escape_shell_arg, argv))
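# Illustrative sketch (POSIX branch): every argument is wrapped in single
# quotes, with embedded quotes escaped, so the shell passes it through verbatim:
#
#   argv_to_command_string(['echo', "it's here"])
#     produces the command string:  'echo' 'it'\''s here'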
689 # ============================================================================
691 def format_date(date):
692 """Return an svn-compatible date string for DATE (seconds since epoch).
694 A Subversion date looks like '2002-09-29T14:44:59.000000Z'."""
696 return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
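# Illustrative sketch, using the sample date from the docstring above:
#
#   format_date(1033310699)   -> '2002-09-29T14:44:59.000000Z'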
699 def sort_file(infilename, outfilename):
700 """Sort file INFILENAME, storing the results to OUTFILENAME."""
702 # GNU sort will sort our dates differently (incorrectly!) if our
703 # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
704 # it to 'C'
705 lc_all_tmp = os.environ.get('LC_ALL', None)
706 os.environ['LC_ALL'] = 'C'
707 try:
708 # The -T option to sort has a nice side effect. The Win32 sort is
709 # case insensitive and cannot be used, and since it does not
710 # understand the -T option and dies if we try to use it, there is
711 # no risk that we use that sort by accident.
712 run_command('sort -T %s %s > %s'
713 % (Ctx().tmpdir, infilename, outfilename))
714 finally:
715 if lc_all_tmp is None:
716 del os.environ['LC_ALL']
717 else:
718 os.environ['LC_ALL'] = lc_all_tmp
721 def match_regexp_list(regexp_list, s):
722 """Test whether string S matches any of the compiled regexps in
723 REGEXP_LIST."""
725 for regexp in regexp_list:
726 if regexp.match(s):
727 return True
728 return False
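# Illustrative sketch (note that match_regexp_list uses match(), which
# anchors at the start of the string):
#
#   regexps = [ re.compile('^RELEASE_'), re.compile('^vendor') ]
#   match_regexp_list(regexps, 'RELEASE_1_0')   -> True
#   match_regexp_list(regexps, 'experimental')  -> False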
731 class LF_EOL_Filter:
732 """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
733 into LFs only."""
735 def __init__(self, stream):
736 self.stream = stream
737 self.carry_cr = False
738 self.eof = False
740 def read(self, size):
741 while True:
742 buf = self.stream.read(size)
743 self.eof = len(buf) == 0
744 if self.carry_cr:
745 buf = '\r' + buf
746 self.carry_cr = False
747 if not self.eof and buf[-1] == '\r':
748 self.carry_cr = True
749 buf = buf[:-1]
750 buf = buf.replace('\r\n', '\n')
751 buf = buf.replace('\r', '\n')
752 if len(buf) > 0 or self.eof:
753 return buf
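# Illustrative sketch using an in-memory stream:
#
#   from StringIO import StringIO
#   f = LF_EOL_Filter(StringIO('one\r\ntwo\rthree\n'))
#   f.read(8192)   -> 'one\ntwo\nthree\n'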
756 # These constants represent the log levels that this script supports
757 LOG_WARN = -1
758 LOG_QUIET = 0
759 LOG_NORMAL = 1
760 LOG_VERBOSE = 2
762 class Log:
763 """A simple logging facility. Each line will be timestamped if
764 self.use_timestamps is True. This class is a Borg, see
765 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
767 __shared_state = {}
769 def __init__(self):
770 self.__dict__ = self.__shared_state
771 if self.__dict__:
772 return
773 self.log_level = LOG_NORMAL
774 # Set this to true if you want to see timestamps on each line output.
775 self.use_timestamps = None
776 self.logger = sys.stdout
778 def _timestamp(self):
779 """Output a detailed timestamp at the beginning of each line output."""
781 self.logger.write(time.strftime('[%Y-%m-%d %I:%M:%S %Z] - '))
783 def write(self, log_level, *args):
784 """This is the public method to use for writing to a file. Only
785 messages whose LOG_LEVEL is <= self.log_level will be printed. If
786 there are multiple ARGS, they will be separated by a space."""
788 if log_level > self.log_level:
789 return
790 if self.use_timestamps:
791 self._timestamp()
792 self.logger.write(' '.join(map(str,args)) + "\n")
793 # Ensure that log output doesn't get out-of-order with respect to
794 # stderr output.
795 self.logger.flush()
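# Illustrative usage (a sketch; because Log is a Borg, every Log() instance
# shares the same log_level and output stream):
#
#   Log().log_level = LOG_VERBOSE
#   Log().write(LOG_VERBOSE, 'Deleting', '/tmp/foo')   # prints "Deleting /tmp/foo"
#   Log().write(LOG_WARN, 'something went wrong')      # printed at any log level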
798 class Cleanup:
799 """This singleton class manages any files created by cvs2svn. When
800 you first create a file, call Cleanup.register, passing the
801 filename, and the last pass that you need the file. After the end
802 of that pass, your file will be cleaned up after running an optional
803 callback. This class is a Borg, see
804 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
806 __shared_state = {}
808 def __init__(self):
809 self.__dict__ = self.__shared_state
810 if self.__dict__:
811 return
812 self._log = {}
813 self._callbacks = {}
815 def register(self, file, which_pass, callback=None):
816 """Register FILE for cleanup at the end of WHICH_PASS, running
817 function CALLBACK prior to removal. Registering a given FILE is
818 idempotent; you may register as many times as you wish, but it
819 will only be cleaned up once.
821 Note that if a file is registered multiple times, only the first
822 callback registered for that file will be called at cleanup
823 time. Also note that if you register a database file you must
824 close the database before cleanup, e.g. using a callback."""
826 self._log.setdefault(which_pass, {})[file] = 1
827 if callback and not self._callbacks.has_key(file):
828 self._callbacks[file] = callback
830 def cleanup(self, which_pass):
831 """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
833 if not self._log.has_key(which_pass):
834 return
835 for file in self._log[which_pass]:
836 Log().write(LOG_VERBOSE, "Deleting", file)
837 if self._callbacks.has_key(file):
838 self._callbacks[file]()
839 os.unlink(file)
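# Illustrative usage (a sketch; 'cvs2svn-example.txt' is a hypothetical file
# and pass2 is one of the pass functions defined later in this script):
#
#   f = open(temp('cvs2svn-example.txt'), 'w')
#   Cleanup().register(temp('cvs2svn-example.txt'), pass2, f.close)
#   ...
#   Cleanup().cleanup(pass2)   # calls f.close(), then unlinks the file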
842 # Always use these constants for opening databases.
843 DB_OPEN_READ = 'r'
844 DB_OPEN_NEW = 'n'
847 class AbstractDatabase:
848 """An abstract base class for anydbm-based databases."""
850 def __init__(self, filename, mode):
851 """A convenience function for opening an anydbm database."""
853 # pybsddb3 has a bug which prevents it from working with
854 # Berkeley DB 4.2 if you open the db with 'n' ("new"). This
855 # causes the DB_TRUNCATE flag to be passed, which is disallowed
856 # for databases protected by lock and transaction support
857 # (bsddb databases use locking from bsddb version 4.2.4 onwards).
859 # Therefore, manually perform the removal (we can do this, because
860 # we know that for bsddb - but *not* anydbm in general - the database
861 # consists of one file with the name we specify, rather than several
862 # based on that name).
863 if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
864 if os.path.isfile(filename):
865 os.unlink(filename)
866 mode = 'c'
868 self.db = anydbm.open(filename, mode)
870 # Import implementations for many mapping interface methods. Note
871 # that we specifically do not do this for any method which handles
872 # *values*, because our derived classes define __getitem__ and
873 # __setitem__ to override the storage of values, and grabbing
874 # methods directly from the dbm object would bypass this.
875 for meth_name in ('__delitem__', 'keys',
876 '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
877 meth_ref = getattr(self.db, meth_name, None)
878 if meth_ref:
879 setattr(self, meth_name, meth_ref)
881 def __delitem__(self, key):
882 # gdbm defines a __delitem__ method, but it cannot be assigned. So
883 # this method provides a fallback definition via explicit delegation:
884 del self.db[key]
886 def __iter__(self):
887 for key in self.keys():
888 yield key
890 def has_key(self, key):
891 try:
892 self.db[key]
893 return True
894 except KeyError:
895 return False
897 def __contains__(self, key):
898 return self.has_key(key)
900 def iterkeys(self):
901 return self.__iter__()
903 def clear(self):
904 for key in self.keys():
905 del self[key]
907 def items(self):
908 return [(key, self[key],) for key in self.keys()]
910 def values(self):
911 return [self[key] for key in self.keys()]
913 def get(self, key, default=None):
914 try:
915 return self[key]
916 except KeyError:
917 return default
920 class SDatabase(AbstractDatabase):
921 """A database that can only store strings."""
923 def __getitem__(self, key):
924 return self.db[key]
926 def __setitem__(self, key, value):
927 self.db[key] = value
930 class Database(AbstractDatabase):
931 """A database that uses the marshal module to store built-in types."""
933 def __getitem__(self, key):
934 return marshal.loads(self.db[key])
936 def __setitem__(self, key, value):
937 self.db[key] = marshal.dumps(value)
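# Illustrative sketch of the difference between the two classes above:
# SDatabase stores raw strings, while Database marshals built-in Python
# types.  (Assumes Ctx().tmpdir is set up; the file name is hypothetical.)
#
#   db = Database(temp('cvs2svn-example.db'), DB_OPEN_NEW)
#   db['k'] = { 'tags' : ['T1', 'T2'], 'count' : 3 }
#   db['k']['count']   -> 3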
940 class StatsKeeper:
941 __shared_state = { }
943 def __init__(self):
944 self.__dict__ = self.__shared_state
945 if self.__dict__:
946 return
947 self.filename = temp(STATISTICS_FILE)
948 Cleanup().register(self.filename, pass8)
949 # This can get kinda large, so we don't store it in our data dict.
950 self.repos_files = { }
952 if os.path.exists(self.filename):
953 self.unarchive()
954 else:
955 self.data = { 'cvs_revs_count' : 0,
956 'tags': { },
957 'branches' : { },
958 'repos_size' : 0,
959 'repos_file_count' : 0,
960 'svn_rev_count' : None,
961 'first_rev_date' : 1L<<32,
962 'last_rev_date' : 0,
963 'pass_timings' : { },
964 'start_time' : 0,
965 'end_time' : 0,
966 }
968 def log_duration_for_pass(self, duration, pass_num):
969 self.data['pass_timings'][pass_num] = duration
971 def set_start_time(self, start):
972 self.data['start_time'] = start
974 def set_end_time(self, end):
975 self.data['end_time'] = end
977 def _bump_item(self, key, amount=1):
978 self.data[key] += amount
980 def reset_c_rev_info(self):
981 self.data['cvs_revs_count'] = 0
982 self.data['tags'] = { }
983 self.data['branches'] = { }
985 def record_c_rev(self, c_rev):
986 self._bump_item('cvs_revs_count')
988 for tag in c_rev.tags:
989 self.data['tags'][tag] = None
990 for branch in c_rev.branches:
991 self.data['branches'][branch] = None
993 if c_rev.timestamp < self.data['first_rev_date']:
994 self.data['first_rev_date'] = c_rev.timestamp
996 if c_rev.timestamp > self.data['last_rev_date']:
997 self.data['last_rev_date'] = c_rev.timestamp
999 # Only add the size if this is the first time we see the file.
1000 if not self.repos_files.has_key(c_rev.fname):
1001 self._bump_item('repos_size', c_rev.file_size)
1002 self.repos_files[c_rev.fname] = None
1004 self.data['repos_file_count'] = len(self.repos_files)
1006 def set_svn_rev_count(self, count):
1007 self.data['svn_rev_count'] = count
1009 def svn_rev_count(self):
1010 return self.data['svn_rev_count']
1012 def archive(self):
1013 open(self.filename, 'w').write(marshal.dumps(self.data))
1015 def unarchive(self):
1016 self.data = marshal.loads(open(self.filename, 'r').read())
1018 def __str__(self):
1019 svn_revs_str = ""
1020 if self.data['svn_rev_count'] is not None:
1021 svn_revs_str = ('Total SVN Commits: %10s\n'
1022 % self.data['svn_rev_count'])
1024 return ('\n' \
1025 'cvs2svn Statistics:\n' \
1026 '------------------\n' \
1027 'Total CVS Files: %10i\n' \
1028 'Total CVS Revisions: %10i\n' \
1029 'Total Unique Tags: %10i\n' \
1030 'Total Unique Branches: %10i\n' \
1031 'CVS Repos Size in KB: %10i\n' \
1032 '%s' \
1033 'First Revision Date: %s\n' \
1034 'Last Revision Date: %s\n' \
1035 '------------------' \
1036 % (self.data['repos_file_count'],
1037 self.data['cvs_revs_count'],
1038 len(self.data['tags']),
1039 len(self.data['branches']),
1040 (self.data['repos_size'] / 1024),
1041 svn_revs_str,
1042 time.ctime(self.data['first_rev_date']),
1043 time.ctime(self.data['last_rev_date']),
1044 ))
1046 def timings(self):
1047 passes = self.data['pass_timings'].keys()
1048 passes.sort()
1049 output = 'Timings:\n------------------\n'
1051 def desc(val):
1052 if val == 1: return "second"
1053 return "seconds"
1055 for pass_num in passes:
1056 duration = int(self.data['pass_timings'][pass_num])
1057 p_str = ('pass %d:%6d %s\n'
1058 % (pass_num, duration, desc(duration)))
1059 output += p_str
1061 total = int(self.data['end_time'] - self.data['start_time'])
1062 output += ('total: %6d %s' % (total, desc(total)))
1063 return output
1066 class LastSymbolicNameDatabase:
1067 """Passing every CVSRevision in s-revs to this class will result in
1068 a Database whose key is the last CVS Revision a symbolic name was
1069 seen in, and whose value is a list of all symbolic names that were
1070 last seen in that revision."""
1072 def __init__(self):
1073 self.symbols = {}
1075 # Once we've gone through all the revs,
1076 # symbols.keys() will be a list of all tags and branches, and
1077 # their corresponding values will be a key into the last CVS revision
1078 # that they were used in.
1079 def log_revision(self, c_rev):
1080 # Gather last CVS Revision for symbolic name info and tag info
1081 for tag in c_rev.tags:
1082 self.symbols[tag] = c_rev.unique_key()
1083 if c_rev.op != OP_DELETE:
1084 for branch in c_rev.branches:
1085 self.symbols[branch] = c_rev.unique_key()
1087 # Creates an inversion of symbols above--a dictionary of lists (key
1088 # = CVS rev unique_key: val = list of symbols that close in that
1089 # rev).
1090 def create_database(self):
1091 symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), DB_OPEN_NEW)
1092 Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
1093 for sym, rev_unique_key in self.symbols.items():
1094 ary = symbol_revs_db.get(rev_unique_key, [])
1095 ary.append(sym)
1096 symbol_revs_db[rev_unique_key] = ary
1099 class CVSRevisionDatabase:
1100 """A Database to store CVSRevision objects and retrieve them by their
1101 unique_key()."""
1103 def __init__(self, mode):
1104 """Initialize an instance, opening database in MODE (like the MODE
1105 argument to Database or anydbm.open())."""
1107 self.cvs_revs_db = SDatabase(temp(CVS_REVS_DB), mode)
1108 Cleanup().register(temp(CVS_REVS_DB), pass8)
1110 def log_revision(self, c_rev):
1111 """Add C_REV, a CVSRevision, to the database."""
1113 self.cvs_revs_db[c_rev.unique_key()] = str(c_rev)
1115 def get_revision(self, unique_key):
1116 """Return the CVSRevision stored under UNIQUE_KEY."""
1118 return CVSRevision(Ctx(), self.cvs_revs_db[unique_key])
1121 class TagsDatabase:
1122 """A Database to record symbolic names that are tags.
1124 Each key is a tag name. The value has no meaning, and is set to the
1125 empty string. (Since an SDatabase is used, the key cannot be set to
1126 None.)"""
1128 def __init__(self, mode):
1129 self.db = SDatabase(temp(TAGS_DB), mode)
1130 Cleanup().register(temp(TAGS_DB), pass8)
1132 def add(self, item):
1133 self.db[item] = ''
1135 def remove(self, item):
1136 del self.db[item]
1138 def __contains__(self, item):
1139 return self.db.has_key(item)
1142 class Project:
1143 """A project within a CVS repository."""
1145 def __init__(self, project_cvs_repos_path,
1146 trunk_path, branches_path, tags_path):
1147 """Create a new Project record.
1149 PROJECT_CVS_REPOS_PATH is the main CVS directory for this project
1150 (within the filesystem). TRUNK_PATH, BRANCHES_PATH, and TAGS_PATH
1151 are the full, normalized directory names in svn for the
1152 corresponding part of the repository."""
1154 self.project_cvs_repos_path = project_cvs_repos_path
1155 prefix = Ctx().cvs_repository.cvs_repos_path
1156 if not self.project_cvs_repos_path.startswith(prefix):
1157 raise FatalError("Project '%s' must start with '%s'"
1158 % (self.project_cvs_repos_path, prefix,))
1159 # The project's main directory as a cvs_path:
1160 self.project_cvs_path = self.project_cvs_repos_path[len(prefix):]
1161 if self.project_cvs_path.startswith(os.sep):
1162 self.project_cvs_path = self.project_cvs_path[1:]
1163 self.trunk_path = trunk_path
1164 self.branches_path = branches_path
1165 self.tags_path = tags_path
1166 verify_paths_disjoint(self.trunk_path, self.branches_path, self.tags_path)
1168 def is_source(self, svn_path):
1169 """Return True iff SVN_PATH is a legitimate source for this project.
1171 Legitimate paths are self.trunk_path or any directory directly
1172 under self.branches_path."""
1174 if svn_path == self.trunk_path:
1175 return True
1177 (head, tail,) = _path_split(svn_path)
1178 if head == self.branches_path:
1179 return True
1181 return False
1183 def is_unremovable(self, svn_path):
1184 """Return True iff the specified path must not be removed."""
1186 return svn_path in [self.trunk_path, self.branches_path, self.tags_path]
1188 def get_branch_path(self, branch_name):
1189 """Return the svnpath for the branch named BRANCH_NAME."""
1191 return _path_join(self.branches_path, _clean_symbolic_name(branch_name))
1193 def get_tag_path(self, tag_name):
1194 """Return the svnpath for the tag named TAG_NAME."""
1196 return _path_join(self.tags_path, _clean_symbolic_name(tag_name))
1198 def _relative_name(self, cvs_path):
1199 """Convert CVS_PATH into a name relative to this project's root directory.
1201 CVS_PATH has to begin (textually) with self.project_cvs_path.
1202 Remove prefix and optional '/'."""
1204 if not cvs_path.startswith(self.project_cvs_path):
1205 raise FatalError(
1206 "_relative_name: '%s' is not a sub-path of '%s'"
1207 % (cvs_path, self.project_cvs_path,))
1208 l = len(self.project_cvs_path)
1209 if cvs_path[l] == os.sep:
1210 l += 1
1211 return cvs_path[l:]
1213 def make_trunk_path(self, cvs_path):
1214 """Return the trunk path for CVS_PATH.
1216 Return the svn path for this file on trunk."""
1218 return _path_join(self.trunk_path, self._relative_name(cvs_path))
1220 def make_branch_path(self, branch_name, cvs_path):
1221 """Return the svn path for CVS_PATH on branch BRANCH_NAME."""
1223 return _path_join(self.get_branch_path(branch_name),
1224 self._relative_name(cvs_path))
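# Illustrative sketch (hypothetical configuration): for a Project whose
# project_cvs_path is 'proj', with trunk_path='trunk',
# branches_path='branches' and tags_path='tags':
#
#   project.make_trunk_path('proj/src/foo.c')           -> 'trunk/src/foo.c'
#   project.make_branch_path('B1/x', 'proj/src/foo.c')  -> 'branches/B1++x/src/foo.c'
#   project.get_tag_path('REL_1_0')                     -> 'tags/REL_1_0'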
1227 class CVSRevision:
1228 def __init__(self, ctx, *args):
1229 """Initialize a new CVSRevision with Ctx object CTX, and ARGS.
1231 If CTX is None, the following members and methods of the
1232 instantiated CVSRevision class object will be unavailable (or
1233 simply will not work correctly, if at all):
1234 cvs_path
1235 svn_path
1236 is_default_branch_revision()
1238 (Note that this class treats CTX as const, because the caller
1239 likely passed in a Borg instance of a Ctx. The reason this class
1240 takes CTX as as a parameter, instead of just instantiating a Ctx
1241 itself, is that this class should be usable outside cvs2svn.)
1243 If there is one argument in ARGS, it is a string, in the format of
1244 a line from a revs file. Do *not* include a trailing newline.
1246 If there are multiple ARGS, there must be 17 of them,
1247 comprising a parsed revs line:
1248 timestamp --> (int) date stamp for this cvs revision
1249 digest --> (string) digest of author+logmsg
1250 prev_timestamp --> (int) date stamp for the previous cvs revision
1251 next_timestamp --> (int) date stamp for the next cvs revision
1252 op --> (char) OP_ADD, OP_CHANGE, or OP_DELETE
1253 prev_rev --> (string or None) previous CVS rev, e.g., "1.2"
1254 rev --> (string) this CVS rev, e.g., "1.3"
1255 next_rev --> (string or None) next CVS rev, e.g., "1.4"
1256 file_in_attic --> (char or None) true if RCS file is in Attic
1257 file_executable --> (char or None) true if RCS file has exec bit set.
1258 file_size --> (int) size of the RCS file
1259 deltatext_code --> (char) 'N' if non-empty deltatext, else 'E'
1260 fname --> (string) relative path of file in CVS repos
1261 mode --> (string or None) "kkv", "kb", etc.
1262 branch_name --> (string or None) branch on which this rev occurred
1263 tags --> (list of strings) all tags on this revision
1264 branches --> (list of strings) all branches rooted in this rev
1266 The two forms of initialization are equivalent.
1268 WARNING: Due to the resync process in pass2, prev_timestamp or
1269 next_timestamp may be incorrect in the c-revs or s-revs files."""
1271 self._ctx = ctx
1272 if len(args) == 17:
1273 (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
1274 self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
1275 self.file_executable, self.file_size, self.deltatext_code,
1276 self.fname,
1277 self.mode, self.branch_name, self.tags, self.branches) = args
1278 elif len(args) == 1:
1279 data = args[0].split(' ', 15)
1280 (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
1281 self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
1282 self.file_executable, self.file_size, self.deltatext_code,
1283 self.mode, self.branch_name, numtags, remainder) = data
1284 # Patch up data items which are not simple strings
1285 self.timestamp = int(self.timestamp, 16)
1286 if self.prev_timestamp == "*":
1287 self.prev_timestamp = 0
1288 else:
1289 self.prev_timestamp = int(self.prev_timestamp)
1290 if self.next_timestamp == "*":
1291 self.next_timestamp = 0
1292 else:
1293 self.next_timestamp = int(self.next_timestamp)
1294 if self.prev_rev == "*":
1295 self.prev_rev = None
1296 if self.next_rev == "*":
1297 self.next_rev = None
1298 if self.file_in_attic == "*":
1299 self.file_in_attic = None
1300 if self.file_executable == "*":
1301 self.file_executable = None
1302 self.file_size = int(self.file_size)
1303 if self.mode == "*":
1304 self.mode = None
1305 if self.branch_name == "*":
1306 self.branch_name = None
1307 numtags = int(numtags)
1308 tags_and_numbranches_and_remainder = remainder.split(' ', numtags + 1)
1309 self.tags = tags_and_numbranches_and_remainder[:-2]
1310 numbranches = int(tags_and_numbranches_and_remainder[-2])
1311 remainder = tags_and_numbranches_and_remainder[-1]
1312 branches_and_fname = remainder.split(' ', numbranches)
1313 self.branches = branches_and_fname[:-1]
1314 self.fname = branches_and_fname[-1]
1315 else:
1316 raise TypeError, 'CVSRevision() takes 2 or 18 arguments (%d given)' % \
1317 (len(args) + 1)
1318 if ctx is not None:
1319 self.cvs_path = ctx.cvs_repository.get_cvs_path(self.fname)
1320 if self.branch_name:
1321 self.svn_path = ctx.project.make_branch_path(self.branch_name,
1322 self.cvs_path)
1323 else:
1324 self.svn_path = ctx.project.make_trunk_path(self.cvs_path)
1326 # The 'primary key' of a CVS Revision is the revision number + the
1327 # filename. To provide a unique key (say, for a dict), we just glom
1328 # them together in a string. By passing in self.prev_rev or
1329 # self.next_rev, you can get the unique key for their respective
1330 # CVSRevisions.
1331 def unique_key(self, revnum="0"):
1332 if revnum == "0":
1333 revnum = self.rev
1334 elif revnum is None:
1335 return None
1336 return revnum + "/" + self.fname
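# Illustrative sketch (hypothetical file): for a CVSRevision with rev '1.3'
# and fname 'proj/src/foo.c,v':
#
#   c_rev.unique_key()                 -> '1.3/proj/src/foo.c,v'
#   c_rev.unique_key(c_rev.prev_rev)   -> '1.2/proj/src/foo.c,v'   (if prev_rev is '1.2')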
1338 def __str__(self):
1339 return ('%08lx %s %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s'
1340 % (self.timestamp, self.digest, self.prev_timestamp or "*",
1341 self.next_timestamp or "*", self.op, (self.prev_rev or "*"),
1342 self.rev, (self.next_rev or "*"), (self.file_in_attic or "*"),
1343 (self.file_executable or "*"),
1344 self.file_size,
1345 self.deltatext_code, (self.mode or "*"),
1346 (self.branch_name or "*"),
1347 len(self.tags), self.tags and " " or "", " ".join(self.tags),
1348 len(self.branches), self.branches and " " or "",
1349 " ".join(self.branches),
1350 self.fname, ))
1352 # Returns true if this CVSRevision is the opening CVSRevision for
1353 # NAME (for this RCS file).
1354 def opens_symbolic_name(self, name):
1355 if name in self.tags:
1356 return 1
1357 if name in self.branches:
1358 # If this c_rev opens a branch and our op is OP_DELETE, then
1359 # that means that the file that this c_rev belongs to was
1360 # created on the branch, so for all intents and purposes, this
1361 # c_rev is *technically* not an opening. See Issue #62 for more
1362 # information.
1363 if self.op != OP_DELETE:
1364 return 1
1365 return 0
1367 def is_default_branch_revision(self):
1368 """Return 1 if SELF.rev of SELF.cvs_path is a default branch
1369 revision according to DEFAULT_BRANCHES_DB (see the conditions
1370 documented there), else return None."""
1372 val = self._ctx._default_branches_db.get(self.cvs_path, None)
1373 if val is not None:
1374 val_last_dot = val.rindex(".")
1375 our_last_dot = self.rev.rindex(".")
1376 default_branch = val[:val_last_dot]
1377 our_branch = self.rev[:our_last_dot]
1378 default_rev_component = int(val[val_last_dot + 1:])
1379 our_rev_component = int(self.rev[our_last_dot + 1:])
1380 if (default_branch == our_branch
1381 and our_rev_component <= default_rev_component):
1382 return 1
1383 # else
1384 return None
1386 def rcs_path(self):
1387 """Returns the actual filesystem path to the RCS file of this
1388 CVSRevision."""
1390 if self.file_in_attic is None:
1391 return self.fname
1392 else:
1393 basepath, filename = os.path.split(self.fname)
1394 return os.path.join(basepath, 'Attic', filename)
1396 def filename(self):
1397 """Return the last path component of self.fname, minus the ',v'."""
1399 return os.path.split(self.fname)[-1][:-2]
1402 class SymbolDatabase:
1403 """This database records information on all symbols in the RCS
1404 files. It is created in pass 1 and it is used in pass 2."""
1406 def __init__(self):
1407 # A hash that maps tag names to commit counts
1408 self.tags = { }
1409 # A hash that maps branch names to lists of the format
1410 # [ create_count, commit_count, blockers ], where blockers
1411 # is a hash that lists the symbols that depend on the
1412 # the branch. The blockers hash is used as a set, so the
1413 # values are not used.
1414 self.branches = { }
1416 def register_tag_creation(self, name):
1417 """Register the creation of the tag NAME."""
1419 self.tags[name] = self.tags.get(name, 0) + 1
1421 def _branch(self, name):
1422 """Helper function to get a branch node that will create and
1423 initialize the node if it does not exist."""
1425 if not self.branches.has_key(name):
1426 self.branches[name] = [ 0, 0, { } ]
1427 return self.branches[name]
1429 def register_branch_creation(self, name):
1430 """Register the creation of the branch NAME."""
1432 self._branch(name)[0] += 1
1434 def register_branch_commit(self, name):
1435 """Register a commit on the branch NAME."""
1437 self._branch(name)[1] += 1
1439 def register_branch_blocker(self, name, blocker):
1440 """Register BLOCKER as a blocker on the branch NAME."""
1442 self._branch(name)[2][blocker] = None
1444 def branch_has_commit(self, name):
1445 """Return non-zero if NAME has commits. Returns 0 if name
1446 is not a branch or if it has no commits."""
1448 return self.branches.has_key(name) and self.branches[name][1]
1450 def find_excluded_symbols(self, regexp_list):
1451 """Returns a hash of all symbols that match the regexps in
1452 REGEXP_LIST. The hash is used as a set so the values are
1453 not used."""
1455 excludes = { }
1456 for tag in self.tags:
1457 if match_regexp_list(regexp_list, tag):
1458 excludes[tag] = None
1459 for branch in self.branches:
1460 if match_regexp_list(regexp_list, branch):
1461 excludes[branch] = None
1462 return excludes
1464 def find_branch_exclude_blockers(self, branch, excludes):
1465 """Find all blockers of BRANCH, excluding the ones in the hash
1466 EXCLUDES."""
1468 blockers = { }
1469 if excludes.has_key(branch):
1470 for blocker in self.branches[branch][2]:
1471 if not excludes.has_key(blocker):
1472 blockers[blocker] = None
1473 return blockers
1475 def find_blocked_excludes(self, excludes):
1476 """Find all branches not in EXCLUDES that have blocking symbols that
1477 are not themselves excluded. Return a hash that maps branch names
1478 to a hash of blockers. The hash of blockers is used as a set so the
1479 values are not used."""
1481 blocked_branches = { }
1482 for branch in self.branches:
1483 blockers = self.find_branch_exclude_blockers(branch, excludes)
1484 if blockers:
1485 blocked_branches[branch] = blockers
1486 return blocked_branches
1488 def find_mismatches(self, excludes=None):
1489 """Find all symbols that are defined as both tags and branches,
1490 excluding the ones in EXCLUDES. Returns a list of 4-tuples with
1491 the symbol name, tag count, branch count and commit count."""
1493 if excludes is None:
1494 excludes = { }
1495 mismatches = [ ]
1496 for branch in self.branches:
1497 if not excludes.has_key(branch) and self.tags.has_key(branch):
1498 mismatches.append((branch, # name
1499 self.tags[branch], # tag count
1500 self.branches[branch][0], # branch count
1501 self.branches[branch][1])) # commit count
1502 return mismatches
1504 def read(self):
1505 """Read the symbol database from files."""
1507 f = open(temp(TAGS_LIST))
1508 while 1:
1509 line = f.readline()
1510 if not line:
1511 break
1512 tag, count = line.split()
1513 self.tags[tag] = int(count)
1515 f = open(temp(BRANCHES_LIST))
1516 while 1:
1517 line = f.readline()
1518 if not line:
1519 break
1520 words = line.split()
1521 self.branches[words[0]] = [ int(words[1]), int(words[2]), { } ]
1522 for blocker in words[3:]:
1523 self.branches[words[0]][2][blocker] = None
1525 def write(self):
1526 """Store the symbol database to files."""
1528 f = open(temp(TAGS_LIST), "w")
1529 Cleanup().register(temp(TAGS_LIST), pass2)
1530 for tag, count in self.tags.items():
1531 f.write("%s %d\n" % (tag, count))
1533 f = open(temp(BRANCHES_LIST), "w")
1534 Cleanup().register(temp(BRANCHES_LIST), pass2)
1535 for branch, info in self.branches.items():
1536 f.write("%s %d %d" % (branch, info[0], info[1]))
1537 if info[2]:
1538 f.write(" ")
1539 f.write(" ".join(info[2].keys()))
1540 f.write("\n")
1543 class FileDataCollector(cvs2svn_rcsparse.Sink):
1544 """Class responsible for collecting RCS data for a particular file.
1546 Any collected data that need to be remembered are stored into the
1547 referenced CollectData instance."""
1549 def __init__(self, collect_data, canonical_name, filename):
1550 """Create an object that is prepared to receive data for FILENAME.
1551 FILENAME is the absolute filesystem path to the file in question,
1552 and CANONICAL_NAME is FILENAME with the 'Attic' component removed
1553 (if the file is indeed in the Attic). COLLECT_DATA is used to
1554 store the information collected about the file."""
1556 self.collect_data = collect_data
1558 self.fname = canonical_name
1560 # We calculate and save some file metadata here, where we can do
1561 # it only once per file, instead of waiting until later where we
1562 # would have to do the same calculations once per CVS *revision*.
1564 self.cvs_path = Ctx().cvs_repository.get_cvs_path(self.fname)
1566 # If the paths are not the same, then that means that the
1567 # canonical_name has had the 'Attic' component stripped out.
1568 self.file_in_attic = None
1569 if canonical_name != filename:
1570 self.file_in_attic = 1
1572 file_stat = os.stat(filename)
1573 # The size of our file in bytes
1574 self.file_size = file_stat[stat.ST_SIZE]
1576 # Whether or not the executable bit is set.
1577 self.file_executable = None
1578 if file_stat[0] & stat.S_IXUSR:
1579 self.file_executable = 1
1581 # revision -> [timestamp, author, old-timestamp]
1582 self.rev_data = { }
1584 # Maps revision number (key) to the revision number of the
1585 # previous revision along this line of development.
1587 # For the first revision R on a branch, we consider the revision
1588 # from which R sprouted to be the 'previous'.
1590 # Note that this revision can't be determined arithmetically (due
1591 # to cvsadmin -o, which is why this is necessary).
1593 # If the key has no previous revision, then store None as key's
1594 # value.
1595 self.prev_rev = { }
1597 # This dict is essentially self.prev_rev with the values mapped in
1598 # the other direction, so following key -> value will yield you
1599 # the next revision number.
1601 # Unlike self.prev_rev, if the key has no next revision, then the
1602 # key is not present.
1603 self.next_rev = { }
1605 # Track the state of each revision so that in set_revision_info,
1606 # we can determine if our op is an add/change/delete. We can do
1607 # this because in set_revision_info, we'll have all of the
1608 # revisions for a file at our fingertips, and we need to examine
1609 # the state of our prev_rev to determine if we're an add or a
1610 # change--without the state of the prev_rev, we are unable to
1611 # distinguish between an add and a change.
1612 self.rev_state = { }
1614 # Hash mapping branch numbers, like '1.7.2', to branch names,
1615 # like 'Release_1_0_dev'.
1616 self.branch_names = { }
1618 # RCS flags (used for keyword expansion).
1619 self.mode = None
1621 # Hash mapping revision numbers, like '1.7', to lists of names
1622 # indicating which branches sprout from that revision, like
1623 # ['Release_1_0_dev', 'experimental_driver', ...].
1624 self.branchlist = { }
1626 # Like self.branchlist, but the values are lists of tag names that
1627 # apply to the key revision.
1628 self.taglist = { }
1630 # If set, this is an RCS branch number -- rcsparse calls this the
1631 # "principal branch", but CVS and RCS refer to it as the "default
1632 # branch", so that's what we call it, even though the rcsparse API
1633 # setter method is still 'set_principal_branch'.
1634 self.default_branch = None
1636 # If the RCS file doesn't have a default branch anymore, but does
1637 # have vendor revisions, then we make an educated guess that those
1638 # revisions *were* the head of the default branch up until the
1639 # commit of 1.2, at which point the file's default branch became
1640 # trunk. This records the date at which 1.2 was committed.
1641 self.first_non_vendor_revision_date = None
1643 # A list of all symbols defined for the current file. Used to
1644 # prevent multiple definitions of a symbol, something which can
1645 # easily happen when --symbol-transform is used.
1646 self.defined_symbols = { }
1648 def set_principal_branch(self, branch):
1649 self.default_branch = branch
1651 def set_expansion(self, mode):
1652 self.mode = mode
1654 def set_branch_name(self, branch_number, name):
1655 """Record that BRANCH_NUMBER is the branch number for branch NAME,
1656 and that NAME sprouts from BRANCH_NUMBER.
1657 BRANCH_NUMBER is an RCS branch number with an odd number of components,
1658 for example '1.7.2' (never '1.7.0.2')."""
1660 if not self.branch_names.has_key(branch_number):
1661 self.branch_names[branch_number] = name
1662 # The branchlist is keyed on the revision number from which the
1663 # branch sprouts, so strip off the odd final component.
1664 sprout_rev = branch_number[:branch_number.rfind(".")]
1665 self.branchlist.setdefault(sprout_rev, []).append(name)
1666 self.collect_data.symbol_db.register_branch_creation(name)
1667 else:
1668 sys.stderr.write("%s: in '%s':\n"
1669 " branch '%s' already has name '%s',\n"
1670 " cannot also have name '%s', ignoring the latter\n"
1671 % (warning_prefix, self.fname, branch_number,
1672 self.branch_names[branch_number], name))
1674 def rev_to_branch_name(self, revision):
1675 """Return the name of the branch on which REVISION lies.
1676 REVISION is a non-branch revision number with an even number of
1677 components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1678 For the convenience of callers, REVISION can also be a trunk
1679 revision such as '1.2', in which case just return None."""
1681 if trunk_rev.match(revision):
1682 return None
1683 return self.branch_names.get(revision[:revision.rindex(".")])
1685 def define_tag(self, name, revision):
1686 """Record a bidirectional mapping between symbolic NAME and REVISION.
1687 REVISION is an unprocessed revision number from the RCS file's
1688 header, for example '1.7', '1.7.0.2', '1.1.1', or '1.1.1.1'.
1689 This function will determine what kind of symbolic name it is by
1690 inspection, and record it in the right places."""
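# An illustrative sketch (the exact regexes used below are defined
# elsewhere in this file): a REVISION of the form '1.7.0.2' is a CVS
# magic branch number, one like '1.1.1' is an RCS branch number, and
# anything else (e.g. '1.7' or '1.1.1.1') is recorded as a tag on
# that revision.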
1692 for (pattern, replacement) in Ctx().symbol_transforms:
1693 newname = pattern.sub(replacement, name)
1694 if newname != name:
1695 Log().write(LOG_WARN, " symbol '%s' transformed to '%s'"
1696 % (name, newname))
1697 name = newname
1698 if self.defined_symbols.has_key(name):
1699 err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
1700 % (error_prefix, name, self.fname)
1701 sys.stderr.write(err + "\n")
1702 self.collect_data.fatal_errors.append(err)
1703 self.defined_symbols[name] = None
1704 m = cvs_branch_tag.match(revision)
1705 if m:
1706 self.set_branch_name(m.group(1) + m.group(2), name)
1707 elif rcs_branch_tag.match(revision):
1708 self.set_branch_name(revision, name)
1709 else:
1710 self.taglist.setdefault(revision, []).append(name)
1711 self.collect_data.symbol_db.register_tag_creation(name)
1713 def define_revision(self, revision, timestamp, author, state,
1714 branches, next):
1715 # Record the state of our revision for later calculations
1716 self.rev_state[revision] = state
1718 # store the rev_data as a list in case we have to jigger the timestamp
1719 self.rev_data[revision] = [int(timestamp), author, None]
1721 # When on trunk, the RCS 'next' revision number points to what
1722 # humans might consider to be the 'previous' revision number. For
1723 # example, 1.3's RCS 'next' is 1.2.
1725 # However, on a branch, the RCS 'next' revision number really does
1726 # point to what humans would consider to be the 'next' revision
1727 # number. For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
1729 # In other words, in RCS, 'next' always means "where to find the next
1730 # deltatext that you need this revision to retrieve."
1732 # That said, we don't *want* RCS's behavior here, so we determine
1733 # whether we're on trunk or a branch and set self.prev_rev
1734 # accordingly.
1736 # One last thing. Note that if REVISION is a branch revision,
1737 # instead of mapping REVISION to NEXT, we instead map NEXT to
1738 # REVISION. Since we loop over all revisions in the file before
1739 # doing anything with the data we gather here, this 'reverse
1740 # assignment' effectively does the following:
1742 # 1. Gives us no 'prev' value for REVISION (in this
1743 # iteration... it may have been set in a previous iteration)
1745 # 2. Sets the 'prev' value for the revision with number NEXT to
1746 # REVISION. So when we come around to the branch revision whose
1747 # revision value is NEXT, its 'prev' and 'prev_rev' are already
1748 # set.
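# A hypothetical example of the two cases below: for trunk revision
# '1.3' whose RCS 'next' is '1.2', we record prev_rev['1.3'] = '1.2'
# and next_rev['1.2'] = '1.3'; for branch revision '1.1.2.1' whose
# RCS 'next' is '1.1.2.2', we record prev_rev['1.1.2.2'] = '1.1.2.1'
# and next_rev['1.1.2.1'] = '1.1.2.2'.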
1749 if trunk_rev.match(revision):
1750 self.prev_rev[revision] = next
1751 self.next_rev[next] = revision
1752 elif next:
1753 self.prev_rev[next] = revision
1754 self.next_rev[revision] = next
1756 for b in branches:
1757 self.prev_rev[b] = revision
1759 # Ratchet up the highest vendor head revision, if necessary.
1760 if self.default_branch:
1761 default_branch_root = self.default_branch + "."
1762 if ((revision.find(default_branch_root) == 0)
1763 and (default_branch_root.count('.') == revision.count('.'))):
1764 # This revision is on the default branch, so record that it is
1765 # the new highest default branch head revision.
1766 self.collect_data.default_branches_db[self.cvs_path] = revision
1767 else:
1768 # No default branch, so make an educated guess.
1769 if revision == '1.2':
1770 # This is probably the time when the file stopped having a
1771 # default branch, so make a note of it.
1772 self.first_non_vendor_revision_date = timestamp
1773 else:
1774 m = vendor_revision.match(revision)
1775 if m and ((not self.first_non_vendor_revision_date)
1776 or (timestamp < self.first_non_vendor_revision_date)):
1777 # We're looking at a vendor revision, and it wasn't
1778 # committed after this file lost its default branch, so bump
1779 # the maximum trunk vendor revision in the permanent record.
1780 self.collect_data.default_branches_db[self.cvs_path] = revision
1782 if not trunk_rev.match(revision):
1783 # Check for unlabeled branches, record them. We tried to collect
1784 # all branch names when we parsed the symbolic name header
1785 # earlier, of course, but that didn't catch unlabeled branches.
1786 # If a branch is unlabeled, this is our first encounter with it,
1787 # so we have to record its data now.
1788 branch_number = revision[:revision.rindex(".")]
1789 if not self.branch_names.has_key(branch_number):
1790 branch_name = "unlabeled-" + branch_number
1791 self.set_branch_name(branch_number, branch_name)
1793 # Register the commit on this non-trunk branch
1794 branch_name = self.branch_names[branch_number]
1795 self.collect_data.symbol_db.register_branch_commit(branch_name)
1797 def tree_completed(self):
1798 """The revision tree has been parsed. Analyze it for consistency."""
1800 # Our algorithm depends upon the timestamps on the revisions occurring
1801 # monotonically over time. That is, we want to see rev 1.34 occur in
1802 # time before rev 1.35. If we inserted 1.35 *first* (due to the time-
1803 # sorting), and then tried to insert 1.34, we'd be screwed.
1805 # To perform the analysis, we simply visit all of the 'previous'
1806 # links that we have recorded and validate that the previous
1807 # revision's timestamp is earlier than the specified revision's.
1809 # if we have to resync some nodes, then we restart the scan. just keep
1810 # looping as long as we need to restart.
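# A worked sketch with made-up numbers: suppose
#   self.prev_rev == {'1.1': None, '1.2': '1.1'}
#   self.rev_data == {'1.1': [100, 'jrandom', None],
#                     '1.2': [ 90, 'jrandom', None]}
# Revision 1.1 (the 'previous' of 1.2) appears to be newer than 1.2,
# so the loop below pushes 1.1 back to timestamp 89 and remembers its
# old timestamp:
#   self.rev_data == {'1.1': [89, 'jrandom', 100],
#                     '1.2': [90, 'jrandom', None]}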
1811 while 1:
1812 for current, prev in self.prev_rev.items():
1813 if not prev:
1814 # no previous revision exists (i.e. the initial revision)
1815 continue
1816 t_c = self.rev_data[current][0]
1817 t_p = self.rev_data[prev][0]
1818 if t_p >= t_c:
1819 # the previous revision occurred later than the current revision.
1820 # shove the previous revision back in time (and any before it that
1821 # may need to shift).
1823 # We sync backwards and not forwards because any given CVS
1824 # Revision has only one previous revision. However, a CVS
1825 # Revision can *be* a previous revision for many other
1826 # revisions (e.g., a revision that is the source of multiple
1827 # branches). This becomes relevant when we do the secondary
1828 # synchronization in pass 2--we can make certain that we
1829 # don't resync a revision earlier than its previous
1830 # revision, but it would be non-trivial to make sure that we
1831 # don't resync revision R *after* any revisions that have R
1832 # as a previous revision.
1833 while t_p >= t_c:
1834 self.rev_data[prev][0] = t_c - 1 # new timestamp
1835 self.rev_data[prev][2] = t_p # old timestamp
1836 delta = t_c - 1 - t_p
1837 msg = "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
1838 % (self.cvs_path, prev, time.ctime(t_p), delta)
1839 Log().write(LOG_VERBOSE, msg)
1840 if (delta > COMMIT_THRESHOLD
1841 or delta < (COMMIT_THRESHOLD * -1)):
1842 Log().write(LOG_WARN,
1843 "%s: Significant timestamp change for '%s' "
1844 "(%d seconds)"
1845 % (warning_prefix, self.cvs_path, delta))
1846 current = prev
1847 prev = self.prev_rev[current]
1848 if not prev:
1849 break
1850 t_c -= 1 # self.rev_data[current][0]
1851 t_p = self.rev_data[prev][0]
1853 # break from the for-loop
1854 break
1855 else:
1856 # finished the for-loop (no resyncing was performed)
1857 return
1859 def set_revision_info(self, revision, log, text):
1860 timestamp, author, old_ts = self.rev_data[revision]
1861 digest = sha.new(log + '\0' + author).hexdigest()
1862 if old_ts:
1863 # the timestamp on this revision was changed. log it for later
1864 # resynchronization of other files' revisions that occurred
1865 # for this time and log message.
1866 self.collect_data.resync.write('%08lx %s %08lx\n'
1867 % (old_ts, digest, timestamp))
1869 # "...Give back one kadam to honor the Hebrew God whose Ark this is."
1870 # -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
1872 # If revision 1.1 appears to have been created via 'cvs add'
1873 # instead of 'cvs import', then this file probably never had a
1874 # default branch, so retroactively remove its record in the
1875 # default branches db. The test is that the log message CVS uses
1876 # for 1.1 in imports is "Initial revision\n" with no period.
1877 if revision == '1.1' and log != 'Initial revision\n':
1878 try:
1879 del self.collect_data.default_branches_db[self.cvs_path]
1880 except KeyError:
1881 pass
1883 # Get the timestamps of the previous and next revisions
1884 prev_rev = self.prev_rev[revision]
1885 prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])
1887 next_rev = self.next_rev.get(revision)
1888 next_timestamp, ign, ign = self.rev_data.get(next_rev, [0, None, None])
1890 # How to tell if a CVSRevision is an add, a change, or a deletion:
1892 # It's a delete if RCS state is 'dead'
1894 # It's an add if RCS state is 'Exp.' and
1895 # - we either have no previous revision
1896 # or
1897 # - we have a previous revision whose state is 'dead'
1899 # Anything else is a change.
1900 if self.rev_state[revision] == 'dead':
1901 op = OP_DELETE
1902 elif ((self.prev_rev.get(revision, None) is None)
1903 or (self.rev_state[self.prev_rev[revision]] == 'dead')):
1904 op = OP_ADD
1905 else:
1906 op = OP_CHANGE
1908 def is_branch_revision(rev):
1909 """Return True if this revision is not a trunk revision,
1910 else return False."""
1912 if rev.count('.') >= 3:
1913 return True
1914 return False
1916 def is_same_line_of_development(rev1, rev2):
1917 """Return True if rev1 and rev2 are on the same line of
1918 development (i.e., both on trunk, or both on the same branch);
1919 return False otherwise. Either rev1 or rev2 can be None, in
1920 which case automatically return False."""
1922 if rev1 is None or rev2 is None:
1923 return False
1924 if rev1.count('.') == 1 and rev2.count('.') == 1:
1925 return True
1926 if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
1927 return True
1928 return False
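# Hypothetical examples: '1.2' and '1.5' are both on trunk, so they
# are on the same line of development; '1.7.2.1' and '1.7.2.4' share
# the branch '1.7.2', so they are too; '1.7' and '1.7.2.1' are not.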
1930 # There can be an odd situation where the tip revision of a branch
1931 # is alive, but every predecessor on the branch is in state 'dead',
1932 # yet the revision from which the branch sprouts is alive. (This
1933 # is sort of a mirror image of the more common case of adding a
1934 # file on a branch, in which the first revision on the branch is
1935 # alive while the revision from which it sprouts is dead.)
1937 # In this odd situation, we must mark the first live revision on
1938 # the branch as an OP_CHANGE instead of an OP_ADD, because it
1939 # reflects, however indirectly, a change w.r.t. the source
1940 # revision from which the branch sprouts.
1942 # This is issue #89.
1943 cur_num = revision
1944 if is_branch_revision(revision) and self.rev_state[revision] != 'dead':
1945 while 1:
1946 prev_num = self.prev_rev.get(cur_num, None)
1947 if not cur_num or not prev_num:
1948 break
1949 if (not is_same_line_of_development(cur_num, prev_num)
1950 and self.rev_state[cur_num] == 'dead'
1951 and self.rev_state[prev_num] != 'dead'):
1952 op = OP_CHANGE
1953 cur_num = self.prev_rev.get(cur_num, None)
1955 if text:
1956 deltatext_code = DELTATEXT_NONEMPTY
1957 else:
1958 deltatext_code = DELTATEXT_EMPTY
1960 c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp,
1961 next_timestamp, op,
1962 prev_rev, revision, next_rev,
1963 self.file_in_attic, self.file_executable,
1964 self.file_size,
1965 deltatext_code, self.fname,
1966 self.mode, self.rev_to_branch_name(revision),
1967 self.taglist.get(revision, []),
1968 self.branchlist.get(revision, []))
1969 self.collect_data.revs.write(str(c_rev) + "\n")
1970 StatsKeeper().record_c_rev(c_rev)
1972 if not self.collect_data.metadata_db.has_key(digest):
1973 self.collect_data.metadata_db[digest] = (author, log)
1975 def parse_completed(self):
1976 # Walk through all branches and tags and register them with
1977 # their parent branch in the symbol database.
1978 for revision, symbols in self.taglist.items() + self.branchlist.items():
1979 for symbol in symbols:
1980 name = self.rev_to_branch_name(revision)
1981 if name is not None:
1982 self.collect_data.symbol_db.register_branch_blocker(name, symbol)
1984 self.collect_data.num_files += 1
1987 class CollectData:
1988 """Repository for data collected by parsing the CVS repository files.
1990 This class manages the databases into which information collected
1991 from the CVS repository is stored. The data are stored into this
1992 class by FileDataCollector instances, one of which is created for
1993 each file to be parsed."""
1995 def __init__(self):
1996 self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
1997 Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
1998 self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
1999 Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
2000 self.default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
2001 DB_OPEN_NEW)
2002 Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
2003 self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
2004 Cleanup().register(temp(METADATA_DB), pass8)
2005 self.fatal_errors = []
2006 self.num_files = 0
2007 self.symbol_db = SymbolDatabase()
2009 # 1 if we've collected data for at least one file, None otherwise.
2010 self.found_valid_file = None
2012 def write_symbol_db(self):
2013 self.symbol_db.write()
2016 class SymbolingsLogger:
2017 """Manage the file that contains lines for symbol openings and
2018 closings.
2020 This data will later be used to determine valid SVNRevision ranges
2021 from which a file can be copied when creating a branch or tag in
2022 Subversion. Do this by finding "Openings" and "Closings" for each
2023 file copied onto a branch or tag.
2025 An "Opening" is the CVSRevision from which a given branch/tag
2026 sprouts on a path.
2028 The "Closing" for that branch/tag and path is the next CVSRevision
2029 on the same line of development as the opening.
2031 For example, on file 'foo.c', branch BEE has branch number 1.2.2 and
2032 obviously sprouts from revision 1.2. Therefore, 1.2 is the opening
2033 for BEE on path 'foo.c', and 1.3 is the closing for BEE on path
2034 'foo.c'. Note that there may be many revisions chronologically
2035 between 1.2 and 1.3, for example, revisions on branches of 'foo.c',
2036 perhaps even including on branch BEE itself. But 1.3 is the next
2037 revision *on the same line* as 1.2, that is why it is the closing
2038 revision for those symbolic names of which 1.2 is the opening.
2040 The reason for doing all this hullabaloo is to make branch and tag
2041 creation as efficient as possible by minimizing the number of copies
2042 and deletes per creation. For example, revisions 1.2 and 1.3 of
2043 foo.c might correspond to revisions 17 and 30 in Subversion. That
2044 means that when creating branch BEE, there is some motivation to do
2045 the copy from one of 17-30. Now if there were another file,
2046 'bar.c', whose opening and closing CVSRevisions for BEE corresponded
2047 to revisions 24 and 39 in Subversion, we would know that the ideal
2048 thing would be to copy the branch from somewhere between 24 and 29,
2049 inclusive."""
2052 def __init__(self):
2053 self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
2054 Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
2055 self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
2056 Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)
2058 # The keys of this dictionary are *source* cvs_paths for which
2059 # we've encountered an 'opening' on the default branch. The
2060 # values are the (uncleaned) symbolic names that this path has
2061 # opened.
2062 self.open_paths_with_default_branches = { }
2064 def log_revision(self, c_rev, svn_revnum):
2065 """Log any openings found in C_REV, and if C_REV.next_rev is not
2066 None, a closing. The opening uses SVN_REVNUM, but the closing (if
2067 any) will have its revnum determined later."""
2069 for name in c_rev.tags + c_rev.branches:
2070 self._note_default_branch_opening(c_rev, name)
2071 if c_rev.op != OP_DELETE:
2072 self._log(name, svn_revnum,
2073 c_rev.cvs_path, c_rev.branch_name, OPENING)
2075 # If our c_rev has a next_rev, then that's the closing rev for
2076 # this source revision. Log it to closings for later processing
2077 # since we don't know the svn_revnum yet.
2078 if c_rev.next_rev is not None:
2079 self.closings.write('%s %s\n' %
2080 (name, c_rev.unique_key(c_rev.next_rev)))
2082 def _log(self, name, svn_revnum, cvs_path, branch_name, type):
2083 """Write out a single line to the symbol_openings_closings file
2084 representing that SVN_REVNUM of CVS_PATH on BRANCH_NAME is either the
2085 opening or closing (TYPE) of NAME (a symbolic name).
2087 TYPE should only be one of the following global constants:
2088 OPENING or CLOSING."""
2090 # 8 places gives us 999,999,999 SVN revs. That *should* be enough.
2091 self.symbolings.write(
2092 '%s %.8d %s %s %s\n'
2093 % (name, svn_revnum, type, branch_name or '*', cvs_path))
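# For example, a single hypothetical line for an opening of symbol
# 'BEE' at Subversion r17 on trunk (branch_name None, hence '*')
# could look like:
#   BEE 00000017 <OPENING code> * proj/foo.c
# where the OPENING and CLOSING codes are constants defined elsewhere
# in this file.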
2095 def close(self):
2096 """Iterate through the closings file, lookup the svn_revnum for
2097 each closing CVSRevision, and write a proper line out to the
2098 symbolings file."""
2100 # Use this to get the c_rev of our rev_key
2101 cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)
2103 self.closings.close()
2104 for line in fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP)):
2105 (name, rev_key) = line.rstrip().split(" ", 1)
2106 svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)
2108 c_rev = cvs_revs_db.get_revision(rev_key)
2109 self._log(name, svn_revnum, c_rev.cvs_path, c_rev.branch_name, CLOSING)
2111 self.symbolings.close()
2113 def _note_default_branch_opening(self, c_rev, symbolic_name):
2114 """If C_REV is a default branch revision, log C_REV.cvs_path as an
2115 opening for SYMBOLIC_NAME."""
2117 self.open_paths_with_default_branches.setdefault(
2118 c_rev.cvs_path, []).append(symbolic_name)
2120 def log_default_branch_closing(self, c_rev, svn_revnum):
2121 """If self.open_paths_with_default_branches contains
2122 C_REV.cvs_path, then call log each name in
2123 self.open_paths_with_default_branches[C_REV.cvs_path] as a closing
2124 with SVN_REVNUM as the closing revision number."""
2126 path = c_rev.cvs_path
2127 if self.open_paths_with_default_branches.has_key(path):
2128 # log each symbol as a closing
2129 for name in self.open_paths_with_default_branches[path]:
2130 self._log(name, svn_revnum, path, None, CLOSING)
2131 # Remove them from the openings list as we're done with them.
2132 del self.open_paths_with_default_branches[path]
2135 class PersistenceManager:
2136 """The PersistenceManager allows us to effectively store SVNCommits
2137 to disk and retrieve them later using only their subversion revision
2138 number as the key. It also returns the subversion revision number
2139 for a given CVSRevision's unique key.
2141 All information pertinent to each SVNCommit is stored in a series of
2142 on-disk databases so that SVNCommits can be retrieved on-demand.
2144 MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
2145 In 'new' mode, PersistenceManager will initialize a new set of on-disk
2146 databases and be fully-featured.
2147 In 'read' mode, PersistenceManager will open existing on-disk databases
2148 and the set_* methods will be unavailable."""
2150 def __init__(self, mode):
2151 self.mode = mode
2152 if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
2153 raise RuntimeError, "Invalid 'mode' argument to PersistenceManager"
2154 self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
2155 Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
2156 self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
2157 Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
2158 self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
2159 self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
2160 ###PERF kff Elsewhere there are comments about sucking the tags db
2161 ### into memory. That seems like a good idea.
2162 if not Ctx().trunk_only:
2163 self.tags_db = TagsDatabase(DB_OPEN_READ)
2165 # "branch_name" -> svn_revnum in which branch was last filled.
2166 # This is used by CVSCommit._pre_commit, to prevent creating a fill
2167 # revision which would have nothing to do.
2168 self.last_filled = {}
2170 def get_svn_revnum(self, cvs_rev_unique_key):
2171 """Return the Subversion revision number in which
2172 CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
2173 is no mapping for CVS_REV_UNIQUE_KEY."""
2175 return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))
2177 def get_svn_commit(self, svn_revnum):
2178 """Return an SVNCommit that corresponds to SVN_REVNUM.
2180 If no SVNCommit exists for revnum SVN_REVNUM, then return None.
2182 This method can throw SVNCommitInternalInconsistencyError."""
2184 svn_commit = SVNCommit("Retrieved from disk", svn_revnum)
2185 (c_rev_keys, motivating_revnum, name, date) = self.svn2cvs_db.get(
2186 str(svn_revnum), (None, None, None, None))
2187 if c_rev_keys is None:
2188 return None
2190 digest = None
2191 for key in c_rev_keys:
2192 c_rev = self.cvs_revisions.get_revision(key)
2193 svn_commit.add_revision(c_rev)
2194 # Set the author and log message for this commit by using
2195 # CVSRevision metadata, but only if we haven't done so already.
2196 if digest is None:
2197 digest = c_rev.digest
2198 author, log_msg = self.svn_commit_metadata[digest]
2199 svn_commit.set_author(author)
2200 svn_commit.set_log_msg(log_msg)
2202 svn_commit.set_date(date)
2204 # If we're doing a trunk-only conversion, we don't need to do any more
2205 # work.
2206 if Ctx().trunk_only:
2207 return svn_commit
2209 if name:
2210 if svn_commit.cvs_revs:
2211 raise SVNCommit.SVNCommitInternalInconsistencyError(
2212 "An SVNCommit cannot have cvs_revisions *and* a corresponding\n"
2213 "symbolic name ('%s') to fill."
2214 % (_clean_symbolic_name(name),))
2215 svn_commit.set_symbolic_name(name)
2216 if name in self.tags_db:
2217 svn_commit.is_tag = 1
2219 if motivating_revnum is not None:
2220 svn_commit.set_motivating_revnum(motivating_revnum)
2222 return svn_commit
2224 def put_svn_commit(self, svn_revnum, cvs_revs,
2225 date, name, motivating_revnum):
2226 """Record the bidirectional mapping between SVN_REVNUM and
2227 CVS_REVS and record associated attributes."""
2229 if self.mode == DB_OPEN_READ:
2230 raise RuntimeError, \
2231 'Write operation attempted on read-only PersistenceManager'
2233 for c_rev in cvs_revs:
2234 Log().write(LOG_VERBOSE, " ", c_rev.unique_key())
2236 self.svn2cvs_db[str(svn_revnum)] = ([x.unique_key() for x in cvs_revs],
2237 motivating_revnum, name, date)
2239 for c_rev in cvs_revs:
2240 self.cvs2svn_db[c_rev.unique_key()] = svn_revnum
2242 # If it is not a primary commit, then record last_filled. name is
2243 # allowed to be None.
2244 if name or motivating_revnum:
2245 self.last_filled[name] = svn_revnum
2248 class CVSCommit:
2249 """Each instance of this class contains a number of CVS Revisions
2250 that correspond to one or more Subversion Commits. After all CVS
2251 Revisions are added to the grouping, calling process_revisions will
2252 generate a Subversion Commit (or Commits) for the set of CVS
2253 Revisions in the grouping."""
2255 def __init__(self, digest, author, log):
2256 self.digest = digest
2257 self.author = author
2258 self.log = log
2260 # Symbolic names for which the last source revision has already
2261 # been seen and for which the CVSRevisionAggregator has already
2262 # generated a fill SVNCommit. See self.process_revisions().
2263 self.done_symbols = [ ]
2265 self.files = { }
2266 # Lists of CVSRevisions
2267 self.changes = [ ]
2268 self.deletes = [ ]
2270 # Start out with a t_min higher than any incoming time T, and a
2271 # t_max lower than any incoming T. This way the first T will
2272 # push t_min down to T, and t_max up to T, naturally (without any
2273 # special-casing), and successive times will then ratchet them
2274 # outward as appropriate.
2275 self.t_min = 1L<<32
2276 self.t_max = 0
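# For instance, if the first CVSRevision added has timestamp 1000,
# t_min drops from 1<<32 to 1000 and t_max rises from 0 to 1000; a
# later revision at timestamp 1005 then only raises t_max, giving the
# range [1000, 1005].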
2278 # This will be set to the SVNCommit that occurs in self._commit.
2279 self.motivating_commit = None
2281 # This is a list of all non-primary commits motivated by the main
2282 # commit. We gather these so that we can set their dates to the
2283 # same date as the primary commit.
2284 self.secondary_commits = [ ]
2286 # State for handling default branches.
2288 # Here is a tempting, but ultimately nugatory, bit of logic, which
2289 # I share with you so you may appreciate the less attractive, but
2290 # refreshingly non-nugatory, logic which follows it:
2292 # If some of the commits in this txn happened on a non-trunk
2293 # default branch, then those files will have to be copied into
2294 # trunk manually after being changed on the branch (because the
2295 # RCS "default branch" appears as head, i.e., trunk, in practice).
2296 # As long as those copies don't overwrite any trunk paths that
2297 # were also changed in this commit, then we can do the copies in
2298 # the same revision, because they won't cover changes that don't
2299 # appear anywhere/anywhen else. However, if some of the trunk dst
2300 # paths *did* change in this commit, then immediately copying the
2301 # branch changes would lose those trunk mods forever. So in this
2302 # case, we need to do at least that copy in its own revision. And
2303 # for simplicity's sake, if we're creating the new revision for
2304 # even one file, then we just do all such copies together in the
2305 # new revision.
2307 # Doesn't that sound nice?
2309 # Unfortunately, Subversion doesn't support copies with sources
2310 # in the current txn. All copies must be based in committed
2311 # revisions. Therefore, we generate the above-described new
2312 # revision unconditionally.
2314 # This is a list of c_revs, and a c_rev is appended for each
2315 # default branch commit that will need to be copied to trunk (or
2316 # deleted from trunk) in some generated revision following the
2317 # "regular" revision.
2318 self.default_branch_cvs_revisions = [ ]
2320 def __cmp__(self, other):
2321 # Commits should be sorted by t_max. If both self and other have
2322 # the same t_max, break the tie using t_min, and lastly, digest.
2323 # If all those are equal, then compare based on ids, to ensure
2324 # that no two instances compare equal.
2325 return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
2326 or cmp(self.digest, other.digest) or cmp(id(self), id(other)))
2328 def __hash__(self):
2329 return id(self)
2331 def has_file(self, fname):
2332 return self.files.has_key(fname)
2334 def revisions(self):
2335 return self.changes + self.deletes
2337 def opens_symbolic_name(self, name):
2338 """Returns true if any CVSRevision in this commit is on a tag or a
2339 branch or is the origin of a tag or branch."""
2341 for c_rev in self.revisions():
2342 if c_rev.opens_symbolic_name(name):
2343 return 1
2344 return 0
2346 def add_revision(self, c_rev):
2347 # Record the time range of this commit.
2349 # ### ISSUE: It's possible, though unlikely, that the time range
2350 # of a commit could get gradually expanded to be arbitrarily
2351 # longer than COMMIT_THRESHOLD. I'm not sure this is a huge
2352 # problem, and anyway deciding where to break it up would be a
2353 # judgement call. For now, we just print a warning in commit() if
2354 # this happens.
2355 if c_rev.timestamp < self.t_min:
2356 self.t_min = c_rev.timestamp
2357 if c_rev.timestamp > self.t_max:
2358 self.t_max = c_rev.timestamp
2360 if c_rev.op == OP_DELETE:
2361 self.deletes.append(c_rev)
2362 else:
2363 # OP_CHANGE or OP_ADD
2364 self.changes.append(c_rev)
2366 self.files[c_rev.fname] = 1
2368 def _pre_commit(self):
2369 """Generates any SVNCommits that must exist before the main commit."""
2371 # There may be multiple c_revs in this commit that would cause
2372 # branch B to be filled, but we only want to fill B once. On the
2373 # other hand, there might be multiple branches committed on in
2374 # this commit. Whatever the case, we should count exactly one
2375 # commit per branch, because we only fill a branch once per
2376 # CVSCommit. This list tracks which branches we've already
2377 # counted.
2378 accounted_for_sym_names = [ ]
2380 def fill_needed(c_rev, pm):
2381 """Return 1 if this is the first commit on a new branch (for
2382 this file) and we need to fill the branch; else return 0
2383 (meaning that some other file's first commit on the branch has
2384 already done the fill for us).
2386 If C_REV.op is OP_ADD, only return 1 if the branch that this
2387 commit is on has no last filled revision.
2389 PM is a PersistenceManager to query."""
2391 # Different '.' counts indicate that c_rev is now on a different
2392 # line of development (and may need a fill)
2393 if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
2394 svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
2395 # It should be the case that when we have a file F that
2396 # is added on branch B (thus, F on trunk is in state
2397 # 'dead'), we generate an SVNCommit to fill B iff the branch
2398 # has never been filled before.
2400 # If this c_rev.op == OP_ADD, *and* the branch has never
2401 # been filled before, then fill it now. Otherwise, no need to
2402 # fill it.
2403 if c_rev.op == OP_ADD:
2404 if pm.last_filled.get(c_rev.branch_name, None) is None:
2405 return 1
2406 elif c_rev.op == OP_CHANGE:
2407 if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
2408 return 1
2409 elif c_rev.op == OP_DELETE:
2410 if pm.last_filled.get(c_rev.branch_name, None) is None:
2411 return 1
2412 return 0
2414 for c_rev in self.changes + self.deletes:
2415 # If a commit is on a branch, we must ensure that the branch
2416 # path being committed exists (in HEAD of the Subversion
2417 # repository). If it doesn't exist, we will need to fill the
2418 # branch. After the fill, the path on which we're committing
2419 # will exist.
2420 if c_rev.branch_name \
2421 and c_rev.branch_name not in accounted_for_sym_names \
2422 and c_rev.branch_name not in self.done_symbols \
2423 and fill_needed(c_rev, Ctx()._persistence_manager):
2424 svn_commit = SVNCommit("pre-commit symbolic name '%s'"
2425 % c_rev.branch_name)
2426 svn_commit.set_symbolic_name(c_rev.branch_name)
2427 self.secondary_commits.append(svn_commit)
2428 accounted_for_sym_names.append(c_rev.branch_name)
2430 def _commit(self):
2431 """Generates the primary SVNCommit that corresponds to this
2432 CVSCommit."""
2434 # Generate an SVNCommit unconditionally. Even if the only change
2435 # in this CVSCommit is a deletion of an already-deleted file (that
2436 # is, a CVS revision in state 'dead' whose predecessor was also in
2437 # state 'dead'), the conversion will still generate a Subversion
2438 # revision containing the log message for the second dead
2439 # revision, because we don't want to lose that information.
2440 svn_commit = SVNCommit("commit")
2441 self.motivating_commit = svn_commit
2443 for c_rev in self.changes:
2444 svn_commit.add_revision(c_rev)
2445 # Only make a change if we need to. When 1.1.1.1 has an empty
2446 # deltatext, the explanation is almost always that we're looking
2447 # at an imported file whose 1.1 and 1.1.1.1 are identical. On
2448 # such imports, CVS creates an RCS file where 1.1 has the
2449 # content, and 1.1.1.1 has an empty deltatext, i.e, the same
2450 # content as 1.1. There's no reason to reflect this non-change
2451 # in the repository, so we want to do nothing in this case. (If
2452 # we were really paranoid, we could make sure 1.1's log message
2453 # is the CVS-generated "Initial revision\n", but I think the
2454 # conditions below are strict enough.)
2455 if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
2456 and (c_rev.rev == "1.1.1.1")):
2457 if c_rev.is_default_branch_revision():
2458 self.default_branch_cvs_revisions.append(c_rev)
2460 for c_rev in self.deletes:
2461 # When a file is added on a branch, CVS not only adds the file
2462 # on the branch, but generates a trunk revision (typically
2463 # 1.1) for that file in state 'dead'. We only want to add
2464 # this revision if the log message is not the standard cvs
2465 # fabricated log message.
2466 if c_rev.prev_rev is None:
2467 # c_rev.branches may be empty if the originating branch
2468 # has been excluded.
2469 if not c_rev.branches:
2470 continue
2471 cvs_generated_msg = ('file %s was initially added on branch %s.\n'
2472 % (c_rev.filename(),
2473 c_rev.branches[0]))
2474 author, log_msg = \
2475 Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
2476 if log_msg == cvs_generated_msg:
2477 continue
2479 svn_commit.add_revision(c_rev)
2480 if c_rev.is_default_branch_revision():
2481 self.default_branch_cvs_revisions.append(c_rev)
2483 # There is a slight chance that we didn't actually register any
2484 # CVSRevisions with our SVNCommit (see loop over self.deletes
2485 # above), so if we have no CVSRevisions, we don't flush the
2486 # svn_commit to disk and roll back our revnum.
2487 if len(svn_commit.cvs_revs) > 0:
2488 svn_commit.flush()
2489 else:
2490 # We will not be flushing this SVNCommit, so rollback the
2491 # SVNCommit revision counter.
2492 SVNCommit.revnum -= 1
2494 if not Ctx().trunk_only:
2495 for c_rev in self.revisions():
2496 Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
2498 def _post_commit(self):
2499 """Generates any SVNCommits that we can perform now that _commit
2500 has happened. That is, handle non-trunk default branches.
2501 Sometimes an RCS file has a non-trunk default branch, so a commit
2502 on that default branch would be visible in a default CVS checkout
2503 of HEAD. If we don't copy that commit over to Subversion's trunk,
2504 then there will be no Subversion tree which corresponds to that
2505 CVS checkout. Of course, in order to copy the path over, we may
2506 first need to delete the existing trunk there."""
2508 # Only generate a commit if we have default branch revs
2509 if len(self.default_branch_cvs_revisions):
2510 # Generate an SVNCommit for all of our default branch c_revs.
2511 svn_commit = SVNCommit("post-commit default branch(es)")
2512 svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
2513 for c_rev in self.default_branch_cvs_revisions:
2514 svn_commit.add_revision(c_rev)
2515 Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
2516 svn_commit.revnum)
2517 self.secondary_commits.append(svn_commit)
2519 def process_revisions(self, done_symbols):
2520 """Process all the CVSRevisions that this instance has, creating
2521 one or more SVNCommits in the process. Generate fill SVNCommits
2522 only for symbols not in DONE_SYMBOLS (avoids unnecessary
2523 fills).
2525 Return the primary SVNCommit that corresponds to this CVSCommit.
2526 The returned SVNCommit is the commit that motivated any other
2527 SVNCommits generated in this CVSCommit."""
2529 self.done_symbols = done_symbols
2530 seconds = self.t_max - self.t_min + 1
2532 Log().write(LOG_VERBOSE, '-' * 60)
2533 Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2534 if seconds == 1:
2535 Log().write(LOG_VERBOSE, ' Start time: %s (duration: 1 second)'
2536 % time.ctime(self.t_max))
2537 else:
2538 Log().write(LOG_VERBOSE, ' Start time: %s' % time.ctime(self.t_min))
2539 Log().write(LOG_VERBOSE, ' End time: %s (duration: %d seconds)'
2540 % (time.ctime(self.t_max), seconds))
2542 if seconds > COMMIT_THRESHOLD + 1:
2543 Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2544 % (warning_prefix, COMMIT_THRESHOLD))
2546 if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2547 self._commit()
2548 return self.motivating_commit
2550 self._pre_commit()
2551 self._commit()
2552 self._post_commit()
2554 for svn_commit in self.secondary_commits:
2555 svn_commit.set_date(self.motivating_commit.get_date())
2556 svn_commit.flush()
2558 return self.motivating_commit
2561 class SVNCommit:
2562 """This represents one commit to the Subversion Repository. There
2563 are three types of SVNCommits:
2565 1. Commits one or more CVSRevisions (cannot fill a symbolic name).
2567 2. Creates or fills a symbolic name (cannot commit CVSRevisions).
2569 3. Updates trunk to reflect the contents of a particular branch
2570 (this is to handle RCS default branches)."""
2572 # The revision number to assign to the next new SVNCommit.
2573 # We start at 2 because SVNRepositoryMirror uses the first commit
2574 # to create trunk, tags, and branches.
2575 revnum = 2
2577 class SVNCommitInternalInconsistencyError(Exception):
2578 """Exception raised if we encounter an impossible state in the
2579 SVNCommit Databases."""
2581 pass
2583 def __init__(self, description="", revnum=None, cvs_revs=None):
2584 """Instantiate an SVNCommit. DESCRIPTION is for debugging only.
2585 If REVNUM, the SVNCommit will correspond to that revision number;
2586 and if CVS_REVS, then they must be the exact set of CVSRevisions for
2587 REVNUM.
2589 It is an error to pass CVS_REVS without REVNUM, but you may pass
2590 REVNUM without CVS_REVS, and then add a revision at a time by
2591 invoking add_revision()."""
2593 self._description = description
2595 # Revprop metadata for this commit.
2597 # These initial values are placeholders. At least the log and the
2598 # date should be different by the time these are used.
2600 # They are private because their values should be returned encoded
2601 # in UTF8, but callers aren't required to set them in UTF8.
2602 # Therefore, accessor methods are used to set them, and
2603 # self.get_revprops() is used to get them, in dictionary form.
2604 self._author = Ctx().username
2605 self._log_msg = "This log message means an SVNCommit was used too soon."
2606 self._max_date = 0 # Latest date seen so far.
2608 self.cvs_revs = cvs_revs or []
2609 if revnum:
2610 self.revnum = revnum
2611 else:
2612 self.revnum = SVNCommit.revnum
2613 SVNCommit.revnum += 1
2615 # The (uncleaned) symbolic name that is filled in this SVNCommit, if any.
2616 self.symbolic_name = None
2618 # If this commit is a default branch synchronization, this
2619 # variable represents the subversion revision number of the
2620 # *primary* commit where the default branch changes actually
2621 # happened. It is None otherwise.
2623 # It is possible for multiple synchronization commits to refer to
2624 # the same motivating commit revision number, and it is possible
2625 # for a single synchronization commit to contain CVSRevisions on
2626 # multiple different default branches.
2627 self.motivating_revnum = None
2629 # is_tag is true only if this commit is a fill of a symbolic name
2630 # that is a tag, None in all other cases.
2631 self.is_tag = None
2633 def set_symbolic_name(self, symbolic_name):
2634 """Set self.symbolic_name to SYMBOLIC_NAME."""
2636 self.symbolic_name = symbolic_name
2638 def set_motivating_revnum(self, revnum):
2639 """Set self.motivating_revnum to REVNUM."""
2641 self.motivating_revnum = revnum
2643 def set_author(self, author):
2644 """Set this SVNCommit's author to AUTHOR (a locally-encoded string).
2645 This is the only way to set an SVNCommit's author."""
2647 self._author = author
2649 def set_log_msg(self, msg):
2650 """Set this SVNCommit's log message to MSG (a locally-encoded string).
2651 This is the only way to set an SVNCommit's log message."""
2653 self._log_msg = msg
2655 def set_date(self, date):
2656 """Set this SVNCommit's date to DATE (an integer).
2657 Note that self.add_revision() updates this automatically based on
2658 a CVSRevision; so you may not need to call this at all, and even
2659 if you do, the value may be overwritten by a later call to
2660 self.add_revision()."""
2662 self._max_date = date
2664 def get_date(self):
2665 """Returns this SVNCommit's date as an integer."""
2667 return self._max_date
2669 def get_revprops(self):
2670 """Return the Subversion revprops for this SVNCommit."""
2672 date = format_date(self._max_date)
2673 try:
2674 utf8_author = None
2675 if self._author is not None:
2676 utf8_author = to_utf8(self._author)
2677 utf8_log = to_utf8(self.get_log_msg())
2678 return { 'svn:author' : utf8_author,
2679 'svn:log' : utf8_log,
2680 'svn:date' : date }
2681 except UnicodeError:
2682 Log().write(LOG_WARN, '%s: problem encoding author or log message:'
2683 % warning_prefix)
2684 Log().write(LOG_WARN, " author: '%s'" % self._author)
2685 Log().write(LOG_WARN, " log: '%s'" % self.get_log_msg().rstrip())
2686 Log().write(LOG_WARN, " date: '%s'" % date)
2687 Log().write(LOG_WARN,
2688 "(subversion rev %s) Related files:" % self.revnum)
2689 for c_rev in self.cvs_revs:
2690 Log().write(LOG_WARN, " ", c_rev.fname)
2692 Log().write(LOG_WARN, "Consider rerunning with one or more ",
2693 "'--encoding' parameters.\n")
2694 # It's better to fall back to the original (unknown encoding) data
2695 # than to either 1) quit or 2) record nothing at all.
2696 return { 'svn:author' : self._author,
2697 'svn:log' : self.get_log_msg(),
2698 'svn:date' : date }
2700 def add_revision(self, cvs_rev):
2701 self.cvs_revs.append(cvs_rev)
2702 if cvs_rev.timestamp > self._max_date:
2703 self._max_date = cvs_rev.timestamp
2705 def flush(self):
2706 Log().write(LOG_NORMAL, "Creating Subversion r%d (%s)"
2707 % (self.revnum, self._description))
2708 Ctx()._persistence_manager.put_svn_commit(self.revnum,
2709 self.cvs_revs,
2710 self._max_date,
2711 self.symbolic_name,
2712 self.motivating_revnum)
2714 def __str__(self):
2715 """ Print a human-readable description of this SVNCommit. This
2716 description is not intended to be machine-parseable (although
2717 we're not going to stop you if you try!)"""
2719 ret = "SVNCommit #: " + str(self.revnum) + "\n"
2720 if self.symbolic_name:
2721 ret += (" symbolic name: " + _clean_symbolic_name(self.symbolic_name)
2722 + "\n")
2723 else:
2724 ret += " NO symbolic name\n"
2725 ret += " debug description: " + self._description + "\n"
2726 ret += " cvs_revs:\n"
2727 for c_rev in self.cvs_revs:
2728 ret += " " + c_rev.unique_key() + "\n"
2729 return ret
2731 def get_log_msg(self):
2732 """Returns the actual log message for a primary commit, and the
2733 appropriate manufactured log message for a secondary commit."""
2735 if self.symbolic_name is not None:
2736 return self._log_msg_for_symbolic_name_commit()
2737 elif self.motivating_revnum is not None:
2738 return self._log_msg_for_default_branch_commit()
2739 else:
2740 return self._log_msg
2742 def _log_msg_for_symbolic_name_commit(self):
2743 """Creates a log message for a manufactured commit that fills
2744 self.symbolic_name. If self.is_tag is true, write the log message
2745 as though for a tag, else write it as though for a branch."""
2747 type = 'branch'
2748 if self.is_tag:
2749 type = 'tag'
2751 # In Python 2.2.3, we could use textwrap.fill(). Oh well :-).
2752 space_or_newline = ' '
2753 cleaned_symbolic_name = _clean_symbolic_name(self.symbolic_name)
2754 if len(cleaned_symbolic_name) >= 13:
2755 space_or_newline = '\n'
2757 return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
2758 % (type, space_or_newline, cleaned_symbolic_name)
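# Illustrative results (hypothetical symbol names): a short name gives
#   "This commit was manufactured by cvs2svn to create branch 'BEE'."
# while a cleaned name of 13 or more characters pushes the quoted name
# onto its own line:
#   "This commit was manufactured by cvs2svn to create tag
#   'RELEASE_1_0_FINAL'."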
2760 def _log_msg_for_default_branch_commit(self):
2761 """Creates a log message for a manufactured commit that
2762 synchronizes a non-trunk default branch with trunk."""
2764 msg = 'This commit was generated by cvs2svn to compensate for ' \
2765 'changes in r%d,\n' \
2766 'which included commits to RCS files with non-trunk default ' \
2767 'branches.\n' % self.motivating_revnum
2768 return msg
2771 class CVSRevisionAggregator:
2772 """This class groups CVSRevisions into CVSCommits that represent
2773 at least one SVNCommit."""
2775 def __init__(self):
2776 self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
2777 if not Ctx().trunk_only:
2778 self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB),
2779 DB_OPEN_READ)
2781 # A map { key : CVSCommit } of CVS commits currently being
2782 # accumulated. If the CVSCommit is still open to further
2783 # CVSRevisions, then key is CVSRevision.digest. If not (because
2784 # an inbound commit wanted to affect a file that was already
2785 # within the CVSCommit), then key is CVSRevision.digest plus some
2786 # number of appended '-'.
2787 self.cvs_commits = {}
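# For example (with a hypothetical digest 'abc123'), self.cvs_commits
# could look like { 'abc123': <open CVSCommit>,
#                   'abc123-': <closed CVSCommit with the same digest> }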
2789 # List of ready commits.
2790 self.ready_queue = [ ]
2792 # A map { symbol : None } of symbolic names for which the last
2793 # source CVSRevision has already been processed but which haven't
2794 # been closed yet.
2795 self.pending_symbols = {}
2797 # A list of closed symbols. That is, we've already encountered
2798 # the last CVSRevision that is a source for that symbol, the final
2799 # fill for this symbol has been done, and we never need to fill it
2800 # again.
2801 self.done_symbols = [ ]
2803 # This variable holds the most recently created primary svn_commit
2804 # object. CVSRevisionAggregator maintains this variable merely
2805 # for its date, so that it can set dates for the SVNCommits
2806 # created in self._attempt_to_commit_symbols().
2807 self.latest_primary_svn_commit = None
2809 Ctx()._symbolings_logger = SymbolingsLogger()
2810 Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
2811 Ctx()._default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
2812 DB_OPEN_READ)
2814 def _extract_ready_commits(self, timestamp):
2815 """Extract and return any active commits that expire by TIMESTAMP."""
2817 for digest_key, cvs_commit in self.cvs_commits.items():
2818 if cvs_commit.t_max + COMMIT_THRESHOLD < timestamp:
2819 self.ready_queue.append(cvs_commit)
2820 del self.cvs_commits[digest_key]
2822 def _commit_ready_commits(self):
2823 """Sort the commits from self.ready_queue by time, then process them."""
2825 self.ready_queue.sort()
2826 while self.ready_queue:
2827 cvs_commit = self.ready_queue[0]
2828 del self.ready_queue[0]
2829 self.latest_primary_svn_commit = \
2830 cvs_commit.process_revisions(self.done_symbols)
2831 self._attempt_to_commit_symbols()
2833 def process_revision(self, c_rev):
2834 # Each time we read a new line, scan the accumulating commits to
2835 # see if any are ready for processing.
2836 self._extract_ready_commits(c_rev.timestamp)
2838 for digest_key, cvs_commit in self.cvs_commits.items():
2839 # If the inbound commit is on the same file as a pending commit,
2840 # close the pending commit to further changes. Don't flush it though,
2841 # as there may be other pending commits dated before this one.
2842 # ### ISSUE: the has_file() check below is not optimal.
2843 # It does fix the dataloss bug where revisions would get lost
2844 # if checked in too quickly, but it can also break apart the
2845 # commits. The correct fix would require tracking the dependencies
2846 # between change sets and committing them in proper order.
2847 if cvs_commit.has_file(c_rev.fname):
2848 unused_id = digest_key + '-'
2849 # Find a string that is not already a key in
2850 # the self.cvs_commits dict
2851 while self.cvs_commits.has_key(unused_id):
2852 unused_id += '-'
2853 self.cvs_commits[unused_id] = cvs_commit
2854 del self.cvs_commits[digest_key]
2856 # Add this item into the set of still-available commits.
2857 if self.cvs_commits.has_key(c_rev.digest):
2858 cvs_commit = self.cvs_commits[c_rev.digest]
2859 else:
2860 author, log = self.metadata_db[c_rev.digest]
2861 cvs_commit = CVSCommit(c_rev.digest, author, log)
2862 self.cvs_commits[c_rev.digest] = cvs_commit
2863 cvs_commit.add_revision(c_rev)
2865 # Any elements in self.ready_queue at this point need to be
2866 # processed, because this latest rev couldn't possibly be part of
2867 # any of them.
2868 self._commit_ready_commits()
2870 self._add_pending_symbols(c_rev)
2872 def flush(self):
2873 """Commit anything left in self.cvs_commits. Then inform the
2874 SymbolingsLogger that all commits are done."""
2876 self._extract_ready_commits(1L<<32)
2877 self._commit_ready_commits()
2879 if not Ctx().trunk_only:
2880 Ctx()._symbolings_logger.close()
2882 def _add_pending_symbols(self, c_rev):
2883 """Add to self.pending_symbols any symbols from C_REV for which
2884 C_REV is the last CVSRevision.
2886 If we're not doing a trunk-only conversion, get the symbolic names
2887 that this c_rev is the last *source* CVSRevision for and add them
2888 to those left over from previous passes through the aggregator."""
2890 if not Ctx().trunk_only:
2891 for sym in self.last_revs_db.get(c_rev.unique_key(), []):
2892 self.pending_symbols[sym] = None
2894 def _attempt_to_commit_symbols(self):
2895 """Generate one SVNCommit for each symbol in self.pending_symbols
2896 that doesn't have an opening CVSRevision in either self.ready_queue
2897 or self.cvs_commits.values()."""
2899 # Make a list of all symbols from self.pending_symbols that do not
2900 # have *source* CVSRevisions in the pending commit queues
2901 # (self.cvs_commits or self.ready_queue):
2902 closeable_symbols = []
2903 pending_commits = self.cvs_commits.values() + self.ready_queue
2904 for sym in self.pending_symbols:
2905 for cvs_commit in pending_commits:
2906 if cvs_commit.opens_symbolic_name(sym):
2907 break
2908 else:
2909 closeable_symbols.append(sym)
2911 # Sort the closeable symbols so that we will always process the
2912 # symbols in the same order, regardless of the order in which the
2913 # dict hashing algorithm hands them back to us. We do this so
2914 # that our tests will get the same results on all platforms.
2915 closeable_symbols.sort()
2916 for sym in closeable_symbols:
2917 svn_commit = SVNCommit("closing tag/branch '%s'" % sym)
2918 svn_commit.set_symbolic_name(sym)
2919 svn_commit.set_date(self.latest_primary_svn_commit.get_date())
2920 svn_commit.flush()
2921 self.done_symbols.append(sym)
2922 del self.pending_symbols[sym]
2925 class SymbolingsReader:
2926 """Provides an interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file
2927 and the SYMBOL_OFFSETS_DB. Does the heavy lifting of finding and
2928 returning the correct opening and closing Subversion revision
2929 numbers for a given symbolic name."""
2931 def __init__(self):
2932 """Opens the SYMBOL_OPENINGS_CLOSINGS_SORTED for reading, and
2933 reads the offsets database into memory."""
2935 self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
2936 # The offsets_db is really small, and we need to read and write
2937 # from it a fair bit, so suck it into memory
2938 offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
2939 self.offsets = { }
2940 for key in offsets_db:
2941 #print " ZOO:", key, offsets_db[key]
2942 self.offsets[key] = offsets_db[key]
2944 def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
2945 """Given SYMBOLIC_NAME and SVN_REVNUM, return a new
2946 SymbolicNameFillingGuide object.
2948 Note that if we encounter an opening rev in this fill, but the
2949 corresponding closing rev takes place later than SVN_REVNUM, the
2950 closing will not be passed to SymbolicNameFillingGuide in this
2951 fill (and will be discarded when encountered in a later fill).
2952 This is perfectly fine, because we can still do a valid fill
2953 without the closing--we always try to fill what we can as soon as
2954 we can."""
2956 openings_closings_map = OpeningsClosingsMap(symbolic_name)
2958 # It's possible to have a branch start with a file that was added
2959 # on a branch
2960 if self.offsets.has_key(symbolic_name):
2961 # set our read offset for self.symbolings to the offset for
2962 # symbolic_name
2963 self.symbolings.seek(self.offsets[symbolic_name])
2965 while 1:
2966 fpos = self.symbolings.tell()
2967 line = self.symbolings.readline().rstrip()
2968 if not line:
2969 break
2970 name, revnum, type, branch_name, cvs_path = line.split(" ", 4)
2971 if branch_name == '*':
2972 svn_path = Ctx().project.make_trunk_path(cvs_path)
2973 else:
2974 svn_path = Ctx().project.make_branch_path(branch_name, cvs_path)
2975 revnum = int(revnum)
2976 if revnum > svn_revnum or name != symbolic_name:
2977 break
2978 openings_closings_map.register(svn_path, revnum, type)
2980 # get current offset of the read marker and set it to the offset
2981 # for the beginning of the line we just read if we used anything
2982 # we read.
2983 if not openings_closings_map.is_empty():
2984 self.offsets[symbolic_name] = fpos
2986 return SymbolicNameFillingGuide(openings_closings_map)
2989 class SvnRevisionRange:
2990 """The range of subversion revision numbers from which a path can be
2991 copied. self.opening_revnum is the number of the earliest such
2992 revision, and self.closing_revnum is one higher than the number of
2993 the last such revision. If self.closing_revnum is None, then no
2994 closings were registered."""
2996 def __init__(self, opening_revnum):
2997 self.opening_revnum = opening_revnum
2998 self.closing_revnum = None
3000 def add_closing(self, closing_revnum):
3001 # When we have a non-trunk default branch, we may have multiple
3002 # closings--only register the first closing we encounter.
3003 if self.closing_revnum is None:
3004 self.closing_revnum = closing_revnum
3006 def __str__(self):
3007 if self.closing_revnum is None:
3008 return '[%d:]' % (self.opening_revnum,)
3009 else:
3010 return '[%d:%d]' % (self.opening_revnum, self.closing_revnum,)
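# For example, SvnRevisionRange(17) followed by add_closing(30) prints
# as '[17:30]' and means the path may be copied from any of r17
# through r29; with no closing registered it prints as '[17:]'.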
3013 class OpeningsClosingsMap:
3014 """A dictionary of openings and closings for a symbolic name in the
3015 current SVNCommit.
3017 The user should call self.register() for the openings and closings,
3018 then self.get_node_tree() to retrieve the information as a
3019 SymbolicNameFillingGuide."""
3021 def __init__(self, symbolic_name):
3022 """Initialize OpeningsClosingsMap and prepare it for receiving
3023 openings and closings."""
3025 self.name = symbolic_name
3027 # A dictionary of SVN_PATHS to SvnRevisionRange objects.
3028 self.things = { }
3030 def register(self, svn_path, svn_revnum, type):
3031 """Register an opening or closing revision for this symbolic name.
3032 SVN_PATH is the source path that needs to be copied into
3033 self.symbolic_name, and SVN_REVNUM is either the first svn
3034 revision number that we can copy from (our opening), or the last
3035 (not inclusive) svn revision number that we can copy from (our
3036 closing). TYPE indicates whether this path is an opening or a
3037 closing.
3039 The opening for a given SVN_PATH must be passed before the closing
3040 for it to have any effect... any closing encountered before a
3041 corresponding opening will be discarded.
3043 It is not necessary to pass a corresponding closing for every
3044 opening."""
3046 # Always log an OPENING
3047 if type == OPENING:
3048 self.things[svn_path] = SvnRevisionRange(svn_revnum)
3049 # Only log a closing if we've already registered the opening for that
3050 # path.
3051 elif type == CLOSING and self.things.has_key(svn_path):
3052 self.things[svn_path].add_closing(svn_revnum)
3054 def is_empty(self):
3055 """Return true if we haven't accumulated any openings or closings,
3056 false otherwise."""
3058 return not len(self.things)
3060 def get_things(self):
3061 """Return a list of (svn_path, SvnRevisionRange) tuples for all
3062 svn_paths with registered openings or closings."""
3064 return self.things.items()
3067 class SymbolicNameFillingGuide:
3068 """A node tree representing the source paths to be copied to fill
3069 self.symbolic_name in the current SVNCommit.
3071 self._node_tree is the root of the directory tree, in the form {
3072 path_component : subnode }. Leaf nodes are instances of
3073 SvnRevisionRange. Intermediate (directory) nodes are dictionaries
3074 mapping relative names to subnodes.
3076 By walking self._node_tree and calling self.get_best_revnum() on
3077 each node, the caller can determine what subversion revision number
3078 to copy the path corresponding to that node from. self._node_tree
3079 should be treated as read-only.
3081 The caller can then descend to sub-nodes to see if their "best
3082 revnum" differs from their parents' and if it does, take appropriate
3083 actions to "patch up" the subtrees."""
3085 def __init__(self, openings_closings_map):
3086 """Initializes a SymbolicNameFillingGuide for SYMBOLIC_NAME and
3087 store into it the openings and closings from
3088 OPENINGS_CLOSINGS_MAP."""
3090 self.name = openings_closings_map.name
3092 # The dictionary that holds our node tree as a map
3093 # { path_component : subnode }.
3094 self._node_tree = { }
3096 for svn_path, svn_revision_range in openings_closings_map.get_things():
3097 (head, tail) = _path_split(svn_path)
3098 self._get_node_for_path(head)[tail] = svn_revision_range
3100 #self.print_node_tree(self._node_tree)
3102 def _get_node_for_path(self, svn_path):
3103 """Return the node key for svn_path, creating new nodes as needed."""
3105 # Walk down the path, one node at a time.
3106 node = self._node_tree
3107 for component in svn_path.split('/'):
3108 if node.has_key(component):
3109 node = node[component]
3110 else:
3111 old_node = node
3112 node = {}
3113 old_node[component] = node
3115 return node
3117 def get_best_revnum(self, node, preferred_revnum):
3118 """Determine the best subversion revision number to use when
3119 copying the source tree beginning at NODE. Returns a
3120 subversion revision number.
3122 PREFERRED_REVNUM is passed to best_rev and used to calculate the
3123 best_revnum."""
3125 def score_revisions(svn_revision_ranges):
3126 """Return a list of revisions and scores based on
3127 SVN_REVISION_RANGES. The returned list looks like:
3129 [(REV1, SCORE1), (REV2, SCORE2), ...]
3131 where the tuples are sorted by revision number.
3132 SVN_REVISION_RANGES is a list of SvnRevisionRange objects.
3134 For each svn revision that appears as either an opening_revnum
3135 or closing_revnum for one of the svn_revision_ranges, output a
3136 tuple indicating how many of the SvnRevisionRanges include that
3137 svn_revision in their ranges. A score thus indicates that copying
3138 the corresponding revision (or any following revision up to the
3139 next revision in the list) of the object in question would yield
3140 that many correct paths at or underneath the object. There may
3141 be other paths underneath it which are not correct and would
3142 need to be deleted or recopied; those can only be detected by
3143 descending and examining their scores.
3145 If OPENINGS is empty, return the empty list."""
3147 openings = [ x.opening_revnum
3148 for x in svn_revision_ranges ]
3149 closings = [ x.closing_revnum
3150 for x in svn_revision_ranges
3151 if x.closing_revnum is not None ]
3153 # First look for easy out.
3154 if not openings:
3155 return []
3157 # Create a list with both openings (which increment the total)
3158 # and closings (which decrement the total):
3159 things = [(rev,1) for rev in openings] + [(rev,-1) for rev in closings]
3160 # Sort by revision number:
3161 things.sort()
3162 # Initialize output list with zeroth element of things. This
3163 # element must exist, because it was already verified that
3164 # openings is not empty.
3165 scores = [ things[0] ]
3166 total = scores[-1][1]
3167 for (rev, change) in things[1:]:
3168 total += change
3169 if rev == scores[-1][0]:
3170 # Same revision as last entry; modify last entry:
3171 scores[-1] = (rev, total)
3172 else:
3173 # Previously-unseen revision; create new entry:
3174 scores.append((rev, total))
3175 return scores
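# Worked example (hypothetical ranges): for the three ranges [2:6],
# [3:6] and [3:], the openings are [2, 3, 3] and the closings are
# [6, 6], so score_revisions() returns [(2, 1), (3, 3), (6, 1)] --
# copying at r3, r4 or r5 would pick up all three paths, while copying
# at r2, or at r6 and later, would pick up only one.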
3177 def best_rev(scores, preferred_rev):
3178 """Return the revision with the highest score from SCORES, a list
3179 returned by score_revisions(). When the maximum score is shared
3180 by multiple revisions, the oldest revision is selected, unless
3181 PREFERRED_REV is one of the possibilities, in which case, it is
3182 selected."""
3184 max_score = 0
3185 preferred_rev_score = -1
3186 rev = SVN_INVALID_REVNUM
3187 if preferred_rev is None:
3188 # Comparison order of different types is arbitrary. Do not
3189 # expect None to compare less than int values below.
3190 preferred_rev = SVN_INVALID_REVNUM
3191 for revnum, count in scores:
3192 if count > max_score:
3193 max_score = count
3194 rev = revnum
3195 if revnum <= preferred_rev:
3196 preferred_rev_score = count
3197 if preferred_rev_score == max_score:
3198 rev = preferred_rev
3199 return rev, max_score
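# Continuing the hypothetical example above: best_rev([(2, 1), (3, 3),
# (6, 1)], 4) returns (4, 3), because r4 shares the maximum score of 3
# with r3 and is the preferred revision, while best_rev with
# preferred_rev=7 returns (3, 3), the oldest revision with the maximum
# score.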
3201 # Aggregate openings and closings from the rev tree
3202 svn_revision_ranges = self._list_revnums(node)
3204 # Score the lists
3205 scores = score_revisions(svn_revision_ranges)
3207 revnum, max_score = best_rev(scores, preferred_revnum)
3209 if revnum == SVN_INVALID_REVNUM:
3210 raise FatalError("failed to find a revision "
3211 + "to copy from when copying %s" % name)
3212 return revnum, max_score
3214 def _list_revnums(self, node):
3215 """Return a list of all the SvnRevisionRanges (including
3216 duplicates) for all leaf nodes at and under NODE."""
3218 if isinstance(node, SvnRevisionRange):
3219 # It is a leaf node.
3220 return [ node ]
3221 else:
3222 # It is an intermediate node.
3223 revnums = []
3224 for key, subnode in node.items():
3225 revnums.extend(self._list_revnums(subnode))
3226 return revnums
3228 def get_sources(self):
3229 """Return the list of sources for this symbolic name.
3231 The Project instance defines what are legitimate sources. Raise
3232 an exception if a change occurred outside of the source
3233 directories."""
3235 return self._get_sub_sources('', self._node_tree)
3237 def _get_sub_sources(self, start_svn_path, start_node):
3238 """Return the list of sources for this symbolic name, starting the
3239 search at path START_SVN_PATH, which is node START_NODE. This is
3240 a helper method, called by get_sources() (see)."""
3242 project = Ctx().project
3243 if isinstance(start_node, SvnRevisionRange):
3244 # This implies that a change was found outside of the
3245 # legitimate sources. This should never happen.
3246 raise
3247 elif project.is_source(start_svn_path):
3248 # This is a legitimate source. Add it to list.
3249 return [ FillSource(start_svn_path, start_node) ]
3250 else:
3251 # This is a directory that is not a legitimate source. (That's
3252 # OK because it hasn't changed directly.) But directories
3253 # within it have been changed, so we need to search recursively
3254 # to find their enclosing sources.
3255 sources = []
3256 for entry, node in start_node.items():
3257 svn_path = _path_join(start_svn_path, entry)
3258 sources.extend(self._get_sub_sources(svn_path, node))
3260 return sources
3262 def print_node_tree(self, node, name='/', indent_depth=0):
3263 """For debugging purposes. Prints all nodes in TREE that are
3264 rooted at NODE. INDENT_DEPTH is used to indent the output of
3265 recursive calls."""
3267 if not indent_depth:
3268 print "TREE", "=" * 75
3269 if isinstance(node, SvnRevisionRange):
3270 print "TREE:", " " * (indent_depth * 2), name, node
3271 else:
3272 print "TREE:", " " * (indent_depth * 2), name
3273 for key, value in node.items():
3274 self.print_node_tree(value, key, (indent_depth + 1))
3277 class FillSource:
3278 """Representation of a fill source used by the symbol filler in
3279 SVNRepositoryMirror."""
3281 def __init__(self, prefix, node):
3282 """Create an unscored fill source with a prefix and a key."""
3284 self.prefix = prefix
3285 self.node = node
3286 self.score = None
3287 self.revnum = None
3289 def set_score(self, score, revnum):
3290 """Set the SCORE and REVNUM."""
3292 self.score = score
3293 self.revnum = revnum
3295 def __cmp__(self, other):
3296 """Comparison operator used to sort FillSources in descending
3297 score order."""
3299 if self.score is None or other.score is None:
3300 raise TypeError, 'Tried to compare unscored FillSource'
3301 return cmp(other.score, self.score)
3304 class SVNRepositoryMirror:
3305 """Mirror a Subversion Repository as it is constructed, one
3306 SVNCommit at a time. The mirror is skeletal; it does not contain
3307 file contents. The creation of a dumpfile or Subversion repository
3308 is handled by delegates. See self.add_delegate method for how to
3309 set delegates.
3311 The structure of the repository is kept in two databases and one
3312 hash. The revs_db database maps revisions to root node keys, and
3313 the nodes_db database maps node keys to nodes. A node is a hash
3314 from directory names to keys. Both the revs_db and the nodes_db are
3315 stored on disk and each access is expensive.
3317 The nodes_db database only has the keys for old revisions. The
3318 revision that is being constructed is kept in memory in the new_nodes
3319 hash which is cheap to access.
3321 You must invoke _start_commit between SVNCommits.
3323 *** WARNING *** All path arguments to methods in this class CANNOT
3324 have leading or trailing slashes."""
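# A rough sketch of the two databases (the node keys shown here are
# hypothetical; real keys come from self.key_generator):
#
#   revs_db:  { '7' : 'key42', ... }                  # revnum -> root node key
#   nodes_db: { 'key42' : { 'trunk' : 'key43', ... }, # node key -> directory hash
#               'key43' : { 'foo.c' : 'key44', ... } }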
3326 class SVNRepositoryMirrorPathExistsError(Exception):
3327 """Exception raised if an attempt is made to add a path to the
3328 repository mirror and that path already exists in the youngest
3329 revision of the repository."""
3331 pass
3333 class SVNRepositoryMirrorUnexpectedOperationError(Exception):
3334 """Exception raised if a CVSRevision is found to have an unexpected
3335 operation (OP) value."""
3337 pass
3339 class SVNRepositoryMirrorInvalidFillOperationError(Exception):
3340 """Exception raised if an empty SymbolicNameFillingGuide is returned
3341 during a fill where the branch in question already exists."""
3343 pass
3345 def __init__(self):
3346 """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
3348 self.key_generator = KeyGenerator()
3350 self.delegates = [ ]
3352 # This corresponds to the 'revisions' table in a Subversion fs.
3353 self.revs_db = SDatabase(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
3354 Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
3356 # This corresponds to the 'nodes' table in a Subversion fs. (We
3357 # don't need a 'representations' or 'strings' table because we
3358 # only track metadata, not file contents.)
3359 self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
3360 Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
3362 # Start at revision 0 without a root node. It will be created
3363 # by _open_writable_root_node.
3364 self.youngest = 0
3365 self.new_root_key = None
3366 self.new_nodes = { }
3368 if not Ctx().trunk_only:
3369 ###PERF IMPT: Suck this into memory.
3370 self.tags_db = TagsDatabase(DB_OPEN_READ)
3371 self.symbolings_reader = SymbolingsReader()
3373 def _initialize_repository(self, date):
3374 """Initialize the repository by creating the directories for
3375 trunk, tags, and branches. This method should only be called
3376 after all delegates are added to the repository mirror."""
3378 # Make a 'fake' SVNCommit so we can take advantage of the revprops
3379 # magic therein
3380 svn_commit = SVNCommit("Initialization", 1)
3381 svn_commit.set_date(date)
3382 svn_commit.set_log_msg("New repository initialized by cvs2svn.")
3384 self._start_commit(svn_commit)
3385 self._mkdir(Ctx().project.trunk_path)
3386 if not Ctx().trunk_only:
3387 self._mkdir(Ctx().project.branches_path)
3388 self._mkdir(Ctx().project.tags_path)
3390 def _start_commit(self, svn_commit):
3391 """Start a new commit."""
3393 if self.youngest > 0:
3394 self._end_commit()
3396 self.youngest = svn_commit.revnum
3397 self.new_root_key = None
3398 self.new_nodes = { }
3400 self._invoke_delegates('start_commit', svn_commit)
3402 def _end_commit(self):
3403 """Called at the end of each commit. This method copies the newly
3404 created nodes to the on-disk nodes db."""
3406 if self.new_root_key is None:
3407 # No changes were made in this revision, so we make the root node
3408 # of the new revision be the same as the last one.
3409 self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
3410 else:
3411 self.revs_db[str(self.youngest)] = self.new_root_key
3412 # Copy the new nodes to the nodes_db
3413 for key, value in self.new_nodes.items():
3414 self.nodes_db[key] = value
3416 def _get_node(self, key):
3417 """Returns the node contents for KEY which may refer to either
3418 self.nodes_db or self.new_nodes."""
3420 if self.new_nodes.has_key(key):
3421 return self.new_nodes[key]
3422 else:
3423 return self.nodes_db[key]
3425 def _open_readonly_node(self, path, revnum):
3426 """Open a readonly node for PATH at revision REVNUM. Returns the
3427 node key and node contents if the path exists, else (None, None)."""
3429 # Get the root key
3430 if revnum == self.youngest:
3431 if self.new_root_key is None:
3432 node_key = self.revs_db[str(self.youngest - 1)]
3433 else:
3434 node_key = self.new_root_key
3435 else:
3436 node_key = self.revs_db[str(revnum)]
3438 for component in path.split('/'):
3439 node_contents = self._get_node(node_key)
3440 node_key = node_contents.get(component, None)
3441 if node_key is None:
3442 return None
3444 return node_key
3446 def _open_writable_root_node(self):
3447 """Open a writable root node. The current root node is returned
3448 immediately if it is already writable. If not, create a new one by
3449 copying the contents of the root node of the previous version."""
3451 if self.new_root_key is not None:
3452 return self.new_root_key, self.new_nodes[self.new_root_key]
3454 if self.youngest < 2:
3455 new_contents = { }
3456 else:
3457 new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
3458 self.new_root_key = self.key_generator.gen_key()
3459 self.new_nodes = { self.new_root_key: new_contents }
3461 return self.new_root_key, new_contents
3463 def _open_writable_node(self, svn_path, create):
3464 """Open a writable node for the path SVN_PATH, creating SVN_PATH
3465 and any missing directories if CREATE is True."""
3467 parent_key, parent_contents = self._open_writable_root_node()
3469 # Walk down the path, one node at a time.
3470 path_so_far = None
3471 components = svn_path.split('/')
3472 for i in range(len(components)):
3473 component = components[i]
3474 path_so_far = _path_join(path_so_far, component)
3475 this_key = parent_contents.get(component, None)
3476 if this_key is not None:
3477 # The component exists.
3478 this_contents = self.new_nodes.get(this_key, None)
3479 if this_contents is None:
3480 # Suck the node from the nodes_db, but update the key
3481 this_contents = self.nodes_db[this_key]
3482 this_key = self.key_generator.gen_key()
3483 self.new_nodes[this_key] = this_contents
3484 parent_contents[component] = this_key
3485 elif create:
3486 # The component does not exist, so we create it.
3487 this_contents = { }
3488 this_key = self.key_generator.gen_key()
3489 self.new_nodes[this_key] = this_contents
3490 parent_contents[component] = this_key
3491 if i < len(components) - 1:
3492 self._invoke_delegates('mkdir', path_so_far)
3493 else:
3494 # The component does not exist and we are not instructed to
3495 # create it, so we give up.
3496 return None, None
3498 parent_key = this_key
3499 parent_contents = this_contents
3501 return this_key, this_contents
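# Note the copy-on-write behavior above: every node reached along
# SVN_PATH is re-keyed into self.new_nodes before being modified, so the
# nodes recorded for earlier revisions in self.nodes_db are never
# touched.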
3503 def _path_exists(self, path):
3504 """If PATH exists in self.youngest of the svn repository mirror,
3505 return True, else return False.
3507 PATH must not start with '/'."""
3509 return self._open_readonly_node(path, self.youngest) is not None
3511 def _fast_delete_path(self, parent_path, parent_contents, component):
3512 """Delete COMPONENT from the parent direcory PARENT_PATH with the
3513 contents PARENT_CONTENTS. Do nothing if COMPONENT does not exist
3514 in PARENT_CONTENTS."""
3516 if parent_contents.has_key(component):
3517 del parent_contents[component]
3518 self._invoke_delegates('delete_path',
3519 _path_join(parent_path, component))
3521 def _delete_path(self, svn_path, should_prune=False):
3522 """Delete PATH from the tree. If SHOULD_PRUNE is true, then delete
3523 all ancestor directories that are made empty when SVN_PATH is deleted.
3524 In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
3526 NOTE: This function ignores requests to delete the root directory
3527 or any directory for which Ctx().project.is_unremovable() returns
3528 True, either directly or by pruning."""
3530 if svn_path == '' or Ctx().project.is_unremovable(svn_path):
3531 return
3533 (parent_path, entry,) = _path_split(svn_path)
3534 if parent_path:
3535 parent_key, parent_contents = \
3536 self._open_writable_node(parent_path, False)
3537 else:
3538 parent_key, parent_contents = self._open_writable_root_node()
3540 if parent_key is not None:
3541 self._fast_delete_path(parent_path, parent_contents, entry)
3542 # The following recursion makes pruning an O(n^2) operation in the
3543 # worst case (where n is the depth of SVN_PATH), but the worst case
3544 # is probably rare, and the constant cost is pretty low. Another
3545 # drawback is that we issue a delete for each path and not just
3546 # a single delete for the topmost directory pruned.
3547 if should_prune and len(parent_contents) == 0:
3548 self._delete_path(parent_path, True)
3550 def _mkdir(self, path):
3551 """Create PATH in the repository mirror at the youngest revision."""
3553 self._open_writable_node(path, True)
3554 self._invoke_delegates('mkdir', path)
3556 def _change_path(self, cvs_rev):
3557 """Register a change in self.youngest for the CVS_REV's svn_path
3558 in the repository mirror."""
3560 # We do not have to update the nodes because our mirror is only
3561 # concerned with the presence or absence of paths, and a file
3562 # content change does not cause any path changes.
3563 self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, False))
3565 def _add_path(self, cvs_rev):
3566 """Add the CVS_REV's svn_path to the repository mirror."""
3568 self._open_writable_node(cvs_rev.svn_path, True)
3569 self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, True))
3571 def _copy_path(self, src_path, dest_path, src_revnum):
3572 """Copy SRC_PATH at subversion revision number SRC_REVNUM to
3573 DEST_PATH. In the youngest revision of the repository, DEST_PATH's
3574 parent *must* exist, but DEST_PATH *cannot* exist.
3576 Return the node key and the contents of the new node at DEST_PATH
3577 as a dictionary."""
3579 # get the contents of the node of our src_path
3580 src_key = self._open_readonly_node(src_path, src_revnum)
3581 src_contents = self._get_node(src_key)
3583 # Get the parent path and the base path of the dest_path
3584 (dest_parent, dest_basename,) = _path_split(dest_path)
3585 dest_parent_key, dest_parent_contents = \
3586 self._open_writable_node(dest_parent, False)
3588 if dest_parent_contents.has_key(dest_basename):
3589 msg = "Attempt to add path '%s' to repository mirror " % dest_path
3590 msg += "when it already exists in the mirror."
3591 raise self.SVNRepositoryMirrorPathExistsError, msg
3593 dest_parent_contents[dest_basename] = src_key
3594 self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
3596 # Yes sir, src_key and src_contents are also the contents of the
3597 # destination. This is a cheap copy, remember! :-)
3598 return src_key, src_contents
3600 def _fill_symbolic_name(self, svn_commit):
3601 """Performs all copies necessary to create as much of the the tag
3602 or branch SVN_COMMIT.symbolic_name as possible given the current
3603 revision of the repository mirror.
3605 The symbolic name is guaranteed to exist in the Subversion
3606 repository by the end of this call, even if there are no paths
3607 under it."""
3609 symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3610 svn_commit.symbolic_name, self.youngest)
3611 # Get the list of sources for the symbolic name.
3612 sources = symbol_fill.get_sources()
3614 if sources:
3615 if svn_commit.symbolic_name in self.tags_db:
3616 dest_prefix = Ctx().project.get_tag_path(svn_commit.symbolic_name)
3617 else:
3618 dest_prefix = Ctx().project.get_branch_path(svn_commit.symbolic_name)
3620 dest_key = self._open_writable_node(dest_prefix, False)[0]
3621 self._fill(symbol_fill, dest_prefix, dest_key, sources)
3622 else:
3623 # We can only get here for a branch whose first commit is an add
3624 # (as opposed to a copy).
3625 dest_path = Ctx().project.get_branch_path(symbol_fill.name)
3626 if not self._path_exists(dest_path):
3627 # If our symbol_fill was empty, that means that our first
3628 # commit on the branch was to a file added on the branch, and
3629 # that this is our first fill of that branch.
3631 # This case is covered by test 16.
3633 # ...we create the branch by copying trunk from our
3634 # current revision number minus 1.
3635 source_path = Ctx().project.trunk_path
3636 entries = self._copy_path(source_path, dest_path,
3637 svn_commit.revnum - 1)[1]
3638 # Now since we've just copied trunk to a branch that's
3639 # *supposed* to be empty, we delete any entries in the
3640 # copied directory.
3641 for entry in entries:
3642 del_path = dest_path + '/' + entry
3643 # Delete but don't prune.
3644 self._delete_path(del_path)
3645 else:
3646 msg = "Error filling branch '" \
3647 + _clean_symbolic_name(symbol_fill.name) + "'.\n"
3648 msg += "Received an empty SymbolicNameFillingGuide and\n"
3649 msg += "attempted to create a branch that already exists."
3650 raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3652 def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
3653 path = None, parent_source_prefix = None,
3654 preferred_revnum = None, prune_ok = None):
3655 """Fill the tag or branch at DEST_PREFIX + PATH with items from
3656 SOURCES, and recurse into the child items.
3658 DEST_PREFIX is the prefix of the destination directory, e.g.
3659 '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
3660 FillSource classes that are candidates to be copied to the
3661 destination. DEST_KEY is the key in self.nodes_db to the
3662 destination, or None if the destination does not yet exist.
3664 PATH is the path relative to DEST_PREFIX. If PATH is None, we
3665 are at the top level, e.g. '/tags/my_tag'.
3667 PARENT_SOURCE_PREFIX is the source prefix that was used to copy
3668 the parent directory, and PREFERRED_REVNUM is an int which is the
3669 source revision number that the caller (who may have copied KEY's
3670 parent) used to perform its copy. If PREFERRED_REVNUM is None,
3671 then no revision is preferable to any other (which probably means
3672 that no copies have happened yet).
3674 PRUNE_OK means that a copy has been made in this recursion, and
3675 it's safe to prune directories that are not in
3676 SYMBOL_FILL._node_tree, provided that said directory has a source
3677 prefix matching PARENT_SOURCE_PREFIX.
3679 PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
3680 should only be passed in by recursive calls."""
3682 # Calculate scores and revnums for all sources
3683 for source in sources:
3684 src_revnum, score = symbol_fill.get_best_revnum(source.node,
3685 preferred_revnum)
3686 source.set_score(score, src_revnum)
3688 # Sort the sources in descending score order so that we will make
3689 # an eventual copy from the source with the highest score.
3690 sources.sort()
3691 copy_source = sources[0]
3693 src_path = _path_join(copy_source.prefix, path)
3694 dest_path = _path_join(dest_prefix, path)
3696 # Figure out if we shall copy to this destination and delete any
3697 # destination path that is in the way.
3698 do_copy = 0
3699 if dest_key is None:
3700 do_copy = 1
3701 elif prune_ok and (parent_source_prefix != copy_source.prefix or
3702 copy_source.revnum != preferred_revnum):
3703 # We are about to replace the destination, so we need to remove
3704 # it before we perform the copy.
3705 self._delete_path(dest_path)
3706 do_copy = 1
3708 if do_copy:
3709 dest_key, dest_entries = self._copy_path(src_path, dest_path,
3710 copy_source.revnum)
3711 prune_ok = 1
3712 else:
3713 dest_entries = self._get_node(dest_key)
3715 # Create the SRC_ENTRIES hash from SOURCES. The keys are path
3716 # elements and the values are lists of FillSource classes where
3717 # this path element exists.
3718 src_entries = {}
3719 for source in sources:
3720 if isinstance(source.node, SvnRevisionRange):
3721 continue
3722 for entry, node in source.node.items():
3723 src_entries.setdefault(entry, []).append(
3724 FillSource(source.prefix, node))
3726 if prune_ok:
3727 # Delete the entries in DEST_ENTRIES that are not in src_entries.
3728 delete_list = [ ]
3729 for entry in dest_entries:
3730 if not src_entries.has_key(entry):
3731 delete_list.append(entry)
3732 if delete_list:
3733 if not self.new_nodes.has_key(dest_key):
3734 dest_key, dest_entries = self._open_writable_node(dest_path, True)
3735 # Sort the delete list to get "diffable" dumpfiles.
3736 delete_list.sort()
3737 for entry in delete_list:
3738 self._fast_delete_path(dest_path, dest_entries, entry)
3740 # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
3741 src_keys = src_entries.keys()
3742 src_keys.sort()
3743 for src_key in src_keys:
3744 next_dest_key = dest_entries.get(src_key, None)
3745 self._fill(symbol_fill, dest_prefix, next_dest_key,
3746 src_entries[src_key], _path_join(path, src_key),
3747 copy_source.prefix, sources[0].revnum, prune_ok)
3749 def _synchronize_default_branch(self, svn_commit):
3750 """Propagate any changes that happened on a non-trunk default
3751 branch to the trunk of the repository. See
3752 CVSCommit._post_commit() for details on why this is necessary."""
3754 for cvs_rev in svn_commit.cvs_revs:
3755 svn_trunk_path = Ctx().project.make_trunk_path(cvs_rev.cvs_path)
3756 if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3757 if self._path_exists(svn_trunk_path):
3758 # Delete the path on trunk...
3759 self._delete_path(svn_trunk_path)
3760 # ...and copy over from branch
3761 self._copy_path(cvs_rev.svn_path, svn_trunk_path,
3762 svn_commit.motivating_revnum)
3763 elif cvs_rev.op == OP_DELETE:
3764 # delete trunk path
3765 self._delete_path(svn_trunk_path)
3766 else:
3767 msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3768 % cvs_rev.op)
3769 raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3771 def commit(self, svn_commit):
3772 """Add an SVNCommit to the SVNRepository, incrementing the
3773 Repository revision number, and changing the repository. Invoke
3774 the delegates' start_commit() method."""
3776 if svn_commit.revnum == 2:
3777 self._initialize_repository(svn_commit.get_date())
3779 self._start_commit(svn_commit)
3781 if svn_commit.symbolic_name:
3782 Log().write(LOG_VERBOSE, "Filling symbolic name:",
3783 _clean_symbolic_name(svn_commit.symbolic_name))
3784 self._fill_symbolic_name(svn_commit)
3785 elif svn_commit.motivating_revnum:
3786 Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3787 % svn_commit.motivating_revnum)
3788 self._synchronize_default_branch(svn_commit)
3789 else: # This actually commits CVSRevisions
3790 if len(svn_commit.cvs_revs) > 1: plural = "s"
3791 else: plural = ""
3792 Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3793 % (len(svn_commit.cvs_revs), plural))
3794 for cvs_rev in svn_commit.cvs_revs:
3795 # See comment in CVSCommit._commit() for what this is all
3796 # about. Note that although asking self._path_exists() is
3797 # somewhat expensive, we only do it if the first two (cheap)
3798 # tests succeed first.
3799 if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3800 and (cvs_rev.rev == "1.1.1.1")
3801 and self._path_exists(cvs_rev.svn_path)):
3802 if cvs_rev.op == OP_ADD:
3803 self._add_path(cvs_rev)
3804 elif cvs_rev.op == OP_CHANGE:
3805 # Fix for Issue #74:
3807 # Here's the scenario. You have file FOO that is imported
3808 # on a non-trunk vendor branch. So in r1.1 and r1.1.1.1,
3809 # the file exists.
3811 # Moving forward in time, FOO is deleted on the default
3812 # branch (r1.1.1.2). cvs2svn determines that this delete
3813 # also needs to happen on trunk, so FOO is deleted on
3814 # trunk.
3816 # Along comes r1.2, whose op is OP_CHANGE (because r1.1 is
3817 # not 'dead', we assume it's a change). However, since
3818 # our trunk file has been deleted, svnadmin blows up--you
3819 # can't change a file that doesn't exist!
3821 # Soooo... we just check the path, and if it doesn't
3822 # exist, we do an add... if the path does exist, it's
3823 # business as usual.
3824 if not self._path_exists(cvs_rev.svn_path):
3825 self._add_path(cvs_rev)
3826 else:
3827 self._change_path(cvs_rev)
3829 if cvs_rev.op == OP_DELETE:
3830 self._delete_path(cvs_rev.svn_path, Ctx().prune)
3832 def cleanup(self):
3833 """Callback for the Cleanup.register in self.__init__."""
3835 self.revs_db = None
3836 self.nodes_db = None
3838 def add_delegate(self, delegate):
3839 """Adds DELEGATE to self.delegates.
3841 For every delegate you add, as soon as SVNRepositoryMirror
3842 performs a repository action method, SVNRepositoryMirror will call
3843 the delegate's corresponding repository action method. Multiple
3844 delegates will be called in the order that they are added. See
3845 SVNRepositoryMirrorDelegate for more information."""
3847 self.delegates.append(delegate)
3849 def _invoke_delegates(self, method, *args):
3850 """Iterate through each of our delegates, in the order that they
3851 were added, and call the delegate's method named METHOD with the
3852 arguments in ARGS."""
3854 for delegate in self.delegates:
3855 getattr(delegate, method)(*args)
3857 def finish(self):
3858 """Calls the delegate finish method."""
3860 self._end_commit()
3861 self._invoke_delegates('finish')
3862 self.cleanup()
3865 class SVNCommitItem:
3866 """A wrapper class for CVSRevision objects upon which
3867 Subversion-related data (such as properties) may be hung."""
3869 def __init__(self, c_rev, svn_props_changed):
3870 """Initialize instance and record the properties for this file.
3871 SVN_PROPS_CHANGED indicates whether the svn: properties are known
3872 to have changed since the last revision.
3874 The properties are set by the SVNPropertySetters in
3875 Ctx().svn_property_setters, then we read a couple of the
3876 properties back out for our own purposes."""
3878 self.c_rev = c_rev
3879 # Did the svn properties change for this file (i.e., do they have
3880 # to be written to the dumpfile?)
3881 self.svn_props_changed = svn_props_changed
3883 # The properties for this item as a map { key : value }. If VALUE
3884 # is None, no property should be set.
3885 self.svn_props = { }
3887 for svn_property_setter in Ctx().svn_property_setters:
3888 svn_property_setter.set_properties(self)
3890 # Remember if we need to filter the EOLs. We could actually use
3891 # self.svn_props now, since it is initialized for each revision.
3892 self.needs_eol_filter = \
3893 self.svn_props.get('svn:eol-style', None) is not None
3895 self.has_keywords = self.svn_props.get('svn:keywords', None) is not None
3898 class SVNRepositoryMirrorDelegate:
3899 """Abstract superclass for any delegate to SVNRepositoryMirror.
3900 Subclasses must implement all of the methods below.
3902 For each method, a subclass implements, in its own way, the
3903 Subversion operation implied by the method's name. For example, for
3904 the add_path method, the DumpfileDelegate would write out a
3905 "Node-add:" command to a Subversion dumpfile, the StdoutDelegate
3906 would merely print that the path is being added to the repository,
3907 and the RepositoryDelegate would actually cause the path to be added
3908 to the Subversion repository that it is creating."""
3911 def start_commit(self, svn_commit):
3912 """Perform any actions needed to start SVNCommit SVN_COMMIT;
3913 see subclass implementation for details."""
3915 raise NotImplementedError
3917 def mkdir(self, path):
3918 """PATH is a string; see subclass implementation for details."""
3920 raise NotImplementedError
3922 def add_path(self, s_item):
3923 """S_ITEM is an SVNCommitItem; see subclass implementation for
3924 details."""
3926 raise NotImplementedError
3928 def change_path(self, s_item):
3929 """S_ITEM is an SVNCommitItem; see subclass implementation for
3930 details."""
3932 raise NotImplementedError
3934 def delete_path(self, path):
3935 """PATH is a string; see subclass implementation for
3936 details."""
3938 raise NotImplementedError
3940 def copy_path(self, src_path, dest_path, src_revnum):
3941 """SRC_PATH and DEST_PATH are both strings, and SRC_REVNUM is a
3942 subversion revision number (int); see subclass implementation for
3943 details."""
3945 raise NotImplementedError
3947 def finish(self):
3948 """Perform any cleanup necessary after all revisions have been
3949 committed."""
3951 raise NotImplementedError
3954 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
3955 """Create a Subversion dumpfile."""
3957 def __init__(self, dumpfile_path=None):
3958 """Return a new DumpfileDelegate instance, attached to a dumpfile
3959 DUMPFILE_PATH (Ctx().dumpfile, if None), using Ctx().encoding."""
3961 if dumpfile_path:
3962 self.dumpfile_path = dumpfile_path
3963 else:
3964 self.dumpfile_path = Ctx().dumpfile
3966 self.dumpfile = open(self.dumpfile_path, 'wb')
3967 self._write_dumpfile_header(self.dumpfile)
3969 def _write_dumpfile_header(self, dumpfile):
3970 # Initialize the dumpfile with the standard headers.
3972 # Since the CVS repository doesn't have a UUID, and the Subversion
3973 # repository will be created with one anyway, we don't specify a
3974 # UUID in the dumpfile.
3975 dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
3977 def _utf8_path(self, path):
3978 """Return a copy of PATH encoded in UTF-8."""
3980 pieces = path.split('/')
3981 # Convert each path component separately (as they may each use
3982 # different encodings).
3983 for i in range(len(pieces)):
3984 try:
3985 # Log messages can be converted with the 'replace' strategy,
3986 # but we can't afford any lossiness here.
3987 pieces[i] = to_utf8(pieces[i], 'strict')
3988 except UnicodeError:
3989 raise FatalError(
3990 "Unable to convert a path '%s' to internal encoding.\n"
3991 "Consider rerunning with one or more '--encoding' parameters."
3992 % (path,))
3993 return '/'.join(pieces)
3995 def _string_for_prop(self, name, value):
3996 """Return a property in the form needed for the dumpfile."""
3998 return 'K %d\n%s\nV %d\n%s\n' % (len(name), name, len(value), value)
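# For example, _string_for_prop('svn:log', 'hi') yields the string
# 'K 7\nsvn:log\nV 2\nhi\n', matching the K/V property blocks shown in
# the sample revision header below.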
4000 def start_commit(self, svn_commit):
4001 """Emit the start of SVN_COMMIT (an SVNCommit)."""
4003 self.revision = svn_commit.revnum
4005 # The start of a new commit typically looks like this:
4007 # Revision-number: 1
4008 # Prop-content-length: 129
4009 # Content-length: 129
4011 # K 7
4012 # svn:log
4013 # V 27
4014 # Log message for revision 1.
4015 # K 10
4016 # svn:author
4017 # V 7
4018 # jrandom
4019 # K 8
4020 # svn:date
4021 # V 27
4022 # 2003-04-22T22:57:58.132837Z
4023 # PROPS-END
4025 # Notice that the length headers count everything -- not just the
4026 # length of the data but also the lengths of the lengths, including
4027 # the 'K ' or 'V ' prefixes.
4029 # The reason there are both Prop-content-length and Content-length
4030 # is that the former includes just props, while the latter includes
4031 # everything. That's the generic header form for any entity in a
4032 # dumpfile. But since revisions only have props, the two lengths
4033 # are always the same for revisions.
4035 # Calculate the output needed for the property definitions.
4036 props = svn_commit.get_revprops()
4037 prop_names = props.keys()
4038 prop_names.sort()
4039 prop_strings = []
4040 for propname in prop_names:
4041 if props[propname] is not None:
4042 prop_strings.append(self._string_for_prop(propname, props[propname]))
4044 all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n'
4045 total_len = len(all_prop_strings)
4047 # Print the revision header and props
4048 self.dumpfile.write('Revision-number: %d\n'
4049 'Prop-content-length: %d\n'
4050 'Content-length: %d\n'
4051 '\n'
4052 % (self.revision, total_len, total_len))
4054 self.dumpfile.write(all_prop_strings)
4055 self.dumpfile.write('\n')
4057 def mkdir(self, path):
4058 """Emit the creation of directory PATH."""
4060 self.dumpfile.write("Node-path: %s\n"
4061 "Node-kind: dir\n"
4062 "Node-action: add\n"
4063 "\n"
4064 "\n" % self._utf8_path(path))
4066 def _add_or_change_path(self, s_item, op):
4067 """Emit the addition or change corresponding to S_ITEM.
4068 OP is either the constant OP_ADD or OP_CHANGE."""
4070 # Validation stuffs
4071 if op == OP_ADD:
4072 action = 'add'
4073 elif op == OP_CHANGE:
4074 action = 'change'
4075 else:
4076 raise FatalError("_add_or_change_path() called with bad op ('%s')"
4077 % (op,))
4079 # Convenience variables
4080 c_rev = s_item.c_rev
4082 # The property handling here takes advantage of an undocumented
4083 # but IMHO consistent feature of the Subversion dumpfile-loading
4084 # code. When a node's properties aren't mentioned (that is, the
4085 # "Prop-content-length:" header is absent, no properties are
4086 # listed at all, and there is no "PROPS-END\n" line) then no
4087 # change is made to the node's properties.
4089 # This is consistent with the way dumpfiles behave w.r.t. text
4090 # content changes, so I'm comfortable relying on it. If you
4091 # commit a change to *just* the properties of some node that
4092 # already has text contents from a previous revision, then in the
4093 # dumpfile output for the prop change, no "Text-content-length:"
4094 # nor "Text-content-md5:" header will be present, and the text of
4095 # the file will not be given. But this does not cause the file's
4096 # text to be erased! It simply remains unchanged.
4098 # This works out great for cvs2svn, due to lucky coincidences:
4100 # For files, the only properties we ever set are set in the first
4101 # revision; all other revisions (including on branches) inherit
4102 # from that. After the first revision, we never change file
4103 # properties, therefore, there is no need to remember the full set
4104 # of properties on a given file once we've set it.
4106 # For directories, the only property we set is "svn:ignore", and
4107 # while we may change it after the first revision, we always do so
4108 # based on the contents of a ".cvsignore" file -- in other words,
4109 # CVS is doing the remembering for us, so we still don't have to
4110 # preserve the previous value of the property ourselves.
4112 # Calculate the (sorted-by-name) property string and length, if any.
4113 if s_item.svn_props_changed:
4114 svn_props = s_item.svn_props
4115 prop_contents = ''
4116 prop_names = svn_props.keys()
4117 prop_names.sort()
4118 for pname in prop_names:
4119 pvalue = svn_props[pname]
4120 if pvalue is not None:
4121 prop_contents += self._string_for_prop(pname, pvalue)
4122 prop_contents += 'PROPS-END\n'
4123 props_header = 'Prop-content-length: %d\n' % len(prop_contents)
4124 else:
4125 prop_contents = ''
4126 props_header = ''
4128 # treat .cvsignore as a directory property
4129 dir_path, basename = os.path.split(c_rev.svn_path)
4130 if basename == ".cvsignore":
4131 ignore_vals = generate_ignores(c_rev)
4132 ignore_contents = '\n'.join(ignore_vals)
4133 ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
4134 (len(ignore_contents), ignore_contents))
4135 ignore_contents += 'PROPS-END\n'
4136 ignore_len = len(ignore_contents)
4138 # write headers, then props
4139 self.dumpfile.write('Node-path: %s\n'
4140 'Node-kind: dir\n'
4141 'Node-action: change\n'
4142 'Prop-content-length: %d\n'
4143 'Content-length: %d\n'
4144 '\n'
4145 '%s'
4146 % (self._utf8_path(dir_path), ignore_len,
4147 ignore_len, ignore_contents))
4149 # If the file has keywords, we must prevent CVS/RCS from expanding
4150 # the keywords because they must be unexpanded in the repository,
4151 # or Subversion will get confused.
4152 pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(
4153 c_rev, suppress_keyword_substitution=s_item.has_keywords)
4155 self.dumpfile.write('Node-path: %s\n'
4156 'Node-kind: file\n'
4157 'Node-action: %s\n'
4158 '%s' # no property header if no props
4159 'Text-content-length: '
4160 % (self._utf8_path(c_rev.svn_path),
4161 action, props_header))
4163 pos = self.dumpfile.tell()
4165 self.dumpfile.write('0000000000000000\n'
4166 'Text-content-md5: 00000000000000000000000000000000\n'
4167 'Content-length: 0000000000000000\n'
4168 '\n')
4170 if prop_contents:
4171 self.dumpfile.write(prop_contents)
4173 # Insert a filter to convert all EOLs to LFs if necessary
4174 if s_item.needs_eol_filter:
4175 data_reader = LF_EOL_Filter(pipe.stdout)
4176 else:
4177 data_reader = pipe.stdout
4179 # Insert the rev contents, calculating length and checksum as we go.
4180 checksum = md5.new()
4181 length = 0
4182 while True:
4183 buf = data_reader.read(PIPE_READ_SIZE)
4184 if buf == '':
4185 break
4186 checksum.update(buf)
4187 length += len(buf)
4188 self.dumpfile.write(buf)
4190 pipe.stdout.close()
4191 error_output = pipe.stderr.read()
4192 exit_status = pipe.wait()
4193 if exit_status:
4194 raise FatalError("The command '%s' failed with exit status: %s\n"
4195 "and the following output:\n"
4196 "%s" % (pipe_cmd, exit_status, error_output))
4198 # Go back to patch up the length and checksum headers:
4199 self.dumpfile.seek(pos, 0)
4200 # We left 16 zeros for the text length; replace them with the real
4201 # length, padded on the left with spaces:
4202 self.dumpfile.write('%16d' % length)
4203 # 16... + 1 newline + len('Text-content-md5: ') == 35
4204 self.dumpfile.seek(pos + 35, 0)
4205 self.dumpfile.write(checksum.hexdigest())
4206 # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
4207 self.dumpfile.seek(pos + 84, 0)
4208 # The content length is the length of property data, text data,
4209 # and any metadata around/inside them.
4210 self.dumpfile.write('%16d' % (length + len(prop_contents)))
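# (For reference, the offsets above check out: len('Text-content-md5: ')
# is 18, so 16 + 1 + 18 == 35, and 35 + 32 + 1 + len('Content-length: ')
# == 35 + 32 + 1 + 16 == 84.)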
4211 # Jump back to the end of the stream
4212 self.dumpfile.seek(0, 2)
4214 # This record is done (write two newlines -- one to terminate
4215 # contents that weren't themselves newline-terminated, one to
4216 # provide a blank line for readability).
4217 self.dumpfile.write('\n\n')
4219 def add_path(self, s_item):
4220 """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""
4222 self._add_or_change_path(s_item, OP_ADD)
4224 def change_path(self, s_item):
4225 """Emit the change corresponding to S_ITEM, an SVNCommitItem."""
4227 self._add_or_change_path(s_item, OP_CHANGE)
4229 def delete_path(self, path):
4230 """Emit the deletion of PATH."""
4232 self.dumpfile.write('Node-path: %s\n'
4233 'Node-action: delete\n'
4234 '\n' % self._utf8_path(path))
4236 def copy_path(self, src_path, dest_path, src_revnum):
4237 """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
4239 # We don't need to include "Node-kind:" for copies; the loader
4240 # ignores it anyway and just uses the source kind instead.
4241 self.dumpfile.write('Node-path: %s\n'
4242 'Node-action: add\n'
4243 'Node-copyfrom-rev: %d\n'
4244 'Node-copyfrom-path: /%s\n'
4245 '\n'
4246 % (self._utf8_path(dest_path),
4247 src_revnum,
4248 self._utf8_path(src_path)))
4250 def finish(self):
4251 """Perform any cleanup necessary after all revisions have been
4252 committed."""
4254 self.dumpfile.close()
4257 class RepositoryDelegate(DumpfileDelegate):
4258 """Creates a new Subversion Repository. DumpfileDelegate does all
4259 of the heavy lifting."""
4261 def __init__(self):
4262 self.svnadmin = Ctx().svnadmin
4263 self.target = Ctx().target
4264 if not Ctx().existing_svnrepos:
4265 Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
4266 if not Ctx().fs_type:
4267 # User didn't say what kind of repository (bdb, fsfs, etc.).
4268 # We still pass --bdb-txn-nosync. It's a no-op if the default
4269 # repository type doesn't support it, but we definitely want
4270 # it if BDB is the default.
4271 run_command('%s create %s "%s"' % (self.svnadmin,
4272 "--bdb-txn-nosync",
4273 self.target))
4274 elif Ctx().fs_type == 'bdb':
4275 # User explicitly specified bdb.
4277 # Since this is a BDB repository, pass --bdb-txn-nosync,
4278 # because it gives us a 4-5x speed boost (if cvs2svn is
4279 # creating the repository, cvs2svn should be the only program
4280 # accessing the svn repository (until cvs is done, at least)).
4281 # But we'll turn no-sync off in self.finish(), unless
4282 # instructed otherwise.
4283 run_command('%s create %s %s "%s"' % (self.svnadmin,
4284 "--fs-type=bdb",
4285 "--bdb-txn-nosync",
4286 self.target))
4287 else:
4288 # User specified something other than bdb.
4289 run_command('%s create %s "%s"' % (self.svnadmin,
4290 "--fs-type=%s" % Ctx().fs_type,
4291 self.target))
4293 # Since the output of this run is a repository, not a dumpfile,
4294 # the temporary dumpfiles we create should go in the tmpdir.
4295 DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))
4297 # This is 1 if a commit is in progress, otherwise None.
4298 self._commit_in_progress = None
4300 self.dumpfile = open(self.dumpfile_path, 'w+b')
4301 self.loader_pipe = SimplePopen([ self.svnadmin, 'load', '-q',
4302 self.target ], True)
4303 self.loader_pipe.stdout.close()
4304 try:
4305 self._write_dumpfile_header(self.loader_pipe.stdin)
4306 except IOError:
4307 raise FatalError("svnadmin failed with the following output while "
4308 "loading the dumpfile:\n"
4309 + self.loader_pipe.stderr.read())
4311 def _feed_pipe(self):
4312 """Feed the revision stored in the dumpfile to the svnadmin
4313 load pipe."""
4315 self.dumpfile.seek(0)
4316 while 1:
4317 data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
4318 if not len(data):
4319 break
4320 try:
4321 self.loader_pipe.stdin.write(data)
4322 except IOError:
4323 raise FatalError("svnadmin failed with the following output "
4324 "while loading the dumpfile:\n"
4325 + self.loader_pipe.stderr.read())
4327 def start_commit(self, svn_commit):
4328 """Start a new commit. If a commit is already in progress, close
4329 the dumpfile, load it into the svn repository, open a new
4330 dumpfile, and write the header into it."""
4332 if self._commit_in_progress:
4333 self._feed_pipe()
4334 self.dumpfile.seek(0)
4335 self.dumpfile.truncate()
4336 DumpfileDelegate.start_commit(self, svn_commit)
4337 self._commit_in_progress = 1
4339 def finish(self):
4340 """Loads the last commit into the repository."""
4342 self._feed_pipe()
4343 self.dumpfile.close()
4344 self.loader_pipe.stdin.close()
4345 error_output = self.loader_pipe.stderr.read()
4346 exit_status = self.loader_pipe.wait()
4347 if exit_status:
4348 raise FatalError('svnadmin load failed with exit status: %s\n'
4349 'and the following output:\n'
4350 '%s' % (exit_status, error_output,))
4351 os.remove(self.dumpfile_path)
4353 # If this is a BDB repository, and we created the repository, and
4354 # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
4355 # line in the DB_CONFIG file, because txn syncing should be on by
4356 # default in BDB repositories.
4358 # We determine if this is a BDB repository by looking for the
4359 # DB_CONFIG file, which doesn't exist in FSFS, rather than by
4360 # checking Ctx().fs_type. That way this code will Do The Right
4361 # Thing in all circumstances.
4362 db_config = os.path.join(self.target, "db/DB_CONFIG")
4363 if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
4364 and os.path.exists(db_config)):
4365 no_sync = 'set_flags DB_TXN_NOSYNC\n'
4367 contents = open(db_config, 'r').readlines()
4368 index = contents.index(no_sync)
4369 contents[index] = '# ' + no_sync
4370 open(db_config, 'w').writelines(contents)
4373 class StdoutDelegate(SVNRepositoryMirrorDelegate):
4374 """Makes no changes to the disk, but writes out information to
4375 STDOUT about what the SVNRepositoryMirror is doing. Of course, our
4376 print statements will state that we're doing something, when in
4377 reality, we aren't doing anything other than printing out that we're
4378 doing something. Kind of zen, really."""
4380 def __init__(self, total_revs):
4381 self.total_revs = total_revs
4383 def start_commit(self, svn_commit):
4384 """Prints out the Subversion revision number of the commit that is
4385 being started."""
4387 Log().write(LOG_VERBOSE, "=" * 60)
4388 Log().write(LOG_NORMAL, "Starting Subversion r%d / %d" %
4389 (svn_commit.revnum, self.total_revs))
4391 def mkdir(self, path):
4392 """Print a line stating that we are creating directory PATH."""
4394 Log().write(LOG_VERBOSE, " New Directory", path)
4396 def add_path(self, s_item):
4397 """Print a line stating that we are 'adding' s_item.c_rev.svn_path."""
4399 Log().write(LOG_VERBOSE, " Adding", s_item.c_rev.svn_path)
4401 def change_path(self, s_item):
4402 """Print a line stating that we are 'changing' s_item.c_rev.svn_path."""
4404 Log().write(LOG_VERBOSE, " Changing", s_item.c_rev.svn_path)
4406 def delete_path(self, path):
4407 """Print a line stating that we are 'deleting' PATH."""
4409 Log().write(LOG_VERBOSE, " Deleting", path)
4411 def copy_path(self, src_path, dest_path, src_revnum):
4412 """Print a line stating that we are 'copying' revision SRC_REVNUM
4413 of SRC_PATH to DEST_PATH."""
4415 Log().write(LOG_VERBOSE, " Copying revision", src_revnum, "of", src_path)
4416 Log().write(LOG_VERBOSE, " to", dest_path)
4418 def finish(self):
4419 """State that we are done creating our repository."""
4421 Log().write(LOG_VERBOSE, "Finished creating Subversion repository.")
4422 Log().write(LOG_QUIET, "Done.")
4425 def pass1():
4426 OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
4427 Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
4428 cd = CollectData()
4430 def visit_file(baton, dirname, files):
4431 cd = baton
4432 for fname in files:
4433 verify_filename_legal(fname)
4434 if not fname.endswith(',v'):
4435 continue
4436 cd.found_valid_file = 1
4437 pathname = os.path.join(dirname, fname)
4438 if dirname.endswith(OS_SEP_PLUS_ATTIC):
4439 # drop the 'Attic' portion from the pathname for the canonical name.
4440 fdc = FileDataCollector(cd, os.path.join(dirname[:-6], fname),
4441 pathname)
4442 else:
4443 # If this file also exists in the attic, it's a fatal error
4444 attic_path = os.path.join(dirname, 'Attic', fname)
4445 if os.path.exists(attic_path):
4446 err = "%s: A CVS repository cannot contain both %s and %s" \
4447 % (error_prefix, pathname, attic_path)
4448 sys.stderr.write(err + '\n')
4449 cd.fatal_errors.append(err)
4450 fdc = FileDataCollector(cd, pathname, pathname)
4451 Log().write(LOG_NORMAL, pathname)
4452 try:
4453 cvs2svn_rcsparse.parse(open(pathname, 'rb'), fdc)
4454 except (cvs2svn_rcsparse.common.RCSParseError, ValueError,
4455 RuntimeError):
4456 err = "%s: '%s' is not a valid ,v file" \
4457 % (error_prefix, pathname)
4458 sys.stderr.write(err + '\n')
4459 cd.fatal_errors.append(err)
4460 except:
4461 Log().write(LOG_WARN,
4462 "Exception occurred while parsing %s" % pathname)
4463 raise
4465 os.path.walk(Ctx().project.project_cvs_repos_path, visit_file, cd)
4466 Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')
4468 cd.write_symbol_db()
4470 if len(cd.fatal_errors) > 0:
4471 raise FatalException("Pass 1 complete.\n"
4472 + "=" * 75 + "\n"
4473 + "Error summary:\n"
4474 + "\n".join(cd.fatal_errors) + "\n"
4475 + "Exited due to fatal error(s).\n")
4477 if cd.found_valid_file is None:
4478 raise FatalException(
4479 "\n"
4480 "No RCS files found in your CVS Repository!\n"
4481 "Are you absolutely certain you are pointing cvs2svn\n"
4482 "at a CVS repository?\n"
4483 "\n"
4484 "Exited due to fatal error(s).\n")
4486 StatsKeeper().reset_c_rev_info()
4487 StatsKeeper().archive()
4488 Log().write(LOG_QUIET, "Done")
4491 def pass2():
4492 """Pass 2: clean up the revision information."""
4494 symbol_db = SymbolDatabase()
4495 symbol_db.read()
4497 # Expand the exclusion regexps into a hash of excluded symbol names
4498 excludes = symbol_db.find_excluded_symbols(Ctx().excludes)
4500 error_detected = 0
4502 Log().write(LOG_QUIET, "Checking for blocked exclusions...")
4503 blocked_excludes = symbol_db.find_blocked_excludes(excludes)
4504 if blocked_excludes:
4505 for branch, blockers in blocked_excludes.items():
4506 sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
4507 "excluded because the following symbols depend "
4508 "on it:\n" % (branch))
4509 for blocker in blockers:
4510 sys.stderr.write(" '%s'\n" % (blocker))
4511 sys.stderr.write("\n")
4512 error_detected = 1
4514 Log().write(LOG_QUIET, "Checking for forced tags with commits...")
4515 invalid_forced_tags = [ ]
4516 for forced_tag in Ctx().forced_tags:
4517 if excludes.has_key(forced_tag):
4518 continue
4519 if symbol_db.branch_has_commit(forced_tag):
4520 invalid_forced_tags.append(forced_tag)
4521 if invalid_forced_tags:
4522 sys.stderr.write(error_prefix + ": The following branches cannot be "
4523 "forced to be tags because they have commits:\n")
4524 for tag in invalid_forced_tags:
4525 sys.stderr.write(" '%s'\n" % (tag))
4526 sys.stderr.write("\n")
4527 error_detected = 1
4529 Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
4530 mismatches = symbol_db.find_mismatches(excludes)
4531 def is_not_forced(mismatch):
4532 name = mismatch[0]
4533 return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
4534 mismatches = filter(is_not_forced, mismatches)
4535 if mismatches:
4536 sys.stderr.write(error_prefix + ": The following symbols are tags "
4537 "in some files and branches in others.\nUse "
4538 "--force-tag, --force-branch and/or --exclude to "
4539 "resolve the symbols.\n")
4540 for name, tag_count, branch_count, commit_count in mismatches:
4541 sys.stderr.write(" '%s' is a tag in %d files, a branch in "
4542 "%d files and has commits in %d files.\n"
4543 % (name, tag_count, branch_count, commit_count))
4544 error_detected = 1
4546 # Bail out now if we found errors
4547 if error_detected:
4548 sys.exit(1)
4550 # Create the tags database
4551 tags_db = TagsDatabase(DB_OPEN_NEW)
4552 for tag in symbol_db.tags:
4553 if tag not in Ctx().forced_branches:
4554 tags_db.add(tag)
4555 for tag in Ctx().forced_tags:
4556 tags_db.add(tag)
4558 Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")
4560 # We may have recorded some changes in revisions' timestamp. We need to
4561 # scan for any other files which may have had the same log message and
4562 # occurred at "the same time" and change their timestamps, too.
4564 # read the resync data file
4565 def read_resync(fname):
4566 """Read the .resync file into memory."""
4568 ### note that we assume that we can hold the entire resync file in
4569 ### memory. really large repositories with whacky timestamps could
4570 ### bust this assumption. should that ever happen, then it is possible
4571 ### to split the resync file into pieces and make multiple passes,
4572 ### using each piece.
4575 # A digest maps to a sequence of lists which specify a lower and upper
4576 # time bound for matching up the commit. We keep a sequence of these
4577 # because a number of checkins with the same log message (e.g. an empty
4578 # log message) could need to be remapped. We also make them a list
4579 # because we will dynamically expand the lower/upper bound as we find
4580 # commits that fall into a particular msg and time range.
4582 # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
4584 resync = { }
4586 for line in fileinput.FileInput(fname):
4587 t1 = int(line[:8], 16)
4588 digest = line[9:DIGEST_END_IDX]
4589 t2 = int(line[DIGEST_END_IDX+1:], 16)
4590 t1_l = t1 - COMMIT_THRESHOLD/2
4591 t1_u = t1 + COMMIT_THRESHOLD/2
4592 resync.setdefault(digest, []).append([t1_l, t1_u, t2])
4594 # For each digest, sort the resync items in it in increasing order,
4595 # based on the lower time bound.
4596 for val in resync.values():
4597 val.sort()
4599 return resync
4601 resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))
4603 output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
4604 Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)
4606 tweaked_timestamps_db = Database(temp(TWEAKED_TIMESTAMPS_DB), DB_OPEN_NEW)
4607 Cleanup().register(temp(TWEAKED_TIMESTAMPS_DB), pass2)
4609 # process the revisions file, looking for items to clean up
4610 for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
4611 c_rev = CVSRevision(Ctx(), line[:-1])
4613 # Skip this entire revision if it's on an excluded branch
4614 if excludes.has_key(c_rev.branch_name):
4615 continue
4617 new_prev_ts = None
4618 if c_rev.prev_rev is not None:
4619 new_prev_ts = tweaked_timestamps_db.get(
4620 c_rev.unique_key(c_rev.prev_rev), None)
4621 if new_prev_ts:
4622 c_rev.prev_timestamp = new_prev_ts
4624 new_next_ts = None
4625 if c_rev.next_rev is not None:
4626 new_next_ts = tweaked_timestamps_db.get(
4627 c_rev.unique_key(c_rev.next_rev), None)
4628 if new_next_ts:
4629 c_rev.next_timestamp = new_next_ts
4631 # Remove all references to excluded tags and branches
4632 def not_excluded(symbol, excludes=excludes):
4633 return not excludes.has_key(symbol)
4634 c_rev.branches = filter(not_excluded, c_rev.branches)
4635 c_rev.tags = filter(not_excluded, c_rev.tags)
4637 # Convert all branches that are forced to be tags
4638 for forced_tag in Ctx().forced_tags:
4639 if forced_tag in c_rev.branches:
4640 c_rev.branches.remove(forced_tag)
4641 c_rev.tags.append(forced_tag)
4643 # Convert all tags that are forced to be branches
4644 for forced_branch in Ctx().forced_branches:
4645 if forced_branch in c_rev.tags:
4646 c_rev.tags.remove(forced_branch)
4647 c_rev.branches.append(forced_branch)
4649 # see if this is "near" any of the resync records we
4650 # have recorded for this digest [of the log message].
4651 for record in resync.get(c_rev.digest, []):
4652 if record[2] == c_rev.timestamp:
4653 # This means that either c_rev is the same revision that
4654 # caused the resync record to exist, or c_rev is a different
4655 # CVS revision that happens to have the same timestamp. In
4656 # either case, we don't have to do anything, so we...
4657 continue
4659 if record[0] <= c_rev.timestamp <= record[1]:
4660 # bingo! We probably want to remap the time on this c_rev,
4661 # unless the remapping would be useless because the new time
4662 # would fall outside the COMMIT_THRESHOLD window for this
4663 # commit group.
4664 new_timestamp = record[2]
4665 # If the new timestamp is earlier than that of our previous revision
4666 if new_timestamp < c_rev.prev_timestamp:
4667 desc = ("%s: Attempt to set timestamp of revision %s on file %s"
4668 + " to time %s, which is before the time of the previous"
4669 + " revision %s (%s):")
4670 Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
4671 c_rev.cvs_path, new_timestamp,
4672 c_rev.prev_rev, c_rev.prev_timestamp))
4673 # If resyncing our rev to c_rev.prev_timestamp + 1 will place
4674 # the timestamp of c_rev within COMMIT_THRESHOLD of the
4675 # attempted resync time, then sync back to c_rev.prev_timestamp
4676 # + 1...
4677 if ((c_rev.prev_timestamp + 1) - new_timestamp) < COMMIT_THRESHOLD:
4678 new_timestamp = c_rev.prev_timestamp + 1
4679 Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
4680 new_timestamp))
4681 else:
4682 Log().write(LOG_WARN, "%s: Timestamp left untouched" %
4683 warning_prefix)
4684 continue
4686 # If the new timestamp is later than that of our next revision
4687 elif c_rev.next_timestamp and new_timestamp > c_rev.next_timestamp:
4688 desc = ("%s: Attempt to set timestamp of revision %s on file %s"
4689 + " to time %s, which is after the time of the next"
4690 + " revision %s (%s):")
4691 Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
4692 c_rev.cvs_path, new_timestamp,
4693 c_rev.next_rev, c_rev.next_timestamp))
4694 # If resyncing our rev to c_rev.next_timestamp - 1 will place
4695 # the timestamp of c_rev within COMMIT_THRESHOLD of the
4696 # attempted resync time, then sync forward to c_rev.next_timestamp
4697 # - 1...
4698 if (new_timestamp - (c_rev.next_timestamp - 1)) < COMMIT_THRESHOLD:
4699 new_timestamp = c_rev.next_timestamp - 1
4700 Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
4701 new_timestamp))
4702 else:
4703 Log().write(LOG_WARN, "%s: Timestamp left untouched" %
4704 warning_prefix)
4705 continue
4707 # Fix for Issue #71: Avoid resyncing two consecutive revisions
4708 # to the same timestamp.
4709 elif (new_timestamp == c_rev.prev_timestamp
4710 or new_timestamp == c_rev.next_timestamp):
4711 continue
4713 # adjust the time range. we want the COMMIT_THRESHOLD from the
4714 # bounds of the earliest/latest commit in this group.
4715 record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
4716 record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)
4718 msg = "PASS2 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
4719 % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
4720 new_timestamp - c_rev.timestamp)
4721 Log().write(LOG_VERBOSE, msg)
4723 c_rev.timestamp = new_timestamp
4724 tweaked_timestamps_db[c_rev.unique_key()] = new_timestamp
4726 # stop looking for hits
4727 break
4729 output.write(str(c_rev) + "\n")
4730 Log().write(LOG_QUIET, "Done")
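# A minimal, self-contained sketch (not called by any pass) of the core
# remapping decision made above.  It omits the clamping of the new time
# to prev_timestamp + 1 / next_timestamp - 1 that pass2 performs, and
# the helper's name and signature are illustrative only.
def _example_resync_decision(timestamp, prev_timestamp, next_timestamp, record):
  """Return the remapped time for TIMESTAMP, or None to leave it alone.

  RECORD is a [old_time_lower, old_time_upper, new_time] list as built
  by read_resync() inside pass2."""
  old_lower, old_upper, new_time = record
  if new_time == timestamp:
    # Already at the target time; nothing to do.
    return None
  if not (old_lower <= timestamp <= old_upper):
    # TIMESTAMP falls outside this record's window.
    return None
  if prev_timestamp is not None and new_time <= prev_timestamp:
    # Remapping would reorder the revision with its predecessor.
    return None
  if next_timestamp is not None and new_time >= next_timestamp:
    # Remapping would reorder the revision with its successor.
    return None
  return new_time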
4733 def pass3():
4734 Log().write(LOG_QUIET, "Sorting CVS revisions...")
4735 sort_file(temp(DATAFILE + CLEAN_REVS_SUFFIX),
4736 temp(DATAFILE + SORTED_REVS_SUFFIX))
4737 Cleanup().register(temp(DATAFILE + SORTED_REVS_SUFFIX), pass5)
4738 Log().write(LOG_QUIET, "Done")
4741 def pass4():
4742 """Iterate through sorted revs, storing them in a database.
4743 If we're not doing a trunk-only conversion, generate the
4744 LastSymbolicNameDatabase, which contains the last CVSRevision
4745 that is a source for each tag or branch."""
4747 Log().write(LOG_QUIET,
4748 "Copying CVS revision data from flat file to database...")
4749 cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)
4750 if not Ctx().trunk_only:
4751 Log().write(LOG_QUIET,
4752 "Finding last CVS revisions for all symbolic names...")
4753 last_sym_name_db = LastSymbolicNameDatabase()
4754 else:
4755 # This is to avoid testing Ctx().trunk_only every time around the loop
4756 class DummyLSNDB:
4757 def noop(*args): pass
4758 log_revision = noop
4759 create_database = noop
4760 last_sym_name_db = DummyLSNDB()
4762 for line in fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX)):
4763 c_rev = CVSRevision(Ctx(), line[:-1])
4764 cvs_revs_db.log_revision(c_rev)
4765 last_sym_name_db.log_revision(c_rev)
4766 StatsKeeper().record_c_rev(c_rev)
4768 last_sym_name_db.create_database()
4769 StatsKeeper().archive()
4770 Log().write(LOG_QUIET, "Done")
4773 def pass5():
4774 """Generate the SVNCommit <-> CVSRevision mapping databases.
4775 CVSCommit._commit also calls SymbolingsLogger to register
4776 CVSRevisions that represent an opening or closing for a path on a
4777 branch or tag. See SymbolingsLogger for more details."""
4779 Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")
4781 aggregator = CVSRevisionAggregator()
4782 for line in fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX)):
4783 c_rev = CVSRevision(Ctx(), line[:-1])
4784 if not (Ctx().trunk_only and c_rev.branch_name is not None):
4785 aggregator.process_revision(c_rev)
4786 aggregator.flush()
4788 StatsKeeper().set_svn_rev_count(SVNCommit.revnum - 1)
4789 StatsKeeper().archive()
4790 Log().write(LOG_QUIET, "Done")
4793 def pass6():
4794 Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")
4796 if not Ctx().trunk_only:
4797 sort_file(temp(SYMBOL_OPENINGS_CLOSINGS),
4798 temp(SYMBOL_OPENINGS_CLOSINGS_SORTED))
4799 Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), pass8)
4800 Log().write(LOG_QUIET, "Done")
4803 def pass7():
4804 Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")
4806 def generate_offsets_for_symbolings():
4807 """This function iterates through all the lines in
4808 SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
4809 SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
4810 where SYMBOLIC_NAME is first encountered. This will allow us to
4811 seek to the various offsets in the file and sequentially read only
4812 the openings and closings that we need."""
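# For example (hypothetical data), if the sorted file contained
#
#   BRANCH_A 12 <cvs-rev-key>
#   BRANCH_A 15 <cvs-rev-key>
#   TAG_B 20 <cvs-rev-key>
#
# then offsets_db would map 'BRANCH_A' to the byte offset of the first
# BRANCH_A line and 'TAG_B' to the byte offset of the TAG_B line, letting
# later passes seek() directly to a symbol's openings and closings.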
4814 ###PERF This is a fine example of a db that can be in-memory and
4815 #just flushed to disk when we're done. Later, it can just be sucked
4816 #back into memory.
4817 offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
4818 Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)
4820 file = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
4821 old_sym = ""
4822 while 1:
4823 fpos = file.tell()
4824 line = file.readline()
4825 if not line:
4826 break
4827 sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
4828 if sym != old_sym:
4829 Log().write(LOG_VERBOSE, " ", sym)
4830 old_sym = sym
4831 offsets_db[sym] = fpos
4833 if not Ctx().trunk_only:
4834 generate_offsets_for_symbolings()
4835 Log().write(LOG_QUIET, "Done.")
4838 def pass8():
4839 svncounter = 2 # Repository initialization is 1.
4840 repos = SVNRepositoryMirror()
4841 persistence_manager = PersistenceManager(DB_OPEN_READ)
4843 if Ctx().target:
4844 if not Ctx().dry_run:
4845 repos.add_delegate(RepositoryDelegate())
4846 Log().write(LOG_QUIET, "Starting Subversion Repository.")
4847 else:
4848 if not Ctx().dry_run:
4849 repos.add_delegate(DumpfileDelegate())
4850 Log().write(LOG_QUIET, "Starting Subversion Dumpfile.")
4852 repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))
4854 while 1:
4855 svn_commit = persistence_manager.get_svn_commit(svncounter)
4856 if not svn_commit:
4857 break
4858 repos.commit(svn_commit)
4859 svncounter += 1
4861 repos.finish()
4863 _passes = [
4864 pass1,
4865 pass2,
4866 pass3,
4867 pass4,
4868 pass5,
4869 pass6,
4870 pass7,
4871 pass8,
4872 ]
4875 class Ctx:
4876 """Session state for this run of cvs2svn. For example, run-time
4877 options are stored here. This class is a Borg, see
4878 http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
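# Because every instance shares __shared_state, code anywhere in this
# file can, for example, set
#
#   Ctx().trunk_only = 1
#
# and any Ctx() constructed later will see the same value; no context
# object needs to be passed around explicitly.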
4880 __shared_state = { }
4882 def __init__(self):
4883 self.__dict__ = self.__shared_state
4884 if self.__dict__:
4885 return
4886 # Else, initialize to defaults.
4887 self.target = None
4888 self.dumpfile = DUMPFILE
4889 self.tmpdir = '.'
4890 self.verbose = 0
4891 self.quiet = 0
4892 self.prune = 1
4893 self.existing_svnrepos = 0
4894 self.dump_only = 0
4895 self.dry_run = 0
4896 self.trunk_only = 0
4897 self.trunk_base = "trunk"
4898 self.tags_base = "tags"
4899 self.branches_base = "branches"
4900 self.encoding = ["ascii"]
4901 self.mime_types_file = None
4902 self.auto_props_file = None
4903 self.auto_props_ignore_case = False
4904 self.no_default_eol = 0
4905 self.eol_from_mime_type = 0
4906 self.keywords_off = 0
4907 self.use_cvs = None
4908 self.svnadmin = "svnadmin"
4909 self.username = None
4910 self.print_help = 0
4911 self.skip_cleanup = 0
4912 self.bdb_txn_nosync = 0
4913 self.fs_type = None
4914 self.forced_branches = []
4915 self.forced_tags = []
4916 self.excludes = []
4917 self.symbol_transforms = []
4918 self.svn_property_setters = []
4921 class SVNPropertySetter:
4922 """Abstract class for objects that can set properties on a SVNCommitItem."""
4924 def set_properties(self, s_item):
4925 """Set any properties that can be determined for S_ITEM."""
4927 raise NotImplementedError
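# The concrete setters below are appended to Ctx().svn_property_setters
# in main(), in an order that matters because several of them only act
# when a property has not already been set.  A sketch of how they are
# presumably applied (the actual driver code lives elsewhere in this
# file; names here are illustrative only):
#
#   for setter in Ctx().svn_property_setters:
#     setter.set_properties(s_item)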
4930 class CVSRevisionNumberSetter(SVNPropertySetter):
4931 """Set the cvs2svn:cvs-rev property to the CVS revision number."""
4933 def set_properties(self, s_item):
4934 s_item.svn_props['cvs2svn:cvs-rev'] = s_item.c_rev.rev
4935 s_item.svn_props_changed = True
4938 class ExecutablePropertySetter(SVNPropertySetter):
4939 """Set the svn:executable property based on c_rev.file_executable."""
4941 def set_properties(self, s_item):
4942 if s_item.c_rev.file_executable:
4943 s_item.svn_props['svn:executable'] = '*'
4946 class BinaryFileEOLStyleSetter(SVNPropertySetter):
4947 """Set the eol-style for binary files to None."""
4949 def set_properties(self, s_item):
4950 if s_item.c_rev.mode == 'b':
4951 s_item.svn_props['svn:eol-style'] = None
4954 class MimeMapper(SVNPropertySetter):
4955 """A class that provides mappings from file names to MIME types."""
4957 def __init__(self, mime_types_file):
4958 self.mappings = { }
4960 for line in fileinput.input(mime_types_file):
4961 if line.startswith("#"):
4962 continue
4964 # format of a line is something like
4965 # text/plain c h cpp
4966 extensions = line.split()
4967 if len(extensions) < 2:
4968 continue
4969 type = extensions.pop(0)
4970 for ext in extensions:
4971 if self.mappings.has_key(ext) and self.mappings[ext] != type:
4972 sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n"
4973 % (warning_prefix, ext, self.mappings[ext], type))
4974 self.mappings[ext] = type
4976 def set_properties(self, s_item):
4977 basename, extension = os.path.splitext(
4978 os.path.basename(s_item.c_rev.cvs_path)
4979 )
4981 # Extension includes the dot, so strip it (will leave extension
4982 # empty if filename ends with a dot, which is ok):
4983 extension = extension[1:]
4985 # If there is no extension (or the file ends with a period), use
4986 # the base name for mapping. This allows us to set mappings for
4987 # files such as README or Makefile:
4988 if not extension:
4989 extension = basename
4991 mime_type = self.mappings.get(extension, None)
4992 if mime_type is not None:
4993 s_item.svn_props['svn:mime-type'] = mime_type
4996 class AutoPropsPropertySetter(SVNPropertySetter):
4997 """Set arbitrary svn properties based on an auto-props configuration.
4999 This class supports case-sensitive or case-insensitive pattern
5000 matching. The 'correct' behavior is not quite clear, because
5001 subversion itself does an inconsistent job of handling case in
5002 auto-props patterns; see
5003 http://subversion.tigris.org/issues/show_bug.cgi?id=2036.
5005 If a property specified in auto-props has already been set to a
5006 different value, print a warning and leave the old property value
5007 unchanged."""
5009 class Pattern:
5010 """Describes the properties to be set for files matching a pattern."""
5012 def __init__(self, pattern, propdict):
5013 # A glob-like pattern:
5014 self.pattern = pattern
5015 # A dictionary of properties that should be set:
5016 self.propdict = propdict
5018 def match(self, basename):
5019 """Does the file with the specified basename match pattern?"""
5021 return fnmatch.fnmatch(basename, self.pattern)
5023 def __init__(self, configfilename, ignore_case):
5024 config = ConfigParser.ConfigParser()
5025 if ignore_case:
5026 self.transform_case = self.squash_case
5027 else:
5028 config.optionxform = self.preserve_case
5029 self.transform_case = self.preserve_case
5031 config.readfp(file(configfilename))
5032 self.patterns = []
5033 for section in config.sections():
5034 if self.transform_case(section) == 'auto-props':
5035 for pattern in config.options(section):
5036 value = config.get(section, pattern)
5037 if value:
5038 self._add_pattern(pattern, value)
5040 def squash_case(self, s):
5041 return s.lower()
5043 def preserve_case(self, s):
5044 return s
5046 def _add_pattern(self, pattern, value):
5047 props = value.split(';')
5048 propdict = {}
5049 for prop in props:
5050 s = prop.split('=', 1)
5051 if len(s) == 1:
5052 propdict[s[0]] = None
5053 else:
5054 propdict[s[0]] = s[1]
5055 self.patterns.append(
5056 self.Pattern(self.transform_case(pattern), propdict))
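# For example (hypothetical config), an [auto-props] line such as
#
#   *.sh = svn:executable;svn:eol-style=native
#
# becomes Pattern('*.sh', {'svn:executable': None,
#                          'svn:eol-style': 'native'}),
# with the pattern lowercased first when --auto-props-ignore-case is used.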
5058 def get_propdict(self, path):
5059 basename = self.transform_case(os.path.basename(path))
5060 propdict = {}
5061 for pattern in self.patterns:
5062 if pattern.match(basename):
5063 for (key,value) in pattern.propdict.items():
5064 if propdict.has_key(key):
5065 if propdict[key] != value:
5066 Log().write(
5067 LOG_WARN,
5068 "Contradictory values set for property '%s' for file %s."
5069 % (key, path,))
5070 else:
5071 propdict[key] = value
5073 return propdict
5075 def set_properties(self, s_item):
5076 propdict = self.get_propdict(s_item.c_rev.cvs_path)
5077 for (k,v) in propdict.items():
5078 if s_item.svn_props.has_key(k):
5079 if s_item.svn_props[k] != v:
5080 Log().write(
5081 LOG_WARN,
5082 "Property '%s' already set to %r for file %s; "
5083 "auto-props value (%r) ignored."
5084 % (k, s_item.svn_props[k], s_item.c_rev.cvs_path, v,))
5085 else:
5086 s_item.svn_props[k] = v
5089 class BinaryFileDefaultMimeTypeSetter(SVNPropertySetter):
5090 """If the file is binary and its svn:mime-type property is not yet
5091 set, set it to 'application/octet-stream'."""
5093 def set_properties(self, s_item):
5094 if not s_item.svn_props.has_key('svn:mime-type') \
5095 and s_item.c_rev.mode == 'b':
5096 s_item.svn_props['svn:mime-type'] = 'application/octet-stream'
5099 class EOLStyleFromMimeTypeSetter(SVNPropertySetter):
5100 """Set svn:eol-style based on svn:mime-type.
5102 If svn:mime-type is known but svn:eol-style is not, then set
5103 svn:eol-style based on svn:mime-type as follows: if svn:mime-type
5104 starts with 'text/', then set svn:eol-style to native; otherwise,
5105 force it to remain unset. See also issue #39."""
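# For example, an item whose svn:mime-type is already 'text/x-python'
# gets svn:eol-style 'native', while one whose svn:mime-type is
# 'image/png' gets svn:eol-style explicitly set to None, so that the
# DefaultEOLStyleSetter registered later in main() leaves it alone.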
5107 def set_properties(self, s_item):
5108 if not s_item.svn_props.has_key('svn:eol-style') \
5109 and s_item.svn_props.get('svn:mime-type', None) is not None:
5110 if s_item.svn_props['svn:mime-type'].startswith("text/"):
5111 s_item.svn_props['svn:eol-style'] = 'native'
5112 else:
5113 s_item.svn_props['svn:eol-style'] = None
5116 class DefaultEOLStyleSetter(SVNPropertySetter):
5117 """Set the eol-style if one has not already been set."""
5119 def __init__(self, value):
5120 """Initialize with the specified default VALUE."""
5122 self.value = value
5124 def set_properties(self, s_item):
5125 if not s_item.svn_props.has_key('svn:eol-style'):
5126 s_item.svn_props['svn:eol-style'] = self.value
5129 class KeywordsPropertySetter(SVNPropertySetter):
5130 """If the svn:keywords property is not yet set, set it based on the
5131 file's mode. See issue #2."""
5133 def __init__(self, value):
5134 """Use VALUE for the value of the svn:keywords property if it is
5135 to be set."""
5137 self.value = value
5139 def set_properties(self, s_item):
5140 if not s_item.svn_props.has_key('svn:keywords') \
5141 and s_item.c_rev.mode in [None, 'kv', 'kvl']:
5142 s_item.svn_props['svn:keywords'] = self.value
5145 def convert(start_pass, end_pass):
5146 """Convert a CVS repository to an SVN repository."""
5148 cleanup = Cleanup()
5149 times = [ None ] * (end_pass + 1)
5150 times[start_pass - 1] = time.time()
5151 StatsKeeper().set_start_time(time.time())
5152 for i in range(start_pass - 1, end_pass):
5153 Log().write(LOG_QUIET, '----- pass %d -----' % (i + 1))
5154 _passes[i]()
5155 times[i + 1] = time.time()
5156 StatsKeeper().log_duration_for_pass(times[i + 1] - times[i], i + 1)
5157 # Dispose of items in Ctx() not intended to live past the end of the pass
5158 # (Identified by exactly one leading underscore)
5159 for attr in dir(Ctx()):
5160 if (len(attr) > 2 and attr[0] == '_' and attr[1] != '_'
5161 and attr[:6] != "_Ctx__"):
5162 delattr(Ctx(), attr)
5163 if not Ctx().skip_cleanup:
5164 cleanup.cleanup(_passes[i])
5165 StatsKeeper().set_end_time(time.time())
5167 Log().write(LOG_QUIET, StatsKeeper())
5168 if end_pass < 4:
5169 Log().write(LOG_QUIET,
5170 '(These are unaltered CVS repository stats and do not\n'
5171 ' reflect tags or branches excluded via --exclude)\n')
5172 Log().write(LOG_NORMAL, StatsKeeper().timings())
5175 def normalize_ttb_path(opt, path):
5176 """Normalize a path to be used for --trunk, --tags, or --branches.
5178 1. Strip leading, trailing, and duplicated '/'.
5179 2. Verify that the path is not empty.
5181 Return the normalized path.
5183 If the path is invalid, raise a FatalError."""
5185 norm_path = _path_join(*path.split('/'))
5186 if not norm_path:
5187 raise FatalError("cannot pass an empty path to %s." % (opt,))
5188 return norm_path
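# For example, assuming _path_join() drops the empty components produced
# by split('/') (as the docstring above describes),
# normalize_ttb_path('--trunk', '/my//project/trunk/') returns
# 'my/project/trunk', while normalize_ttb_path('--tags', '//') raises
# FatalError because nothing is left after stripping.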
5191 def verify_paths_disjoint(*paths):
5192 """Verify that all of the paths in the argument list are disjoint.
5194 If any of the paths is nested in another one (i.e., in the sense
5195 that 'a/b/c/d' is nested in 'a/b'), or any two paths are identical,
5196 raise a FatalError."""
5198 paths = [(path.split('/'), path) for path in paths]
5199 # If all overlapping elements are equal, a shorter list is
5200 # considered "less than" a longer one. Therefore if any paths are
5201 # nested, this sort will leave at least one such pair adjacent, in
5202 # the order [nest,nestling].
5203 paths.sort()
5204 for i in range(1, len(paths)):
5205 split_path1, path1 = paths[i - 1]
5206 split_path2, path2 = paths[i]
5207 if len(split_path1) <= len(split_path2) \
5208 and split_path2[:len(split_path1)] == split_path1:
5209 raise FatalError("paths %s and %s are not disjoint." % (path1, path2,))
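# For example, verify_paths_disjoint('trunk', 'branches', 'tags') returns
# quietly, whereas verify_paths_disjoint('a/b', 'a/b/c'), or any call with
# two identical paths, raises FatalError, since one path is nested inside
# (or equal to) another.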
5212 def usage():
5213 print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
5214 % os.path.basename(sys.argv[0])
5215 print ' --help, -h print this usage message and exit with success'
5216 print ' --version print the version number'
5217 print ' -q quiet'
5218 print ' -v verbose'
5219 print ' -s PATH path for SVN repos'
5220 print ' -p START[:END] start at pass START, end at pass END of %d' \
5221 % len(_passes)
5222 print ' If only START is given, run only pass START'
5223 print ' (implicitly enables --skip-cleanup)'
5224 print ' --existing-svnrepos load into existing SVN repository'
5225 print ' --dumpfile=PATH name of intermediate svn dumpfile'
5226 print ' --tmpdir=PATH directory to use for tmp data (defaults to cwd)'
5227 print ' --profile profile with \'hotshot\' (into file cvs2svn.hotshot)'
5228 print ' --dry-run do not create a repository or a dumpfile;'
5229 print ' just print what would happen.'
5230 print ' --use-cvs use CVS instead of RCS \'co\' to extract data'
5231 print ' (only use this if having problems with RCS)'
5232 print ' --svnadmin=PATH path to the svnadmin program'
5233 print ' --trunk-only convert only trunk commits, not tags or branches'
5234 print ' --trunk=PATH path for trunk (default: %s)' \
5235 % Ctx().trunk_base
5236 print ' --branches=PATH path for branches (default: %s)' \
5237 % Ctx().branches_base
5238 print ' --tags=PATH path for tags (default: %s)' \
5239 % Ctx().tags_base
5240 print ' --no-prune don\'t prune empty directories'
5241 print ' --dump-only just produce a dumpfile, don\'t commit to a repos'
5242 print ' --encoding=ENC encoding of paths and log messages in CVS repos'
5243 print ' This option may be passed multiple times; the values'
5244 print ' will be treated as an ordered list of encodings to'
5245 print ' attempt (with "ascii" as a hardcoded last resort)'
5246 print ' --force-branch=NAME force NAME to be a branch'
5247 print ' --force-tag=NAME force NAME to be a tag'
5248 print ' --exclude=REGEXP exclude branches and tags matching REGEXP'
5249 print ' --symbol-transform=P:S transform symbol names from P to S where P and S'
5250 print ' use Python regexp and reference syntax respectively'
5251 print ' --username=NAME username for cvs2svn-synthesized commits'
5252 print ' --skip-cleanup prevent the deletion of intermediate files'
5253 print ' --bdb-txn-nosync pass --bdb-txn-nosync to "svnadmin create"'
5254 print ' --fs-type=TYPE pass --fs-type=TYPE to "svnadmin create"'
5255 print ' --cvs-revnums record CVS revision numbers as file properties'
5256 print ' --auto-props=FILE set file properties from the auto-props section'
5257 print ' of a file in svn config format'
5258 print ' --auto-props-ignore-case ignore case when matching auto-props patterns'
5259 print ' --mime-types=FILE specify an apache-style mime.types file for'
5260 print ' setting svn:mime-type'
5261 print ' --eol-from-mime-type set svn:eol-style from mime type if known'
5262 print ' --no-default-eol don\'t set svn:eol-style to \'native\' for'
5263 print ' non-binary files with undetermined mime types'
5264 print ' --keywords-off don\'t set svn:keywords on any files (by default,'
5265 print ' cvs2svn sets svn:keywords on non-binary files to'
5266 print ' "%s")' % SVN_KEYWORDS_VALUE
5269 def main():
5270 # Convenience var, so we don't have to keep instantiating this Borg.
5271 ctx = Ctx()
5273 profiling = None
5274 start_pass = 1
5275 end_pass = len(_passes)
5277 try:
5278 opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
5279 [ "help", "create", "trunk=",
5280 "username=", "existing-svnrepos",
5281 "branches=", "tags=", "encoding=",
5282 "force-branch=", "force-tag=", "exclude=",
5283 "use-cvs", "mime-types=",
5284 "auto-props=", "auto-props-ignore-case",
5285 "eol-from-mime-type", "no-default-eol",
5286 "trunk-only", "no-prune", "dry-run",
5287 "dump-only", "dumpfile=", "tmpdir=",
5288 "svnadmin=", "skip-cleanup", "cvs-revnums",
5289 "bdb-txn-nosync", "fs-type=",
5290 "version", "profile",
5291 "keywords-off", "symbol-transform="])
5292 except getopt.GetoptError, e:
5293 sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
5294 usage()
5295 sys.exit(1)
5297 for opt, value in opts:
5298 if opt == '--version':
5299 print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
5300 sys.exit(0)
5301 elif opt == '-p':
5302 # Don't clean up if we're doing incrementals.
5303 ctx.skip_cleanup = 1
5304 if value.find(':') > 0:
5305 start_pass, end_pass = map(int, value.split(':'))
5306 else:
5307 end_pass = start_pass = int(value)
5308 if start_pass > len(_passes) or start_pass < 1:
5309 raise FatalError(
5310 'illegal value (%d) for starting pass. Must be 1 through %d.'
5311 % (int(start_pass), len(_passes),))
5312 if end_pass < start_pass or end_pass > len(_passes):
5313 raise FatalError(
5314 'illegal value (%d) for ending pass. Must be %d through %d.'
5315 % (int(end_pass), int(start_pass), len(_passes),))
5316 elif (opt == '--help') or (opt == '-h'):
5317 ctx.print_help = 1
5318 elif opt == '-v':
5319 Log().log_level = LOG_VERBOSE
5320 ctx.verbose = 1
5321 elif opt == '-q':
5322 Log().log_level = LOG_QUIET
5323 ctx.quiet = 1
5324 elif opt == '-s':
5325 ctx.target = value
5326 elif opt == '--existing-svnrepos':
5327 ctx.existing_svnrepos = 1
5328 elif opt == '--dumpfile':
5329 ctx.dumpfile = value
5330 elif opt == '--tmpdir':
5331 ctx.tmpdir = value
5332 elif opt == '--use-cvs':
5333 ctx.use_cvs = 1
5334 elif opt == '--svnadmin':
5335 ctx.svnadmin = value
5336 elif opt == '--trunk-only':
5337 ctx.trunk_only = 1
5338 elif opt == '--trunk':
5339 ctx.trunk_base = normalize_ttb_path(opt, value)
5340 elif opt == '--branches':
5341 ctx.branches_base = normalize_ttb_path(opt, value)
5342 elif opt == '--tags':
5343 ctx.tags_base = normalize_ttb_path(opt, value)
5344 elif opt == '--no-prune':
5345 ctx.prune = None
5346 elif opt == '--dump-only':
5347 ctx.dump_only = 1
5348 elif opt == '--dry-run':
5349 ctx.dry_run = 1
5350 elif opt == '--encoding':
5351 ctx.encoding.insert(-1, value)
5352 elif opt == '--force-branch':
5353 ctx.forced_branches.append(value)
5354 elif opt == '--force-tag':
5355 ctx.forced_tags.append(value)
5356 elif opt == '--exclude':
5357 try:
5358 ctx.excludes.append(re.compile('^' + value + '$'))
5359 except re.error, e:
5360 raise FatalError("'%s' is not a valid regexp." % (value,))
5361 elif opt == '--mime-types':
5362 ctx.mime_types_file = value
5363 elif opt == '--auto-props':
5364 ctx.auto_props_file = value
5365 elif opt == '--auto-props-ignore-case':
5366 ctx.auto_props_ignore_case = True
5367 elif opt == '--eol-from-mime-type':
5368 ctx.eol_from_mime_type = 1
5369 elif opt == '--no-default-eol':
5370 ctx.no_default_eol = 1
5371 elif opt == '--keywords-off':
5372 ctx.keywords_off = 1
5373 elif opt == '--username':
5374 ctx.username = value
5375 elif opt == '--skip-cleanup':
5376 ctx.skip_cleanup = 1
5377 elif opt == '--cvs-revnums':
5378 ctx.svn_property_setters.append(CVSRevisionNumberSetter())
5379 elif opt == '--bdb-txn-nosync':
5380 ctx.bdb_txn_nosync = 1
5381 elif opt == '--fs-type':
5382 ctx.fs_type = value
5383 elif opt == '--create':
5384 sys.stderr.write(warning_prefix +
5385 ': The behaviour produced by the --create option is now the '
5386 'default,\nand passing the option is deprecated.\n')
5387 elif opt == '--profile':
5388 profiling = 1
5389 elif opt == '--symbol-transform':
5390 [pattern, replacement] = value.split(":")
5391 try:
5392 pattern = re.compile(pattern)
5393 except re.error, e:
5394 raise FatalError("'%s' is not a valid regexp." % (pattern,))
5395 ctx.symbol_transforms.append((pattern, replacement,))
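# As a hypothetical example, --symbol-transform='RELEASE_(.*):release-\1'
# stores the compiled pattern together with the replacement string
# 'release-\1'; according to usage(), later stages use these to rewrite
# symbol names such as RELEASE_1_0 into release-1_0.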
5397 if ctx.print_help:
5398 usage()
5399 sys.exit(0)
5401 # Consistency check for options and arguments.
5402 if len(args) == 0:
5403 usage()
5404 sys.exit(1)
5406 if len(args) > 1:
5407 sys.stderr.write(error_prefix +
5408 ": must pass only one CVS repository.\n")
5409 usage()
5410 sys.exit(1)
5412 cvsroot = args[0]
5414 if ctx.use_cvs:
5415 ctx.cvs_repository = CVSRepositoryViaCVS(cvsroot)
5416 else:
5417 ctx.cvs_repository = CVSRepositoryViaRCS(cvsroot)
5419 if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
5420 raise FatalError("must pass one of '-s' or '--dump-only'.")
5422 def not_both(opt1val, opt1name, opt2val, opt2name):
5423 if opt1val and opt2val:
5424 raise FatalError("cannot pass both '%s' and '%s'."
5425 % (opt1name, opt2name,))
5427 not_both(ctx.target, '-s',
5428 ctx.dump_only, '--dump-only')
5430 not_both(ctx.dump_only, '--dump-only',
5431 ctx.existing_svnrepos, '--existing-svnrepos')
5433 not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
5434 ctx.existing_svnrepos, '--existing-svnrepos')
5436 not_both(ctx.dump_only, '--dump-only',
5437 ctx.bdb_txn_nosync, '--bdb-txn-nosync')
5439 not_both(ctx.quiet, '-q',
5440 ctx.verbose, '-v')
5442 not_both(ctx.fs_type, '--fs-type',
5443 ctx.existing_svnrepos, '--existing-svnrepos')
5445 if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
5446 raise FatalError("cannot pass --bdb-txn-nosync with --fs-type=%s."
5447 % ctx.fs_type)
5449 # Create the default project (using ctx.trunk, ctx.branches, and ctx.tags):
5450 ctx.project = Project(ctx.cvs_repository.cvs_repos_path,
5451 ctx.trunk_base, ctx.branches_base, ctx.tags_base)
5453 if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
5454 raise FatalError("the svn-repos-path '%s' is not an "
5455 "existing directory." % ctx.target)
5457 if not ctx.dump_only and not ctx.existing_svnrepos \
5458 and (not ctx.dry_run) and os.path.exists(ctx.target):
5459 raise FatalError("the svn-repos-path '%s' exists.\n"
5460 "Remove it, or pass '--existing-svnrepos'."
5461 % ctx.target)
5463 if ctx.target and not ctx.dry_run:
5464 # Verify that svnadmin can be executed. The 'help' subcommand
5465 # should be harmless.
5466 try:
5467 check_command_runs([ctx.svnadmin, 'help'], 'svnadmin')
5468 except CommandFailedException, e:
5469 raise FatalError(
5470 '%s\n'
5471 'svnadmin could not be executed. Please ensure that it is\n'
5472 'installed and/or use the --svnadmin option.' % (e,))
5474 ctx.svn_property_setters.append(ExecutablePropertySetter())
5476 ctx.svn_property_setters.append(BinaryFileEOLStyleSetter())
5478 if ctx.mime_types_file:
5479 ctx.svn_property_setters.append(MimeMapper(ctx.mime_types_file))
5481 if ctx.auto_props_file:
5482 ctx.svn_property_setters.append(AutoPropsPropertySetter(
5483 ctx.auto_props_file, ctx.auto_props_ignore_case))
5485 ctx.svn_property_setters.append(BinaryFileDefaultMimeTypeSetter())
5487 if ctx.eol_from_mime_type:
5488 ctx.svn_property_setters.append(EOLStyleFromMimeTypeSetter())
5490 if ctx.no_default_eol:
5491 ctx.svn_property_setters.append(DefaultEOLStyleSetter(None))
5492 else:
5493 ctx.svn_property_setters.append(DefaultEOLStyleSetter('native'))
5495 if not ctx.keywords_off:
5496 ctx.svn_property_setters.append(
5497 KeywordsPropertySetter(SVN_KEYWORDS_VALUE))
5499 # Make sure the tmp directory exists. Note that we don't check if
5500 # it's empty -- we want to be able to use, for example, "." to hold
5501 tempfiles. But if we *did* want to check if it were empty, we'd do
5502 # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
5503 if not os.path.exists(ctx.tmpdir):
5504 os.mkdir(ctx.tmpdir)
5505 elif not os.path.isdir(ctx.tmpdir):
5506 raise FatalError(
5507 "cvs2svn tried to use '%s' for temporary files, but that path\n"
5508 " exists and is not a directory. Please make it be a directory,\n"
5509 " or specify some other directory for temporary files."
5510 % (ctx.tmpdir,))
5512 # But do lock the tmpdir, to avoid process clash.
5513 try:
5514 os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5515 except OSError, e:
5516 if e.errno == errno.EACCES:
5517 raise FatalError("Permission denied:"
5518 + " No write access to directory '%s'." % ctx.tmpdir)
5519 if e.errno == errno.EEXIST:
5520 raise FatalError(
5521 "cvs2svn is using directory '%s' for temporary files, but\n"
5522 " subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
5523 " cvs2svn process is currently using '%s' as its temporary\n"
5524 " workspace. If you are certain that is not the case,\n"
5525 " then remove the '%s/cvs2svn.lock' subdirectory."
5526 % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir,))
5527 raise
5528 try:
5529 if profiling:
5530 import hotshot
5531 prof = hotshot.Profile('cvs2svn.hotshot')
5532 prof.runcall(convert, start_pass, end_pass)
5533 prof.close()
5534 else:
5535 convert(start_pass, end_pass)
5536 finally:
5537 try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5538 except: pass
5541 if __name__ == '__main__':
5542 try:
5543 main()
5544 except FatalException, e:
5545 sys.stderr.write(str(e))
5546 sys.exit(1)