cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
  20 VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 from __future__ import generators
  23
  24 import cvs2svn_rcsparse
  25 import os
  26 import sys
  27 import sha
  28 import re
  29 import time
  30 import fileinput
  31 import fnmatch
  32 import getopt
  33 import stat
  34 import md5
  35 import marshal
  36 import errno
  37 import popen2
  38 import types
  39 import ConfigParser
  40 try:
  41   # Try to get access to a bunch of encodings for use with --encoding.
  42   # See http://cjkpython.i18n.org/ for details.
  43   import iconv_codec
  44 except ImportError:
  45   pass
  46
  47 # Warnings and errors start with these strings.  They are typically
  48 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  49 warning_prefix = "WARNING"
  50 error_prefix = "ERROR"
  51
  52 # Make sure this Python is recent enough.
  53 if sys.hexversion < 0x02020000:
  54   sys.stderr.write("'%s: Python 2.2 or higher required, "
  55                    "see www.python.org.\n" % error_prefix)
  56   sys.exit(1)
  57
  58 # Pretend we have true booleans on older python versions
  59 try:
  60   True
  61 except:
  62   True = 1
  63   False = 0
  64
# Opening pipes was a mess before Python 2.4, because some methods did
# not exist on some platforms, and some behaved differently on others.
# Python 2.4 solved this by adding the subprocess module, but since we
# cannot require such a new version, we cannot use it directly, but
# must implement a simplified Popen using the best means necessary.
#
# The SimplePopen class only has the following members and methods, all
# behaving as documented in the subprocess.Popen class:
#     - stdin
#     - stdout
#     - stderr
#     - wait
#
# Exactly one of the three SimplePopen definitions below is bound,
# depending on which pipe-opening facility can be found.
try:
  # First try subprocess.Popen...
  import subprocess
  class SimplePopen:
    def __init__(self, cmd, capture_stderr):
      # stderr is only piped when the caller asks for it; otherwise
      # subprocess.Popen is passed stderr=None.
      if capture_stderr:
        stderr = subprocess.PIPE
      else:
        stderr = None
      self._popen = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE, stderr=stderr)
      self.stdin = self._popen.stdin
      self.stdout = self._popen.stdout
      # Note: the stderr attribute is not set at all unless
      # CAPTURE_STDERR is true.
      if capture_stderr:
        self.stderr = self._popen.stderr
      self.wait = self._popen.wait
except ImportError:
  if hasattr(popen2, 'Popen3'):
    # ...then try popen2.Popen3...
    class SimplePopen:
      def __init__(self, cmd, capture_stderr):
        self._popen3 = popen2.Popen3(cmd, capture_stderr)
        self.stdin = self._popen3.tochild
        self.stdout = self._popen3.fromchild
        # As above, stderr only exists when CAPTURE_STDERR is true.
        if capture_stderr:
          self.stderr = self._popen3.childerr
        self.wait = self._popen3.wait
  else:
    # ...and if all fails, use popen2.popen3...
    class SimplePopen:
      def __init__(self, cmd, capture_stderr):
        # popen2.popen3 is given a command string here, so an argument
        # list is flattened first.  CAPTURE_STDERR is not consulted in
        # this variant: self.stderr is always set.
        if type(cmd) != types.StringType:
          cmd = argv_to_command_string(cmd)
        self.stdout, self.stdin, self.stderr = popen2.popen3(cmd, mode='b')
      def wait(self):
        # Returns the first true value returned by a close() call; the
        # 'or' chain stops closing the remaining streams at that point.
        return self.stdout.close() or self.stdin.close() or \
               self.stderr.close()
 114
# DBM module selection
#
# The steps below run at import time and may replace entries in
# sys.modules, rebind anydbm._defaultmod, or terminate the program.

# 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.
#    If anydbm selected 'dumbdbm' or 'dbm' as its default module, print
#    an error to stderr and exit with status 1.
import anydbm
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  sys.stderr.write(
    error_prefix
    + ': your installation of Python does not contain a suitable\n'
    + 'DBM module -- cvs2svn cannot continue.\n'
    + 'See http://python.org/doc/current/lib/module-anydbm.html to solve.\n')
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try to prefer gdbm
#    instead.  Unfortunately, gdbm appears not to be trouble free, either.
#    The old module is recognized by its 'bsddb' attribute lacking a
#    '__version__' attribute.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    # No gdbm available: keep the default module, but warn the user.
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found '
        'on your computer has been reported to malfunction on some datasets, '
        'causing KeyError exceptions. You may wish to upgrade your Python to '
        'version 2.3 or later.\n')
  else:
    anydbm._defaultmod = gdbm
 150
# Matches a two-component revision number, such as '1.2'.
trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
# Matches a branch number with a '0' in the second-to-last position,
# such as '1.3.0.2'.  Group 1 is the part before the '0' (including
# its trailing dot); group 2 is the final component.
cvs_branch_tag = re.compile('^((?:[0-9]+\\.[0-9]+\\.)+)0\\.([0-9]+)$')
# Matches a number with an odd count of components (three or more),
# such as '1.1.1'.
rcs_branch_tag = re.compile('^(?:[0-9]+\\.[0-9]+\\.)+[0-9]+$')

# NOTE(review): presumably the value used for the svn:keywords
# property -- confirm at the point of use.
SVN_KEYWORDS_VALUE = 'Author Date Id Revision'

# This really only matches standard '1.1.1.*'-style vendor revisions.
# One could conceivably have a file whose default branch is 1.1.3 or
# whatever, or was that at some point in time, with vendor revisions
# 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
# is the only time this regexp gets used), we'd have no basis for
# assuming that the non-standard vendor branch had ever been the
# default branch anyway, so we don't want this to match them anyway.
#
# NOTE(review): the '+' is outside the second group, so group 2 only
# captures the *last* digit of the final component ('6' for
# '1.1.1.96').  Harmless if only match/no-match is used; confirm that
# no caller reads group 2.
vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 165
 166 # If this run's output is a repository, then (in the tmpdir) we use
 167 # a dumpfile of this name for repository loads.
 168 #
 169 # If this run's output is a dumpfile, then this is default name of
 170 # that dumpfile, but in the current directory (unless the user has
 171 # specified a dumpfile path, of course, in which case it will be
 172 # wherever the user said).
 173 DUMPFILE = 'cvs2svn-dump'
 174
 175 # This file appears with different suffixes at different stages of
 176 # processing.  CVS revisions are cleaned and sorted here, for commit
 177 # grouping.  See design-notes.txt for details.
 178 DATAFILE = 'cvs2svn-data'
 179
# This file contains a marshalled copy of all the statistics that we
# gather throughout the various runs of cvs2svn.  The data is stored
# as a marshalled dictionary.
 183 STATISTICS_FILE = 'cvs2svn-statistics'
 184
 185 # This text file contains records (1 per line) that describe svn
 186 # filesystem paths that are the opening and closing source revisions
 187 # for copies to tags and branches.  The format is as follows:
 188 #
 189 # SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
 190 #
 191 # Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
 192 # SVN_REVNUM are the primary and secondary sorting criteria for
 193 # creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
 194 SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
 195 # A sorted version of the above file.
 196 SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'
 197
 198 # This file is a temporary file for storing symbolic_name -> closing
 199 # CVSRevision until the end of our pass where we can look up the
 200 # corresponding SVNRevNum for the closing revs and write these out to
 201 # the SYMBOL_OPENINGS_CLOSINGS.
 202 SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'
 203
 204 # Skeleton version of an svn filesystem.
 205 # (These supersede and will eventually replace the two above.)
 206 # See class SVNRepositoryMirror for how these work.
 207 SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
 208 SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'
 209
 210 # Offsets pointing to the beginning of each SYMBOLIC_NAME in
 211 # SYMBOL_OPENINGS_CLOSINGS_SORTED
 212 SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'
 213
 214 # Maps CVSRevision.unique_key()s to lists of symbolic names, where
 215 # the CVSRevision is the last such that is a source for those symbolic
 216 # names.  For example, if branch B's number is 1.3.0.2 in this CVS
 217 # file, and this file's 1.3 is the latest (by date) revision among
 218 # *all* CVS files that is a source for branch B, then the
 219 # CVSRevision.unique_key() corresponding to this file at 1.3 would
 220 # list at least B in its list.
 221 SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'
 222
 223 # Maps CVSRevision.unique_key() to corresponding line in s-revs.
 224 ###PERF Or, we could map to an offset into s-revs, instead of dup'ing
 225 ### the s-revs data in this database.
 226 CVS_REVS_DB = 'cvs2svn-cvs-revs.db'
 227
 228 # Lists all symbolic names that are tags.  Keys are strings (symbolic
 229 # names), values are ignorable.
 230 TAGS_DB = 'cvs2svn-tags.db'
 231
# A list of all tags.  Each line consists of the tag name and the
# number of files in which it exists, separated by a space.
 234 TAGS_LIST = 'cvs2svn-tags.txt'
 235
 236 # A list of all branches.  The file is stored as a plain text file
 237 # to make it easy to look at in an editor.  Each line contains the
 238 # branch name, the number of files where the branch is created, the
 239 # commit count, and a list of tags and branches that are defined on
 240 # revisions in the branch.
 241 BRANCHES_LIST = 'cvs2svn-branches.txt'
 242
 243 # These two databases provide a bidirectional mapping between
 244 # CVSRevision.unique_key()s and Subversion revision numbers.
 245 #
 246 # The first maps CVSRevision.unique_key() to a number; the values are
 247 # not unique.
 248 #
 249 # The second maps Subversion revision numbers to tuples (c_rev_keys,
 250 # motivating_revnum, symbolic_name, date).
 251 #
 252 # c_rev_keys is a list of CVSRevision.unique_key()s.
 253 #
 254 # If the SVNCommit is a default branch synchronization,
 255 # motivating_revnum is the svn_revnum of the primary SVNCommit that
 256 # motivated it; otherwise it is None.  (NOTE: Secondary commits that
 257 # fill branches and tags also have a motivating commit, but we do not
 258 # record it because it is (currently) not needed for anything.)
 259 # motivating_revnum is used when generating the log message for the
 260 # commit that synchronizes the default branch with trunk.
 261 #
 262 # symbolic_name is the symbolic name associated with the commit (if it
 263 # filled a symbolic name) or None otherwise.
 264 #
 265 # date is the date of the commit.
 266 CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
 267 SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'
 268
 269 # How many bytes to read at a time from a pipe.  128 kiB should be
 270 # large enough to be efficient without wasting too much memory.
 271 PIPE_READ_SIZE = 128 * 1024
 272
 273 # Record the default RCS branches, if any, for CVS filepaths.
 274 #
 275 # The keys are CVS filepaths, relative to the top of the repository
 276 # and with the ",v" stripped off, so they match the cvs paths used in
 277 # Commit.commit().  The values are vendor branch revisions, such as
 278 # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
 279 # represents the highest vendor branch revision thought to have ever
 280 # been head of the default branch.
 281 #
 282 # The reason we record a specific vendor revision, rather than a
 283 # default branch number, is that there are two cases to handle:
 284 #
 285 # One case is simple.  The RCS file lists a default branch explicitly
 286 # in its header, such as '1.1.1'.  In this case, we know that every
 287 # revision on the vendor branch is to be treated as head of trunk at
 288 # that point in time.
 289 #
 290 # But there's also a degenerate case.  The RCS file does not currently
 291 # have a default branch, yet we can deduce that for some period in the
 292 # past it probably *did* have one.  For example, the file has vendor
 293 # revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
 294 # and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
 295 # case, we should record 1.1.1.96 as the last vendor revision to have
 296 # been the head of the default branch.
 297 DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'
 298
 299 # Records the author and log message for each changeset.
 300 # The keys are author+log digests, the same kind used to identify
 301 # unique revisions in the .revs, etc files.  Each value is a tuple
 302 # of two elements: '(author logmessage)'.
 303 METADATA_DB = "cvs2svn-metadata.db"
 304
 305 # A temporary on-disk hash that maps CVSRevision unique keys to a new
 306 # timestamp for that CVSRevision.  These new timestamps are created in
 307 # pass2, and this hash is used exclusively in pass2.
 308 TWEAKED_TIMESTAMPS_DB = "cvs2svn-fixed-timestamps.db"
 309
# Filename suffixes for the data files written at different stages of
# processing.
# NOTE(review): which stage produces which suffix is not visible
# here; the names suggest raw, cleaned, sorted and resync data.
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
RESYNC_SUFFIX = '.resync'

# Sentinel for "no valid Subversion revision number".
SVN_INVALID_REVNUM = -1

COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP   = '-'
OP_ADD    = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY    = 'E'

# sha.digestsize * 2 is the length of a hex-encoded SHA digest.
# NOTE(review): the leading 9 is presumably the width of the fields
# that precede the digest in a data line -- confirm against the code
# that writes those lines.
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'
 334
 335 class FatalException(Exception):
 336   """Exception thrown on a non-recoverable error.
 337
 338   If this exception is thrown by main(), it is caught by the global
 339   layer of the program, its string representation is printed, and the
 340   program is ended with an exit code of 1."""
 341
 342   pass
 343
 344
 345 class FatalError(FatalException):
 346   """A FatalException that prepends error_prefix to the message."""
 347
 348   def __init__(self, msg):
 349     """Use (error_prefix + ': ' + MSG + '\n') as the error message."""
 350
 351     FatalException.__init__(self, '%s: %s\n' % (error_prefix, msg,))
 352
 353
 354 def temp(basename):
 355   """Return a path to BASENAME in Ctx().tmpdir.
 356   This is a convenience function to save horizontal space in source."""
 357   return os.path.join(Ctx().tmpdir, basename)
 358
 359 # Since the unofficial set also includes [/\] we need to translate those
 360 # into ones that don't conflict with Subversion limitations.
 361 def _clean_symbolic_name(name):
 362   """Return symbolic name NAME, translating characters that Subversion
 363   does not allow in a pathname."""
 364   name = name.replace('/','++')
 365   name = name.replace('\\','--')
 366   return name
 367
 368 def _path_join(*components):
 369   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 370   Empty component are skipped."""
 371   return '/'.join(filter(None, components))
 372
 373 def _path_split(path):
 374   """Split the svn pathname PATH into a pair, (HEAD, TAIL).
 375
 376   This is similar to os.path.split(), but always uses '/' as path
 377   separator.  PATH is an svn path, which should not start with a '/'.
 378   HEAD is everything before the last slash, and TAIL is everything
 379   after.  If PATH ends in a slash, TAIL will be empty.  If there is no
 380   slash in PATH, HEAD will be empty.  If PATH is empty, both HEAD and
 381   TAIL are empty."""
 382
 383   pos = path.rfind('/')
 384   if pos == -1:
 385     return ('', path,)
 386   else:
 387     return (path[:pos], path[pos+1:],)
 388
 389 def to_utf8(value, mode='replace'):
 390   """Encode (as Unicode) VALUE, trying the encodings in Ctx.encoding
 391   as valid source encodings.  Raise UnicodeError on failure of all
 392   source encodings."""
 393   ### FIXME: The 'replace' default mode should be an option,
 394   ### like --encoding is.
 395   for encoding in Ctx().encoding:
 396     try:
 397       return unicode(value, encoding, mode).encode('utf8')
 398     except UnicodeError:
 399       Log().write(LOG_VERBOSE, "Encoding '%s' failed for string '%s'"
 400                   % (encoding, value))
 401   raise UnicodeError
 402
 403 ctrl_characters_regexp = re.compile('[\\\x00-\\\x1f\\\x7f]')
 404
 405 def verify_filename_legal(filename):
 406   """Verify that FILENAME does not include any control characters.  If
 407   it does, raise a FatalError."""
 408
 409   m = ctrl_characters_regexp.search(filename)
 410   if m:
 411     raise FatalError(
 412         "Character %r in filename %r is not supported by subversion."
 413         % (m.group(), filename,))
 414
 415 def run_command(command):
 416   if os.system(command):
 417     raise FatalError('Command failed: "%s"' % (command,))
 418
 419
 420 class CommandFailedException(Exception):
 421   """Exception raised if check_command_runs() fails."""
 422
 423   pass
 424
 425
 426 def check_command_runs(cmd, cmdname):
 427   """Check whether the command CMD can be executed without errors.
 428
 429   CMD is a list or string, as accepted by SimplePopen.  CMDNAME is the
 430   name of the command as it should be included in exception error
 431   messages.
 432
 433   This function checks three things: (1) the command can be run
 434   without throwing an OSError; (2) it exits with status=0; (3) it
 435   doesn't output anything to stderr.  If any of these conditions is
 436   not met, raise a CommandFailedException describing the problem."""
 437
 438   try:
 439     pipe = SimplePopen(cmd, True)
 440   except OSError, e:
 441     raise CommandFailedException('error executing %s: %s' % (cmdname, e,))
 442   pipe.stdin.close()
 443   pipe.stdout.read()
 444   errmsg = pipe.stderr.read()
 445   status = pipe.wait()
 446   if status or errmsg:
 447     msg = 'error executing %s: status %s' % (cmdname, status,)
 448     if errmsg:
 449       msg += ', error output:\n%s' % (errmsg,)
 450     raise CommandFailedException(msg)
 451
 452
 453 class CVSRepository:
 454   """A CVS repository from which data can be extracted."""
 455
 456   def __init__(self, cvs_repos_path):
 457     """CVS_REPOS_PATH is the top of the CVS repository (at least as
 458     far as this run is concerned)."""
 459
 460     if not os.path.isdir(cvs_repos_path):
 461       raise FatalError("The specified CVS repository path '%s' is not an "
 462                        "existing directory." % cvs_repos_path)
 463
 464     self.cvs_repos_path = os.path.normpath(cvs_repos_path)
 465     self.cvs_prefix_re = re.compile(
 466         r'^' + re.escape(self.cvs_repos_path)
 467         + r'(' + re.escape(os.sep) + r'|$)')
 468
 469   def get_cvs_path(self, fname):
 470     """Return the path to FNAME relative to cvs_repos_path, with ',v' removed.
 471
 472     FNAME is a filesystem name that has to be within
 473     self.cvs_repos_path.  Return the filename relative to
 474     self.cvs_repos_path, with ',v' striped off if present, and with
 475     os.sep converted to '/'."""
 476
 477     (tail, n) = self.cvs_prefix_re.subn('', fname, 1)
 478     if n != 1:
 479       raise FatalError(
 480           "get_cvs_path: '%s' is not a sub-path of '%s'"
 481           % (fname, self.cvs_repos_path,))
 482     if tail.endswith(',v'):
 483       tail = tail[:-2]
 484     return tail.replace(os.sep, '/')
 485
 486   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 487     """Return a command string, and the pipe created using that
 488     string.  C_REV is a CVSRevision.  If SUPPRESS_KEYWORD_SUBSTITUTION
 489     is True, then suppress the substitution of RCS/CVS keywords in the
 490     output.  The pipe returns the text of that CVS Revision."""
 491     raise NotImplementedError
 492
 493
 494 class CVSRepositoryViaRCS(CVSRepository):
 495   """A CVSRepository accessed via RCS."""
 496
 497   def __init__(self, cvs_repos_path):
 498     CVSRepository.__init__(self, cvs_repos_path)
 499     try:
 500       check_command_runs([ 'co', '-V' ], 'co')
 501     except CommandFailedException, e:
 502       raise FatalError('%s\n'
 503                        'Please check that co is installed and in your PATH\n'
 504                        '(it is a part of the RCS software).' % (e,))
 505
 506   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 507     pipe_cmd = [ 'co', '-q', '-x,v', '-p' + c_rev.rev ]
 508     if suppress_keyword_substitution:
 509       pipe_cmd.append('-kk')
 510     pipe_cmd.append(c_rev.rcs_path())
 511     pipe = SimplePopen(pipe_cmd, True)
 512     pipe.stdin.close()
 513     return pipe_cmd, pipe
 514
 515
 516 class CVSRepositoryViaCVS(CVSRepository):
 517   """A CVSRepository accessed via CVS."""
 518
 519   def __init__(self, cvs_repos_path):
 520     CVSRepository.__init__(self, cvs_repos_path)
 521     # Ascend above the specified root if necessary, to find the
 522     # cvs_repository_root (a directory containing a CVSROOT directory)
 523     # and the cvs_module (the path of the conversion root within the
 524     # cvs repository) NB: cvs_module must be seperated by '/' *not* by
 525     # os.sep .
 526     def is_cvs_repository_root(path):
 527       return os.path.isdir(os.path.join(path, 'CVSROOT'))
 528
 529     self.cvs_repository_root = os.path.abspath(self.cvs_repos_path)
 530     self.cvs_module = ""
 531     while not is_cvs_repository_root(self.cvs_repository_root):
 532       # Step up one directory:
 533       prev_cvs_repository_root = self.cvs_repository_root
 534       self.cvs_repository_root, module_component = \
 535           os.path.split(self.cvs_repository_root)
 536       if self.cvs_repository_root == prev_cvs_repository_root:
 537         # Hit the root (of the drive, on Windows) without finding a
 538         # CVSROOT dir.
 539         raise FatalError(
 540             "the path '%s' is not a CVS repository, nor a path "
 541             "within a CVS repository.  A CVS repository contains "
 542             "a CVSROOT directory within its root directory."
 543             % (self.cvs_repos_path,))
 544
 545       self.cvs_module = module_component + "/" + self.cvs_module
 546
 547     os.environ['CVSROOT'] = self.cvs_repository_root
 548
 549     def cvs_ok(global_arguments):
 550       check_command_runs(
 551           [ 'cvs' ] + global_arguments + [ '--version' ], 'cvs')
 552
 553     self.global_arguments = [ "-q", "-R" ]
 554     try:
 555       cvs_ok(self.global_arguments)
 556     except CommandFailedException, e:
 557       self.global_arguments = [ "-q" ]
 558       try:
 559         cvs_ok(self.global_arguments)
 560       except CommandFailedException, e:
 561         raise FatalError(
 562             '%s\n'
 563             'Please check that cvs is installed and in your PATH.' % (e,))
 564
 565   def get_co_pipe(self, c_rev, suppress_keyword_substitution=False):
 566     pipe_cmd = [ 'cvs' ] + self.global_arguments + \
 567                [ 'co', '-r' + c_rev.rev, '-p' ]
 568     if suppress_keyword_substitution:
 569       pipe_cmd.append('-kk')
 570     pipe_cmd.append(self.cvs_module + c_rev.cvs_path)
 571     pipe = SimplePopen(pipe_cmd, True)
 572     pipe.stdin.close()
 573     return pipe_cmd, pipe
 574
 575
 576 def generate_ignores(c_rev):
 577   # Read in props
 578   pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(c_rev)
 579   buf = pipe.stdout.read(PIPE_READ_SIZE)
 580   raw_ignore_val = ""
 581   while buf:
 582     raw_ignore_val += buf
 583     buf = pipe.stdout.read(PIPE_READ_SIZE)
 584   pipe.stdout.close()
 585   error_output = pipe.stderr.read()
 586   exit_status = pipe.wait()
 587   if exit_status:
 588     raise FatalError("The command '%s' failed with exit status: %s\n"
 589                      "and the following output:\n"
 590                      "%s" % (pipe_cmd, exit_status, error_output))
 591
 592   # Tweak props: First, convert any spaces to newlines...
 593   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 594   raw_ignores = raw_ignore_val.split('\n')
 595   ignore_vals = [ ]
 596   for ignore in raw_ignores:
 597     # Reset the list if we encounter a '!'
 598     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 599     if ignore == '!':
 600       ignore_vals = [ ]
 601       continue
 602     # Skip empty lines
 603     if len(ignore) == 0:
 604       continue
 605     ignore_vals.append(ignore)
 606   return ignore_vals
 607
 608 # Return a string that has not been returned by gen_key() before.
 609 gen_key_base = 0L
 610 def gen_key():
 611   global gen_key_base
 612   key = '%x' % gen_key_base
 613   gen_key_base += 1
 614   return key
 615
 616 # ============================================================================
 617 # This code is copied with a few modifications from:
 618 #   subversion/subversion/bindings/swig/python/svn/core.py
 619
 620 if sys.platform == "win32":
 621   _escape_shell_arg_re = re.compile(r'(\\+)(\"|$)')
 622
 623   def escape_shell_arg(arg):
 624     # The (very strange) parsing rules used by the C runtime library are
 625     # described at:
 626     # http://msdn.microsoft.com/library/en-us/vclang/html/_pluslang_Parsing_C.2b2b_.Command.2d.Line_Arguments.asp
 627
 628     # double up slashes, but only if they are followed by a quote character
 629     arg = re.sub(_escape_shell_arg_re, r'\1\1\2', arg)
 630
 631     # surround by quotes and escape quotes inside
 632     arg = '"' + arg.replace('"', '"^""') + '"'
 633     return arg
 634
 635
 636   def argv_to_command_string(argv):
 637     """Flatten a list of command line arguments into a command string.
 638
 639     The resulting command string is expected to be passed to the system
 640     shell which os functions like popen() and system() invoke internally.
 641     """
 642
 643     # According cmd's usage notes (cmd /?), it parses the command line by
 644     # "seeing if the first character is a quote character and if so, stripping
 645     # the leading character and removing the last quote character."
 646     # So to prevent the argument string from being changed we add an extra set
 647     # of quotes around it here.
 648     return '"' + ' '.join(map(escape_shell_arg, argv)) + '"'
 649
 650 else:
 651   def escape_shell_arg(arg):
 652     return "'" + arg.replace("'", "'\\''") + "'"
 653
 654   def argv_to_command_string(argv):
 655     """Flatten a list of command line arguments into a command string.
 656
 657     The resulting command string is expected to be passed to the system
 658     shell which os functions like popen() and system() invoke internally.
 659     """
 660
 661     return ' '.join(map(escape_shell_arg, argv))
 662 # ============================================================================
 663
 664 def format_date(date):
 665   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 666   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 667   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 668
 669 def sort_file(infile, outfile):
 670   # sort the log files
 671
 672   # GNU sort will sort our dates differently (incorrectly!) if our
 673   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 674   # it to 'C'
 675   lc_all_tmp = os.environ.get('LC_ALL', None)
 676   os.environ['LC_ALL'] = 'C'
 677   # The -T option to sort has a nice side effect.  The Win32 sort is
 678   # case insensitive and cannot be used, and since it does not
 679   # understand the -T option and dies if we try to use it, there is
 680   # no risk that we use that sort by accident.
 681   run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 682   if lc_all_tmp is None:
 683     del os.environ['LC_ALL']
 684   else:
 685     os.environ['LC_ALL'] = lc_all_tmp
 686
 687 def match_regexp_list(regexp_list, s):
 688   """Test whether string S matches any of the compiled regexps in
 689   REGEXP_LIST."""
 690   for regexp in regexp_list:
 691     if regexp.match(s):
 692       return True
 693   return False
 694
 695 class LF_EOL_Filter:
 696   """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
 697   into LFs only."""
 698   def __init__(self, stream):
 699     self.stream = stream
 700     self.carry_cr = False
 701     self.eof = False
 702
 703   def read(self, size):
 704     while True:
 705       buf = self.stream.read(size)
 706       self.eof = len(buf) == 0
 707       if self.carry_cr:
 708         buf = '\r' + buf
 709         self.carry_cr = False
 710       if not self.eof and buf[-1] == '\r':
 711         self.carry_cr = True
 712         buf = buf[:-1]
 713       buf = buf.replace('\r\n', '\n')
 714       buf = buf.replace('\r', '\n')
 715       if len(buf) > 0 or self.eof:
 716         return buf
 717
 718
 719 # These constants represent the log levels that this script supports
 720 LOG_WARN = -1
 721 LOG_QUIET = 0
 722 LOG_NORMAL = 1
 723 LOG_VERBOSE = 2
 724 class Log:
 725   """A Simple logging facility.  Each line will be timestamped is
 726   self.use_timestamps is TRUE.  This class is a Borg, see
 727   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 728   __shared_state = {}
 729   def __init__(self):
 730     self.__dict__ = self.__shared_state
 731     if self.__dict__:
 732       return
 733     self.log_level = LOG_NORMAL
 734     # Set this to true if you want to see timestamps on each line output.
 735     self.use_timestamps = None
 736     self.logger = sys.stdout
 737
 738   def _timestamp(self):
 739     """Output a detailed timestamp at the beginning of each line output."""
 740     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 741
 742   def write(self, log_level, *args):
 743     """This is the public method to use for writing to a file.  Only
 744     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 745     there are multiple ARGS, they will be separated by a space."""
 746     if log_level > self.log_level:
 747       return
 748     if self.use_timestamps:
 749       self._timestamp()
 750     self.logger.write(' '.join(map(str,args)) + "\n")
 751     # Ensure that log output doesn't get out-of-order with respect to
 752     # stderr output.
 753     self.logger.flush()
 754
 755
 756 class Cleanup:
 757   """This singleton class manages any files created by cvs2svn.  When
 758   you first create a file, call Cleanup.register, passing the
 759   filename, and the last pass that you need the file.  After the end
 760   of that pass, your file will be cleaned up after running an optional
 761   callback.  This class is a Borg, see
 762   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 763
 764   __shared_state = {}
 765   def __init__(self):
 766     self.__dict__ = self.__shared_state
 767     if self.__dict__:
 768       return
 769     self._log = {}
 770     self._callbacks = {}
 771
 772   def register(self, file, which_pass, callback=None):
 773     """Register FILE for cleanup at the end of WHICH_PASS, running
 774     function CALLBACK prior to removal.  Registering a given FILE is
 775     idempotent; you may register as many times as you wish, but it
 776     will only be cleaned up once.
 777
 778     Note that if a file is registered multiple times, only the first
 779     callback registered for that file will be called at cleanup
 780     time.  Also note that if you register a database file you must
 781     close the database before cleanup, e.g. using a callback."""
 782     self._log.setdefault(which_pass, {})[file] = 1
 783     if callback and not self._callbacks.has_key(file):
 784       self._callbacks[file] = callback
 785
 786   def cleanup(self, which_pass):
 787     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 788     if not self._log.has_key(which_pass):
 789       return
 790     for file in self._log[which_pass]:
 791       Log().write(LOG_VERBOSE, "Deleting", file)
 792       if self._callbacks.has_key(file):
 793         self._callbacks[file]()
 794       os.unlink(file)
 795
 796
# Always use these constants for opening databases.  They are the
# MODE values handed to the database classes below, which pass them
# on to anydbm.open(); 'n' is the mode that requests a new database.
DB_OPEN_READ = 'r'
DB_OPEN_NEW = 'n'
 800
 801
 802 class AbstractDatabase:
 803   """An abstract base class for anydbm-based databases."""
 804
 805   def __init__(self, filename, mode):
 806     """A convenience function for opening an anydbm database."""
 807     # pybsddb3 has a bug which prevents it from working with
 808     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 809     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 810     # for databases protected by lock and transaction support
 811     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 812     #
 813     # Therefore, manually perform the removal (we can do this, because
 814     # we know that for bsddb - but *not* anydbm in general - the database
 815     # consists of one file with the name we specify, rather than several
 816     # based on that name).
 817     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 818       if os.path.isfile(filename):
 819         os.unlink(filename)
 820       mode = 'c'
 821
 822     self.db = anydbm.open(filename, mode)
 823
 824     # Import implementations for many mapping interface methods.  Note
 825     # that we specifically do not do this for any method which handles
 826     # *values*, because our derived classes define __getitem__ and
 827     # __setitem__ to override the storage of values, and grabbing
 828     # methods directly from the dbm object would bypass this.
 829     for meth_name in ('__delitem__', 'keys',
 830         '__iter__', 'has_key', '__contains__', 'iterkeys', 'clear'):
 831       meth_ref = getattr(self.db, meth_name, None)
 832       if meth_ref:
 833         setattr(self, meth_name, meth_ref)
 834
 835   def __delitem__(self, key):
 836     # gdbm defines a __delitem__ method, but it cannot be assigned.  So
 837     # this method provides a fallback definition via explicit delegation:
 838     del self.db[key]
 839
 840   def __iter__(self):
 841     for key in self.keys():
 842       yield key
 843
 844   def has_key(self, key):
 845     try:
 846       self.db[key]
 847       return True
 848     except KeyError:
 849       return False
 850
 851   def __contains__(self, key):
 852     return self.has_key(key)
 853
 854   def iterkeys(self):
 855     return self.__iter__()
 856
 857   def clear(self):
 858     for key in self.keys():
 859       del self[key]
 860
 861   def items(self):
 862     return [(key, self[key],) for key in self.keys()]
 863
 864   def values(self):
 865     return [self[key] for key in self.keys()]
 866
 867   def get(self, key, default=None):
 868     try:
 869       return self[key]
 870     except KeyError:
 871       return default
 872
 873
class SDatabase(AbstractDatabase):
  """A database that can only store strings.

  Values are passed to and from the underlying dbm object (self.db)
  unchanged, with no serialization step in between."""

  def __getitem__(self, key):
    """Return the value stored under KEY, exactly as self.db holds it."""
    return self.db[key]

  def __setitem__(self, key, value):
    """Store VALUE under KEY as-is."""
    self.db[key] = value
 882
 883
 884 class Database(AbstractDatabase):
 885   """A database that uses the marshal module to store built-in types."""
 886
 887   def __getitem__(self, key):
 888     return marshal.loads(self.db[key])
 889
 890   def __setitem__(self, key, value):
 891     self.db[key] = marshal.dumps(value)
 892
 893
 894 class StatsKeeper:
 895   __shared_state = { }
 896   def __init__(self):
 897     self.__dict__ = self.__shared_state
 898     if self.__dict__:
 899       return
 900     self.filename = temp(STATISTICS_FILE)
 901     Cleanup().register(self.filename, pass8)
 902     # This can get kinda large, so we don't store it in our data dict.
 903     self.repos_files = { }
 904
 905     if os.path.exists(self.filename):
 906       self.unarchive()
 907     else:
 908       self.data = { 'cvs_revs_count' : 0,
 909                     'tags': { },
 910                     'branches' : { },
 911                     'repos_size' : 0,
 912                     'repos_file_count' : 0,
 913                     'svn_rev_count' : None,
 914                     'first_rev_date' : 1L<<32,
 915                     'last_rev_date' : 0,
 916                     'pass_timings' : { },
 917                     'start_time' : 0,
 918                     'end_time' : 0,
 919                     }
 920
 921   def log_duration_for_pass(self, duration, pass_num):
 922     self.data['pass_timings'][pass_num] = duration
 923
 924   def set_start_time(self, start):
 925     self.data['start_time'] = start
 926
 927   def set_end_time(self, end):
 928     self.data['end_time'] = end
 929
 930   def _bump_item(self, key, amount=1):
 931     self.data[key] += amount
 932
 933   def reset_c_rev_info(self):
 934     self.data['cvs_revs_count'] = 0
 935     self.data['tags'] = { }
 936     self.data['branches'] = { }
 937
 938   def record_c_rev(self, c_rev):
 939     self._bump_item('cvs_revs_count')
 940
 941     for tag in c_rev.tags:
 942       self.data['tags'][tag] = None
 943     for branch in c_rev.branches:
 944       self.data['branches'][branch] = None
 945
 946     if c_rev.timestamp < self.data['first_rev_date']:
 947       self.data['first_rev_date'] = c_rev.timestamp
 948
 949     if c_rev.timestamp > self.data['last_rev_date']:
 950       self.data['last_rev_date'] = c_rev.timestamp
 951
 952     # Only add the size if this is the first time we see the file.
 953     if not self.repos_files.has_key(c_rev.fname):
 954       self._bump_item('repos_size', c_rev.file_size)
 955     self.repos_files[c_rev.fname] = None
 956
 957     self.data['repos_file_count'] = len(self.repos_files)
 958
 959   def set_svn_rev_count(self, count):
 960     self.data['svn_rev_count'] = count
 961
 962   def svn_rev_count(self):
 963     return self.data['svn_rev_count']
 964
 965   def archive(self):
 966     open(self.filename, 'w').write(marshal.dumps(self.data))
 967
 968   def unarchive(self):
 969     self.data = marshal.loads(open(self.filename, 'r').read())
 970
 971   def __str__(self):
 972     svn_revs_str = ""
 973     if self.data['svn_rev_count'] is not None:
 974       svn_revs_str = ('Total SVN Commits:      %10s\n'
 975                       % self.data['svn_rev_count'])
 976
 977     return ('\n'                                \
 978             'cvs2svn Statistics:\n'             \
 979             '------------------\n'              \
 980             'Total CVS Files:        %10i\n'    \
 981             'Total CVS Revisions:    %10i\n'    \
 982             'Total Unique Tags:      %10i\n'    \
 983             'Total Unique Branches:  %10i\n'    \
 984             'CVS Repos Size in KB:   %10i\n'    \
 985             '%s'                                \
 986             'First Revision Date:    %s\n'      \
 987             'Last Revision Date:     %s\n'      \
 988             '------------------'                \
 989             % (self.data['repos_file_count'],
 990                self.data['cvs_revs_count'],
 991                len(self.data['tags']),
 992                len(self.data['branches']),
 993                (self.data['repos_size'] / 1024),
 994                svn_revs_str,
 995                time.ctime(self.data['first_rev_date']),
 996                time.ctime(self.data['last_rev_date']),
 997                ))
 998
 999   def timings(self):
1000     passes = self.data['pass_timings'].keys()
1001     passes.sort()
1002     output = 'Timings:\n------------------\n'
1003
1004     def desc(val):
1005       if val == 1: return "second"
1006       return "seconds"
1007
1008     for pass_num in passes:
1009       duration = int(self.data['pass_timings'][pass_num])
1010       p_str = ('pass %d:%6d %s\n'
1011                % (pass_num, duration, desc(duration)))
1012       output += p_str
1013
1014     total = int(self.data['end_time'] - self.data['start_time'])
1015     output += ('total: %6d %s' % (total, desc(total)))
1016     return output
1017
1018
class LastSymbolicNameDatabase:
  """ Passing every CVSRevision in s-revs to this class will result in
  a Database whose key is the last CVS Revision a symbolicname was
  seen in, and whose value is a list of all symbolicnames that were
  last seen in that revision."""

  def __init__(self, mode):
    """Open the underlying Database in MODE and register its file for
    cleanup at the end of pass5."""
    # Maps each symbolic name to the unique_key() of the last
    # revision it was seen in; filled in by log_revision().
    self.symbols = {}
    self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
    Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)

  def log_revision(self, c_rev):
    """Note C_REV as the last revision (so far) of each of its tags
    and, unless C_REV is a delete, of each of its branches.

    Once we've gone through all the revs, symbols.keys() will be a
    list of all tags and branches, and their corresponding values
    will be a key into the last CVS revision that they were used in."""
    # Gather last CVS Revision for symbolic name info and tag info
    for tag in c_rev.tags:
      self.symbols[tag] = c_rev.unique_key()
    # Compare the op by value, not by identity: an op string that was
    # parsed out of a revs line need not be the very same object as
    # the OP_DELETE constant.
    if c_rev.op != OP_DELETE:
      for branch in c_rev.branches:
        self.symbols[branch] = c_rev.unique_key()

  def create_database(self):
    """Write the inversion of self.symbols to the database: a
    dictionary of lists (key = CVS rev unique_key, val = list of
    symbols that close in that rev)."""
    for sym, rev_unique_key in self.symbols.items():
      ary = self.symbol_revs_db.get(rev_unique_key, [])
      ary.append(sym)
      self.symbol_revs_db[rev_unique_key] = ary
1049
1050
class CVSRevisionDatabase:
  """Stores CVSRevision objects, in their string form, keyed by their
  unique_key(), and rebuilds them on retrieval."""

  def __init__(self, mode):
    """Open the database in MODE (like the MODE argument to Database
    or anydbm.open()) and register its file for cleanup at the end of
    pass8."""
    revs_db = SDatabase(temp(CVS_REVS_DB), mode)
    self.cvs_revs_db = revs_db
    Cleanup().register(temp(CVS_REVS_DB), pass8)

  def log_revision(self, c_rev):
    """Store the CVSRevision C_REV in the database."""
    serialized = str(c_rev)
    key = c_rev.unique_key()
    self.cvs_revs_db[key] = serialized

  def get_revision(self, unique_key):
    """Look up UNIQUE_KEY and return the CVSRevision stored there."""
    ctx = Ctx()
    serialized = self.cvs_revs_db[unique_key]
    return CVSRevision(ctx, serialized)
1068
1069
def TagsDatabase(mode):
  """Open, in MODE, the database that records which symbolic names
  are tags, and return it.

  Each key is a tag name.
  The value has no meaning, and should be set to None.
  (NOTE(review): the database returned is an SDatabase, which is
  documented as storing only strings -- confirm what callers store.)
  The database file is registered for cleanup at the end of pass8."""
  tags_db = SDatabase(temp(TAGS_DB), mode)
  Cleanup().register(temp(TAGS_DB), pass8)
  return tags_db
1077
1078
class Project:
  """A project within a CVS repository."""

  def __init__(self, project_cvs_repos_path,
               trunk_path, branches_path, tags_path):
    """Create a new Project record.

    PROJECT_CVS_REPOS_PATH is the main CVS directory for this project
    (within the filesystem).  TRUNK_PATH, BRANCHES_PATH, and TAGS_PATH
    are the full, normalized directory names in svn for the
    corresponding part of the repository.

    Raise FatalError if PROJECT_CVS_REPOS_PATH does not start with the
    path of the CVS repository in Ctx()."""

    self.project_cvs_repos_path = project_cvs_repos_path
    prefix = Ctx().cvs_repository.cvs_repos_path
    if not self.project_cvs_repos_path.startswith(prefix):
      raise FatalError("Project '%s' must start with '%s'"
                       % (self.project_cvs_repos_path, prefix,))
    # The project's main directory as a cvs_path:
    self.project_cvs_path = self.project_cvs_repos_path[len(prefix):]
    if self.project_cvs_path.startswith(os.sep):
      self.project_cvs_path = self.project_cvs_path[1:]
    self.trunk_path = trunk_path
    self.branches_path = branches_path
    self.tags_path = tags_path
    verify_paths_disjoint(self.trunk_path, self.branches_path, self.tags_path)

  def is_source(self, svn_path):
    """Return True iff SVN_PATH is a legitimate source for this project.

    Legitimate paths are self.trunk_path or any directory directly
    under self.branches_path."""

    if svn_path == self.trunk_path:
      return True

    (head, tail,) = _path_split(svn_path)
    if head == self.branches_path:
      return True

    return False

  def is_unremovable(self, svn_path):
    """Return True iff the specified path must not be removed."""

    return svn_path in [self.trunk_path, self.branches_path, self.tags_path]

  def get_branch_path(self, branch_name):
    """Return the svnpath for the branch named BRANCH_NAME."""

    return _path_join(self.branches_path, _clean_symbolic_name(branch_name))

  def get_tag_path(self, tag_name):
    """Return the svnpath for the tag named TAG_NAME."""

    return _path_join(self.tags_path, _clean_symbolic_name(tag_name))

  def _relative_name(self, cvs_path):
    """Convert CVS_PATH into a name relative to this project's root directory.

    CVS_PATH has to begin (textually) with self.project_cvs_path.
    Remove prefix and optional '/'.  If CVS_PATH is exactly the
    project path, the result is the empty string.

    Raise FatalError if CVS_PATH does not begin with the project path."""

    if not cvs_path.startswith(self.project_cvs_path):
      raise FatalError(
          "_relative_name: '%s' is not a sub-path of '%s'"
          % (cvs_path, self.project_cvs_path,))
    l = len(self.project_cvs_path)
    # Look at the character after the prefix through a slice: unlike
    # cvs_path[l], that does not raise IndexError when CVS_PATH ends
    # right where the prefix ends.
    if cvs_path[l:l + 1] == os.sep:
      l += 1
    return cvs_path[l:]

  def make_trunk_path(self, cvs_path):
    """Return the trunk path for CVS_PATH.

    Return the svn path for this file on trunk."""

    return _path_join(self.trunk_path, self._relative_name(cvs_path))

  def make_branch_path(self, branch_name, cvs_path):
    """Return the svn path for CVS_PATH on branch BRANCH_NAME."""

    return _path_join(self.get_branch_path(branch_name),
                      self._relative_name(cvs_path))
1162
1163
class CVSRevision:
  """One revision of one file in the CVS repository.  An instance can
  be built either from individual field values or from a line of a
  revs file, and str() of an instance produces such a line again."""

  def __init__(self, ctx, *args):
    """Initialize a new CVSRevision with Ctx object CTX, and ARGS.

    If CTX is None, the following members and methods of the
    instantiated CVSRevision class object will be unavailable (or
    simply will not work correctly, if at all):
       cvs_path
       svn_path
       is_default_branch_revision()

    (Note that this class treats CTX as const, because the caller
    likely passed in a Borg instance of a Ctx.  The reason this class
    takes CTX as as a parameter, instead of just instantiating a Ctx
    itself, is that this class should be usable outside cvs2svn.)

    If there is one argument in ARGS, it is a string, in the format of
    a line from a revs file.  Do *not* include a trailing newline.

    If there are multiple ARGS, there must be 17 of them,
    comprising a parsed revs line:
       timestamp       -->  (int) date stamp for this cvs revision
       digest          -->  (string) digest of author+logmsg
       prev_timestamp  -->  (int) date stamp for the previous cvs revision
       next_timestamp  -->  (int) date stamp for the next cvs revision
       op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
       prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
       rev             -->  (string) this CVS rev, e.g., "1.3"
       next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
       file_in_attic   -->  (char or None) true if RCS file is in Attic
       file_executable -->  (char or None) true if RCS file has exec bit set.
       file_size       -->  (int) size of the RCS file
       deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
       fname           -->  (string) relative path of file in CVS repos
       mode            -->  (string or None) "kkv", "kb", etc.
       branch_name     -->  (string or None) branch on which this rev occurred
       tags            -->  (list of strings) all tags on this revision
       branches        -->  (list of strings) all branches rooted in this rev

    The two forms of initialization are equivalent.

    Raise TypeError if ARGS has neither 1 nor 17 elements.

    WARNING: Due to the resync process in pass2, prev_timestamp or
    next_timestamp may be incorrect in the c-revs or s-revs files."""

    self._ctx = ctx
    if len(args) == 17:
      (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
       self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
       self.file_executable, self.file_size, self.deltatext_code,
       self.fname,
       self.mode, self.branch_name, self.tags, self.branches) = args
    elif len(args) == 1:
      # The first 15 fields are separated by single spaces; whatever
      # follows (tags, branches and the filename, which may itself
      # contain spaces) is picked apart further below.
      data = args[0].split(' ', 15)
      (self.timestamp, self.digest, self.prev_timestamp, self.next_timestamp,
       self.op, self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
       self.file_executable, self.file_size, self.deltatext_code,
       self.mode, self.branch_name, numtags, remainder) = data
      # Patch up data items which are not simple strings.  A "*" in
      # the line stands for a value that __str__ found to be false.
      self.timestamp = int(self.timestamp, 16)
      if self.prev_timestamp == "*":
        self.prev_timestamp = 0
      else:
        self.prev_timestamp = int(self.prev_timestamp)
      if self.next_timestamp == "*":
        self.next_timestamp = 0
      else:
        self.next_timestamp = int(self.next_timestamp)
      if self.prev_rev == "*":
        self.prev_rev = None
      if self.next_rev == "*":
        self.next_rev = None
      if self.file_in_attic == "*":
        self.file_in_attic = None
      if self.file_executable == "*":
        self.file_executable = None
      self.file_size = int(self.file_size)
      if self.mode == "*":
        self.mode = None
      if self.branch_name == "*":
        self.branch_name = None
      # REMAINDER holds NUMTAGS tags, then the number of branches,
      # then that many branches, then the filename.
      numtags = int(numtags)
      tags_and_numbranches_and_remainder = remainder.split(' ', numtags + 1)
      self.tags = tags_and_numbranches_and_remainder[:-2]
      numbranches = int(tags_and_numbranches_and_remainder[-2])
      remainder = tags_and_numbranches_and_remainder[-1]
      branches_and_fname = remainder.split(' ', numbranches)
      self.branches = branches_and_fname[:-1]
      self.fname = branches_and_fname[-1]
    else:
      # The call form of raise is accepted by every Python version.
      raise TypeError('CVSRevision() takes 2 or 18 arguments (%d given)'
                      % (len(args) + 1))
    if ctx is not None:
      self.cvs_path = ctx.cvs_repository.get_cvs_path(self.fname)
      if self.branch_name:
        self.svn_path = ctx.project.make_branch_path(self.branch_name,
                                                     self.cvs_path)
      else:
        self.svn_path = ctx.project.make_trunk_path(self.cvs_path)

  def unique_key(self, revnum="0"):
    """Return a string that uniquely identifies a CVS revision.

    The 'primary key' of a CVS Revision is the revision number + the
    filename.  To provide a unique key (say, for a dict), we just glom
    them together in a string.  By passing in self.prev_rev or
    self.next_rev, you can get the unique key for their respective
    CVSRevisions.  Without REVNUM the key of this revision itself is
    returned; if REVNUM is None, None is returned."""
    # "0" is the marker for 'argument not given'.  It has to be
    # compared by value: whether two equal strings are the same
    # object is an implementation detail of the interpreter.
    if revnum == "0":
      revnum = self.rev
    elif revnum is None:
      return None
    return revnum + "/" + self.fname

  def __str__(self):
    """Return this revision as a line in revs-file format (without a
    trailing newline).  Fields whose value is false are written as
    "*"."""
    return ('%08lx %s %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s'
            % (self.timestamp, self.digest, self.prev_timestamp or "*",
              self.next_timestamp or "*", self.op, (self.prev_rev or "*"),
              self.rev, (self.next_rev or "*"), (self.file_in_attic or "*"),
              (self.file_executable or "*"),
              self.file_size,
              self.deltatext_code, (self.mode or "*"),
              (self.branch_name or "*"),
              len(self.tags), self.tags and " " or "", " ".join(self.tags),
              len(self.branches), self.branches and " " or "",
              " ".join(self.branches),
              self.fname, ))

  def opens_symbolic_name(self, name):
    """Return 1 if this CVSRevision is the opening CVSRevision for
    NAME (for this RCS file), else 0."""
    if name in self.tags:
      return 1
    if name in self.branches:
      # If this c_rev opens a branch and our op is OP_DELETE, then
      # that means that the file that this c_rev belongs to was
      # created on the branch, so for all intents and purposes, this
      # c_rev is *technically* not an opening.  See Issue #62 for more
      # information.
      if self.op != OP_DELETE:
        return 1
    return 0

  def is_default_branch_revision(self):
    """Return 1 if SELF.rev of SELF.cvs_path is a default branch
    revision according to DEFAULT_BRANCHES_DB (see the conditions
    documented there), else return None."""
    val = self._ctx._default_branches_db.get(self.cvs_path, None)
    if val is not None:
      # Split both revision numbers into the branch part (everything
      # before the last dot) and the final numeric component.
      val_last_dot = val.rindex(".")
      our_last_dot = self.rev.rindex(".")
      default_branch = val[:val_last_dot]
      our_branch = self.rev[:our_last_dot]
      default_rev_component = int(val[val_last_dot + 1:])
      our_rev_component = int(self.rev[our_last_dot + 1:])
      if (default_branch == our_branch
          and our_rev_component <= default_rev_component):
        return 1
    # else
    return None

  def rcs_path(self):
    """Returns the actual filesystem path to the RCS file of this
    CVSRevision."""
    if self.file_in_attic is None:
      return self.fname
    else:
      basepath, filename = os.path.split(self.fname)
      return os.path.join(basepath, 'Attic', filename)

  def filename(self):
    "Return the last path component of self.fname, minus the ',v'"
    return os.path.split(self.fname)[-1][:-2]

1334
class SymbolDatabase:
  """This database records information on all symbols in the RCS
  files.  It is created in pass 1 and it is used in pass 2."""

  def __init__(self):
    """Create an empty symbol database."""
    # A hash that maps tag names to commit counts
    self.tags = { }
    # A hash that maps branch names to lists of the format
    # [ create_count, commit_count, blockers ], where blockers
    # is a hash that lists the symbols that depend on the
    # branch.  The blockers hash is used as a set, so the
    # values are not used.
    self.branches = { }

  def register_tag_creation(self, name):
    """Register the creation of the tag NAME."""
    self.tags[name] = self.tags.get(name, 0) + 1

  def _branch(self, name):
    """Helper function to get a branch node that will create and
    initialize the node if it does not exist."""
    if name not in self.branches:
      self.branches[name] = [ 0, 0, { } ]
    return self.branches[name]

  def register_branch_creation(self, name):
    """Register the creation of the branch NAME."""
    self._branch(name)[0] += 1

  def register_branch_commit(self, name):
    """Register a commit on the branch NAME."""
    self._branch(name)[1] += 1

  def register_branch_blocker(self, name, blocker):
    """Register BLOCKER as a blocker on the branch NAME."""
    self._branch(name)[2][blocker] = None

  def branch_has_commit(self, name):
    """Return non-zero if NAME has commits.  Returns a false value if
    NAME is not a branch or if it has no commits."""
    return name in self.branches and self.branches[name][1]

  def find_excluded_symbols(self, regexp_list):
    """Returns a hash of all symbols that match the regexps in
    REGEXP_LIST.  The hash is used as a set so the values are
    not used."""
    excludes = { }
    for tag in self.tags:
      if match_regexp_list(regexp_list, tag):
        excludes[tag] = None
    for branch in self.branches:
      if match_regexp_list(regexp_list, branch):
        excludes[branch] = None
    return excludes

  def find_branch_exclude_blockers(self, branch, excludes):
    """Return a hash (used as a set) of the blockers of BRANCH that
    are not in the hash EXCLUDES.  The result is empty if BRANCH
    itself is not in EXCLUDES."""
    blockers = { }
    if branch in excludes:
      for blocker in self.branches[branch][2]:
        if blocker not in excludes:
          blockers[blocker] = None
    return blockers

  def find_blocked_excludes(self, excludes):
    """Find all branches in EXCLUDES that have blocking symbols that
    are not themselves excluded.  Return a hash that maps branch names
    to a hash of blockers.  The hash of blockers is used as a set so
    the values are not used."""
    blocked_branches = { }
    for branch in self.branches:
      blockers = self.find_branch_exclude_blockers(branch, excludes)
      if blockers:
        blocked_branches[branch] = blockers
    return blocked_branches

  def find_mismatches(self, excludes=None):
    """Find all symbols that are defined as both tags and branches,
    excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
    the symbol name, tag count, branch count and commit count."""
    if excludes is None:
      excludes = { }
    mismatches = [ ]
    for branch in self.branches:
      if branch not in excludes and branch in self.tags:
        mismatches.append((branch,                    # name
                           self.tags[branch],         # tag count
                           self.branches[branch][0],  # branch count
                           self.branches[branch][1])) # commit count
    return mismatches

  def read(self):
    """Read the symbol database from files.

    The tags file has one 'NAME COUNT' line per tag; the branches
    file has one 'NAME CREATE_COUNT COMMIT_COUNT [BLOCKER ...]' line
    per branch."""
    f = open(temp(TAGS_LIST))
    try:
      for line in f:
        tag, count = line.split()
        self.tags[tag] = int(count)
    finally:
      # Release the file handle even if a line cannot be parsed.
      f.close()

    f = open(temp(BRANCHES_LIST))
    try:
      for line in f:
        words = line.split()
        self.branches[words[0]] = [ int(words[1]), int(words[2]), { } ]
        for blocker in words[3:]:
          self.branches[words[0]][2][blocker] = None
    finally:
      f.close()

  def write(self):
    """Store the symbol database to files, in the format that read()
    parses, and register both files for cleanup at the end of pass2."""
    f = open(temp(TAGS_LIST), "w")
    try:
      Cleanup().register(temp(TAGS_LIST), pass2)
      for tag, count in self.tags.items():
        f.write("%s %d\n" % (tag, count))
    finally:
      # Close explicitly so that everything written has reached the
      # file by the time this method returns.
      f.close()

    f = open(temp(BRANCHES_LIST), "w")
    try:
      Cleanup().register(temp(BRANCHES_LIST), pass2)
      for branch, info in self.branches.items():
        f.write("%s %d %d" % (branch, info[0], info[1]))
        if info[2]:
          f.write(" ")
          f.write(" ".join(info[2].keys()))
        f.write("\n")
    finally:
      f.close()

1461
1462
1463 class FileDataCollector(cvs2svn_rcsparse.Sink):
1464   """Class responsible for collecting RCS data for a particular file.
1465
1466   Any collected data that need to be remembered are stored into the
1467   referenced CollectData instance."""
1468
1469   def __init__(self, collect_data, canonical_name, filename):
1470     """Create an object that is prepared to receive data for FILENAME.
1471     FILENAME is the absolute filesystem path to the file in question,
1472     and CANONICAL_NAME is FILENAME with the 'Attic' component removed
1473     (if the file is indeed in the Attic).  COLLECT_DATA is used to
1474     store the information collected about the file."""
1475
1476     self.collect_data = collect_data
1477
1478     self.fname = canonical_name
1479
1480     # We calculate and save some file metadata here, where we can do
1481     # it only once per file, instead of waiting until later where we
1482     # would have to do the same calculations once per CVS *revision*.
1483
1484     self.cvs_path = Ctx().cvs_repository.get_cvs_path(self.fname)
1485
1486     # If the paths are not the same, then that means that the
1487     # canonical_name has had the 'Attic' component stripped out.
1488     self.file_in_attic = None
1489     if canonical_name != filename:
1490       self.file_in_attic = 1
1491
1492     file_stat = os.stat(filename)
1493     # The size of our file in bytes
1494     self.file_size = file_stat[stat.ST_SIZE]
1495
1496     # Whether or not the executable bit is set.
1497     self.file_executable = None
1498     if file_stat[0] & stat.S_IXUSR:
1499       self.file_executable = 1
1500
1501     # revision -> [timestamp, author, old-timestamp]
1502     self.rev_data = { }
1503
1504     # Maps revision number (key) to the revision number of the
1505     # previous revision along this line of development.
1506     #
1507     # For the first revision R on a branch, we consider the revision
1508     # from which R sprouted to be the 'previous'.
1509     #
1510     # Note that this revision can't be determined arithmetically (due
1511     # to cvsadmin -o, which is why this is necessary).
1512     #
1513     # If the key has no previous revision, then store None as key's
1514     # value.
1515     self.prev_rev = { }
1516
1517     # This dict is essentially self.prev_rev with the values mapped in
1518     # the other direction, so following key -> value will yield you
1519     # the next revision number.
1520     #
1521     # Unlike self.prev_rev, if the key has no next revision, then the
1522     # key is not present.
1523     self.next_rev = { }
1524
1525     # Track the state of each revision so that in set_revision_info,
1526     # we can determine if our op is an add/change/delete.  We can do
1527     # this because in set_revision_info, we'll have all of the
1528     # revisions for a file at our fingertips, and we need to examine
1529     # the state of our prev_rev to determine if we're an add or a
1530     # change--without the state of the prev_rev, we are unable to
1531     # distinguish between an add and a change.
1532     self.rev_state = { }
1533
1534     # Hash mapping branch numbers, like '1.7.2', to branch names,
1535     # like 'Release_1_0_dev'.
1536     self.branch_names = { }
1537
1538     # RCS flags (used for keyword expansion).
1539     self.mode = None
1540
1541     # Hash mapping revision numbers, like '1.7', to lists of names
1542     # indicating which branches sprout from that revision, like
1543     # ['Release_1_0_dev', 'experimental_driver', ...].
1544     self.branchlist = { }
1545
1546     # Like self.branchlist, but the values are lists of tag names that
1547     # apply to the key revision.
1548     self.taglist = { }
1549
1550     # If set, this is an RCS branch number -- rcsparse calls this the
1551     # "principal branch", but CVS and RCS refer to it as the "default
1552     # branch", so that's what we call it, even though the rcsparse API
1553     # setter method is still 'set_principal_branch'.
1554     self.default_branch = None
1555
1556     # If the RCS file doesn't have a default branch anymore, but does
1557     # have vendor revisions, then we make an educated guess that those
1558     # revisions *were* the head of the default branch up until the
1559     # commit of 1.2, at which point the file's default branch became
1560     # trunk.  This records the date at which 1.2 was committed.
1561     self.first_non_vendor_revision_date = None
1562
1563     # A list of all symbols defined for the current file.  Used to
1564     # prevent multiple definitions of a symbol, something which can
1565     # easily happen when --symbol-transform is used.
1566     self.defined_symbols = { }
1567
  def set_principal_branch(self, branch):
    """Record BRANCH as this file's default branch.

    The parser API names this the "principal branch"; here it is
    stored as self.default_branch."""
    self.default_branch = branch
1570
  def set_expansion(self, mode):
    """Record MODE (the RCS keyword expansion flags) in self.mode."""
    self.mode = mode
1573
1574   def set_branch_name(self, branch_number, name):
1575     """Record that BRANCH_NUMBER is the branch number for branch NAME,
1576     and that NAME sprouts from BRANCH_NUMBER .
1577     BRANCH_NUMBER is an RCS branch number with an odd number of components,
1578     for example '1.7.2' (never '1.7.0.2')."""
1579     if not self.branch_names.has_key(branch_number):
1580       self.branch_names[branch_number] = name
1581       # The branchlist is keyed on the revision number from which the
1582       # branch sprouts, so strip off the odd final component.
1583       sprout_rev = branch_number[:branch_number.rfind(".")]
1584       self.branchlist.setdefault(sprout_rev, []).append(name)
1585       self.collect_data.symbol_db.register_branch_creation(name)
1586     else:
1587       sys.stderr.write("%s: in '%s':\n"
1588                        "   branch '%s' already has name '%s',\n"
1589                        "   cannot also have name '%s', ignoring the latter\n"
1590                        % (warning_prefix, self.fname, branch_number,
1591                           self.branch_names[branch_number], name))
1592
1593   def rev_to_branch_name(self, revision):
1594     """Return the name of the branch on which REVISION lies.
1595     REVISION is a non-branch revision number with an even number of,
1596     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1597     For the convenience of callers, REVISION can also be a trunk
1598     revision such as '1.2', in which case just return None."""
1599     if trunk_rev.match(revision):
1600       return None
1601     return self.branch_names.get(revision[:revision.rindex(".")])
1602
1603   def define_tag(self, name, revision):
1604     """Record a bidirectional mapping between symbolic NAME and REVISION.
1605     REVISION is an unprocessed revision number from the RCS file's
1606     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
1607     This function will determine what kind of symbolic name it is by
1608     inspection, and record it in the right places."""
1609     for (pattern, replacement) in Ctx().symbol_transforms:
1610       newname = pattern.sub(replacement, name)
1611       if newname != name:
1612         Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
1613                     % (name, newname))
1614         name = newname
1615     if self.defined_symbols.has_key(name):
1616       err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
1617                 % (error_prefix, name, self.fname)
1618       sys.stderr.write(err + "\n")
1619       self.collect_data.fatal_errors.append(err)
1620     self.defined_symbols[name] = None
1621     m = cvs_branch_tag.match(revision)
1622     if m:
1623       self.set_branch_name(m.group(1) + m.group(2), name)
1624     elif rcs_branch_tag.match(revision):
1625       self.set_branch_name(revision, name)
1626     else:
1627       self.taglist.setdefault(revision, []).append(name)
1628       self.collect_data.symbol_db.register_tag_creation(name)
1629
1630   def define_revision(self, revision, timestamp, author, state,
1631                       branches, next):
1632
1633     # Record the state of our revision for later calculations
1634     self.rev_state[revision] = state
1635
1636     # store the rev_data as a list in case we have to jigger the timestamp
1637     self.rev_data[revision] = [int(timestamp), author, None]
1638
1639     # When on trunk, the RCS 'next' revision number points to what
1640     # humans might consider to be the 'previous' revision number.  For
1641     # example, 1.3's RCS 'next' is 1.2.
1642     #
1643     # However, on a branch, the RCS 'next' revision number really does
1644     # point to what humans would consider to be the 'next' revision
1645     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
1646     #
1647     # In other words, in RCS, 'next' always means "where to find the next
1648     # deltatext that you need this revision to retrieve.
1649     #
1650     # That said, we don't *want* RCS's behavior here, so we determine
1651     # whether we're on trunk or a branch and set self.prev_rev
1652     # accordingly.
1653     #
1654     # One last thing.  Note that if REVISION is a branch revision,
1655     # instead of mapping REVISION to NEXT, we instead map NEXT to
1656     # REVISION.  Since we loop over all revisions in the file before
1657     # doing anything with the data we gather here, this 'reverse
1658     # assignment' effectively does the following:
1659     #
1660     # 1. Gives us no 'prev' value for REVISION (in this
1661     # iteration... it may have been set in a previous iteration)
1662     #
1663     # 2. Sets the 'prev' value for the revision with number NEXT to
1664     # REVISION.  So when we come around to the branch revision whose
1665     # revision value is NEXT, its 'prev' and 'prev_rev' are already
1666     # set.
1667     if trunk_rev.match(revision):
1668       self.prev_rev[revision] = next
1669       self.next_rev[next] = revision
1670     elif next:
1671       self.prev_rev[next] = revision
1672       self.next_rev[revision] = next
1673
1674     for b in branches:
1675       self.prev_rev[b] = revision
1676
1677     # Ratchet up the highest vendor head revision, if necessary.
1678     if self.default_branch:
1679       default_branch_root = self.default_branch + "."
1680       if ((revision.find(default_branch_root) == 0)
1681           and (default_branch_root.count('.') == revision.count('.'))):
1682         # This revision is on the default branch, so record that it is
1683         # the new highest default branch head revision.
1684         self.collect_data.default_branches_db[self.cvs_path] = revision
1685     else:
1686       # No default branch, so make an educated guess.
1687       if revision == '1.2':
1688         # This is probably the time when the file stopped having a
1689         # default branch, so make a note of it.
1690         self.first_non_vendor_revision_date = timestamp
1691       else:
1692         m = vendor_revision.match(revision)
1693         if m and ((not self.first_non_vendor_revision_date)
1694                   or (timestamp < self.first_non_vendor_revision_date)):
1695           # We're looking at a vendor revision, and it wasn't
1696           # committed after this file lost its default branch, so bump
1697           # the maximum trunk vendor revision in the permanent record.
1698           self.collect_data.default_branches_db[self.cvs_path] = revision
1699
1700     if not trunk_rev.match(revision):
1701       # Check for unlabeled branches, record them.  We tried to collect
1702       # all branch names when we parsed the symbolic name header
1703       # earlier, of course, but that didn't catch unlabeled branches.
1704       # If a branch is unlabeled, this is our first encounter with it,
1705       # so we have to record its data now.
1706       branch_number = revision[:revision.rindex(".")]
1707       if not self.branch_names.has_key(branch_number):
1708         branch_name = "unlabeled-" + branch_number
1709         self.set_branch_name(branch_number, branch_name)
1710
1711       # Register the commit on this non-trunk branch
1712       branch_name = self.branch_names[branch_number]
1713       self.collect_data.symbol_db.register_branch_commit(branch_name)
1714
  def tree_completed(self):
    """The revision tree has been parsed.  Analyze it for consistency.

    Make sure every revision's timestamp is strictly later than that
    of its previous revision (as recorded in self.prev_rev).  Where it
    is not, the previous revision's timestamp in self.rev_data is
    moved to one second before the current one, its former timestamp
    is kept in the third slot of its self.rev_data entry, and the
    adjustment is propagated further back along the chain of previous
    revisions as needed."""

    # Our algorithm depends upon the timestamps on the revisions occurring
    # monotonically over time.  That is, we want to see rev 1.34 occur in
    # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
    # sorting), and then tried to insert 1.34, we'd be screwed.

    # to perform the analysis, we'll simply visit all of the 'previous'
    # links that we have recorded and validate that the timestamp on the
    # previous revision is before the specified revision

    # if we have to resync some nodes, then we restart the scan. just keep
    # looping as long as we need to restart.
    while 1:
      for current, prev in self.prev_rev.items():
        if not prev:
          # no previous revision exists (i.e. the initial revision)
          continue
        t_c = self.rev_data[current][0]
        t_p = self.rev_data[prev][0]
        if t_p >= t_c:
          # the previous revision does not occur strictly before the
          # current revision.  shove the previous revision back in time
          # (and any before it that may need to shift).

          # We sync backwards and not forwards because any given CVS
          # Revision has only one previous revision.  However, a CVS
          # Revision can *be* a previous revision for many other
          # revisions (e.g., a revision that is the source of multiple
          # branches).  This becomes relevant when we do the secondary
          # synchronization in pass 2--we can make certain that we
          # don't resync a revision earlier than its previous
          # revision, but it would be non-trivial to make sure that we
          # don't resync revision R *after* any revisions that have R
          # as a previous revision.
          while t_p >= t_c:
            self.rev_data[prev][0] = t_c - 1    # new timestamp
            self.rev_data[prev][2] = t_p        # old timestamp
            # delta is always negative here, since t_p >= t_c.
            delta = t_c - 1 - t_p
            msg =  "PASS1 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
                  % (self.cvs_path, prev, time.ctime(t_p), delta)
            Log().write(LOG_VERBOSE, msg)
            if (delta > COMMIT_THRESHOLD
                or delta < (COMMIT_THRESHOLD * -1)):
              Log().write(LOG_WARN,
                          "%s: Significant timestamp change for '%s' "
                          "(%d seconds)"
                          % (warning_prefix, self.cvs_path, delta))
            # Step one link back along the chain and check whether the
            # revision we just moved now collides with *its* previous
            # revision.
            current = prev
            prev = self.prev_rev[current]
            if not prev:
              break
            t_c -= 1            # self.rev_data[current][0]
            t_p = self.rev_data[prev][0]

          # break from the for-loop, so that the scan restarts from
          # scratch with the adjusted timestamps
          break
      else:
        # finished the for-loop (no resyncing was performed)
        return
1776
1777   def set_revision_info(self, revision, log, text):
1778     timestamp, author, old_ts = self.rev_data[revision]
1779     digest = sha.new(log + '\0' + author).hexdigest()
1780     if old_ts:
1781       # the timestamp on this revision was changed. log it for later
1782       # resynchronization of other files's revisions that occurred
1783       # for this time and log message.
1784       self.collect_data.resync.write('%08lx %s %08lx\n'
1785                                      % (old_ts, digest, timestamp))
1786
1787     # "...Give back one kadam to honor the Hebrew God whose Ark this is."
1788     #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
1789     #
1790     # If revision 1.1 appears to have been created via 'cvs add'
1791     # instead of 'cvs import', then this file probably never had a
1792     # default branch, so retroactively remove its record in the
1793     # default branches db.  The test is that the log message CVS uses
1794     # for 1.1 in imports is "Initial revision\n" with no period.
1795     if revision == '1.1' and log != 'Initial revision\n':
1796       try:
1797         del self.collect_data.default_branches_db[self.cvs_path]
1798       except KeyError:
1799         pass
1800
1801     # Get the timestamps of the previous and next revisions
1802     prev_rev = self.prev_rev[revision]
1803     prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])
1804
1805     next_rev = self.next_rev.get(revision)
1806     next_timestamp, ign, ign = self.rev_data.get(next_rev, [0, None, None])
1807
1808     # How to tell if a CVSRevision is an add, a change, or a deletion:
1809     #
1810     # It's a delete if RCS state is 'dead'
1811     #
1812     # It's an add if RCS state is 'Exp.' and
1813     #      - we either have no previous revision
1814     #        or
1815     #      - we have a previous revision whose state is 'dead'
1816     #
1817     # Anything else is a change.
1818     if self.rev_state[revision] == 'dead':
1819       op = OP_DELETE
1820     elif ((self.prev_rev.get(revision, None) is None)
1821           or (self.rev_state[self.prev_rev[revision]] == 'dead')):
1822       op = OP_ADD
1823     else:
1824       op = OP_CHANGE
1825
1826     def is_branch_revision(rev):
1827       """Return True if this revision is not a trunk revision,
1828       else return False."""
1829       if rev.count('.') >= 3:
1830         return True
1831       return False
1832
1833     def is_same_line_of_development(rev1, rev2):
1834       """Return True if rev1 and rev2 are on the same line of
1835       development (i.e., both on trunk, or both on the same branch);
1836       return False otherwise.  Either rev1 or rev2 can be None, in
1837       which case automatically return False."""
1838       if rev1 is None or rev2 is None:
1839         return False
1840       if rev1.count('.') == 1 and rev2.count('.') == 1:
1841         return True
1842       if rev1[0:rev1.rfind('.')] == rev2[0:rev2.rfind('.')]:
1843         return True
1844       return False
1845
1846     # There can be an odd situation where the tip revision of a branch
1847     # is alive, but every predecessor on the branch is in state 'dead',
1848     # yet the revision from which the branch sprouts is alive.  (This
1849     # is sort of a mirror image of the more common case of adding a
1850     # file on a branch, in which the first revision on the branch is
1851     # alive while the revision from which it sprouts is dead.)
1852     #
1853     # In this odd situation, we must mark the first live revision on
1854     # the branch as an OP_CHANGE instead of an OP_ADD, because it
1855     # reflects, however indirectly, a change w.r.t. the source
1856     # revision from which the branch sprouts.
1857     #
1858     # This is issue #89.
1859     cur_num = revision
1860     if is_branch_revision(revision) and self.rev_state[revision] != 'dead':
1861       while 1:
1862         prev_num = self.prev_rev.get(cur_num, None)
1863         if not cur_num or not prev_num:
1864           break
1865         if (not is_same_line_of_development(cur_num, prev_num)
1866             and self.rev_state[cur_num] == 'dead'
1867             and self.rev_state[prev_num] != 'dead'):
1868           op = OP_CHANGE
1869         cur_num = self.prev_rev.get(cur_num, None)
1870
1871     if text:
1872       deltatext_code = DELTATEXT_NONEMPTY
1873     else:
1874       deltatext_code = DELTATEXT_EMPTY
1875
1876     c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp,
1877                         next_timestamp, op,
1878                         prev_rev, revision, next_rev,
1879                         self.file_in_attic, self.file_executable,
1880                         self.file_size,
1881                         deltatext_code, self.fname,
1882                         self.mode, self.rev_to_branch_name(revision),
1883                         self.taglist.get(revision, []),
1884                         self.branchlist.get(revision, []))
1885     self.collect_data.revs.write(str(c_rev) + "\n")
1886     StatsKeeper().record_c_rev(c_rev)
1887
1888     if not self.collect_data.metadata_db.has_key(digest):
1889       self.collect_data.metadata_db[digest] = (author, log)
1890
1891   def parse_completed(self):
1892     # Walk through all branches and tags and register them with
1893     # their parent branch in the symbol database.
1894     for revision, symbols in self.taglist.items() + self.branchlist.items():
1895       for symbol in symbols:
1896         name = self.rev_to_branch_name(revision)
1897         if name is not None:
1898           self.collect_data.symbol_db.register_branch_blocker(name, symbol)
1899
1900     self.collect_data.num_files += 1
1901
1902
class CollectData:
  """Holder of the data collected while parsing the CVS repository.

  The files and databases that receive the collected information are
  managed here.  The information itself is stored by
  FileDataCollector instances, one of which is created for each file
  to be parsed."""

  def __init__(self):
    """Create the output files and databases, registering each on-disk
    artifact with Cleanup(), and reset the in-memory bookkeeping."""
    # Plain text files, opened for writing.
    revs_path = temp(DATAFILE + REVS_SUFFIX)
    self.revs = open(revs_path, 'w')
    Cleanup().register(revs_path, pass2)

    resync_path = temp(DATAFILE + RESYNC_SUFFIX)
    self.resync = open(resync_path, 'w')
    Cleanup().register(resync_path, pass2)

    # Databases, created afresh.
    default_branches_path = temp(DEFAULT_BRANCHES_DB)
    self.default_branches_db = SDatabase(default_branches_path, DB_OPEN_NEW)
    Cleanup().register(default_branches_path, pass5)

    metadata_path = temp(METADATA_DB)
    self.metadata_db = Database(metadata_path, DB_OPEN_NEW)
    Cleanup().register(metadata_path, pass8)

    # Error messages that should be treated as fatal.
    self.fatal_errors = []

    # Number of files processed so far.
    self.num_files = 0

    self.symbol_db = SymbolDatabase()

    # 1 if we've collected data for at least one file, None otherwise.
    self.found_valid_file = None

  def write_symbol_db(self):
    """Tell the symbol database to write itself out."""
    self.symbol_db.write()
1930
1931
class SymbolingsLogger:
  """Manage the file that contains lines for symbol openings and
  closings.

  The logged data is used later to work out the ranges of Subversion
  revisions from which a file can be copied when a branch or tag is
  created in Subversion.  To that end we record an "Opening" and a
  "Closing" for each file copied onto a branch or tag.

  An "Opening" is the CVSRevision from which a given branch/tag
  sprouts on a path.

  The "Closing" for that branch/tag and path is the next CVSRevision
  on the same line of development as the opening.

  Example: on file 'foo.c', branch BEE has branch number 1.2.2, so it
  sprouts from revision 1.2.  Hence 1.2 is the opening for BEE on path
  'foo.c' and 1.3 is the closing for BEE on path 'foo.c'.  Many other
  revisions may lie chronologically between 1.2 and 1.3 (revisions on
  branches of 'foo.c', possibly on BEE itself), but 1.3 is the next
  revision *on the same line* as 1.2, and that makes it the closing
  revision for the symbolic names that 1.2 opens.

  The point of all this is to make branch and tag creation as
  efficient as possible by keeping the number of copies and deletes
  per creation small.  Suppose revisions 1.2 and 1.3 of foo.c
  correspond to revisions 17 and 30 in Subversion: when creating
  branch BEE there is then some motivation to copy from one of 17-30.
  If another file, 'bar.c', has opening and closing CVSRevisions for
  BEE that correspond to revisions 24 and 39 in Subversion, the ideal
  source for the branch copy lies somewhere between 24 and 29,
  inclusive.
  """

  def __init__(self):
    """Open the symbolings file and the temporary closings file for
    writing, and register both with Cleanup()."""
    symbolings_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    self.symbolings = open(symbolings_path, 'w')
    Cleanup().register(symbolings_path, pass6)

    closings_path = temp(SYMBOL_CLOSINGS_TMP)
    self.closings = open(closings_path, 'w')
    Cleanup().register(closings_path, pass5)

    # Maps each *source* cvs_path for which an opening has been noted
    # (see _note_default_branch_opening) to the list of (uncleaned)
    # symbolic names that the path has opened.
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Log an opening for every tag and branch sprouting from C_REV,
    unless C_REV is a delete, and, if C_REV.next_rev is not None, a
    pending closing for each of them.  Openings are logged with
    SVN_REVNUM; the revnum of a closing is determined later, in
    close()."""
    is_live = c_rev.op != OP_DELETE
    has_successor = c_rev.next_rev is not None
    symbolic_names = c_rev.tags + c_rev.branches
    for name in symbolic_names:
      self._note_default_branch_opening(c_rev, name)
      if is_live:
        self._log(name, svn_revnum,
                  c_rev.cvs_path, c_rev.branch_name, OPENING)

      # The next revision after C_REV closes this source revision.
      # Its svn_revnum is not known yet, so park the closing in the
      # closings file for later processing.
      if has_successor:
        closing_line = '%s %s\n' % (name, c_rev.unique_key(c_rev.next_rev))
        self.closings.write(closing_line)

  def _log(self, name, svn_revnum, cvs_path, branch_name, type):
    """Append one line to the symbol_openings_closings file, stating
    that SVN_REVNUM of CVS_PATH on BRANCH_NAME is the opening or the
    closing (as given by TYPE) of the symbolic name NAME.  A false
    BRANCH_NAME is written as '*'.

    TYPE should only be one of the following global constants:
    OPENING or CLOSING."""
    # 8 places gives us 99,999,999 SVN revs.  That *should* be enough.
    fields = (name, svn_revnum, type, branch_name or '*', cvs_path)
    self.symbolings.write('%s %.8d %s %s %s\n' % fields)

  def close(self):
    """Turn every pending closing in the closings file into a proper
    line of the symbolings file, looking up the svn_revnum of each
    closing CVSRevision, then close the symbolings file."""
    # Needed to map each rev_key back to its c_rev.
    cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    closings_path = temp(SYMBOL_CLOSINGS_TMP)
    for line in fileinput.FileInput(closings_path):
      (name, rev_key) = line.rstrip().split(" ", 1)
      svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

      closing_rev = cvs_revs_db.get_revision(rev_key)
      self._log(name, svn_revnum,
                closing_rev.cvs_path, closing_rev.branch_name, CLOSING)

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Remember that C_REV.cvs_path has opened SYMBOLIC_NAME, by adding
    the name to that path's list in
    self.open_paths_with_default_branches.  No check is made here that
    C_REV actually is a default branch revision."""
    opened_names = self.open_paths_with_default_branches.setdefault(
        c_rev.cvs_path, [])
    opened_names.append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If self.open_paths_with_default_branches has an entry for
    C_REV.cvs_path, log every name in that entry as a closing with
    SVN_REVNUM as the closing revision number, then drop the entry."""
    path = c_rev.cvs_path
    if not self.open_paths_with_default_branches.has_key(path):
      return

    for name in self.open_paths_with_default_branches[path]:
      self._log(name, svn_revnum, path, None, CLOSING)
    # These openings have been dealt with now.
    del self.open_paths_with_default_branches[path]
2043
2044
class PersistenceManager:
  """Store SVNCommits on disk and retrieve them again, using nothing
  but their Subversion revision number as the key.  Also look up the
  Subversion revision number belonging to the unique key of a
  CVSRevision.

  Everything pertinent to an SVNCommit is kept in a series of on-disk
  databases, so that SVNCommits can be retrieved on demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager will initialize a new set of on-disk
  databases and be fully-featured.
  In 'read' mode, PersistenceManager will open existing on-disk databases
  and the set_* methods will be unavailable."""

  def __init__(self, mode):
    """Open the databases in MODE.  Raise RuntimeError if MODE is
    neither DB_OPEN_NEW nor DB_OPEN_READ."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")

    svn2cvs_path = temp(SVN_REVNUMS_TO_CVS_REVS)
    self.svn2cvs_db = Database(svn2cvs_path, mode)
    Cleanup().register(svn2cvs_path, pass8)

    cvs2svn_path = temp(CVS_REVS_TO_SVN_REVNUMS)
    self.cvs2svn_db = Database(cvs2svn_path, mode)
    Cleanup().register(cvs2svn_path, pass8)

    # These two are always opened read-only, whatever MODE is.
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)

    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      self.tags_db = TagsDatabase(DB_OPEN_READ)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return, as an integer, the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    svn_revnum = self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM)
    return int(svn_revnum)

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    This method can throw SVNCommitInternalInconsistencyError.
    """
    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)
    nothing_stored = (None, None, None, None)
    stored = self.svn2cvs_db.get(str(svn_revnum), nothing_stored)
    (c_rev_keys, motivating_revnum, name, date) = stored
    if c_rev_keys is None:
      return None

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      if digest is not None:
        continue
      # Author and log message of the commit come from the metadata
      # of the first CVSRevision that supplies a digest.
      digest = c_rev.digest
      author, log_msg = self.svn_commit_metadata[digest]
      svn_commit.set_author(author)
      svn_commit.set_log_msg(log_msg)

    svn_commit.set_date(date)

    # A trunk-only conversion needs none of the symbol handling below.
    if Ctx().trunk_only:
      return svn_commit

    if name:
      if svn_commit.cvs_revs:
        raise SVNCommit.SVNCommitInternalInconsistencyError(
            "An SVNCommit cannot have cvs_revisions *and* a corresponding\n"
            "symbolic name ('%s') to fill."
            % (_clean_symbolic_name(name),))
      svn_commit.set_symbolic_name(name)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    if motivating_revnum is not None:
      svn_commit.set_motivating_revnum(motivating_revnum)

    return svn_commit

  def put_svn_commit(self, svn_revnum, cvs_revs,
                     date, name, motivating_revnum):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS, together with the attributes DATE, NAME and
    MOTIVATING_REVNUM.  Raise RuntimeError if this PersistenceManager
    was opened in DB_OPEN_READ mode."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

    for c_rev in cvs_revs:
      Log().write(LOG_VERBOSE, " ", c_rev.unique_key())

    # svn revnum -> (keys of its CVS revisions, plus attributes)
    unique_keys = [c_rev.unique_key() for c_rev in cvs_revs]
    self.svn2cvs_db[str(svn_revnum)] = (unique_keys,
                                        motivating_revnum, name, date)

    # key of each CVS revision -> svn revnum
    for c_rev in cvs_revs:
      self.cvs2svn_db[c_rev.unique_key()] = svn_revnum

    # If it is not a primary commit, then record last_filled.  name is
    # allowed to be None.
    if name or motivating_revnum:
      self.last_filled[name] = svn_revnum
2153
2154
2155 class CVSCommit:
2156   """Each instance of this class contains a number of CVS Revisions
2157   that correspond to one or more Subversion Commits.  After all CVS
2158   Revisions are added to the grouping, calling process_revisions will
2159   generate a Subversion Commit (or Commits) for the set of CVS
2160   Revisions in the grouping."""
2161
  def __init__(self, digest, author, log):
    """Create an empty grouping of CVS revisions.

    DIGEST, AUTHOR and LOG are stored as given in self.digest,
    self.author and self.log.  All revision lists start out empty and
    the time range starts out inverted (see the comments below)."""
    self.digest = digest
    self.author = author
    self.log = log

    # Symbolic names for which the last source revision has already
    # been seen and for which the CVSRevisionAggregator has already
    # generated a fill SVNCommit.  See self.process_revisions().
    self.done_symbols = [ ]

    # Looked up by file name in has_file().
    self.files = { }
    # Lists of CVSRevisions
    self.changes = [ ]
    self.deletes = [ ]

    # Start out with a t_min higher than any incoming time T, and a
    # t_max lower than any incoming T.  This way the first T will
    # push t_min down to T, and t_max up to T, naturally (without any
    # special-casing), and successive times will then ratchet them
    # outward as appropriate.
    self.t_min = 1L<<32
    self.t_max = 0

    # This will be set to the SVNCommit that occurs in self._commit.
    self.motivating_commit = None

    # This is a list of all non-primary commits motivated by the main
    # commit.  We gather these so that we can set their dates to the
    # same date as the primary commit.
    self.secondary_commits = [ ]

    # State for handling default branches.
    #
    # Here is a tempting, but ultimately nugatory, bit of logic, which
    # I share with you so you may appreciate the less attractive, but
    # refreshingly non-nugatory, logic which follows it:
    #
    # If some of the commits in this txn happened on a non-trunk
    # default branch, then those files will have to be copied into
    # trunk manually after being changed on the branch (because the
    # RCS "default branch" appears as head, i.e., trunk, in practice).
    # As long as those copies don't overwrite any trunk paths that
    # were also changed in this commit, then we can do the copies in
    # the same revision, because they won't cover changes that don't
    # appear anywhere/anywhen else.  However, if some of the trunk dst
    # paths *did* change in this commit, then immediately copying the
    # branch changes would lose those trunk mods forever.  So in this
    # case, we need to do at least that copy in its own revision.  And
    # for simplicity's sake, if we're creating the new revision for
    # even one file, then we just do all such copies together in the
    # new revision.
    #
    # Doesn't that sound nice?
    #
    # Unfortunately, Subversion doesn't support copies with sources
    # in the current txn.  All copies must be based in committed
    # revisions.  Therefore, we generate the above-described new
    # revision unconditionally.
    #
    # This is a list of c_revs, and a c_rev is appended for each
    # default branch commit that will need to be copied to trunk (or
    # deleted from trunk) in some generated revision following the
    # "regular" revision.
    self.default_branch_cvs_revisions = [ ]
2226
2227   def __cmp__(self, other):
2228     # Commits should be sorted by t_max.  If both self and other have
2229     # the same t_max, break the tie using t_min, and lastly, digest.
2230     # If all those are equal, then compare based on ids, to ensure
2231     # that no two instances compare equal.
2232     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
2233             or cmp(self.digest, other.digest) or cmp(id(self), id(other)))
2234
2235   def __hash__(self):
2236     return id(self)
2237
2238   def has_file(self, fname):
2239     return self.files.has_key(fname)
2240
2241   def revisions(self):
2242     return self.changes + self.deletes
2243
2244   def opens_symbolic_name(self, name):
2245     """Returns true if any CVSRevision in this commit is on a tag or a
2246     branch or is the origin of a tag or branch."""
2247     for c_rev in self.revisions():
2248       if c_rev.opens_symbolic_name(name):
2249         return 1
2250     return 0
2251
2252   def add_revision(self, c_rev):
2253     # Record the time range of this commit.
2254     #
2255     # ### ISSUE: It's possible, though unlikely, that the time range
2256     # of a commit could get gradually expanded to be arbitrarily
2257     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
2258     # problem, and anyway deciding where to break it up would be a
2259     # judgement call.  For now, we just print a warning in commit() if
2260     # this happens.
2261     if c_rev.timestamp < self.t_min:
2262       self.t_min = c_rev.timestamp
2263     if c_rev.timestamp > self.t_max:
2264       self.t_max = c_rev.timestamp
2265
2266     if c_rev.op == OP_DELETE:
2267       self.deletes.append(c_rev)
2268     else:
2269       # OP_CHANGE or OP_ADD
2270       self.changes.append(c_rev)
2271
2272     self.files[c_rev.fname] = 1
2273
2274   def _pre_commit(self):
2275     """Generates any SVNCommits that must exist before the main
2276     commit."""
2277
2278     # There may be multiple c_revs in this commit that would cause
2279     # branch B to be filled, but we only want to fill B once.  On the
2280     # other hand, there might be multiple branches committed on in
2281     # this commit.  Whatever the case, we should count exactly one
2282     # commit per branch, because we only fill a branch once per
2283     # CVSCommit.  This list tracks which branches we've already
2284     # counted.
2285     accounted_for_sym_names = [ ]
2286
2287     def fill_needed(c_rev, pm):
2288       """Return 1 if this is the first commit on a new branch (for
2289       this file) and we need to fill the branch; else return 0
2290       (meaning that some other file's first commit on the branch has
2291       already done the fill for us).
2292
2293       If C_REV.op is OP_ADD, only return 1 if the branch that this
2294       commit is on has no last filled revision.
2295
2296       PM is a PersistenceManager to query.
2297       """
2298
2299       # Different '.' counts indicate that c_rev is now on a different
2300       # line of development (and may need a fill)
2301       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
2302         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
2303         # It should be the case that when we have a file F that
2304         # is added on branch B (thus, F on trunk is in state
2305         # 'dead'), we generate an SVNCommit to fill B iff the branch
2306         # has never been filled before.
2307         #
2308         # If this c_rev.op == OP_ADD, *and* the branch has never
2309         # been filled before, then fill it now.  Otherwise, no need to
2310         # fill it.
2311         if c_rev.op == OP_ADD:
2312           if pm.last_filled.get(c_rev.branch_name, None) is None:
2313             return 1
2314         elif c_rev.op == OP_CHANGE:
2315           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
2316             return 1
2317         elif c_rev.op == OP_DELETE:
2318           if pm.last_filled.get(c_rev.branch_name, None) is None:
2319             return 1
2320       return 0
2321
2322     for c_rev in self.changes + self.deletes:
2323       # If a commit is on a branch, we must ensure that the branch
2324       # path being committed exists (in HEAD of the Subversion
2325       # repository).  If it doesn't exist, we will need to fill the
2326       # branch.  After the fill, the path on which we're committing
2327       # will exist.
2328       if c_rev.branch_name \
2329           and c_rev.branch_name not in accounted_for_sym_names \
2330           and c_rev.branch_name not in self.done_symbols \
2331           and fill_needed(c_rev, Ctx()._persistence_manager):
2332         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
2333                                % c_rev.branch_name)
2334         svn_commit.set_symbolic_name(c_rev.branch_name)
2335         self.secondary_commits.append(svn_commit)
2336         accounted_for_sym_names.append(c_rev.branch_name)
2337
2338   def _commit(self):
2339     """Generates the primary SVNCommit that corresponds to this
2340     CVSCommit."""
2341     # Generate an SVNCommit unconditionally.  Even if the only change
2342     # in this CVSCommit is a deletion of an already-deleted file (that
2343     # is, a CVS revision in state 'dead' whose predecessor was also in
2344     # state 'dead'), the conversion will still generate a Subversion
2345     # revision containing the log message for the second dead
2346     # revision, because we don't want to lose that information.
2347     svn_commit = SVNCommit("commit")
2348     self.motivating_commit = svn_commit
2349
2350     for c_rev in self.changes:
2351       svn_commit.add_revision(c_rev)
2352       # Only make a change if we need to.  When 1.1.1.1 has an empty
2353       # deltatext, the explanation is almost always that we're looking
2354       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
2355       # such imports, CVS creates an RCS file where 1.1 has the
2356       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
2357       # content as 1.1.  There's no reason to reflect this non-change
2358       # in the repository, so we want to do nothing in this case.  (If
2359       # we were really paranoid, we could make sure 1.1's log message
2360       # is the CVS-generated "Initial revision\n", but I think the
2361       # conditions below are strict enough.)
2362       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
2363               and (c_rev.rev == "1.1.1.1")):
2364         if c_rev.is_default_branch_revision():
2365           self.default_branch_cvs_revisions.append(c_rev)
2366
2367     for c_rev in self.deletes:
2368       # When a file is added on a branch, CVS not only adds the file
2369       # on the branch, but generates a trunk revision (typically
2370       # 1.1) for that file in state 'dead'.  We only want to add
2371       # this revision if the log message is not the standard cvs
2372       # fabricated log message.
2373       if c_rev.prev_rev is None:
2374         # c_rev.branches may be empty if the originating branch
2375         # has been excluded.
2376         if not c_rev.branches:
2377           continue
2378         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
2379                              % (c_rev.filename(),
2380                                 c_rev.branches[0]))
2381         author, log_msg = \
2382             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
2383         if log_msg == cvs_generated_msg:
2384           continue
2385
2386       svn_commit.add_revision(c_rev)
2387       if c_rev.is_default_branch_revision():
2388         self.default_branch_cvs_revisions.append(c_rev)
2389
2390     # There is a slight chance that we didn't actually register any
2391     # CVSRevisions with our SVNCommit (see loop over self.deletes
2392     # above), so if we have no CVSRevisions, we don't flush the
2393     # svn_commit to disk and roll back our revnum.
2394     if len(svn_commit.cvs_revs) > 0:
2395       svn_commit.flush()
2396     else:
2397       # We will not be flushing this SVNCommit, so rollback the
2398       # SVNCommit revision counter.
2399       SVNCommit.revnum -= 1
2400
2401     if not Ctx().trunk_only:
2402       for c_rev in self.revisions():
2403         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
2404
2405   def _post_commit(self):
2406     """Generates any SVNCommits that we can perform now that _commit
2407     has happened.  That is, handle non-trunk default branches.
2408     Sometimes an RCS file has a non-trunk default branch, so a commit
2409     on that default branch would be visible in a default CVS checkout
2410     of HEAD.  If we don't copy that commit over to Subversion's trunk,
2411     then there will be no Subversion tree which corresponds to that
2412     CVS checkout.  Of course, in order to copy the path over, we may
2413     first need to delete the existing trunk there.  """
2414
2415     # Only generate a commit if we have default branch revs
2416     if len(self.default_branch_cvs_revisions):
2417       # Generate an SVNCommit for all of our default branch c_revs.
2418       svn_commit = SVNCommit("post-commit default branch(es)")
2419       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
2420       for c_rev in self.default_branch_cvs_revisions:
2421         svn_commit.add_revision(c_rev)
2422         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
2423                                                             svn_commit.revnum)
2424       self.secondary_commits.append(svn_commit)
2425
2426   def process_revisions(self, done_symbols):
2427     """Process all the CVSRevisions that this instance has, creating
2428     one or more SVNCommits in the process.  Generate fill SVNCommits
2429     only for symbols not in DONE_SYMBOLS (avoids unnecessary
2430     fills).
2431
2432     Return the primary SVNCommit that corresponds to this CVSCommit.
2433     The returned SVNCommit is the commit that motivated any other
2434     SVNCommits generated in this CVSCommit."""
2435     self.done_symbols = done_symbols
2436     seconds = self.t_max - self.t_min + 1
2437
2438     Log().write(LOG_VERBOSE, '-' * 60)
2439     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2440     if seconds == 1:
2441       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
2442                   % time.ctime(self.t_max))
2443     else:
2444       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
2445       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
2446                   % (time.ctime(self.t_max), seconds))
2447
2448     if seconds > COMMIT_THRESHOLD + 1:
2449       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2450                   % (warning_prefix, COMMIT_THRESHOLD))
2451
2452     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2453       self._commit()
2454       return self.motivating_commit
2455
2456     self._pre_commit()
2457     self._commit()
2458     self._post_commit()
2459
2460     for svn_commit in self.secondary_commits:
2461       svn_commit.set_date(self.motivating_commit.get_date())
2462       svn_commit.flush()
2463
2464     return self.motivating_commit
2465
2466
class SVNCommit:
  """A single commit to the Subversion repository.

  An SVNCommit is one of three kinds:

  1. It commits one or more CVSRevisions (and cannot fill a symbolic
     name).

  2. It creates or fills a symbolic name (and cannot commit
     CVSRevisions).

  3. It updates trunk to reflect the contents of a particular branch
     (this is how RCS default branches are handled)."""

  # Revision number that the next new SVNCommit will receive.  The
  # count starts at 2 because SVNRepositoryMirror uses the first
  # commit to create trunk, tags, and branches.
  revnum = 2

  class SVNCommitInternalInconsistencyError(Exception):
    """Raised when an impossible state is encountered in the SVNCommit
    databases."""
    pass

  def __init__(self, description="", revnum=None, cvs_revs=None):
    """Create an SVNCommit.

    DESCRIPTION is used for debugging only.  If REVNUM is given, the
    SVNCommit corresponds to that revision number; otherwise it takes
    the next number from the class-wide SVNCommit.revnum counter,
    which is then incremented.  CVS_REVS, if given, must be the exact
    set of CVSRevisions for REVNUM.

    It is an error to pass CVS_REVS without REVNUM, but REVNUM may be
    passed without CVS_REVS, with revisions then added one at a time
    through add_revision()."""
    self._description = description

    # Revprop metadata for this commit.  The initial values are mere
    # placeholders; at least the log message and the date should have
    # been replaced by the time they are used.
    #
    # These attributes are private because they are handed out
    # encoded in UTF8, while callers are not required to set them in
    # UTF8.  They are set through the accessor methods below and read,
    # in dictionary form, through self.get_revprops().
    self._author = Ctx().username
    self._log_msg = "This log message means an SVNCommit was used too soon."
    self._max_date = 0  # Latest date seen so far.

    self.cvs_revs = cvs_revs or []

    if not revnum:
      revnum = SVNCommit.revnum
      SVNCommit.revnum += 1
    self.revnum = revnum

    # The (uncleaned) symbolic name filled by this SVNCommit, if any.
    self.symbolic_name = None

    # For a default branch synchronization commit, the Subversion
    # revision number of the *primary* commit in which the default
    # branch changes actually happened; None for any other commit.
    #
    # Several synchronization commits may refer to the same motivating
    # revision number, and a single synchronization commit may contain
    # CVSRevisions from several different default branches.
    self.motivating_revnum = None

    # True only if this commit fills a symbolic name that is a tag;
    # None in all other cases.
    self.is_tag = None

  def set_symbolic_name(self, symbolic_name):
    "Record SYMBOLIC_NAME as the symbolic name filled by this commit."
    self.symbolic_name = symbolic_name

  def set_motivating_revnum(self, revnum):
    "Record REVNUM as the revision number that motivated this commit."
    self.motivating_revnum = revnum

  def set_author(self, author):
    """Make AUTHOR (a locally-encoded string) the author of this
    SVNCommit.  No other method sets the author."""
    self._author = author

  def set_log_msg(self, msg):
    """Make MSG (a locally-encoded string) the log message of this
    SVNCommit.  No other method sets the log message."""
    self._log_msg = msg

  def set_date(self, date):
    """Make DATE (an integer) the date of this SVNCommit.

    self.add_revision() keeps the date up to date from the
    CVSRevisions it is given, so calling this may be unnecessary; and
    a value set here may be overwritten by a later call to
    self.add_revision()."""
    self._max_date = date

  def get_date(self):
    """Return the date of this SVNCommit as an integer."""
    return self._max_date

  def get_revprops(self):
    """Return a dictionary with the Subversion revprops (svn:author,
    svn:log and svn:date) of this SVNCommit.

    Author and log message are converted to UTF8.  If that conversion
    raises UnicodeError, a warning is logged and the unconverted
    values are returned instead."""
    date = format_date(self._max_date)
    try:
      if self._author is None:
        utf8_author = None
      else:
        utf8_author = to_utf8(self._author)
      utf8_log = to_utf8(self.get_log_msg())
    except UnicodeError:
      Log().write(LOG_WARN, '%s: problem encoding author or log message:'
                  % warning_prefix)
      Log().write(LOG_WARN, "  author: '%s'" % self._author)
      Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
      Log().write(LOG_WARN, "  date:   '%s'" % date)
      Log().write(LOG_WARN,
                  "(subversion rev %s)  Related files:" % self.revnum)
      for c_rev in self.cvs_revs:
        Log().write(LOG_WARN, " ", c_rev.fname)

      Log().write(LOG_WARN, "Consider rerunning with one or more ",
                  "'--encoding' parameters.\n")
      # Falling back to the original data (of unknown encoding) is
      # better than either quitting or recording nothing at all.
      return { 'svn:author' : self._author,
               'svn:log'    : self.get_log_msg(),
               'svn:date'   : date }
    return { 'svn:author' : utf8_author,
             'svn:log'    : utf8_log,
             'svn:date'   : date }

  def add_revision(self, cvs_rev):
    """Append CVS_REV to this commit's CVSRevisions, and move the
    commit date forward to CVS_REV.timestamp if that is later."""
    self.cvs_revs.append(cvs_rev)
    self._max_date = max(self._max_date, cvs_rev.timestamp)

  def flush(self):
    """Log the creation of this Subversion revision and store the
    commit through the PersistenceManager."""
    Log().write(LOG_NORMAL, "Creating Subversion r%d (%s)"
                % (self.revnum, self._description))
    persistence_manager = Ctx()._persistence_manager
    persistence_manager.put_svn_commit(self.revnum,
                                       self.cvs_revs,
                                       self._max_date,
                                       self.symbolic_name,
                                       self.motivating_revnum)

  def __str__(self):
    """Return a human-readable description of this SVNCommit.  The
    description is not intended to be machine-parseable (although
    nobody is going to stop you if you try!)"""
    lines = ["SVNCommit #: " + str(self.revnum)]
    if self.symbolic_name:
      lines.append("   symbolic name: "
                   + _clean_symbolic_name(self.symbolic_name))
    else:
      lines.append("   NO symbolic name")
    lines.append("   debug description: " + self._description)
    lines.append("   cvs_revs:")
    for c_rev in self.cvs_revs:
      lines.append("     " + c_rev.unique_key())
    # Every line, the last one included, ends with a newline.
    return "\n".join(lines) + "\n"

  def get_log_msg(self):
    """Return the actual log message for a primary commit, and the
    appropriate manufactured log message for a secondary commit."""
    if self.symbolic_name is not None:
      return self._log_msg_for_symbolic_name_commit()
    if self.motivating_revnum is not None:
      return self._log_msg_for_default_branch_commit()
    return self._log_msg

  def _log_msg_for_symbolic_name_commit(self):
    """Return the log message for a manufactured commit that fills
    self.symbolic_name.  The message is worded for a tag if
    self.is_tag is true, and for a branch otherwise."""
    if self.is_tag:
      kind = 'tag'
    else:
      kind = 'branch'

    # Wrap by hand: a cleaned name of 13 or more characters goes onto
    # a line of its own.
    cleaned_symbolic_name = _clean_symbolic_name(self.symbolic_name)
    if len(cleaned_symbolic_name) >= 13:
      separator = '\n'
    else:
      separator = ' '

    template = "This commit was manufactured by cvs2svn to create %s%s'%s'."
    return template % (kind, separator, cleaned_symbolic_name)

  def _log_msg_for_default_branch_commit(self):
    """Return the log message for a manufactured commit that
    synchronizes a non-trunk default branch with trunk."""
    template = ('This commit was generated by cvs2svn to compensate for '
                'changes in r%d,\n'
                'which included commits to RCS files with non-trunk default '
                'branches.\n')
    return template % self.motivating_revnum
2663
class CVSRevisionAggregator:
  """Groups CVSRevisions into CVSCommits, each of which represents at
  least one SVNCommit."""

  def __init__(self):
    """Open the databases the aggregator reads from, start with empty
    queues, and install a SymbolingsLogger, a PersistenceManager and
    the default branches database on Ctx()."""
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB),
                                   DB_OPEN_READ)

    # A map { key : CVSCommit } of the CVS commits that are currently
    # being accumulated.  While a CVSCommit is still open to further
    # CVSRevisions, its key is CVSRevision.digest.  Once it is closed
    # (because an inbound revision wanted to affect a file that was
    # already within the CVSCommit), its key is CVSRevision.digest
    # with some number of '-' appended.
    self.cvs_commits = {}

    # List of ready commits.
    self.ready_queue = [ ]

    # A map { symbol : None } of the symbolic names whose last source
    # CVSRevision has already been processed, but which have not been
    # closed yet.
    self.pending_symbols = {}

    # A list of closed symbols: the last CVSRevision that is a source
    # for the symbol has been encountered, the final fill has been
    # done, and the symbol never needs to be filled again.
    self.done_symbols = [ ]

    # The most recently created primary svn_commit object.  It is kept
    # merely for its date, which is given to the SVNCommits created in
    # self._attempt_to_commit_symbols().
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = SDatabase(temp(DEFAULT_BRANCHES_DB),
                                           DB_OPEN_READ)

  def _extract_ready_commits(self, timestamp):
    """Move every active commit that has expired by TIMESTAMP from
    self.cvs_commits to self.ready_queue."""

    # Iterate over a snapshot of the keys, because entries are deleted
    # from the dictionary along the way.
    for digest_key in list(self.cvs_commits.keys()):
      candidate = self.cvs_commits[digest_key]
      if timestamp > candidate.t_max + COMMIT_THRESHOLD:
        self.ready_queue.append(candidate)
        del self.cvs_commits[digest_key]

  def _commit_ready_commits(self):
    """Sort the commits in self.ready_queue, then process them in that
    order until the queue is empty."""
    self.ready_queue.sort()
    while self.ready_queue:
      next_commit = self.ready_queue.pop(0)
      self.latest_primary_svn_commit = \
          next_commit.process_revisions(self.done_symbols)
      self._attempt_to_commit_symbols()

  def process_revision(self, c_rev):
    """Feed the CVSRevision C_REV into the aggregator: queue the
    commits that have expired by its timestamp, add C_REV to the
    matching open CVSCommit (creating one if necessary), and process
    whatever is ready."""
    # Every new revision is an occasion to check whether any of the
    # accumulating commits is ready for processing.
    self._extract_ready_commits(c_rev.timestamp)

    # Work on a snapshot of the items, because entries are re-keyed
    # along the way.
    for digest_key, pending in list(self.cvs_commits.items()):
      # If the inbound revision is on the same file as a pending
      # commit, close the pending commit to further changes.  Don't
      # flush it though, as there may be other pending commits dated
      # before this one.
      # ### ISSUE: the has_file() check below is not optimal.
      # It does fix the dataloss bug where revisions would get lost
      # if checked in too quickly, but it can also break apart the
      # commits.  The correct fix would require tracking the
      # dependencies between change sets and committing them in
      # proper order.
      if not pending.has_file(c_rev.fname):
        continue
      # Find a string that is not already a key of self.cvs_commits.
      closed_key = digest_key + '-'
      while closed_key in self.cvs_commits:
        closed_key += '-'
      self.cvs_commits[closed_key] = pending
      del self.cvs_commits[digest_key]

    # Add the revision to the still-open commit with its digest,
    # starting a new commit if there is none.
    open_commit = self.cvs_commits.get(c_rev.digest)
    if open_commit is None:
      author, log = self.metadata_db[c_rev.digest]
      open_commit = CVSCommit(c_rev.digest, author, log)
      self.cvs_commits[c_rev.digest] = open_commit
    open_commit.add_revision(c_rev)

    # Whatever is in self.ready_queue at this point needs to be
    # processed, because this latest revision couldn't possibly be
    # part of any of those commits.
    self._commit_ready_commits()

    self._add_pending_symbols(c_rev)

  def flush(self):
    """Commit anything left in self.cvs_commits.  Then, unless this is
    a trunk-only conversion, close the SymbolingsLogger."""

    # 2 ** 32 serves as a timestamp later than any commit's.
    self._extract_ready_commits(2 ** 32)
    self._commit_ready_commits()

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def _add_pending_symbols(self, c_rev):
    """Add to self.pending_symbols any symbols for which C_REV is the
    last CVSRevision.

    Unless this is a trunk-only conversion, look up the symbolic names
    for which C_REV is the last *source* CVSRevision, and add them to
    those left over from previous passes through the aggregator."""

    if Ctx().trunk_only:
      return
    for sym in self.last_revs_db.get(c_rev.unique_key(), []):
      self.pending_symbols[sym] = None

  def _attempt_to_commit_symbols(self):
    """Generate one SVNCommit for each symbol in self.pending_symbols
    that doesn't have an opening CVSRevision in either
    self.ready_queue or self.cvs_commits.values()."""

    # All commits that are still pending, accumulating or ready.
    pending_commits = list(self.cvs_commits.values())
    pending_commits.extend(self.ready_queue)

    # Collect the pending symbols that have no *source* CVSRevision
    # left in any of the pending commits.
    closeable_symbols = []
    for sym in self.pending_symbols:
      blocked = 0
      for cvs_commit in pending_commits:
        if cvs_commit.opens_symbolic_name(sym):
          blocked = 1
          break
      if not blocked:
        closeable_symbols.append(sym)

    # Sort the closeable symbols, so that they are always processed in
    # the same order, regardless of the order in which the dictionary
    # hands them back.  This way the tests get the same results on all
    # platforms.
    closeable_symbols.sort()
    for sym in closeable_symbols:
      closing_commit = SVNCommit("closing tag/branch '%s'" % sym)
      closing_commit.set_symbolic_name(sym)
      closing_commit.set_date(self.latest_primary_svn_commit.get_date())
      closing_commit.flush()
      self.done_symbols.append(sym)
      del self.pending_symbols[sym]
2814
2815
class SymbolingsReader:
  """Provides an interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file
  and the SYMBOL_OFFSETS_DB.  Does the heavy lifting of finding and
  returning the correct opening and closing Subversion revision
  numbers for a given symbolic name."""

  def __init__(self):
    """Open the SYMBOL_OPENINGS_CLOSINGS_SORTED file for reading, and
    read the offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets database is really small, and the offsets are read
    # and updated a fair bit, so keep them in an in-memory dictionary.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    offsets = { }
    for symbol in offsets_db:
      offsets[symbol] = offsets_db[symbol]
    self.offsets = offsets

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Given SYMBOLIC_NAME and SVN_REVNUM, return a new
    SymbolicNameFillingGuide object.

    Only the entries of SYMBOLIC_NAME whose revision number is not
    greater than SVN_REVNUM are used.  So if an opening rev is
    encountered in this fill, but the corresponding closing rev takes
    place later than SVN_REVNUM, the closing is not passed to the
    SymbolicNameFillingGuide in this fill (and is discarded when
    encountered in a later fill).  This is perfectly fine, because a
    valid fill is still possible without the closing: whatever can be
    filled is filled as soon as possible."""

    openings_closings_map = OpeningsClosingsMap(symbolic_name)

    # A symbol does not necessarily have an offset: it's possible to
    # have a branch start with a file that was added on a branch.
    if symbolic_name not in self.offsets:
      return SymbolicNameFillingGuide(openings_closings_map)

    # Resume reading where the entries of this symbol (still) start.
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      line_start = self.symbolings.tell()
      line = self.symbolings.readline().rstrip()
      if not line:
        break
      fields = line.split(" ", 4)
      name, revnum, entry_type, branch_name, cvs_path = fields
      # A branch name of '*' yields a trunk path.
      if branch_name == '*':
        svn_path = Ctx().project.make_trunk_path(cvs_path)
      else:
        svn_path = Ctx().project.make_branch_path(branch_name, cvs_path)
      revnum = int(revnum)
      # Stop at the first entry that belongs to another symbol or
      # that lies beyond SVN_REVNUM.
      if name != symbolic_name or revnum > svn_revnum:
        break
      openings_closings_map.register(svn_path, revnum, entry_type)

    # If anything that was read got used, let the next fill start at
    # the beginning of the line that was read last.
    if not openings_closings_map.is_empty():
      self.offsets[symbolic_name] = line_start

    return SymbolicNameFillingGuide(openings_closings_map)
2876
2877
class SvnRevisionRange:
  """The range of Subversion revision numbers from which a path can be
  copied.

  self.opening_revnum is the number of the earliest such revision,
  and self.closing_revnum is one higher than the number of the last
  such revision.  self.closing_revnum is None as long as no closing
  has been registered."""

  def __init__(self, opening_revnum):
    """Create a range that opens at OPENING_REVNUM and has no closing
    yet."""
    self.closing_revnum = None
    self.opening_revnum = opening_revnum

  def add_closing(self, closing_revnum):
    """Register CLOSING_REVNUM as the closing of this range, unless a
    closing has been registered before."""
    # With a non-trunk default branch there may be multiple closings;
    # only the first one encountered counts.
    if self.closing_revnum is not None:
      return
    self.closing_revnum = closing_revnum

  def __str__(self):
    """Return '[OPENING:CLOSING]', or '[OPENING:]' if there is no
    closing."""
    if self.closing_revnum is not None:
      return '[%d:%d]' % (self.opening_revnum, self.closing_revnum,)
    return '[%d:]' % (self.opening_revnum,)
2900
2901
class OpeningsClosingsMap:
  """A dictionary of openings and closings for a symbolic name in the
  current SVNCommit.

  The user should call self.register() for the openings and closings,
  then hand the map to the SymbolicNameFillingGuide constructor, which
  reads the accumulated ranges back through self.get_things()."""

  def __init__(self, symbolic_name):
    """Initialize OpeningsClosingsMap for SYMBOLIC_NAME and prepare it
    for receiving openings and closings."""

    self.name = symbolic_name

    # A dictionary of SVN_PATHS to SvnRevisionRange objects.
    self.things = { }

  def register(self, svn_path, svn_revnum, type):
    """Register an opening or closing revision for this symbolic name.
    SVN_PATH is the source path that needs to be copied into the
    symbolic name self.name, and SVN_REVNUM is either the first svn
    revision number that we can copy from (our opening), or the last
    (not inclusive) svn revision number that we can copy from (our
    closing).  TYPE indicates whether this path is an opening or a
    closing.

    The opening for a given SVN_PATH must be passed before the closing
    for it to have any effect... any closing encountered before a
    corresponding opening will be discarded.

    A further opening for an SVN_PATH that is already registered
    starts a fresh range for that path, dropping any closing recorded
    so far.

    It is not necessary to pass a corresponding closing for every
    opening.  A TYPE that is neither OPENING nor CLOSING is ignored.
    """
    if type == OPENING:
      # Always log an OPENING.
      self.things[svn_path] = SvnRevisionRange(svn_revnum)
    elif type == CLOSING and svn_path in self.things:
      # Only log a closing if we've already registered the opening for
      # that path.
      self.things[svn_path].add_closing(svn_revnum)

  def is_empty(self):
    """Return true if no opening has been registered yet, false
    otherwise.  (Closings alone never create an entry.)"""
    return not self.things

  def get_things(self):
    """Return the (svn_path, SvnRevisionRange) pairs for all svn_paths
    with a registered opening."""

    return self.things.items()
2953
2954
class SymbolicNameFillingGuide:
  """A node tree representing the source paths to be copied to fill
  the symbolic name self.name in the current SVNCommit.

  self._node_tree is the root of the directory tree, in the form {
  path_component : subnode }.  Leaf nodes are instances of
  SvnRevisionRange.  Intermediate (directory) nodes are dictionaries
  mapping relative names to subnodes.

  By walking self._node_tree and calling self.get_best_revnum() on
  each node, the caller can determine what subversion revision number
  to copy the path corresponding to that node from.  self._node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""

  def __init__(self, openings_closings_map):
    """Initialize a SymbolicNameFillingGuide for the symbolic name of
    OPENINGS_CLOSINGS_MAP and store into it the openings and closings
    from OPENINGS_CLOSINGS_MAP."""

    self.name = openings_closings_map.name

    # The root of our node tree, as a map { path_component : subnode }.
    self._node_tree = { }

    for svn_path, svn_revision_range in openings_closings_map.get_things():
      (head, tail) = _path_split(svn_path)
      self._get_node_for_path(head)[tail] = svn_revision_range

    #self.print_node_tree(self._node_tree)

  def _get_node_for_path(self, svn_path):
    """Return the directory node (a dictionary) for SVN_PATH, creating
    new nodes as needed."""
    # Walk down the path, one node at a time.
    node = self._node_tree
    for component in svn_path.split('/'):
      # setdefault() returns the existing subnode, or installs and
      # returns the fresh empty one.
      node = node.setdefault(component, { })

    return node

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE.

    Return a tuple (REVNUM, SCORE): REVNUM is the chosen subversion
    revision number and SCORE is the highest score found by
    score_revisions() for the ranges at and under NODE.

    PREFERRED_REVNUM is passed to best_rev and used to calculate the
    best revnum; it may be None if no revision is preferred.

    Raise FatalError if no revision to copy from can be found."""

    def score_revisions(svn_revision_ranges):
      """Return a list of revisions and scores based on
      SVN_REVISION_RANGES.  The returned list looks like:

         [(REV1 SCORE1), (REV2 SCORE2), ...]

      where the tuples are sorted by revision number.
      SVN_REVISION_RANGES is a list of SvnRevisionRange objects.

      For each svn revision that appears as either an opening_revnum
      or closing_revnum for one of the svn_revision_ranges, output a
      tuple indicating how many of the SvnRevisionRanges include that
      svn_revision in its range.  A score thus indicates that copying
      the corresponding revision (or any following revision up to the
      next revision in the list) of the object in question would yield
      that many correct paths at or underneath the object.  There may
      be other paths underneath it which are not correct and would
      need to be deleted or recopied; those can only be detected by
      descending and examining their scores.

      If SVN_REVISION_RANGES is empty, return the empty list."""
      # Create a list with both openings (which increment the total)
      # and closings (which decrement the total):
      changes = [ (x.opening_revnum, 1) for x in svn_revision_ranges ]
      changes += [ (x.closing_revnum, -1)
                   for x in svn_revision_ranges
                   if x.closing_revnum is not None ]

      # Every range has an opening, so no changes means no ranges.
      if not changes:
        return []

      # Sort by revision number:
      changes.sort()

      scores = []
      total = 0
      for (rev, change) in changes:
        total += change
        if scores and scores[-1][0] == rev:
          # Same revision as last entry; modify last entry:
          scores[-1] = (rev, total)
        else:
          # Previously-unseen revision; create new entry:
          scores.append((rev, total))
      return scores

    def best_rev(scores, preferred_rev):
      """Return a tuple (REV, MAX_SCORE) for SCORES, a list returned by
      score_revisions().  REV is the revision with the highest score.
      When the maximum score is shared by multiple revisions, the
      oldest revision is selected, unless PREFERRED_REV is one of the
      possibilities, in which case, it is selected.  REV is
      SVN_INVALID_REVNUM if no revision has a positive score."""
      max_score = 0
      preferred_rev_score = -1
      rev = SVN_INVALID_REVNUM
      if preferred_rev is None:
        # Comparison order of different types is arbitrary.  Do not
        # expect None to compare less than int values below.
        preferred_rev = SVN_INVALID_REVNUM
      for revnum, count in scores:
        if count > max_score:
          max_score = count
          rev = revnum
        if revnum <= preferred_rev:
          # SCORES is sorted, so the last entry at or before
          # PREFERRED_REV holds the score in effect at PREFERRED_REV.
          preferred_rev_score = count
      if preferred_rev_score == max_score:
        rev = preferred_rev
      return rev, max_score

    # Aggregate openings and closings from the rev tree
    svn_revision_ranges = self._list_revnums(node)

    # Score the lists
    scores = score_revisions(svn_revision_ranges)

    revnum, max_score = best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      raise FatalError("failed to find a revision "
                       + "to copy from when copying %s" % self.name)
    return revnum, max_score

  def _list_revnums(self, node):
    """Return a list of all the SvnRevisionRanges (including
    duplicates) for all leaf nodes at and under NODE."""

    if isinstance(node, SvnRevisionRange):
      # It is a leaf node.
      return [ node ]

    # It is an intermediate node.
    revnums = []
    for subnode in node.values():
      revnums.extend(self._list_revnums(subnode))
    return revnums

  def get_sources(self):
    """Return the list of sources for this symbolic name, as FillSource
    instances.

    The Project instance defines what are legitimate sources.  Raise
    FatalError if a change occurred outside of the source
    directories."""

    return self._get_sub_sources('', self._node_tree)

  def _get_sub_sources(self, start_svn_path, start_node):
    """Return the list of sources for this symbolic name, starting the
    search at path START_SVN_PATH, which is node START_NODE.  This is
    a helper method, called by get_sources() (see)."""

    project = Ctx().project
    if isinstance(start_node, SvnRevisionRange):
      # This implies that a change was found outside of the
      # legitimate sources.  This should never happen.
      raise FatalError("found a change to '%s' outside of the legitimate "
                       "sources while filling %s"
                       % (start_svn_path, self.name))

    if project.is_source(start_svn_path):
      # This is a legitimate source.  Add it to list.
      return [ FillSource(start_svn_path, start_node) ]

    # This is a directory that is not a legitimate source.  (That's
    # OK because it hasn't changed directly.)  But directories
    # within it have been changed, so we need to search recursively
    # to find their enclosing sources.
    sources = []
    for entry, node in start_node.items():
      svn_path = _path_join(start_svn_path, entry)
      sources.extend(self._get_sub_sources(svn_path, node))
    return sources

  def print_node_tree(self, node, name='/', indent_depth=0):
    """For debugging purposes.  Print all nodes of the tree that are
    rooted at NODE, labelling NODE itself with NAME.  INDENT_DEPTH is
    used to indent the output of recursive calls."""
    # Each line is built as one string and printed with a single
    # parenthesized argument, which reads the same whether print is a
    # statement or a function.
    if not indent_depth:
      print("TREE " + "=" * 75)
    indent = " " * (indent_depth * 2)
    if isinstance(node, SvnRevisionRange):
      print("TREE: %s %s %s" % (indent, name, node))
    else:
      print("TREE: %s %s" % (indent, name))
      for key, value in node.items():
        self.print_node_tree(value, key, (indent_depth + 1))
3159
3160
class FillSource:
  """A candidate copy source used by the symbol filler in
  SVNRepositoryMirror.

  self.prefix is the svn path of the source and self.node is the
  SymbolicNameFillingGuide node that goes with it.  self.score and
  self.revnum stay None until set_score() is called."""

  def __init__(self, prefix, node):
    """Create an unscored fill source from PREFIX and NODE."""
    self.prefix = prefix
    self.node = node
    self.revnum = None
    self.score = None

  def set_score(self, score, revnum):
    """Store SCORE and the REVNUM it was computed for."""
    self.revnum = revnum
    self.score = score

  def __cmp__(self, other):
    """Order FillSources so that sorting puts the highest score first.

    Raise TypeError if either side has not been scored yet."""
    for source in (self, other):
      if source.score is None:
        raise TypeError('Tried to compare unscored FillSource')
    # Operands are swapped on purpose: a larger score sorts earlier.
    return cmp(other.score, self.score)
3182
3183
3184 class SVNRepositoryMirror:
3185   """Mirror a Subversion Repository as it is constructed, one
3186   SVNCommit at a time.  The mirror is skeletal; it does not contain
3187   file contents.  The creation of a dumpfile or Subversion repository
3188   is handled by delegates.  See self.add_delegate method for how to
3189   set delegates.
3190
3191   The structure of the repository is kept in two databases and one
3192   hash.  The revs_db database maps revisions to root node keys, and
3193   the nodes_db database maps node keys to nodes.  A node is a hash
3194   from directory names to keys.  Both the revs_db and the nodes_db are
3195   stored on disk and each access is expensive.
3196
3197   The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
3199   hash which is cheap to access.
3200
3201   You must invoke _start_commit between SVNCommits.
3202
3203   *** WARNING *** All path arguments to methods in this class CANNOT
3204       have leading or trailing slashes.
3205   """
3206
3207   class SVNRepositoryMirrorPathExistsError(Exception):
3208     """Exception raised if an attempt is made to add a path to the
3209     repository mirror and that path already exists in the youngest
3210     revision of the repository."""
3211     pass
3212
3213   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
3214     """Exception raised if a CVSRevision is found to have an unexpected
3215     operation (OP) value."""
3216     pass
3217
3218   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
3219     """Exception raised if an empty SymbolicNameFillingGuide is returned
3220     during a fill where the branch in question already exists."""
3221     pass
3222
3223   def __init__(self):
3224     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
3225     self.delegates = [ ]
3226
3227     # This corresponds to the 'revisions' table in a Subversion fs.
3228     self.revs_db = SDatabase(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
3229     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
3230
3231     # This corresponds to the 'nodes' table in a Subversion fs.  (We
3232     # don't need a 'representations' or 'strings' table because we
3233     # only track metadata, not file contents.)
3234     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
3235     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
3236
3237     # Start at revision 0 without a root node.  It will be created
3238     # by _open_writable_root_node.
3239     self.youngest = 0
3240     self.new_root_key = None
3241     self.new_nodes = { }
3242
3243     if not Ctx().trunk_only:
3244       ###PERF IMPT: Suck this into memory.
3245       self.tags_db = TagsDatabase(DB_OPEN_READ)
3246       self.symbolings_reader = SymbolingsReader()
3247
3248   def _initialize_repository(self, date):
3249     """Initialize the repository by creating the directories for
3250     trunk, tags, and branches.  This method should only be called
3251     after all delegates are added to the repository mirror."""
3252     # Make a 'fake' SVNCommit so we can take advantage of the revprops
3253     # magic therein
3254     svn_commit = SVNCommit("Initialization", 1)
3255     svn_commit.set_date(date)
3256     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
3257
3258     self._start_commit(svn_commit)
3259     self._mkdir(Ctx().project.trunk_path)
3260     if not Ctx().trunk_only:
3261       self._mkdir(Ctx().project.branches_path)
3262       self._mkdir(Ctx().project.tags_path)
3263
3264   def _start_commit(self, svn_commit):
3265     """Start a new commit."""
3266     if self.youngest > 0:
3267       self._end_commit()
3268
3269     self.youngest = svn_commit.revnum
3270     self.new_root_key = None
3271     self.new_nodes = { }
3272
3273     self._invoke_delegates('start_commit', svn_commit)
3274
3275   def _end_commit(self):
3276     """Called at the end of each commit.  This method copies the newly
3277     created nodes to the on-disk nodes db."""
3278     if self.new_root_key is None:
3279       # No changes were made in this revision, so we make the root node
3280       # of the new revision be the same as the last one.
3281       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
3282     else:
3283       self.revs_db[str(self.youngest)] = self.new_root_key
3284       # Copy the new nodes to the nodes_db
3285       for key, value in self.new_nodes.items():
3286         self.nodes_db[key] = value
3287
3288   def _get_node(self, key):
3289     """Returns the node contents for KEY which may refer to either
3290     self.nodes_db or self.new_nodes."""
3291     if self.new_nodes.has_key(key):
3292       return self.new_nodes[key]
3293     else:
3294       return self.nodes_db[key]
3295
3296   def _open_readonly_node(self, path, revnum):
3297     """Open a readonly node for PATH at revision REVNUM.  Returns the
3298     node key and node contents if the path exists, else (None, None)."""
3299     # Get the root key
3300     if revnum == self.youngest:
3301       if self.new_root_key is None:
3302         node_key = self.revs_db[str(self.youngest - 1)]
3303       else:
3304         node_key = self.new_root_key
3305     else:
3306       node_key = self.revs_db[str(revnum)]
3307
3308     for component in path.split('/'):
3309       node_contents = self._get_node(node_key)
3310       node_key = node_contents.get(component, None)
3311       if node_key is None:
3312         return None
3313
3314     return node_key
3315
3316   def _open_writable_root_node(self):
3317     """Open a writable root node.  The current root node is returned
3318     immeditely if it is already writable.  If not, create a new one by
3319     copying the contents of the root node of the previous version."""
3320     if self.new_root_key is not None:
3321       return self.new_root_key, self.new_nodes[self.new_root_key]
3322
3323     if self.youngest < 2:
3324       new_contents = { }
3325     else:
3326       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
3327     self.new_root_key = gen_key()
3328     self.new_nodes = { self.new_root_key: new_contents }
3329
3330     return self.new_root_key, new_contents
3331
3332   def _open_writable_node(self, svn_path, create):
3333     """Open a writable node for the path SVN_PATH, creating SVN_PATH
3334     and any missing directories if CREATE is True."""
3335     parent_key, parent_contents = self._open_writable_root_node()
3336
3337     # Walk up the path, one node at a time.
3338     path_so_far = None
3339     components = svn_path.split('/')
3340     for i in range(len(components)):
3341       component = components[i]
3342       path_so_far = _path_join(path_so_far, component)
3343       this_key = parent_contents.get(component, None)
3344       if this_key is not None:
3345         # The component exists.
3346         this_contents = self.new_nodes.get(this_key, None)
3347         if this_contents is None:
3348           # Suck the node from the nodes_db, but update the key
3349           this_contents = self.nodes_db[this_key]
3350           this_key = gen_key()
3351           self.new_nodes[this_key] = this_contents
3352           parent_contents[component] = this_key
3353       elif create:
3354         # The component does not exists, so we create it.
3355         this_contents = { }
3356         this_key = gen_key()
3357         self.new_nodes[this_key] = this_contents
3358         parent_contents[component] = this_key
3359         if i < len(components) - 1:
3360           self._invoke_delegates('mkdir', path_so_far)
3361       else:
3362         # The component does not exists and we are not instructed to
3363         # create it, so we give up.
3364         return None, None
3365
3366       parent_key = this_key
3367       parent_contents = this_contents
3368
3369     return this_key, this_contents
3370
3371   def _path_exists(self, path):
3372     """If PATH exists in self.youngest of the svn repository mirror,
3373     return true, else return None.
3374
3375     PATH must not start with '/'."""
3376     return self._open_readonly_node(path, self.youngest) is not None
3377
3378   def _fast_delete_path(self, parent_path, parent_contents, component):
3379     """Delete COMPONENT from the parent direcory PARENT_PATH with the
3380     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
3381     in PARENT_CONTENTS."""
3382     if parent_contents.has_key(component):
3383       del parent_contents[component]
3384       self._invoke_delegates('delete_path',
3385                              _path_join(parent_path, component))
3386
3387   def _delete_path(self, svn_path, should_prune=False):
3388     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
3389     all ancestor directories that are made empty when SVN_PATH is deleted.
3390     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
3391
3392     NOTE: This function ignores requests to delete the root directory
3393     or any directory for which Ctx().project.is_unremovable() returns
3394     True, either directly or by pruning."""
3395
3396     if svn_path == '' or Ctx().project.is_unremovable(svn_path):
3397       return
3398
3399     (parent_path, entry,) = _path_split(svn_path)
3400     if parent_path:
3401       parent_key, parent_contents = \
3402           self._open_writable_node(parent_path, False)
3403     else:
3404       parent_key, parent_contents = self._open_writable_root_node()
3405
3406     if parent_key is not None:
3407       self._fast_delete_path(parent_path, parent_contents, entry)
3408       # The following recursion makes pruning an O(n^2) operation in the
3409       # worst case (where n is the depth of SVN_PATH), but the worst case
3410       # is probably rare, and the constant cost is pretty low.  Another
3411       # drawback is that we issue a delete for each path and not just
3412       # a single delete for the topmost directory pruned.
3413       if should_prune and len(parent_contents) == 0:
3414         self._delete_path(parent_path, True)
3415
3416   def _mkdir(self, path):
3417     """Create PATH in the repository mirror at the youngest revision."""
3418     self._open_writable_node(path, True)
3419     self._invoke_delegates('mkdir', path)
3420
3421   def _change_path(self, cvs_rev):
3422     """Register a change in self.youngest for the CVS_REV's svn_path
3423     in the repository mirror."""
3424     # We do not have to update the nodes because our mirror is only
3425     # concerned with the presence or absence of paths, and a file
3426     # content change does not cause any path changes.
3427     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, False))
3428
3429   def _add_path(self, cvs_rev):
3430     """Add the CVS_REV's svn_path to the repository mirror."""
3431     self._open_writable_node(cvs_rev.svn_path, True)
3432     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, True))
3433
3434   def _copy_path(self, src_path, dest_path, src_revnum):
3435     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
3436     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
3437     parent *must* exist, but DEST_PATH *cannot* exist.
3438
3439     Return the node key and the contents of the new node at DEST_PATH
3440     as a dictionary."""
3441     # get the contents of the node of our src_path
3442     src_key = self._open_readonly_node(src_path, src_revnum)
3443     src_contents = self._get_node(src_key)
3444
3445     # Get the parent path and the base path of the dest_path
3446     (dest_parent, dest_basename,) = _path_split(dest_path)
3447     dest_parent_key, dest_parent_contents = \
3448                    self._open_writable_node(dest_parent, False)
3449
3450     if dest_parent_contents.has_key(dest_basename):
3451       msg = "Attempt to add path '%s' to repository mirror " % dest_path
3452       msg += "when it already exists in the mirror."
3453       raise self.SVNRepositoryMirrorPathExistsError, msg
3454
3455     dest_parent_contents[dest_basename] = src_key
3456     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
3457
3458     # Yes sir, src_key and src_contents are also the contents of the
3459     # destination.  This is a cheap copy, remember!  :-)
3460     return src_key, src_contents
3461
3462   def _fill_symbolic_name(self, svn_commit):
3463     """Performs all copies necessary to create as much of the the tag
3464     or branch SVN_COMMIT.symbolic_name as possible given the current
3465     revision of the repository mirror.
3466
3467     The symbolic name is guaranteed to exist in the Subversion
3468     repository by the end of this call, even if there are no paths
3469     under it."""
3470     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3471         svn_commit.symbolic_name, self.youngest)
3472     # Get the list of sources for the symbolic name.
3473     sources = symbol_fill.get_sources()
3474
3475     if sources:
3476       if self.tags_db.has_key(svn_commit.symbolic_name):
3477         dest_prefix = Ctx().project.get_tag_path(svn_commit.symbolic_name)
3478       else:
3479         dest_prefix = Ctx().project.get_branch_path(svn_commit.symbolic_name)
3480
3481       dest_key = self._open_writable_node(dest_prefix, False)[0]
3482       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3483     else:
3484       # We can only get here for a branch whose first commit is an add
3485       # (as opposed to a copy).
3486       dest_path = Ctx().project.get_branch_path(symbol_fill.name)
3487       if not self._path_exists(dest_path):
3488         # If our symbol_fill was empty, that means that our first
3489         # commit on the branch was to a file added on the branch, and
3490         # that this is our first fill of that branch.
3491         #
3492         # This case is covered by test 16.
3493         #
3494         # ...we create the branch by copying trunk from the our
3495         # current revision number minus 1
3496         source_path = Ctx().project.trunk_path
3497         entries = self._copy_path(source_path, dest_path,
3498                                   svn_commit.revnum - 1)[1]
3499         # Now since we've just copied trunk to a branch that's
3500         # *supposed* to be empty, we delete any entries in the
3501         # copied directory.
3502         for entry in entries:
3503           del_path = dest_path + '/' + entry
3504           # Delete but don't prune.
3505           self._delete_path(del_path)
3506       else:
3507         msg = "Error filling branch '" \
3508               + _clean_symbolic_name(symbol_fill.name) + "'.\n"
3509         msg += "Received an empty SymbolicNameFillingGuide and\n"
3510         msg += "attempted to create a branch that already exists."
3511         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3512
  def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
            path = None, parent_source_prefix = None,
            preferred_revnum = None, prune_ok = None):
    """Fill the tag or branch at DEST_PREFIX + PATH with items from
    SOURCES, and recurse into the child items.

    SYMBOL_FILL is the SymbolicNameFillingGuide that is used to score
    the sources.

    DEST_PREFIX is the prefix of the destination directory, e.g.
    'tags/my_tag' or 'branches/my_branch', and SOURCES is a list of
    FillSource instances that are candidates to be copied to the
    destination.  SOURCES must not be empty; its members are scored
    and the list is sorted in place.  DEST_KEY is the node key of the
    destination, or None if the destination does not yet exist.

    PATH is the path relative to DEST_PREFIX.  If PATH is None, we
    are at the top level, e.g. 'tags/my_tag'.

    PARENT_SOURCE_PREFIX is the source prefix that was used to copy
    the parent directory, and PREFERRED_REVNUM is an int which is the
    source revision number that the caller (who may have copied the
    parent of this destination) used to perform its copy.  If
    PREFERRED_REVNUM is None, then no revision is preferable to any
    other (which probably means that no copies have happened yet).

    PRUNE_OK means that a copy has been made in this recursion, and
    it's safe to prune directories that are not in
    SYMBOL_FILL._node_tree, provided that said directory has a source
    prefix of one of the PARENT_SOURCE_PREFIX.

    PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
    should only be passed in by recursive calls."""
    # Calculate scores and revnums for all sources
    for source in sources:
      src_revnum, score = symbol_fill.get_best_revnum(source.node,
                                                      preferred_revnum)
      source.set_score(score, src_revnum)

    # Sort the sources in descending score order so that we will make
    # an eventual copy from the source with the highest score.
    sources.sort()
    copy_source = sources[0]

    src_path = _path_join(copy_source.prefix, path)
    dest_path = _path_join(dest_prefix, path)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    do_copy = 0
    if dest_key is None:
      # Nothing exists at the destination yet, so it must be copied.
      do_copy = 1
    elif prune_ok and (parent_source_prefix != copy_source.prefix or
                       copy_source.revnum != preferred_revnum):
      # The destination exists, but the best source for it has a
      # different prefix or revision than the one the parent was
      # copied from.  We are about to replace the destination, so we
      # need to remove it before we perform the copy.
      self._delete_path(dest_path)
      do_copy = 1

    if do_copy:
      dest_key, dest_entries = self._copy_path(src_path, dest_path,
                                               copy_source.revnum)
      # A copy has now been made in this recursion, so pruning is
      # allowed from here on down.
      prune_ok = 1
    else:
      dest_entries = self._get_node(dest_key)

    # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
    # elements and the values are lists of FillSource classes where
    # this path element exists.
    src_entries = {}
    for source in sources:
      if isinstance(source.node, SvnRevisionRange):
        # A leaf node has no child entries to contribute.
        continue
      for entry, node in source.node.items():
        src_entries.setdefault(entry, []).append(
            FillSource(source.prefix, node))

    if prune_ok:
      # Delete the entries in DEST_ENTRIES that are not in src_entries.
      delete_list = [ ]
      for entry in dest_entries:
        if not src_entries.has_key(entry):
          delete_list.append(entry)
      if delete_list:
        # The destination node is about to be modified.  If it is not
        # among the nodes of the revision under construction, reopen
        # it as a writable node first.
        if not self.new_nodes.has_key(dest_key):
          dest_key, dest_entries = self._open_writable_node(dest_path, True)
        # Sort the delete list to get "diffable" dumpfiles.
        delete_list.sort()
        for entry in delete_list:
          self._fast_delete_path(dest_path, dest_entries, entry)

    # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
    # The prefix and revnum of the source chosen at this level are
    # handed down as the child's PARENT_SOURCE_PREFIX and
    # PREFERRED_REVNUM.
    src_keys = src_entries.keys()
    src_keys.sort()
    for src_key in src_keys:
      next_dest_key = dest_entries.get(src_key, None)
      self._fill(symbol_fill, dest_prefix, next_dest_key,
                 src_entries[src_key], _path_join(path, src_key),
                 copy_source.prefix, sources[0].revnum, prune_ok)
3608
3609   def _synchronize_default_branch(self, svn_commit):
3610     """Propagate any changes that happened on a non-trunk default
3611     branch to the trunk of the repository.  See
3612     CVSCommit._post_commit() for details on why this is necessary."""
3613     for cvs_rev in svn_commit.cvs_revs:
3614       svn_trunk_path = Ctx().project.make_trunk_path(cvs_rev.cvs_path)
3615       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3616         if self._path_exists(svn_trunk_path):
3617           # Delete the path on trunk...
3618           self._delete_path(svn_trunk_path)
3619         # ...and copy over from branch
3620         self._copy_path(cvs_rev.svn_path, svn_trunk_path,
3621                         svn_commit.motivating_revnum)
3622       elif cvs_rev.op == OP_DELETE:
3623         # delete trunk path
3624         self._delete_path(svn_trunk_path)
3625       else:
3626         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3627                % cvs_rev.op)
3628         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3629
3630   def commit(self, svn_commit):
3631     """Add an SVNCommit to the SVNRepository, incrementing the
3632     Repository revision number, and changing the repository.  Invoke
3633     the delegates' _start_commit() method."""
3634
3635     if svn_commit.revnum == 2:
3636       self._initialize_repository(svn_commit.get_date())
3637
3638     self._start_commit(svn_commit)
3639
3640     if svn_commit.symbolic_name:
3641       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3642                   _clean_symbolic_name(svn_commit.symbolic_name))
3643       self._fill_symbolic_name(svn_commit)
3644     elif svn_commit.motivating_revnum:
3645       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3646                   % svn_commit.motivating_revnum)
3647       self._synchronize_default_branch(svn_commit)
3648     else: # This actually commits CVSRevisions
3649       if len(svn_commit.cvs_revs) > 1: plural = "s"
3650       else: plural = ""
3651       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3652                   % (len(svn_commit.cvs_revs), plural))
3653       for cvs_rev in svn_commit.cvs_revs:
3654         # See comment in CVSCommit._commit() for what this is all
3655         # about.  Note that although asking self._path_exists() is
3656         # somewhat expensive, we only do it if the first two (cheap)
3657         # tests succeed first.
3658         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3659                 and (cvs_rev.rev == "1.1.1.1")
3660                 and self._path_exists(cvs_rev.svn_path)):
3661           if cvs_rev.op == OP_ADD:
3662             self._add_path(cvs_rev)
3663           elif cvs_rev.op == OP_CHANGE:
3664             # Fix for Issue #74:
3665             #
3666             # Here's the scenario.  You have file FOO that is imported
3667             # on a non-trunk vendor branch.  So in r1.1 and r1.1.1.1,
3668             # the file exists.
3669             #
3670             # Moving forward in time, FOO is deleted on the default
3671             # branch (r1.1.1.2).  cvs2svn determines that this delete
3672             # also needs to happen on trunk, so FOO is deleted on
3673             # trunk.
3674             #
3675             # Along come r1.2, whose op is OP_CHANGE (because r1.1 is
3676             # not 'dead', we assume it's a change).  However, since
3677             # our trunk file has been deleted, svnadmin blows up--you
3678             # can't change a file that doesn't exist!
3679             #
3680             # Soooo... we just check the path, and if it doesn't
3681             # exist, we do an add... if the path does exist, it's
3682             # business as usual.
3683             if not self._path_exists(cvs_rev.svn_path):
3684               self._add_path(cvs_rev)
3685             else:
3686               self._change_path(cvs_rev)
3687
3688         if cvs_rev.op == OP_DELETE:
3689           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3690
3691   def cleanup(self):
3692     """Callback for the Cleanup.register in self.__init__."""
3693     self.revs_db = None
3694     self.nodes_db = None
3695
3696   def add_delegate(self, delegate):
3697     """Adds DELEGATE to self.delegates.
3698
3699     For every delegate you add, as soon as SVNRepositoryMirror
3700     performs a repository action method, SVNRepositoryMirror will call
3701     the delegate's corresponding repository action method.  Multiple
3702     delegates will be called in the order that they are added.  See
3703     SVNRepositoryMirrorDelegate for more information."""
3704     self.delegates.append(delegate)
3705
3706   def _invoke_delegates(self, method, *args):
3707     """Iterate through each of our delegates, in the order that they
3708     were added, and call the delegate's method named METHOD with the
3709     arguments in ARGS."""
3710     for delegate in self.delegates:
3711       getattr(delegate, method)(*args)
3712
3713   def finish(self):
3714     """Calls the delegate finish method."""
3715     self._end_commit()
3716     self._invoke_delegates('finish')
3717     self.cleanup()
3718
3719
class SVNCommitItem:
  """Pairs a CVSRevision object with the Subversion-related data (such
  as properties) that belongs to it."""

  def __init__(self, c_rev, svn_props_changed):
    """Store C_REV and work out the properties for this file.

    SVN_PROPS_CHANGED tells whether the svn: properties are known to
    have changed since the last revision, i.e., whether they have to
    be written to the dumpfile.

    Each SVNPropertySetter in Ctx().svn_property_setters gets a chance
    to set properties on this item; afterwards a couple of the
    properties are read back for our own purposes."""

    self.c_rev = c_rev
    self.svn_props_changed = svn_props_changed

    # Map { key : value } of the properties for this item.  A VALUE of
    # None means that the property should not be set.
    self.svn_props = { }

    for setter in Ctx().svn_property_setters:
      setter.set_properties(self)

    eol_style = self.svn_props.get('svn:eol-style')
    keywords = self.svn_props.get('svn:keywords')

    # EOLs need to be filtered exactly when an eol-style was chosen.
    # (self.svn_props could be consulted directly instead, since it is
    # initialized for each revision.)
    self.needs_eol_filter = eol_style is not None

    # True if an svn:keywords property was set for this item.
    self.has_keywords = keywords is not None
3751
3752
class SVNRepositoryMirrorDelegate:
  """Abstract superclass for any delegate to SVNRepositoryMirror.
  Subclasses must implement all of the methods below; every method
  here simply raises NotImplementedError.

  Each method stands for the Subversion operation that its name
  implies, and each subclass carries that operation out in its own
  way.  Take add_path as an example: the DumpfileDelegate would write
  out a "Node-add:" command to a Subversion dumpfile, the
  StdoutDelegate would merely print that the path is being added to
  the repository, and the RepositoryDelegate would actually cause the
  path to be added to the Subversion repository that it is creating.
  """

  def start_commit(self, svn_commit):
    """Do whatever is needed to begin SVNCommit SVN_COMMIT; see
    subclass implementation for details."""
    raise NotImplementedError()

  def mkdir(self, path):
    """Handle the creation of directory PATH (a string); see subclass
    implementation for details."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """Handle the addition of S_ITEM (an SVNCommitItem); see subclass
    implementation for details."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """Handle a change to S_ITEM (an SVNCommitItem); see subclass
    implementation for details."""
    raise NotImplementedError()

  def delete_path(self, path):
    """Handle the deletion of PATH (a string); see subclass
    implementation for details."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """Handle a copy of SRC_PATH to DEST_PATH (both strings), taken at
    the subversion revision number SRC_REVNUM (an int); see subclass
    implementation for details."""
    raise NotImplementedError()

  def finish(self):
    """Do any cleanup that is needed once all revisions have been
    committed."""
    raise NotImplementedError()
3800
3801
class DumpfileDelegate(SVNRepositoryMirrorDelegate):
  """Create a Subversion dumpfile.

  Every repository action reported to this delegate is written to
  self.dumpfile as the corresponding dumpfile record."""

  def __init__(self, dumpfile_path=None):
    """Return a new DumpfileDelegate instance, attached to a dumpfile
    DUMPFILE_PATH (Ctx().dumpfile, if None).  The file is opened for
    binary writing and the dumpfile header is written to it."""
    if dumpfile_path:
      self.dumpfile_path = dumpfile_path
    else:
      self.dumpfile_path = Ctx().dumpfile

    self.dumpfile = open(self.dumpfile_path, 'wb')
    self._write_dumpfile_header(self.dumpfile)

  def _write_dumpfile_header(self, dumpfile):
    """Write the standard dumpfile header to the file-like object
    DUMPFILE."""
    # Since the CVS repository doesn't have a UUID, and the Subversion
    # repository will be created with one anyway, we don't specify a
    # UUID in the dumpfile.
    dumpfile.write('SVN-fs-dump-format-version: 2\n\n')

  def _utf8_path(self, path):
    """Return a copy of PATH encoded in UTF-8.

    Raise FatalError if any component of PATH cannot be converted."""
    try:
      # Convert each path component separately (as they may each use
      # different encodings).  Log messages can be converted with the
      # 'replace' strategy, but we can't afford any lossiness here.
      pieces = [to_utf8(piece, 'strict') for piece in path.split('/')]
    except UnicodeError:
      raise FatalError(
          "Unable to convert a path '%s' to internal encoding.\n"
          "Consider rerunning with one or more '--encoding' parameters."
          % (path,))
    return '/'.join(pieces)

  def _string_for_prop(self, name, value):
    """Return the property NAME with value VALUE in the form needed
    for the dumpfile."""

    return 'K %d\n%s\nV %d\n%s\n' % (len(name), name, len(value), value)

  def start_commit(self, svn_commit):
    """Emit the start of SVN_COMMIT (an SVNCommit)."""

    self.revision = svn_commit.revnum

    # The start of a new commit typically looks like this:
    #
    #   Revision-number: 1
    #   Prop-content-length: 129
    #   Content-length: 129
    #
    #   K 7
    #   svn:log
    #   V 27
    #   Log message for revision 1.
    #   K 10
    #   svn:author
    #   V 7
    #   jrandom
    #   K 8
    #   svn:date
    #   V 27
    #   2003-04-22T22:57:58.132837Z
    #   PROPS-END
    #
    # Notice that the length headers count everything -- not just the
    # length of the data but also the lengths of the lengths, including
    # the 'K ' or 'V ' prefixes.
    #
    # The reason there are both Prop-content-length and Content-length
    # is that the former includes just props, while the latter includes
    # everything.  That's the generic header form for any entity in a
    # dumpfile.  But since revisions only have props, the two lengths
    # are always the same for revisions.

    # Calculate the output needed for the property definitions.
    # Properties whose value is None are left out.
    props = svn_commit.get_revprops()
    prop_names = props.keys()
    prop_names.sort()
    prop_strings = [self._string_for_prop(propname, props[propname])
                    for propname in prop_names
                    if props[propname] is not None]

    all_prop_strings = ''.join(prop_strings) + 'PROPS-END\n'
    total_len = len(all_prop_strings)

    # Print the revision header and props
    self.dumpfile.write('Revision-number: %d\n'
                        'Prop-content-length: %d\n'
                        'Content-length: %d\n'
                        '\n'
                        % (self.revision, total_len, total_len))

    self.dumpfile.write(all_prop_strings)
    self.dumpfile.write('\n')

  def mkdir(self, path):
    """Emit the creation of directory PATH."""
    self.dumpfile.write("Node-path: %s\n"
                        "Node-kind: dir\n"
                        "Node-action: add\n"
                        "\n"
                        "\n" % self._utf8_path(path))

  def _file_props_for_item(self, s_item):
    """Return the tuple (PROPS_HEADER, PROP_CONTENTS) for S_ITEM.

    If S_ITEM.svn_props_changed is false, both strings are empty.
    Otherwise PROP_CONTENTS holds the properties sorted by name
    (skipping those whose value is None) followed by 'PROPS-END', and
    PROPS_HEADER is the matching 'Prop-content-length:' header line."""
    if not s_item.svn_props_changed:
      return '', ''

    svn_props = s_item.svn_props
    prop_names = svn_props.keys()
    prop_names.sort()
    pieces = []
    for pname in prop_names:
      pvalue = svn_props[pname]
      if pvalue is not None:
        pieces.append(self._string_for_prop(pname, pvalue))
    pieces.append('PROPS-END\n')
    prop_contents = ''.join(pieces)
    return 'Prop-content-length: %d\n' % len(prop_contents), prop_contents

  def _write_ignore_property(self, dir_path, c_rev):
    """Emit a property change on directory DIR_PATH that sets its
    svn:ignore property to the values that generate_ignores() returns
    for C_REV (a revision of a .cvsignore file)."""
    ignore_vals = generate_ignores(c_rev)
    ignore_contents = (self._string_for_prop('svn:ignore',
                                             '\n'.join(ignore_vals))
                       + 'PROPS-END\n')
    ignore_len = len(ignore_contents)

    # write headers, then props
    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: dir\n'
                        'Node-action: change\n'
                        'Prop-content-length: %d\n'
                        'Content-length: %d\n'
                        '\n'
                        '%s'
                        % (self._utf8_path(dir_path), ignore_len,
                           ignore_len, ignore_contents))

  def _copy_text_to_dumpfile(self, data_reader):
    """Copy everything that DATA_READER yields to the dumpfile.

    Return the tuple (LENGTH, CHECKSUM): the number of bytes copied
    and an md5 object that has been fed those bytes."""
    checksum = md5.new()
    length = 0
    while True:
      buf = data_reader.read(PIPE_READ_SIZE)
      if buf == '':
        break
      checksum.update(buf)
      length += len(buf)
      self.dumpfile.write(buf)
    return length, checksum

  def _patch_text_headers(self, pos, length, hexdigest, prop_length):
    """Overwrite the placeholder headers that start at offset POS of
    the dumpfile with the text length LENGTH, the md5 HEXDIGEST, and
    the content length (LENGTH + PROP_LENGTH), and then move back to
    the end of the dumpfile."""
    self.dumpfile.seek(pos, 0)
    # We left 16 zeros for the text length; replace them with the real
    # length, padded on the left with spaces:
    self.dumpfile.write('%16d' % length)
    # 16... + 1 newline + len('Text-content-md5: ') == 35
    self.dumpfile.seek(pos + 35, 0)
    self.dumpfile.write(hexdigest)
    # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
    self.dumpfile.seek(pos + 84, 0)
    # The content length is the length of property data, text data,
    # and any metadata around/inside them.
    self.dumpfile.write('%16d' % (length + prop_length))
    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

  def _add_or_change_path(self, s_item, op):
    """Emit the addition or change corresponding to S_ITEM.
    OP is either the constant OP_ADD or OP_CHANGE.

    Raise FatalError if OP is anything else, or if the command that
    checks the revision out exits with a non-zero status."""

    # Validation stuffs
    if op == OP_ADD:
      action = 'add'
    elif op == OP_CHANGE:
      action = 'change'
    else:
      raise FatalError("_add_or_change_path() called with bad op ('%s')"
                       % (op,))

    # Convenience variables
    c_rev = s_item.c_rev

    # The property handling here takes advantage of an undocumented
    # but IMHO consistent feature of the Subversion dumpfile-loading
    # code.  When a node's properties aren't mentioned (that is, the
    # "Prop-content-length:" header is absent, no properties are
    # listed at all, and there is no "PROPS-END\n" line) then no
    # change is made to the node's properties.
    #
    # This is consistent with the way dumpfiles behave w.r.t. text
    # content changes, so I'm comfortable relying on it.  If you
    # commit a change to *just* the properties of some node that
    # already has text contents from a previous revision, then in the
    # dumpfile output for the prop change, no "Text-content-length:"
    # nor "Text-content-md5:" header will be present, and the text of
    # the file will not be given.  But this does not cause the file's
    # text to be erased!  It simply remains unchanged.
    #
    # This works out great for cvs2svn, due to lucky coincidences:
    #
    # For files, the only properties we ever set are set in the first
    # revision; all other revisions (including on branches) inherit
    # from that.  After the first revision, we never change file
    # properties, therefore, there is no need to remember the full set
    # of properties on a given file once we've set it.
    #
    # For directories, the only property we set is "svn:ignore", and
    # while we may change it after the first revision, we always do so
    # based on the contents of a ".cvsignore" file -- in other words,
    # CVS is doing the remembering for us, so we still don't have to
    # preserve the previous value of the property ourselves.

    # Calculate the (sorted-by-name) property string and header, if any.
    props_header, prop_contents = self._file_props_for_item(s_item)

    # treat .cvsignore as a directory property
    dir_path, basename = os.path.split(c_rev.svn_path)
    if basename == ".cvsignore":
      self._write_ignore_property(dir_path, c_rev)

    # If the file has keywords, we must prevent CVS/RCS from expanding
    # the keywords because they must be unexpanded in the repository,
    # or Subversion will get confused.
    pipe_cmd, pipe = Ctx().cvs_repository.get_co_pipe(
        c_rev, suppress_keyword_substitution=s_item.has_keywords)

    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: file\n'
                        'Node-action: %s\n'
                        '%s'  # no property header if no props
                        'Text-content-length: '
                        % (self._utf8_path(c_rev.svn_path),
                           action, props_header))

    # The lengths and the checksum are only known once the text has
    # been streamed, so remember where the placeholders start.
    pos = self.dumpfile.tell()

    self.dumpfile.write('0000000000000000\n'
                        'Text-content-md5: 00000000000000000000000000000000\n'
                        'Content-length: 0000000000000000\n'
                        '\n')

    if prop_contents:
      self.dumpfile.write(prop_contents)

    # Insert a filter to convert all EOLs to LFs if necessary
    if s_item.needs_eol_filter:
      data_reader = LF_EOL_Filter(pipe.stdout)
    else:
      data_reader = pipe.stdout

    # Insert the rev contents, calculating length and checksum as we go.
    length, checksum = self._copy_text_to_dumpfile(data_reader)

    pipe.stdout.close()
    error_output = pipe.stderr.read()
    exit_status = pipe.wait()
    if exit_status:
      raise FatalError("The command '%s' failed with exit status: %s\n"
                       "and the following output:\n"
                       "%s" % (pipe_cmd, exit_status, error_output))

    # Go back to patch up the length and checksum headers:
    self._patch_text_headers(pos, length, checksum.hexdigest(),
                             len(prop_contents))

    # This record is done (write two newlines -- one to terminate
    # contents that weren't themselves newline-terminated, one to
    # provide a blank line for readability).
    self.dumpfile.write('\n\n')

  def add_path(self, s_item):
    """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""
    self._add_or_change_path(s_item, OP_ADD)

  def change_path(self, s_item):
    """Emit the change corresponding to S_ITEM, an SVNCommitItem."""
    self._add_or_change_path(s_item, OP_CHANGE)

  def delete_path(self, path):
    """Emit the deletion of PATH."""
    self.dumpfile.write('Node-path: %s\n'
                        'Node-action: delete\n'
                        '\n' % self._utf8_path(path))

  def copy_path(self, src_path, dest_path, src_revnum):
    """Emit the copying of SRC_PATH at SRC_REVNUM to DEST_PATH."""
    # We don't need to include "Node-kind:" for copies; the loader
    # ignores it anyway and just uses the source kind instead.
    self.dumpfile.write('Node-path: %s\n'
                        'Node-action: add\n'
                        'Node-copyfrom-rev: %d\n'
                        'Node-copyfrom-path: /%s\n'
                        '\n'
                        % (self._utf8_path(dest_path),
                           src_revnum,
                           self._utf8_path(src_path)))

  def finish(self):
    """Perform any cleanup necessary after all revisions have been
    committed.  Here that means closing the dumpfile."""
    self.dumpfile.close()
4095
4096
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting.

  Each commit is written to a temporary dumpfile, and the contents of
  that dumpfile are then fed to an 'svnadmin load' process."""

  def __init__(self):
    """Create the target repository (unless Ctx().existing_svnrepos is
    set), open the temporary dumpfile, and start 'svnadmin load' on
    the target.

    Raise FatalError if the dumpfile header cannot be written to the
    loader's stdin."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      fs_type = Ctx().fs_type
      if not fs_type:
        # User didn't say what kind repository (bdb, fsfs, etc).
        # We still pass --bdb-txn-nosync.  It's a no-op if the default
        # repository type doesn't support it, but we definitely want
        # it if BDB is the default.
        create_options = ["--bdb-txn-nosync"]
      elif fs_type == 'bdb':
        # User explicitly specified bdb.
        #
        # Since this is a BDB repository, pass --bdb-txn-nosync,
        # because it gives us a 4-5x speed boost (if cvs2svn is
        # creating the repository, cvs2svn should be the only program
        # accessing the svn repository (until cvs is done, at least)).
        # But we'll turn no-sync off in self.finish(), unless
        # instructed otherwise.
        create_options = ["--fs-type=bdb", "--bdb-txn-nosync"]
      else:
        # User specified something other than bdb.
        create_options = ["--fs-type=%s" % fs_type]
      run_command('%s create %s "%s"' % (self.svnadmin,
                                         ' '.join(create_options),
                                         self.target))

    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.
    DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    # We need a handle that can be read back as well as written.  Close
    # the write-only handle opened by DumpfileDelegate.__init__() first:
    # it still holds the buffered dumpfile header, and leaving it to the
    # garbage collector would let that header be flushed into the
    # reopened file at an arbitrary later time.
    self.dumpfile.close()
    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = SimplePopen([ self.svnadmin, 'load', '-q',
                                     self.target ], True)
    self.loader_pipe.stdout.close()
    try:
      # The loader gets the dumpfile header exactly once, up front.
      self._write_dumpfile_header(self.loader_pipe.stdin)
    except IOError:
      raise FatalError("svnadmin failed with the following output while "
                       "loading the dumpfile:\n"
                       + self.loader_pipe.stderr.read())

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe.

    Raise FatalError if writing to the pipe fails."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not data:
        break
      try:
        self.loader_pipe.stdin.write(data)
      except IOError:
        raise FatalError("svnadmin failed with the following output "
                         "while loading the dumpfile:\n"
                         + self.loader_pipe.stderr.read())

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, load
    the dumpfile's contents into the svn repository first.  Then empty
    the dumpfile and write the start of SVN_COMMIT into it."""
    if self._commit_in_progress:
      self._feed_pipe()
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Loads the last commit into the repository, waits for 'svnadmin
    load' to exit, and removes the temporary dumpfile.

    Raise FatalError if 'svnadmin load' exits with a non-zero
    status."""
    self._feed_pipe()
    self.dumpfile.close()
    self.loader_pipe.stdin.close()
    error_output = self.loader_pipe.stderr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      raise FatalError('svnadmin load failed with exit status: %s\n'
                       'and the following output:\n'
                       '%s' % (exit_status, error_output,))
    os.remove(self.dumpfile_path)

    # If this is a BDB repository, and we created the repository, and
    # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
    # line in the DB_CONFIG file, because txn syncing should be on by
    # default in BDB repositories.
    #
    # We determine if this is a BDB repository by looking for the
    # DB_CONFIG file, which doesn't exist in FSFS, rather than by
    # checking Ctx().fs_type.  That way this code will Do The Right
    # Thing in all circumstances.
    db_config = os.path.join(self.target, "db/DB_CONFIG")
    if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
        and os.path.exists(db_config)):
      no_sync = 'set_flags DB_TXN_NOSYNC\n'

      config_file = open(db_config, 'r')
      try:
        contents = config_file.readlines()
      finally:
        config_file.close()

      if no_sync in contents:
        contents[contents.index(no_sync)] = '# ' + no_sync
        config_file = open(db_config, 'w')
        try:
          config_file.writelines(contents)
        finally:
          config_file.close()
      else:
        # The conversion itself succeeded; a DB_CONFIG without the
        # expected line is not a reason to abort at this point.
        Log().write(LOG_WARN,
                    "%s: '%s' not found in '%s'; leaving the file unchanged."
                    % (warning_prefix, no_sync.strip(), db_config))
4207
4208
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """Touches nothing on disk; it only reports, through Log(), what the
  SVNRepositoryMirror is doing.  The messages claim that something is
  being done, while all that really happens is the printing of the
  claim.  Kind of zen, really."""

  def __init__(self, total_revs):
    """TOTAL_REVS is the number shown after the slash in the
    'Starting Subversion rN / TOTAL_REVS' progress line."""
    self.total_revs = total_revs

  def _verbose(self, *words):
    """Write WORDS to the log at the LOG_VERBOSE level."""
    Log().write(LOG_VERBOSE, *words)

  def start_commit(self, svn_commit):
    """Report the Subversion revision number of the commit that is
    about to start, together with the total number of revisions."""
    self._verbose("=" * 60)
    progress = "Starting Subversion r%d / %d" % (svn_commit.revnum,
                                                 self.total_revs)
    Log().write(LOG_NORMAL, progress)

  def mkdir(self, path):
    """Report that directory PATH is being created."""
    self._verbose("  New Directory", path)

  def add_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'added'."""
    self._verbose("  Adding", s_item.c_rev.svn_path)

  def change_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'changed'."""
    self._verbose("  Changing", s_item.c_rev.svn_path)

  def delete_path(self, path):
    """Report that PATH is being 'deleted'."""
    self._verbose("  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report, on two lines, that revision SRC_REVNUM of SRC_PATH is
    being 'copied' to DEST_PATH."""
    self._verbose("  Copying revision", src_revnum, "of", src_path)
    self._verbose("                to", dest_path)

  def finish(self):
    """Report that the repository has been created."""
    self._verbose("Finished creating Subversion repository.")
    Log().write(LOG_QUIET, "Done.")
4251
def pass1():
  """Pass 1: parse every RCS ',v' file in the project's CVS repository.

  The directory tree under Ctx().project.project_cvs_repos_path is
  walked and each ',v' file is parsed into a FileDataCollector that
  reports to a shared CollectData instance.  Problems with individual
  files are collected rather than raised at once, so that all of them
  can be listed together.

  Raise FatalException if any fatal error was recorded, or if no ',v'
  file was found at all."""
  OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    """os.path.walk() callback: parse the ',v' files among FILES,
    which live in directory DIRNAME.  BATON is the CollectData."""
    cd = baton
    for fname in files:
      verify_filename_legal(fname)
      if not fname.endswith(',v'):
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname.endswith(OS_SEP_PLUS_ATTIC):
        # drop the 'Attic' portion from the pathname for the canonical name.
        canonical_dir = dirname[:-len(OS_SEP_PLUS_ATTIC)]
        fdc = FileDataCollector(cd, os.path.join(canonical_dir, fname),
                                pathname)
      else:
        # If this file also exists in the attic, it's a fatal error
        attic_path = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_path):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, pathname, attic_path)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        fdc = FileDataCollector(cd, pathname, pathname)
      Log().write(LOG_NORMAL, pathname)
      try:
        rcs_file = open(pathname, 'rb')
        try:
          cvs2svn_rcsparse.parse(rcs_file, fdc)
        finally:
          # Don't rely on garbage collection to release the handle; a
          # repository can contain a very large number of files.
          rcs_file.close()
      except (cvs2svn_rcsparse.common.RCSParseError, ValueError,
              RuntimeError):
        # An unparsable file is recorded and reported in the summary.
        err = "%s: '%s' is not a valid ,v file" \
              % (error_prefix, pathname)
        sys.stderr.write(err + '\n')
        cd.fatal_errors.append(err)
      except:
        # Anything else is unexpected: say which file was being
        # parsed, then let the exception propagate.
        Log().write(LOG_WARN,
                    "Exception occurred while parsing %s" % pathname)
        raise

  os.path.walk(Ctx().project.project_cvs_repos_path, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if len(cd.fatal_errors) > 0:
    raise FatalException("Pass 1 complete.\n"
                         + "=" * 75 + "\n"
                         + "Error summary:\n"
                         + "\n".join(cd.fatal_errors) + "\n"
                         + "Exited due to fatal error(s).\n")

  if cd.found_valid_file is None:
    raise FatalException(
        "\n"
        "No RCS files found in your CVS Repository!\n"
        "Are you absolutely certain you are pointing cvs2svn\n"
        "at a CVS repository?\n"
        "\n"
        "Exited due to fatal error(s).\n")

  StatsKeeper().reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4316
def pass2():
  """Pass 2: clean up the revision information.

  Read the symbol database and check the requested symbol handling
  for consistency:

    - excluded branches that other symbols depend on,
    - branches forced to be tags even though they have commits,
    - symbols that are tags in some files and branches in others
      (unless they are explicitly forced one way or the other).

  If any check fails, the problems are reported on stderr and the
  program exits with status 1.

  Otherwise create the tags database and copy the revisions file to
  the clean-revisions file.  While copying, drop revisions that are
  on excluded branches, strip references to excluded symbols, apply
  forced tag/branch conversions, and resynchronize timestamps as
  directed by the resync data file."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Expand the exclusion regexps into the concrete symbols they match
  # (EXCLUDES is queried with has_key() below).
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    "Return true unless the mismatched symbol was forced by the user."
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags:
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    """Read the .resync file FNAME into memory and return its contents.

    Each line is parsed as an 8-digit hexadecimal timestamp, a digest
    ending at DIGEST_END_IDX, and a second hexadecimal timestamp."""

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list
    # because we will dynamically expand the lower/upper bound as we find
    # commits that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    for line in fileinput.FileInput(fname):
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      resync.setdefault(digest, []).append([t1_l, t1_u, t2])

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    for val in resync.values():
      val.sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  tweaked_timestamps_db = Database(temp(TWEAKED_TIMESTAMPS_DB), DB_OPEN_NEW)
  Cleanup().register(temp(TWEAKED_TIMESTAMPS_DB), pass2)

  # Filter predicate used to remove references to excluded symbols.
  # It does not depend on the revision being processed, so it is
  # defined once rather than on every iteration of the loop below.
  def not_excluded(symbol, excludes=excludes):
    return not excludes.has_key(symbol)

  # process the revisions file, looking for items to clean up
  for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
    c_rev = CVSRevision(Ctx(), line[:-1])

    # Skip this entire revision if it's on an excluded branch
    if excludes.has_key(c_rev.branch_name):
      continue

    # If a neighbouring revision of the same file has already been
    # resynced, use its tweaked timestamp for the comparisons below.
    new_prev_ts = None
    if c_rev.prev_rev is not None:
      new_prev_ts = tweaked_timestamps_db.get(
        c_rev.unique_key(c_rev.prev_rev), None)
    if new_prev_ts:
      c_rev.prev_timestamp = new_prev_ts

    new_next_ts = None
    if c_rev.next_rev is not None:
      new_next_ts = tweaked_timestamps_db.get(
        c_rev.unique_key(c_rev.next_rev), None)
    if new_next_ts:
      c_rev.next_timestamp = new_next_ts

    # Remove all references to excluded tags and branches
    c_rev.branches = filter(not_excluded, c_rev.branches)
    c_rev.tags = filter(not_excluded, c_rev.tags)

    # Convert all branches that are forced to be tags
    for forced_tag in Ctx().forced_tags:
      if forced_tag in c_rev.branches:
        c_rev.branches.remove(forced_tag)
        c_rev.tags.append(forced_tag)

    # Convert all tags that are forced to be branches
    for forced_branch in Ctx().forced_branches:
      if forced_branch in c_rev.tags:
        c_rev.tags.remove(forced_branch)
        c_rev.branches.append(forced_branch)

    # see if this is "near" any of the resync records we
    # have recorded for this digest [of the log message].
    for record in resync.get(c_rev.digest, []):
      if record[2] == c_rev.timestamp:
        # This means that either c_rev is the same revision that
        # caused the resync record to exist, or c_rev is a different
        # CVS revision that happens to have the same timestamp.  In
        # either case, we don't have to do anything, so we...
        continue

      if record[0] <= c_rev.timestamp <= record[1]:
        # bingo!  We probably want to remap the time on this c_rev,
        # unless the remapping would be useless because the new time
        # would fall outside the COMMIT_THRESHOLD window for this
        # commit group.
        new_timestamp = record[2]
        # If the new timestamp is earlier than that of our previous revision
        if new_timestamp < c_rev.prev_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is before previous the time of"
                  + " revision %s (%s):")
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, new_timestamp,
                                        c_rev.prev_rev, c_rev.prev_timestamp))
          # If resyncing our rev to c_rev.prev_timestamp + 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted resync time, then sync back to c_rev.prev_timestamp
          # + 1...
          if ((c_rev.prev_timestamp + 1) - new_timestamp) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.prev_timestamp + 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          else:
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)
            continue

        # If the new timestamp is later than that of our next revision
        elif c_rev.next_timestamp and new_timestamp > c_rev.next_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is after time of next"
                  + " revision %s (%s):")
          # Report the *next* revision here: it is the one whose
          # timestamp (c_rev.next_timestamp) the message refers to.
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, new_timestamp,
                                        c_rev.next_rev, c_rev.next_timestamp))
          # If resyncing our rev to c_rev.next_timestamp - 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted resync time, then sync forward to c_rev.next_timestamp
          # - 1...
          if (new_timestamp - (c_rev.next_timestamp - 1)) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.next_timestamp - 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          else:
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)
            continue

        # Fix for Issue #71: Avoid resyncing two consecutive revisions
        # to the same timestamp.
        elif (new_timestamp == c_rev.prev_timestamp
              or new_timestamp == c_rev.next_timestamp):
          continue

        # adjust the time range. we want the COMMIT_THRESHOLD from the
        # bounds of the earlier/latest commit in this group.
        record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
        record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

        msg = "PASS2 RESYNC: '%s' (%s): old time='%s' delta=%ds" \
              % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                 new_timestamp - c_rev.timestamp)
        Log().write(LOG_VERBOSE, msg)

        c_rev.timestamp = new_timestamp
        tweaked_timestamps_db[c_rev.unique_key()] = new_timestamp

        # stop looking for hits
        break

    output.write(str(c_rev) + "\n")

  # Close explicitly so that everything is flushed to disk before the
  # next pass reads the file, rather than relying on garbage collection.
  output.close()
  Log().write(LOG_QUIET, "Done")
4557
def pass3():
  """Pass 3: sort the clean-revisions file into the sorted-revisions
  file and register the sorted file for cleanup."""

  Log().write(LOG_QUIET, "Sorting CVS revisions...")

  clean_revs_path = temp(DATAFILE + CLEAN_REVS_SUFFIX)
  sorted_revs_path = temp(DATAFILE + SORTED_REVS_SUFFIX)

  sort_file(clean_revs_path, sorted_revs_path)
  Cleanup().register(sorted_revs_path, pass5)

  Log().write(LOG_QUIET, "Done")
4564
def pass4():
  """Pass 4: load the sorted revisions into a database.

  Every line of the sorted-revisions file is turned into a
  CVSRevision and logged in the CVSRevisionDatabase.  Unless this is
  a trunk-only conversion, each revision is also logged in the
  LastSymbolicNameDatabase, which contains the last CVSRevision that
  is a source for each tag or branch."""

  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)

  if Ctx().trunk_only:
    # Use a do-nothing stand-in so that the loop below does not have
    # to test Ctx().trunk_only for every revision.
    class NullLastSymbolicNameDB:
      def log_revision(*args):
        pass
      def create_database(*args):
        pass
    last_sym_name_db = NullLastSymbolicNameDB()
  else:
    Log().write(LOG_QUIET,
        "Finding last CVS revisions for all symbolic names...")
    last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for rev_line in sorted_revs:
    # Strip the trailing newline before parsing.
    c_rev = CVSRevision(Ctx(), rev_line[:-1])
    cvs_revs_db.log_revision(c_rev)
    last_sym_name_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  last_sym_name_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4595
def pass5():
  """Pass 5: generate the SVNCommit <-> CVSRevision mapping databases.

  The sorted revisions are fed one by one to a CVSRevisionAggregator.
  In a trunk-only conversion, revisions that have a branch name are
  skipped.  CVSCommit._commit also calls SymbolingsLogger to register
  CVSRevisions that represent an opening or closing for a path on a
  branch or tag.  See SymbolingsLogger for more details."""

  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for rev_line in sorted_revs:
    c_rev = CVSRevision(Ctx(), rev_line[:-1])
    if Ctx().trunk_only and c_rev.branch_name is not None:
      # Trunk-only conversion: ignore anything that is on a branch.
      continue
    aggregator.process_revision(c_rev)
  aggregator.flush()

  StatsKeeper().set_svn_rev_count(SVNCommit.revnum - 1)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4615
def pass6():
  """Pass 6: sort the symbol openings/closings file.

  Nothing is sorted in a trunk-only conversion."""

  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  trunk_only = Ctx().trunk_only
  if not trunk_only:
    unsorted_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    sorted_path = temp(SYMBOL_OPENINGS_CLOSINGS_SORTED)
    sort_file(unsorted_path, sorted_path)
    Cleanup().register(sorted_path, pass8)

  Log().write(LOG_QUIET, "Done")
4624
def pass7():
  """Pass 7: record the file offset at which each symbolic name first
  appears in the sorted symbol openings/closings file.

  Nothing is generated in a trunk-only conversion."""

  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      # Use tell()/readline() rather than iterating over the file
      # object: the offset must be taken immediately before each line
      # is read.
      while 1:
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only the symbol name is needed, but unpack all three fields
        # so that a malformed line is still rejected.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      # Do not leak the file handle, even if a line cannot be parsed.
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4658
def pass8():
  """Pass 8: replay the stored SVNCommits into the output.

  The commits are fetched from the PersistenceManager by revision
  number, starting at 2, and committed to an SVNRepositoryMirror until
  no further commit is found.  Unless this is a dry run, the mirror
  forwards to a RepositoryDelegate if Ctx().target is set and to a
  DumpfileDelegate otherwise."""

  next_revnum = 2 # Repository initialization is 1.
  repos = SVNRepositoryMirror()
  persistence_manager = PersistenceManager(DB_OPEN_READ)

  if Ctx().target:
    delegate_class = RepositoryDelegate
    banner = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    banner = "Starting Subversion Dumpfile."

  if not Ctx().dry_run:
    repos.add_delegate(delegate_class())
  Log().write(LOG_QUIET, banner)

  repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  svn_commit = persistence_manager.get_svn_commit(next_revnum)
  while svn_commit:
    repos.commit(svn_commit)
    next_revnum += 1
    svn_commit = persistence_manager.get_svn_commit(next_revnum)

  repos.finish()
4683
# The conversion passes, in the order in which they are run.
# convert() indexes this list with the (1-based) pass number minus
# one, and usage() and main() use its length as the number of passes.
_passes = [
  pass1,
  pass2,
  pass3,
  pass4,
  pass5,
  pass6,
  pass7,
  pass8,
  ]
4694
4695
class Ctx:
  """Session state for this run of cvs2svn, e.g. the run-time options.

  This class is a Borg (see
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531):
  every instance shares one attribute dictionary, so an attribute set
  through any instance is visible through all of them.
  """
  __shared_state = { }
  def __init__(self):
    self.__dict__ = self.__shared_state
    if not self.__dict__:
      # The shared state is still empty, so this is the first
      # instance: initialize every option to its default.
      self.target = None
      self.dumpfile = DUMPFILE
      self.tmpdir = '.'
      self.verbose = 0
      self.quiet = 0
      self.prune = 1
      self.existing_svnrepos = 0
      self.dump_only = 0
      self.dry_run = 0
      self.trunk_only = 0
      self.trunk_base = "trunk"
      self.tags_base = "tags"
      self.branches_base = "branches"
      self.encoding = ["ascii"]
      self.mime_types_file = None
      self.auto_props_file = None
      self.auto_props_ignore_case = False
      self.no_default_eol = 0
      self.eol_from_mime_type = 0
      self.keywords_off = 0
      self.use_cvs = None
      self.svnadmin = "svnadmin"
      self.username = None
      self.print_help = 0
      self.skip_cleanup = 0
      self.bdb_txn_nosync = 0
      self.fs_type = None
      self.forced_branches = []
      self.forced_tags = []
      self.excludes = []
      self.symbol_transforms = []
      self.svn_property_setters = []
4739
4740
class SVNPropertySetter:
  """Abstract base class for objects that set properties on a
  SVNCommitItem."""

  def set_properties(self, s_item):
    """Set any properties that can be determined for S_ITEM.

    Subclasses must override this method; this base implementation
    always raises NotImplementedError."""

    raise NotImplementedError()
4748
4749
class CVSRevisionNumberSetter(SVNPropertySetter):
  """Record the CVS revision number in the cvs2svn:cvs-rev property."""

  def set_properties(self, s_item):
    """Store S_ITEM's CVS revision number as a property and mark the
    item's properties as changed."""

    cvs_rev_number = s_item.c_rev.rev
    s_item.svn_props['cvs2svn:cvs-rev'] = cvs_rev_number
    s_item.svn_props_changed = True
4756
4757
class ExecutablePropertySetter(SVNPropertySetter):
  """Set svn:executable on items whose c_rev.file_executable is true."""

  def set_properties(self, s_item):
    """Set svn:executable to '*' if S_ITEM's file is executable."""

    if not s_item.c_rev.file_executable:
      return
    s_item.svn_props['svn:executable'] = '*'
4764
4765
class BinaryFileEOLStyleSetter(SVNPropertySetter):
  """Force svn:eol-style to None for files whose CVS mode is 'b'."""

  def set_properties(self, s_item):
    """Set svn:eol-style to None if S_ITEM's file is binary."""

    is_binary = (s_item.c_rev.mode == 'b')
    if is_binary:
      s_item.svn_props['svn:eol-style'] = None
4772
4773
class MimeMapper(SVNPropertySetter):
  """A class that provides mappings from file names to MIME types."""

  def __init__(self, mime_types_file):
    """Read the extension -> MIME type mappings from MIME_TYPES_FILE.

    Comment lines and lines that list no extension are ignored.  If
    an extension is mapped to two different types, a warning is
    written to stderr and the later mapping wins."""

    self.mappings = { }

    # Use a private FileInput instance, as the rest of this file does,
    # instead of the module-global state behind fileinput.input().
    mime_file = fileinput.FileInput(mime_types_file)
    for line in mime_file:
      # Also skip comment lines that are indented.
      if line.lstrip().startswith("#"):
        continue

      # format of a line is something like
      # text/plain c h cpp
      extensions = line.split()
      if len(extensions) < 2:
        continue
      mime_type = extensions.pop(0)
      for ext in extensions:
        previous_type = self.mappings.get(ext, None)
        if previous_type is not None and previous_type != mime_type:
          sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n"
                           % (warning_prefix, ext, previous_type, mime_type))
        self.mappings[ext] = mime_type
    mime_file.close()

  def set_properties(self, s_item):
    """Set svn:mime-type on S_ITEM if its file name has a known mapping."""

    basename, extension = os.path.splitext(
        os.path.basename(s_item.c_rev.cvs_path)
        )

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename

    mime_type = self.mappings.get(extension, None)
    if mime_type is not None:
      s_item.svn_props['svn:mime-type'] = mime_type
4814
4815
class AutoPropsPropertySetter(SVNPropertySetter):
  """Set arbitrary svn properties based on an auto-props configuration.

  This class supports case-sensitive or case-insensitive pattern
  matching.  The 'correct' behavior is not quite clear, because
  subversion itself does an inconsistent job of handling case in
  auto-props patterns; see
  http://subversion.tigris.org/issues/show_bug.cgi?id=2036.

  If a property specified in auto-props has already been set to a
  different value, print a warning and leave the old property value
  unchanged."""

  class Pattern:
    """Describes the properties to be set for files matching a pattern."""
    def __init__(self, pattern, propdict):
      # A glob-like pattern:
      self.pattern = pattern
      # A dictionary of properties that should be set:
      self.propdict = propdict

    def match(self, basename):
      """Does the file with the specified basename match pattern?"""
      return fnmatch.fnmatch(basename, self.pattern)

  def __init__(self, configfilename, ignore_case):
    """Read the auto-props section of the config file CONFIGFILENAME.

    If IGNORE_CASE is true, the section name, the patterns and the
    file names matched against them are all converted to lower case
    before they are compared."""

    config = ConfigParser.ConfigParser()
    if ignore_case:
      self.transform_case = self.squash_case
    else:
      # Stop ConfigParser from transforming the option names, which
      # are the file name patterns here.
      config.optionxform = self.preserve_case
      self.transform_case = self.preserve_case

    config_file = open(configfilename)
    try:
      config.readfp(config_file)
    finally:
      # Do not leak the file handle, even if parsing fails.
      config_file.close()

    self.patterns = []
    for section in config.sections():
      if self.transform_case(section) == 'auto-props':
        for pattern in config.options(section):
          value = config.get(section, pattern)
          if value:
            self._add_pattern(pattern, value)

  def squash_case(self, s):
    """Return S converted to lower case."""
    return s.lower()

  def preserve_case(self, s):
    """Return S unchanged."""
    return s

  def _add_pattern(self, pattern, value):
    """Register PATTERN with the properties described by VALUE.

    VALUE is a ';'-separated list of NAME=VALUE or NAME entries; a
    bare NAME is stored with the value None.  Whitespace around names
    and values is stripped, and entries with an empty name (e.g. from
    a trailing ';') are ignored."""

    propdict = {}
    for prop in value.split(';'):
      s = prop.split('=', 1)
      name = s[0].strip()
      if not name:
        continue
      if len(s) == 1:
        propdict[name] = None
      else:
        propdict[name] = s[1].strip()
    self.patterns.append(
        self.Pattern(self.transform_case(pattern), propdict))

  def get_propdict(self, path):
    """Return the properties that the auto-props patterns give PATH.

    The patterns are matched against the basename of PATH.  If two
    matching patterns give different values for the same property,
    log a warning and keep the value that was found first."""

    basename = self.transform_case(os.path.basename(path))
    propdict = {}
    for pattern in self.patterns:
      if pattern.match(basename):
        for (key,value) in pattern.propdict.items():
          if propdict.has_key(key):
            if propdict[key] != value:
              Log().write(
                  LOG_WARN,
                  "Contradictory values set for property '%s' for file %s."
                  % (key, path,))
          else:
            propdict[key] = value

    return propdict

  def set_properties(self, s_item):
    """Set the auto-props properties on S_ITEM.

    A property that S_ITEM already has is never overwritten; if its
    existing value differs from the auto-props value, a warning is
    logged."""

    propdict = self.get_propdict(s_item.c_rev.cvs_path)
    for (k,v) in propdict.items():
      if s_item.svn_props.has_key(k):
        if s_item.svn_props[k] != v:
          Log().write(
              LOG_WARN,
              "Property '%s' already set for file %s."
              % (k, s_item.c_rev.cvs_path,))
      else:
        s_item.svn_props[k] = v
4905
4906
class BinaryFileDefaultMimeTypeSetter(SVNPropertySetter):
  """Give binary files that have no svn:mime-type property yet the
  default type 'application/octet-stream'."""

  def set_properties(self, s_item):
    """Set the default svn:mime-type on S_ITEM if it is binary and has
    no svn:mime-type property."""

    if s_item.svn_props.has_key('svn:mime-type'):
      # Never override a MIME type that has already been set.
      return
    if s_item.c_rev.mode == 'b':
      s_item.svn_props['svn:mime-type'] = 'application/octet-stream'
4915
4916
class EOLStyleFromMimeTypeSetter(SVNPropertySetter):
  """Derive svn:eol-style from svn:mime-type.

  This only acts on items whose svn:mime-type is known and whose
  svn:eol-style has not been set.  For a MIME type starting with
  'text/', svn:eol-style becomes 'native'; for any other type it is
  set to None so that it stays unset.  See also issue #39."""

  def set_properties(self, s_item):
    """Set svn:eol-style on S_ITEM according to its svn:mime-type."""

    props = s_item.svn_props
    if props.has_key('svn:eol-style'):
      return

    mime_type = props.get('svn:mime-type', None)
    if mime_type is None:
      return

    if mime_type.startswith("text/"):
      eol_style = 'native'
    else:
      eol_style = None
    props['svn:eol-style'] = eol_style
4932
4933
class DefaultEOLStyleSetter(SVNPropertySetter):
  """Supply a default svn:eol-style for items that do not have one."""

  def __init__(self, value):
    """Remember VALUE as the default svn:eol-style."""

    self.value = value

  def set_properties(self, s_item):
    """Set svn:eol-style on S_ITEM to the default unless it is
    already set."""

    if s_item.svn_props.has_key('svn:eol-style'):
      return
    s_item.svn_props['svn:eol-style'] = self.value
4945
4946
class KeywordsPropertySetter(SVNPropertySetter):
  """Set svn:keywords, if it is not set yet, on files whose CVS mode
  is None, 'kv' or 'kvl'.  See issue #2."""

  def __init__(self, value):
    """Remember VALUE as the value to give the svn:keywords
    property."""

    self.value = value

  def set_properties(self, s_item):
    """Set svn:keywords on S_ITEM if it is unset and the file's mode
    calls for it."""

    if s_item.svn_props.has_key('svn:keywords'):
      return
    if s_item.c_rev.mode in (None, 'kv', 'kvl'):
      s_item.svn_props['svn:keywords'] = self.value
4961
4962
def convert(start_pass, end_pass):
  """Convert a CVS repository to an SVN repository.

  Run the passes numbered START_PASS through END_PASS (1-based,
  inclusive) in order, recording how long each one takes, and then
  log the collected statistics and timings."""

  def is_pass_scoped(attr):
    """Return true if ATTR names a Ctx() attribute that should not
    outlive a pass: exactly one leading underscore, longer than two
    characters, and not a name-mangled private attribute of Ctx."""
    return (len(attr) > 2 and attr[0] == '_' and attr[1] != '_'
            and attr[:6] != "_Ctx__")

  cleanup = Cleanup()

  # times[n] is the time at which pass n finished; times[start_pass - 1]
  # is the time at which the conversion started.
  times = [ None ] * (end_pass + 1)
  times[start_pass - 1] = time.time()
  StatsKeeper().set_start_time(time.time())

  for pass_num in range(start_pass, end_pass + 1):
    Log().write(LOG_QUIET, '----- pass %d -----' % pass_num)
    run_pass = _passes[pass_num - 1]
    run_pass()
    times[pass_num] = time.time()
    StatsKeeper().log_duration_for_pass(
        times[pass_num] - times[pass_num - 1], pass_num)

    # Dispose of items in Ctx() not intended to live past the end of
    # the pass.
    for attr in dir(Ctx()):
      if is_pass_scoped(attr):
        delattr(Ctx(), attr)

    if not Ctx().skip_cleanup:
      cleanup.cleanup(run_pass)
    StatsKeeper().set_end_time(time.time())

  Log().write(LOG_QUIET, StatsKeeper())
  if end_pass < 4:
    Log().write(LOG_QUIET,
                '(These are unaltered CVS repository stats and do not\n'
                ' reflect tags or branches excluded via --exclude)\n')
  Log().write(LOG_NORMAL, StatsKeeper().timings())
4991
4992
def normalize_ttb_path(opt, path):
  """Return PATH normalized for use with --trunk, --tags, or --branches.

  PATH is split at '/' and its components are joined again with
  _path_join().  OPT is the name of the option that PATH was given
  to; it is only used in the error message.

  Raise FatalError if the normalized path is empty."""

  components = path.split('/')
  norm_path = _path_join(*components)
  if norm_path:
    return norm_path
  raise FatalError("cannot pass an empty path to %s." % (opt,))
5007
5008
def verify_paths_disjoint(*paths):
  """Check that no two of the PATHS overlap.

  Two paths overlap if they are identical or if one is nested in the
  other (in the sense that 'a/b/c/d' is nested in 'a/b').  Raise
  FatalError for the first overlapping pair that is found."""

  decorated = []
  for path in paths:
    decorated.append((path.split('/'), path))

  # If all overlapping elements are equal, a shorter list sorts before
  # a longer one.  Therefore, if any paths are nested, sorting leaves
  # at least one such pair adjacent, in the order [nest,nestling].
  decorated.sort()

  previous = None
  for current in decorated:
    if previous is not None:
      outer_parts, outer_path = previous
      inner_parts, inner_path = current
      # The slice can only equal OUTER_PARTS if INNER_PARTS is at
      # least as long, so no separate length check is needed.
      if inner_parts[:len(outer_parts)] == outer_parts:
        raise FatalError("paths %s and %s are not disjoint."
                         % (outer_path, inner_path,))
    previous = current
5028
5029
5030 def usage():
5031   print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
5032         % os.path.basename(sys.argv[0])
5033   print '  --help, -h           print this usage message and exit with success'
5034   print '  --version            print the version number'
5035   print '  -q                   quiet'
5036   print '  -v                   verbose'
5037   print '  -s PATH              path for SVN repos'
5038   print '  -p START[:END]       start at pass START, end at pass END of %d' \
5039         % len(_passes)
5040   print '                       If only START is given, run only pass START'
5041   print '                       (implicitly enables --skip-cleanup)'
5042   print '  --existing-svnrepos  load into existing SVN repository'
5043   print '  --dumpfile=PATH      name of intermediate svn dumpfile'
5044   print '  --tmpdir=PATH        directory to use for tmp data (default to cwd)'
5045   print '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)'
5046   print '  --dry-run            do not create a repository or a dumpfile;'
5047   print '                       just print what would happen.'
5048   print '  --use-cvs            use CVS instead of RCS \'co\' to extract data'
5049   print '                       (only use this if having problems with RCS)'
5050   print '  --svnadmin=PATH      path to the svnadmin program'
5051   print '  --trunk-only         convert only trunk commits, not tags nor branches'
5052   print '  --trunk=PATH         path for trunk (default: %s)'    \
5053         % Ctx().trunk_base
5054   print '  --branches=PATH      path for branches (default: %s)' \
5055         % Ctx().branches_base
5056   print '  --tags=PATH          path for tags (default: %s)'     \
5057         % Ctx().tags_base
5058   print '  --no-prune           don\'t prune empty directories'
5059   print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
5060   print '  --encoding=ENC       encoding of paths and log messages in CVS repos'
5061   print '                       Multiple of these options may be passed, where they'
5062   print '                       will be treated as an ordered list of encodings to'
5063   print '                       attempt (with "ascii" as a hardcoded last resort)'
5064   print '  --force-branch=NAME  force NAME to be a branch'
5065   print '  --force-tag=NAME     force NAME to be a tag'
5066   print '  --exclude=REGEXP     exclude branches and tags matching REGEXP'
5067   print '  --symbol-transform=P:S transform symbol names from P to S where P and S'
5068   print '                       use Python regexp and reference syntax respectively'
5069   print '  --username=NAME      username for cvs2svn-synthesized commits'
5070   print '  --skip-cleanup       prevent the deletion of intermediate files'
5071   print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
5072   print '  --fs-type=TYPE       pass --fs-type=TYPE to "svnadmin create"'
5073   print '  --cvs-revnums        record CVS revision numbers as file properties'
5074   print '  --auto-props=FILE    set file properties from the auto-props section'
5075   print '                       of a file in svn config format'
5076   print '  --auto-props-ignore-case Ignore case when matching auto-props patterns'
5077   print '  --mime-types=FILE    specify an apache-style mime.types file for'
5078   print '                       setting svn:mime-type'
5079   print '  --eol-from-mime-type set svn:eol-style from mime type if known'
5080   print '  --no-default-eol     don\'t set svn:eol-style to \'native\' for'
5081   print '                       non-binary files with undetermined mime types'
5082   print '  --keywords-off       don\'t set svn:keywords on any files (by default,'
5083   print '                       cvs2svn sets svn:keywords on non-binary files to'
5084   print '                       "%s")' % SVN_KEYWORDS_VALUE
5085
5086 def main():
5087   # Convenience var, so we don't have to keep instantiating this Borg.
5088   ctx = Ctx()
5089
5090   profiling = None
5091   start_pass = 1
5092   end_pass = len(_passes)
5093
5094   try:
5095     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
5096                                [ "help", "create", "trunk=",
5097                                  "username=", "existing-svnrepos",
5098                                  "branches=", "tags=", "encoding=",
5099                                  "force-branch=", "force-tag=", "exclude=",
5100                                  "use-cvs", "mime-types=",
5101                                  "auto-props=", "auto-props-ignore-case",
5102                                  "eol-from-mime-type", "no-default-eol",
5103                                  "trunk-only", "no-prune", "dry-run",
5104                                  "dump-only", "dumpfile=", "tmpdir=",
5105                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
5106                                  "bdb-txn-nosync", "fs-type=",
5107                                  "version", "profile",
5108                                  "keywords-off", "symbol-transform="])
5109   except getopt.GetoptError, e:
5110     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
5111     usage()
5112     sys.exit(1)
5113
5114   for opt, value in opts:
5115     if opt == '--version':
5116         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
5117         sys.exit(0)
5118     elif opt == '-p':
5119       # Don't cleanup if we're doing incrementals.
5120       ctx.skip_cleanup = 1
5121       if value.find(':') > 0:
5122         start_pass, end_pass = map(int, value.split(':'))
5123       else:
5124         end_pass = start_pass = int(value)
5125       if start_pass > len(_passes) or start_pass < 1:
5126         raise FatalError(
5127             'illegal value (%d) for starting pass.  Must be 1 through %d.'
5128             % (int(start_pass), len(_passes),))
5129       if end_pass < start_pass or end_pass > len(_passes):
5130         raise FatalError(
5131             'illegal value (%d) for ending pass.  Must be %d through %d.'
5132             % (int(end_pass), int(start_pass), len(_passes),))
5133     elif (opt == '--help') or (opt == '-h'):
5134       ctx.print_help = 1
5135     elif opt == '-v':
5136       Log().log_level = LOG_VERBOSE
5137       ctx.verbose = 1
5138     elif opt == '-q':
5139       Log().log_level = LOG_QUIET
5140       ctx.quiet = 1
5141     elif opt == '-s':
5142       ctx.target = value
5143     elif opt == '--existing-svnrepos':
5144       ctx.existing_svnrepos = 1
5145     elif opt == '--dumpfile':
5146       ctx.dumpfile = value
5147     elif opt == '--tmpdir':
5148       ctx.tmpdir = value
5149     elif opt == '--use-cvs':
5150       ctx.use_cvs = 1
5151     elif opt == '--svnadmin':
5152       ctx.svnadmin = value
5153     elif opt == '--trunk-only':
5154       ctx.trunk_only = 1
5155     elif opt == '--trunk':
5156       ctx.trunk_base = normalize_ttb_path(opt, value)
5157     elif opt == '--branches':
5158       ctx.branches_base = normalize_ttb_path(opt, value)
5159     elif opt == '--tags':
5160       ctx.tags_base = normalize_ttb_path(opt, value)
5161     elif opt == '--no-prune':
5162       ctx.prune = None
5163     elif opt == '--dump-only':
5164       ctx.dump_only = 1
5165     elif opt == '--dry-run':
5166       ctx.dry_run = 1
5167     elif opt == '--encoding':
5168       ctx.encoding.insert(-1, value)
5169     elif opt == '--force-branch':
5170       ctx.forced_branches.append(value)
5171     elif opt == '--force-tag':
5172       ctx.forced_tags.append(value)
5173     elif opt == '--exclude':
5174       try:
5175         ctx.excludes.append(re.compile('^' + value + '$'))
5176       except re.error, e:
5177         raise FatalError("'%s' is not a valid regexp." % (value,))
5178     elif opt == '--mime-types':
5179       ctx.mime_types_file = value
5180     elif opt == '--auto-props':
5181       ctx.auto_props_file = value
5182     elif opt == '--auto-props-ignore-case':
5183       ctx.auto_props_ignore_case = True
5184     elif opt == '--eol-from-mime-type':
5185       ctx.eol_from_mime_type = 1
5186     elif opt == '--no-default-eol':
5187       ctx.no_default_eol = 1
5188     elif opt == '--keywords-off':
5189       ctx.keywords_off = 1
5190     elif opt == '--username':
5191       ctx.username = value
5192     elif opt == '--skip-cleanup':
5193       ctx.skip_cleanup = 1
5194     elif opt == '--cvs-revnums':
5195       ctx.svn_property_setters.append(CVSRevisionNumberSetter())
5196     elif opt == '--bdb-txn-nosync':
5197       ctx.bdb_txn_nosync = 1
5198     elif opt == '--fs-type':
5199       ctx.fs_type = value
5200     elif opt == '--create':
5201       sys.stderr.write(warning_prefix +
5202           ': The behaviour produced by the --create option is now the '
5203           'default,\nand passing the option is deprecated.\n')
5204     elif opt == '--profile':
5205       profiling = 1
5206     elif opt == '--symbol-transform':
5207       [pattern, replacement] = value.split(":")
5208       try:
5209         pattern = re.compile(pattern)
5210       except re.error, e:
5211         raise FatalError("'%s' is not a valid regexp." % (pattern,))
5212       ctx.symbol_transforms.append((pattern, replacement,))
5213
5214   if ctx.print_help:
5215     usage()
5216     sys.exit(0)
5217
5218   # Consistency check for options and arguments.
5219   if len(args) == 0:
5220     usage()
5221     sys.exit(1)
5222
5223   if len(args) > 1:
5224     sys.stderr.write(error_prefix +
5225                      ": must pass only one CVS repository.\n")
5226     usage()
5227     sys.exit(1)
5228
5229   cvsroot = args[0]
5230
5231   if ctx.use_cvs:
5232     ctx.cvs_repository = CVSRepositoryViaCVS(cvsroot)
5233   else:
5234     ctx.cvs_repository = CVSRepositoryViaRCS(cvsroot)
5235
5236   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
5237     raise FatalError("must pass one of '-s' or '--dump-only'.")
5238
5239   def not_both(opt1val, opt1name, opt2val, opt2name):
5240     if opt1val and opt2val:
5241       raise FatalError("cannot pass both '%s' and '%s'."
5242                        % (opt1name, opt2name,))
5243
5244   not_both(ctx.target, '-s',
5245            ctx.dump_only, '--dump-only')
5246
5247   not_both(ctx.dump_only, '--dump-only',
5248            ctx.existing_svnrepos, '--existing-svnrepos')
5249
5250   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
5251            ctx.existing_svnrepos, '--existing-svnrepos')
5252
5253   not_both(ctx.dump_only, '--dump-only',
5254            ctx.bdb_txn_nosync, '--bdb-txn-nosync')
5255
5256   not_both(ctx.quiet, '-q',
5257            ctx.verbose, '-v')
5258
5259   not_both(ctx.fs_type, '--fs-type',
5260            ctx.existing_svnrepos, '--existing-svnrepos')
5261
5262   if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
5263     raise FatalError("cannot pass --bdb-txn-nosync with --fs-type=%s."
5264                      % ctx.fs_type)
5265
5266   # Create the default project (using ctx.trunk, ctx.branches, and ctx.tags):
5267   ctx.project = Project(ctx.cvs_repository.cvs_repos_path,
5268                         ctx.trunk_base, ctx.branches_base, ctx.tags_base)
5269
5270   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
5271     raise FatalError("the svn-repos-path '%s' is not an "
5272                      "existing directory." % ctx.target)
5273
5274   if not ctx.dump_only and not ctx.existing_svnrepos \
5275      and (not ctx.dry_run) and os.path.exists(ctx.target):
5276     raise FatalError("the svn-repos-path '%s' exists.\n"
5277                      "Remove it, or pass '--existing-svnrepos'."
5278                      % ctx.target)
5279
5280   if ctx.target and not ctx.dry_run:
5281     # Verify that svnadmin can be executed.  The 'help' subcommand
5282     # should be harmless.
5283     try:
5284       check_command_runs([ctx.svnadmin, 'help'], 'svnadmin')
5285     except CommandFailedException, e:
5286       raise FatalError(
5287           '%s\n'
5288           'svnadmin could not be executed.  Please ensure that it is\n'
5289           'installed and/or use the --svnadmin option.' % (e,))
5290
5291   ctx.svn_property_setters.append(ExecutablePropertySetter())
5292
5293   ctx.svn_property_setters.append(BinaryFileEOLStyleSetter())
5294
5295   if ctx.mime_types_file:
5296     ctx.svn_property_setters.append(MimeMapper(ctx.mime_types_file))
5297
5298   if ctx.auto_props_file:
5299     ctx.svn_property_setters.append(AutoPropsPropertySetter(
5300         ctx.auto_props_file, ctx.auto_props_ignore_case))
5301
5302   ctx.svn_property_setters.append(BinaryFileDefaultMimeTypeSetter())
5303
5304   if ctx.eol_from_mime_type:
5305     ctx.svn_property_setters.append(EOLStyleFromMimeTypeSetter())
5306
5307   if ctx.no_default_eol:
5308     ctx.svn_property_setters.append(DefaultEOLStyleSetter(None))
5309   else:
5310     ctx.svn_property_setters.append(DefaultEOLStyleSetter('native'))
5311
5312   if not ctx.keywords_off:
5313     ctx.svn_property_setters.append(
5314         KeywordsPropertySetter(SVN_KEYWORDS_VALUE))
5315
5316   # Make sure the tmp directory exists.  Note that we don't check if
5317   # it's empty -- we want to be able to use, for example, "." to hold
5318   # tempfiles.  But if we *did* want check if it were empty, we'd do
5319   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
5320   if not os.path.exists(ctx.tmpdir):
5321     os.mkdir(ctx.tmpdir)
5322   elif not os.path.isdir(ctx.tmpdir):
5323     raise FatalError(
5324         "cvs2svn tried to use '%s' for temporary files, but that path\n"
5325         "  exists and is not a directory.  Please make it be a directory,\n"
5326         "  or specify some other directory for temporary files."
5327         % (ctx.tmpdir,))
5328
5329   # But do lock the tmpdir, to avoid process clash.
5330   try:
5331     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5332   except OSError, e:
5333     if e.errno == errno.EACCES:
5334       raise FatalError("Permission denied:"
5335                        + " No write access to directory '%s'." % ctx.tmpdir)
5336     if e.errno == errno.EEXIST:
5337       raise FatalError(
5338           "cvs2svn is using directory '%s' for temporary files, but\n"
5339           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
5340           "  cvs2svn process is currently using '%s' as its temporary\n"
5341           "  workspace.  If you are certain that is not the case,\n"
5342           "  then remove the '%s/cvs2svn.lock' subdirectory."
5343           % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir,))
5344     raise
5345   try:
5346     if profiling:
5347       import hotshot
5348       prof = hotshot.Profile('cvs2svn.hotshot')
5349       prof.runcall(convert, start_pass, end_pass)
5350       prof.close()
5351     else:
5352       convert(start_pass, end_pass)
5353   finally:
5354     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
5355     except: pass
5356
5357
5358 if __name__ == '__main__':
5359   try:
5360     main()
5361   except FatalException, e:
5362     sys.stderr.write(str(e))
5363     sys.exit(1)
5364
5365