cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
# Version string of this script: 'r' followed by whatever revision
# number keyword expansion has substituted into the
# "$LastChangedRevision$" keyword.  The slice [22:-2] skips the 22
# characters of "$LastChangedRevision: " and drops the trailing " $".
# With the keyword unexpanded (as written here) the slice is empty and
# VERSION is just 'r'.
VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 import cvs2svn_rcsparse
  23 import os
  24 import sys
  25 import sha
  26 import re
  27 import time
  28 import fileinput
  29 import string
  30 import getopt
  31 import stat
  32 import string
  33 import md5
  34 import marshal
  35 import errno
  36 import popen2
  37
  38 # Warnings and errors start with these strings.  They are typically
  39 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  40 warning_prefix = "WARNING"
  41 error_prefix = "ERROR"
  42
  43 # Make sure this Python is recent enough.
  44 if sys.hexversion < 0x2000000:
  45   sys.stderr.write("'%s: Python 2.0 or higher required, "
  46                    "see www.python.org.\n" % error_prefix)
  47   sys.exit(1)
  48
  49 # Pretend we have true booleans on older python versions
  50 try:
  51   True
  52 except:
  53   True = 1
  54   False = 0
  55
  56 # Minimal, incomplete, version of popen2.Popen3 for those platforms
  57 # for which popen2 does not provide it.
  58 try:
  59   Popen3 = popen2.Popen3
  60 except AttributeError:
  61   class Popen3:
  62     def __init__(self, cmd, capturestderr):
  63       if type(cmd) != str:
  64         cmd = " ".join(cmd)
  65       self.fromchild, self.tochild, self.childerr = popen2.popen3(cmd,
  66                                                                   mode='b')
  67     def wait(self):
  68       return self.fromchild.close() or self.tochild.close() or \
  69              self.childerr.close()
  70
# DBM module selection
#
# The three steps below adjust, at import time, which DBM
# implementation anydbm will use.  They patch sys.modules and
# anydbm._defaultmod, so their order matters.

# 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.
#    NOTE(review): presumably this has to run before 'import anydbm'
#    below for the substitution to take effect -- confirm.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.
#    anydbm._defaultmod is the module anydbm selected as its default.
#    Note that this message goes to stdout via print and spells out
#    'ERROR' itself instead of using error_prefix.
import anydbm
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  print 'ERROR: your installation of Python does not contain a suitable'
  print '  DBM module. This script cannot continue.'
  print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  print '  for details.'
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try to prefer gdbm
#    instead.  Unfortunately, gdbm appears not to be trouble free, either.
#    The old module is recognised by its 'bsddb' attribute lacking a
#    '__version__'.  If gdbm cannot be imported we only warn and keep
#    the default module.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found '
        'on your computer has been reported to malfunction on some datasets, '
        'causing KeyError exceptions. You may wish to upgrade your Python to '
        'version 2.3 or later.\n')
  else:
    anydbm._defaultmod = gdbm
 105
 106 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
 107 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
 108 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
 109
 110 # This really only matches standard '1.1.1.*'-style vendor revisions.
 111 # One could conceivably have a file whose default branch is 1.1.3 or
 112 # whatever, or was that at some point in time, with vendor revisions
 113 # 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
 114 # is the only time this regexp gets used), we'd have no basis for
 115 # assuming that the non-standard vendor branch had ever been the
 116 # default branch anyway, so we don't want this to match them anyway.
 117 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 118
# If this run's output is a repository, then (in the tmpdir) we use
# a dumpfile of this name for repository loads.
#
# If this run's output is a dumpfile, then this is the default name of
# that dumpfile, but in the current directory (unless the user has
# specified a dumpfile path, of course, in which case it will be
# wherever the user said).
DUMPFILE = 'cvs2svn-dump'

# This file appears with different suffixes at different stages of
# processing.  CVS revisions are cleaned and sorted here, for commit
# grouping.  See design-notes.txt for details.
DATAFILE = 'cvs2svn-data'

# This file contains a marshalled copy of all the statistics that we
# gather throughout the various runs of cvs2svn.  The data is stored
# as a marshalled dictionary.
STATISTICS_FILE = 'cvs2svn-statistics'

# This text file contains records (1 per line) that describe svn
# filesystem paths that are the opening and closing source revisions
# for copies to tags and branches.  The format is as follows:
#
# SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
#
# Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
# SVN_REVNUM are the primary and secondary sorting criteria for
# creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
# A sorted version of the above file.
SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'

# This file is a temporary file for storing symbolic_name -> closing
# CVSRevision until the end of our pass where we can look up the
# corresponding SVNRevNum for the closing revs and write these out to
# the SYMBOL_OPENINGS_CLOSINGS.
SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'

# Skeleton version of an svn filesystem.
# (These supersede and will eventually replace the two above.)
# See class SVNRepositoryMirror for how these work.
SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'

# Offsets pointing to the beginning of each SYMBOLIC_NAME in
# SYMBOL_OPENINGS_CLOSINGS_SORTED
SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'

# Maps CVSRevision.unique_key()s to lists of symbolic names, where
# the CVSRevision is the last such that is a source for those symbolic
# names.  For example, if branch B's number is 1.3.0.2 in this CVS
# file, and this file's 1.3 is the latest (by date) revision among
# *all* CVS files that is a source for branch B, then the
# CVSRevision.unique_key() corresponding to this file at 1.3 would
# list at least B in its list.
SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'

# Maps CVSRevision.unique_key() to corresponding line in s-revs.
###PERF Or, we could map to an offset into s-revs, instead of dup'ing
### the s-revs data in this database.
CVS_REVS_DB = 'cvs2svn-cvs-revs.db'

# Lists all symbolic names that are tags.  Keys are strings (symbolic
# names), values are ignorable.
TAGS_DB = 'cvs2svn-tags.db'

# A list of all tags.  Each line consists of the tag name and the
# number of files in which it exists, separated by a space.
TAGS_LIST = 'cvs2svn-tags.txt'

# A list of all branches.  The file is stored as a plain text file
# to make it easy to look at in an editor.  Each line contains the
# branch name, the number of files where the branch is created, the
# commit count, and a list of tags and branches that are defined on
# revisions in the branch.
BRANCHES_LIST = 'cvs2svn-branches.txt'

# These two databases provide a bidirectional mapping between
# CVSRevision.unique_key()s and Subversion revision numbers.
#
# The first maps CVSRevision.unique_key() to a number; the values are
# not unique.
#
# The second maps a number to a list of CVSRevision.unique_key()s.
CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'

# This database maps svn_revnums to tuples of (symbolic_name, date).
#
# The svn_revnums are the revision numbers of all non-primary
# SVNCommits.  No primary SVNCommit has a key in this database.
#
# The date is stored for all commits in this database.
#
# For commits that fill symbolic names, the symbolic_name is stored.
# For commits that are default branch syncs, the symbolic_name is None.
SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'

# This database maps svn_revnums of a default branch synchronization
# commit to the svn_revnum of the primary SVNCommit that motivated it.
#
# (NOTE: Secondary commits that fill branches and tags also have a
# motivating commit, but we do not record it because it is (currently)
# not needed for anything.)
#
# This mapping is used when generating the log message for the commit
# that synchronizes the default branch with trunk.
MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'

# How many bytes to read at a time from a pipe.  128 kiB should be
# large enough to be efficient without wasting too much memory.
PIPE_READ_SIZE = 128 * 1024

# Record the default RCS branches, if any, for CVS filepaths.
#
# The keys are CVS filepaths, relative to the top of the repository
# and with the ",v" stripped off, so they match the cvs paths used in
# Commit.commit().  The values are vendor branch revisions, such as
# '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
# represents the highest vendor branch revision thought to have ever
# been head of the default branch.
#
# The reason we record a specific vendor revision, rather than a
# default branch number, is that there are two cases to handle:
#
# One case is simple.  The RCS file lists a default branch explicitly
# in its header, such as '1.1.1'.  In this case, we know that every
# revision on the vendor branch is to be treated as head of trunk at
# that point in time.
#
# But there's also a degenerate case.  The RCS file does not currently
# have a default branch, yet we can deduce that for some period in the
# past it probably *did* have one.  For example, the file has vendor
# revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
# and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
# case, we should record 1.1.1.96 as the last vendor revision to have
# been the head of the default branch.
DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'

# Records the author and log message for each changeset.
# The keys are author+log digests, the same kind used to identify
# unique revisions in the .revs, etc files.  Each value is a tuple
# of two elements: '(author logmessage)'.
METADATA_DB = "cvs2svn-metadata.db"
 263
# Filename suffixes.  NOTE(review): presumably these are the suffixes
# that DATAFILE carries at its different processing stages -- confirm
# against the code that builds those names.
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
RESYNC_SUFFIX = '.resync'

# Sentinel for "no valid Subversion revision number".
SVN_INVALID_REVNUM = -1

COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP   = '-'
OP_ADD    = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY    = 'E'

# sha.digestsize * 2 is the length of a hex-encoded sha digest.
# NOTE(review): the leading 9 presumably accounts for what precedes the
# digest on a revs line (an 8-digit hex timestamp plus a space) --
# confirm against the code that writes and slices those lines.
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'
 288
 289 def temp(basename):
 290   """Return a path to BASENAME in Ctx().tmpdir.
 291   This is a convenience function to save horizontal space in source."""
 292   return os.path.join(Ctx().tmpdir, basename)
 293
 294 # Since the unofficial set also includes [/\] we need to translate those
 295 # into ones that don't conflict with Subversion limitations.
 296 def _clean_symbolic_name(name):
 297   """Return symbolic name NAME, translating characters that Subversion
 298   does not allow in a pathname."""
 299   name = name.replace('/','++')
 300   name = name.replace('\\','--')
 301   return name
 302
 303 def _path_join(*components):
 304   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 305   Empty component are skipped."""
 306   return string.join(filter(None, components), '/')
 307
 308 def run_command(command):
 309   if os.system(command):
 310     sys.exit('Command failed: "%s"' % command)
 311
 312 def relative_name(cvsroot, fname):
 313   l = len(cvsroot)
 314   if fname[:l] == cvsroot:
 315     if fname[l] == os.sep:
 316       return string.replace(fname[l+1:], os.sep, '/')
 317     return string.replace(fname[l:], os.sep, '/')
 318   sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
 319                    " cvsroot\n" % (error_prefix, cvsroot, fname))
 320   sys.exit(1)
 321
 322 def get_co_pipe(c_rev):
 323   """Return a command string, and the pipe created using that string.
 324   C_REV is a CVSRevision. The pipe returns the text of that CVS Revision."""
 325   ctx = Ctx()
 326   if ctx.use_cvs:
 327     pipe_cmd = 'cvs %s co -r%s -p %s' % \
 328                (ctx.cvs_global_arguments, c_rev.rev,
 329                 escape_shell_arg(ctx.cvs_module + c_rev.cvs_path))
 330   else:
 331     pipe_cmd = 'co -q -x,v -p%s %s' % \
 332                (c_rev.rev, escape_shell_arg(c_rev.rcs_path()))
 333   pipe = Popen3(pipe_cmd, True)
 334   pipe.tochild.close()
 335   return pipe_cmd, pipe
 336
 337 def generate_ignores(c_rev):
 338   # Read in props
 339   pipe_cmd, pipe = get_co_pipe(c_rev)
 340   buf = pipe.fromchild.read(PIPE_READ_SIZE)
 341   raw_ignore_val = ""
 342   while buf:
 343     raw_ignore_val = raw_ignore_val + buf
 344     buf = pipe.fromchild.read(PIPE_READ_SIZE)
 345   pipe.fromchild.close()
 346   error_output = pipe.childerr.read()
 347   exit_status = pipe.wait()
 348   if exit_status:
 349     sys.exit("%s: The command '%s' failed with exit status: %s\n"
 350              "and the following output:\n"
 351              "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
 352
 353   # Tweak props: First, convert any spaces to newlines...
 354   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 355   raw_ignores = raw_ignore_val.split('\n')
 356   ignore_vals = [ ]
 357   for ignore in raw_ignores:
 358     # Reset the list if we encounter a '!'
 359     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 360     if ignore == '!':
 361       ignore_vals = [ ]
 362       continue
 363     # Skip empty lines
 364     if len(ignore) == 0:
 365       continue
 366     ignore_vals.append(ignore)
 367   return ignore_vals
 368
 369 # Return a string that has not been returned by gen_key() before.
 370 gen_key_base = 0L
 371 def gen_key():
 372   global gen_key_base
 373   key = '%x' % gen_key_base
 374   gen_key_base = gen_key_base + 1
 375   return key
 376
 377 if sys.platform == "win32":
 378   def escape_shell_arg(str):
 379     return '"' + string.replace(str, '"', '"^""') + '"'
 380 else:
 381   def escape_shell_arg(str):
 382     return "'" + string.replace(str, "'", "'\\''") + "'"
 383
 384 def format_date(date):
 385   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 386   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 387   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 388
 389 def sort_file(infile, outfile):
 390   # sort the log files
 391
 392   # GNU sort will sort our dates differently (incorrectly!) if our
 393   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 394   # it to 'C'
 395   if os.environ.has_key('LC_ALL'):
 396     lc_all_tmp = os.environ['LC_ALL']
 397   else:
 398     lc_all_tmp = None
 399   os.environ['LC_ALL'] = 'C'
 400   if sys.platform == "win32":
 401     run_command('sort %s /T %s > %s' % (infile, Ctx().tmpdir, outfile))
 402   else:
 403     run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 404   if lc_all_tmp is None:
 405     del os.environ['LC_ALL']
 406   else:
 407     os.environ['LC_ALL'] = lc_all_tmp
 408
def print_node_tree(tree, root_node, indent_depth=0):
  """For debugging purposes.  Prints all nodes in TREE that are
  rooted at ROOT_NODE.  INDENT_DEPTH is merely for purposes of
  debugging with the print statement in this function.

  TREE is indexed by node key; each entry must provide items().
  Entries whose key starts with '/' are treated as flags and not
  descended into; the value of every other entry is used as the key of
  a child node and printed recursively, one indent level deeper."""
  # Print the banner only once, for the outermost call.
  if not indent_depth:
    print "TREE", "=" * 75
  # Two spaces of indentation per level, then the node key and entry.
  print "TREE:", " " * (indent_depth * 2), root_node, tree[root_node]
  for key, value in tree[root_node].items():
    if key[0] == '/': #Skip flags
      continue
    print_node_tree(tree, value, (indent_depth + 1))
 420
 421 def match_regexp_list(regexp_list, string):
 422   """Return 1 if string matches any of the compiled regexps in REGEXP_LIST,
 423   else return None."""
 424   for regexp in regexp_list:
 425     if regexp.match(string):
 426       return 1
 427
 428 # These constants represent the log levels that this script supports
 429 LOG_WARN = -1
 430 LOG_QUIET = 0
 431 LOG_NORMAL = 1
 432 LOG_VERBOSE = 2
 433 class Log:
 434   """A Simple logging facility.  Each line will be timestamped is
 435   self.use_timestamps is TRUE.  This class is a Borg, see
 436   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 437   __shared_state = {}
 438   def __init__(self):
 439     self.__dict__ = self.__shared_state
 440     if self.__dict__:
 441       return
 442     self.log_level = LOG_NORMAL
 443     # Set this to true if you want to see timestamps on each line output.
 444     self.use_timestamps = None
 445     self.logger = sys.stdout
 446
 447   def _timestamp(self):
 448     """Output a detailed timestamp at the beginning of each line output."""
 449     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 450
 451   def write(self, log_level, *args):
 452     """This is the public method to use for writing to a file.  Only
 453     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 454     there are multiple ARGS, they will be separated by a space."""
 455     if log_level > self.log_level:
 456       return
 457     if self.use_timestamps:
 458       self._timestamp()
 459     self.logger.write(' '.join(map(str,args)) + "\n")
 460
 461
 462 class Cleanup:
 463   """This singleton class manages any files created by cvs2svn.  When
 464   you first create a file, call Cleanup.register, passing the
 465   filename, and the last pass that you need the file.  After the end
 466   of that pass, your file will be cleaned up after running an optional
 467   callback.  This class is a Borg, see
 468   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 469
 470   __shared_state = {}
 471   def __init__(self):
 472     self.__dict__ = self.__shared_state
 473     if self.__dict__:
 474       return
 475     self._log = {}
 476     self._callbacks = {}
 477
 478   def register(self, file, which_pass, callback=None):
 479     """Register FILE for cleanup at the end of WHICH_PASS, running
 480     function CALLBACK prior to removal.  Registering a given FILE is
 481     idempotent; you may register as many times as you wish, but it
 482     will only be cleaned up once.
 483
 484     Note that if a file is registered multiple times, only the first
 485     callback registered for that file will be called at cleanup
 486     time.  Also note that if you register a database file you must
 487     close the database before cleanup, e.g. using a callback."""
 488     if not self._log.has_key(which_pass):
 489       self._log[which_pass] = {}
 490     self._log[which_pass][file] = 1
 491     if callback and not self._callbacks.has_key(file):
 492       self._callbacks[file] = callback
 493
 494   def cleanup(self, which_pass):
 495     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 496     if not self._log.has_key(which_pass):
 497       return
 498     for file in self._log[which_pass].keys():
 499       Log().write(LOG_VERBOSE, "Deleting", file)
 500       if self._callbacks.has_key(file):
 501         self._callbacks[file]()
 502       os.unlink(file)
 503
 504
 505 # Always use these constants for opening databases.
 506 DB_OPEN_READ = 'r'
 507 DB_OPEN_NEW = 'n'
 508
 509 # A wrapper for anydbm that uses the marshal module to store items as
 510 # strings.
 511 class Database:
 512   def __init__(self, filename, mode):
 513     # pybsddb3 has a bug which prevents it from working with
 514     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 515     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 516     # for databases protected by lock and transaction support
 517     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 518     #
 519     # Therefore, manually perform the removal (we can do this, because
 520     # we know that for bsddb - but *not* anydbm in general - the database
 521     # consists of one file with the name we specify, rather than several
 522     # based on that name).
 523     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 524       if os.path.isfile(filename):
 525         os.unlink(filename)
 526       mode = 'c'
 527
 528     self.db = anydbm.open(filename, mode)
 529
 530   def has_key(self, key):
 531     return self.db.has_key(key)
 532
 533   def __getitem__(self, key):
 534     return marshal.loads(self.db[key])
 535
 536   def __setitem__(self, key, value):
 537     self.db[key] = marshal.dumps(value)
 538
 539   def __delitem__(self, key):
 540     del self.db[key]
 541
 542   def get(self, key, default):
 543     if self.has_key(key):
 544       return self.__getitem__(key)
 545     return default
 546
 547
 548 class StatsKeeper:
 549   __shared_state = { }
 550   def __init__(self):
 551     self.__dict__ = self.__shared_state
 552     if self.__dict__:
 553       return
 554     self.filename = temp(STATISTICS_FILE)
 555     Cleanup().register(self.filename, pass8)
 556     # This can get kinda large, so we don't store it in our data dict.
 557     self.repos_files = { }
 558
 559     if os.path.exists(self.filename):
 560       self.unarchive()
 561     else:
 562       self.data = { 'cvs_revs_count' : 0,
 563                     'tags': { },
 564                     'branches' : { },
 565                     'repos_size' : 0,
 566                     'repos_file_count' : 0,
 567                     'svn_rev_count' : None,
 568                     'first_rev_date' : 1L<<32,
 569                     'last_rev_date' : 0,
 570                     'pass_timings' : { },
 571                     'start_time' : 0,
 572                     'end_time' : 0,
 573                     }
 574
 575   def log_duration_for_pass(self, duration, pass_num):
 576     self.data['pass_timings'][pass_num] = duration
 577
 578   def set_start_time(self, start):
 579     self.data['start_time'] = start
 580
 581   def set_end_time(self, end):
 582     self.data['end_time'] = end
 583
 584   def _bump_item(self, key, amount=1):
 585     self.data[key] = self.data[key] + amount
 586
 587   def reset_c_rev_info(self):
 588     self.data['cvs_revs_count'] = 0
 589     self.data['tags'] = { }
 590     self.data['branches'] = { }
 591
 592   def record_c_rev(self, c_rev):
 593     self._bump_item('cvs_revs_count')
 594
 595     for tag in c_rev.tags:
 596       self.data['tags'][tag] = None
 597     for branch in c_rev.branches:
 598       self.data['branches'][branch] = None
 599
 600     if c_rev.timestamp < self.data['first_rev_date']:
 601       self.data['first_rev_date'] = c_rev.timestamp
 602
 603     if c_rev.timestamp > self.data['last_rev_date']:
 604       self.data['last_rev_date'] = c_rev.timestamp
 605
 606     # Only add the size if this is the first time we see the file.
 607     if not self.repos_files.has_key(c_rev.fname):
 608       self._bump_item('repos_size', c_rev.file_size)
 609     self.repos_files[c_rev.fname] = None
 610
 611     self.data['repos_file_count'] = len(self.repos_files)
 612
 613   def set_svn_rev_count(self, count):
 614     self.data['svn_rev_count'] = count
 615
 616   def svn_rev_count(self):
 617     return self.data['svn_rev_count']
 618
 619   def archive(self):
 620     open(self.filename, 'w').write(marshal.dumps(self.data))
 621
 622   def unarchive(self):
 623     self.data = marshal.loads(open(self.filename, 'r').read())
 624
 625   def __str__(self):
 626     svn_revs_str = ""
 627     if self.data['svn_rev_count'] is not None:
 628       svn_revs_str = ('Total SVN Commits:      %10s\n'
 629                       % self.data['svn_rev_count'])
 630
 631     return ('\n'                                \
 632             'cvs2svn Statistics:\n'             \
 633             '------------------\n'              \
 634             'Total CVS Files:        %10i\n'    \
 635             'Total CVS Revisions:    %10i\n'    \
 636             'Total Unique Tags:      %10i\n'    \
 637             'Total Unique Branches:  %10i\n'    \
 638             'CVS Repos Size in KB:   %10i\n'    \
 639             '%s'                                \
 640             'First Revision Date:    %s\n'      \
 641             'Last Revision Date:     %s\n'      \
 642             '------------------'                \
 643             % (self.data['repos_file_count'],
 644                self.data['cvs_revs_count'],
 645                len(self.data['tags']),
 646                len(self.data['branches']),
 647                (self.data['repos_size'] / 1024),
 648                svn_revs_str,
 649                time.ctime(self.data['first_rev_date']),
 650                time.ctime(self.data['last_rev_date']),
 651                ))
 652
 653   def timings(self):
 654     passes = self.data['pass_timings'].keys()
 655     passes.sort()
 656     str = 'Timings:\n------------------\n'
 657
 658     def desc(val):
 659       if val == 1: return "second"
 660       return "seconds"
 661
 662     for pass_num in passes:
 663       duration = int(self.data['pass_timings'][pass_num])
 664       p_str = ('pass %d:%6d %s\n'
 665                % (pass_num, duration, desc(duration)))
 666       str = str + p_str
 667
 668     total = int(self.data['end_time'] - self.data['start_time'])
 669     str = str + ('total: %6d %s' % (total, desc(total)))
 670     return str
 671
 672
 673 class LastSymbolicNameDatabase:
 674   """ Passing every CVSRevision in s-revs to this class will result in
 675   a Database whose key is the last CVS Revision a symbolicname was
 676   seen in, and whose value is a list of all symbolicnames that were
 677   last seen in that revision."""
 678   def __init__(self, mode):
 679     self.symbols = {}
 680     self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
 681     Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
 682
 683   # Once we've gone through all the revs,
 684   # symbols.keys() will be a list of all tags and branches, and
 685   # their corresponding values will be a key into the last CVS revision
 686   # that they were used in.
 687   def log_revision(self, c_rev):
 688     # Gather last CVS Revision for symbolic name info and tag info
 689     for tag in c_rev.tags:
 690       self.symbols[tag] = c_rev.unique_key()
 691     if c_rev.op is not OP_DELETE:
 692       for branch in c_rev.branches:
 693         self.symbols[branch] = c_rev.unique_key()
 694
 695   # Creates an inversion of symbols above--a dictionary of lists (key
 696   # = CVS rev unique_key: val = list of symbols that close in that
 697   # rev.
 698   def create_database(self):
 699     for sym, rev_unique_key in self.symbols.items():
 700       if self.symbol_revs_db.has_key(rev_unique_key):
 701         ary = self.symbol_revs_db[rev_unique_key]
 702         ary.append(sym)
 703         self.symbol_revs_db[rev_unique_key] = ary
 704       else:
 705         self.symbol_revs_db[rev_unique_key] = [sym]
 706
 707
 708 class CVSRevisionDatabase:
 709   """A Database to store CVSRevision objects and retrieve them by their
 710   unique_key()."""
 711
 712   def __init__(self, mode):
 713     """Initialize an instance, opening database in MODE (like the MODE
 714     argument to Database or anydbm.open())."""
 715     self.cvs_revs_db = Database(temp(CVS_REVS_DB), mode)
 716     Cleanup().register(temp(CVS_REVS_DB), pass8)
 717
 718   def log_revision(self, c_rev):
 719     """Add C_REV, a CVSRevision, to the database."""
 720     self.cvs_revs_db[c_rev.unique_key()] = str(c_rev)
 721
 722   def get_revision(self, unique_key):
 723     """Return the CVSRevision stored under UNIQUE_KEY."""
 724     return CVSRevision(Ctx(), self.cvs_revs_db[unique_key])
 725
 726
 727 class TagsDatabase(Database):
 728   """A Database to store which symbolic names are tags.
 729   Each key is a tag name.
 730   The value has no meaning, and should be set to None."""
 731   def __init__(self, mode):
 732     Database.__init__(self, temp(TAGS_DB), mode)
 733     Cleanup().register(temp(TAGS_DB), pass8)
 734
 735
 736 class CVSRevision:
 737   def __init__(self, ctx, *args):
 738     """Initialize a new CVSRevision with Ctx object CTX, and ARGS.
 739
 740     If CTX is None, the following members and methods of the
 741     instantiated CVSRevision class object will be unavailable (or
 742     simply will not work correctly, if at all):
 743        cvs_path
 744        svn_path
 745        svn_trunk_path
 746        is_default_branch_revision()
 747
 748     (Note that this class treats CTX as const, because the caller
 749     likely passed in a Borg instance of a Ctx.  The reason this class
 750     takes CTX as as a parameter, instead of just instantiating a Ctx
 751     itself, is that this class should be usable outside cvs2svn.)
 752
 753     If there is one argument in ARGS, it is a string, in the format of
 754     a line from a revs file.  Do *not* include a trailing newline.
 755
 756     If there are multiple ARGS, there must be 16 of them,
 757     comprising a parsed revs line:
 758        timestamp       -->  (int) date stamp for this cvs revision
 759        digest          -->  (string) digest of author+logmsg
 760        prev_timestamp  -->  (int) date stamp for the previous cvs revision
 761        op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
 762        prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
 763        rev             -->  (string) this CVS rev, e.g., "1.3"
 764        next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
 765        file_in_attic   -->  (char or None) true if RCS file is in Attic
 766        file_executable -->  (char or None) true if RCS file has exec bit set.
 767        file_size       -->  (int) size of the RCS file
 768        deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
 769        mode            -->  (string or None) "kkv", "kb", etc.
 770        branch_name     -->  (string or None) branch on which this rev occurred
 771        tags            -->  (list of strings) all tags on this revision
 772        branches        -->  (list of strings) all branches rooted in this rev
 773        fname           -->  (string) relative path of file in CVS repos
 774
 775     The two forms of initialization are equivalent."""
 776
 777     self._ctx = ctx
 778     if len(args) == 16:
 779       (self.timestamp, self.digest, self.prev_timestamp, self.op,
 780        self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
 781        self.file_executable, self.file_size, self.deltatext_code, self.fname,
 782        self.mode, self.branch_name, self.tags, self.branches) = args
 783     elif len(args) == 1:
 784       data = args[0].split(' ', 14)
 785       self.timestamp = int(data[0], 16)
 786       self.digest = data[1]
 787       if data[2] == "*":
 788         self.prev_timestamp = 0
 789       else:
 790         self.prev_timestamp = int(data[2])
 791       self.op = data[3]
 792       self.prev_rev = data[4]
 793       if self.prev_rev == "*":
 794         self.prev_rev = None
 795       self.rev = data[5]
 796       self.next_rev = data[6]
 797       if self.next_rev == "*":
 798         self.next_rev = None
 799       self.file_in_attic = data[7]
 800       if self.file_in_attic == "*":
 801         self.file_in_attic = None
 802       self.file_executable = data[8]
 803       if self.file_executable == "*":
 804         self.file_executable = None
 805       self.file_size = int(data[9])
 806       self.deltatext_code = data[10]
 807       self.mode = data[11]
 808       if self.mode == "*":
 809         self.mode = None
 810       self.branch_name = data[12]
 811       if self.branch_name == "*":
 812         self.branch_name = None
 813       ntags = int(data[13])
 814       tags = data[14].split(' ', ntags + 1)
 815       nbranches = int(tags[ntags])
 816       branches = tags[ntags + 1].split(' ', nbranches)
 817       self.fname = branches[nbranches]
 818       self.tags = tags[:ntags]
 819       self.branches = branches[:nbranches]
 820     else:
 821       raise TypeError, 'CVSRevision() takes 2 or 16 arguments (%d given)' % \
 822           (len(args) + 1)
 823     if ctx is not None:
 824       self.cvs_path = relative_name(self._ctx.cvsroot, self.fname[:-2])
 825       self.svn_path = self._make_path(self.cvs_path, self.branch_name)
 826       self.svn_trunk_path = self._make_path(self.cvs_path)
 827
 828   # The 'primary key' of a CVS Revision is the revision number + the
 829   # filename.  To provide a unique key (say, for a dict), we just glom
 830   # them together in a string.  By passing in self.prev_rev or
 831   # self.next_rev, you can get the unique key for their respective
 832   # CVSRevisions.
 833   def unique_key(self, revnum=None):
 834     if revnum is None:
 835       revnum = self.rev
 836     return revnum + "/" + self.fname
 837
 838   def __str__(self):
 839     return ('%08lx %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s' % (
 840       self.timestamp, self.digest, self.prev_timestamp or "*", self.op,
 841       (self.prev_rev or "*"), self.rev, (self.next_rev or "*"),
 842       (self.file_in_attic or "*"), (self.file_executable or "*"),
 843       self.file_size,
 844       self.deltatext_code, (self.mode or "*"), (self.branch_name or "*"),
 845       len(self.tags), self.tags and " " or "", " ".join(self.tags),
 846       len(self.branches), self.branches and " " or "", " ".join(self.branches),
 847       self.fname, ))
 848
 849   # Returns true if this CVSRevision is the opening CVSRevision for
 850   # NAME (for this RCS file).
 851   def opens_symbolic_name(self, name):
 852     if name in self.tags:
 853       return 1
 854     if name in self.branches:
 855       # If this c_rev opens a branch and our op is OP_DELETE, then
 856       # that means that the file that this c_rev belongs to was
 857       # created on the branch, so for all intents and purposes, this
 858       # c_rev is *technically* not an opening.  See Issue #62 for more
 859       # information.
 860       if self.op != OP_DELETE:
 861         return 1
 862     return 0
 863
 864   def is_default_branch_revision(self):
 865     """Return 1 if SELF.rev of SELF.cvs_path is a default branch
 866     revision according to DEFAULT_BRANCHES_DB (see the conditions
 867     documented there), else return None."""
 868     if self._ctx._default_branches_db.has_key(self.cvs_path):
 869       val = self._ctx._default_branches_db[self.cvs_path]
 870       val_last_dot = val.rindex(".")
 871       our_last_dot = self.rev.rindex(".")
 872       default_branch = val[:val_last_dot]
 873       our_branch = self.rev[:our_last_dot]
 874       default_rev_component = int(val[val_last_dot + 1:])
 875       our_rev_component = int(self.rev[our_last_dot + 1:])
 876       if (default_branch == our_branch
 877           and our_rev_component <= default_rev_component):
 878         return 1
 879     # else
 880     return None
 881
 882   def _make_path(self, path, branch_name = None):
 883     """Return the trunk path or branch path for PATH.
 884
 885     If PATH is None, return None."""
 886     # For a while, we treated each top-level subdir of the CVS
 887     # repository as a "project root" and interpolated the appropriate
 888     # genealogy (trunk|tag|branch) in according to the official
 889     # recommended layout.  For example, the path '/foo/bar/baz.c' on
 890     # branch 'Rel2' would become
 891     #
 892     #   /foo/branches/Rel2/bar/baz.c
 893     #
 894     # and on trunk it would become
 895     #
 896     #   /foo/trunk/bar/baz.c
 897     #
 898     # However, we went back to the older and simpler method of just
 899     # prepending the genealogy to the front, instead of interpolating.
 900     # So now we produce:
 901     #
 902     #   /branches/Rel2/foo/bar/baz.c
 903     #   /trunk/foo/bar/baz.c
 904     #
 905     # Why?  Well, Jack Repenning pointed out that this way is much
 906     # friendlier to "anonymously rooted subtrees" (that's a tree where
 907     # the name of the top level dir doesn't matter, the point is that if
 908     # you cd into it and, say, run 'make', something good will happen).
 909     # By interpolating, we made it impossible to point cvs2svn at some
 910     # subdir in the CVS repository and convert it as a project, because
 911     # we'd treat every subdir underneath it as an independent project
 912     # root, which is probably not what the user wanted.
 913     #
 914     # Also, see Blair Zajac's post
 915     #
 916     #    http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
 917     #
 918     # and the surrounding thread, for why what people really want is a
 919     # way of specifying an in-repository prefix path, not interpolation.
 920     if path is None:
 921       return None
 922
 923     if branch_name:
 924       branch_name = _clean_symbolic_name(branch_name)
 925       return self._ctx.branches_base + '/' + branch_name + '/' + path
 926     else:
 927       return self._ctx.trunk_base + '/' + path
 928
 929   def rcs_path(self):
 930     """Returns the actual filesystem path to the RCS file of this
 931     CVSRevision."""
 932     if self.file_in_attic is None:
 933       return self.fname
 934     else:
 935       basepath, filename = os.path.split(self.fname)
 936       return os.path.join(basepath, 'Attic', filename)
 937
 938   def filename(self):
 939     "Return the last path component of self.fname, minus the ',v'"
 940     return os.path.split(self.fname)[-1][:-2]
 941
 942 class SymbolDatabase:
 943   """This database records information on all symbols in the RCS
 944   files.  It is created in pass 1 and it is used in pass 2."""
 945   def __init__(self):
 946     # A hash that maps tag names to commit counts
 947     self.tags = { }
 948     # A hash that maps branch names to lists of the format
 949     # [ create_count, commit_count, blockers ], where blockers
 950     # is a hash that lists the symbols that depend on the
 951     # the branch.  The blockers hash is used as a set, so the
 952     # values are not used.
 953     self.branches = { }
 954
 955   def register_tag_creation(self, name):
 956     """Register the creation of the tag NAME."""
 957     if not self.tags.has_key(name):
 958       self.tags[name] = 0
 959     self.tags[name] += 1
 960
 961   def _branch(self, name):
 962     """Helper function to get a branch node that will create and
 963     initialize the node if it does not exist."""
 964     if not self.branches.has_key(name):
 965       self.branches[name] = [ 0, 0, { } ]
 966     return self.branches[name]
 967
 968   def register_branch_creation(self, name):
 969     """Register the creation of the branch NAME."""
 970     self._branch(name)[0] += 1
 971
 972   def register_branch_commit(self, name):
 973     """Register a commit on the branch NAME."""
 974     self._branch(name)[1] += 1
 975
 976   def register_branch_blocker(self, name, blocker):
 977     """Register BLOCKER as a blocker on the branch NAME."""
 978     self._branch(name)[2][blocker] = None
 979
 980   def branch_has_commit(self, name):
 981     """Return non-zero if NAME has commits.  Returns 0 if name
 982     is not a branch or if it has no commits."""
 983     return self.branches.has_key(name) and self.branches[name][1]
 984
 985   def find_excluded_symbols(self, regexp_list):
 986     """Returns a hash of all symbols thaht match the regexps in
 987     REGEXP_LISTE.  The hash is used as a set so the values are
 988     not used."""
 989     excludes = { }
 990     for tag in self.tags.keys():
 991       if match_regexp_list(regexp_list, tag):
 992         excludes[tag] = None
 993     for branch in self.branches.keys():
 994       if match_regexp_list(regexp_list, branch):
 995         excludes[branch] = None
 996     return excludes
 997
 998   def find_branch_exclude_blockers(self, branch, excludes):
 999     """Find all blockers of BRANCH, excluding the ones in the hash
1000     EXCLUDES."""
1001     blockers = { }
1002     if excludes.has_key(branch):
1003       for blocker in self.branches[branch][2]:
1004         if not excludes.has_key(blocker):
1005           blockers[blocker] = None
1006     return blockers
1007
1008   def find_blocked_excludes(self, excludes):
1009     """Find all branches not in EXCLUDES that have blocking symbols that
1010     are not themselves excluded.  Return a hash that maps branch names
1011     to a hash of blockers.  The hash of blockes is used as a set so the
1012     values are not used."""
1013     blocked_branches = { }
1014     for branch in self.branches.keys():
1015       blockers = self.find_branch_exclude_blockers(branch, excludes)
1016       if blockers:
1017         blocked_branches[branch] = blockers
1018     return blocked_branches
1019
1020   def find_mismatches(self, excludes=None):
1021     """Find all symbols that are defined as both tags and branches,
1022     excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
1023     the symbol name, tag count, branch count and commit count."""
1024     if excludes is None:
1025       excludes = { }
1026     mismatches = [ ]
1027     for branch in self.branches.keys():
1028       if not excludes.has_key(branch) and self.tags.has_key(branch):
1029         mismatches.append((branch,                    # name
1030                            self.tags[branch],         # tag count
1031                            self.branches[branch][0],  # branch count
1032                            self.branches[branch][1])) # commit count
1033     return mismatches
1034
1035   def read(self):
1036     """Read the symbol database from files."""
1037     f = open(temp(TAGS_LIST))
1038     while 1:
1039       line = f.readline()
1040       if not line:
1041         break
1042       tag, count = line.split()
1043       self.tags[tag] = int(count)
1044
1045     f = open(temp(BRANCHES_LIST))
1046     while 1:
1047       line = f.readline()
1048       if not line:
1049         break
1050       words = line.split()
1051       self.branches[words[0]] = [ int(words[1]), int(words[2]), { } ]
1052       for blocker in words[3:]:
1053         self.branches[words[0]][2][blocker] = None
1054
1055   def write(self):
1056     """Store the symbol database to files."""
1057     f = open(temp(TAGS_LIST), "w")
1058     Cleanup().register(temp(TAGS_LIST), pass2)
1059     for tag, count in self.tags.items():
1060       f.write("%s %d\n" % (tag, count))
1061
1062     f = open(temp(BRANCHES_LIST), "w")
1063     Cleanup().register(temp(BRANCHES_LIST), pass2)
1064     for branch, info in self.branches.items():
1065       f.write("%s %d %d" % (branch, info[0], info[1]))
1066       if info[2]:
1067         f.write(" ")
1068         f.write(" ".join(info[2].keys()))
1069       f.write("\n")
1070
class CollectData(cvs2svn_rcsparse.Sink):
  """Sink for the RCS parser that gathers per-file revision data.

  For each RCS file, set_fname() is called first to reset the per-file
  state; the parser callbacks then record symbols and revisions, and
  set_revision_info() writes one CVSRevision line per revision to the
  revs file.  Symbol statistics accumulate across files in
  self.symbol_db, and problems that must abort the conversion are
  collected in self.fatal_errors."""

  def __init__(self):
    """Open the output files and databases, registering each with
    Cleanup(), and initialize the cross-file counters."""
    self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
    self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
    self.default_branches_db = Database(temp(DEFAULT_BRANCHES_DB), DB_OPEN_NEW)
    Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
    Cleanup().register(temp(METADATA_DB), pass8)
    self.fatal_errors = []
    self.num_files = 0
    self.symbol_db = SymbolDatabase()

    # 1 if we've collected data for at least one file, None otherwise.
    self.found_valid_file = None

    # See set_fname() for initializations of other variables.

  def set_fname(self, canonical_name, filename):
    """Prepare to receive data for FILENAME.  FILENAME is the absolute
    filesystem path to the file in question, and CANONICAL_NAME is
    FILENAME with the 'Attic' component removed (if the file is indeed
    in the Attic).

    This resets all per-file state, so it must be called before the
    parser delivers any data for the file."""
    self.fname = canonical_name

    # We calculate and save some file metadata here, where we can do
    # it only once per file, instead of waiting until later where we
    # would have to do the same calculations once per CVS *revision*.

    # Path relative to the CVS root, with the trailing ',v' removed.
    self.rel_name = relative_name(Ctx().cvsroot, self.fname)[:-2]

    # If the paths are not the same, then that means that the
    # canonical_name has had the 'Attic' component stripped out.
    self.file_in_attic = None
    if canonical_name != filename:
      self.file_in_attic = 1

    file_stat = os.stat(filename)
    # The size of our file in bytes
    self.file_size = file_stat[stat.ST_SIZE]

    # Whether or not the executable bit is set.
    self.file_executable = None
    if file_stat[stat.ST_MODE] & stat.S_IXUSR:
      self.file_executable = 1

    # revision -> [timestamp, author, old-timestamp]
    self.rev_data = { }

    # Maps revision number (key) to the revision number of the
    # previous revision along this line of development.
    #
    # For the first revision R on a branch, we consider the revision
    # from which R sprouted to be the 'previous'.
    #
    # Note that this revision can't be determined arithmetically (due
    # to cvsadmin -o, which is why this is necessary).
    self.prev_rev = { }

    # This dict is essentially self.prev_rev with the values mapped in
    # the other direction, so following key -> value will yield you
    # the next revision number
    self.next_rev = { }

    # Track the state of each revision so that in set_revision_info,
    # we can determine if our op is an add/change/delete.  We can do
    # this because in set_revision_info, we'll have all of the
    # revisions for a file at our fingertips, and we need to examine
    # the state of our prev_rev to determine if we're an add or a
    # change--without the state of the prev_rev, we are unable to
    # distinguish between an add and a change.
    self.rev_state = { }

    # Hash mapping branch numbers, like '1.7.2', to branch names,
    # like 'Release_1_0_dev'.
    self.branch_names = { }

    # RCS flags (used for keyword expansion).
    self.mode = None

    # Hash mapping revision numbers, like '1.7', to lists of names
    # indicating which branches sprout from that revision, like
    # ['Release_1_0_dev', 'experimental_driver', ...].
    self.branchlist = { }

    # Like self.branchlist, but the values are lists of tag names that
    # apply to the key revision.
    self.taglist = { }

    # If set, this is an RCS branch number -- rcsparse calls this the
    # "principal branch", but CVS and RCS refer to it as the "default
    # branch", so that's what we call it, even though the rcsparse API
    # setter method is still 'set_principal_branch'.
    self.default_branch = None

    # If the RCS file doesn't have a default branch anymore, but does
    # have vendor revisions, then we make an educated guess that those
    # revisions *were* the head of the default branch up until the
    # commit of 1.2, at which point the file's default branch became
    # trunk.  This records the date at which 1.2 was committed.
    self.first_non_vendor_revision_date = None

    # A list of all symbols defined for the current file.  Used to
    # prevent multiple definitions of a symbol, something which can
    # easily happen when --symbol-transform is used.
    self.defined_symbols = [ ]

  def set_principal_branch(self, branch):
    """Record BRANCH as the default branch of the current file."""
    self.default_branch = branch

  def set_expansion(self, mode):
    """Record MODE as the RCS keyword expansion mode of the current
    file."""
    self.mode = mode

  def set_branch_name(self, branch_number, name):
    """Record that BRANCH_NUMBER is the branch number for branch NAME,
    and that NAME sprouts from BRANCH_NUMBER .
    BRANCH_NUMBER is an RCS branch number with an odd number of components,
    for example '1.7.2' (never '1.7.0.2').

    If BRANCH_NUMBER already has a name, keep the first one and write
    a warning to stderr."""
    if branch_number in self.branch_names:
      sys.stderr.write("%s: in '%s':\n"
                       "   branch '%s' already has name '%s',\n"
                       "   cannot also have name '%s', ignoring the latter\n"
                       % (warning_prefix, self.fname, branch_number,
                          self.branch_names[branch_number], name))
      return

    self.branch_names[branch_number] = name
    # The branchlist is keyed on the revision number from which the
    # branch sprouts, so strip off the odd final component.
    sprout_rev = branch_number[:branch_number.rfind(".")]
    self.branchlist.setdefault(sprout_rev, []).append(name)
    self.symbol_db.register_branch_creation(name)

  def rev_to_branch_name(self, revision):
    """Return the name of the branch on which REVISION lies.
    REVISION is a non-branch revision number with an even number of,
    components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
    For the convenience of callers, REVISION can also be a trunk
    revision such as '1.2', in which case just return None.

    Also return None if the branch of REVISION has no recorded name."""
    if trunk_rev.match(revision):
      return None
    return self.branch_names.get(revision[:revision.rindex(".")])

  def add_cvs_branch(self, revision, branch_name):
    """Record the root revision and branch revision for BRANCH_NAME,
    based on REVISION.  REVISION is a CVS branch number having an even
    number of components where the second-to-last is '0'.  For
    example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
    from 1.7 and has branch number 1.7.2."""
    # Drop the second-to-last component: '1.7.0.2' -> '1.7' + '.2'.
    last_dot = revision.rfind(".")
    branch_rev = revision[:last_dot]
    last2_dot = branch_rev.rfind(".")
    branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
    self.set_branch_name(branch_rev, branch_name)

  def define_tag(self, name, revision):
    """Record a bidirectional mapping between symbolic NAME and REVISION.
    REVISION is an unprocessed revision number from the RCS file's
    header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
    This function will determine what kind of symbolic name it is by
    inspection, and record it in the right places.

    NAME is first passed through every configured symbol transform.
    A name defined more than once in the same file is reported on
    stderr and added to self.fatal_errors, but is still recorded."""
    for (pattern, replacement) in Ctx().symbol_transforms:
      newname = re.sub(pattern, replacement, name)
      if newname != name:
        Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
                    % (name, newname))
        name = newname
    if name in self.defined_symbols:
      err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
                % (error_prefix, name, self.fname)
      sys.stderr.write(err + "\n")
      self.fatal_errors.append(err)
    self.defined_symbols.append(name)
    if branch_tag.match(revision):
      self.add_cvs_branch(revision, name)
    elif vendor_tag.match(revision):
      self.set_branch_name(revision, name)
    else:
      self.taglist.setdefault(revision, []).append(name)
      self.symbol_db.register_tag_creation(name)

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    """Record the tree information for REVISION of the current file.

    TIMESTAMP, AUTHOR and STATE are the revision's date stamp, author
    and RCS state.  BRANCHES lists the revisions that begin branches
    sprouting from REVISION, and NEXT is the RCS 'next' revision (or a
    false value if there is none)."""

    # Record the state of our revision for later calculations
    self.rev_state[revision] = state

    # store the rev_data as a list in case we have to jigger the timestamp
    self.rev_data[revision] = [int(timestamp), author, None]

    # When on trunk, the RCS 'next' revision number points to what
    # humans might consider to be the 'previous' revision number.  For
    # example, 1.3's RCS 'next' is 1.2.
    #
    # However, on a branch, the RCS 'next' revision number really does
    # point to what humans would consider to be the 'next' revision
    # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
    #
    # In other words, in RCS, 'next' always means "where to find the next
    # deltatext that you need this revision to retrieve".
    #
    # That said, we don't *want* RCS's behavior here, so we determine
    # whether we're on trunk or a branch and set self.prev_rev
    # accordingly.
    #
    # One last thing.  Note that if REVISION is a branch revision,
    # instead of mapping REVISION to NEXT, we instead map NEXT to
    # REVISION.  Since we loop over all revisions in the file before
    # doing anything with the data we gather here, this 'reverse
    # assignment' effectively does the following:
    #
    # 1. Gives us no 'prev' value for REVISION (in this
    # iteration... it may have been set in a previous iteration)
    #
    # 2. Sets the 'prev' value for the revision with number NEXT to
    # REVISION.  So when we come around to the branch revision whose
    # revision value is NEXT, its 'prev' and 'prev_rev' are already
    # set.
    if trunk_rev.match(revision):
      self.prev_rev[revision] = next
      self.next_rev[next] = revision
    elif next:
      self.prev_rev[next] = revision
      self.next_rev[revision] = next

    for b in branches:
      self.prev_rev[b] = revision

    # Ratchet up the highest vendor head revision, if necessary.
    if self.default_branch:
      default_branch_root = self.default_branch + "."
      # Same prefix and same depth means REVISION lies directly on the
      # default branch (not on a branch sprouting from it).
      if (revision.startswith(default_branch_root)
          and (default_branch_root.count('.') == revision.count('.'))):
        # This revision is on the default branch, so record that it is
        # the new highest default branch head revision.
        self.default_branches_db[self.rel_name] = revision
    else:
      # No default branch, so make an educated guess.
      if revision == '1.2':
        # This is probably the time when the file stopped having a
        # default branch, so make a note of it.
        self.first_non_vendor_revision_date = timestamp
      else:
        m = vendor_revision.match(revision)
        if m and ((not self.first_non_vendor_revision_date)
                  or (timestamp < self.first_non_vendor_revision_date)):
          # We're looking at a vendor revision, and it wasn't
          # committed after this file lost its default branch, so bump
          # the maximum trunk vendor revision in the permanent record.
          self.default_branches_db[self.rel_name] = revision

    if not trunk_rev.match(revision):
      # Check for unlabeled branches, record them.  We tried to collect
      # all branch names when we parsed the symbolic name header
      # earlier, of course, but that didn't catch unlabeled branches.
      # If a branch is unlabeled, this is our first encounter with it,
      # so we have to record its data now.
      branch_number = revision[:revision.rindex(".")]
      if branch_number not in self.branch_names:
        branch_name = "unlabeled-" + branch_number
        self.set_branch_name(branch_number, branch_name)

      # Register the commit on this non-trunk branch
      branch_name = self.branch_names[branch_number]
      self.symbol_db.register_branch_commit(branch_name)

  def tree_completed(self):
    "The revision tree has been parsed.  Analyze it for consistency."

    # Our algorithm depends upon the timestamps on the revisions occurring
    # monotonically over time.  That is, we want to see rev 1.34 occur in
    # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
    # sorting), and then tried to insert 1.34, we'd be screwed.

    # to perform the analysis, we'll simply visit all of the 'previous'
    # links that we have recorded and validate that the timestamp on the
    # previous revision is before the specified revision

    # if we have to resync some nodes, then we restart the scan. just keep
    # looping as long as we need to restart.
    while 1:
      for current, prev in self.prev_rev.items():
        if not prev:
          # no previous revision exists (i.e. the initial revision)
          continue
        t_c = self.rev_data[current][0]
        t_p = self.rev_data[prev][0]
        if t_p >= t_c:
          # the previous revision occurred later than the current revision.
          # shove the previous revision back in time (and any before it that
          # may need to shift).

          # We sync backwards and not forwards because any given CVS
          # Revision has only one previous revision.  However, a CVS
          # Revision can *be* a previous revision for many other
          # revisions (e.g., a revision that is the source of multiple
          # branches).  This becomes relevant when we do the secondary
          # synchronization in pass 2--we can make certain that we
          # don't resync a revision earlier than its previous
          # revision, but it would be non-trivial to make sure that we
          # don't resync revision R *after* any revisions that have R
          # as a previous revision.
          while t_p >= t_c:
            self.rev_data[prev][0] = t_c - 1    # new timestamp
            self.rev_data[prev][2] = t_p        # old timestamp
            delta = t_c - 1 - t_p
            msg =  "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
                  % (self.rel_name,
                     prev, time.ctime(t_p), delta)
            Log().write(LOG_VERBOSE, msg)
            if (delta > COMMIT_THRESHOLD
                or delta < (COMMIT_THRESHOLD * -1)):
              # (Named 'fmt' rather than 'str' so as not to shadow
              # the builtin.)
              fmt = "%s: Significant timestamp change for '%s' (%d seconds)"
              Log().write(LOG_WARN, fmt % (warning_prefix, self.rel_name,
                                           delta))
            current = prev
            prev = self.prev_rev[current]
            if not prev:
              break
            t_c = t_c - 1               # self.rev_data[current][0]
            t_p = self.rev_data[prev][0]

          # break from the for-loop
          break
      else:
        # finished the for-loop (no resyncing was performed)
        return

  def set_revision_info(self, revision, log, text):
    """Write the revs-file line for REVISION of the current file.

    LOG is the revision's log message and TEXT its deltatext.  Also
    records any timestamp adjustment in the resync file, and stores
    the (author, log) pair in the metadata db under its digest."""
    timestamp, author, old_ts = self.rev_data[revision]
    digest = sha.new(log + '\0' + author).hexdigest()
    if old_ts:
      # the timestamp on this revision was changed. log it for later
      # resynchronization of other files's revisions that occurred
      # for this time and log message.
      self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))

    # "...Give back one kadam to honor the Hebrew God whose Ark this is."
    #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
    #
    # If revision 1.1 appears to have been created via 'cvs add'
    # instead of 'cvs import', then this file probably never had a
    # default branch, so retroactively remove its record in the
    # default branches db.  The test is that the log message CVS uses
    # for 1.1 in imports is "Initial revision\n" with no period.
    if revision == '1.1' and log != 'Initial revision\n':
      if self.default_branches_db.has_key(self.rel_name):
        del self.default_branches_db[self.rel_name]

    # Get the timestamp of the previous revision
    prev_rev = self.prev_rev.get(revision, None)
    prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])

    # How to tell if a CVSRevision is an add, a change, or a deletion:
    #
    # It's a delete if RCS state is 'dead'
    #
    # It's an add if RCS state is 'Exp.' and
    #      - we either have no previous revision
    #        or
    #      - we have a previous revision whose state is 'dead'
    #
    # Anything else is a change.
    if self.rev_state[revision] == 'dead':
      op = OP_DELETE
    elif prev_rev is None or self.rev_state[prev_rev] == 'dead':
      op = OP_ADD
    else:
      op = OP_CHANGE

    if text:
      deltatext_code = DELTATEXT_NONEMPTY
    else:
      deltatext_code = DELTATEXT_EMPTY

    # NOTE(review): the direct self.prev_rev[revision] lookup below
    # raises KeyError when no 'previous' entry was ever recorded for
    # REVISION, although the lookups above tolerate that case.  This
    # is left as it was -- confirm whether it is intended.
    c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp, op,
                        self.prev_rev[revision], revision,
                        self.next_rev.get(revision),
                        self.file_in_attic, self.file_executable,
                        self.file_size,
                        deltatext_code, self.fname,
                        self.mode, self.rev_to_branch_name(revision),
                        self.taglist.get(revision, []),
                        self.branchlist.get(revision, []))
    self.revs.write(str(c_rev) + "\n")
    StatsKeeper().record_c_rev(c_rev)

    # has_key() is kept here on purpose: the db is a project Database
    # object rather than a plain dict.
    if not self.metadata_db.has_key(digest):
      self.metadata_db[digest] = (author, log)

  def parse_completed(self):
    """Finish the current file: register its symbols as blockers of
    their parent branches and count the file."""
    # Walk through all branches and tags and register them with
    # their parent branch in the symbol database.
    for table in (self.taglist, self.branchlist):
      for revision, symbols in table.items():
        # The parent branch depends only on the revision, so look it
        # up once per revision rather than once per symbol.
        name = self.rev_to_branch_name(revision)
        if name is None:
          continue
        for symbol in symbols:
          self.symbol_db.register_branch_blocker(name, symbol)

    self.num_files = self.num_files + 1

  def write_symbol_db(self):
    """Store the accumulated symbol database to its files."""
    self.symbol_db.write()

1479
class SymbolingsLogger:
  """Manage the file that contains lines for symbol openings and
  closings.

  This data will later be used to determine valid SVNRevision ranges
  from which a file can be copied when creating a branch or tag in
  Subversion.  Do this by finding "Openings" and "Closings" for each
  file copied onto a branch or tag.

  An "Opening" is the CVSRevision from which a given branch/tag
  sprouts on a path.

  The "Closing" for that branch/tag and path is the next CVSRevision
  on the same line of development as the opening.

  For example, on file 'foo.c', branch BEE has branch number 1.2.2 and
  obviously sprouts from revision 1.2.  Therefore, 1.2 is the opening
  for BEE on path 'foo.c', and 1.3 is the closing for BEE on path
  'foo.c'.  Note that there may be many revisions chronologically
  between 1.2 and 1.3, for example, revisions on branches of 'foo.c',
  perhaps even including on branch BEE itself.  But 1.3 is the next
  revision *on the same line* as 1.2, that is why it is the closing
  revision for those symbolic names of which 1.2 is the opening.

  The reason for doing all this hullabaloo is to make branch and tag
  creation as efficient as possible by minimizing the number of copies
  and deletes per creation.  For example, revisions 1.2 and 1.3 of
  foo.c might correspond to revisions 17 and 30 in Subversion.  That
  means that when creating branch BEE, there is some motivation to do
  the copy from one of 17-30.  Now if there were another file,
  'bar.c', whose opening and closing CVSRevisions for BEE corresponded
  to revisions 24 and 39 in Subversion, we would know that the ideal
  thing would be to copy the branch from somewhere between 24 and 29,
  inclusive.
  """
  def __init__(self):
    """Open the symbolings file and the temporary closings file for
    writing, and register both paths with Cleanup()."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
    self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
    Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)

    # The keys of this dictionary are Subversion repository *source*
    # paths for which we've encountered an 'opening'.  The values are
    # lists of the symbolic names that this path has opened.  The only
    # paths that should be in this dict are paths whose corresponding
    # CVSRevision is a default branch revision.
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Log any openings found in C_REV, and if C_REV.next_rev is not
    None, a closing.  The opening uses SVN_REVNUM, but the closing (if
    any) will have its revnum determined later.

    No opening is logged when C_REV is a deletion (OP_DELETE)."""
    for name in c_rev.tags + c_rev.branches:
      name = _clean_symbolic_name(name)
      self._note_default_branch_opening(c_rev, name)
      if c_rev.op != OP_DELETE:
        self._log(name, svn_revnum, c_rev.svn_path, OPENING)

      # If our c_rev has a next_rev, then that's the closing rev for
      # this source revision.  Log it to closings for later processing
      # since we don't know the svn_revnum yet.
      if c_rev.next_rev is not None:
        self.closings.write('%s %s\n' %
                            (name, c_rev.unique_key(c_rev.next_rev)))

  def _log(self, name, svn_revnum, svn_path, type):
    """Write out a single line to the symbol_openings_closings file
    representing that svn_revnum of svn_path is either the opening or
    closing (TYPE) of NAME (a symbolic name).

    TYPE should only be one of the following global constants:
    OPENING or CLOSING."""
    # '%.8d' zero-pads the revnum to at least 8 digits, which is
    # enough for 99,999,999 SVN revs.  That *should* be enough.
    self.symbolings.write('%s %.8d %s %s\n' % (name, svn_revnum,
                                               type, svn_path))

  def close(self):
    """Iterate through the closings file, lookup the svn_revnum for
    each closing CVSRevision, and write a proper line out to the
    symbolings file.  Both files are closed afterwards."""
    # Use this to get the c_rev.svn_path of our rev_key
    cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    closings_input = fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP))
    try:
      for line in closings_input:
        # Each line is "<name> <rev_key>"; split only on the first
        # space so that the remainder is kept whole as the key.
        (name, rev_key) = line.rstrip().split(" ", 1)
        svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

        c_rev = cvs_revs_db.get_revision(rev_key)
        self._log(name, svn_revnum, c_rev.svn_path, CLOSING)
    finally:
      # Release the handle on the temporary closings file explicitly
      # instead of relying on garbage collection to do it.
      closings_input.close()

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Record SYMBOLIC_NAME as opened on C_REV.svn_trunk_path in
    self.open_paths_with_default_branches.

    Note that no check is made here that C_REV actually is a default
    branch revision; every call records the name.  The recorded names
    are turned into closings by log_default_branch_closing()."""
    self.open_paths_with_default_branches.setdefault(
        c_rev.svn_trunk_path, [ ]).append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If self.open_paths_with_default_branches contains
    C_REV.svn_trunk_path, then log each name in
    self.open_paths_with_default_branches[C_REV.svn_trunk_path] as a
    closing with SVN_REVNUM as the closing revision number, and forget
    the path."""
    path = c_rev.svn_trunk_path
    names = self.open_paths_with_default_branches.get(path)
    if names is not None:
      # log each symbol as a closing
      for name in names:
        self._log(name, svn_revnum, path, CLOSING)
      # Remove them from the openings list as we're done with them.
      del self.open_paths_with_default_branches[path]
1593
1594
class PersistenceManager:
  """The PersistenceManager allows us to effectively store SVNCommits
  to disk and retrieve them later using only their subversion revision
  number as the key.  It also returns the subversion revision number
  for a given CVSRevision's unique key.

  All information pertinent to each SVNCommit is stored in a series of
  on-disk databases so that SVNCommits can be retrieved on-demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager will initialize a new set of on-disk
  databases and be fully-featured.
  In 'read' mode, PersistenceManager will open existing on-disk databases
  and the set_* methods will raise RuntimeError."""
  def __init__(self, mode):
    """Open the databases in MODE.  Raise RuntimeError if MODE is
    neither DB_OPEN_NEW nor DB_OPEN_READ."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
    self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
    Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
    self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
    Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
    self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
    Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
    # The metadata and CVS revision databases are only ever read here,
    # whatever MODE is.
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      self.tags_db = TagsDatabase(DB_OPEN_READ)
      self.motivating_revnums = Database(temp(MOTIVATING_REVNUMS), mode)
      Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def _check_writable(self):
    """Raise RuntimeError if this PersistenceManager was opened in
    DB_OPEN_READ mode.  Called at the top of every set_* method."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    This method can throw SVNCommitInternalInconsistencyError.
    """
    c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
    if c_rev_keys is None:
      return None

    # Only build the SVNCommit once we know the revision exists, so
    # that a miss costs nothing and has no side effects.
    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      # Set the author and log message for this commit by using
      # CVSRevision metadata, but only if haven't done so already.
      if digest is None:
        digest = c_rev.digest
        author, log_msg = self.svn_commit_metadata[digest]
        svn_commit.set_author(author)
        svn_commit.set_log_msg(log_msg)

    # If we're doing a trunk-only conversion, we don't need to do any
    # more work.  (tags_db and motivating_revnums do not even exist in
    # that case; see __init__.)
    if Ctx().trunk_only:
      return svn_commit

    name, date = self._get_name_and_date(svn_revnum)
    if name:
      svn_commit.set_symbolic_name(name)
      svn_commit.set_date(date)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
    if motivating_revnum:
      svn_commit.set_motivating_revnum(int(motivating_revnum))
      # This overrides the date add_revision() derived from the
      # CVSRevisions with the stored one.
      svn_commit.set_date(date)

    if len(svn_commit.cvs_revs) and name:
      msg = """An SVNCommit cannot have cvs_revisions *and* a
      corresponding symbolic name ('%s') to fill.""" % name
      raise SVNCommit.SVNCommitInternalInconsistencyError(msg)

    return svn_commit

  def set_cvs_revs(self, svn_revnum, cvs_revs):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS.  Raise RuntimeError in read-only mode."""
    self._check_writable()
    # Compute each unique key once and reuse it for logging and for
    # both directions of the mapping.
    unique_keys = [c_rev.unique_key() for c_rev in cvs_revs]
    for key in unique_keys:
      Log().write(LOG_VERBOSE, " ", key)
    self.svn2cvs_db[str(svn_revnum)] = unique_keys
    for key in unique_keys:
      self.cvs2svn_db[key] = svn_revnum

  def set_name_and_date(self, svn_revnum, name, date):
    """Associate symbolic name NAME and DATE with SVN_REVNUM, and
    remember SVN_REVNUM as the revision in which NAME was last filled.
    Raise RuntimeError in read-only mode."""
    self._check_writable()
    self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
    self.last_filled[name] = svn_revnum

  def _get_name_and_date(self, svn_revnum):
    """Return a tuple containing the symbolic name and date associated
    with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
    associated with it."""
    return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))

  def set_motivating_revnum(self, svn_revnum, motivating_revnum):
    """Store MOTIVATING_REVNUM as the value of SVN_REVNUM.  Raise
    RuntimeError in read-only mode."""
    self._check_writable()
    self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
1718
1719
1720 class CVSCommit:
1721   """Each instance of this class contains a number of CVS Revisions
1722   that correspond to one or more Subversion Commits.  After all CVS
1723   Revisions are added to the grouping, calling process_revisions will
1724   generate a Subversion Commit (or Commits) for the set of CVS
1725   Revisions in the grouping."""
1726
1727   def __init__(self, digest, author, log):
1728     self.digest = digest
1729     self.author = author
1730     self.log = log
1731
1732     # Symbolic names for which the last source revision has already
1733     # been seen and for which the CVSRevisionAggregator has already
1734     # generated a fill SVNCommit.  See self.process_revisions().
1735     self.done_symbols = [ ]
1736
1737     self.files = { }
1738     # Lists of CVSRevisions
1739     self.changes = [ ]
1740     self.deletes = [ ]
1741
1742     # Start out with a t_min higher than any incoming time T, and a
1743     # t_max lower than any incoming T.  This way the first T will
1744     # push t_min down to T, and t_max up to T, naturally (without any
1745     # special-casing), and successive times will then ratchet them
1746     # outward as appropriate.
1747     self.t_min = 1L<<32
1748     self.t_max = 0
1749
1750     # This will be set to the SVNCommit that occurs in self._commit.
1751     self.motivating_commit = None
1752
1753     # This is a list of all non-primary commits motivated by the main
1754     # commit.  We gather these so that we can set their dates to the
1755     # same date as the primary commit.
1756     self.secondary_commits = [ ]
1757
1758     # State for handling default branches.
1759     #
1760     # Here is a tempting, but ultimately nugatory, bit of logic, which
1761     # I share with you so you may appreciate the less attractive, but
1762     # refreshingly non-nugatory, logic which follows it:
1763     #
1764     # If some of the commits in this txn happened on a non-trunk
1765     # default branch, then those files will have to be copied into
1766     # trunk manually after being changed on the branch (because the
1767     # RCS "default branch" appears as head, i.e., trunk, in practice).
1768     # As long as those copies don't overwrite any trunk paths that
1769     # were also changed in this commit, then we can do the copies in
1770     # the same revision, because they won't cover changes that don't
1771     # appear anywhere/anywhen else.  However, if some of the trunk dst
1772     # paths *did* change in this commit, then immediately copying the
1773     # branch changes would lose those trunk mods forever.  So in this
1774     # case, we need to do at least that copy in its own revision.  And
1775     # for simplicity's sake, if we're creating the new revision for
1776     # even one file, then we just do all such copies together in the
1777     # new revision.
1778     #
1779     # Doesn't that sound nice?
1780     #
1781     # Unfortunately, Subversion doesn't support copies with sources
1782     # in the current txn.  All copies must be based in committed
1783     # revisions.  Therefore, we generate the above-described new
1784     # revision unconditionally.
1785     #
1786     # This is a list of c_revs, and a c_rev is appended for each
1787     # default branch commit that will need to be copied to trunk (or
1788     # deleted from trunk) in some generated revision following the
1789     # "regular" revision.
1790     self.default_branch_cvs_revisions = [ ]
1791
1792   def __cmp__(self, other):
1793     # Commits should be sorted by t_max.  If both self and other have
1794     # the same t_max, break the tie using t_min, and lastly, digest
1795     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
1796             or cmp(self.digest, other.digest))
1797
1798   def has_file(self, fname):
1799     return self.files.has_key(fname)
1800
1801   def revisions(self):
1802     return self.changes + self.deletes
1803
1804   def opens_symbolic_name(self, name):
1805     """Returns true if any CVSRevision in this commit is on a tag or a
1806     branch or is the origin of a tag or branch."""
1807     for c_rev in self.revisions():
1808       if c_rev.opens_symbolic_name(name):
1809         return 1
1810     return 0
1811
1812   def add_revision(self, c_rev):
1813     # Record the time range of this commit.
1814     #
1815     # ### ISSUE: It's possible, though unlikely, that the time range
1816     # of a commit could get gradually expanded to be arbitrarily
1817     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
1818     # problem, and anyway deciding where to break it up would be a
1819     # judgement call.  For now, we just print a warning in commit() if
1820     # this happens.
1821     if c_rev.timestamp < self.t_min:
1822       self.t_min = c_rev.timestamp
1823     if c_rev.timestamp > self.t_max:
1824       self.t_max = c_rev.timestamp
1825
1826     if c_rev.op == OP_DELETE:
1827       self.deletes.append(c_rev)
1828     else:
1829       # OP_CHANGE or OP_ADD
1830       self.changes.append(c_rev)
1831
1832     self.files[c_rev.fname] = 1
1833
1834   def _pre_commit(self):
1835     """Generates any SVNCommits that must exist before the main
1836     commit."""
1837
1838     # There may be multiple c_revs in this commit that would cause
1839     # branch B to be filled, but we only want to fill B once.  On the
1840     # other hand, there might be multiple branches committed on in
1841     # this commit.  Whatever the case, we should count exactly one
1842     # commit per branch, because we only fill a branch once per
1843     # CVSCommit.  This list tracks which branches we've already
1844     # counted.
1845     accounted_for_sym_names = [ ]
1846
1847     def fill_needed(c_rev, pm):
1848       """Return 1 if this is the first commit on a new branch (for
1849       this file) and we need to fill the branch; else return 0
1850       (meaning that some other file's first commit on the branch has
1851       already done the fill for us).
1852
1853       If C_REV.op is OP_ADD, only return 1 if the branch that this
1854       commit is on has no last filled revision.
1855
1856       PM is a PersistenceManager to query.
1857       """
1858
1859       # Different '.' counts indicate that c_rev is now on a different
1860       # line of development (and may need a fill)
1861       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
1862         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
1863         # It should be the case that when we have a file F that
1864         # is added on branch B (thus, F on trunk is in state
1865         # 'dead'), we generate an SVNCommit to fill B iff the branch
1866         # has never been filled before.
1867         #
1868         # If this c_rev.op == OP_ADD, *and* the branch has never
1869         # been filled before, then fill it now.  Otherwise, no need to
1870         # fill it.
1871         if c_rev.op == OP_ADD:
1872           if pm.last_filled.get(c_rev.branch_name, None) is None:
1873             return 1
1874         else:
1875           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
1876             return 1
1877       return 0
1878
1879     for c_rev in self.changes + self.deletes:
1880       # If a commit is on a branch, we must ensure that the branch
1881       # path being committed exists (in HEAD of the Subversion
1882       # repository).  If it doesn't exist, we will need to fill the
1883       # branch.  After the fill, the path on which we're committing
1884       # will exist.
1885       if c_rev.branch_name \
1886           and c_rev.branch_name not in accounted_for_sym_names \
1887           and c_rev.branch_name not in self.done_symbols \
1888           and fill_needed(c_rev, Ctx()._persistence_manager):
1889         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
1890                                % c_rev.branch_name)
1891         svn_commit.set_symbolic_name(c_rev.branch_name)
1892         self.secondary_commits.append(svn_commit)
1893         accounted_for_sym_names.append(c_rev.branch_name)
1894
1895   def _commit(self):
1896     """Generates the primary SVNCommit that corresponds the this
1897     CVSCommit."""
1898     # Generate an SVNCommit unconditionally.  Even if the only change
1899     # in this CVSCommit is a deletion of an already-deleted file (that
1900     # is, a CVS revision in state 'dead' whose predecessor was also in
1901     # state 'dead'), the conversion will still generate a Subversion
1902     # revision containing the log message for the second dead
1903     # revision, because we don't want to lose that information.
1904     svn_commit = SVNCommit("commit")
1905     self.motivating_commit = svn_commit
1906
1907     for c_rev in self.changes:
1908       svn_commit.add_revision(c_rev)
1909       # Only make a change if we need to.  When 1.1.1.1 has an empty
1910       # deltatext, the explanation is almost always that we're looking
1911       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
1912       # such imports, CVS creates an RCS file where 1.1 has the
1913       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
1914       # content as 1.1.  There's no reason to reflect this non-change
1915       # in the repository, so we want to do nothing in this case.  (If
1916       # we were really paranoid, we could make sure 1.1's log message
1917       # is the CVS-generated "Initial revision\n", but I think the
1918       # conditions below are strict enough.)
1919       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
1920               and (c_rev.rev == "1.1.1.1")):
1921         if c_rev.is_default_branch_revision():
1922           self.default_branch_cvs_revisions.append(c_rev)
1923
1924     for c_rev in self.deletes:
1925       # When a file is added on a branch, CVS not only adds the file
1926       # on the branch, but generates a trunk revision (typically
1927       # 1.1) for that file in state 'dead'.  We only want to add
1928       # this revision if the log message is not the standard cvs
1929       # fabricated log message.
1930       if c_rev.prev_rev is None:
1931         # c_rev.branches may be empty if the originating branch
1932         # has been excluded.
1933         if not c_rev.branches:
1934           continue
1935         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
1936                              % (c_rev.filename(),
1937                                 c_rev.branches[0]))
1938         author, log_msg = \
1939             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
1940         if log_msg == cvs_generated_msg:
1941           continue
1942
1943       svn_commit.add_revision(c_rev)
1944       if c_rev.is_default_branch_revision():
1945         self.default_branch_cvs_revisions.append(c_rev)
1946
1947     # There is a slight chance that we didn't actually register any
1948     # CVSRevisions with our SVNCommit (see loop over self.deletes
1949     # above), so if we have no CVSRevisions, we don't flush the
1950     # svn_commit to disk and roll back our revnum.
1951     if len(svn_commit.cvs_revs) > 0:
1952       svn_commit.flush()
1953     else:
1954       # We will not be flushing this SVNCommit, so rollback the
1955       # SVNCommit revision counter.
1956       SVNCommit.revnum = SVNCommit.revnum - 1
1957
1958     if not Ctx().trunk_only:
1959       for c_rev in self.revisions():
1960         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
1961
1962   def _post_commit(self):
1963     """Generates any SVNCommits that we can perform now that _commit
1964     has happened.  That is, handle non-trunk default branches.
1965     Sometimes an RCS file has a non-trunk default branch, so a commit
1966     on that default branch would be visible in a default CVS checkout
1967     of HEAD.  If we don't copy that commit over to Subversion's trunk,
1968     then there will be no Subversion tree which corresponds to that
1969     CVS checkout.  Of course, in order to copy the path over, we may
1970     first need to delete the existing trunk there.  """
1971
1972     # Only generate a commit if we have default branch revs
1973     if len(self.default_branch_cvs_revisions):
1974       # Generate an SVNCommit for all of our default branch c_revs.
1975       svn_commit = SVNCommit("post-commit default branch(es)")
1976       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
1977       for c_rev in self.default_branch_cvs_revisions:
1978         svn_commit.add_revision(c_rev)
1979         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
1980                                                             svn_commit.revnum)
1981       self.secondary_commits.append(svn_commit)
1982
1983   def process_revisions(self, done_symbols):
1984     """Process all the CVSRevisions that this instance has, creating
1985     one or more SVNCommits in the process.  Generate fill SVNCommits
1986     only for symbols not in DONE_SYMBOLS (avoids unnecessary
1987     fills).
1988
1989     Return the primary SVNCommit that corresponds to this CVSCommit.
1990     The returned SVNCommit is the commit that motivated any other
1991     SVNCommits generated in this CVSCommit."""
1992     self.done_symbols = done_symbols
1993     seconds = self.t_max - self.t_min + 1
1994
1995     Log().write(LOG_VERBOSE, '-' * 60)
1996     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
1997     if seconds == 1:
1998       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
1999                   % time.ctime(self.t_max))
2000     else:
2001       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
2002       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
2003                   % (time.ctime(self.t_max), seconds))
2004
2005     if seconds > COMMIT_THRESHOLD + 1:
2006       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2007                   % (warning_prefix, COMMIT_THRESHOLD))
2008
2009     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2010       self._commit()
2011       return self.motivating_commit
2012
2013     self._pre_commit()
2014     self._commit()
2015     self._post_commit()
2016
2017     for svn_commit in self.secondary_commits:
2018       svn_commit.set_date(self.motivating_commit.get_date())
2019       svn_commit.flush()
2020
2021     return self.motivating_commit
2022
2023
2024 class SVNCommit:
2025   """This represents one commit to the Subversion Repository.  There
2026   are three types of SVNCommits:
2027
2028   1. Commits one or more CVSRevisions (cannot fill a symbolic name).
2029
2030   2. Creates or fills a symbolic name (cannot commit CVSRevisions).
2031
2032   3. Updates trunk to reflect the contents of a particular branch
2033      (this is to handle RCS default branches)."""
2034
2035   # The revision number to assign to the next new SVNCommit.
2036   # We start at 2 because SVNRepositoryMirror uses the first commit
2037   # to create trunk, tags, and branches.
2038   revnum = 2
2039
2040   class SVNCommitInternalInconsistencyError(Exception):
2041     """Exception raised if we encounter an impossible state in the
2042     SVNCommit Databases."""
2043     pass
2044
2045   def __init__(self, description="", revnum=None, cvs_revs=None):
2046     """Instantiate an SVNCommit.  DESCRIPTION is for debugging only.
2047     If REVNUM, the SVNCommit will correspond to that revision number;
2048     and if CVS_REVS, then they must be the exact set of CVSRevisions for
2049     REVNUM.
2050
2051     It is an error to pass CVS_REVS without REVNUM, but you may pass
2052     REVNUM without CVS_REVS, and then add a revision at a time by
2053     invoking add_revision()."""
2054     self._description = description
2055
2056     # Revprop metadata for this commit.
2057     #
2058     # These initial values are placeholders.  At least the log and the
2059     # date should be different by the time these are used.
2060     #
2061     # They are private because their values should be returned encoded
2062     # in UTF8, but callers aren't required to set them in UTF8.
2063     # Therefore, accessor methods are used to set them, and
2064     # self.get_revprops() is used to to get them, in dictionary form.
2065     self._author = Ctx().username
2066     self._log_msg = "This log message means an SVNCommit was used too soon."
2067     self._max_date = 0  # Latest date seen so far.
2068
2069     self.cvs_revs = cvs_revs or []
2070     if revnum:
2071       self.revnum = revnum
2072     else:
2073       self.revnum = SVNCommit.revnum
2074       SVNCommit.revnum = SVNCommit.revnum + 1
2075
2076     # The symbolic name that is filled in this SVNCommit, if any
2077     self.symbolic_name = None
2078
2079     # If this commit is a default branch synchronization, this
2080     # variable represents the subversion revision number of the
2081     # *primary* commit where the default branch changes actually
2082     # happened.  It is None otherwise.
2083     #
2084     # It is possible for multiple for multiple synchronization commits
2085     # to refer to the same motivating commit revision number, and it
2086     # is possible for a single synchronization commit to contain
2087     # CVSRevisions on multiple different default branches.
2088     self.motivating_revnum = None
2089
2090     # is_tag is true only if this commit is a fill of a symbolic name
2091     # that is a tag, None in all other cases.
2092     self.is_tag = None
2093
2094   def set_symbolic_name(self, name):
2095     "Set self.symbolic_name to NAME."
2096     name = _clean_symbolic_name(name)
2097     self.symbolic_name = name
2098
2099   def set_motivating_revnum(self, revnum):
2100     "Set self.motivating_revnum to REVNUM."
2101     self.motivating_revnum = revnum
2102
2103   def set_author(self, author):
2104     """Set this SVNCommit's author to AUTHOR (a locally-encoded string).
2105     This is the only way to set an SVNCommit's author."""
2106     self._author = author
2107
2108   def set_log_msg(self, msg):
2109     """Set this SVNCommit's log message to MSG (a locally-encoded string).
2110     This is the only way to set an SVNCommit's log message."""
2111     self._log_msg = msg
2112
2113   def set_date(self, date):
2114     """Set this SVNCommit's date to DATE (an integer).
2115     Note that self.add_revision() updates this automatically based on
2116     a CVSRevision; so you may not need to call this at all, and even
2117     if you do, the value may be overwritten by a later call to
2118     self.add_revision()."""
2119     self._max_date = date
2120
2121   def get_date(self):
2122     """Returns this SVNCommit's date as an integer."""
2123     return self._max_date
2124
2125   def get_revprops(self):
2126     """Return the Subversion revprops for this SVNCommit."""
2127     date = format_date(self._max_date)
2128     try:
2129       ### FIXME: The 'replace' behavior should be an option, like
2130       ### --encoding is.
2131       utf8_author = None
2132       if self._author is not None:
2133         unicode_author = unicode(self._author, Ctx().encoding, 'replace')
2134         utf8_author = unicode_author.encode('utf8')
2135       unicode_log = unicode(self.get_log_msg(), Ctx().encoding, 'replace')
2136       utf8_log = unicode_log.encode('utf8')
2137       return { 'svn:author' : utf8_author,
2138                'svn:log'    : utf8_log,
2139                'svn:date'   : date }
2140     except UnicodeError:
2141       Log().write(LOG_WARN, '%s: problem encoding author or log message:'
2142                   % warning_prefix)
2143       Log().write(LOG_WARN, "  author: '%s'" % self._author)
2144       Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
2145       Log().write(LOG_WARN, "  date:   '%s'" % date)
2146       Log().write(LOG_WARN, "(subversion rev %s)  Related files:" % self.revnum)
2147       for c_rev in self.cvs_revs:
2148         Log().write(LOG_WARN, " ", c_rev.fname)
2149
2150       Log().write(LOG_WARN, "Consider rerunning with (for example)",
2151                   "'--encoding=latin1'.\n")
2152       # It's better to fall back to the original (unknown encoding) data
2153       # than to either 1) quit or 2) record nothing at all.
2154       return { 'svn:author' : self._author,
2155                'svn:log'    : self.get_log_msg(),
2156                'svn:date'   : date }
2157
2158   def add_revision(self, cvs_rev):
2159     self.cvs_revs.append(cvs_rev)
2160     if cvs_rev.timestamp > self._max_date:
2161       self._max_date = cvs_rev.timestamp
2162
2163   def _is_primary_commit(self):
2164     """Return true if this is a primary SVNCommit, false otherwise."""
2165     return not (self.symbolic_name or self.motivating_revnum)
2166
2167   def flush(self):
2168     Log().write(LOG_NORMAL, "Creating Subversion commit %d (%s)"
2169                 % (self.revnum, self._description))
2170     Ctx()._persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)
2171
2172     if self.motivating_revnum is not None:
2173       Ctx()._persistence_manager.set_motivating_revnum(self.revnum,
2174                                                        self.motivating_revnum)
2175
2176     # If we're not a primary commit, then store our date and/or our
2177     # symbolic_name
2178     if not self._is_primary_commit():
2179       Ctx()._persistence_manager.set_name_and_date(self.revnum,
2180                                                    self.symbolic_name,
2181                                                    self._max_date)
2182
2183   def __str__(self):
2184     """ Print a human-readable description of this SVNCommit.  This
2185     description is not intended to be machine-parseable (although
2186     we're not going to stop you if you try!)"""
2187
2188     ret = "SVNCommit #: " + str(self.revnum) + "\n"
2189     if self.symbolic_name:
2190       ret = ret + "   symbolic name: " +  self.symbolic_name + "\n"
2191     else:
2192       ret = ret + "   NO symbolic name\n"
2193     ret = ret + "   debug description: " + self._description + "\n"
2194     ret = ret + "   cvs_revs:\n"
2195     for c_rev in self.cvs_revs:
2196       ret = ret + "     " + c_rev.unique_key() + "\n"
2197     return ret
2198
2199   def get_log_msg(self):
2200     """Returns the actual log message for a primary commit, and the
2201     appropriate manufactured log message for a secondary commit."""
2202     if self.symbolic_name is not None:
2203       return self._log_msg_for_symbolic_name_commit()
2204     elif self.motivating_revnum is not None:
2205       return self._log_msg_for_default_branch_commit()
2206     else:
2207       return self._log_msg
2208
2209   def _log_msg_for_symbolic_name_commit(self):
2210     """Creates a log message for a manufactured commit that fills
2211     self.symbolic_name.  If self.is_tag is true, write the log message
2212     as though for a tag, else write it as though for a branch."""
2213     type = 'branch'
2214     if self.is_tag:
2215       type = 'tag'
2216
2217     # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
2218     space_or_newline = ' '
2219     if len(self.symbolic_name) >= 13:
2220       space_or_newline = '\n'
2221
2222     return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
2223            % (type, space_or_newline, self.symbolic_name)
2224
2225   def _log_msg_for_default_branch_commit(self):
2226     """Creates a log message for a manufactured commit that
2227     synchronizes a non-trunk default branch with trunk."""
2228     msg = 'This commit was generated by cvs2svn to compensate for '     \
2229           'changes in r%d,\n'                                           \
2230           'which included commits to RCS files with non-trunk default ' \
2231           'branches.\n' % self.motivating_revnum
2232     return msg
2233
class CVSRevisionAggregator:
  """Collects incoming CVSRevisions into CVSCommits, each of which
  represents at least one SVNCommit, and processes those CVSCommits
  once no further CVSRevision can join them."""
  def __init__(self):
    """Open the databases that are read during aggregation and install
    the symbolings logger, persistence manager and default branches
    database on Ctx()."""
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), DB_OPEN_READ)

    # CVSCommits that are still pending, keyed by digest (or by a
    # digest with '-' characters appended once a commit has been
    # closed to further revisions; see process_revision()).
    self.cvs_commits = {}

    # Symbols waiting to be closed.  Used as a set: the values are
    # always None.
    self.pending_symbols = {}

    # A list of symbols for which we've already encountered the last
    # CVSRevision that is a source for that symbol.  That is, the
    # final fill for this symbol has been done, and we never need to
    # fill it again.
    self.done_symbols = [ ]

    # The most recently created primary svn_commit object.  It is kept
    # merely for its date, which is given to the SVNCommits created in
    # self.attempt_to_commit_symbols().
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = Database(temp(DEFAULT_BRANCHES_DB),
                                          DB_OPEN_READ)


  def process_revision(self, c_rev):
    """Add C_REV to the pending CVSCommit that matches its digest,
    after first processing every pending CVSCommit that C_REV is too
    recent to belong to."""
    # Each time a new revision arrives, scan the commits accumulated
    # so far to see which of them are ready for processing now.
    ready_queue = [ ]
    for digest_key, pending_commit in self.cvs_commits.items():
      if pending_commit.t_max + COMMIT_THRESHOLD < c_rev.timestamp:
        ready_queue.append(pending_commit)
        del self.cvs_commits[digest_key]
      elif pending_commit.has_file(c_rev.fname):
        # The inbound revision is on the same file as a pending
        # commit, so close the pending commit to further changes.
        # Don't flush it though, as there may be other pending commits
        # dated before this one.
        # ### ISSUE: the has_file() check above is not optimal.
        # It does fix the dataloss bug where revisions would get lost
        # if checked in too quickly, but it can also break apart the
        # commits.  The correct fix would require tracking the
        # dependencies between change sets and committing them in
        # proper order.
        #
        # Closing is done by re-filing the commit under a key that is
        # not already in use in the self.cvs_commits dict.
        retired_key = digest_key + '-'
        while self.cvs_commits.has_key(retired_key):
          retired_key = retired_key + '-'
        self.cvs_commits[retired_key] = pending_commit
        del self.cvs_commits[digest_key]

    # Add this revision to the still-open commit for its digest,
    # creating that commit first if necessary.
    digest = c_rev.digest
    if not self.cvs_commits.has_key(digest):
      author, log = self.metadata_db[digest]
      self.cvs_commits[digest] = CVSCommit(digest, author, log)
    self.cvs_commits[digest].add_revision(c_rev)

    # Whatever is in the ready_queue at this point needs to be
    # processed, because this latest rev couldn't possibly be part of
    # any of it.  Process the commits in sorted (time) order.
    ready_queue.sort()

    # Make sure we attempt_to_commit_symbols for this c_rev, even if
    # no commits are ready.
    if not ready_queue:
      self.attempt_to_commit_symbols(ready_queue, c_rev)

    while ready_queue:
      self.latest_primary_svn_commit \
          = ready_queue[0].process_revisions(self.done_symbols)
      del ready_queue[0]
      self.attempt_to_commit_symbols(ready_queue, c_rev)

  def flush(self):
    """Process everything that is left in self.cvs_commits, in sorted
    order.  Then, unless this is a trunk-only conversion, close the
    SymbolingsLogger."""
    remaining = [ (cvs_commit, digest_key)
                  for digest_key, cvs_commit in self.cvs_commits.items() ]
    remaining.sort()

    for cvs_commit, digest_key in remaining:
      self.latest_primary_svn_commit = \
        cvs_commit.process_revisions(self.done_symbols)
      del self.cvs_commits[digest_key]
      self.attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def attempt_to_commit_symbols(self, queued_commits, c_rev=None):
    """
    Generate one SVNCommit for each symbol in self.pending_symbols
    that no longer has an opening CVSRevision in either QUEUED_COMMITS
    or self.cvs_commits.values().

    If C_REV is not None (and this is not a trunk-only conversion),
    first add to self.pending_symbols every symbol for which C_REV is
    the last source CVSRevision.
    """
    # Pick up the symbolic names that this c_rev is the last *source*
    # CVSRevision for, and add them to those left over from previous
    # passes through the aggregator.
    if c_rev and not Ctx().trunk_only:
      for sym in self.last_revs_db.get(c_rev.unique_key(), []):
        self.pending_symbols[sym] = None

    # Determine which pending symbols still have *source* CVSRevisions
    # in a commit that has not been processed yet.
    unprocessed_commits = self.cvs_commits.values() + queued_commits
    open_symbols = {}
    for sym in self.pending_symbols.keys():
      for cvs_commit in unprocessed_commits:
        if cvs_commit.opens_symbolic_name(sym):
          open_symbols[sym] = None
          break

    # Walk the pending symbols in sorted order, so that they are
    # always processed in the same order regardless of the order in
    # which the dict hashing algorithm hands them back to us.  This
    # gives the tests the same results on all platforms.
    symbols_in_order = self.pending_symbols.keys()
    symbols_in_order.sort()
    for sym in symbols_in_order:
      if not open_symbols.has_key(sym):
        # Nothing can open sym any more, so close it.
        svn_commit = SVNCommit("closing tag/branch '%s'" % sym)
        svn_commit.set_symbolic_name(sym)
        svn_commit.set_date(self.latest_primary_svn_commit.get_date())
        svn_commit.flush()
        self.done_symbols.append(sym)
        del self.pending_symbols[sym]
2374
2375
class SymbolingsReader:
  """Interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  Does the heavy lifting of finding and returning
  the correct opening and closing Subversion revision numbers for a
  given symbolic name."""
  def __init__(self):
    """Open SYMBOL_OPENINGS_CLOSINGS_SORTED for reading, and load the
    whole offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets_db is really small, and we need to read and write
    # from it a fair bit, so keep a copy of it in a plain dictionary.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    self.offsets = { }
    for symbol in offsets_db.db.keys():
      self.offsets[symbol] = offsets_db[symbol]

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Return a new SymbolicNameFillingGuide for SYMBOLIC_NAME, holding
    the openings and closings recorded for it up to and including
    SVN_REVNUM.

    Note that if we encounter an opening rev in this fill, but the
    corresponding closing rev takes place later than SVN_REVNUM, the
    closing will not be passed to SymbolicNameFillingGuide in this
    fill (and will be discarded when encountered in a later fill).
    This is perfectly fine, because we can still do a valid fill
    without the closing--we always try to fill what we can as soon as
    we can."""
    symbol_fill = SymbolicNameFillingGuide(symbolic_name)

    # It's possible to have a branch start with a file that was added
    # on a branch; such a symbol has no offset, and its guide stays
    # empty.
    if not self.offsets.has_key(symbolic_name):
      return symbol_fill

    # Resume reading where the previous fill of this symbol stopped.
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      line_start = self.symbolings.tell()
      line = self.symbolings.readline().rstrip()
      if not line:
        break
      name, revnum, kind, svn_path = line.split(" ", 3)
      revnum = int(revnum)
      # Stop at the first line that belongs to another symbol or to a
      # revision that lies beyond this fill.
      if name != symbolic_name or revnum > svn_revnum:
        break
      symbol_fill.register(svn_path, revnum, kind)

    # If anything we read was used, remember the offset of the
    # beginning of the line at which we stopped.
    if not symbol_fill.is_empty():
      self.offsets[symbolic_name] = line_start

    symbol_fill.make_node_tree()
    return symbol_fill
2433
2434
class SymbolicNameFillingGuide:
  """A SymbolicNameFillingGuide is essentially a node tree
  representing the source paths to be copied to fill
  self.name in the current SVNCommit.

  After calling self.register() on a series of openings and closings,
  call self.make_node_tree() to prepare self.node_tree for
  examination.  See the docstring for self.make_node_tree() for
  details on the structure of self.node_tree.

  By walking self.node_tree and calling self.get_best_revnum() on each
  node, the caller can determine what subversion revision number to
  copy the path corresponding to that node from.  self.node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""
  def __init__(self, symbolic_name):
    """Initialize a SymbolicNameFillingGuide for SYMBOLIC_NAME and
    prepare it for receiving openings and closings."""
    self.name = symbolic_name

    # Keys under which a leaf node stores its opening and closing
    # revision.  They start with '/' so that they can never collide
    # with a path component.
    self.opening_key = "/o"
    self.closing_key = "/c"

    # A dictionary of SVN_PATHS and SVN_REVNUMS whose format is:
    #
    # { svn_path : { self.opening_key : svn_revnum,
    #                self.closing_key : svn_revnum }
    #                ...}
    self.things = { }

    # The key for the root node of the node tree
    self.root_key = '0'
    # The dictionary that holds our node tree, seeded with the root key.
    self.node_tree = { self.root_key : { } }

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE (a key of
    self.node_tree).

    Returns a tuple (REVNUM, MAX_SCORE) as computed by
    self._best_rev(), to which PREFERRED_REVNUM is passed on.

    If no usable revision is found, an error message is written to
    stderr and the program exits with status 1."""
    # Aggregate openings and closings from the rev tree
    openings = self._list_revnums_for_key(node, self.opening_key)
    closings = self._list_revnums_for_key(node, self.closing_key)

    # Score the lists
    scores = self._score_revisions(self._sum_revnum_counts(openings),
                                   self._sum_revnum_counts(closings))

    revnum, max_score = self._best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      # Report the symbol being filled.  (Note that % binds tighter
      # than +, so only the second literal is formatted.)
      sys.stderr.write(error_prefix + ": failed to find a revision "
                       + "to copy from when copying %s\n" % self.name)
      sys.exit(1)
    return revnum, max_score


  def _best_rev(self, scores, preferred_rev):
    """Return a tuple (REV, MAX_SCORE), where MAX_SCORE is the highest
    score in SCORES, a list returned by _score_revisions(), and REV is
    the revision to copy from.  When the maximum score is shared by
    multiple revisions, the oldest revision is selected, unless
    PREFERRED_REV is one of the possibilities, in which case, it is
    selected.

    REV is SVN_INVALID_REVNUM if no entry in SCORES has a score
    greater than zero (and PREFERRED_REV does not apply)."""
    max_score = 0
    preferred_rev_score = -1
    rev = SVN_INVALID_REVNUM
    if preferred_rev is None:
      # Comparison order of different types is arbitrary. Do not
      # expect None to compare less than int values below.
      # In Python 2.3 None compares with ints like negative infinity.
      # In Python 2.0 None compares with ints like positive infinity.
      preferred_rev = SVN_INVALID_REVNUM
    for revnum, count in scores:
      if count > max_score:
        max_score = count
        rev = revnum
      # The score in effect at PREFERRED_REV is that of the last entry
      # at or before it.
      if revnum <= preferred_rev:
        preferred_rev_score = count
    if preferred_rev_score == max_score:
      rev = preferred_rev
    return rev, max_score


  def _score_revisions(self, openings, closings):
    """Return a list of revisions and scores based on OPENINGS and
    CLOSINGS.  The returned list looks like:

       [(REV1, SCORE1), (REV2, SCORE2), ...]

    where REV2 > REV1.  OPENINGS and CLOSINGS are lists of
    (REVNUM, COUNT) tuples as returned by _sum_revnum_counts(), built
    from the self.opening_key and self.closing_key values at or under
    some file or directory node; CLOSINGS may also be None.

    Each score indicates that copying the corresponding revision (or
    any following revision up to the next revision in the list) of the
    object in question would yield that many correct paths at or
    underneath the object.  There may be other paths underneath it
    which are not correct and would need to be deleted or recopied;
    those can only be detected by descending and examining their
    scores.

    If OPENINGS is false, return the empty list."""
    # First look for easy outs.
    if not openings:
      return []

    # Must be able to iterate over closings below.
    if closings is None:
      closings = []

    # Each opening raises the score of its revision and of every later
    # revision.
    scores = []
    opening_score_accum = 0
    for opening_rev, opening_score in openings:
      opening_score_accum = opening_score_accum + opening_score
      scores.append((opening_rev, opening_score_accum))

    # Each closing lowers the score of every entry at or after its
    # revision, and gets an entry of its own if it has none yet.
    # Entries before FIRST_CANDIDATE are known to precede all closings
    # still to come.
    first_candidate = 0
    for closing_rev, closing_score in closings:
      done_exact_rev = None
      insert_index = None
      insert_score = None
      for j in range(first_candidate, len(scores)):
        score_rev, score = scores[j]
        if score_rev >= closing_rev:
          if not done_exact_rev:
            if score_rev > closing_rev:
              # No entry exists for closing_rev itself; remember where
              # one has to be inserted once this scan is over.
              insert_index = j
              insert_score = scores[j-1][1] - closing_score
            done_exact_rev = 1
          scores[j] = (score_rev, score - closing_score)
        else:
          first_candidate = j + 1
      if not done_exact_rev:
        # The closing is later than every entry so far.
        scores.append((closing_rev, scores[-1][1] - closing_score))
      if insert_index is not None:
        scores.insert(insert_index, (closing_rev, insert_score))
    return scores

  def _sum_revnum_counts(self, rev_list):
    """Takes an array of revisions (REV_LIST), for example:

      [21, 18, 6, 49, 39, 24, 24, 24, 24, 24, 24, 24]

    and adds up every occurrence of each revision and returns a sorted
    array of tuples containing (svn_revnum, count):

      [(6, 1), (18, 1), (21, 1), (24, 7), (39, 1), (49, 1)]
    """
    counts = {}
    for revnum in rev_list:
      counts[revnum] = counts.get(revnum, 0) + 1
    summed = list(counts.items())
    summed.sort()
    return summed

  def _list_revnums_for_key(self, node, revnum_type_key):
    """Scan self.node_tree and return a list of all the revision
    numbers (including duplicates) contained in REVNUM_TYPE_KEY values
    for all leaf nodes at and under NODE.

    REVNUM_TYPE_KEY should be either self.opening_key or
    self.closing_key."""
    node_contents = self.node_tree[node]

    # If the node has self.opening_key, it must be a leaf node--all
    # leaf nodes have at least an opening key (although they may not
    # have a closing key).  Fetch the revnum and return.
    if (node_contents.has_key(self.opening_key) and
        node_contents.has_key(revnum_type_key)):
      return [ node_contents[revnum_type_key] ]

    revnums = []
    for key, child_key in node_contents.items():
      # Keys starting with '/' hold revision numbers, not children.
      if key[:1] == '/':
        continue
      revnums = revnums + \
          self._list_revnums_for_key(child_key, revnum_type_key)
    return revnums

  def register(self, svn_path, svn_revnum, type):
    """Collects opening and closing revisions for this
    SymbolicNameFillingGuide.  SVN_PATH is the source path that needs
    to be copied into self.name, and SVN_REVNUM is either the
    first svn revision number that we can copy from (our opening), or
    the last (not inclusive) svn revision number that we can copy from
    (our closing).  TYPE indicates whether this path is an opening or
    a closing.

    The opening for a given SVN_PATH must be passed before the closing
    for it to have any effect... any closing encountered before a
    corresponding opening will be discarded.

    It is not necessary to pass a corresponding closing for every
    opening.
    """
    if type == OPENING:
      # Always log an OPENING.  (This replaces whatever was registered
      # for the path before.)
      self.things[svn_path] = {self.opening_key: svn_revnum}
    elif type == CLOSING:
      # Only log a closing if we've already registered the opening for
      # that path.
      revnums = self.things.get(svn_path)
      if revnums is not None:
        # When we have a non-trunk default branch, we may have
        # multiple closings--only register the first closing we
        # encounter.
        revnums.setdefault(self.closing_key, svn_revnum)

  def make_node_tree(self):
    """Generates the SymbolicNameFillingGuide's node tree from
    self.things.  Each leaf node maps self.opening_key to the earliest
    subversion revision from which this node/path may be copied; and
    optionally maps self.closing_key to the subversion revision one
    higher than the last revision from which this node/path may be
    copied.  Intermediate nodes map path components to the keys of
    their child nodes, and never contain opening or closing flags."""
    for svn_path, open_close in self.things.items():
      parent_key = self.root_key

      # Walk down the path, one node at a time.
      components = svn_path.split('/')
      last_index = len(components) - 1
      for i in range(len(components)):
        component = components[i]

        if self.node_tree[parent_key].has_key(component):
          child_key = self.node_tree[parent_key][component]
        else:
          child_key = gen_key()
          self.node_tree[child_key] = { }
          self.node_tree[parent_key][component] = child_key

        # If this is the leaf, add the openings and closings.  The
        # leaf is recognized by its position: comparing the component
        # with the last one (by value or by identity) would also match
        # an intermediate directory that merely has the same name.
        if i == last_index:
          self.node_tree[child_key] = open_close
        parent_key = child_key

  def is_empty(self):
    """Return true if we haven't accumulated any openings or closings,
    false otherwise."""
    return not self.things
2692
2693
class FillSource:
  """Representation of a fill source used by the symbol filler in
  SVNRepositoryMirror."""
  def __init__(self, prefix, key):
    """Create an unscored fill source with a PREFIX and a KEY.  Both
    self.score and self.revnum stay None until set_score() is
    called."""
    self.prefix = prefix
    self.key = key
    self.score = None
    self.revnum = None

  def set_score(self, score, revnum):
    """Set the SCORE and REVNUM."""
    self.score = score
    self.revnum = revnum

  def __cmp__(self, other):
    """Comparison operator used to sort FillSources in descending
    score order.

    Raises TypeError if either FillSource has not been scored yet."""
    if self.score is None or other.score is None:
      raise TypeError('Tried to compare unscored FillSource')
    # The operands are swapped on purpose: a higher score sorts first.
    return cmp(other.score, self.score)
2715
2716
2717 class SVNRepositoryMirror:
2718   """Mirror a Subversion Repository as it is constructed, one
2719   SVNCommit at a time.  The mirror is skeletal; it does not contain
2720   file contents.  The creation of a dumpfile or Subversion repository
2721   is handled by delegates.  See self.add_delegate method for how to
2722   set delegates.
2723
2724   The structure of the repository is kept in two databases and one
2725   hash.  The revs_db database maps revisions to root node keys, and
2726   the nodes_db database maps node keys to nodes.  A node is a hash
2727   from directory names to keys.  Both the revs_db and the nodes_db are
2728   stored on disk and each access is expensive.
2729
2730   The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
2732   hash which is cheap to access.
2733
2734   You must invoke _start_commit between SVNCommits.
2735
2736   *** WARNING *** All path arguments to methods in this class CANNOT
2737       have leading or trailing slashes.
2738   """
2739
2740   class SVNRepositoryMirrorPathExistsError(Exception):
2741     """Exception raised if an attempt is made to add a path to the
2742     repository mirror and that path already exists in the youngest
2743     revision of the repository."""
2744     pass
2745
2746   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
2747     """Exception raised if a CVSRevision is found to have an unexpected
2748     operation (OP) value."""
2749     pass
2750
2751   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
2752     """Exception raised if an empty SymbolicNameFillingGuide is returned
2753     during a fill where the branch in question already exists."""
2754     pass
2755
2756   def __init__(self):
2757     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
2758     self.delegates = [ ]
2759
2760     # This corresponds to the 'revisions' table in a Subversion fs.
2761     self.revs_db = Database(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
2762     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
2763
2764     # This corresponds to the 'nodes' table in a Subversion fs.  (We
2765     # don't need a 'representations' or 'strings' table because we
2766     # only track metadata, not file contents.)
2767     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
2768     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
2769
2770     # Start at revision 0 without a root node.  It will be created
2771     # by _open_writable_root_node.
2772     self.youngest = 0
2773     self.new_root_key = None
2774     self.new_nodes = { }
2775
2776     if not Ctx().trunk_only:
2777       ###PERF IMPT: Suck this into memory.
2778       self.tags_db = TagsDatabase(DB_OPEN_READ)
2779       self.symbolings_reader = SymbolingsReader()
2780
2781   def _initialize_repository(self, date):
2782     """Initialize the repository by creating the directories for
2783     trunk, tags, and branches.  This method should only be called
2784     after all delegates are added to the repository mirror."""
2785     # Make a 'fake' SVNCommit so we can take advantage of the revprops
2786     # magic therein
2787     svn_commit = SVNCommit("Initialization", 1)
2788     svn_commit.set_date(date)
2789     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
2790
2791     self._start_commit(svn_commit)
2792     self._mkdir(Ctx().trunk_base)
2793     if not Ctx().trunk_only:
2794       self._mkdir(Ctx().branches_base)
2795       self._mkdir(Ctx().tags_base)
2796
2797   def _start_commit(self, svn_commit):
2798     """Start a new commit."""
2799     if self.youngest > 0:
2800       self._end_commit()
2801
2802     self.youngest = svn_commit.revnum
2803     self.new_root_key = None
2804     self.new_nodes = { }
2805
2806     self._invoke_delegates('start_commit', svn_commit)
2807
2808   def _end_commit(self):
2809     """Called at the end of each commit.  This method copies the newly
2810     created nodes to the on-disk nodes db."""
2811     if self.new_root_key is None:
2812       # No changes were made in this revision, so we make the root node
2813       # of the new revision be the same as the last one.
2814       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
2815     else:
2816       self.revs_db[str(self.youngest)] = self.new_root_key
2817       # Copy the new nodes to the nodes_db
2818       for key, value in self.new_nodes.items():
2819         self.nodes_db[key] = value
2820
2821   def _get_node(self, key):
2822     """Returns the node contents for KEY which may refer to either
2823     self.nodes_db or self.new_nodes."""
2824     if self.new_nodes.has_key(key):
2825       return self.new_nodes[key]
2826     else:
2827       return self.nodes_db[key]
2828
2829   def _open_readonly_node(self, path, revnum):
2830     """Open a readonly node for PATH at revision REVNUM.  Returns the
2831     node key and node contents if the path exists, else (None, None)."""
2832     # Get the root key
2833     if revnum == self.youngest:
2834       if self.new_root_key is None:
2835         node_key = self.revs_db[str(self.youngest - 1)]
2836       else:
2837         node_key = self.new_root_key
2838     else:
2839       node_key = self.revs_db[str(revnum)]
2840
2841     for component in path.split('/'):
2842       node_contents = self._get_node(node_key)
2843       if not node_contents.has_key(component):
2844         return None
2845       node_key = node_contents[component]
2846
2847     return node_key
2848
2849   def _open_writable_root_node(self):
2850     """Open a writable root node.  The current root node is returned
2851     immeditely if it is already writable.  If not, create a new one by
2852     copying the contents of the root node of the previous version."""
2853     if self.new_root_key is not None:
2854       return self.new_root_key, self.new_nodes[self.new_root_key]
2855
2856     if self.youngest < 2:
2857       new_contents = { }
2858     else:
2859       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
2860     self.new_root_key = gen_key()
2861     self.new_nodes = { self.new_root_key: new_contents }
2862
2863     return self.new_root_key, new_contents
2864
2865   def _open_writable_node(self, svn_path, create):
2866     """Open a writable node for the path SVN_PATH, creating SVN_PATH
2867     and any missing directories if CREATE is True."""
2868     parent_key, parent_contents = self._open_writable_root_node()
2869
2870     # Walk up the path, one node at a time.
2871     path_so_far = None
2872     components = svn_path.split('/')
2873     for i in range(len(components)):
2874       component = components[i]
2875       this_key = this_contents = None
2876       path_so_far = _path_join(path_so_far, component)
2877       if parent_contents.has_key(component):
2878         # The component exists.
2879         this_key = parent_contents[component]
2880         if self.new_nodes.has_key(this_key):
2881           this_contents = self.new_nodes[this_key]
2882         else:
2883           # Suck the node from the nodes_db, but update the key
2884           this_contents = self.nodes_db[this_key]
2885           this_key = gen_key()
2886           self.new_nodes[this_key] = this_contents
2887           parent_contents[component] = this_key
2888       elif create:
2889         # The component does not exists, so we create it.
2890         this_contents = { }
2891         this_key = gen_key()
2892         self.new_nodes[this_key] = this_contents
2893         parent_contents[component] = this_key
2894         if i < len(components) - 1:
2895           self._invoke_delegates('mkdir', path_so_far)
2896       else:
2897         # The component does not exists and we are not instructed to
2898         # create it, so we give up.
2899         return None, None
2900
2901       parent_key = this_key
2902       parent_contents = this_contents
2903
2904     return this_key, this_contents
2905
2906   def _path_exists(self, path):
2907     """If PATH exists in self.youngest of the svn repository mirror,
2908     return true, else return None.
2909
2910     PATH must not start with '/'."""
2911     return self._open_readonly_node(path, self.youngest) is not None
2912
2913   def _fast_delete_path(self, parent_path, parent_contents, component):
2914     """Delete COMPONENT from the parent direcory PARENT_PATH with the
2915     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
2916     in PARENT_CONTENTS."""
2917     if parent_contents.has_key(component):
2918       del parent_contents[component]
2919       self._invoke_delegates('delete_path', _path_join(parent_path, component))
2920
2921   def _delete_path(self, svn_path, should_prune=False):
2922     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
2923     all ancestor directories that are made empty when SVN_PATH is deleted.
2924     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
2925
2926     NOTE: This function does *not* allow you delete top-level entries
2927     (like /trunk, /branches, /tags), nor does it prune upwards beyond
2928     those entries."""
2929     pos = svn_path.rfind('/')
2930     parent_path = svn_path[:pos]
2931     entry = svn_path[pos+1:]
2932     parent_key, parent_contents = self._open_writable_node(parent_path, False)
2933     if parent_key is not None:
2934       self._fast_delete_path(parent_path, parent_contents, entry)
2935       # The following recursion makes pruning an O(n^2) operation in the
2936       # worst case (where n is the depth of SVN_PATH), but the worst case
2937       # is probably rare, and the constant cost is pretty low.  Another
2938       # drawback is that we issue a delete for each path and not just
2939       # a single delete for the topmost directory pruned.
2940       if (should_prune and len(parent_contents) == 0 and
2941           parent_path.find('/') != -1):
2942         self._delete_path(parent_path, True)
2943
2944   def _mkdir(self, path):
2945     """Create PATH in the repository mirror at the youngest revision."""
2946     self._open_writable_node(path, True)
2947     self._invoke_delegates('mkdir', path)
2948
2949   def _change_path(self, cvs_rev):
2950     """Register a change in self.youngest for the CVS_REV's svn_path
2951     in the repository mirror."""
2952     # We do not have to update the nodes because our mirror is only
2953     # concerned with the presence or absence of paths, and a file
2954     # content change does not cause any path changes.
2955     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, 0))
2956
2957   def _add_path(self, cvs_rev):
2958     """Add the CVS_REV's svn_path to the repository mirror."""
2959     self._open_writable_node(cvs_rev.svn_path, True)
2960     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, 1))
2961
2962   def _copy_path(self, src_path, dest_path, src_revnum):
2963     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
2964     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
2965     parent *must* exist, but DEST_PATH *cannot* exist.
2966
2967     Return the node key and the contents of the new node at DEST_PATH
2968     as a dictionary."""
2969     # get the contents of the node of our src_path
2970     src_key = self._open_readonly_node(src_path, src_revnum)
2971     src_contents = self._get_node(src_key)
2972
2973     # Get the parent path and the base path of the dest_path
2974     pos = dest_path.rindex('/')
2975     dest_parent = dest_path[:pos]
2976     dest_basename = dest_path[pos+1:]
2977     dest_parent_key, dest_parent_contents = \
2978                    self._open_writable_node(dest_parent, False)
2979
2980     if dest_parent_contents.has_key(dest_basename):
2981       msg = "Attempt to add path '%s' to repository mirror " % dest_path
2982       msg = msg + "when it already exists in the mirror."
2983       raise self.SVNRepositoryMirrorPathExistsError, msg
2984
2985     dest_parent_contents[dest_basename] = src_key
2986     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
2987
2988     # Yes sir, src_key and src_contents are also the contents of the
2989     # destination.  This is a cheap copy, remember!  :-)
2990     return src_key, src_contents
2991
2992   def _fill_symbolic_name(self, svn_commit):
2993     """Performs all copies necessary to create as much of the the tag
2994     or branch SVN_COMMIT.symbolic_name as possible given the current
2995     revision of the repository mirror.
2996
2997     The symbolic name is guaranteed to exist in the Subversion
2998     repository by the end of this call, even if there are no paths
2999     under it."""
3000     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3001       svn_commit.symbolic_name, self.youngest)
3002
3003     # Create the list of sources for the symbolic name.  All source
3004     # prefixes must be direct sources for the destination, i.e. we
3005     # must have 'trunk' and 'branches/my_branch' and not just
3006     # 'branches'.
3007     sources = []
3008     for entry, key in symbol_fill.node_tree[symbol_fill.root_key].items():
3009       if entry == Ctx().trunk_base:
3010         sources.append(FillSource(entry, key))
3011       elif entry == Ctx().branches_base:
3012         for entry2, key2 in symbol_fill.node_tree[key].items():
3013           sources.append(FillSource(entry + '/' + entry2, key2))
3014       else:
3015         raise # Should never happen
3016     if self.tags_db.has_key(svn_commit.symbolic_name):
3017       dest_prefix = _path_join(Ctx().tags_base, svn_commit.symbolic_name)
3018     else:
3019       dest_prefix = _path_join(Ctx().branches_base,
3020                                svn_commit.symbolic_name)
3021
3022     if sources:
3023       dest_key = self._open_writable_node(dest_prefix, False)[0]
3024       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3025     else:
3026       # We can only get here for a branch whose first commit is an add
3027       # (as opposed to a copy).
3028       dest_path = Ctx().branches_base + '/' + symbol_fill.name
3029       if not self._path_exists(dest_path):
3030         # If our symbol_fill was empty, that means that our first
3031         # commit on the branch was to a file added on the branch, and
3032         # that this is our first fill of that branch.
3033         #
3034         # This case is covered by test 16.
3035         #
3036         # ...we create the branch by copying trunk from the our
3037         # current revision number minus 1
3038         source_path = Ctx().trunk_base
3039         entries = self._copy_path(source_path, dest_path,
3040                                   svn_commit.revnum - 1)[1]
3041         # Now since we've just copied trunk to a branch that's
3042         # *supposed* to be empty, we delete any entries in the
3043         # copied directory.
3044         for entry in entries.keys():
3045           del_path = dest_path + '/' + entry
3046           # Delete but don't prune.
3047           self._delete_path(del_path)
3048       else:
3049         msg = "Error filling branch '" + symbol_fill.name + "'.\n"
3050         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3051         msg = msg + "attempted to create a branch that already exists."
3052         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3053
  def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
            path = None, parent_source_prefix = None,
            preferred_revnum = None, prune_ok = None):
    """Fill the tag or branch at DEST_PREFIX + PATH with items from
    SOURCES, and recurse into the child items.

    DEST_PREFIX is the prefix of the destination directory, e.g.
    '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
    FillSource classes that are candidates to be copied to the
    destination.  SOURCES is scored and sorted in place.  DEST_KEY is
    the key of the destination node (as accepted by self._get_node()),
    or None if the destination does not yet exist.

    PATH is the path relative to DEST_PREFIX.  If PATH is None, we
    are at the top level, e.g. '/tags/my_tag'.

    PARENT_SOURCE_PREFIX is the source prefix that was used to copy
    the parent directory, and PREFERRED_REVNUM is an int which is the
    source revision number that the caller (who may have copied this
    path's parent) used to perform its copy.  If PREFERRED_REVNUM is
    None, then no revision is preferable to any other (which probably
    means that no copies have happened yet).

    PRUNE_OK means that a copy has been made in this recursion, and
    it's safe to prune directories that are not in
    SYMBOL_FILL.node_tree, provided that said directory has a source
    prefix of one of the PARENT_SOURCE_PREFIX.

    PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
    should only be passed in by recursive calls."""
    # Calculate scores and revnums for all sources
    for source in sources:
      src_revnum, score = symbol_fill.get_best_revnum(source.key,
                                                      preferred_revnum)
      source.set_score(score, src_revnum)

    # Sort the sources in descending score order so that we will make
    # a eventual copy from the source with the highest score.
    # NOTE(review): the ordering is whatever FillSource's comparison
    # defines; that class is not visible here -- confirm it sorts the
    # best score first.
    sources.sort()
    copy_source = sources[0]

    src_path = _path_join(copy_source.prefix, path)
    dest_path = _path_join(dest_prefix, path)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    do_copy = 0
    if dest_key is None:
      # The destination does not exist yet, so it must be copied.
      do_copy = 1
    elif prune_ok and (parent_source_prefix != copy_source.prefix or
                       copy_source.revnum != preferred_revnum):
      # We are about to replace the destination, so we need to remove
      # it before we perform the copy.
      self._delete_path(dest_path)
      do_copy = 1

    if do_copy:
      dest_key, dest_entries = self._copy_path(src_path, dest_path,
                                               copy_source.revnum)
      # From here on (including in the recursive calls below) a copy
      # has been made, so pruning is allowed.
      prune_ok = 1
    else:
      dest_entries = self._get_node(dest_key)

    # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
    # elements and the values are lists of FillSource classes where
    # this path element exists.
    src_entries = {}
    for source in sources:
      for entry, key in symbol_fill.node_tree[source.key].items():
        if entry[0] == '/': # Skip flags
          continue
        if not src_entries.has_key(entry):
          src_entries[entry] = []
        src_entries[entry].append(FillSource(source.prefix, key))

    if prune_ok:
      # Delete the entries in DEST_ENTRIES that are not in src_entries.
      delete_list = [ ]
      for entry in dest_entries.keys():
        if not src_entries.has_key(entry):
          delete_list.append(entry)
      if delete_list:
        # The destination node was not created during this commit, so
        # open a writable version of it before removing entries.
        if not self.new_nodes.has_key(dest_key):
          dest_key, dest_entries = self._open_writable_node(dest_path, True)
        # Sort the delete list to get "diffable" dumpfiles.
        delete_list.sort()
        for entry in delete_list:
          self._fast_delete_path(dest_path, dest_entries, entry)

    # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
    src_keys = src_entries.keys()
    src_keys.sort()
    for src_key in src_keys:
      if dest_entries.has_key(src_key):
        next_dest_key = dest_entries[src_key]
      else:
        next_dest_key = None
      # sources[0] is still copy_source here, so the child is told
      # which source prefix and revision this level chose.
      self._fill(symbol_fill, dest_prefix, next_dest_key,
                 src_entries[src_key], _path_join(path, src_key),
                 copy_source.prefix, sources[0].revnum, prune_ok)
3153
3154   def _synchronize_default_branch(self, svn_commit):
3155     """Propagate any changes that happened on a non-trunk default
3156     branch to the trunk of the repository.  See
3157     CVSCommit._post_commit() for details on why this is necessary."""
3158     for cvs_rev in svn_commit.cvs_revs:
3159       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3160         if self._path_exists(cvs_rev.svn_trunk_path):
3161           # Delete the path on trunk...
3162           self._delete_path(cvs_rev.svn_trunk_path)
3163         # ...and copy over from branch
3164         self._copy_path(cvs_rev.svn_path, cvs_rev.svn_trunk_path,
3165                         svn_commit.motivating_revnum)
3166       elif cvs_rev.op == OP_DELETE:
3167         # delete trunk path
3168         self._delete_path(cvs_rev.svn_trunk_path)
3169       else:
3170         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3171                % cvs_rev.op)
3172         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3173
3174   def commit(self, svn_commit):
3175     """Add an SVNCommit to the SVNRepository, incrementing the
3176     Repository revision number, and changing the repository.  Invoke
3177     the delegates' _start_commit() method."""
3178
3179     if svn_commit.revnum == 2:
3180       self._initialize_repository(svn_commit.get_date())
3181
3182     self._start_commit(svn_commit)
3183
3184     if svn_commit.symbolic_name:
3185       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3186                   svn_commit.symbolic_name)
3187       self._fill_symbolic_name(svn_commit)
3188     elif svn_commit.motivating_revnum:
3189       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3190                   % svn_commit.motivating_revnum)
3191       self._synchronize_default_branch(svn_commit)
3192     else: # This actually commits CVSRevisions
3193       if len(svn_commit.cvs_revs) > 1: plural = "s"
3194       else: plural = ""
3195       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3196                   % (len(svn_commit.cvs_revs), plural))
3197       for cvs_rev in svn_commit.cvs_revs:
3198         # See comment in CVSCommit._commit() for what this is all
3199         # about.  Note that although asking self._path_exists() is
3200         # somewhat expensive, we only do it if the first two (cheap)
3201         # tests succeed first.
3202         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3203                 and (cvs_rev.rev == "1.1.1.1")
3204                 and self._path_exists(cvs_rev.svn_path)):
3205           if cvs_rev.op == OP_ADD:
3206             self._add_path(cvs_rev)
3207           elif cvs_rev.op == OP_CHANGE:
3208             self._change_path(cvs_rev)
3209
3210         if cvs_rev.op == OP_DELETE:
3211           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3212
3213   def cleanup(self):
3214     """Callback for the Cleanup.register in self.__init__."""
3215     self.revs_db = None
3216     self.nodes_db = None
3217
3218   def add_delegate(self, delegate):
3219     """Adds DELEGATE to self.delegates.
3220
3221     For every delegate you add, as soon as SVNRepositoryMirror
3222     performs a repository action method, SVNRepositoryMirror will call
3223     the delegate's corresponding repository action method.  Multiple
3224     delegates will be called in the order that they are added.  See
3225     SVNRepositoryMirrorDelegate for more information."""
3226     self.delegates.append(delegate)
3227
3228   def _invoke_delegates(self, method, *args):
3229     """Iterate through each of our delegates, in the order that they
3230     were added, and call the delegate's method named METHOD with the
3231     arguments in ARGS."""
3232     for delegate in self.delegates:
3233       getattr(delegate, method)(*args)
3234
3235   def finish(self):
3236     """Calls the delegate finish method."""
3237     self._end_commit()
3238     self._invoke_delegates('finish')
3239     self.cleanup()
3240
3241
class SVNCommitItem:
  """A wrapper class for CVSRevision objects upon which
  Subversion-related data (such as properties) may be hung.

  After construction, self.c_rev is the wrapped CVSRevision and
  self.svn_props is a dictionary mapping property names to values."""

  def __init__(self, c_rev, make_svn_props):
    """Wrap the CVSRevision C_REV and compute its properties.

    If MAKE_SVN_PROPS is true, the Subversion ('svn:') properties are
    filled in.  If Ctx().cvs_revnums is true, the 'cvs2svn:cvs-rev'
    property is set and the 'svn:' properties are filled in
    regardless of MAKE_SVN_PROPS."""
    self.c_rev = c_rev
    self.set_cvs_revnum_properties = Ctx().cvs_revnums
    self.eol_from_mime_type = Ctx().eol_from_mime_type
    self.no_default_eol = Ctx().no_default_eol
    self.keywords_off = Ctx().keywords_off
    self.mime_mapper = Ctx().mime_mapper

    # The only property we may start out with is the "CVS revision".
    self.svn_props = { }
    if self.set_cvs_revnum_properties:
      self.svn_props['cvs2svn:cvs-rev'] = c_rev.rev
      make_svn_props = True

    # Nothing more to do unless asked for the 'svn:' properties.
    if not make_svn_props:
      return

    props = self.svn_props

    # Executableness, if any.
    if c_rev.file_executable:
      props['svn:executable'] = '*'

    # The svn:keywords property, if appropriate.  See issue #2.
    if c_rev.mode in (None, 'kv', 'kvl') and not self.keywords_off:
      props['svn:keywords'] = 'author date id revision'

    # Mime-type and eol.  These two properties are intertwingled;
    # follow the conditionals carefully.  See also issue #39.
    mime_type = None
    if self.mime_mapper:
      mime_type = self.mime_mapper.get_type_from_filename(c_rev.cvs_path)

    eol_style = None
    if c_rev.mode == 'b':
      if mime_type is None:
        # file is kb, and no other mimetype specified
        mime_type = 'application/octet-stream'
    elif not self.no_default_eol:
      eol_style = 'native'
    elif (mime_type and self.eol_from_mime_type
          and mime_type.startswith("text/")):
      eol_style = 'native'

    if mime_type:
      props['svn:mime-type'] = mime_type

    if eol_style:
      props['svn:eol-style'] = eol_style
3296
3297
class SVNRepositoryMirrorDelegate:
  """Abstract superclass for any delegate to SVNRepositoryMirror.
  Subclasses must implement all of the methods below; every method
  here raises NotImplementedError.

  Each method stands for the Subversion operation implied by its
  name, and a subclass carries that operation out in its own way.
  For add_path, for example, the DumpfileDelegate writes a node
  record with "Node-action: add" to a Subversion dumpfile, while
  other delegates might merely report that the path is being added,
  or actually add it to a Subversion repository.
  """

  def start_commit(self, svn_commit):
    """Perform any actions needed to start SVNCommit SVN_COMMIT;
    see subclass implementation for details."""
    raise NotImplementedError()

  def mkdir(self, path):
    """PATH is a string naming the directory to create; see subclass
    implementation for details."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """S_ITEM is an SVNCommitItem for the path being added; see
    subclass implementation for details."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """S_ITEM is an SVNCommitItem for the path being changed; see
    subclass implementation for details."""
    raise NotImplementedError()

  def delete_path(self, path):
    """PATH is a string naming the path to delete; see subclass
    implementation for details."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """SRC_PATH and DEST_PATH are both strings, and SRC_REVNUM is a
    subversion revision number (int); see subclass implementation for
    details."""
    raise NotImplementedError()

  def finish(self):
    """Perform any cleanup necessary after all revisions have been
    committed."""
    raise NotImplementedError()
3345
3346
3347 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
3348   """Create a Subversion dumpfile."""
3349
3350   def __init__(self, dumpfile_path=None):
3351     """Return a new DumpfileDelegate instance, attached to a dumpfile
3352     DUMPFILE_PATH (Ctx().dumpfile, if None), using Ctx().encoding.
3353
3354     If Ctx().cvs_revnums is true, then set the 'cvs2svn:cvs-revnum'
3355     property on files, when they are changed due to a corresponding
3356     CVS revision.
3357
3358     If Ctx().mime_mapper is not None, then it is a MimeMapper
3359     instance, used to determine whether or not to set the
3360     'svn:mime-type' property on files.  But even if Ctx().mime_mapper
3361     is None, files marked with the CVS 'kb' flag will receive a mime
3362     type of "application/octet-stream".
3363
3364     Unless Ctx().no_default_eol is true, set 'svn:eol-style' to
3365     'native' for files not marked with the CVS 'kb' flag, except as
3366     superseded by Ctx().eol_from_mime_type (see below).
3367
3368     If Ctx().eol_from_mime_type is not None, then set 'svn:eol-style'
3369     to 'native' for all files to which Ctx().mime_mapper assigns a
3370     mime type beginning with "text/", and don't set 'svn:eol-style'
3371     for files assigned a type not beginning with "text/".
3372     """
3373     if dumpfile_path:
3374       self.dumpfile_path = dumpfile_path
3375     else:
3376       self.dumpfile_path = Ctx().dumpfile
3377     self.path_encoding = Ctx().encoding
3378
3379     self.dumpfile = open(self.dumpfile_path, 'wb')
3380     self._write_dumpfile_header(self.dumpfile)
3381
3382   def _write_dumpfile_header(self, dumpfile):
3383     # Initialize the dumpfile with the standard headers.
3384     #
3385     # Since the CVS repository doesn't have a UUID, and the Subversion
3386     # repository will be created with one anyway, we don't specify a
3387     # UUID in the dumpflie
3388     dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
3389
3390   def _utf8_path(self, path):
3391     """Return a copy of PATH encoded in UTF-8.  PATH is assumed to be
3392     encoded in self.path_encoding."""
3393     try:
3394       # Log messages can be converted with the 'replace' strategy,
3395       # but we can't afford any lossiness here.
3396       unicode_path = unicode(path, self.path_encoding, 'strict')
3397       return unicode_path.encode('utf-8')
3398     except UnicodeError:
3399       print "Unable to convert a path '%s' to internal encoding." % path
3400       print "Consider rerunning with (for example) '--encoding=latin1'"
3401       sys.exit(1)
3402
3403   def start_commit(self, svn_commit):
3404     """Emit the start of SVN_COMMIT (an SVNCommit)."""
3405
3406     self.revision = svn_commit.revnum
3407
3408     # The start of a new commit typically looks like this:
3409     #
3410     #   Revision-number: 1
3411     #   Prop-content-length: 129
3412     #   Content-length: 129
3413     #
3414     #   K 7
3415     #   svn:log
3416     #   V 27
3417     #   Log message for revision 1.
3418     #   K 10
3419     #   svn:author
3420     #   V 7
3421     #   jrandom
3422     #   K 8
3423     #   svn:date
3424     #   V 27
3425     #   2003-04-22T22:57:58.132837Z
3426     #   PROPS-END
3427     #
3428     # Notice that the length headers count everything -- not just the
3429     # length of the data but also the lengths of the lengths, including
3430     # the 'K ' or 'V ' prefixes.
3431     #
3432     # The reason there are both Prop-content-length and Content-length
3433     # is that the former includes just props, while the latter includes
3434     # everything.  That's the generic header form for any entity in a
3435     # dumpfile.  But since revisions only have props, the two lengths
3436     # are always the same for revisions.
3437
3438     # Calculate the total length of the props section.
3439     props = svn_commit.get_revprops()
3440     prop_names = props.keys()
3441     prop_names.sort()
3442     total_len = 10  # len('PROPS-END\n')
3443     for propname in prop_names:
3444       if props[propname] is None:
3445         continue
3446       klen = len(propname)
3447       klen_len = len('K %d' % klen)
3448       vlen = len(props[propname])
3449       vlen_len = len('V %d' % vlen)
3450       # + 4 for the four newlines within a given property's section
3451       total_len = total_len + klen + klen_len + vlen + vlen_len + 4
3452
3453     # Print the revision header and props
3454     self.dumpfile.write('Revision-number: %d\n'
3455                         'Prop-content-length: %d\n'
3456                         'Content-length: %d\n'
3457                         '\n'
3458                         % (self.revision, total_len, total_len))
3459
3460     for propname in prop_names:
3461       if props[propname] is None:
3462         continue
3463       self.dumpfile.write('K %d\n'
3464                           '%s\n'
3465                           'V %d\n'
3466                           '%s\n' % (len(propname),
3467                                     propname,
3468                                     len(props[propname]),
3469                                     props[propname]))
3470
3471     self.dumpfile.write('PROPS-END\n')
3472     self.dumpfile.write('\n')
3473
3474   def mkdir(self, path):
3475     """Emit the creation of directory PATH."""
3476     self.dumpfile.write("Node-path: %s\n"
3477                         "Node-kind: dir\n"
3478                         "Node-action: add\n"
3479                         "Content-length: 10\n"
3480                         "\n"
3481                         "\n" % self._utf8_path(path))
3482
3483   def _add_or_change_path(self, s_item, op):
3484     """Emit the addition or change corresponding to S_ITEM.
3485     OP is either the constant OP_ADD or OP_CHANGE."""
3486
3487     # Validation stuffs
3488     if op == OP_ADD:
3489       action = 'add'
3490     elif op == OP_CHANGE:
3491       action = 'change'
3492     else:
3493       sys.stderr.write("%s: _add_or_change_path() called with bad op ('%s')"
3494                        % (error_prefix, op))
3495       sys.exit(1)
3496
3497     # Convenience variables
3498     c_rev = s_item.c_rev
3499     svn_props = s_item.svn_props
3500
3501     # The property handling here takes advantage of an undocumented
3502     # but IMHO consistent feature of the Subversion dumpfile-loading
3503     # code.  When a node's properties aren't mentioned (that is, the
3504     # "Prop-content-length:" header is absent, no properties are
3505     # listed at all, and there is no "PROPS-END\n" line) then no
3506     # change is made to the node's properties.
3507     #
3508     # This is consistent with the way dumpfiles behave w.r.t. text
3509     # content changes, so I'm comfortable relying on it.  If you
3510     # commit a change to *just* the properties of some node that
3511     # already has text contents from a previous revision, then in the
3512     # dumpfile output for the prop change, no "Text-content-length:"
3513     # nor "Text-content-md5:" header will be present, and the text of
3514     # the file will not be given.  But this does not cause the file's
3515     # text to be erased!  It simply remains unchanged.
3516     #
3517     # This works out great for cvs2svn, due to lucky coincidences:
3518     #
3519     # For files, the only properties we ever set are set in the first
3520     # revision; all other revisions (including on branches) inherit
3521     # from that.  After the first revision, we never change file
3522     # properties, therefore, there is no need to remember the full set
3523     # of properties on a given file once we've set it.
3524     #
3525     # For directories, the only property we set is "svn:ignore", and
3526     # while we may change it after the first revision, we always do so
3527     # based on the contents of a ".cvsignore" file -- in other words,
3528     # CVS is doing the remembering for us, so we still don't have to
3529     # preserve the previous value of the property ourselves.
3530
3531     # Calculate the (sorted-by-name) property string and length, if any.
3532     prop_contents = ''
3533     prop_names = svn_props.keys()
3534     prop_names.sort()
3535     for pname in prop_names:
3536       pval = svn_props[pname]
3537       prop_contents = prop_contents + \
3538                       'K %d\n%s\nV %d\n%s\n' \
3539                       % (len(pname), pname, len(pval), pval)
3540     if prop_contents:
3541       prop_contents = prop_contents + 'PROPS-END\n'
3542       props_len = len(prop_contents)
3543     else:
3544       props_len = 0
3545
3546     props_header = ''
3547     if props_len:
3548       props_header = 'Prop-content-length: %d\n' % props_len
3549
3550     # treat .cvsignore as a directory property
3551     dir_path, basename = os.path.split(c_rev.svn_path)
3552     if basename == ".cvsignore":
3553       ignore_vals = generate_ignores(c_rev)
3554       ignore_contents = '\n'.join(ignore_vals)
3555       ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
3556                          (len(ignore_contents), ignore_contents))
3557       ignore_contents = ignore_contents + 'PROPS-END\n'
3558       ignore_len = len(ignore_contents)
3559
3560       # write headers, then props
3561       self.dumpfile.write('Node-path: %s\n'
3562                           'Node-kind: dir\n'
3563                           'Node-action: change\n'
3564                           'Prop-content-length: %d\n'
3565                           'Content-length: %d\n'
3566                           '\n'
3567                           '%s'
3568                           % (self._utf8_path(dir_path), ignore_len,
3569                              ignore_len, ignore_contents))
3570
3571     pipe_cmd, pipe = get_co_pipe(c_rev)
3572     self.dumpfile.write('Node-path: %s\n'
3573                         'Node-kind: file\n'
3574                         'Node-action: %s\n'
3575                         '%s'  # no property header if no props
3576                         'Text-content-length: '
3577                         % (self._utf8_path(c_rev.svn_path),
3578                            action, props_header))
3579
3580     pos = self.dumpfile.tell()
3581
3582     self.dumpfile.write('0000000000000000\n'
3583                         'Text-content-md5: 00000000000000000000000000000000\n'
3584                         'Content-length: 0000000000000000\n'
3585                         '\n')
3586
3587     if prop_contents:
3588       self.dumpfile.write(prop_contents)
3589
3590     # Insert the rev contents, calculating length and checksum as we go.
3591     checksum = md5.new()
3592     length = 0
3593     normalize_crlf = sys.platform == "win32" and c_rev.mode != "b"
3594     trailing_cr = ""
3595     buf = pipe.fromchild.read(PIPE_READ_SIZE)
3596     while buf:
3597       if normalize_crlf:
3598         buf = string.replace(buf,"\r\n","\n")
3599         if buf[-1] == "\r":
3600           trailing_cr = "\r"
3601           buf = buf[:-1]
3602         else:
3603           trailing_cr = ""
3604       checksum.update(buf)
3605       length = length + len(buf)
3606       self.dumpfile.write(buf)
3607       # optimize because of python's immutable strings
3608       if trailing_cr:
3609         buf = trailing_cr + pipe.fromchild.read(PIPE_READ_SIZE)
3610       else:
3611         buf = pipe.fromchild.read(PIPE_READ_SIZE)
3612     pipe.fromchild.close()
3613     error_output = pipe.childerr.read()
3614     exit_status = pipe.wait()
3615     if exit_status:
3616       sys.exit("%s: The command '%s' failed with exit status: %s\n"
3617                "and the following output:\n"
3618                "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
3619
3620     # Go back to patch up the length and checksum headers:
3621     self.dumpfile.seek(pos, 0)
3622     # We left 16 zeros for the text length; replace them with the real
3623     # length, padded on the left with spaces:
3624     self.dumpfile.write('%16d' % length)
3625     # 16... + 1 newline + len('Text-content-md5: ') == 35
3626     self.dumpfile.seek(pos + 35, 0)
3627     self.dumpfile.write(checksum.hexdigest())
3628     # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
3629     self.dumpfile.seek(pos + 84, 0)
3630     # The content length is the length of property data, text data,
3631     # and any metadata around/inside around them.
3632     self.dumpfile.write('%16d' % (length + props_len))
3633     # Jump back to the end of the stream
3634     self.dumpfile.seek(0, 2)
3635
3636     # This record is done (write two newlines -- one to terminate
3637     # contents that weren't themselves newline-termination, one to
3638     # provide a blank line for readability.
3639     self.dumpfile.write('\n\n')
3640
3641   def add_path(self, s_item):
3642     """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""
3643     self._add_or_change_path(s_item, OP_ADD)
3644
3645   def change_path(self, s_item):
3646     """Emit the change corresponding to S_ITEM, an SVNCommitItem."""
3647     self._add_or_change_path(s_item, OP_CHANGE)
3648
3649   def delete_path(self, path):
3650     """Emit the deletion of PATH."""
3651     self.dumpfile.write('Node-path: %s\n'
3652                         'Node-action: delete\n'
3653                         '\n' % self._utf8_path(path))
3654
3655   def copy_path(self, src_path, dest_path, src_revnum):
3656     """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
3657     # We don't need to include "Node-kind:" for copies; the loader
3658     # ignores it anyway and just uses the source kind instead.
3659     self.dumpfile.write('Node-path: %s\n'
3660                         'Node-action: add\n'
3661                         'Node-copyfrom-rev: %d\n'
3662                         'Node-copyfrom-path: /%s\n'
3663                         '\n'
3664                         % (self._utf8_path(dest_path),
3665                            src_revnum,
3666                            self._utf8_path(src_path)))
3667
3668   def finish(self):
3669     """Perform any cleanup necessary after all revisions have been
3670     committed."""
3671     self.dumpfile.close()
3672
3673
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting.

  Each commit is written to a temporary dumpfile, which is then
  streamed into a single long-running 'svnadmin load' child process."""

  def __init__(self):
    """Create the target repository (unless Ctx().existing_svnrepos is
    set), open the temporary dumpfile, start the 'svnadmin load' pipe
    and feed it the dumpfile header."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      # We always pass the --bdb-txn-nosync switch to svnadmin here
      # because it gives us a 4-5x speed boost (If cvs2svn is creating
      # the repository, cvs2svn should be the only program accessing
      # the svn repository (until cvs is done, at least)).  However,
      # for the sake of caution, we'll turn no-sync off in self.finish
      # unless the user passed --bdb-txn-nosync to cvs2svn.
      run_command('%s create %s %s' % (self.svnadmin, "--bdb-txn-nosync",
                                       self.target))

    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.
    DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = Popen3('%s load -q %s' % (self.svnadmin, self.target),
                              True)
    # We never read the loader's stdout, so close our end right away.
    self.loader_pipe.fromchild.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.tochild)
    except IOError:
      self._loader_failed()

  def _loader_failed(self):
    """Report whatever 'svnadmin load' wrote to its stderr and exit
    with status 1.  Called when writing to the loader pipe fails."""
    sys.stderr.write("%s: svnadmin failed with the following output while "
                     "loading the dumpfile:\n" % (error_prefix))
    sys.stderr.write(self.loader_pipe.childerr.read())
    sys.exit(1)

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not len(data):
        break
      try:
        self.loader_pipe.tochild.write(data)
      except IOError:
        self._loader_failed()

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, first
    load its dumpfile contents into the svn repository.  The temporary
    dumpfile is then truncated (not reopened) and the header for
    SVN_COMMIT is written into it."""
    if self._commit_in_progress:
      self._feed_pipe()
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Loads the last commit into the repository, waits for 'svnadmin
    load' to exit (exiting with an error message if it failed) and
    removes the temporary dumpfile."""
    self._feed_pipe()
    self.dumpfile.close()
    self.loader_pipe.tochild.close()
    error_output = self.loader_pipe.childerr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      sys.exit('%s: svnadmin load failed with exit status: %s\n'
               'and the following output:\n'
               '%s' % (error_prefix, exit_status, error_output))
    os.remove(self.dumpfile_path)

    # If we created the repository and --bdb-no-sync wasn't passed,
    # then comment out the DB_TXN_NOSYNC line in the DB_CONFIG file
    if (not Ctx().existing_svnrepos) and (not Ctx().bdb_txn_nosync):
      db_config = os.path.join(self.target, "db/DB_CONFIG")
      no_sync = 'set_flags DB_TXN_NOSYNC\n'

      # Close each handle explicitly: the file is rewritten in place,
      # so the read handle must be released before it is reopened for
      # writing, and the rewritten data must be flushed before we
      # return.
      config_in = open(db_config, 'r')
      try:
        contents = config_in.readlines()
      finally:
        config_in.close()
      index = contents.index(no_sync)
      contents[index] = '# ' + no_sync
      config_out = open(db_config, 'w')
      try:
        config_out.writelines(contents)
      finally:
        config_out.close()
3760
3761
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """A delegate that changes nothing on disk: every callback merely
  reports, through Log(), what the SVNRepositoryMirror is doing.  The
  messages claim that work is being done, but the only work done here
  is the printing itself."""

  def __init__(self, total_revs):
    # TOTAL_REVS is only used as the denominator of the progress line
    # printed by start_commit().
    self.total_revs = total_revs

  def start_commit(self, svn_commit):
    """Report the revision number of SVN_COMMIT, the commit being
    started, together with the total number of revisions."""
    separator = "=" * 60
    Log().write(LOG_VERBOSE, separator)
    progress = "Starting Subversion commit %d / %d" % (svn_commit.revnum,
                                                       self.total_revs)
    Log().write(LOG_NORMAL, progress)

  def mkdir(self, path):
    """Report the creation of directory PATH."""
    log = Log()
    log.write(LOG_VERBOSE, "  New Directory", path)

  def add_path(self, s_item):
    """Report the addition of s_item.c_rev.svn_path."""
    svn_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Adding", svn_path)

  def change_path(self, s_item):
    """Report the change of s_item.c_rev.svn_path."""
    svn_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Changing", svn_path)

  def delete_path(self, path):
    """Report the deletion of PATH."""
    log = Log()
    log.write(LOG_VERBOSE, "  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report, on two lines, the copy of revision SRC_REVNUM of
    SRC_PATH to DEST_PATH."""
    source_line = ("  Copying revision", src_revnum, "of", src_path)
    Log().write(LOG_VERBOSE, *source_line)
    dest_line = ("                to", dest_path)
    Log().write(LOG_VERBOSE, *dest_line)

  def finish(self):
    """Report that the creation of the repository is complete."""
    Log().write(LOG_VERBOSE, "Finished creating Subversion repository.")
    Log().write(LOG_QUIET, "Done.")

3804
# This should be a local to pass1,
# but Python 2.0 does not support nested scopes.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
def pass1():
  """Pass 1: walk Ctx().cvsroot and hand every ',v' file to
  cvs2svn_rcsparse with a CollectData instance as the sink.

  Exits the program if any fatal error was recorded or if no ',v'
  file was found at all."""
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    # os.path.walk() callback; BATON is the CollectData instance.
    collector = baton
    for fname in files:
      if not fname[-2:] == ',v':
        continue
      collector.found_valid_file = 1
      rcs_path = os.path.join(dirname, fname)
      if dirname[-6:] != OS_SEP_PLUS_ATTIC:
        # A file outside the Attic must not also exist inside it.  That
        # is recorded as a fatal error, but the file is still parsed.
        attic_twin = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_twin):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, rcs_path, attic_twin)
          sys.stderr.write(err + '\n')
          collector.fatal_errors.append(err)
        collector.set_fname(rcs_path, rcs_path)
      else:
        # The canonical name leaves out the trailing 'Attic' component.
        canonical = os.path.join(dirname[:-6], fname)
        collector.set_fname(canonical, rcs_path)
      Log().write(LOG_NORMAL, rcs_path)
      try:
        cvs2svn_rcsparse.parse(open(rcs_path, 'rb'), collector)
      except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
        # A malformed file is remembered and reported in the summary.
        err = "%s: '%s' is not a valid ,v file" \
              % (error_prefix, rcs_path)
        sys.stderr.write(err + '\n')
        collector.fatal_errors.append(err)
      except:
        # Anything else is unexpected: say which file it was, then
        # let the exception propagate.
        Log().write(LOG_WARN, "Exception occurred while parsing %s" % rcs_path)
        raise

  os.path.walk(Ctx().cvsroot, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if cd.fatal_errors:
    summary = "\n".join(cd.fatal_errors)
    sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
             + "Error summary:\n"
             + summary
             + "\nExited due to fatal error(s).")

  if cd.found_valid_file is None:
    sys.exit("\nNo RCS files found in your CVS Repository!\n"
             + "Are you absolutely certain you are pointing cvs2svn\n"
             + "at a CVS repository?\n"
             + "\nExited due to fatal error(s).")

  stats = StatsKeeper()
  stats.reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")

3863
def pass2():
  """Pass 2: clean up the revision information.

  Validates the exclude / force-tag / force-branch options against the
  symbol database (exiting with status 1 on any conflict), creates the
  tags database, and rewrites the revisions file into the "clean
  revs" file with excluded symbols removed, forced symbols converted
  and timestamps resynchronized."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Expand the exclude regexps into the matching symbol names.  The
  # result is queried with has_key() below.
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags.keys():
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    "Read the .resync file into memory."

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list because
    # we will dynamically expand the lower/upper bound as we find commits
    # that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    for line in fileinput.FileInput(fname):
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      if resync.has_key(digest):
        resync[digest].append([t1_l, t1_u, t2])
      else:
        resync[digest] = [ [t1_l, t1_u, t2] ]

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    digests = resync.keys()
    for digest in digests:
      (resync[digest]).sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  # Filter used to remove all references to excluded tags and branches.
  # Defined once, outside the loop; EXCLUDES is bound as a default
  # argument because Python 2.0 does not support nested scopes.
  def not_excluded(symbol, excludes=excludes):
    return not excludes.has_key(symbol)

  # process the revisions file, looking for items to clean up
  for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
    c_rev = CVSRevision(Ctx(), line[:-1])

    # Skip this entire revision if it's on an excluded branch
    if excludes.has_key(c_rev.branch_name):
      continue

    # Remove all references to excluded tags and branches
    c_rev.branches = filter(not_excluded, c_rev.branches)
    c_rev.tags = filter(not_excluded, c_rev.tags)

    # Convert all branches that are forced to be tags
    for forced_tag in Ctx().forced_tags:
      if forced_tag in c_rev.branches:
        c_rev.branches.remove(forced_tag)
        c_rev.tags.append(forced_tag)

    # Convert all tags that are forced to be branches
    for forced_branch in Ctx().forced_branches:
      if forced_branch in c_rev.tags:
        c_rev.tags.remove(forced_branch)
        c_rev.branches.append(forced_branch)

    # see if this is "near" any of the resync records we
    # have recorded for this digest [of the log message].
    for record in resync.get(c_rev.digest, []):
      if record[0] <= c_rev.timestamp <= record[1]:
        # bingo! remap the time on this (record[2] is the new time).

        # adjust the time range. we want the COMMIT_THRESHOLD from the
        # bounds of the earlier/latest commit in this group.
        record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
        record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

        # By default this will be the new timestamp
        new_timestamp = record[2]
        # If the new timestamp is earlier than that of our previous revision
        if record[2] < c_rev.prev_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is before previous the time of"
                  + " revision %s (%s):")
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, record[2],
                                        c_rev.prev_rev, c_rev.prev_timestamp))
          # If resyncing our rev to c_rev.prev_timestamp + 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted sync time, then sync back to c_rev.prev_timestamp
          # + 1...
          if (c_rev.prev_timestamp - record[2]) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.prev_timestamp + 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          # ...otherwise, make no change
          else:
            new_timestamp = c_rev.timestamp
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)

        # Report the delta that is actually applied: new_timestamp can
        # differ from record[2] when the adjustment above kicked in.
        msg = "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
              % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                 new_timestamp - c_rev.timestamp)
        Log().write(LOG_VERBOSE, msg)

        c_rev.timestamp = new_timestamp

        # stop looking for hits
        break

    output.write(str(c_rev) + "\n")

  # Flush and close explicitly: the next pass sorts this file, so all
  # of it must be on disk before we return.
  output.close()
  Log().write(LOG_QUIET, "Done")
4056
def pass3():
  """Pass 3: sort the cleaned revisions file into the sorted revisions
  file, and register the sorted file with Cleanup() under pass5."""
  Log().write(LOG_QUIET, "Sorting CVS revisions...")
  clean_revs = temp(DATAFILE + CLEAN_REVS_SUFFIX)
  sorted_revs = temp(DATAFILE + SORTED_REVS_SUFFIX)
  sort_file(clean_revs, sorted_revs)
  Cleanup().register(sorted_revs, pass5)
  Log().write(LOG_QUIET, "Done")
4063
def pass4():
  """Read the sorted revisions file and log every CVSRevision in a new
  CVSRevisionDatabase.  Unless this is a trunk-only conversion, also
  build the LastSymbolicNameDatabase, which contains the last
  CVSRevision that is a source for each tag or branch.
  """
  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)
  if Ctx().trunk_only:
    # A do-nothing stand-in, so that the loop below does not have to
    # test Ctx().trunk_only for every single revision.
    class DummyLSNDB:
      def log_revision(*args): pass
      def create_database(*args): pass
    last_sym_name_db = DummyLSNDB()
  else:
    Log().write(LOG_QUIET,
        "and finding last CVS revisions for all symbolic names...")
    last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for record in sorted_revs:
    # Each record is one serialized CVSRevision; drop the newline.
    c_rev = CVSRevision(Ctx(), record[:-1])
    cvs_revs_db.log_revision(c_rev)
    last_sym_name_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  last_sym_name_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4094
def pass5():
  """
  Generate the SVNCommit <-> CVSRevision mapping databases by feeding
  the sorted revisions to a CVSRevisionAggregator.  CVSCommit._commit
  also calls SymbolingsLogger to register CVSRevisions that represent
  an opening or closing for a path on a branch or tag.  See
  SymbolingsLogger for more details.
  """
  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for record in sorted_revs:
    c_rev = CVSRevision(Ctx(), record[:-1])
    # In a trunk-only conversion, revisions that sit on a branch are
    # simply not handed to the aggregator.
    if Ctx().trunk_only and c_rev.branch_name is not None:
      continue
    aggregator.process_revision(c_rev)
  aggregator.flush()

  # SVNCommit.revnum - 1 is recorded as the number of svn revisions.
  svn_rev_count = SVNCommit.revnum - 1
  StatsKeeper().set_svn_rev_count(svn_rev_count)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4114
def pass6():
  """Pass 6: sort the symbol openings/closings file (nothing is sorted
  in a trunk-only conversion) and register the sorted file with
  Cleanup() under pass8."""
  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  trunk_only = Ctx().trunk_only
  if not trunk_only:
    unsorted_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    sorted_path = temp(SYMBOL_OPENINGS_CLOSINGS_SORTED)
    sort_file(unsorted_path, sorted_path)
    Cleanup().register(sorted_path, pass8)
  Log().write(LOG_QUIET, "Done")
4123
def pass7():
  """Pass 7: record, for each symbolic name, the offset of its first
  line in the sorted openings/closings file.  Nothing is generated in
  a trunk-only conversion."""
  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    # Named 'symbolings' rather than 'file' so the builtin is not
    # shadowed; closed explicitly once every line has been read.
    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      while 1:
        # readline() is used instead of iterating, because the offset
        # of each line has to be taken before the line is read.
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only the symbol name is needed, but the three-way unpacking
        # is kept so that a malformed line still raises ValueError.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4157
def pass8():
  """Pass 8: fetch the SVNCommits one by one from the
  PersistenceManager and commit each of them to an
  SVNRepositoryMirror.

  The mirror gets a RepositoryDelegate when Ctx().target is set and a
  DumpfileDelegate otherwise (neither in a dry run), and always a
  StdoutDelegate."""
  repos = SVNRepositoryMirror()
  persistence_manager = PersistenceManager(DB_OPEN_READ)

  if Ctx().target:
    delegate_class = RepositoryDelegate
    announcement = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    announcement = "Starting Subversion Dumpfile."
  if not Ctx().dry_run:
    repos.add_delegate(delegate_class())
  Log().write(LOG_QUIET, announcement)

  repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  svncounter = 2 # Repository initialization is 1.
  svn_commit = persistence_manager.get_svn_commit(svncounter)
  while svn_commit:
    repos.commit(svn_commit)
    svncounter = svncounter + 1
    svn_commit = persistence_manager.get_svn_commit(svncounter)

  repos.finish()
4182
# The conversion passes in the order they run; convert() announces
# _passes[i] as "pass i + 1".
_passes = [pass1, pass2, pass3, pass4,
           pass5, pass6, pass7, pass8]
4193
4194
class Ctx:
  """Session state for this run of cvs2svn, such as the run-time
  options.  This class is a Borg (all instances share one attribute
  dictionary), see
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531.
  """
  __shared_state = { }
  def __init__(self):
    shared = self.__shared_state
    self.__dict__ = shared
    if shared:
      # An earlier instance has already filled in the defaults.
      return

    # First instance: fill in the defaults.

    # Input, output and scratch locations.
    self.cvsroot = None
    self.target = None
    self.dumpfile = DUMPFILE
    self.tmpdir = '.'
    self.svnadmin = "svnadmin"

    # Repository layout.
    self.trunk_base = "trunk"
    self.tags_base = "tags"
    self.branches_base = "branches"

    # Logging.
    self.verbose = 0
    self.quiet = 0
    self.print_help = 0

    # Flags steering the conversion.
    self.prune = 1
    self.existing_svnrepos = 0
    self.dump_only = 0
    self.dry_run = 0
    self.trunk_only = 0
    self.use_cvs = None
    self.skip_cleanup = 0
    self.cvs_revnums = 0
    self.bdb_txn_nosync = 0

    # Text and property handling.
    self.encoding = "ascii"
    self.mime_types_file = None
    self.mime_mapper = None
    self.no_default_eol = 0
    self.eol_from_mime_type = 0
    self.keywords_off = 0
    self.username = None

    # Symbol handling.
    self.forced_branches = []
    self.forced_tags = []
    self.excludes = []
    self.symbol_transforms = []
4237
class MimeMapper:
  """A class that provides mappings from file names to MIME types.
  Note that we should really be using Python's 'mimetypes' module.
  See http://cvs2svn.tigris.org/servlets/ReadMsg?list=dev&msgNo=266
  for more."""

  def __init__(self):
    # Maps a file extension (without the dot), or a whole base name
    # for extension-less files, to a MIME type string.
    self.mappings = { }

  def set_mime_types_file(self, mime_types_file):
    """Add the mappings found in MIME_TYPES_FILE to self.mappings.

    Lines starting with '#' and lines with fewer than two
    whitespace-separated fields are skipped.  If an extension is
    already mapped to a different type, a warning is written to stderr
    and the later mapping wins."""
    # Use a private FileInput object, not the module-global state
    # behind fileinput.input(), and close it when done.
    lines = fileinput.FileInput(mime_types_file)
    try:
      for line in lines:
        if line.startswith("#"):
          continue

        # format of a line is something like
        # text/plain c h cpp
        fields = line.split()
        if len(fields) < 2:
          continue
        mime_type = fields[0]
        for ext in fields[1:]:
          # Stored values are always strings, so None means "not yet
          # mapped".
          known = self.mappings.get(ext)
          if known is not None and known != mime_type:
            sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n" \
                             % (warning_prefix, ext, known, mime_type))
          self.mappings[ext] = mime_type
    finally:
      lines.close()


  def get_type_from_filename(self, filename):
    """Return the MIME type mapped to FILENAME's extension, or None if
    there is no such mapping."""
    basename, extension = os.path.splitext(os.path.basename(filename))

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename
    return self.mappings.get(extension)
4280
4281
4282 def convert(start_pass, end_pass):
4283   "Convert a CVS repository to an SVN repository."
4284
4285   cleanup = Cleanup()
4286   times = [ None ] * (end_pass + 1)
4287   times[start_pass - 1] = time.time()
4288   StatsKeeper().set_start_time(time.time())
4289   for i in range(start_pass - 1, end_pass):
4290     Log().write(LOG_QUIET, '----- pass %d -----' % (i + 1))
4291     _passes[i]()
4292     times[i + 1] = time.time()
4293     StatsKeeper().log_duration_for_pass(times[i + 1] - times[i], i + 1)
4294     # Dispose of items in Ctx() not intended to live past the end of the pass
4295     # (Identified by exactly one leading underscore)
4296     for attr in dir(Ctx()):
4297       if (len(attr) > 2 and attr[0] == '_' and attr[1] != '_'
4298           and not attr[:6] == "_Ctx__"):
4299         delattr(Ctx(), attr)
4300     if not Ctx().skip_cleanup:
4301       cleanup.cleanup(_passes[i])
4302     StatsKeeper().set_end_time(time.time())
4303
4304   Log().write(LOG_QUIET, StatsKeeper())
4305   if end_pass < 4:
4306     Log().write(LOG_QUIET, '(These are unaltered CVS repository stats and do not\n'
4307                 + ' reflect tags or branches excluded via --exclude)\n')
4308   print StatsKeeper().timings()
4309
4310
4311 def usage():
4312   print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
4313         % os.path.basename(sys.argv[0])
4314   print '  --help, -h           print this usage message and exit with success'
4315   print '  --version            print the version number'
4316   print '  -q                   quiet'
4317   print '  -v                   verbose'
4318   print '  -s PATH              path for SVN repos'
4319   print '  -p START[:END]       start at pass START, end at pass END of %d' % len(_passes)
4320   print '                       If only START is given, run only pass START'
4321   print '                       (implicitly enables --skip-cleanup)'
4322   print '  --existing-svnrepos  load into existing SVN repository'
4323   print '  --dumpfile=PATH      name of intermediate svn dumpfile'
4324   print '  --tmpdir=PATH        directory to use for tmp data (default to cwd)'
4325   print '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)'
4326   print '  --dry-run            do not create a repository or a dumpfile;'
4327   print '                       just print what would happen.'
4328   print '  --use-cvs            use CVS instead of RCS \'co\' to extract data'
4329   print '                       (only use this if having problems with RCS)'
4330   print '  --svnadmin=PATH      path to the svnadmin program'
4331   print '  --trunk-only         convert only trunk commits, not tags nor branches'
4332   print '  --trunk=PATH         path for trunk (default: %s)'    \
4333         % Ctx().trunk_base
4334   print '  --branches=PATH      path for branches (default: %s)' \
4335         % Ctx().branches_base
4336   print '  --tags=PATH          path for tags (default: %s)'     \
4337         % Ctx().tags_base
4338   print '  --no-prune           don\'t prune empty directories'
4339   print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
4340   print '  --encoding=ENC       encoding of log messages in CVS repos (default: %s)' \
4341         % Ctx().encoding
4342   print '  --force-branch=NAME  force NAME to be a branch'
4343   print '  --force-tag=NAME     force NAME to be a tag'
4344   print '  --exclude=REGEXP     exclude branches and tags matching REGEXP'
4345   print '  --symbol-transform=P:S transform symbol names from P to S where P and S'
4346   print '                       use Python regexp and reference syntax respectively'
4347   print '  --username=NAME      username for cvs2svn-synthesized commits'
4348   print '  --skip-cleanup       prevent the deletion of intermediate files'
4349   print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
4350   print '  --cvs-revnums        record CVS revision numbers as file properties'
4351   print '  --mime-types=FILE    specify an apache-style mime.types file for\n' \
4352         '                       setting svn:mime-type'
4353   print '  --eol-from-mime-type set svn:eol-style by mime type (only with --mime-types)'
4354   print '  --no-default-eol     don\'t set svn:eol-style by CVS defaults'
4355   print '  --keywords-off       don\'t set svn:keywords on any files (cvs2svn sets'
4356   print '                       "svn:keywords to author date id" on non-binary files'
4357   print '                       by default)'
4358
4359 def main():
4360   # Convenience var, so we don't have to keep instantiating this Borg.
4361   ctx = Ctx()
4362
4363   profiling = None
4364   start_pass = 1
4365   end_pass = len(_passes)
4366
4367   try:
4368     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
4369                                [ "help", "create", "trunk=",
4370                                  "username=", "existing-svnrepos",
4371                                  "branches=", "tags=", "encoding=",
4372                                  "force-branch=", "force-tag=", "exclude=",
4373                                  "use-cvs", "mime-types=",
4374                                  "eol-from-mime-type", "no-default-eol",
4375                                  "trunk-only", "no-prune", "dry-run",
4376                                  "dump-only", "dumpfile=", "tmpdir=",
4377                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
4378                                  "bdb-txn-nosync", "version", "profile",
4379                                  "keywords-off", "symbol-transform="])
4380   except getopt.GetoptError, e:
4381     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
4382     usage()
4383     sys.exit(1)
4384
4385   for opt, value in opts:
4386     if opt == '--version':
4387         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
4388         sys.exit(0)
4389     elif opt == '-p':
4390       # Don't cleanup if we're doing incrementals.
4391       ctx.skip_cleanup = 1
4392       if value.find(':') > 0:
4393         start_pass, end_pass = map(int, value.split(':'))
4394       else:
4395         end_pass = start_pass = int(value)
4396       if start_pass > len(_passes) or start_pass < 1:
4397         print '%s: illegal value (%d) for starting pass. '\
4398               'must be 1 through %d.' % (error_prefix, int(start_pass),
4399                                          len(_passes))
4400         sys.exit(1)
4401       if end_pass < start_pass or end_pass > len(_passes):
4402         print '%s: illegal value (%d) for ending pass. ' \
4403               'must be %d through %d.' % (error_prefix, int(end_pass),
4404                                           int(start_pass), len(_passes))
4405         sys.exit(1)
4406     elif (opt == '--help') or (opt == '-h'):
4407       ctx.print_help = 1
4408     elif opt == '-v':
4409       Log().log_level = LOG_VERBOSE
4410       ctx.verbose = 1
4411     elif opt == '-q':
4412       Log().log_level = LOG_QUIET
4413       ctx.quiet = 1
4414     elif opt == '-s':
4415       ctx.target = value
4416     elif opt == '--existing-svnrepos':
4417       ctx.existing_svnrepos = 1
4418     elif opt == '--dumpfile':
4419       ctx.dumpfile = value
4420     elif opt == '--tmpdir':
4421       ctx.tmpdir = value
4422     elif opt == '--use-cvs':
4423       ctx.use_cvs = 1
4424     elif opt == '--svnadmin':
4425       ctx.svnadmin = value
4426     elif opt == '--trunk-only':
4427       ctx.trunk_only = 1
4428     elif opt == '--trunk':
4429       if not value:
4430         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4431       ctx.trunk_base = value
4432     elif opt == '--branches':
4433       if not value:
4434         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4435       ctx.branches_base = value
4436     elif opt == '--tags':
4437       if not value:
4438         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4439       ctx.tags_base = value
4440     elif opt == '--no-prune':
4441       ctx.prune = None
4442     elif opt == '--dump-only':
4443       ctx.dump_only = 1
4444     elif opt == '--dry-run':
4445       ctx.dry_run = 1
4446     elif opt == '--encoding':
4447       ctx.encoding = value
4448     elif opt == '--force-branch':
4449       ctx.forced_branches.append(value)
4450     elif opt == '--force-tag':
4451       ctx.forced_tags.append(value)
4452     elif opt == '--exclude':
4453       try:
4454         ctx.excludes.append(re.compile('^' + value + '$'))
4455       except re.error, e:
4456         sys.exit(error_prefix + ": '%s' is not a valid regexp.\n" % (value))
4457     elif opt == '--mime-types':
4458       ctx.mime_types_file = value
4459     elif opt == '--eol-from-mime-type':
4460       ctx.eol_from_mime_type = 1
4461     elif opt == '--no-default-eol':
4462       ctx.no_default_eol = 1
4463     elif opt == '--keywords-off':
4464       ctx.keywords_off = 1
4465     elif opt == '--username':
4466       ctx.username = value
4467     elif opt == '--skip-cleanup':
4468       ctx.skip_cleanup = 1
4469     elif opt == '--cvs-revnums':
4470       ctx.cvs_revnums = 1
4471     elif opt == '--bdb-txn-nosync':
4472       ctx.bdb_txn_nosync = 1
4473     elif opt == '--create':
4474       sys.stderr.write(warning_prefix +
4475           ': The behaviour produced by the --create option is now the '
4476           'default,\nand passing the option is deprecated.\n')
4477     elif opt == '--profile':
4478       profiling = 1
4479     elif opt == '--symbol-transform':
4480       ctx.symbol_transforms.append(value.split(":"))
4481
4482   if ctx.print_help:
4483     usage()
4484     sys.exit(0)
4485
4486   # Consistency check for options and arguments.
4487   if len(args) == 0:
4488     usage()
4489     sys.exit(1)
4490
4491   if len(args) > 1:
4492     sys.stderr.write(error_prefix +
4493                      ": must pass only one CVS repository.\n")
4494     usage()
4495     sys.exit(1)
4496
4497   ctx.cvsroot = args[0]
4498
4499   if not os.path.isdir(ctx.cvsroot):
4500     sys.stderr.write(error_prefix +
4501                      ": the given CVS repository path '%s' is not an "
4502                      "existing directory.\n" % ctx.cvsroot)
4503     sys.exit(1)
4504
4505   if ctx.use_cvs:
4506     # Ascend above the specified root if necessary, to find the cvs_repository
4507     # (a directory containing a CVSROOT directory) and the cvs_module (the
4508     # path of the conversion root within the cvs repository)
4509     # NB: cvs_module must be seperated by '/' *not* by os.sep .
4510     ctx.cvs_repository = os.path.abspath(ctx.cvsroot)
4511     prev_cvs_repository = None
4512     ctx.cvs_module = ""
4513     while prev_cvs_repository != ctx.cvs_repository:
4514       if os.path.isdir(os.path.join(ctx.cvs_repository, 'CVSROOT')):
4515         break
4516       prev_cvs_repository = ctx.cvs_repository
4517       ctx.cvs_repository, module_component = os.path.split(ctx.cvs_repository)
4518       ctx.cvs_module = module_component + "/" + ctx.cvs_module
4519     else:
4520       # Hit the root (of the drive, on Windows) without finding a CVSROOT dir.
4521       sys.stderr.write(error_prefix +
4522                        ": the path '%s' is not a CVS repository, nor a path " \
4523                        "within a CVS repository.  A CVS repository contains " \
4524                        "a CVSROOT directory within its root directory.\n" \
4525                        % ctx.cvsroot)
4526       sys.exit(1)
4527     os.environ['CVSROOT'] = ctx.cvs_repository
4528
4529   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
4530     sys.stderr.write(error_prefix +
4531                      ": must pass one of '-s' or '--dump-only'.\n")
4532     sys.exit(1)
4533
4534   def not_both(opt1val, opt1name, opt2val, opt2name):
4535     if opt1val and opt2val:
4536       sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
4537           % (opt1name, opt2name))
4538       sys.exit(1)
4539
4540   not_both(ctx.target, '-s', ctx.dump_only, '--dump-only')
4541
4542   not_both(ctx.dump_only, '--dump-only',
4543     ctx.existing_svnrepos, '--existing-svnrepos')
4544
4545   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
4546     ctx.existing_svnrepos, '--existing-svnrepos')
4547
4548   not_both(ctx.dump_only, '--dump-only',
4549     ctx.bdb_txn_nosync, '--bdb-txn-nosync')
4550
4551   not_both(ctx.quiet, '-q',
4552     ctx.verbose, '-v')
4553
4554   if ((string.find(ctx.trunk_base, '/') > -1)
4555       or (string.find(ctx.tags_base, '/') > -1)
4556       or (string.find(ctx.branches_base, '/') > -1)):
4557     sys.stderr.write("%s: cannot pass multicomponent path to "
4558                      "--trunk, --tags, or --branches yet.\n"
4559                      "  See http://cvs2svn.tigris.org/issues/show_bug.cgi?"
4560                      "id=7 for details.\n" % error_prefix)
4561     sys.exit(1)
4562
4563   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
4564     sys.stderr.write(error_prefix +
4565                      ": the svn-repos-path '%s' is not an "
4566                      "existing directory.\n" % ctx.target)
4567     sys.exit(1)
4568
4569   if not ctx.dump_only and not ctx.existing_svnrepos \
4570      and (not ctx.dry_run) and os.path.exists(ctx.target):
4571     sys.stderr.write(error_prefix +
4572                      ": the svn-repos-path '%s' exists.\nRemove it, or pass "
4573                      "'--existing-svnrepos'.\n" % ctx.target)
4574     sys.exit(1)
4575
4576   if ctx.mime_types_file:
4577     ctx.mime_mapper = MimeMapper()
4578     ctx.mime_mapper.set_mime_types_file(ctx.mime_types_file)
4579
4580   # Make sure the tmp directory exists.  Note that we don't check if
4581   # it's empty -- we want to be able to use, for example, "." to hold
4582   # tempfiles.  But if we *did* want check if it were empty, we'd do
4583   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
4584   if not os.path.exists(ctx.tmpdir):
4585     os.mkdir(ctx.tmpdir)
4586   elif not os.path.isdir(ctx.tmpdir):
4587     sys.stderr.write(error_prefix +
4588        ": cvs2svn tried to use '%s' for temporary files, but that path\n"
4589        "  exists and is not a directory.  Please make it be a directory,\n"
4590        "  or specify some other directory for temporary files.\n" \
4591                      % ctx.tmpdir)
4592     sys.exit(1)
4593
4594   if ctx.use_cvs:
4595     def cvs_ok():
4596       pipe = Popen3('cvs %s --version' % Ctx().cvs_global_arguments, True)
4597       pipe.tochild.close()
4598       pipe.fromchild.read()
4599       errmsg = pipe.childerr.read()
4600       status = pipe.wait()
4601       ok = len(errmsg) == 0 and status == 0
4602       return (ok, status, errmsg)
4603
4604     ctx.cvs_global_arguments = "-q -R"
4605     ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4606     if not ok:
4607       ctx.cvs_global_arguments = "-q"
4608       ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4609
4610     if not ok:
4611       sys.stderr.write(error_prefix +
4612                        ": error executing CVS: status %s, error output:\n" \
4613                        % (cvs_exitstatus) + cvs_errmsg)
4614
4615   # But do lock the tmpdir, to avoid process clash.
4616   try:
4617     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4618   except OSError, e:
4619     if e.errno == errno.EACCES:
4620       sys.stderr.write(error_prefix + ": Permission denied:"
4621                        + " No write access to output directory.\n")
4622       sys.exit(1)
4623     if e.errno == errno.EEXIST:
4624       sys.stderr.write(error_prefix +
4625           ": cvs2svn is using directory '%s' for temporary files, but\n"
4626           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
4627           "  cvs2svn process is currently using '%s' as its temporary\n"
4628           "  workspace.  If you are certain that is not the case,\n"
4629           "  then remove the '%s/cvs2svn.lock' subdirectory.\n" \
4630                        % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir))
4631       sys.exit(1)
4632     raise
4633   try:
4634     if profiling:
4635       import hotshot
4636       prof = hotshot.Profile('cvs2svn.hotshot')
4637       prof.runcall(convert, start_pass, end_pass)
4638       prof.close()
4639     else:
4640       convert(start_pass, end_pass)
4641   finally:
4642     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4643     except: pass
4644
# Run the converter only when this file is executed as a script.
if __name__ == '__main__':
  main()