cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
# Version string for this script.  The string literal below is a
# Subversion keyword (LastChangedRevision).  When the keyword is
# expanded, the [22:-2] slice drops the 22-character keyword-and-colon
# prefix and the trailing space and dollar sign, leaving only the
# revision digits.  With the keyword unexpanded, as written here, the
# slice is empty and VERSION is just 'r'.
VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 import cvs2svn_rcsparse
  23 import os
  24 import sys
  25 import sha
  26 import re
  27 import time
  28 import fileinput
  29 import string
  30 import getopt
  31 import stat
  32 import string
  33 import md5
  34 import marshal
  35 import errno
  36 import popen2
  37
  38 # Warnings and errors start with these strings.  They are typically
  39 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  40 warning_prefix = "WARNING"
  41 error_prefix = "ERROR"
  42
  43 # Make sure this Python is recent enough.
  44 if sys.hexversion < 0x2000000:
  45   sys.stderr.write("'%s: Python 2.0 or higher required, "
  46                    "see www.python.org.\n" % error_prefix)
  47   sys.exit(1)
  48
# Pretend we have true booleans on older python versions.
# Evaluating the bare name True raises an exception when the
# interpreter has no such builtin; in that case integer stand-ins are
# bound at module level.
# NOTE(review): the bare 'except' catches every exception type, not
# only the NameError expected here.
try:
  True
except:
  True = 1
  False = 0
  55
  56 # Minimal, incomplete, version of popen2.Popen3 for those platforms
  57 # for which popen2 does not provide it.
  58 try:
  59   Popen3 = popen2.Popen3
  60 except AttributeError:
  61   class Popen3:
  62     def __init__(self, cmd, capturestderr):
  63       if type(cmd) != str:
  64         cmd = " ".join(cmd)
  65       self.fromchild, self.tochild, self.childerr = popen2.popen3(cmd,
  66                                                                   mode='b')
  67     def wait(self):
  68       return self.fromchild.close() or self.tochild.close() or \
  69              self.childerr.close()
  70
# DBM module selection
#
# The three steps below run at import time and adjust which module
# anydbm will use for every database opened later in this script.

# 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.
#    (Note that this error is printed to stdout, not stderr.)
import anydbm
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  print 'ERROR: your installation of Python does not contain a suitable'
  print '  DBM module. This script cannot continue.'
  print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  print '  for details.'
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try to prefer gdbm
#    instead.  Unfortunately, gdbm appears not to be trouble free, either.
#    The old module is recognised by its bsddb attribute lacking a
#    __version__.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    # No gdbm available: keep the old bsddb module, but warn about it.
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found '
        'on your computer has been reported to malfunction on some datasets, '
        'causing KeyError exceptions. You may wish to upgrade your Python to '
        'version 2.3 or later.\n')
  else:
    # gdbm imported successfully: make it anydbm's default module.
    anydbm._defaultmod = gdbm
 105
 106 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
 107 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
 108 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
 109
 110 # This really only matches standard '1.1.1.*'-style vendor revisions.
 111 # One could conceivably have a file whose default branch is 1.1.3 or
 112 # whatever, or was that at some point in time, with vendor revisions
 113 # 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
 114 # is the only time this regexp gets used), we'd have no basis for
 115 # assuming that the non-standard vendor branch had ever been the
 116 # default branch anyway, so we don't want this to match them anyway.
 117 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 118
# If this run's output is a repository, then (in the tmpdir) we use
# a dumpfile of this name for repository loads.
#
# If this run's output is a dumpfile, then this is the default name of
# that dumpfile, but in the current directory (unless the user has
# specified a dumpfile path, of course, in which case it will be
# wherever the user said).
DUMPFILE = 'cvs2svn-dump'

# This file appears with different suffixes at different stages of
# processing.  CVS revisions are cleaned and sorted here, for commit
# grouping.  See design-notes.txt for details.
DATAFILE = 'cvs2svn-data'

# This file contains a marshalled copy of all the statistics that we
# gather throughout the various runs of cvs2svn.  The data is stored
# as a marshalled dictionary.
STATISTICS_FILE = 'cvs2svn-statistics'

# This text file contains records (1 per line) that describe svn
# filesystem paths that are the opening and closing source revisions
# for copies to tags and branches.  The format is as follows:
#
# SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
#
# Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
# SVN_REVNUM are the primary and secondary sorting criteria for
# creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
# A sorted version of the above file.
SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'

# This file is a temporary file for storing symbolic_name -> closing
# CVSRevision until the end of our pass where we can look up the
# corresponding SVNRevNum for the closing revs and write these out to
# the SYMBOL_OPENINGS_CLOSINGS.
SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'

# Skeleton version of an svn filesystem.
# (These supersede and will eventually replace the two above.)
# See class SVNRepositoryMirror for how these work.
SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'

# Offsets pointing to the beginning of each SYMBOLIC_NAME in
# SYMBOL_OPENINGS_CLOSINGS_SORTED
SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'

# Maps CVSRevision.unique_key()s to lists of symbolic names, where
# the CVSRevision is the last such that is a source for those symbolic
# names.  For example, if branch B's number is 1.3.0.2 in this CVS
# file, and this file's 1.3 is the latest (by date) revision among
# *all* CVS files that is a source for branch B, then the
# CVSRevision.unique_key() corresponding to this file at 1.3 would
# list at least B in its list.
SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'

# Maps CVSRevision.unique_key() to the corresponding line in s-revs.
###PERF Or, we could map to an offset into s-revs, instead of dup'ing
### the s-revs data in this database.
CVS_REVS_DB = 'cvs2svn-cvs-revs.db'

# Lists all symbolic names that are tags.  Keys are strings (symbolic
# names), values are ignorable.
TAGS_DB = 'cvs2svn-tags.db'

# A list of all tags.  Each line consists of the tag name and the
# number of files in which it exists, separated by a space.
TAGS_LIST = 'cvs2svn-tags.txt'

# A list of all branches.  The file is stored as a plain text file
# to make it easy to look at in an editor.  Each line contains the
# branch name, the number of files where the branch is created, the
# commit count, and a list of tags and branches that are defined on
# revisions in the branch.
BRANCHES_LIST = 'cvs2svn-branches.txt'

# These two databases provide a bidirectional mapping between
# CVSRevision.unique_key()s and Subversion revision numbers.
#
# The first maps CVSRevision.unique_key() to a number; the values are
# not unique.
#
# The second maps a number to a list of CVSRevision.unique_key()s.
CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'

# This database maps svn_revnums to tuples of (symbolic_name, date).
#
# The svn_revnums are the revision numbers of all non-primary
# SVNCommits.  No primary SVNCommit has a key in this database.
#
# The date is stored for all commits in this database.
#
# For commits that fill symbolic names, the symbolic_name is stored.
# For commits that are default branch syncs, the symbolic_name is None.
SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'

# This database maps svn_revnums of a default branch synchronization
# commit to the svn_revnum of the primary SVNCommit that motivated it.
#
# (NOTE: Secondary commits that fill branches and tags also have a
# motivating commit, but we do not record it because it is (currently)
# not needed for anything.)
#
# This mapping is used when generating the log message for the commit
# that synchronizes the default branch with trunk.
MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'
 227
# How many bytes to read at a time from a pipe.  128 kiB should be
# large enough to be efficient without wasting too much memory.
PIPE_READ_SIZE = 128 * 1024

# Record the default RCS branches, if any, for CVS filepaths.
#
# The keys are CVS filepaths, relative to the top of the repository
# and with the ",v" stripped off, so they match the cvs paths used in
# Commit.commit().  The values are vendor branch revisions, such as
# '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
# represents the highest vendor branch revision thought to have ever
# been head of the default branch.
#
# The reason we record a specific vendor revision, rather than a
# default branch number, is that there are two cases to handle:
#
# One case is simple.  The RCS file lists a default branch explicitly
# in its header, such as '1.1.1'.  In this case, we know that every
# revision on the vendor branch is to be treated as head of trunk at
# that point in time.
#
# But there's also a degenerate case.  The RCS file does not currently
# have a default branch, yet we can deduce that for some period in the
# past it probably *did* have one.  For example, the file has vendor
# revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
# and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
# case, we should record 1.1.1.96 as the last vendor revision to have
# been the head of the default branch.
DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'

# Records the author and log message for each changeset.
# The keys are author+log digests, the same kind used to identify
# unique revisions in the .revs, etc files.  Each value is a tuple
# of two elements: '(author logmessage)'.
METADATA_DB = "cvs2svn-metadata.db"

# File name suffixes.  Presumably these are the "different suffixes"
# that DATAFILE takes on at different stages of processing -- TODO
# confirm against the code that builds those file names.
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
RESYNC_SUFFIX = '.resync'

# NOTE(review): looks like a marker for "no valid Subversion revision
# number"; confirm against its users.
SVN_INVALID_REVNUM = -1

COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP   = '-'
OP_ADD    = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY    = 'E'

# sha.digestsize * 2 is the length of a SHA digest written in hex.
# NOTE(review): the 9 presumably covers an 8-hex-digit timestamp plus
# the space that separates it from the digest in a revs line -- TODO
# confirm.
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'
 288
 289 def temp(basename):
 290   """Return a path to BASENAME in Ctx().tmpdir.
 291   This is a convenience function to save horizontal space in source."""
 292   return os.path.join(Ctx().tmpdir, basename)
 293
 294 # Since the unofficial set also includes [/\] we need to translate those
 295 # into ones that don't conflict with Subversion limitations.
 296 def _clean_symbolic_name(name):
 297   """Return symbolic name NAME, translating characters that Subversion
 298   does not allow in a pathname."""
 299   name = name.replace('/','++')
 300   name = name.replace('\\','--')
 301   return name
 302
 303 def _path_join(*components):
 304   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 305   Empty component are skipped."""
 306   return string.join(filter(None, components), '/')
 307
 308 def run_command(command):
 309   if os.system(command):
 310     sys.exit('Command failed: "%s"' % command)
 311
 312 def relative_name(cvsroot, fname):
 313   l = len(cvsroot)
 314   if fname[:l] == cvsroot:
 315     if fname[l] == os.sep:
 316       return string.replace(fname[l+1:], os.sep, '/')
 317     return string.replace(fname[l:], os.sep, '/')
 318   sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
 319                    " cvsroot\n" % (error_prefix, cvsroot, fname))
 320   sys.exit(1)
 321
 322 def get_co_pipe(c_rev):
 323   """Return a command string, and the pipe created using that string.
 324   C_REV is a CVSRevision. The pipe returns the text of that CVS Revision."""
 325   ctx = Ctx()
 326   if ctx.use_cvs:
 327     pipe_cmd = 'cvs %s co -r%s -p %s' % \
 328                (ctx.cvs_global_arguments, c_rev.rev,
 329                 escape_shell_arg(ctx.cvs_module + c_rev.cvs_path))
 330   else:
 331     pipe_cmd = 'co -q -x,v -p%s %s' % \
 332                (c_rev.rev, escape_shell_arg(c_rev.rcs_path()))
 333   pipe = Popen3(pipe_cmd, True)
 334   pipe.tochild.close()
 335   return pipe_cmd, pipe
 336
 337 def generate_ignores(c_rev):
 338   # Read in props
 339   pipe_cmd, pipe = get_co_pipe(c_rev)
 340   buf = pipe.fromchild.read(PIPE_READ_SIZE)
 341   raw_ignore_val = ""
 342   while buf:
 343     raw_ignore_val = raw_ignore_val + buf
 344     buf = pipe.fromchild.read(PIPE_READ_SIZE)
 345   pipe.fromchild.close()
 346   error_output = pipe.childerr.read()
 347   exit_status = pipe.wait()
 348   if exit_status:
 349     sys.exit("%s: The command '%s' failed with exit status: %s\n"
 350              "and the following output:\n"
 351              "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
 352
 353   # Tweak props: First, convert any spaces to newlines...
 354   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 355   raw_ignores = raw_ignore_val.split('\n')
 356   ignore_vals = [ ]
 357   for ignore in raw_ignores:
 358     # Reset the list if we encounter a '!'
 359     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 360     if ignore == '!':
 361       ignore_vals = [ ]
 362       continue
 363     # Skip empty lines
 364     if len(ignore) == 0:
 365       continue
 366     ignore_vals.append(ignore)
 367   return ignore_vals
 368
 369 # Return a string that has not been returned by gen_key() before.
 370 gen_key_base = 0L
 371 def gen_key():
 372   global gen_key_base
 373   key = '%x' % gen_key_base
 374   gen_key_base = gen_key_base + 1
 375   return key
 376
 377 if sys.platform == "win32":
 378   def escape_shell_arg(str):
 379     return '"' + string.replace(str, '"', '"^""') + '"'
 380 else:
 381   def escape_shell_arg(str):
 382     return "'" + string.replace(str, "'", "'\\''") + "'"
 383
 384 def format_date(date):
 385   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 386   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 387   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 388
 389 def sort_file(infile, outfile):
 390   # sort the log files
 391
 392   # GNU sort will sort our dates differently (incorrectly!) if our
 393   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 394   # it to 'C'
 395   if os.environ.has_key('LC_ALL'):
 396     lc_all_tmp = os.environ['LC_ALL']
 397   else:
 398     lc_all_tmp = None
 399   os.environ['LC_ALL'] = 'C'
 400   # The -T option to sort has a nice side effect.  The Win32 sort is
 401   # case insensitive and cannot be used, and since it does not
 402   # understand the -T option and dies if we try to use it, there is
 403   # no risk that we use that sort by accident.
 404   run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 405   if lc_all_tmp is None:
 406     del os.environ['LC_ALL']
 407   else:
 408     os.environ['LC_ALL'] = lc_all_tmp
 409
 410 def print_node_tree(tree, root_node, indent_depth=0):
 411   """For debugging purposes.  Prints all nodes in TREE that are
 412   rooted at ROOT_NODE.  INDENT_DEPTH is merely for purposes of
 413   debugging with the print statement in this function."""
 414   if not indent_depth:
 415     print "TREE", "=" * 75
 416   print "TREE:", " " * (indent_depth * 2), root_node, tree[root_node]
 417   for key, value in tree[root_node].items():
 418     if key[0] == '/': #Skip flags
 419       continue
 420     print_node_tree(tree, value, (indent_depth + 1))
 421
 422 def match_regexp_list(regexp_list, string):
 423   """Return 1 if string matches any of the compiled regexps in REGEXP_LIST,
 424   else return None."""
 425   for regexp in regexp_list:
 426     if regexp.match(string):
 427       return 1
 428
 429 # These constants represent the log levels that this script supports
 430 LOG_WARN = -1
 431 LOG_QUIET = 0
 432 LOG_NORMAL = 1
 433 LOG_VERBOSE = 2
 434 class Log:
 435   """A Simple logging facility.  Each line will be timestamped is
 436   self.use_timestamps is TRUE.  This class is a Borg, see
 437   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 438   __shared_state = {}
 439   def __init__(self):
 440     self.__dict__ = self.__shared_state
 441     if self.__dict__:
 442       return
 443     self.log_level = LOG_NORMAL
 444     # Set this to true if you want to see timestamps on each line output.
 445     self.use_timestamps = None
 446     self.logger = sys.stdout
 447
 448   def _timestamp(self):
 449     """Output a detailed timestamp at the beginning of each line output."""
 450     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 451
 452   def write(self, log_level, *args):
 453     """This is the public method to use for writing to a file.  Only
 454     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 455     there are multiple ARGS, they will be separated by a space."""
 456     if log_level > self.log_level:
 457       return
 458     if self.use_timestamps:
 459       self._timestamp()
 460     self.logger.write(' '.join(map(str,args)) + "\n")
 461     # Ensure that log output doesn't get out-of-order with respect to
 462     # stderr output.
 463     self.logger.flush()
 464
 465
 466 class Cleanup:
 467   """This singleton class manages any files created by cvs2svn.  When
 468   you first create a file, call Cleanup.register, passing the
 469   filename, and the last pass that you need the file.  After the end
 470   of that pass, your file will be cleaned up after running an optional
 471   callback.  This class is a Borg, see
 472   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 473
 474   __shared_state = {}
 475   def __init__(self):
 476     self.__dict__ = self.__shared_state
 477     if self.__dict__:
 478       return
 479     self._log = {}
 480     self._callbacks = {}
 481
 482   def register(self, file, which_pass, callback=None):
 483     """Register FILE for cleanup at the end of WHICH_PASS, running
 484     function CALLBACK prior to removal.  Registering a given FILE is
 485     idempotent; you may register as many times as you wish, but it
 486     will only be cleaned up once.
 487
 488     Note that if a file is registered multiple times, only the first
 489     callback registered for that file will be called at cleanup
 490     time.  Also note that if you register a database file you must
 491     close the database before cleanup, e.g. using a callback."""
 492     if not self._log.has_key(which_pass):
 493       self._log[which_pass] = {}
 494     self._log[which_pass][file] = 1
 495     if callback and not self._callbacks.has_key(file):
 496       self._callbacks[file] = callback
 497
 498   def cleanup(self, which_pass):
 499     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 500     if not self._log.has_key(which_pass):
 501       return
 502     for file in self._log[which_pass].keys():
 503       Log().write(LOG_VERBOSE, "Deleting", file)
 504       if self._callbacks.has_key(file):
 505         self._callbacks[file]()
 506       os.unlink(file)
 507
 508
 509 # Always use these constants for opening databases.
 510 DB_OPEN_READ = 'r'
 511 DB_OPEN_NEW = 'n'
 512
 513 # A wrapper for anydbm that uses the marshal module to store items as
 514 # strings.
 515 class Database:
 516   def __init__(self, filename, mode):
 517     # pybsddb3 has a bug which prevents it from working with
 518     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 519     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 520     # for databases protected by lock and transaction support
 521     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 522     #
 523     # Therefore, manually perform the removal (we can do this, because
 524     # we know that for bsddb - but *not* anydbm in general - the database
 525     # consists of one file with the name we specify, rather than several
 526     # based on that name).
 527     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 528       if os.path.isfile(filename):
 529         os.unlink(filename)
 530       mode = 'c'
 531
 532     self.db = anydbm.open(filename, mode)
 533
 534   def has_key(self, key):
 535     return self.db.has_key(key)
 536
 537   def __getitem__(self, key):
 538     return marshal.loads(self.db[key])
 539
 540   def __setitem__(self, key, value):
 541     self.db[key] = marshal.dumps(value)
 542
 543   def __delitem__(self, key):
 544     del self.db[key]
 545
 546   def get(self, key, default):
 547     if self.has_key(key):
 548       return self.__getitem__(key)
 549     return default
 550
 551
 552 class StatsKeeper:
 553   __shared_state = { }
 554   def __init__(self):
 555     self.__dict__ = self.__shared_state
 556     if self.__dict__:
 557       return
 558     self.filename = temp(STATISTICS_FILE)
 559     Cleanup().register(self.filename, pass8)
 560     # This can get kinda large, so we don't store it in our data dict.
 561     self.repos_files = { }
 562
 563     if os.path.exists(self.filename):
 564       self.unarchive()
 565     else:
 566       self.data = { 'cvs_revs_count' : 0,
 567                     'tags': { },
 568                     'branches' : { },
 569                     'repos_size' : 0,
 570                     'repos_file_count' : 0,
 571                     'svn_rev_count' : None,
 572                     'first_rev_date' : 1L<<32,
 573                     'last_rev_date' : 0,
 574                     'pass_timings' : { },
 575                     'start_time' : 0,
 576                     'end_time' : 0,
 577                     }
 578
 579   def log_duration_for_pass(self, duration, pass_num):
 580     self.data['pass_timings'][pass_num] = duration
 581
 582   def set_start_time(self, start):
 583     self.data['start_time'] = start
 584
 585   def set_end_time(self, end):
 586     self.data['end_time'] = end
 587
 588   def _bump_item(self, key, amount=1):
 589     self.data[key] = self.data[key] + amount
 590
 591   def reset_c_rev_info(self):
 592     self.data['cvs_revs_count'] = 0
 593     self.data['tags'] = { }
 594     self.data['branches'] = { }
 595
 596   def record_c_rev(self, c_rev):
 597     self._bump_item('cvs_revs_count')
 598
 599     for tag in c_rev.tags:
 600       self.data['tags'][tag] = None
 601     for branch in c_rev.branches:
 602       self.data['branches'][branch] = None
 603
 604     if c_rev.timestamp < self.data['first_rev_date']:
 605       self.data['first_rev_date'] = c_rev.timestamp
 606
 607     if c_rev.timestamp > self.data['last_rev_date']:
 608       self.data['last_rev_date'] = c_rev.timestamp
 609
 610     # Only add the size if this is the first time we see the file.
 611     if not self.repos_files.has_key(c_rev.fname):
 612       self._bump_item('repos_size', c_rev.file_size)
 613     self.repos_files[c_rev.fname] = None
 614
 615     self.data['repos_file_count'] = len(self.repos_files)
 616
 617   def set_svn_rev_count(self, count):
 618     self.data['svn_rev_count'] = count
 619
 620   def svn_rev_count(self):
 621     return self.data['svn_rev_count']
 622
 623   def archive(self):
 624     open(self.filename, 'w').write(marshal.dumps(self.data))
 625
 626   def unarchive(self):
 627     self.data = marshal.loads(open(self.filename, 'r').read())
 628
 629   def __str__(self):
 630     svn_revs_str = ""
 631     if self.data['svn_rev_count'] is not None:
 632       svn_revs_str = ('Total SVN Commits:      %10s\n'
 633                       % self.data['svn_rev_count'])
 634
 635     return ('\n'                                \
 636             'cvs2svn Statistics:\n'             \
 637             '------------------\n'              \
 638             'Total CVS Files:        %10i\n'    \
 639             'Total CVS Revisions:    %10i\n'    \
 640             'Total Unique Tags:      %10i\n'    \
 641             'Total Unique Branches:  %10i\n'    \
 642             'CVS Repos Size in KB:   %10i\n'    \
 643             '%s'                                \
 644             'First Revision Date:    %s\n'      \
 645             'Last Revision Date:     %s\n'      \
 646             '------------------'                \
 647             % (self.data['repos_file_count'],
 648                self.data['cvs_revs_count'],
 649                len(self.data['tags']),
 650                len(self.data['branches']),
 651                (self.data['repos_size'] / 1024),
 652                svn_revs_str,
 653                time.ctime(self.data['first_rev_date']),
 654                time.ctime(self.data['last_rev_date']),
 655                ))
 656
 657   def timings(self):
 658     passes = self.data['pass_timings'].keys()
 659     passes.sort()
 660     str = 'Timings:\n------------------\n'
 661
 662     def desc(val):
 663       if val == 1: return "second"
 664       return "seconds"
 665
 666     for pass_num in passes:
 667       duration = int(self.data['pass_timings'][pass_num])
 668       p_str = ('pass %d:%6d %s\n'
 669                % (pass_num, duration, desc(duration)))
 670       str = str + p_str
 671
 672     total = int(self.data['end_time'] - self.data['start_time'])
 673     str = str + ('total: %6d %s' % (total, desc(total)))
 674     return str
 675
 676
 677 class LastSymbolicNameDatabase:
 678   """ Passing every CVSRevision in s-revs to this class will result in
 679   a Database whose key is the last CVS Revision a symbolicname was
 680   seen in, and whose value is a list of all symbolicnames that were
 681   last seen in that revision."""
 682   def __init__(self, mode):
 683     self.symbols = {}
 684     self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
 685     Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
 686
 687   # Once we've gone through all the revs,
 688   # symbols.keys() will be a list of all tags and branches, and
 689   # their corresponding values will be a key into the last CVS revision
 690   # that they were used in.
 691   def log_revision(self, c_rev):
 692     # Gather last CVS Revision for symbolic name info and tag info
 693     for tag in c_rev.tags:
 694       self.symbols[tag] = c_rev.unique_key()
 695     if c_rev.op is not OP_DELETE:
 696       for branch in c_rev.branches:
 697         self.symbols[branch] = c_rev.unique_key()
 698
 699   # Creates an inversion of symbols above--a dictionary of lists (key
 700   # = CVS rev unique_key: val = list of symbols that close in that
 701   # rev.
 702   def create_database(self):
 703     for sym, rev_unique_key in self.symbols.items():
 704       if self.symbol_revs_db.has_key(rev_unique_key):
 705         ary = self.symbol_revs_db[rev_unique_key]
 706         ary.append(sym)
 707         self.symbol_revs_db[rev_unique_key] = ary
 708       else:
 709         self.symbol_revs_db[rev_unique_key] = [sym]
 710
 711
 712 class CVSRevisionDatabase:
 713   """A Database to store CVSRevision objects and retrieve them by their
 714   unique_key()."""
 715
 716   def __init__(self, mode):
 717     """Initialize an instance, opening database in MODE (like the MODE
 718     argument to Database or anydbm.open())."""
 719     self.cvs_revs_db = Database(temp(CVS_REVS_DB), mode)
 720     Cleanup().register(temp(CVS_REVS_DB), pass8)
 721
 722   def log_revision(self, c_rev):
 723     """Add C_REV, a CVSRevision, to the database."""
 724     self.cvs_revs_db[c_rev.unique_key()] = str(c_rev)
 725
 726   def get_revision(self, unique_key):
 727     """Return the CVSRevision stored under UNIQUE_KEY."""
 728     return CVSRevision(Ctx(), self.cvs_revs_db[unique_key])
 729
 730
 731 class TagsDatabase(Database):
 732   """A Database to store which symbolic names are tags.
 733   Each key is a tag name.
 734   The value has no meaning, and should be set to None."""
 735   def __init__(self, mode):
 736     Database.__init__(self, temp(TAGS_DB), mode)
 737     Cleanup().register(temp(TAGS_DB), pass8)
 738
 739
 740 class CVSRevision:
 741   def __init__(self, ctx, *args):
 742     """Initialize a new CVSRevision with Ctx object CTX, and ARGS.
 743
 744     If CTX is None, the following members and methods of the
 745     instantiated CVSRevision class object will be unavailable (or
 746     simply will not work correctly, if at all):
 747        cvs_path
 748        svn_path
 749        svn_trunk_path
 750        is_default_branch_revision()
 751
 752     (Note that this class treats CTX as const, because the caller
 753     likely passed in a Borg instance of a Ctx.  The reason this class
 754     takes CTX as as a parameter, instead of just instantiating a Ctx
 755     itself, is that this class should be usable outside cvs2svn.)
 756
 757     If there is one argument in ARGS, it is a string, in the format of
 758     a line from a revs file.  Do *not* include a trailing newline.
 759
 760     If there are multiple ARGS, there must be 16 of them,
 761     comprising a parsed revs line:
 762        timestamp       -->  (int) date stamp for this cvs revision
 763        digest          -->  (string) digest of author+logmsg
 764        prev_timestamp  -->  (int) date stamp for the previous cvs revision
 765        op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
 766        prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
 767        rev             -->  (string) this CVS rev, e.g., "1.3"
 768        next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
 769        file_in_attic   -->  (char or None) true if RCS file is in Attic
 770        file_executable -->  (char or None) true if RCS file has exec bit set.
 771        file_size       -->  (int) size of the RCS file
 772        deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
 773        mode            -->  (string or None) "kkv", "kb", etc.
 774        branch_name     -->  (string or None) branch on which this rev occurred
 775        tags            -->  (list of strings) all tags on this revision
 776        branches        -->  (list of strings) all branches rooted in this rev
 777        fname           -->  (string) relative path of file in CVS repos
 778
 779     The two forms of initialization are equivalent."""
 780
 781     self._ctx = ctx
 782     if len(args) == 16:
 783       (self.timestamp, self.digest, self.prev_timestamp, self.op,
 784        self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
 785        self.file_executable, self.file_size, self.deltatext_code, self.fname,
 786        self.mode, self.branch_name, self.tags, self.branches) = args
 787     elif len(args) == 1:
 788       data = args[0].split(' ', 14)
 789       self.timestamp = int(data[0], 16)
 790       self.digest = data[1]
 791       if data[2] == "*":
 792         self.prev_timestamp = 0
 793       else:
 794         self.prev_timestamp = int(data[2])
 795       self.op = data[3]
 796       self.prev_rev = data[4]
 797       if self.prev_rev == "*":
 798         self.prev_rev = None
 799       self.rev = data[5]
 800       self.next_rev = data[6]
 801       if self.next_rev == "*":
 802         self.next_rev = None
 803       self.file_in_attic = data[7]
 804       if self.file_in_attic == "*":
 805         self.file_in_attic = None
 806       self.file_executable = data[8]
 807       if self.file_executable == "*":
 808         self.file_executable = None
 809       self.file_size = int(data[9])
 810       self.deltatext_code = data[10]
 811       self.mode = data[11]
 812       if self.mode == "*":
 813         self.mode = None
 814       self.branch_name = data[12]
 815       if self.branch_name == "*":
 816         self.branch_name = None
 817       ntags = int(data[13])
 818       tags = data[14].split(' ', ntags + 1)
 819       nbranches = int(tags[ntags])
 820       branches = tags[ntags + 1].split(' ', nbranches)
 821       self.fname = branches[nbranches]
 822       self.tags = tags[:ntags]
 823       self.branches = branches[:nbranches]
 824     else:
 825       raise TypeError, 'CVSRevision() takes 2 or 16 arguments (%d given)' % \
 826           (len(args) + 1)
 827     if ctx is not None:
 828       self.cvs_path = relative_name(self._ctx.cvsroot, self.fname[:-2])
 829       self.svn_path = self._make_path(self.cvs_path, self.branch_name)
 830       self.svn_trunk_path = self._make_path(self.cvs_path)
 831
 832   # The 'primary key' of a CVS Revision is the revision number + the
 833   # filename.  To provide a unique key (say, for a dict), we just glom
 834   # them together in a string.  By passing in self.prev_rev or
 835   # self.next_rev, you can get the unique key for their respective
 836   # CVSRevisions.
 837   def unique_key(self, revnum=None):
 838     if revnum is None:
 839       revnum = self.rev
 840     return revnum + "/" + self.fname
 841
 842   def __str__(self):
 843     return ('%08lx %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s' % (
 844       self.timestamp, self.digest, self.prev_timestamp or "*", self.op,
 845       (self.prev_rev or "*"), self.rev, (self.next_rev or "*"),
 846       (self.file_in_attic or "*"), (self.file_executable or "*"),
 847       self.file_size,
 848       self.deltatext_code, (self.mode or "*"), (self.branch_name or "*"),
 849       len(self.tags), self.tags and " " or "", " ".join(self.tags),
 850       len(self.branches), self.branches and " " or "", " ".join(self.branches),
 851       self.fname, ))
 852
 853   # Returns true if this CVSRevision is the opening CVSRevision for
 854   # NAME (for this RCS file).
 855   def opens_symbolic_name(self, name):
 856     if name in self.tags:
 857       return 1
 858     if name in self.branches:
 859       # If this c_rev opens a branch and our op is OP_DELETE, then
 860       # that means that the file that this c_rev belongs to was
 861       # created on the branch, so for all intents and purposes, this
 862       # c_rev is *technically* not an opening.  See Issue #62 for more
 863       # information.
 864       if self.op != OP_DELETE:
 865         return 1
 866     return 0
 867
 868   def is_default_branch_revision(self):
 869     """Return 1 if SELF.rev of SELF.cvs_path is a default branch
 870     revision according to DEFAULT_BRANCHES_DB (see the conditions
 871     documented there), else return None."""
 872     if self._ctx._default_branches_db.has_key(self.cvs_path):
 873       val = self._ctx._default_branches_db[self.cvs_path]
 874       val_last_dot = val.rindex(".")
 875       our_last_dot = self.rev.rindex(".")
 876       default_branch = val[:val_last_dot]
 877       our_branch = self.rev[:our_last_dot]
 878       default_rev_component = int(val[val_last_dot + 1:])
 879       our_rev_component = int(self.rev[our_last_dot + 1:])
 880       if (default_branch == our_branch
 881           and our_rev_component <= default_rev_component):
 882         return 1
 883     # else
 884     return None
 885
 886   def _make_path(self, path, branch_name = None):
 887     """Return the trunk path or branch path for PATH.
 888
 889     If PATH is None, return None."""
 890     # For a while, we treated each top-level subdir of the CVS
 891     # repository as a "project root" and interpolated the appropriate
 892     # genealogy (trunk|tag|branch) in according to the official
 893     # recommended layout.  For example, the path '/foo/bar/baz.c' on
 894     # branch 'Rel2' would become
 895     #
 896     #   /foo/branches/Rel2/bar/baz.c
 897     #
 898     # and on trunk it would become
 899     #
 900     #   /foo/trunk/bar/baz.c
 901     #
 902     # However, we went back to the older and simpler method of just
 903     # prepending the genealogy to the front, instead of interpolating.
 904     # So now we produce:
 905     #
 906     #   /branches/Rel2/foo/bar/baz.c
 907     #   /trunk/foo/bar/baz.c
 908     #
 909     # Why?  Well, Jack Repenning pointed out that this way is much
 910     # friendlier to "anonymously rooted subtrees" (that's a tree where
 911     # the name of the top level dir doesn't matter, the point is that if
 912     # you cd into it and, say, run 'make', something good will happen).
 913     # By interpolating, we made it impossible to point cvs2svn at some
 914     # subdir in the CVS repository and convert it as a project, because
 915     # we'd treat every subdir underneath it as an independent project
 916     # root, which is probably not what the user wanted.
 917     #
 918     # Also, see Blair Zajac's post
 919     #
 920     #    http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
 921     #
 922     # and the surrounding thread, for why what people really want is a
 923     # way of specifying an in-repository prefix path, not interpolation.
 924     if path is None:
 925       return None
 926
 927     if branch_name:
 928       branch_name = _clean_symbolic_name(branch_name)
 929       return self._ctx.branches_base + '/' + branch_name + '/' + path
 930     else:
 931       return self._ctx.trunk_base + '/' + path
 932
 933   def rcs_path(self):
 934     """Returns the actual filesystem path to the RCS file of this
 935     CVSRevision."""
 936     if self.file_in_attic is None:
 937       return self.fname
 938     else:
 939       basepath, filename = os.path.split(self.fname)
 940       return os.path.join(basepath, 'Attic', filename)
 941
 942   def filename(self):
 943     "Return the last path component of self.fname, minus the ',v'"
 944     return os.path.split(self.fname)[-1][:-2]
 945
 946 class SymbolDatabase:
 947   """This database records information on all symbols in the RCS
 948   files.  It is created in pass 1 and it is used in pass 2."""
 949   def __init__(self):
 950     # A hash that maps tag names to commit counts
 951     self.tags = { }
 952     # A hash that maps branch names to lists of the format
 953     # [ create_count, commit_count, blockers ], where blockers
 954     # is a hash that lists the symbols that depend on the
 955     # the branch.  The blockers hash is used as a set, so the
 956     # values are not used.
 957     self.branches = { }
 958
 959   def register_tag_creation(self, name):
 960     """Register the creation of the tag NAME."""
 961     if not self.tags.has_key(name):
 962       self.tags[name] = 0
 963     self.tags[name] += 1
 964
 965   def _branch(self, name):
 966     """Helper function to get a branch node that will create and
 967     initialize the node if it does not exist."""
 968     if not self.branches.has_key(name):
 969       self.branches[name] = [ 0, 0, { } ]
 970     return self.branches[name]
 971
 972   def register_branch_creation(self, name):
 973     """Register the creation of the branch NAME."""
 974     self._branch(name)[0] += 1
 975
 976   def register_branch_commit(self, name):
 977     """Register a commit on the branch NAME."""
 978     self._branch(name)[1] += 1
 979
 980   def register_branch_blocker(self, name, blocker):
 981     """Register BLOCKER as a blocker on the branch NAME."""
 982     self._branch(name)[2][blocker] = None
 983
 984   def branch_has_commit(self, name):
 985     """Return non-zero if NAME has commits.  Returns 0 if name
 986     is not a branch or if it has no commits."""
 987     return self.branches.has_key(name) and self.branches[name][1]
 988
 989   def find_excluded_symbols(self, regexp_list):
 990     """Returns a hash of all symbols thaht match the regexps in
 991     REGEXP_LISTE.  The hash is used as a set so the values are
 992     not used."""
 993     excludes = { }
 994     for tag in self.tags.keys():
 995       if match_regexp_list(regexp_list, tag):
 996         excludes[tag] = None
 997     for branch in self.branches.keys():
 998       if match_regexp_list(regexp_list, branch):
 999         excludes[branch] = None
1000     return excludes
1001
1002   def find_branch_exclude_blockers(self, branch, excludes):
1003     """Find all blockers of BRANCH, excluding the ones in the hash
1004     EXCLUDES."""
1005     blockers = { }
1006     if excludes.has_key(branch):
1007       for blocker in self.branches[branch][2]:
1008         if not excludes.has_key(blocker):
1009           blockers[blocker] = None
1010     return blockers
1011
1012   def find_blocked_excludes(self, excludes):
1013     """Find all branches not in EXCLUDES that have blocking symbols that
1014     are not themselves excluded.  Return a hash that maps branch names
1015     to a hash of blockers.  The hash of blockes is used as a set so the
1016     values are not used."""
1017     blocked_branches = { }
1018     for branch in self.branches.keys():
1019       blockers = self.find_branch_exclude_blockers(branch, excludes)
1020       if blockers:
1021         blocked_branches[branch] = blockers
1022     return blocked_branches
1023
1024   def find_mismatches(self, excludes=None):
1025     """Find all symbols that are defined as both tags and branches,
1026     excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
1027     the symbol name, tag count, branch count and commit count."""
1028     if excludes is None:
1029       excludes = { }
1030     mismatches = [ ]
1031     for branch in self.branches.keys():
1032       if not excludes.has_key(branch) and self.tags.has_key(branch):
1033         mismatches.append((branch,                    # name
1034                            self.tags[branch],         # tag count
1035                            self.branches[branch][0],  # branch count
1036                            self.branches[branch][1])) # commit count
1037     return mismatches
1038
1039   def read(self):
1040     """Read the symbol database from files."""
1041     f = open(temp(TAGS_LIST))
1042     while 1:
1043       line = f.readline()
1044       if not line:
1045         break
1046       tag, count = line.split()
1047       self.tags[tag] = int(count)
1048
1049     f = open(temp(BRANCHES_LIST))
1050     while 1:
1051       line = f.readline()
1052       if not line:
1053         break
1054       words = line.split()
1055       self.branches[words[0]] = [ int(words[1]), int(words[2]), { } ]
1056       for blocker in words[3:]:
1057         self.branches[words[0]][2][blocker] = None
1058
1059   def write(self):
1060     """Store the symbol database to files."""
1061     f = open(temp(TAGS_LIST), "w")
1062     Cleanup().register(temp(TAGS_LIST), pass2)
1063     for tag, count in self.tags.items():
1064       f.write("%s %d\n" % (tag, count))
1065
1066     f = open(temp(BRANCHES_LIST), "w")
1067     Cleanup().register(temp(BRANCHES_LIST), pass2)
1068     for branch, info in self.branches.items():
1069       f.write("%s %d %d" % (branch, info[0], info[1]))
1070       if info[2]:
1071         f.write(" ")
1072         f.write(" ".join(info[2].keys()))
1073       f.write("\n")
1074
class CollectData(cvs2svn_rcsparse.Sink):
  """A cvs2svn_rcsparse Sink that collects data about RCS files.

  For every revision it is told about it writes one line to the revs
  file; it also writes resync lines for revisions whose timestamps it
  had to adjust, and maintains the default branches db, the metadata
  db and a SymbolDatabase.

  Per-file state is reset by set_fname().  (Presumably the other
  methods are then invoked by the RCS parser for that file -- verify
  against the caller.)"""

  def __init__(self):
    """Open the output files and databases that collected data goes
    into, and register each of them with Cleanup()."""
    self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
    self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
    self.default_branches_db = Database(temp(DEFAULT_BRANCHES_DB), DB_OPEN_NEW)
    Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
    Cleanup().register(temp(METADATA_DB), pass8)
    self.fatal_errors = []
    self.num_files = 0
    self.symbol_db = SymbolDatabase()

    # 1 if we've collected data for at least one file, None otherwise.
    self.found_valid_file = None

    # See set_fname() for initializations of other variables.

  def set_fname(self, canonical_name, filename):
    """Prepare to receive data for FILENAME.  FILENAME is the absolute
    filesystem path to the file in question, and CANONICAL_NAME is
    FILENAME with the 'Attic' component removed (if the file is indeed
    in the Attic) ."""
    self.fname = canonical_name

    # We calculate and save some file metadata here, where we can do
    # it only once per file, instead of waiting until later where we
    # would have to do the same calculations once per CVS *revision*.

    # The [:-2] drops the trailing ',v'.
    self.rel_name = relative_name(Ctx().cvsroot, self.fname)[:-2]

    # If the paths are not the same, then that means that the
    # canonical_name has had the 'Attic' component stripped out.
    self.file_in_attic = None
    if canonical_name != filename:
      self.file_in_attic = 1

    file_stat = os.stat(filename)
    # The size of our file in bytes
    self.file_size = file_stat[stat.ST_SIZE]

    # Whether or not the executable bit is set.
    self.file_executable = None
    if file_stat[stat.ST_MODE] & stat.S_IXUSR:
      self.file_executable = 1

    # revision -> [timestamp, author, old-timestamp]
    self.rev_data = { }

    # Maps revision number (key) to the revision number of the
    # previous revision along this line of development.
    #
    # For the first revision R on a branch, we consider the revision
    # from which R sprouted to be the 'previous'.
    #
    # Note that this revision can't be determined arithmetically (due
    # to cvsadmin -o, which is why this is necessary).
    self.prev_rev = { }

    # This dict is essentially self.prev_rev with the values mapped in
    # the other direction, so following key -> value will yield you
    # the next revision number
    self.next_rev = { }

    # Track the state of each revision so that in set_revision_info,
    # we can determine if our op is an add/change/delete.  We can do
    # this because in set_revision_info, we'll have all of the
    # revisions for a file at our fingertips, and we need to examine
    # the state of our prev_rev to determine if we're an add or a
    # change--without the state of the prev_rev, we are unable to
    # distinguish between an add and a change.
    self.rev_state = { }

    # Hash mapping branch numbers, like '1.7.2', to branch names,
    # like 'Release_1_0_dev'.
    self.branch_names = { }

    # RCS flags (used for keyword expansion).
    self.mode = None

    # Hash mapping revision numbers, like '1.7', to lists of names
    # indicating which branches sprout from that revision, like
    # ['Release_1_0_dev', 'experimental_driver', ...].
    self.branchlist = { }

    # Like self.branchlist, but the values are lists of tag names that
    # apply to the key revision.
    self.taglist = { }

    # If set, this is an RCS branch number -- rcsparse calls this the
    # "principal branch", but CVS and RCS refer to it as the "default
    # branch", so that's what we call it, even though the rcsparse API
    # setter method is still 'set_principal_branch'.
    self.default_branch = None

    # If the RCS file doesn't have a default branch anymore, but does
    # have vendor revisions, then we make an educated guess that those
    # revisions *were* the head of the default branch up until the
    # commit of 1.2, at which point the file's default branch became
    # trunk.  This records the date at which 1.2 was committed.
    self.first_non_vendor_revision_date = None

    # A list of all symbols defined for the current file.  Used to
    # prevent multiple definitions of a symbol, something which can
    # easily happen when --symbol-transform is used.
    self.defined_symbols = [ ]

  def set_principal_branch(self, branch):
    """Record BRANCH as the default branch of the current file."""
    self.default_branch = branch

  def set_expansion(self, mode):
    """Record MODE as the RCS keyword expansion mode of the current
    file."""
    self.mode = mode

  def set_branch_name(self, branch_number, name):
    """Record that BRANCH_NUMBER is the branch number for branch NAME,
    and that NAME sprouts from the revision obtained by dropping the
    last component of BRANCH_NUMBER.
    BRANCH_NUMBER is an RCS branch number with an odd number of components,
    for example '1.7.2' (never '1.7.0.2').

    If BRANCH_NUMBER already has a name, write a warning to stderr and
    ignore NAME."""
    if branch_number not in self.branch_names:
      self.branch_names[branch_number] = name
      # The branchlist is keyed on the revision number from which the
      # branch sprouts, so strip off the odd final component.
      sprout_rev = branch_number[:branch_number.rfind(".")]
      self.branchlist.setdefault(sprout_rev, []).append(name)
      self.symbol_db.register_branch_creation(name)
    else:
      sys.stderr.write("%s: in '%s':\n"
                       "   branch '%s' already has name '%s',\n"
                       "   cannot also have name '%s', ignoring the latter\n"
                       % (warning_prefix, self.fname, branch_number,
                          self.branch_names[branch_number], name))

  def rev_to_branch_name(self, revision):
    """Return the name of the branch on which REVISION lies.
    REVISION is a non-branch revision number with an even number of
    components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
    For the convenience of callers, REVISION can also be a trunk
    revision such as '1.2', in which case just return None.
    None is also returned if the branch has no recorded name."""
    if trunk_rev.match(revision):
      return None
    return self.branch_names.get(revision[:revision.rindex(".")])

  def add_cvs_branch(self, revision, branch_name):
    """Record the root revision and branch revision for BRANCH_NAME,
    based on REVISION.  REVISION is a CVS branch number having an even
    number of components where the second-to-last is '0'.  For
    example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
    from 1.7 and has branch number 1.7.2."""
    # Drop the second-to-last component: '1.7.0.2' -> '1.7' + '.2'.
    last_dot = revision.rfind(".")
    branch_rev = revision[:last_dot]
    last2_dot = branch_rev.rfind(".")
    branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
    self.set_branch_name(branch_rev, branch_name)

  def define_tag(self, name, revision):
    """Record a bidirectional mapping between symbolic NAME and REVISION.
    REVISION is an unprocessed revision number from the RCS file's
    header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
    This function will determine what kind of symbolic name it is by
    inspection, and record it in the right places.

    NAME is first passed through every (pattern, replacement) pair in
    Ctx().symbol_transforms.  If the resulting name was already
    defined for this file, an error is written to stderr and appended
    to self.fatal_errors, but the symbol is still recorded."""
    for (pattern, replacement) in Ctx().symbol_transforms:
      newname = re.sub(pattern, replacement, name)
      if newname != name:
        Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
                    % (name, newname))
        name = newname
    if name in self.defined_symbols:
      err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
                % (error_prefix, name, self.fname)
      sys.stderr.write(err + "\n")
      self.fatal_errors.append(err)
    self.defined_symbols.append(name)
    if branch_tag.match(revision):
      self.add_cvs_branch(revision, name)
    elif vendor_tag.match(revision):
      self.set_branch_name(revision, name)
    else:
      self.taglist.setdefault(revision, []).append(name)
      self.symbol_db.register_tag_creation(name)

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    """Record the data of REVISION of the current file.

    STATE, TIMESTAMP (converted with int()) and AUTHOR are stored for
    later use.  NEXT is the RCS 'next' revision of REVISION, and
    BRANCHES lists the revisions that get REVISION recorded as their
    previous revision (see the comments below).  The default branches
    db and the branch commit counts are updated as well."""

    # Record the state of our revision for later calculations
    self.rev_state[revision] = state

    # store the rev_data as a list in case we have to jigger the timestamp
    self.rev_data[revision] = [int(timestamp), author, None]

    # When on trunk, the RCS 'next' revision number points to what
    # humans might consider to be the 'previous' revision number.  For
    # example, 1.3's RCS 'next' is 1.2.
    #
    # However, on a branch, the RCS 'next' revision number really does
    # point to what humans would consider to be the 'next' revision
    # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
    #
    # In other words, in RCS, 'next' always means "where to find the next
    # deltatext that you need this revision to retrieve."
    #
    # That said, we don't *want* RCS's behavior here, so we determine
    # whether we're on trunk or a branch and set self.prev_rev
    # accordingly.
    #
    # One last thing.  Note that if REVISION is a branch revision,
    # instead of mapping REVISION to NEXT, we instead map NEXT to
    # REVISION.  Since we loop over all revisions in the file before
    # doing anything with the data we gather here, this 'reverse
    # assignment' effectively does the following:
    #
    # 1. Gives us no 'prev' value for REVISION (in this
    # iteration... it may have been set in a previous iteration)
    #
    # 2. Sets the 'prev' value for the revision with number NEXT to
    # REVISION.  So when we come around to the branch revision whose
    # revision value is NEXT, its 'prev' and 'prev_rev' are already
    # set.
    if trunk_rev.match(revision):
      self.prev_rev[revision] = next
      self.next_rev[next] = revision
    elif next:
      self.prev_rev[next] = revision
      self.next_rev[revision] = next

    for b in branches:
      self.prev_rev[b] = revision

    # Ratchet up the highest vendor head revision, if necessary.
    if self.default_branch:
      default_branch_root = self.default_branch + "."
      # REVISION must start with the branch number and have exactly
      # one more component than it.
      if ((revision.find(default_branch_root) == 0)
          and (default_branch_root.count('.') == revision.count('.'))):
        # This revision is on the default branch, so record that it is
        # the new highest default branch head revision.
        self.default_branches_db[self.rel_name] = revision
    else:
      # No default branch, so make an educated guess.
      if revision == '1.2':
        # This is probably the time when the file stopped having a
        # default branch, so make a note of it.
        self.first_non_vendor_revision_date = timestamp
      else:
        m = vendor_revision.match(revision)
        if m and ((not self.first_non_vendor_revision_date)
                  or (timestamp < self.first_non_vendor_revision_date)):
          # We're looking at a vendor revision, and it wasn't
          # committed after this file lost its default branch, so bump
          # the maximum trunk vendor revision in the permanent record.
          self.default_branches_db[self.rel_name] = revision

    if not trunk_rev.match(revision):
      # Check for unlabeled branches, record them.  We tried to collect
      # all branch names when we parsed the symbolic name header
      # earlier, of course, but that didn't catch unlabeled branches.
      # If a branch is unlabeled, this is our first encounter with it,
      # so we have to record its data now.
      branch_number = revision[:revision.rindex(".")]
      if branch_number not in self.branch_names:
        branch_name = "unlabeled-" + branch_number
        self.set_branch_name(branch_number, branch_name)

      # Register the commit on this non-trunk branch
      branch_name = self.branch_names[branch_number]
      self.symbol_db.register_branch_commit(branch_name)

  def tree_completed(self):
    "The revision tree has been parsed.  Analyze it for consistency."

    # Our algorithm depends upon the timestamps on the revisions occurring
    # monotonically over time.  That is, we want to see rev 1.34 occur in
    # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
    # sorting), and then tried to insert 1.34, we'd be screwed.

    # to perform the analysis, we'll simply visit all of the 'previous'
    # links that we have recorded and validate that the timestamp on the
    # previous revision is before the specified revision

    # if we have to resync some nodes, then we restart the scan. just keep
    # looping as long as we need to restart.
    while 1:
      for current, prev in self.prev_rev.items():
        if not prev:
          # no previous revision exists (i.e. the initial revision)
          continue
        t_c = self.rev_data[current][0]
        t_p = self.rev_data[prev][0]
        if t_p >= t_c:
          # the previous revision occurred later than the current revision.
          # shove the previous revision back in time (and any before it that
          # may need to shift).

          # We sync backwards and not forwards because any given CVS
          # Revision has only one previous revision.  However, a CVS
          # Revision can *be* a previous revision for many other
          # revisions (e.g., a revision that is the source of multiple
          # branches).  This becomes relevant when we do the secondary
          # synchronization in pass 2--we can make certain that we
          # don't resync a revision earlier than its previous
          # revision, but it would be non-trivial to make sure that we
          # don't resync revision R *after* any revisions that have R
          # as a previous revision.
          while t_p >= t_c:
            self.rev_data[prev][0] = t_c - 1    # new timestamp
            self.rev_data[prev][2] = t_p        # old timestamp
            delta = t_c - 1 - t_p
            msg =  "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
                  % (self.rel_name,
                     prev, time.ctime(t_p), delta)
            Log().write(LOG_VERBOSE, msg)
            if (delta > COMMIT_THRESHOLD
                or delta < (COMMIT_THRESHOLD * -1)):
              # (named 'fmt' so as not to shadow the builtin str())
              fmt = "%s: Significant timestamp change for '%s' (%d seconds)"
              Log().write(LOG_WARN, fmt % (warning_prefix, self.rel_name,
                                           delta))
            current = prev
            prev = self.prev_rev[current]
            if not prev:
              break
            t_c = t_c - 1               # self.rev_data[current][0]
            t_p = self.rev_data[prev][0]

          # break from the for-loop
          break
      else:
        # finished the for-loop (no resyncing was performed)
        return

  def set_revision_info(self, revision, log, text):
    """Write the revs-file line for REVISION of the current file.

    LOG is the log message of REVISION and TEXT its deltatext (only
    its truth value is used).  If tree_completed() changed the
    timestamp of REVISION, a line is also written to the resync file.
    The (author, LOG) pair is stored in the metadata db under the
    digest computed from them, unless that digest is already there."""
    timestamp, author, old_ts = self.rev_data[revision]
    digest = sha.new(log + '\0' + author).hexdigest()
    if old_ts:
      # the timestamp on this revision was changed. log it for later
      # resynchronization of other files's revisions that occurred
      # for this time and log message.
      self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))

    # "...Give back one kadam to honor the Hebrew God whose Ark this is."
    #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
    #
    # If revision 1.1 appears to have been created via 'cvs add'
    # instead of 'cvs import', then this file probably never had a
    # default branch, so retroactively remove its record in the
    # default branches db.  The test is that the log message CVS uses
    # for 1.1 in imports is "Initial revision\n" with no period.
    if revision == '1.1' and log != 'Initial revision\n':
      if self.default_branches_db.has_key(self.rel_name):
        del self.default_branches_db[self.rel_name]

    # Look up the previous revision once and use that result
    # throughout.  A revision without an entry is treated like one
    # without a previous revision.
    prev_rev = self.prev_rev.get(revision, None)

    # Get the timestamp of the previous revision (0 if there is none)
    prev_timestamp = self.rev_data.get(prev_rev, [0, None, None])[0]

    # How to tell if a CVSRevision is an add, a change, or a deletion:
    #
    # It's a delete if RCS state is 'dead'
    #
    # It's an add if RCS state is 'Exp.' and
    #      - we either have no previous revision
    #        or
    #      - we have a previous revision whose state is 'dead'
    #
    # Anything else is a change.
    if self.rev_state[revision] == 'dead':
      op = OP_DELETE
    elif prev_rev is None or self.rev_state[prev_rev] == 'dead':
      op = OP_ADD
    else:
      op = OP_CHANGE

    if text:
      deltatext_code = DELTATEXT_NONEMPTY
    else:
      deltatext_code = DELTATEXT_EMPTY

    c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp, op,
                        prev_rev, revision,
                        self.next_rev.get(revision),
                        self.file_in_attic, self.file_executable,
                        self.file_size,
                        deltatext_code, self.fname,
                        self.mode, self.rev_to_branch_name(revision),
                        self.taglist.get(revision, []),
                        self.branchlist.get(revision, []))
    self.revs.write(str(c_rev) + "\n")
    StatsKeeper().record_c_rev(c_rev)

    # has_key() is kept for the db objects; they are not plain dicts.
    if not self.metadata_db.has_key(digest):
      self.metadata_db[digest] = (author, log)

  def parse_completed(self):
    """Finish the current file: register its symbols as blockers of
    the branches they sit on, and count the file in self.num_files."""
    # Walk through all branches and tags and register them with
    # their parent branch in the symbol database.
    for revision, symbols in self.taglist.items() + self.branchlist.items():
      # The parent branch depends only on REVISION, so determine it
      # once per revision.  Symbols on trunk have no parent branch.
      name = self.rev_to_branch_name(revision)
      if name is None:
        continue
      for symbol in symbols:
        self.symbol_db.register_branch_blocker(name, symbol)

    self.num_files = self.num_files + 1

  def write_symbol_db(self):
    """Write the collected symbol database out via its write() method."""
    self.symbol_db.write()
1483
class SymbolingsLogger:
  """Maintain the file of symbol "openings" and "closings".

  Later passes use this file to work out, for each branch or tag, the
  range of Subversion revisions from which every file may validly be
  copied when the symbol is created.

  The "opening" of a symbol on a path is the CVSRevision from which
  the branch/tag sprouts.  The "closing" is the next CVSRevision on
  the same line of development as the opening.

  Example: in 'foo.c', branch BEE has branch number 1.2.2, so it
  sprouts from revision 1.2.  Revision 1.2 is therefore the opening
  of BEE on 'foo.c', and 1.3 is its closing.  Any number of revisions
  may fall chronologically between 1.2 and 1.3 (on other branches of
  'foo.c', even on BEE itself), but 1.3 is the next revision *on the
  same line* as 1.2, and that is what makes it the closing.

  The point of the exercise is to create branches and tags with as
  few copies and deletes as possible.  Suppose 1.2 and 1.3 of foo.c
  become Subversion revisions 17 and 30; that gives a reason to copy
  foo.c to BEE from somewhere in that range.  If 'bar.c' opens and
  closes BEE at Subversion revisions 24 and 39, then the whole branch
  is best copied from a single revision between 24 and 29, inclusive.
  """
  def __init__(self):
    # Final output: one line per opening or closing (see _log).
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
    # Scratch file of closings whose Subversion revision number is
    # not yet known; close() resolves them.
    self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
    Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)

    # Maps a Subversion repository *source* path for which an opening
    # has been seen to the list of symbolic names opened on it.  The
    # dictionary is meant to hold only paths whose CVSRevision is a
    # default branch revision (note, though, that log_revision()
    # records every symbol it sees here).
    self.open_paths_with_default_branches = {}

  def log_revision(self, c_rev, svn_revnum):
    """Record the openings that C_REV provides and, when C_REV has a
    next_rev, the matching closings.

    Openings are written immediately using SVN_REVNUM (deleted
    revisions open nothing).  Closings are only noted in the scratch
    file; their revision numbers are filled in by close()."""
    for raw_name in c_rev.tags + c_rev.branches:
      symbol = _clean_symbolic_name(raw_name)
      self._note_default_branch_opening(c_rev, symbol)
      if c_rev.op != OP_DELETE:
        self._log(symbol, svn_revnum, c_rev.svn_path, OPENING)

      # The revision following this one on the same line closes the
      # symbol.  Its svn_revnum is unknown for now, so just remember
      # its unique key.
      successor = c_rev.next_rev
      if successor is None:
        continue
      closing_key = c_rev.unique_key(successor)
      self.closings.write('%s %s\n' % (symbol, closing_key))

  def _log(self, name, svn_revnum, svn_path, type):
    """Append one line to the symbol_openings_closings file stating
    that SVN_REVNUM of SVN_PATH is the opening or closing (TYPE) of
    the symbolic name NAME.

    TYPE must be one of the global constants OPENING or CLOSING."""
    # Revision numbers are zero-padded to 8 digits, which is enough
    # for 99,999,999 SVN revs.
    fields = (name, svn_revnum, type, svn_path)
    self.symbolings.write('%s %.8d %s %s\n' % fields)

  def close(self):
    """Resolve the pending closings and finish the symbolings file.

    Each line of the closings scratch file names a symbol and the
    unique key of its closing CVSRevision.  Look up the Subversion
    revision number and path for that key and log a proper CLOSING
    line, then close the symbolings file."""
    # Needed to find the svn_path that belongs to each revision key.
    revisions_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    pending = fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP))
    for entry in pending:
      symbol, rev_key = entry.rstrip().split(" ", 1)
      closing_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

      closing_rev = revisions_db.get_revision(rev_key)
      self._log(symbol, closing_revnum, closing_rev.svn_path, CLOSING)

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Remember SYMBOLIC_NAME as opened on C_REV.svn_trunk_path.

    No check is made here that C_REV actually is a default branch
    revision."""
    opened = self.open_paths_with_default_branches
    opened.setdefault(c_rev.svn_trunk_path, []).append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """Close every symbol remembered for C_REV.svn_trunk_path.

    If self.open_paths_with_default_branches has an entry for that
    path, log each of its names as a closing at SVN_REVNUM and drop
    the entry; otherwise do nothing."""
    trunk_path = c_rev.svn_trunk_path
    opened = self.open_paths_with_default_branches
    try:
      names = opened[trunk_path]
    except KeyError:
      return
    for symbol in names:
      self._log(symbol, svn_revnum, trunk_path, CLOSING)
    # These openings are dealt with now; forget them.
    del opened[trunk_path]
1597
1598
class PersistenceManager:
  """Store SVNCommits on disk and fetch them back by revision number.

  SVNCommits are retrieved using only their Subversion revision
  number as the key.  The manager also reports the Subversion
  revision number for a given CVSRevision's unique key.

  Everything pertinent to each SVNCommit lives in a series of on-disk
  databases so that SVNCommits can be retrieved on demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager initializes a new set of on-disk
  databases and is fully-featured.
  In 'read' mode, PersistenceManager opens existing on-disk databases
  and the set_* methods raise RuntimeError."""
  def __init__(self, mode):
    """Open (or create, depending on MODE) the backing databases.

    Raises RuntimeError if MODE is neither DB_OPEN_NEW nor
    DB_OPEN_READ."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
    self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
    Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
    self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
    Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
    self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
    Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
    # The metadata and CVS revision databases are always opened
    # read-only, whatever MODE is.
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      self.tags_db = TagsDatabase(DB_OPEN_READ)
      self.motivating_revnums = Database(temp(MOTIVATING_REVNUMS), mode)
      Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def _require_writable(self):
    """Raise RuntimeError if this manager was opened in read mode."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    Raises SVNCommit.SVNCommitInternalInconsistencyError if the stored
    data gives the commit both CVSRevisions and a symbolic name.
    """
    # Check for existence first, so that a miss neither builds a
    # throw-away SVNCommit nor (for a false SVN_REVNUM) makes
    # SVNCommit allocate a fresh revision number.
    c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
    if c_rev_keys is None:
      return None

    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      # Set the author and log message for this commit by using
      # CVSRevision metadata, but only if haven't done so already.
      if digest is None:
        digest = c_rev.digest
        author, log_msg = self.svn_commit_metadata[digest]
        svn_commit.set_author(author)
        svn_commit.set_log_msg(log_msg)

    # If we're doing a trunk-only conversion, we don't need to do any
    # more work (the databases used below are not even opened then).
    if Ctx().trunk_only:
      return svn_commit

    name, date = self._get_name_and_date(svn_revnum)
    if name:
      svn_commit.set_symbolic_name(name)
      svn_commit.set_date(date)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
    if motivating_revnum:
      svn_commit.set_motivating_revnum(int(motivating_revnum))
      svn_commit.set_date(date)

    if svn_commit.cvs_revs and name:
      msg = """An SVNCommit cannot have cvs_revisions *and* a
      corresponding symbolic name ('%s') to fill.""" % name
      raise SVNCommit.SVNCommitInternalInconsistencyError(msg)

    return svn_commit

  def set_cvs_revs(self, svn_revnum, cvs_revs):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS.

    Raises RuntimeError in read mode."""
    self._require_writable()
    for c_rev in cvs_revs:
      Log().write(LOG_VERBOSE, " ", c_rev.unique_key())
    self.svn2cvs_db[str(svn_revnum)] = [x.unique_key() for x in cvs_revs]
    for c_rev in cvs_revs:
      self.cvs2svn_db[c_rev.unique_key()] = svn_revnum

  def set_name_and_date(self, svn_revnum, name, date):
    """Associate symbolic name NAME and DATE with SVN_REVNUM, and
    note SVN_REVNUM as the revision in which NAME was last filled.

    Raises RuntimeError in read mode."""
    self._require_writable()
    self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
    self.last_filled[name] = svn_revnum

  def _get_name_and_date(self, svn_revnum):
    """Return a tuple containing the symbolic name and date associated
    with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
    associated with it."""
    return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))

  def set_motivating_revnum(self, svn_revnum, motivating_revnum):
    """Store MOTIVATING_REVNUM as the value of SVN_REVNUM.

    Raises RuntimeError in read mode."""
    self._require_writable()
    self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
1722
1723
1724 class CVSCommit:
1725   """Each instance of this class contains a number of CVS Revisions
1726   that correspond to one or more Subversion Commits.  After all CVS
1727   Revisions are added to the grouping, calling process_revisions will
1728   generate a Subversion Commit (or Commits) for the set of CVS
1729   Revisions in the grouping."""
1730
1731   def __init__(self, digest, author, log):
1732     self.digest = digest
1733     self.author = author
1734     self.log = log
1735
1736     # Symbolic names for which the last source revision has already
1737     # been seen and for which the CVSRevisionAggregator has already
1738     # generated a fill SVNCommit.  See self.process_revisions().
1739     self.done_symbols = [ ]
1740
1741     self.files = { }
1742     # Lists of CVSRevisions
1743     self.changes = [ ]
1744     self.deletes = [ ]
1745
1746     # Start out with a t_min higher than any incoming time T, and a
1747     # t_max lower than any incoming T.  This way the first T will
1748     # push t_min down to T, and t_max up to T, naturally (without any
1749     # special-casing), and successive times will then ratchet them
1750     # outward as appropriate.
1751     self.t_min = 1L<<32
1752     self.t_max = 0
1753
1754     # This will be set to the SVNCommit that occurs in self._commit.
1755     self.motivating_commit = None
1756
1757     # This is a list of all non-primary commits motivated by the main
1758     # commit.  We gather these so that we can set their dates to the
1759     # same date as the primary commit.
1760     self.secondary_commits = [ ]
1761
1762     # State for handling default branches.
1763     #
1764     # Here is a tempting, but ultimately nugatory, bit of logic, which
1765     # I share with you so you may appreciate the less attractive, but
1766     # refreshingly non-nugatory, logic which follows it:
1767     #
1768     # If some of the commits in this txn happened on a non-trunk
1769     # default branch, then those files will have to be copied into
1770     # trunk manually after being changed on the branch (because the
1771     # RCS "default branch" appears as head, i.e., trunk, in practice).
1772     # As long as those copies don't overwrite any trunk paths that
1773     # were also changed in this commit, then we can do the copies in
1774     # the same revision, because they won't cover changes that don't
1775     # appear anywhere/anywhen else.  However, if some of the trunk dst
1776     # paths *did* change in this commit, then immediately copying the
1777     # branch changes would lose those trunk mods forever.  So in this
1778     # case, we need to do at least that copy in its own revision.  And
1779     # for simplicity's sake, if we're creating the new revision for
1780     # even one file, then we just do all such copies together in the
1781     # new revision.
1782     #
1783     # Doesn't that sound nice?
1784     #
1785     # Unfortunately, Subversion doesn't support copies with sources
1786     # in the current txn.  All copies must be based in committed
1787     # revisions.  Therefore, we generate the above-described new
1788     # revision unconditionally.
1789     #
1790     # This is a list of c_revs, and a c_rev is appended for each
1791     # default branch commit that will need to be copied to trunk (or
1792     # deleted from trunk) in some generated revision following the
1793     # "regular" revision.
1794     self.default_branch_cvs_revisions = [ ]
1795
1796   def __cmp__(self, other):
1797     # Commits should be sorted by t_max.  If both self and other have
1798     # the same t_max, break the tie using t_min, and lastly, digest
1799     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
1800             or cmp(self.digest, other.digest))
1801
1802   def has_file(self, fname):
1803     return self.files.has_key(fname)
1804
1805   def revisions(self):
1806     return self.changes + self.deletes
1807
1808   def opens_symbolic_name(self, name):
1809     """Returns true if any CVSRevision in this commit is on a tag or a
1810     branch or is the origin of a tag or branch."""
1811     for c_rev in self.revisions():
1812       if c_rev.opens_symbolic_name(name):
1813         return 1
1814     return 0
1815
1816   def add_revision(self, c_rev):
1817     # Record the time range of this commit.
1818     #
1819     # ### ISSUE: It's possible, though unlikely, that the time range
1820     # of a commit could get gradually expanded to be arbitrarily
1821     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
1822     # problem, and anyway deciding where to break it up would be a
1823     # judgement call.  For now, we just print a warning in commit() if
1824     # this happens.
1825     if c_rev.timestamp < self.t_min:
1826       self.t_min = c_rev.timestamp
1827     if c_rev.timestamp > self.t_max:
1828       self.t_max = c_rev.timestamp
1829
1830     if c_rev.op == OP_DELETE:
1831       self.deletes.append(c_rev)
1832     else:
1833       # OP_CHANGE or OP_ADD
1834       self.changes.append(c_rev)
1835
1836     self.files[c_rev.fname] = 1
1837
1838   def _pre_commit(self):
1839     """Generates any SVNCommits that must exist before the main
1840     commit."""
1841
1842     # There may be multiple c_revs in this commit that would cause
1843     # branch B to be filled, but we only want to fill B once.  On the
1844     # other hand, there might be multiple branches committed on in
1845     # this commit.  Whatever the case, we should count exactly one
1846     # commit per branch, because we only fill a branch once per
1847     # CVSCommit.  This list tracks which branches we've already
1848     # counted.
1849     accounted_for_sym_names = [ ]
1850
1851     def fill_needed(c_rev, pm):
1852       """Return 1 if this is the first commit on a new branch (for
1853       this file) and we need to fill the branch; else return 0
1854       (meaning that some other file's first commit on the branch has
1855       already done the fill for us).
1856
1857       If C_REV.op is OP_ADD, only return 1 if the branch that this
1858       commit is on has no last filled revision.
1859
1860       PM is a PersistenceManager to query.
1861       """
1862
1863       # Different '.' counts indicate that c_rev is now on a different
1864       # line of development (and may need a fill)
1865       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
1866         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
1867         # It should be the case that when we have a file F that
1868         # is added on branch B (thus, F on trunk is in state
1869         # 'dead'), we generate an SVNCommit to fill B iff the branch
1870         # has never been filled before.
1871         #
1872         # If this c_rev.op == OP_ADD, *and* the branch has never
1873         # been filled before, then fill it now.  Otherwise, no need to
1874         # fill it.
1875         if c_rev.op == OP_ADD:
1876           if pm.last_filled.get(c_rev.branch_name, None) is None:
1877             return 1
1878         else:
1879           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
1880             return 1
1881       return 0
1882
1883     for c_rev in self.changes + self.deletes:
1884       # If a commit is on a branch, we must ensure that the branch
1885       # path being committed exists (in HEAD of the Subversion
1886       # repository).  If it doesn't exist, we will need to fill the
1887       # branch.  After the fill, the path on which we're committing
1888       # will exist.
1889       if c_rev.branch_name \
1890           and c_rev.branch_name not in accounted_for_sym_names \
1891           and c_rev.branch_name not in self.done_symbols \
1892           and fill_needed(c_rev, Ctx()._persistence_manager):
1893         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
1894                                % c_rev.branch_name)
1895         svn_commit.set_symbolic_name(c_rev.branch_name)
1896         self.secondary_commits.append(svn_commit)
1897         accounted_for_sym_names.append(c_rev.branch_name)
1898
1899   def _commit(self):
1900     """Generates the primary SVNCommit that corresponds the this
1901     CVSCommit."""
1902     # Generate an SVNCommit unconditionally.  Even if the only change
1903     # in this CVSCommit is a deletion of an already-deleted file (that
1904     # is, a CVS revision in state 'dead' whose predecessor was also in
1905     # state 'dead'), the conversion will still generate a Subversion
1906     # revision containing the log message for the second dead
1907     # revision, because we don't want to lose that information.
1908     svn_commit = SVNCommit("commit")
1909     self.motivating_commit = svn_commit
1910
1911     for c_rev in self.changes:
1912       svn_commit.add_revision(c_rev)
1913       # Only make a change if we need to.  When 1.1.1.1 has an empty
1914       # deltatext, the explanation is almost always that we're looking
1915       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
1916       # such imports, CVS creates an RCS file where 1.1 has the
1917       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
1918       # content as 1.1.  There's no reason to reflect this non-change
1919       # in the repository, so we want to do nothing in this case.  (If
1920       # we were really paranoid, we could make sure 1.1's log message
1921       # is the CVS-generated "Initial revision\n", but I think the
1922       # conditions below are strict enough.)
1923       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
1924               and (c_rev.rev == "1.1.1.1")):
1925         if c_rev.is_default_branch_revision():
1926           self.default_branch_cvs_revisions.append(c_rev)
1927
1928     for c_rev in self.deletes:
1929       # When a file is added on a branch, CVS not only adds the file
1930       # on the branch, but generates a trunk revision (typically
1931       # 1.1) for that file in state 'dead'.  We only want to add
1932       # this revision if the log message is not the standard cvs
1933       # fabricated log message.
1934       if c_rev.prev_rev is None:
1935         # c_rev.branches may be empty if the originating branch
1936         # has been excluded.
1937         if not c_rev.branches:
1938           continue
1939         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
1940                              % (c_rev.filename(),
1941                                 c_rev.branches[0]))
1942         author, log_msg = \
1943             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
1944         if log_msg == cvs_generated_msg:
1945           continue
1946
1947       svn_commit.add_revision(c_rev)
1948       if c_rev.is_default_branch_revision():
1949         self.default_branch_cvs_revisions.append(c_rev)
1950
1951     # There is a slight chance that we didn't actually register any
1952     # CVSRevisions with our SVNCommit (see loop over self.deletes
1953     # above), so if we have no CVSRevisions, we don't flush the
1954     # svn_commit to disk and roll back our revnum.
1955     if len(svn_commit.cvs_revs) > 0:
1956       svn_commit.flush()
1957     else:
1958       # We will not be flushing this SVNCommit, so rollback the
1959       # SVNCommit revision counter.
1960       SVNCommit.revnum = SVNCommit.revnum - 1
1961
1962     if not Ctx().trunk_only:
1963       for c_rev in self.revisions():
1964         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
1965
1966   def _post_commit(self):
1967     """Generates any SVNCommits that we can perform now that _commit
1968     has happened.  That is, handle non-trunk default branches.
1969     Sometimes an RCS file has a non-trunk default branch, so a commit
1970     on that default branch would be visible in a default CVS checkout
1971     of HEAD.  If we don't copy that commit over to Subversion's trunk,
1972     then there will be no Subversion tree which corresponds to that
1973     CVS checkout.  Of course, in order to copy the path over, we may
1974     first need to delete the existing trunk there.  """
1975
1976     # Only generate a commit if we have default branch revs
1977     if len(self.default_branch_cvs_revisions):
1978       # Generate an SVNCommit for all of our default branch c_revs.
1979       svn_commit = SVNCommit("post-commit default branch(es)")
1980       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
1981       for c_rev in self.default_branch_cvs_revisions:
1982         svn_commit.add_revision(c_rev)
1983         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
1984                                                             svn_commit.revnum)
1985       self.secondary_commits.append(svn_commit)
1986
1987   def process_revisions(self, done_symbols):
1988     """Process all the CVSRevisions that this instance has, creating
1989     one or more SVNCommits in the process.  Generate fill SVNCommits
1990     only for symbols not in DONE_SYMBOLS (avoids unnecessary
1991     fills).
1992
1993     Return the primary SVNCommit that corresponds to this CVSCommit.
1994     The returned SVNCommit is the commit that motivated any other
1995     SVNCommits generated in this CVSCommit."""
1996     self.done_symbols = done_symbols
1997     seconds = self.t_max - self.t_min + 1
1998
1999     Log().write(LOG_VERBOSE, '-' * 60)
2000     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2001     if seconds == 1:
2002       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
2003                   % time.ctime(self.t_max))
2004     else:
2005       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
2006       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
2007                   % (time.ctime(self.t_max), seconds))
2008
2009     if seconds > COMMIT_THRESHOLD + 1:
2010       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2011                   % (warning_prefix, COMMIT_THRESHOLD))
2012
2013     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2014       self._commit()
2015       return self.motivating_commit
2016
2017     self._pre_commit()
2018     self._commit()
2019     self._post_commit()
2020
2021     for svn_commit in self.secondary_commits:
2022       svn_commit.set_date(self.motivating_commit.get_date())
2023       svn_commit.flush()
2024
2025     return self.motivating_commit
2026
2027
2028 class SVNCommit:
2029   """This represents one commit to the Subversion Repository.  There
2030   are three types of SVNCommits:
2031
2032   1. Commits one or more CVSRevisions (cannot fill a symbolic name).
2033
2034   2. Creates or fills a symbolic name (cannot commit CVSRevisions).
2035
2036   3. Updates trunk to reflect the contents of a particular branch
2037      (this is to handle RCS default branches)."""
2038
2039   # The revision number to assign to the next new SVNCommit.
2040   # We start at 2 because SVNRepositoryMirror uses the first commit
2041   # to create trunk, tags, and branches.
2042   revnum = 2
2043
2044   class SVNCommitInternalInconsistencyError(Exception):
2045     """Exception raised if we encounter an impossible state in the
2046     SVNCommit Databases."""
2047     pass
2048
2049   def __init__(self, description="", revnum=None, cvs_revs=None):
2050     """Instantiate an SVNCommit.  DESCRIPTION is for debugging only.
2051     If REVNUM, the SVNCommit will correspond to that revision number;
2052     and if CVS_REVS, then they must be the exact set of CVSRevisions for
2053     REVNUM.
2054
2055     It is an error to pass CVS_REVS without REVNUM, but you may pass
2056     REVNUM without CVS_REVS, and then add a revision at a time by
2057     invoking add_revision()."""
2058     self._description = description
2059
2060     # Revprop metadata for this commit.
2061     #
2062     # These initial values are placeholders.  At least the log and the
2063     # date should be different by the time these are used.
2064     #
2065     # They are private because their values should be returned encoded
2066     # in UTF8, but callers aren't required to set them in UTF8.
2067     # Therefore, accessor methods are used to set them, and
2068     # self.get_revprops() is used to to get them, in dictionary form.
2069     self._author = Ctx().username
2070     self._log_msg = "This log message means an SVNCommit was used too soon."
2071     self._max_date = 0  # Latest date seen so far.
2072
2073     self.cvs_revs = cvs_revs or []
2074     if revnum:
2075       self.revnum = revnum
2076     else:
2077       self.revnum = SVNCommit.revnum
2078       SVNCommit.revnum = SVNCommit.revnum + 1
2079
2080     # The symbolic name that is filled in this SVNCommit, if any
2081     self.symbolic_name = None
2082
2083     # If this commit is a default branch synchronization, this
2084     # variable represents the subversion revision number of the
2085     # *primary* commit where the default branch changes actually
2086     # happened.  It is None otherwise.
2087     #
2088     # It is possible for multiple synchronization commits to refer to
2089     # the same motivating commit revision number, and it is possible
2090     # for a single synchronization commit to contain CVSRevisions on
2091     # multiple different default branches.
2092     self.motivating_revnum = None
2093
2094     # is_tag is true only if this commit is a fill of a symbolic name
2095     # that is a tag, None in all other cases.
2096     self.is_tag = None
2097
2098   def set_symbolic_name(self, name):
2099     "Set self.symbolic_name to NAME."
2100     name = _clean_symbolic_name(name)
2101     self.symbolic_name = name
2102
2103   def set_motivating_revnum(self, revnum):
2104     "Set self.motivating_revnum to REVNUM."
2105     self.motivating_revnum = revnum
2106
2107   def set_author(self, author):
2108     """Set this SVNCommit's author to AUTHOR (a locally-encoded string).
2109     This is the only way to set an SVNCommit's author."""
2110     self._author = author
2111
2112   def set_log_msg(self, msg):
2113     """Set this SVNCommit's log message to MSG (a locally-encoded string).
2114     This is the only way to set an SVNCommit's log message."""
2115     self._log_msg = msg
2116
2117   def set_date(self, date):
2118     """Set this SVNCommit's date to DATE (an integer).
2119     Note that self.add_revision() updates this automatically based on
2120     a CVSRevision; so you may not need to call this at all, and even
2121     if you do, the value may be overwritten by a later call to
2122     self.add_revision()."""
2123     self._max_date = date
2124
2125   def get_date(self):
2126     """Returns this SVNCommit's date as an integer."""
2127     return self._max_date
2128
2129   def get_revprops(self):
2130     """Return the Subversion revprops for this SVNCommit."""
2131     date = format_date(self._max_date)
2132     try:
2133       ### FIXME: The 'replace' behavior should be an option, like
2134       ### --encoding is.
2135       utf8_author = None
2136       if self._author is not None:
2137         unicode_author = unicode(self._author, Ctx().encoding, 'replace')
2138         utf8_author = unicode_author.encode('utf8')
2139       unicode_log = unicode(self.get_log_msg(), Ctx().encoding, 'replace')
2140       utf8_log = unicode_log.encode('utf8')
2141       return { 'svn:author' : utf8_author,
2142                'svn:log'    : utf8_log,
2143                'svn:date'   : date }
2144     except UnicodeError:
2145       Log().write(LOG_WARN, '%s: problem encoding author or log message:'
2146                   % warning_prefix)
2147       Log().write(LOG_WARN, "  author: '%s'" % self._author)
2148       Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
2149       Log().write(LOG_WARN, "  date:   '%s'" % date)
2150       Log().write(LOG_WARN, "(subversion rev %s)  Related files:" % self.revnum)
2151       for c_rev in self.cvs_revs:
2152         Log().write(LOG_WARN, " ", c_rev.fname)
2153
2154       Log().write(LOG_WARN, "Consider rerunning with (for example)",
2155                   "'--encoding=latin1'.\n")
2156       # It's better to fall back to the original (unknown encoding) data
2157       # than to either 1) quit or 2) record nothing at all.
2158       return { 'svn:author' : self._author,
2159                'svn:log'    : self.get_log_msg(),
2160                'svn:date'   : date }
2161
2162   def add_revision(self, cvs_rev):
2163     self.cvs_revs.append(cvs_rev)
2164     if cvs_rev.timestamp > self._max_date:
2165       self._max_date = cvs_rev.timestamp
2166
2167   def _is_primary_commit(self):
2168     """Return true if this is a primary SVNCommit, false otherwise."""
2169     return not (self.symbolic_name or self.motivating_revnum)
2170
2171   def flush(self):
2172     Log().write(LOG_NORMAL, "Creating Subversion commit %d (%s)"
2173                 % (self.revnum, self._description))
2174     Ctx()._persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)
2175
2176     if self.motivating_revnum is not None:
2177       Ctx()._persistence_manager.set_motivating_revnum(self.revnum,
2178                                                        self.motivating_revnum)
2179
2180     # If we're not a primary commit, then store our date and/or our
2181     # symbolic_name
2182     if not self._is_primary_commit():
2183       Ctx()._persistence_manager.set_name_and_date(self.revnum,
2184                                                    self.symbolic_name,
2185                                                    self._max_date)
2186
2187   def __str__(self):
2188     """ Print a human-readable description of this SVNCommit.  This
2189     description is not intended to be machine-parseable (although
2190     we're not going to stop you if you try!)"""
2191
2192     ret = "SVNCommit #: " + str(self.revnum) + "\n"
2193     if self.symbolic_name:
2194       ret = ret + "   symbolic name: " +  self.symbolic_name + "\n"
2195     else:
2196       ret = ret + "   NO symbolic name\n"
2197     ret = ret + "   debug description: " + self._description + "\n"
2198     ret = ret + "   cvs_revs:\n"
2199     for c_rev in self.cvs_revs:
2200       ret = ret + "     " + c_rev.unique_key() + "\n"
2201     return ret
2202
2203   def get_log_msg(self):
2204     """Returns the actual log message for a primary commit, and the
2205     appropriate manufactured log message for a secondary commit."""
2206     if self.symbolic_name is not None:
2207       return self._log_msg_for_symbolic_name_commit()
2208     elif self.motivating_revnum is not None:
2209       return self._log_msg_for_default_branch_commit()
2210     else:
2211       return self._log_msg
2212
2213   def _log_msg_for_symbolic_name_commit(self):
2214     """Creates a log message for a manufactured commit that fills
2215     self.symbolic_name.  If self.is_tag is true, write the log message
2216     as though for a tag, else write it as though for a branch."""
2217     type = 'branch'
2218     if self.is_tag:
2219       type = 'tag'
2220
2221     # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
2222     space_or_newline = ' '
2223     if len(self.symbolic_name) >= 13:
2224       space_or_newline = '\n'
2225
2226     return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
2227            % (type, space_or_newline, self.symbolic_name)
2228
2229   def _log_msg_for_default_branch_commit(self):
2230     """Creates a log message for a manufactured commit that
2231     synchronizes a non-trunk default branch with trunk."""
2232     msg = 'This commit was generated by cvs2svn to compensate for '     \
2233           'changes in r%d,\n'                                           \
2234           'which included commits to RCS files with non-trunk default ' \
2235           'branches.\n' % self.motivating_revnum
2236     return msg
2237
class CVSRevisionAggregator:
  """Gathers the CVSRevisions fed to it into CVSCommits, each of which
  represents at least one SVNCommit."""

  def __init__(self):
    """Open the databases this aggregator reads from, and install a
    SymbolingsLogger, a PersistenceManager and the default branches
    database on Ctx()."""
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), DB_OPEN_READ)

    # The CVSCommits still being accumulated, keyed by digest.  A
    # commit that has been closed to further changes is re-keyed under
    # its digest followed by one or more '-' characters.
    self.cvs_commits = {}

    # Symbols waiting to be closed.  Used as a set: the values are
    # always None.
    self.pending_symbols = {}

    # The symbols for which the last CVSRevision that is a source for
    # the symbol has already been seen.  Their final fill is done and
    # they never need to be filled again.
    self.done_symbols = [ ]

    # The most recently created primary svn_commit object.  It is kept
    # only for its date, which self.attempt_to_commit_symbols() gives
    # to the SVNCommits it creates.
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = Database(temp(DEFAULT_BRANCHES_DB),
                                          DB_OPEN_READ)

  def process_revision(self, c_rev):
    """Add C_REV to the pending CVSCommit that has its digest, creating
    that CVSCommit if necessary.  Before that, retire every pending
    CVSCommit that C_REV arrives too late to join; afterwards, process
    the retired commits in sorted order and try to close pending
    symbols."""
    # Each new revision is a chance to find accumulated commits that
    # are ready for processing.
    finished = [ ]
    for key, pending in self.cvs_commits.items():
      if pending.t_max + COMMIT_THRESHOLD < c_rev.timestamp:
        # C_REV is more than COMMIT_THRESHOLD later than this commit.
        finished.append(pending)
        del self.cvs_commits[key]
      elif pending.has_file(c_rev.fname):
        # The inbound revision is on a file the pending commit already
        # contains, so close the pending commit to further changes.
        # It is not flushed yet, because other pending commits may be
        # dated before it.
        # ### ISSUE: the has_file() check above is not optimal.
        # It does fix the dataloss bug where revisions would get lost
        # if checked in too quickly, but it can also break apart the
        # commits.  The correct fix would require tracking the
        # dependencies between change sets and committing them in
        # proper order.
        #
        # Re-key the commit under a string that is not already a key
        # in self.cvs_commits.
        closed_key = key + '-'
        while self.cvs_commits.has_key(closed_key):
          closed_key = closed_key + '-'
        self.cvs_commits[closed_key] = pending
        del self.cvs_commits[key]

    # File C_REV under the still-available commit for its digest.
    digest = c_rev.digest
    if not self.cvs_commits.has_key(digest):
      author, log = self.metadata_db[digest]
      self.cvs_commits[digest] = CVSCommit(digest, author, log)
    self.cvs_commits[digest].add_revision(c_rev)

    # C_REV cannot belong to any of the finished commits, so they are
    # processed now, in sorted (time) order.
    finished.sort()

    # attempt_to_commit_symbols() must run for this c_rev even when no
    # commit is ready.
    if not finished:
      self.attempt_to_commit_symbols(finished, c_rev)

    while finished:
      cvs_commit = finished.pop(0)
      self.latest_primary_svn_commit \
          = cvs_commit.process_revisions(self.done_symbols)
      self.attempt_to_commit_symbols(finished, c_rev)

  def flush(self):
    """Process every CVSCommit still in self.cvs_commits, in sorted
    order.  Then, unless this is a trunk-only conversion, close the
    SymbolingsLogger to tell it that all commits are done."""
    remaining = [ (cvs_commit, key)
                  for key, cvs_commit in self.cvs_commits.items() ]
    remaining.sort()

    for cvs_commit, key in remaining:
      self.latest_primary_svn_commit = \
          cvs_commit.process_revisions(self.done_symbols)
      del self.cvs_commits[key]
      self.attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def attempt_to_commit_symbols(self, queued_commits, c_rev=None):
    """
    Generate one SVNCommit for each symbol in self.pending_symbols
    that has no opening CVSRevision in QUEUED_COMMITS or in
    self.cvs_commits.values().

    If C_REV is not None, first add to self.pending_symbols every
    symbol for which C_REV is the last CVSRevision.
    """
    # Unless this is a trunk-only conversion, pick up the symbolic
    # names for which this c_rev is the last *source* CVSRevision and
    # add them to the ones left over from earlier passes.
    if c_rev and not Ctx().trunk_only:
      for sym in self.last_revs_db.get(c_rev.unique_key(), []):
        self.pending_symbols[sym] = None

    # Find the symbols that still have *source* CVSRevisions among
    # the pending or queued commits.
    still_open = {}
    for sym in self.pending_symbols.keys():
      for cvs_commit in self.cvs_commits.values() + queued_commits:
        if cvs_commit.opens_symbolic_name(sym):
          still_open[sym] = None
          break

    # Visit the pending symbols in sorted order so the outcome does
    # not depend on dictionary ordering; this keeps test results the
    # same on all platforms.
    symbols = self.pending_symbols.keys()
    symbols.sort()
    for sym in symbols:
      if not still_open.has_key(sym):
        # Nothing opens sym any more, so it can be closed.
        svn_commit = SVNCommit("closing tag/branch '%s'" % sym)
        svn_commit.set_symbolic_name(sym)
        svn_commit.set_date(self.latest_primary_svn_commit.get_date())
        svn_commit.flush()
        self.done_symbols.append(sym)
        del self.pending_symbols[sym]
2378
2379
class SymbolingsReader:
  """Interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  It does the heavy lifting of finding and
  returning the correct opening and closing Subversion revision
  numbers for a given symbolic name."""

  def __init__(self):
    """Open SYMBOL_OPENINGS_CLOSINGS_SORTED for reading and load the
    whole offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets database is small and is read and updated often, so
    # an in-memory copy is used.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    self.offsets = { }
    for symbol in offsets_db.db.keys():
      self.offsets[symbol] = offsets_db[symbol]

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Return a new SymbolicNameFillingGuide for SYMBOLIC_NAME that
    holds the openings and closings recorded up to and including
    SVN_REVNUM.

    An opening may be encountered in this fill while its closing lies
    beyond SVN_REVNUM.  That closing is then not passed to the
    SymbolicNameFillingGuide in this fill (and is discarded when it is
    encountered in a later fill).  This is fine: a valid fill is
    possible without the closing, and we always fill what we can as
    soon as we can."""
    symbol_fill = SymbolicNameFillingGuide(symbolic_name)

    # A branch may start with a file that was added on a branch, in
    # which case there is no offset for the symbol; hand back the
    # guide as it is.
    if not self.offsets.has_key(symbolic_name):
      return symbol_fill

    # Resume reading where the lines for symbolic_name start.
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      line_start = self.symbolings.tell()
      line = self.symbolings.readline().rstrip()
      if not line:
        break
      name, revnum, kind, svn_path = line.split(" ", 3)
      revnum = int(revnum)
      if name != symbolic_name or revnum > svn_revnum:
        break
      symbol_fill.register(svn_path, revnum, kind)

    # If anything was used, remember the start of the last line read
    # as the new offset for symbolic_name.
    if not symbol_fill.is_empty():
      self.offsets[symbolic_name] = line_start

    symbol_fill.make_node_tree()
    return symbol_fill
2437
2438
class SymbolicNameFillingGuide:
  """A SymbolicNameFillingGuide is essentially a node tree
  representing the source paths to be copied to fill
  self.name in the current SVNCommit.

  After calling self.register() on a series of openings and closings,
  call self.make_node_tree() to prepare self.node_tree for
  examination.  See the docstring for self.make_node_tree() for
  details on the structure of self.node_tree.

  By walking self.node_tree and calling self.get_best_revnum() on each
  node, the caller can determine what subversion revision number to
  copy the path corresponding to that node from.  self.node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""

  def __init__(self, symbolic_name):
    """Initializes a SymbolicNameFillingGuide for SYMBOLIC_NAME and
    prepares it for receiving openings and closings."""
    self.name = symbolic_name

    # Keys under which a leaf node stores its opening and closing
    # revision.  They start with '/' so that they can never be
    # mistaken for a path component.
    self.opening_key = "/o"
    self.closing_key = "/c"

    # A dictionary of SVN_PATHS and SVN_REVNUMS whose format is:
    #
    # { svn_path : { self.opening_key : svn_revnum,
    #                self.closing_key : svn_revnum }
    #                ...}
    self.things = { }

    # The key for the root node of the node tree
    self.root_key = '0'
    # The dictionary that holds our node tree, seeded with the root key.
    self.node_tree = { self.root_key : { } }

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE (a key into
    self.node_tree).

    Returns a tuple (REVNUM, MAX_SCORE): the chosen revision number
    and the score that revision achieved.

    PREFERRED_REVNUM is passed to self._best_rev and used to
    calculate the best revnum.

    If no usable revision is found, an error message is written to
    stderr and the program exits with status 1."""
    # Aggregate openings and closings from the rev tree
    openings = self._list_revnums_for_key(node, self.opening_key)
    closings = self._list_revnums_for_key(node, self.closing_key)

    # Score the lists
    scores = self._score_revisions(self._sum_revnum_counts(openings),
                                   self._sum_revnum_counts(closings))

    revnum, max_score = self._best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      # The symbol being filled is self.name; there is no local or
      # global called 'name' here.
      sys.stderr.write(error_prefix + ": failed to find a revision "
                       + "to copy from when copying %s\n" % self.name)
      sys.exit(1)
    return revnum, max_score

  def _best_rev(self, scores, preferred_rev):
    """Return a tuple (REV, MAX_SCORE) for the revision with the
    highest score from SCORES, a list returned by _score_revisions().
    When the maximum score is shared by multiple revisions, the oldest
    revision is selected, unless PREFERRED_REV is one of the
    possibilities, in which case, it is selected.

    REV is SVN_INVALID_REVNUM if no revision has a score above zero."""
    max_score = 0
    preferred_rev_score = -1
    rev = SVN_INVALID_REVNUM
    if preferred_rev is None:
      # Comparison order of different types is arbitrary. Do not
      # expect None to compare less than int values below.
      # In Python 2.3 None compares with ints like negative infinity.
      # In Python 2.0 None compares with ints like positive infinity.
      preferred_rev = SVN_INVALID_REVNUM
    for revnum, count in scores:
      if count > max_score:
        max_score = count
        rev = revnum
      # A score holds from its revision up to the next listed one, so
      # the last entry at or before PREFERRED_REV gives its score.
      if revnum <= preferred_rev:
        preferred_rev_score = count
    if preferred_rev_score == max_score:
      rev = preferred_rev
    return rev, max_score

  def _score_revisions(self, openings, closings):
    """Return a list of revisions and scores based on OPENINGS and
    CLOSINGS.  The returned list looks like:

       [(REV1 SCORE1), (REV2 SCORE2), ...]

    where REV2 > REV1.  OPENINGS and CLOSINGS are lists of
    (revnum, count) tuples sorted by revnum, as returned by
    self._sum_revnum_counts(); CLOSINGS may also be None.

    Each score indicates that copying the corresponding revision (or
    any following revision up to the next revision in the list) of the
    object in question would yield that many correct paths at or
    underneath the object.  There may be other paths underneath it
    which are not correct and would need to be deleted or recopied;
    those can only be detected by descending and examining their
    scores.

    If OPENINGS is false, return the empty list."""
    # First look for easy outs.
    if not openings:
      return []

    # Must be able to iterate over closings below.
    if closings is None:
      closings = []

    # Each opening adds its count to every revision from there on.
    scores = []
    opening_score_accum = 0
    for opening_rev, opening_score in openings:
      opening_score_accum = opening_score_accum + opening_score
      scores.append((opening_rev, opening_score_accum))

    # Each closing subtracts its count from every revision from there
    # on.  CLOSINGS is sorted, so entries before FIRST can be skipped.
    first = 0
    for closing_rev, closing_score in closings:
      done_exact_rev = None
      insert_index = None
      insert_score = None
      for j in range(first, len(scores)):
        score_rev, score = scores[j]
        if score_rev >= closing_rev:
          if not done_exact_rev:
            if score_rev > closing_rev:
              # The closing falls between two listed revisions and
              # needs an entry of its own, based on the score in
              # effect just before it.
              insert_index = j
              if j > 0:
                previous_score = scores[j-1][1]
              else:
                # Nothing precedes this closing.  scores[-1] would
                # wrap around to the *last* entry, so start from 0.
                previous_score = 0
              insert_score = previous_score - closing_score
            done_exact_rev = 1
          scores[j] = (score_rev, score - closing_score)
        else:
          first = j + 1
      if not done_exact_rev:
        # The closing is later than every listed revision.
        scores.append((closing_rev, scores[-1][1] - closing_score))
      if insert_index is not None:
        scores.insert(insert_index, (closing_rev, insert_score))
    return scores

  def _sum_revnum_counts(self, rev_list):
    """Takes an array of revisions (REV_LIST), for example:

      [21, 18, 6, 49, 39, 24, 24, 24, 24, 24, 24, 24]

    and adds up every occurrence of each revision and returns a sorted
    array of tuples containing (svn_revnum, count):

      [(6, 1), (18, 1), (21, 1), (24, 7), (39, 1), (49, 1)]
    """
    counts = {}
    for revnum in rev_list:
      counts[revnum] = counts.get(revnum, 0) + 1
    # Copy into a list of our own before sorting in place.
    pairs = list(counts.items())
    pairs.sort()
    return pairs

  def _list_revnums_for_key(self, node, revnum_type_key):
    """Scan self.node_tree and return a list of all the revision
    numbers (including duplicates) contained in REVNUM_TYPE_KEY values
    for all leaf nodes at and under NODE.

    REVNUM_TYPE_KEY should be either self.opening_key or
    self.closing_key."""
    node_contents = self.node_tree[node]

    # If the node has self.opening_key, it must be a leaf node--all
    # leaf nodes have at least an opening key (although they may not
    # have a closing key).  Fetch revnum and return.
    if (node_contents.has_key(self.opening_key) and
        node_contents.has_key(revnum_type_key)):
      return [ node_contents[revnum_type_key] ]

    revnums = []
    for key, child_key in node_contents.items():
      # Keys starting with '/' are opening/closing entries, not
      # children.
      if key[0] == '/':
        continue
      revnums.extend(self._list_revnums_for_key(child_key, revnum_type_key))
    return revnums

  def register(self, svn_path, svn_revnum, type):
    """Collects opening and closing revisions for this
    SymbolicNameFillingGuide.  SVN_PATH is the source path that needs
    to be copied into self.name, and SVN_REVNUM is either the
    first svn revision number that we can copy from (our opening), or
    the last (not inclusive) svn revision number that we can copy from
    (our closing).  TYPE indicates whether this path is an opening or
    a closing.

    The opening for a given SVN_PATH must be passed before the closing
    for it to have any effect... any closing encountered before a
    corresponding opening will be discarded.

    It is not necessary to pass a corresponding closing for every
    opening.
    """
    # Always log an OPENING.  This replaces whatever was recorded for
    # SVN_PATH before.
    if type == OPENING:
      self.things[svn_path] = {self.opening_key: svn_revnum}
    # Only log a closing if we've already registered the opening for that path.
    elif type == CLOSING and self.things.has_key(svn_path):
      # When we have a non-trunk default branch, we may have multiple
      # closings--only register the first closing we encounter.
      if not self.things[svn_path].has_key(self.closing_key):
        self.things[svn_path][self.closing_key] = svn_revnum

  def make_node_tree(self):
    """Generates the SymbolicNameFillingGuide's node tree from
    self.things.  Each leaf node maps self.opening_key to the earliest
    subversion revision from which this node/path may be copied; and
    optionally maps self.closing_key to the subversion revision one
    higher than the last revision from which this node/path may be
    copied.  Intermediate nodes never contain opening or closing
    flags; they map path components to the keys of their child
    nodes."""
    for svn_path, open_close in self.things.items():
      parent_key = self.root_key

      # Walk down the path, one component at a time, creating any
      # node that is missing.
      for component in svn_path.split('/'):
        parent = self.node_tree[parent_key]
        if not parent.has_key(component):
          child_key = gen_key()
          self.node_tree[child_key] = { }
          parent[component] = child_key
        else:
          child_key = parent[component]

        parent_key = child_key
      # Having reached the leaf, attach the value
      self.node_tree[parent_key] = open_close

  def is_empty(self):
    """Return true if we haven't accumulated any openings or closings,
    false otherwise."""
    return not self.things
2694
2695
class FillSource:
  """Representation of a fill source used by the symbol filler in
  SVNRepositoryMirror.

  A FillSource starts out unscored (score and revnum are None) and
  must be given a score with set_score() before it can be compared
  with another FillSource."""

  def __init__(self, prefix, key):
    """Create an unscored fill source with a prefix and a key."""
    self.prefix = prefix
    self.key = key
    self.score = None
    self.revnum = None

  def set_score(self, score, revnum):
    """Set the SCORE and REVNUM."""
    self.score = score
    self.revnum = revnum

  def __cmp__(self, other):
    """Comparison operator used to sort FillSources in descending
    score order.  Raises TypeError if either FillSource is unscored."""
    if self.score is None or other.score is None:
      # The call form of raise is accepted by every Python version.
      raise TypeError('Tried to compare unscored FillSource')
    # The operands are swapped on purpose: a higher score sorts first.
    return cmp(other.score, self.score)
2717
2718
2719 class SVNRepositoryMirror:
2720   """Mirror a Subversion Repository as it is constructed, one
2721   SVNCommit at a time.  The mirror is skeletal; it does not contain
2722   file contents.  The creation of a dumpfile or Subversion repository
2723   is handled by delegates.  See self.add_delegate method for how to
2724   set delegates.
2725
2726   The structure of the repository is kept in two databases and one
2727   hash.  The revs_db database maps revisions to root node keys, and
2728   the nodes_db database maps node keys to nodes.  A node is a hash
2729   from directory names to keys.  Both the revs_db and the nodes_db are
2730   stored on disk and each access is expensive.
2731
2732   The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
2734   hash which is cheap to access.
2735
2736   You must invoke _start_commit between SVNCommits.
2737
2738   *** WARNING *** All path arguments to methods in this class CANNOT
2739       have leading or trailing slashes.
2740   """
2741
2742   class SVNRepositoryMirrorPathExistsError(Exception):
2743     """Exception raised if an attempt is made to add a path to the
2744     repository mirror and that path already exists in the youngest
2745     revision of the repository."""
2746     pass
2747
2748   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
2749     """Exception raised if a CVSRevision is found to have an unexpected
2750     operation (OP) value."""
2751     pass
2752
2753   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
2754     """Exception raised if an empty SymbolicNameFillingGuide is returned
2755     during a fill where the branch in question already exists."""
2756     pass
2757
2758   def __init__(self):
2759     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
2760     self.delegates = [ ]
2761
2762     # This corresponds to the 'revisions' table in a Subversion fs.
2763     self.revs_db = Database(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
2764     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
2765
2766     # This corresponds to the 'nodes' table in a Subversion fs.  (We
2767     # don't need a 'representations' or 'strings' table because we
2768     # only track metadata, not file contents.)
2769     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
2770     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
2771
2772     # Start at revision 0 without a root node.  It will be created
2773     # by _open_writable_root_node.
2774     self.youngest = 0
2775     self.new_root_key = None
2776     self.new_nodes = { }
2777
2778     if not Ctx().trunk_only:
2779       ###PERF IMPT: Suck this into memory.
2780       self.tags_db = TagsDatabase(DB_OPEN_READ)
2781       self.symbolings_reader = SymbolingsReader()
2782
2783   def _initialize_repository(self, date):
2784     """Initialize the repository by creating the directories for
2785     trunk, tags, and branches.  This method should only be called
2786     after all delegates are added to the repository mirror."""
2787     # Make a 'fake' SVNCommit so we can take advantage of the revprops
2788     # magic therein
2789     svn_commit = SVNCommit("Initialization", 1)
2790     svn_commit.set_date(date)
2791     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
2792
2793     self._start_commit(svn_commit)
2794     self._mkdir(Ctx().trunk_base)
2795     if not Ctx().trunk_only:
2796       self._mkdir(Ctx().branches_base)
2797       self._mkdir(Ctx().tags_base)
2798
2799   def _start_commit(self, svn_commit):
2800     """Start a new commit."""
2801     if self.youngest > 0:
2802       self._end_commit()
2803
2804     self.youngest = svn_commit.revnum
2805     self.new_root_key = None
2806     self.new_nodes = { }
2807
2808     self._invoke_delegates('start_commit', svn_commit)
2809
2810   def _end_commit(self):
2811     """Called at the end of each commit.  This method copies the newly
2812     created nodes to the on-disk nodes db."""
2813     if self.new_root_key is None:
2814       # No changes were made in this revision, so we make the root node
2815       # of the new revision be the same as the last one.
2816       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
2817     else:
2818       self.revs_db[str(self.youngest)] = self.new_root_key
2819       # Copy the new nodes to the nodes_db
2820       for key, value in self.new_nodes.items():
2821         self.nodes_db[key] = value
2822
2823   def _get_node(self, key):
2824     """Returns the node contents for KEY which may refer to either
2825     self.nodes_db or self.new_nodes."""
2826     if self.new_nodes.has_key(key):
2827       return self.new_nodes[key]
2828     else:
2829       return self.nodes_db[key]
2830
2831   def _open_readonly_node(self, path, revnum):
2832     """Open a readonly node for PATH at revision REVNUM.  Returns the
2833     node key and node contents if the path exists, else (None, None)."""
2834     # Get the root key
2835     if revnum == self.youngest:
2836       if self.new_root_key is None:
2837         node_key = self.revs_db[str(self.youngest - 1)]
2838       else:
2839         node_key = self.new_root_key
2840     else:
2841       node_key = self.revs_db[str(revnum)]
2842
2843     for component in path.split('/'):
2844       node_contents = self._get_node(node_key)
2845       if not node_contents.has_key(component):
2846         return None
2847       node_key = node_contents[component]
2848
2849     return node_key
2850
2851   def _open_writable_root_node(self):
2852     """Open a writable root node.  The current root node is returned
2853     immeditely if it is already writable.  If not, create a new one by
2854     copying the contents of the root node of the previous version."""
2855     if self.new_root_key is not None:
2856       return self.new_root_key, self.new_nodes[self.new_root_key]
2857
2858     if self.youngest < 2:
2859       new_contents = { }
2860     else:
2861       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
2862     self.new_root_key = gen_key()
2863     self.new_nodes = { self.new_root_key: new_contents }
2864
2865     return self.new_root_key, new_contents
2866
2867   def _open_writable_node(self, svn_path, create):
2868     """Open a writable node for the path SVN_PATH, creating SVN_PATH
2869     and any missing directories if CREATE is True."""
2870     parent_key, parent_contents = self._open_writable_root_node()
2871
2872     # Walk up the path, one node at a time.
2873     path_so_far = None
2874     components = svn_path.split('/')
2875     for i in range(len(components)):
2876       component = components[i]
2877       this_key = this_contents = None
2878       path_so_far = _path_join(path_so_far, component)
2879       if parent_contents.has_key(component):
2880         # The component exists.
2881         this_key = parent_contents[component]
2882         if self.new_nodes.has_key(this_key):
2883           this_contents = self.new_nodes[this_key]
2884         else:
2885           # Suck the node from the nodes_db, but update the key
2886           this_contents = self.nodes_db[this_key]
2887           this_key = gen_key()
2888           self.new_nodes[this_key] = this_contents
2889           parent_contents[component] = this_key
2890       elif create:
2891         # The component does not exists, so we create it.
2892         this_contents = { }
2893         this_key = gen_key()
2894         self.new_nodes[this_key] = this_contents
2895         parent_contents[component] = this_key
2896         if i < len(components) - 1:
2897           self._invoke_delegates('mkdir', path_so_far)
2898       else:
2899         # The component does not exists and we are not instructed to
2900         # create it, so we give up.
2901         return None, None
2902
2903       parent_key = this_key
2904       parent_contents = this_contents
2905
2906     return this_key, this_contents
2907
2908   def _path_exists(self, path):
2909     """If PATH exists in self.youngest of the svn repository mirror,
2910     return true, else return None.
2911
2912     PATH must not start with '/'."""
2913     return self._open_readonly_node(path, self.youngest) is not None
2914
2915   def _fast_delete_path(self, parent_path, parent_contents, component):
2916     """Delete COMPONENT from the parent direcory PARENT_PATH with the
2917     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
2918     in PARENT_CONTENTS."""
2919     if parent_contents.has_key(component):
2920       del parent_contents[component]
2921       self._invoke_delegates('delete_path', _path_join(parent_path, component))
2922
2923   def _delete_path(self, svn_path, should_prune=False):
2924     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
2925     all ancestor directories that are made empty when SVN_PATH is deleted.
2926     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
2927
2928     NOTE: This function does *not* allow you delete top-level entries
2929     (like /trunk, /branches, /tags), nor does it prune upwards beyond
2930     those entries."""
2931     pos = svn_path.rfind('/')
2932     parent_path = svn_path[:pos]
2933     entry = svn_path[pos+1:]
2934     parent_key, parent_contents = self._open_writable_node(parent_path, False)
2935     if parent_key is not None:
2936       self._fast_delete_path(parent_path, parent_contents, entry)
2937       # The following recursion makes pruning an O(n^2) operation in the
2938       # worst case (where n is the depth of SVN_PATH), but the worst case
2939       # is probably rare, and the constant cost is pretty low.  Another
2940       # drawback is that we issue a delete for each path and not just
2941       # a single delete for the topmost directory pruned.
2942       if (should_prune and len(parent_contents) == 0 and
2943           parent_path.find('/') != -1):
2944         self._delete_path(parent_path, True)
2945
2946   def _mkdir(self, path):
2947     """Create PATH in the repository mirror at the youngest revision."""
2948     self._open_writable_node(path, True)
2949     self._invoke_delegates('mkdir', path)
2950
2951   def _change_path(self, cvs_rev):
2952     """Register a change in self.youngest for the CVS_REV's svn_path
2953     in the repository mirror."""
2954     # We do not have to update the nodes because our mirror is only
2955     # concerned with the presence or absence of paths, and a file
2956     # content change does not cause any path changes.
2957     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, 0))
2958
2959   def _add_path(self, cvs_rev):
2960     """Add the CVS_REV's svn_path to the repository mirror."""
2961     self._open_writable_node(cvs_rev.svn_path, True)
2962     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, 1))
2963
2964   def _copy_path(self, src_path, dest_path, src_revnum):
2965     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
2966     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
2967     parent *must* exist, but DEST_PATH *cannot* exist.
2968
2969     Return the node key and the contents of the new node at DEST_PATH
2970     as a dictionary."""
2971     # get the contents of the node of our src_path
2972     src_key = self._open_readonly_node(src_path, src_revnum)
2973     src_contents = self._get_node(src_key)
2974
2975     # Get the parent path and the base path of the dest_path
2976     pos = dest_path.rindex('/')
2977     dest_parent = dest_path[:pos]
2978     dest_basename = dest_path[pos+1:]
2979     dest_parent_key, dest_parent_contents = \
2980                    self._open_writable_node(dest_parent, False)
2981
2982     if dest_parent_contents.has_key(dest_basename):
2983       msg = "Attempt to add path '%s' to repository mirror " % dest_path
2984       msg = msg + "when it already exists in the mirror."
2985       raise self.SVNRepositoryMirrorPathExistsError, msg
2986
2987     dest_parent_contents[dest_basename] = src_key
2988     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
2989
2990     # Yes sir, src_key and src_contents are also the contents of the
2991     # destination.  This is a cheap copy, remember!  :-)
2992     return src_key, src_contents
2993
2994   def _fill_symbolic_name(self, svn_commit):
2995     """Performs all copies necessary to create as much of the the tag
2996     or branch SVN_COMMIT.symbolic_name as possible given the current
2997     revision of the repository mirror.
2998
2999     The symbolic name is guaranteed to exist in the Subversion
3000     repository by the end of this call, even if there are no paths
3001     under it."""
3002     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3003       svn_commit.symbolic_name, self.youngest)
3004
3005     # Create the list of sources for the symbolic name.  All source
3006     # prefixes must be direct sources for the destination, i.e. we
3007     # must have 'trunk' and 'branches/my_branch' and not just
3008     # 'branches'.
3009     sources = []
3010     for entry, key in symbol_fill.node_tree[symbol_fill.root_key].items():
3011       if entry == Ctx().trunk_base:
3012         sources.append(FillSource(entry, key))
3013       elif entry == Ctx().branches_base:
3014         for entry2, key2 in symbol_fill.node_tree[key].items():
3015           sources.append(FillSource(entry + '/' + entry2, key2))
3016       else:
3017         raise # Should never happen
3018     if self.tags_db.has_key(svn_commit.symbolic_name):
3019       dest_prefix = _path_join(Ctx().tags_base, svn_commit.symbolic_name)
3020     else:
3021       dest_prefix = _path_join(Ctx().branches_base,
3022                                svn_commit.symbolic_name)
3023
3024     if sources:
3025       dest_key = self._open_writable_node(dest_prefix, False)[0]
3026       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3027     else:
3028       # We can only get here for a branch whose first commit is an add
3029       # (as opposed to a copy).
3030       dest_path = Ctx().branches_base + '/' + symbol_fill.name
3031       if not self._path_exists(dest_path):
3032         # If our symbol_fill was empty, that means that our first
3033         # commit on the branch was to a file added on the branch, and
3034         # that this is our first fill of that branch.
3035         #
3036         # This case is covered by test 16.
3037         #
3038         # ...we create the branch by copying trunk from the our
3039         # current revision number minus 1
3040         source_path = Ctx().trunk_base
3041         entries = self._copy_path(source_path, dest_path,
3042                                   svn_commit.revnum - 1)[1]
3043         # Now since we've just copied trunk to a branch that's
3044         # *supposed* to be empty, we delete any entries in the
3045         # copied directory.
3046         for entry in entries.keys():
3047           del_path = dest_path + '/' + entry
3048           # Delete but don't prune.
3049           self._delete_path(del_path)
3050       else:
3051         msg = "Error filling branch '" + symbol_fill.name + "'.\n"
3052         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3053         msg = msg + "attempted to create a branch that already exists."
3054         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3055
  def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
            path = None, parent_source_prefix = None,
            preferred_revnum = None, prune_ok = None):
    """Fill the tag or branch at DEST_PREFIX + PATH with items from
    SOURCES, and recurse into the child items.

    SYMBOL_FILL is the filling guide for the symbolic name; its
    node_tree and get_best_revnum() are consulted here.

    DEST_PREFIX is the prefix of the destination directory, e.g.
    '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
    FillSource classes that are candidates to be copied to the
    destination.  SOURCES must not be empty; it is scored and sorted
    in place.  DEST_KEY is the node key of the destination (it is
    resolved with self._get_node()), or None if the destination does
    not yet exist.

    PATH is the path relative to DEST_PREFIX.  If PATH is None, we
    are at the top level, e.g. '/tags/my_tag'.

    PARENT_SOURCE_PREFIX is the source prefix that was used to copy
    the parent directory, and PREFERRED_REVNUM is an int which is the
    source revision number that the caller (who may have copied the
    parent of this destination) used to perform its copy.  If
    PREFERRED_REVNUM is None, then no revision is preferable to any
    other (which probably means that no copies have happened yet).

    PRUNE_OK means that a copy has been made in this recursion, and
    it's safe to prune directories that are not in
    SYMBOL_FILL.node_tree, provided that said directory has a source
    prefix of one of the PARENT_SOURCE_PREFIX.

    PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
    should only be passed in by recursive calls.

    Nothing is returned; the work is done through self._copy_path(),
    self._delete_path() and self._fast_delete_path()."""
    # Calculate scores and revnums for all sources
    for source in sources:
      src_revnum, score = symbol_fill.get_best_revnum(source.key,
                                                      preferred_revnum)
      source.set_score(score, src_revnum)

    # Sort the sources in descending score order so that we will make
    # an eventual copy from the source with the highest score.
    # NOTE(review): the ordering comes from FillSource's own
    # comparison, which is defined outside this method.
    sources.sort()
    copy_source = sources[0]

    src_path = _path_join(copy_source.prefix, path)
    dest_path = _path_join(dest_prefix, path)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    do_copy = 0
    if dest_key is None:
      # The destination does not exist yet, so it has to be copied.
      do_copy = 1
    elif prune_ok and (parent_source_prefix != copy_source.prefix or
                       copy_source.revnum != preferred_revnum):
      # The destination exists, but the best source differs from the
      # one the parent was copied from (other prefix or other revision).
      # We are about to replace the destination, so we need to remove
      # it before we perform the copy.
      self._delete_path(dest_path)
      do_copy = 1

    if do_copy:
      dest_key, dest_entries = self._copy_path(src_path, dest_path,
                                               copy_source.revnum)
      # A copy was made, so pruning is allowed from here on down.
      prune_ok = 1
    else:
      dest_entries = self._get_node(dest_key)

    # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
    # elements and the values are lists of FillSource classes where
    # this path element exists.
    src_entries = {}
    for source in sources:
      for entry, key in symbol_fill.node_tree[source.key].items():
        if entry[0] == '/': # Skip flags
          continue
        if not src_entries.has_key(entry):
          src_entries[entry] = []
        src_entries[entry].append(FillSource(source.prefix, key))

    if prune_ok:
      # Delete the entries in DEST_ENTRIES that are not in src_entries.
      delete_list = [ ]
      for entry in dest_entries.keys():
        if not src_entries.has_key(entry):
          delete_list.append(entry)
      if delete_list:
        # Entries can only be deleted from a node that is in
        # self.new_nodes; open the destination for writing if it is not
        # there yet.
        if not self.new_nodes.has_key(dest_key):
          dest_key, dest_entries = self._open_writable_node(dest_path, True)
        # Sort the delete list to get "diffable" dumpfiles.
        delete_list.sort()
        for entry in delete_list:
          self._fast_delete_path(dest_path, dest_entries, entry)

    # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
    src_keys = src_entries.keys()
    src_keys.sort()
    for src_key in src_keys:
      # Pass down the key of the matching destination entry, or None if
      # the destination does not have that entry yet.
      if dest_entries.has_key(src_key):
        next_dest_key = dest_entries[src_key]
      else:
        next_dest_key = None
      self._fill(symbol_fill, dest_prefix, next_dest_key,
                 src_entries[src_key], _path_join(path, src_key),
                 copy_source.prefix, sources[0].revnum, prune_ok)
3155
3156   def _synchronize_default_branch(self, svn_commit):
3157     """Propagate any changes that happened on a non-trunk default
3158     branch to the trunk of the repository.  See
3159     CVSCommit._post_commit() for details on why this is necessary."""
3160     for cvs_rev in svn_commit.cvs_revs:
3161       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3162         if self._path_exists(cvs_rev.svn_trunk_path):
3163           # Delete the path on trunk...
3164           self._delete_path(cvs_rev.svn_trunk_path)
3165         # ...and copy over from branch
3166         self._copy_path(cvs_rev.svn_path, cvs_rev.svn_trunk_path,
3167                         svn_commit.motivating_revnum)
3168       elif cvs_rev.op == OP_DELETE:
3169         # delete trunk path
3170         self._delete_path(cvs_rev.svn_trunk_path)
3171       else:
3172         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3173                % cvs_rev.op)
3174         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3175
3176   def commit(self, svn_commit):
3177     """Add an SVNCommit to the SVNRepository, incrementing the
3178     Repository revision number, and changing the repository.  Invoke
3179     the delegates' _start_commit() method."""
3180
3181     if svn_commit.revnum == 2:
3182       self._initialize_repository(svn_commit.get_date())
3183
3184     self._start_commit(svn_commit)
3185
3186     if svn_commit.symbolic_name:
3187       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3188                   svn_commit.symbolic_name)
3189       self._fill_symbolic_name(svn_commit)
3190     elif svn_commit.motivating_revnum:
3191       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3192                   % svn_commit.motivating_revnum)
3193       self._synchronize_default_branch(svn_commit)
3194     else: # This actually commits CVSRevisions
3195       if len(svn_commit.cvs_revs) > 1: plural = "s"
3196       else: plural = ""
3197       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3198                   % (len(svn_commit.cvs_revs), plural))
3199       for cvs_rev in svn_commit.cvs_revs:
3200         # See comment in CVSCommit._commit() for what this is all
3201         # about.  Note that although asking self._path_exists() is
3202         # somewhat expensive, we only do it if the first two (cheap)
3203         # tests succeed first.
3204         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3205                 and (cvs_rev.rev == "1.1.1.1")
3206                 and self._path_exists(cvs_rev.svn_path)):
3207           if cvs_rev.op == OP_ADD:
3208             self._add_path(cvs_rev)
3209           elif cvs_rev.op == OP_CHANGE:
3210             # Fix for Issue #74:
3211             #
3212             # Here's the scenario.  You have file FOO that is imported
3213             # on a non-trunk vendor branch.  So in r1.1 and r1.1.1.1,
3214             # the file exists.
3215             #
3216             # Moving forward in time, FOO is deleted on the default
3217             # branch (r1.1.1.2).  cvs2svn determines that this delete
3218             # also needs to happen on trunk, so FOO is deleted on
3219             # trunk.
3220             #
3221             # Along come r1.2, whose op is OP_CHANGE (because r1.1 is
3222             # not 'dead', we assume it's a change).  However, since
3223             # our trunk file has been deleted, svnadmin blows up--you
3224             # can't change a file that doesn't exist!
3225             #
3226             # Soooo... we just check the path, and if it doesn't
3227             # exist, we do an add... if the path does exist, it's
3228             # business as usual.
3229             if not self._path_exists(cvs_rev.svn_path):
3230               self._add_path(cvs_rev)
3231             else:
3232               self._change_path(cvs_rev)
3233
3234         if cvs_rev.op == OP_DELETE:
3235           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3236
3237   def cleanup(self):
3238     """Callback for the Cleanup.register in self.__init__."""
3239     self.revs_db = None
3240     self.nodes_db = None
3241
3242   def add_delegate(self, delegate):
3243     """Adds DELEGATE to self.delegates.
3244
3245     For every delegate you add, as soon as SVNRepositoryMirror
3246     performs a repository action method, SVNRepositoryMirror will call
3247     the delegate's corresponding repository action method.  Multiple
3248     delegates will be called in the order that they are added.  See
3249     SVNRepositoryMirrorDelegate for more information."""
3250     self.delegates.append(delegate)
3251
3252   def _invoke_delegates(self, method, *args):
3253     """Iterate through each of our delegates, in the order that they
3254     were added, and call the delegate's method named METHOD with the
3255     arguments in ARGS."""
3256     for delegate in self.delegates:
3257       getattr(delegate, method)(*args)
3258
3259   def finish(self):
3260     """Calls the delegate finish method."""
3261     self._end_commit()
3262     self._invoke_delegates('finish')
3263     self.cleanup()
3264
3265
class SVNCommitItem:
  """A wrapper class for CVSRevision objects upon which
  Subversion-related data (such as properties) may be hung.

  The properties computed for the wrapped revision are available as
  the dictionary self.svn_props."""

  def __init__(self, c_rev, make_svn_props):
    """Wrap the CVSRevision C_REV.  If MAKE_SVN_PROPS is true, or if
    Ctx().cvs_revnums is set, fill self.svn_props with the 'svn:'
    properties appropriate for C_REV."""
    self.c_rev = c_rev
    self.set_cvs_revnum_properties = Ctx().cvs_revnums
    self.eol_from_mime_type = Ctx().eol_from_mime_type
    self.no_default_eol = Ctx().no_default_eol
    self.keywords_off = Ctx().keywords_off
    self.mime_mapper = Ctx().mime_mapper

    # At most a "CVS revision" property to begin with.  Recording it
    # also forces the Subversion properties to be generated.
    props = { }
    self.svn_props = props
    if self.set_cvs_revnum_properties:
      props['cvs2svn:cvs-rev'] = c_rev.rev
      make_svn_props = True

    # Without a request for the Subversion properties we are done.
    if not make_svn_props:
      return

    # Executable bit.
    if c_rev.file_executable:
      props['svn:executable'] = '*'

    # Keyword expansion, unless switched off.  See issue #2.
    if c_rev.mode in (None, 'kv', 'kvl') and not self.keywords_off:
      props['svn:keywords'] = 'Author Date Id Revision'

    # The mime type and the eol style depend on each other; follow the
    # conditionals carefully.  See also issue #39.
    mime_type = None
    if self.mime_mapper:
      mime_type = self.mime_mapper.get_type_from_filename(c_rev.cvs_path)

    eol_style = None
    if c_rev.mode == 'b':
      if mime_type is None:
        # file is kb, and no other mimetype specified
        mime_type = 'application/octet-stream'
    elif not self.no_default_eol:
      eol_style = 'native'
    elif (mime_type and self.eol_from_mime_type
          and mime_type.startswith("text/")):
      eol_style = 'native'

    if mime_type:
      props['svn:mime-type'] = mime_type

    if eol_style:
      props['svn:eol-style'] = eol_style
3320
3321
class SVNRepositoryMirrorDelegate:
  """Abstract superclass for any delegate to SVNRepositoryMirror.
  Every method below raises NotImplementedError, so subclasses must
  implement all of them.

  A subclass implements each method as its own version of the
  Subversion operation that the method is named after.  Taking
  add_path as an example: the DumpfileDelegate would write out a
  "Node-add:" command to a Subversion dumpfile, the StdoutDelegate
  would merely print that the path is being added to the repository,
  and the RepositoryDelegate would actually cause the path to be added
  to the Subversion repository that it is creating.
  """

  def start_commit(self, svn_commit):
    """Do whatever is needed to start the SVNCommit SVN_COMMIT; see
    subclass implementation for details."""
    raise NotImplementedError()

  def mkdir(self, path):
    """PATH is a string; see subclass implementation for details."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """S_ITEM is an SVNCommitItem; see subclass implementation for
    details."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """S_ITEM is an SVNCommitItem; see subclass implementation for
    details."""
    raise NotImplementedError()

  def delete_path(self, path):
    """PATH is a string; see subclass implementation for details."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """SRC_PATH and DEST_PATH are both strings, and SRC_REVNUM is a
    subversion revision number (int); see subclass implementation for
    details."""
    raise NotImplementedError()

  def finish(self):
    """Do any cleanup that is necessary once all revisions have been
    committed."""
    raise NotImplementedError()
3369
3370
3371 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
3372   """Create a Subversion dumpfile."""
3373
3374   def __init__(self, dumpfile_path=None):
3375     """Return a new DumpfileDelegate instance, attached to a dumpfile
3376     DUMPFILE_PATH (Ctx().dumpfile, if None), using Ctx().encoding.
3377
3378     If Ctx().cvs_revnums is true, then set the 'cvs2svn:cvs-revnum'
3379     property on files, when they are changed due to a corresponding
3380     CVS revision.
3381
3382     If Ctx().mime_mapper is not None, then it is a MimeMapper
3383     instance, used to determine whether or not to set the
3384     'svn:mime-type' property on files.  But even if Ctx().mime_mapper
3385     is None, files marked with the CVS 'kb' flag will receive a mime
3386     type of "application/octet-stream".
3387
3388     Unless Ctx().no_default_eol is true, set 'svn:eol-style' to
3389     'native' for files not marked with the CVS 'kb' flag, except as
3390     superseded by Ctx().eol_from_mime_type (see below).
3391
3392     If Ctx().eol_from_mime_type is not None, then set 'svn:eol-style'
3393     to 'native' for all files to which Ctx().mime_mapper assigns a
3394     mime type beginning with "text/", and don't set 'svn:eol-style'
3395     for files assigned a type not beginning with "text/".
3396     """
3397     if dumpfile_path:
3398       self.dumpfile_path = dumpfile_path
3399     else:
3400       self.dumpfile_path = Ctx().dumpfile
3401     self.path_encoding = Ctx().encoding
3402
3403     self.dumpfile = open(self.dumpfile_path, 'wb')
3404     self._write_dumpfile_header(self.dumpfile)
3405
3406   def _write_dumpfile_header(self, dumpfile):
3407     # Initialize the dumpfile with the standard headers.
3408     #
3409     # Since the CVS repository doesn't have a UUID, and the Subversion
3410     # repository will be created with one anyway, we don't specify a
3411     # UUID in the dumpflie
3412     dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
3413
3414   def _utf8_path(self, path):
3415     """Return a copy of PATH encoded in UTF-8.  PATH is assumed to be
3416     encoded in self.path_encoding."""
3417     try:
3418       # Log messages can be converted with the 'replace' strategy,
3419       # but we can't afford any lossiness here.
3420       unicode_path = unicode(path, self.path_encoding, 'strict')
3421       return unicode_path.encode('utf-8')
3422     except UnicodeError:
3423       print "Unable to convert a path '%s' to internal encoding." % path
3424       print "Consider rerunning with (for example) '--encoding=latin1'"
3425       sys.exit(1)
3426
3427   def start_commit(self, svn_commit):
3428     """Emit the start of SVN_COMMIT (an SVNCommit)."""
3429
3430     self.revision = svn_commit.revnum
3431
3432     # The start of a new commit typically looks like this:
3433     #
3434     #   Revision-number: 1
3435     #   Prop-content-length: 129
3436     #   Content-length: 129
3437     #
3438     #   K 7
3439     #   svn:log
3440     #   V 27
3441     #   Log message for revision 1.
3442     #   K 10
3443     #   svn:author
3444     #   V 7
3445     #   jrandom
3446     #   K 8
3447     #   svn:date
3448     #   V 27
3449     #   2003-04-22T22:57:58.132837Z
3450     #   PROPS-END
3451     #
3452     # Notice that the length headers count everything -- not just the
3453     # length of the data but also the lengths of the lengths, including
3454     # the 'K ' or 'V ' prefixes.
3455     #
3456     # The reason there are both Prop-content-length and Content-length
3457     # is that the former includes just props, while the latter includes
3458     # everything.  That's the generic header form for any entity in a
3459     # dumpfile.  But since revisions only have props, the two lengths
3460     # are always the same for revisions.
3461
3462     # Calculate the total length of the props section.
3463     props = svn_commit.get_revprops()
3464     prop_names = props.keys()
3465     prop_names.sort()
3466     total_len = 10  # len('PROPS-END\n')
3467     for propname in prop_names:
3468       if props[propname] is None:
3469         continue
3470       klen = len(propname)
3471       klen_len = len('K %d' % klen)
3472       vlen = len(props[propname])
3473       vlen_len = len('V %d' % vlen)
3474       # + 4 for the four newlines within a given property's section
3475       total_len = total_len + klen + klen_len + vlen + vlen_len + 4
3476
3477     # Print the revision header and props
3478     self.dumpfile.write('Revision-number: %d\n'
3479                         'Prop-content-length: %d\n'
3480                         'Content-length: %d\n'
3481                         '\n'
3482                         % (self.revision, total_len, total_len))
3483
3484     for propname in prop_names:
3485       if props[propname] is None:
3486         continue
3487       self.dumpfile.write('K %d\n'
3488                           '%s\n'
3489                           'V %d\n'
3490                           '%s\n' % (len(propname),
3491                                     propname,
3492                                     len(props[propname]),
3493                                     props[propname]))
3494
3495     self.dumpfile.write('PROPS-END\n')
3496     self.dumpfile.write('\n')
3497
3498   def mkdir(self, path):
3499     """Emit the creation of directory PATH."""
3500     self.dumpfile.write("Node-path: %s\n"
3501                         "Node-kind: dir\n"
3502                         "Node-action: add\n"
3503                         "Content-length: 10\n"
3504                         "\n"
3505                         "\n" % self._utf8_path(path))
3506
3507   def _add_or_change_path(self, s_item, op):
3508     """Emit the addition or change corresponding to S_ITEM.
3509     OP is either the constant OP_ADD or OP_CHANGE."""
3510
3511     # Validation stuffs
3512     if op == OP_ADD:
3513       action = 'add'
3514     elif op == OP_CHANGE:
3515       action = 'change'
3516     else:
3517       sys.stderr.write("%s: _add_or_change_path() called with bad op ('%s')"
3518                        % (error_prefix, op))
3519       sys.exit(1)
3520
3521     # Convenience variables
3522     c_rev = s_item.c_rev
3523     svn_props = s_item.svn_props
3524
3525     # The property handling here takes advantage of an undocumented
3526     # but IMHO consistent feature of the Subversion dumpfile-loading
3527     # code.  When a node's properties aren't mentioned (that is, the
3528     # "Prop-content-length:" header is absent, no properties are
3529     # listed at all, and there is no "PROPS-END\n" line) then no
3530     # change is made to the node's properties.
3531     #
3532     # This is consistent with the way dumpfiles behave w.r.t. text
3533     # content changes, so I'm comfortable relying on it.  If you
3534     # commit a change to *just* the properties of some node that
3535     # already has text contents from a previous revision, then in the
3536     # dumpfile output for the prop change, no "Text-content-length:"
3537     # nor "Text-content-md5:" header will be present, and the text of
3538     # the file will not be given.  But this does not cause the file's
3539     # text to be erased!  It simply remains unchanged.
3540     #
3541     # This works out great for cvs2svn, due to lucky coincidences:
3542     #
3543     # For files, the only properties we ever set are set in the first
3544     # revision; all other revisions (including on branches) inherit
3545     # from that.  After the first revision, we never change file
3546     # properties, therefore, there is no need to remember the full set
3547     # of properties on a given file once we've set it.
3548     #
3549     # For directories, the only property we set is "svn:ignore", and
3550     # while we may change it after the first revision, we always do so
3551     # based on the contents of a ".cvsignore" file -- in other words,
3552     # CVS is doing the remembering for us, so we still don't have to
3553     # preserve the previous value of the property ourselves.
3554
3555     # Calculate the (sorted-by-name) property string and length, if any.
3556     prop_contents = ''
3557     prop_names = svn_props.keys()
3558     prop_names.sort()
3559     for pname in prop_names:
3560       pval = svn_props[pname]
3561       prop_contents = prop_contents + \
3562                       'K %d\n%s\nV %d\n%s\n' \
3563                       % (len(pname), pname, len(pval), pval)
3564     if prop_contents:
3565       prop_contents = prop_contents + 'PROPS-END\n'
3566       props_len = len(prop_contents)
3567     else:
3568       props_len = 0
3569
3570     props_header = ''
3571     if props_len:
3572       props_header = 'Prop-content-length: %d\n' % props_len
3573
3574     # treat .cvsignore as a directory property
3575     dir_path, basename = os.path.split(c_rev.svn_path)
3576     if basename == ".cvsignore":
3577       ignore_vals = generate_ignores(c_rev)
3578       ignore_contents = '\n'.join(ignore_vals)
3579       ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
3580                          (len(ignore_contents), ignore_contents))
3581       ignore_contents = ignore_contents + 'PROPS-END\n'
3582       ignore_len = len(ignore_contents)
3583
3584       # write headers, then props
3585       self.dumpfile.write('Node-path: %s\n'
3586                           'Node-kind: dir\n'
3587                           'Node-action: change\n'
3588                           'Prop-content-length: %d\n'
3589                           'Content-length: %d\n'
3590                           '\n'
3591                           '%s'
3592                           % (self._utf8_path(dir_path), ignore_len,
3593                              ignore_len, ignore_contents))
3594
3595     pipe_cmd, pipe = get_co_pipe(c_rev)
3596     self.dumpfile.write('Node-path: %s\n'
3597                         'Node-kind: file\n'
3598                         'Node-action: %s\n'
3599                         '%s'  # no property header if no props
3600                         'Text-content-length: '
3601                         % (self._utf8_path(c_rev.svn_path),
3602                            action, props_header))
3603
3604     pos = self.dumpfile.tell()
3605
3606     self.dumpfile.write('0000000000000000\n'
3607                         'Text-content-md5: 00000000000000000000000000000000\n'
3608                         'Content-length: 0000000000000000\n'
3609                         '\n')
3610
3611     if prop_contents:
3612       self.dumpfile.write(prop_contents)
3613
3614     # Insert the rev contents, calculating length and checksum as we go.
3615     checksum = md5.new()
3616     length = 0
3617     normalize_crlf = sys.platform == "win32" \
3618                      and svn_props.has_key('svn:eol-style')
3619     trailing_cr = ""
3620     buf = pipe.fromchild.read(PIPE_READ_SIZE)
3621     while buf:
3622       if normalize_crlf:
3623         buf = string.replace(buf,"\r\n","\n")
3624         if buf[-1] == "\r":
3625           trailing_cr = "\r"
3626           buf = buf[:-1]
3627         else:
3628           trailing_cr = ""
3629       checksum.update(buf)
3630       length = length + len(buf)
3631       self.dumpfile.write(buf)
3632       # optimize because of python's immutable strings
3633       if trailing_cr:
3634         buf = trailing_cr + pipe.fromchild.read(PIPE_READ_SIZE)
3635       else:
3636         buf = pipe.fromchild.read(PIPE_READ_SIZE)
3637     pipe.fromchild.close()
3638     error_output = pipe.childerr.read()
3639     exit_status = pipe.wait()
3640     if exit_status:
3641       sys.exit("%s: The command '%s' failed with exit status: %s\n"
3642                "and the following output:\n"
3643                "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
3644
3645     # Go back to patch up the length and checksum headers:
3646     self.dumpfile.seek(pos, 0)
3647     # We left 16 zeros for the text length; replace them with the real
3648     # length, padded on the left with spaces:
3649     self.dumpfile.write('%16d' % length)
3650     # 16... + 1 newline + len('Text-content-md5: ') == 35
3651     self.dumpfile.seek(pos + 35, 0)
3652     self.dumpfile.write(checksum.hexdigest())
3653     # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
3654     self.dumpfile.seek(pos + 84, 0)
    # The content length is the length of property data, text data,
    # and any metadata around/inside them.
3657     self.dumpfile.write('%16d' % (length + props_len))
3658     # Jump back to the end of the stream
3659     self.dumpfile.seek(0, 2)
3660
    # This record is done (write two newlines -- one to terminate
    # contents that weren't themselves newline-terminated, one to
    # provide a blank line for readability).
3664     self.dumpfile.write('\n\n')
3665
3666   def add_path(self, s_item):
3667     """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""
3668     self._add_or_change_path(s_item, OP_ADD)
3669
3670   def change_path(self, s_item):
3671     """Emit the change corresponding to S_ITEM, an SVNCommitItem."""
3672     self._add_or_change_path(s_item, OP_CHANGE)
3673
3674   def delete_path(self, path):
3675     """Emit the deletion of PATH."""
3676     self.dumpfile.write('Node-path: %s\n'
3677                         'Node-action: delete\n'
3678                         '\n' % self._utf8_path(path))
3679
3680   def copy_path(self, src_path, dest_path, src_revnum):
3681     """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
3682     # We don't need to include "Node-kind:" for copies; the loader
3683     # ignores it anyway and just uses the source kind instead.
3684     self.dumpfile.write('Node-path: %s\n'
3685                         'Node-action: add\n'
3686                         'Node-copyfrom-rev: %d\n'
3687                         'Node-copyfrom-path: /%s\n'
3688                         '\n'
3689                         % (self._utf8_path(dest_path),
3690                            src_revnum,
3691                            self._utf8_path(src_path)))
3692
3693   def finish(self):
3694     """Perform any cleanup necessary after all revisions have been
3695     committed."""
3696     self.dumpfile.close()
3697
3698
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting.

  Every commit is written to a temporary dumpfile by the inherited
  DumpfileDelegate code and is then piped into one 'svnadmin load'
  child process, which is started in __init__ and reaped in finish()."""

  def __init__(self):
    """Create the repository at Ctx().target (unless
    Ctx().existing_svnrepos is set), open the temporary dumpfile,
    start 'svnadmin load' and send it the dumpfile header.  Exits the
    program if the loader stops accepting input."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      if Ctx().fs_type and Ctx().fs_type != 'bdb':
        # User specified something other than bdb.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--fs-type=%s" % Ctx().fs_type,
                                           self.target))
      elif Ctx().fs_type:
        # User explicitly specified bdb.
        #
        # Since this is a BDB repository, pass --bdb-txn-nosync,
        # because it gives us a 4-5x speed boost (if cvs2svn is
        # creating the repository, cvs2svn should be the only program
        # accessing the svn repository (until cvs is done, at least)).
        # But we'll turn no-sync off in self.finish(), unless
        # instructed otherwise.
        run_command('%s create %s %s "%s"' % (self.svnadmin,
                                              "--fs-type=bdb",
                                              "--bdb-txn-nosync",
                                              self.target))
      else:
        # User didn't say what kind repository (bdb, fsfs, etc).
        # We still pass --bdb-txn-nosync.  It's a no-op if the default
        # repository type doesn't support it, but we definitely want
        # it if BDB is the default.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--bdb-txn-nosync",
                                           self.target))

    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.
    DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    # Opened for reading as well as writing: _feed_pipe() rewinds the
    # file and reads back what was just written.
    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = Popen3('%s load -q "%s"' % (self.svnadmin, self.target),
                              True)
    self.loader_pipe.fromchild.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.tochild)
    except IOError:
      self._exit_on_loader_failure()

  def _exit_on_loader_failure(self):
    """Write whatever svnadmin printed on its stderr to our stderr and
    exit with status 1.  Called when writing to the loader fails."""
    sys.stderr.write("%s: svnadmin failed with the following output while "
                     "loading the dumpfile:\n" % (error_prefix))
    sys.stderr.write(self.loader_pipe.childerr.read())
    sys.exit(1)

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe.  Exits the program if the pipe cannot be written to."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not data:
        break
      try:
        self.loader_pipe.tochild.write(data)
      except IOError:
        self._exit_on_loader_failure()

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, load
    its dumpfile contents into the svn repository first.  The
    dumpfile is then rewound and truncated so that it holds only the
    revision started here by DumpfileDelegate.start_commit."""
    if self._commit_in_progress:
      self._feed_pipe()
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Loads the last commit into the repository, waits for 'svnadmin
    load' to finish (exiting with its error output if it failed),
    removes the temporary dumpfile and, where applicable, turns BDB
    transaction syncing back on."""
    self._feed_pipe()
    self.dumpfile.close()
    self.loader_pipe.tochild.close()
    error_output = self.loader_pipe.childerr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      sys.exit('%s: svnadmin load failed with exit status: %s\n'
               'and the following output:\n'
               '%s' % (error_prefix, exit_status, error_output))
    os.remove(self.dumpfile_path)

    # If this is a BDB repository, and we created the repository, and
    # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
    # line in the DB_CONFIG file, because txn syncing should be on by
    # default in BDB repositories.
    #
    # We determine if this is a BDB repository by looking for the
    # DB_CONFIG file, which doesn't exist in FSFS, rather than by
    # checking Ctx().fs_type.  That way this code will Do The Right
    # Thing in all circumstances.
    db_config = os.path.join(self.target, "db/DB_CONFIG")
    if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
        and os.path.exists(db_config)):
      self._comment_out_txn_nosync(db_config)

  def _comment_out_txn_nosync(self, db_config):
    """Comment out the 'set_flags DB_TXN_NOSYNC' line in the file
    DB_CONFIG.  The file is left untouched if it has no such line."""
    no_sync = 'set_flags DB_TXN_NOSYNC\n'

    infile = open(db_config, 'r')
    try:
      contents = infile.readlines()
    finally:
      infile.close()

    try:
      index = contents.index(no_sync)
    except ValueError:
      # The flag is not set in this file, so there is nothing to turn
      # off.  Failing here would abort a conversion that has already
      # been loaded successfully.
      return
    contents[index] = '# ' + no_sync

    outfile = open(db_config, 'w')
    try:
      outfile.writelines(contents)
    finally:
      outfile.close()
3812
3813
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """A delegate that touches nothing on disk.  It merely reports,
  through Log(), each operation the SVNRepositoryMirror announces --
  so every message claims that something is being done while the only
  thing actually being done is the reporting itself."""

  def __init__(self, total_revs):
    # Shown as the denominator of the "N / M" progress line.
    self.total_revs = total_revs

  def start_commit(self, svn_commit):
    """Report the Subversion revision number of the commit that is
    about to start, preceded (at verbose level) by a separator line."""
    Log().write(LOG_VERBOSE, 60 * "=")
    progress = (svn_commit.revnum, self.total_revs)
    Log().write(LOG_NORMAL, "Starting Subversion commit %d / %d" % progress)

  def mkdir(self, path):
    """Report that directory PATH is being created."""
    label = "  New Directory"
    Log().write(LOG_VERBOSE, label, path)

  def add_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'added'."""
    svn_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Adding", svn_path)

  def change_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being 'changed'."""
    svn_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Changing", svn_path)

  def delete_path(self, path):
    """Report that PATH is being 'deleted'."""
    label = "  Deleting"
    Log().write(LOG_VERBOSE, label, path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report, on two lines, that revision SRC_REVNUM of SRC_PATH is
    being 'copied' to DEST_PATH."""
    source_words = ("  Copying revision", src_revnum, "of", src_path)
    Log().write(LOG_VERBOSE, *source_words)
    dest_words = ("                to", dest_path)
    Log().write(LOG_VERBOSE, *dest_words)

  def finish(self):
    """Report that the repository has been created."""
    closing_messages = [
      (LOG_VERBOSE, "Finished creating Subversion repository."),
      (LOG_QUIET, "Done."),
      ]
    for level, message in closing_messages:
      Log().write(level, message)
3856
# This should be a local to pass1,
# but Python 2.0 does not support nested scopes.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
def pass1():
  """Pass 1: walk the tree under Ctx().cvsroot and hand every file
  whose name ends in ',v' to the RCS parser, with a CollectData
  instance as the sink.

  Exits with an error summary if any file produced a fatal error, or
  with an explanatory message if no ',v' file was found at all."""
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    # Callback for os.path.walk().  BATON is the CollectData instance;
    # it is passed in explicitly because nested scopes are unavailable.
    cd = baton
    attic_len = len(OS_SEP_PLUS_ATTIC)
    for fname in files:
      if fname[-2:] != ',v':
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname[-attic_len:] == OS_SEP_PLUS_ATTIC:
        # drop the 'Attic' portion from the pathname for the canonical name.
        cd.set_fname(os.path.join(dirname[:-attic_len], fname), pathname)
      else:
        # If this file also exists in the attic, it's a fatal error
        attic_path = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_path):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, pathname, attic_path)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        cd.set_fname(pathname, pathname)
      Log().write(LOG_NORMAL, pathname)
      rcs_file = None
      try:
        try:
          # The open() stays inside the inner try so that a failure to
          # open is still logged by the catch-all handler below.
          rcs_file = open(pathname, 'rb')
          cvs2svn_rcsparse.parse(rcs_file, cd)
        except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
          err = "%s: '%s' is not a valid ,v file" \
                % (error_prefix, pathname)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        except:
          Log().write(LOG_WARN, "Exception occurred while parsing %s" % pathname)
          raise
      finally:
        # Close each file as soon as it has been parsed rather than
        # leaving that to garbage collection.
        if rcs_file is not None:
          rcs_file.close()

  os.path.walk(Ctx().cvsroot, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if len(cd.fatal_errors) > 0:
    sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
             + "Error summary:\n"
             + "\n".join(cd.fatal_errors)
             + "\nExited due to fatal error(s).")

  if cd.found_valid_file is None:
    sys.exit("\nNo RCS files found in your CVS Repository!\n"
             + "Are you absolutely certain you are pointing cvs2svn\n"
             + "at a CVS repository?\n"
             + "\nExited due to fatal error(s).")

  StatsKeeper().reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
3915
def pass2():
  """Pass 2: clean up the revision information.

  Checks the requested exclusions and forced tags/branches against
  the symbol database (reporting every problem found on stderr and
  then exiting with status 1), creates the tags database, and copies
  the revisions file to the clean-revisions file with excluded
  symbols removed, forced symbols converted and timestamps
  resynchronized."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Resolve the exclusion regexps in Ctx().excludes against the symbol
  # database.  The result is queried with has_key() below.
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    # A mismatch is resolved once the user has forced the symbol to be
    # either a tag or a branch.
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags.keys():
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    "Read the .resync file into memory."

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list because
    # we will dynamically expand the lower/upper bound as we find commits
    # that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    for line in fileinput.FileInput(fname):
      # Line layout, as read here: 8 hex digits of old time, a
      # separator, the digest, a separator, the new time in hex.
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      if resync.has_key(digest):
        resync[digest].append([t1_l, t1_u, t2])
      else:
        resync[digest] = [ [t1_l, t1_u, t2] ]

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    digests = resync.keys()
    for digest in digests:
      (resync[digest]).sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  # Predicate used to drop references to excluded tags and branches.
  # EXCLUDES is bound as a default argument because nested scopes are
  # not available in Python 2.0.
  def not_excluded(symbol, excludes=excludes):
    return not excludes.has_key(symbol)

  try:
    # process the revisions file, looking for items to clean up
    for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
      c_rev = CVSRevision(Ctx(), line[:-1])

      # Skip this entire revision if it's on an excluded branch
      if excludes.has_key(c_rev.branch_name):
        continue

      # Remove all references to excluded tags and branches
      c_rev.branches = filter(not_excluded, c_rev.branches)
      c_rev.tags = filter(not_excluded, c_rev.tags)

      # Convert all branches that are forced to be tags
      for forced_tag in Ctx().forced_tags:
        if forced_tag in c_rev.branches:
          c_rev.branches.remove(forced_tag)
          c_rev.tags.append(forced_tag)

      # Convert all tags that are forced to be branches
      for forced_branch in Ctx().forced_branches:
        if forced_branch in c_rev.tags:
          c_rev.tags.remove(forced_branch)
          c_rev.branches.append(forced_branch)

      # see if this is "near" any of the resync records we
      # have recorded for this digest [of the log message].
      for record in resync.get(c_rev.digest, []):
        if record[0] <= c_rev.timestamp <= record[1]:
          # bingo! remap the time on this (record[2] is the new time).

          # adjust the time range. we want the COMMIT_THRESHOLD from the
          # bounds of the earlier/latest commit in this group.
          record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
          record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

          # By default this will be the new timestamp
          new_timestamp = record[2]
          # If the new timestamp is earlier than that of our previous revision
          if record[2] < c_rev.prev_timestamp:
            desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                    + " to time %s, which is before previous the time of"
                    + " revision %s (%s):")
            Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                          c_rev.cvs_path, record[2],
                                          c_rev.prev_rev,
                                          c_rev.prev_timestamp))
            # If resyncing our rev to c_rev.prev_timestamp + 1 will place
            # the timestamp of c_rev within COMMIT_THRESHOLD of the
            # attempted sync time, then sync back to c_rev.prev_timestamp
            # + 1...
            if (c_rev.prev_timestamp - record[2]) < COMMIT_THRESHOLD:
              new_timestamp = c_rev.prev_timestamp + 1
              Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                            new_timestamp))
            # ...otherwise, make no change
            else:
              new_timestamp = c_rev.timestamp
              Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                          warning_prefix)

          # Report the delta that is actually applied.  NEW_TIMESTAMP
          # differs from record[2] whenever one of the adjustments
          # above kicked in.
          msg = "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
                % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                   new_timestamp - c_rev.timestamp)
          Log().write(LOG_VERBOSE, msg)

          c_rev.timestamp = new_timestamp

          # stop looking for hits
          break

      output.write(str(c_rev) + "\n")
  finally:
    # Close the file explicitly so its contents are on disk before
    # pass3 sorts it.
    output.close()
  Log().write(LOG_QUIET, "Done")
4108
def pass3():
  """Pass 3: sort the clean-revisions file into the sorted-revisions
  file and register the result for cleanup (keyed on pass5)."""
  Log().write(LOG_QUIET, "Sorting CVS revisions...")
  clean_revs = DATAFILE + CLEAN_REVS_SUFFIX
  sorted_revs = DATAFILE + SORTED_REVS_SUFFIX
  sort_file(temp(clean_revs), temp(sorted_revs))
  Cleanup().register(temp(sorted_revs), pass5)
  Log().write(LOG_QUIET, "Done")
4115
def pass4():
  """Walk the sorted revisions file and log every CVSRevision in the
  CVSRevisionDatabase.  Unless the conversion is trunk-only, also
  build the LastSymbolicNameDatabase, which contains the last
  CVSRevision that is a source for each tag or branch.
  """
  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)
  if Ctx().trunk_only:
    # A do-nothing stand-in, so that the loop below need not test
    # Ctx().trunk_only on every iteration.
    class DummyLSNDB:
      def log_revision(*args): pass
      def create_database(*args): pass
    last_sym_name_db = DummyLSNDB()
  else:
    Log().write(LOG_QUIET,
        "and finding last CVS revisions for all symbolic names...")
    last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs_path = temp(DATAFILE + SORTED_REVS_SUFFIX)
  for line in fileinput.FileInput(sorted_revs_path):
    # line[:-1] strips the trailing newline.
    c_rev = CVSRevision(Ctx(), line[:-1])
    for revision_db in (cvs_revs_db, last_sym_name_db):
      revision_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  last_sym_name_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4146
def pass5():
  """
  Build the SVNCommit <-> CVSRevision mapping databases by feeding
  the sorted revisions to a CVSRevisionAggregator.  Along the way
  CVSCommit._commit also calls SymbolingsLogger to register
  CVSRevisions that represent an opening or closing for a path on a
  branch or tag.  See SymbolingsLogger for more details.
  """
  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  for line in fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX)):
    c_rev = CVSRevision(Ctx(), line[:-1])
    # In a trunk-only conversion, revisions that carry a branch name
    # are not aggregated.
    if Ctx().trunk_only and c_rev.branch_name is not None:
      continue
    aggregator.process_revision(c_rev)
  aggregator.flush()

  # SVNCommit.revnum - 1 is what gets recorded as the revision count.
  svn_rev_count = SVNCommit.revnum - 1
  StatsKeeper().set_svn_rev_count(svn_rev_count)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4166
def pass6():
  """Pass 6: sort the symbol openings/closings file, unless the
  conversion is trunk-only, in which case nothing is sorted."""
  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  if Ctx().trunk_only:
    Log().write(LOG_QUIET, "Done")
    return

  sort_file(temp(SYMBOL_OPENINGS_CLOSINGS),
            temp(SYMBOL_OPENINGS_CLOSINGS_SORTED))
  # The sorted file is registered for cleanup, keyed on pass8.
  Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), pass8)
  Log().write(LOG_QUIET, "Done")
4175
def pass7():
  """Pass 7: record, for every symbolic name, the offset at which it
  first appears in the sorted openings/closings file.  Nothing is
  generated for a trunk-only conversion."""
  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      while 1:
        # Remember where this line starts before reading it.
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only SYM is used, but unpacking all three fields keeps the
        # check that every line has the expected number of fields.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4209
def pass8():
  """Pass 8: fetch SVNCommits from the PersistenceManager, starting at
  revision number 2, and commit each one to an SVNRepositoryMirror
  until no further commit is returned.

  Unless Ctx().dry_run is set, the mirror gets a RepositoryDelegate
  when Ctx().target is set and a DumpfileDelegate otherwise.  A
  StdoutDelegate is always added."""
  svncounter = 2 # Repository initialization is 1.
  repos = SVNRepositoryMirror()
  persistence_manager = PersistenceManager(DB_OPEN_READ)

  if Ctx().target:
    delegate_class = RepositoryDelegate
    banner = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    banner = "Starting Subversion Dumpfile."
  # The delegate is constructed before the banner is logged.
  if not Ctx().dry_run:
    repos.add_delegate(delegate_class())
  Log().write(LOG_QUIET, banner)

  repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  svn_commit = persistence_manager.get_svn_commit(svncounter)
  while svn_commit:
    repos.commit(svn_commit)
    svncounter = svncounter + 1
    svn_commit = persistence_manager.get_svn_commit(svncounter)

  repos.finish()
4234
# The pass functions in execution order: pass number N is stored at
# index N - 1.
_passes = [pass1, pass2, pass3, pass4, pass5, pass6, pass7, pass8]
4245
4246
class Ctx:
  """Holds the session state of a cvs2svn run, such as the run-time
  options.  All instances share a single attribute dictionary (the
  Borg pattern), see
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531.
  """
  __shared_state = { }
  def __init__(self):
    state = self.__shared_state
    self.__dict__ = state
    if not state:
      # Nothing has been stored yet, so fill in the defaults.  They
      # are loosely grouped by topic; the order has no significance.
      self.cvsroot = None
      self.target = None
      self.dumpfile = DUMPFILE
      self.tmpdir = '.'
      self.svnadmin = "svnadmin"

      self.existing_svnrepos = 0
      self.fs_type = None
      self.bdb_txn_nosync = 0

      self.verbose = 0
      self.quiet = 0
      self.print_help = 0

      self.dump_only = 0
      self.dry_run = 0
      self.skip_cleanup = 0
      self.use_cvs = None

      self.trunk_only = 0
      self.trunk_base = "trunk"
      self.tags_base = "tags"
      self.branches_base = "branches"
      self.prune = 1

      self.encoding = "ascii"
      self.mime_types_file = None
      self.mime_mapper = None
      self.no_default_eol = 0
      self.eol_from_mime_type = 0
      self.keywords_off = 0
      self.username = None
      self.cvs_revnums = 0

      self.forced_branches = []
      self.forced_tags = []
      self.excludes = []
      self.symbol_transforms = []
4290
class MimeMapper:
  """A class that provides mappings from file names to MIME types.
  Note that we should really be using Python's 'mimetypes' module.
  See http://cvs2svn.tigris.org/servlets/ReadMsg?list=dev&msgNo=266
  for more."""

  def __init__(self):
    # Maps a file extension (without the dot), or a whole base name
    # for files that have no extension, to a MIME type string.
    self.mappings = { }

  def set_mime_types_file(self, mime_types_file):
    """Add the mappings found in the file MIME_TYPES_FILE.

    A useful line looks something like "text/plain c h cpp": a MIME
    type followed by one or more extensions.  Lines starting with '#'
    and lines with fewer than two fields are skipped.  If an
    extension is mapped to two different types, a warning is written
    to stderr and the later mapping wins."""
    # Use a private FileInput instance, like the rest of this file,
    # rather than the module-global state behind fileinput.input().
    lines = fileinput.FileInput(mime_types_file)
    try:
      for line in lines:
        if line.startswith("#"):
          continue

        fields = line.split()
        if len(fields) < 2:
          continue
        mime_type = fields[0]
        for ext in fields[1:]:
          previous = self.mappings.get(ext)
          if previous is not None and previous != mime_type:
            sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n" \
                             % (warning_prefix, ext, previous, mime_type))
          self.mappings[ext] = mime_type
    finally:
      lines.close()

  def get_type_from_filename(self, filename):
    """Return the MIME type mapped to FILENAME, or None if there is no
    mapping.  The lookup key is the extension of FILENAME, or its
    base name when there is no extension."""
    basename, extension = os.path.splitext(os.path.basename(filename))

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename
    return self.mappings.get(extension)
4333
4334
4335 def convert(start_pass, end_pass):
4336   "Convert a CVS repository to an SVN repository."
4337
4338   cleanup = Cleanup()
4339   times = [ None ] * (end_pass + 1)
4340   times[start_pass - 1] = time.time()
4341   StatsKeeper().set_start_time(time.time())
4342   for i in range(start_pass - 1, end_pass):
4343     Log().write(LOG_QUIET, '----- pass %d -----' % (i + 1))
4344     _passes[i]()
4345     times[i + 1] = time.time()
4346     StatsKeeper().log_duration_for_pass(times[i + 1] - times[i], i + 1)
4347     # Dispose of items in Ctx() not intended to live past the end of the pass
4348     # (Identified by exactly one leading underscore)
4349     for attr in dir(Ctx()):
4350       if (len(attr) > 2 and attr[0] == '_' and attr[1] != '_'
4351           and not attr[:6] == "_Ctx__"):
4352         delattr(Ctx(), attr)
4353     if not Ctx().skip_cleanup:
4354       cleanup.cleanup(_passes[i])
4355     StatsKeeper().set_end_time(time.time())
4356
4357   Log().write(LOG_QUIET, StatsKeeper())
4358   if end_pass < 4:
4359     Log().write(LOG_QUIET, '(These are unaltered CVS repository stats and do not\n'
4360                 + ' reflect tags or branches excluded via --exclude)\n')
4361   print StatsKeeper().timings()
4362
4363
4364 def usage():
4365   print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
4366         % os.path.basename(sys.argv[0])
4367   print '  --help, -h           print this usage message and exit with success'
4368   print '  --version            print the version number'
4369   print '  -q                   quiet'
4370   print '  -v                   verbose'
4371   print '  -s PATH              path for SVN repos'
4372   print '  -p START[:END]       start at pass START, end at pass END of %d' % len(_passes)
4373   print '                       If only START is given, run only pass START'
4374   print '                       (implicitly enables --skip-cleanup)'
4375   print '  --existing-svnrepos  load into existing SVN repository'
4376   print '  --dumpfile=PATH      name of intermediate svn dumpfile'
4377   print '  --tmpdir=PATH        directory to use for tmp data (default to cwd)'
4378   print '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)'
4379   print '  --dry-run            do not create a repository or a dumpfile;'
4380   print '                       just print what would happen.'
4381   print '  --use-cvs            use CVS instead of RCS \'co\' to extract data'
4382   print '                       (only use this if having problems with RCS)'
4383   print '  --svnadmin=PATH      path to the svnadmin program'
4384   print '  --trunk-only         convert only trunk commits, not tags nor branches'
4385   print '  --trunk=PATH         path for trunk (default: %s)'    \
4386         % Ctx().trunk_base
4387   print '  --branches=PATH      path for branches (default: %s)' \
4388         % Ctx().branches_base
4389   print '  --tags=PATH          path for tags (default: %s)'     \
4390         % Ctx().tags_base
4391   print '  --no-prune           don\'t prune empty directories'
4392   print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
4393   print '  --encoding=ENC       encoding of log messages in CVS repos (default: %s)' \
4394         % Ctx().encoding
4395   print '  --force-branch=NAME  force NAME to be a branch'
4396   print '  --force-tag=NAME     force NAME to be a tag'
4397   print '  --exclude=REGEXP     exclude branches and tags matching REGEXP'
4398   print '  --symbol-transform=P:S transform symbol names from P to S where P and S'
4399   print '                       use Python regexp and reference syntax respectively'
4400   print '  --username=NAME      username for cvs2svn-synthesized commits'
4401   print '  --skip-cleanup       prevent the deletion of intermediate files'
4402   print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
4403   print '  --fs-type=TYPE       pass --fs-type=TYPE to "svnadmin create"'
4404   print '  --cvs-revnums        record CVS revision numbers as file properties'
4405   print '  --mime-types=FILE    specify an apache-style mime.types file for\n' \
4406         '                       setting svn:mime-type'
4407   print '  --eol-from-mime-type set svn:eol-style by mime type (only with --mime-types)'
4408   print '  --no-default-eol     don\'t set svn:eol-style by CVS defaults'
4409   print '  --keywords-off       don\'t set svn:keywords on any files (cvs2svn sets'
4410   print '                       "svn:keywords to author date id" on non-binary files'
4411   print '                       by default)'
4412
4413 def main():
4414   # Convenience var, so we don't have to keep instantiating this Borg.
4415   ctx = Ctx()
4416
4417   profiling = None
4418   start_pass = 1
4419   end_pass = len(_passes)
4420
4421   try:
4422     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
4423                                [ "help", "create", "trunk=",
4424                                  "username=", "existing-svnrepos",
4425                                  "branches=", "tags=", "encoding=",
4426                                  "force-branch=", "force-tag=", "exclude=",
4427                                  "use-cvs", "mime-types=",
4428                                  "eol-from-mime-type", "no-default-eol",
4429                                  "trunk-only", "no-prune", "dry-run",
4430                                  "dump-only", "dumpfile=", "tmpdir=",
4431                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
4432                                  "bdb-txn-nosync", "fs-type=",
4433                                  "version", "profile",
4434                                  "keywords-off", "symbol-transform="])
4435   except getopt.GetoptError, e:
4436     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
4437     usage()
4438     sys.exit(1)
4439
4440   for opt, value in opts:
4441     if opt == '--version':
4442         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
4443         sys.exit(0)
4444     elif opt == '-p':
4445       # Don't cleanup if we're doing incrementals.
4446       ctx.skip_cleanup = 1
4447       if value.find(':') > 0:
4448         start_pass, end_pass = map(int, value.split(':'))
4449       else:
4450         end_pass = start_pass = int(value)
4451       if start_pass > len(_passes) or start_pass < 1:
4452         print '%s: illegal value (%d) for starting pass. '\
4453               'must be 1 through %d.' % (error_prefix, int(start_pass),
4454                                          len(_passes))
4455         sys.exit(1)
4456       if end_pass < start_pass or end_pass > len(_passes):
4457         print '%s: illegal value (%d) for ending pass. ' \
4458               'must be %d through %d.' % (error_prefix, int(end_pass),
4459                                           int(start_pass), len(_passes))
4460         sys.exit(1)
4461     elif (opt == '--help') or (opt == '-h'):
4462       ctx.print_help = 1
4463     elif opt == '-v':
4464       Log().log_level = LOG_VERBOSE
4465       ctx.verbose = 1
4466     elif opt == '-q':
4467       Log().log_level = LOG_QUIET
4468       ctx.quiet = 1
4469     elif opt == '-s':
4470       ctx.target = value
4471     elif opt == '--existing-svnrepos':
4472       ctx.existing_svnrepos = 1
4473     elif opt == '--dumpfile':
4474       ctx.dumpfile = value
4475     elif opt == '--tmpdir':
4476       ctx.tmpdir = value
4477     elif opt == '--use-cvs':
4478       ctx.use_cvs = 1
4479     elif opt == '--svnadmin':
4480       ctx.svnadmin = value
4481     elif opt == '--trunk-only':
4482       ctx.trunk_only = 1
4483     elif opt == '--trunk':
4484       if not value:
4485         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4486       ctx.trunk_base = value
4487     elif opt == '--branches':
4488       if not value:
4489         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4490       ctx.branches_base = value
4491     elif opt == '--tags':
4492       if not value:
4493         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4494       ctx.tags_base = value
4495     elif opt == '--no-prune':
4496       ctx.prune = None
4497     elif opt == '--dump-only':
4498       ctx.dump_only = 1
4499     elif opt == '--dry-run':
4500       ctx.dry_run = 1
4501     elif opt == '--encoding':
4502       ctx.encoding = value
4503     elif opt == '--force-branch':
4504       ctx.forced_branches.append(value)
4505     elif opt == '--force-tag':
4506       ctx.forced_tags.append(value)
4507     elif opt == '--exclude':
4508       try:
4509         ctx.excludes.append(re.compile('^' + value + '$'))
4510       except re.error, e:
4511         sys.exit(error_prefix + ": '%s' is not a valid regexp.\n" % (value))
4512     elif opt == '--mime-types':
4513       ctx.mime_types_file = value
4514     elif opt == '--eol-from-mime-type':
4515       ctx.eol_from_mime_type = 1
4516     elif opt == '--no-default-eol':
4517       ctx.no_default_eol = 1
4518     elif opt == '--keywords-off':
4519       ctx.keywords_off = 1
4520     elif opt == '--username':
4521       ctx.username = value
4522     elif opt == '--skip-cleanup':
4523       ctx.skip_cleanup = 1
4524     elif opt == '--cvs-revnums':
4525       ctx.cvs_revnums = 1
4526     elif opt == '--bdb-txn-nosync':
4527       ctx.bdb_txn_nosync = 1
4528     elif opt == '--fs-type':
4529       ctx.fs_type = value
4530     elif opt == '--create':
4531       sys.stderr.write(warning_prefix +
4532           ': The behaviour produced by the --create option is now the '
4533           'default,\nand passing the option is deprecated.\n')
4534     elif opt == '--profile':
4535       profiling = 1
4536     elif opt == '--symbol-transform':
4537       ctx.symbol_transforms.append(value.split(":"))
4538
4539   if ctx.print_help:
4540     usage()
4541     sys.exit(0)
4542
4543   # Consistency check for options and arguments.
4544   if len(args) == 0:
4545     usage()
4546     sys.exit(1)
4547
4548   if len(args) > 1:
4549     sys.stderr.write(error_prefix +
4550                      ": must pass only one CVS repository.\n")
4551     usage()
4552     sys.exit(1)
4553
4554   ctx.cvsroot = args[0]
4555
4556   if not os.path.isdir(ctx.cvsroot):
4557     sys.stderr.write(error_prefix +
4558                      ": the given CVS repository path '%s' is not an "
4559                      "existing directory.\n" % ctx.cvsroot)
4560     sys.exit(1)
4561
4562   if ctx.use_cvs:
4563     # Ascend above the specified root if necessary, to find the cvs_repository
4564     # (a directory containing a CVSROOT directory) and the cvs_module (the
4565     # path of the conversion root within the cvs repository)
4566     # NB: cvs_module must be seperated by '/' *not* by os.sep .
4567     ctx.cvs_repository = os.path.abspath(ctx.cvsroot)
4568     prev_cvs_repository = None
4569     ctx.cvs_module = ""
4570     while prev_cvs_repository != ctx.cvs_repository:
4571       if os.path.isdir(os.path.join(ctx.cvs_repository, 'CVSROOT')):
4572         break
4573       prev_cvs_repository = ctx.cvs_repository
4574       ctx.cvs_repository, module_component = os.path.split(ctx.cvs_repository)
4575       ctx.cvs_module = module_component + "/" + ctx.cvs_module
4576     else:
4577       # Hit the root (of the drive, on Windows) without finding a CVSROOT dir.
4578       sys.stderr.write(error_prefix +
4579                        ": the path '%s' is not a CVS repository, nor a path " \
4580                        "within a CVS repository.  A CVS repository contains " \
4581                        "a CVSROOT directory within its root directory.\n" \
4582                        % ctx.cvsroot)
4583       sys.exit(1)
4584     os.environ['CVSROOT'] = ctx.cvs_repository
4585
4586   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
4587     sys.stderr.write(error_prefix +
4588                      ": must pass one of '-s' or '--dump-only'.\n")
4589     sys.exit(1)
4590
4591   def not_both(opt1val, opt1name, opt2val, opt2name):
4592     if opt1val and opt2val:
4593       sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
4594                        % (opt1name, opt2name))
4595       sys.exit(1)
4596
4597   not_both(ctx.target, '-s',
4598            ctx.dump_only, '--dump-only')
4599
4600   not_both(ctx.dump_only, '--dump-only',
4601            ctx.existing_svnrepos, '--existing-svnrepos')
4602
4603   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
4604            ctx.existing_svnrepos, '--existing-svnrepos')
4605
4606   not_both(ctx.dump_only, '--dump-only',
4607            ctx.bdb_txn_nosync, '--bdb-txn-nosync')
4608
4609   not_both(ctx.quiet, '-q',
4610            ctx.verbose, '-v')
4611
4612   not_both(ctx.fs_type, '--fs-type',
4613            ctx.existing_svnrepos, '--existing-svnrepos')
4614
4615   if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
4616     sys.stderr.write(error_prefix +
4617                      ": cannot pass --bdb-txn-nosync with --fs-type=%s.\n" \
4618                      % ctx.fs_type)
4619     sys.exit(1)
4620
4621   if ((string.find(ctx.trunk_base, '/') > -1)
4622       or (string.find(ctx.tags_base, '/') > -1)
4623       or (string.find(ctx.branches_base, '/') > -1)):
4624     sys.stderr.write("%s: cannot pass multicomponent path to "
4625                      "--trunk, --tags, or --branches yet.\n"
4626                      "  See http://cvs2svn.tigris.org/issues/show_bug.cgi?"
4627                      "id=7 for details.\n" % error_prefix)
4628     sys.exit(1)
4629
4630   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
4631     sys.stderr.write(error_prefix +
4632                      ": the svn-repos-path '%s' is not an "
4633                      "existing directory.\n" % ctx.target)
4634     sys.exit(1)
4635
4636   if not ctx.dump_only and not ctx.existing_svnrepos \
4637      and (not ctx.dry_run) and os.path.exists(ctx.target):
4638     sys.stderr.write(error_prefix +
4639                      ": the svn-repos-path '%s' exists.\nRemove it, or pass "
4640                      "'--existing-svnrepos'.\n" % ctx.target)
4641     sys.exit(1)
4642
4643   if ctx.mime_types_file:
4644     ctx.mime_mapper = MimeMapper()
4645     ctx.mime_mapper.set_mime_types_file(ctx.mime_types_file)
4646
4647   # Make sure the tmp directory exists.  Note that we don't check if
4648   # it's empty -- we want to be able to use, for example, "." to hold
4649   # tempfiles.  But if we *did* want check if it were empty, we'd do
4650   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
4651   if not os.path.exists(ctx.tmpdir):
4652     os.mkdir(ctx.tmpdir)
4653   elif not os.path.isdir(ctx.tmpdir):
4654     sys.stderr.write(error_prefix +
4655        ": cvs2svn tried to use '%s' for temporary files, but that path\n"
4656        "  exists and is not a directory.  Please make it be a directory,\n"
4657        "  or specify some other directory for temporary files.\n" \
4658                      % ctx.tmpdir)
4659     sys.exit(1)
4660
4661   if ctx.use_cvs:
4662     def cvs_ok():
4663       pipe = Popen3('cvs %s --version' % Ctx().cvs_global_arguments, True)
4664       pipe.tochild.close()
4665       pipe.fromchild.read()
4666       errmsg = pipe.childerr.read()
4667       status = pipe.wait()
4668       ok = len(errmsg) == 0 and status == 0
4669       return (ok, status, errmsg)
4670
4671     ctx.cvs_global_arguments = "-q -R"
4672     ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4673     if not ok:
4674       ctx.cvs_global_arguments = "-q"
4675       ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4676
4677     if not ok:
4678       sys.stderr.write(error_prefix +
4679                        ": error executing CVS: status %s, error output:\n" \
4680                        % (cvs_exitstatus) + cvs_errmsg)
4681
4682   # But do lock the tmpdir, to avoid process clash.
4683   try:
4684     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4685   except OSError, e:
4686     if e.errno == errno.EACCES:
4687       sys.stderr.write(error_prefix + ": Permission denied:"
4688                        + " No write access to output directory.\n")
4689       sys.exit(1)
4690     if e.errno == errno.EEXIST:
4691       sys.stderr.write(error_prefix +
4692           ": cvs2svn is using directory '%s' for temporary files, but\n"
4693           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
4694           "  cvs2svn process is currently using '%s' as its temporary\n"
4695           "  workspace.  If you are certain that is not the case,\n"
4696           "  then remove the '%s/cvs2svn.lock' subdirectory.\n" \
4697                        % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir))
4698       sys.exit(1)
4699     raise
4700   try:
4701     if profiling:
4702       import hotshot
4703       prof = hotshot.Profile('cvs2svn.hotshot')
4704       prof.runcall(convert, start_pass, end_pass)
4705       prof.close()
4706     else:
4707       convert(start_pass, end_pass)
4708   finally:
4709     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4710     except: pass
4711
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
  main()