cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
# Version string of the form 'r<N>'.  When Subversion expands the keyword
# to "$LastChangedRevision: <N> $", the slice [22:-2] drops the 22-character
# "$LastChangedRevision: " prefix and the trailing " $", leaving <N>.  With
# the keyword unexpanded, as written here, the slice is empty and VERSION
# is just 'r'.
VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 import cvs2svn_rcsparse
  23 import os
  24 import sys
  25 import sha
  26 import re
  27 import time
  28 import fileinput
  29 import string
  30 import getopt
  31 import stat
  32 import md5
  33 import marshal
  34 import errno
  35 import popen2
  36
  37 # Warnings and errors start with these strings.  They are typically
  38 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  39 warning_prefix = "WARNING"
  40 error_prefix = "ERROR"
  41
  42 # Make sure this Python is recent enough.
  43 if sys.hexversion < 0x2000000:
  44   sys.stderr.write("'%s: Python 2.0 or higher required, "
  45                    "see www.python.org.\n" % error_prefix)
  46   sys.exit(1)
  47
  48 # Pretend we have true booleans on older python versions
  49 try:
  50   True
  51 except:
  52   True = 1
  53   False = 0
  54
  55 # Minimal, incomplete, version of popen2.Popen3 for those platforms
  56 # for which popen2 does not provide it.
  57 try:
  58   Popen3 = popen2.Popen3
  59 except AttributeError:
  60   class Popen3:
  61     def __init__(self, cmd, capturestderr):
  62       if type(cmd) != str:
  63         cmd = " ".join(cmd)
  64       self.fromchild, self.tochild, self.childerr = popen2.popen3(cmd,
  65                                                                   mode='b')
  66     def wait(self):
  67       return self.fromchild.close() or self.tochild.close() or \
  68              self.childerr.close()
  69
  70 # DBM module selection
  71
  72 # 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
  73 #    so that the dbhash module used by anydbm will use bsddb3.
  74 try:
  75   import bsddb3
  76   sys.modules['bsddb'] = sys.modules['bsddb3']
  77 except ImportError:
  78   pass
  79
  80 # 2. These DBM modules are not good for cvs2svn.
  81 import anydbm
  82 if (anydbm._defaultmod.__name__ == 'dumbdbm'
  83     or anydbm._defaultmod.__name__ == 'dbm'):
  84   print 'ERROR: your installation of Python does not contain a suitable'
  85   print '  DBM module. This script cannot continue.'
  86   print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  87   print '  for details.'
  88   sys.exit(1)
  89
  90 # 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
  91 #    Unfortunately, gdbm appears not to be trouble free, either.
  92 if hasattr(anydbm._defaultmod, 'bsddb') \
  93     and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  94   try:
  95     gdbm = __import__('gdbm')
  96   except ImportError:
  97     sys.stderr.write(warning_prefix +
  98         ': The version of the bsddb module found '
  99         'on your computer has been reported to malfunction on some datasets, '
 100         'causing KeyError exceptions. You may wish to upgrade your Python to '
 101         'version 2.3 or later.\n')
 102   else:
 103     anydbm._defaultmod = gdbm
 104
 105 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
 106 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
 107 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
 108
 109 # This really only matches standard '1.1.1.*'-style vendor revisions.
 110 # One could conceivably have a file whose default branch is 1.1.3 or
 111 # whatever, or was that at some point in time, with vendor revisions
 112 # 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
 113 # is the only time this regexp gets used), we'd have no basis for
 114 # assuming that the non-standard vendor branch had ever been the
 115 # default branch anyway, so we don't want this to match them anyway.
 116 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 117
# If this run's output is a repository, then (in the tmpdir) we use
# a dumpfile of this name for repository loads.
#
# If this run's output is a dumpfile, then this is the default name of
# that dumpfile, but in the current directory (unless the user has
# specified a dumpfile path, of course, in which case it will be
# wherever the user said).
DUMPFILE = 'cvs2svn-dump'

# This file appears with different suffixes at different stages of
# processing.  CVS revisions are cleaned and sorted here, for commit
# grouping.  See design-notes.txt for details.
DATAFILE = 'cvs2svn-data'

# This file contains a marshalled copy of all the statistics that we
# gather throughout the various runs of cvs2svn.  The data is stored as
# a marshalled dictionary.
STATISTICS_FILE = 'cvs2svn-statistics'

# This text file contains records (1 per line) that describe svn
# filesystem paths that are the opening and closing source revisions
# for copies to tags and branches.  The format is as follows:
#
# SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
#
# Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
# SVN_REVNUM are the primary and secondary sorting criteria for
# creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
# A sorted version of the above file.
SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'

# This file is a temporary file for storing symbolic_name -> closing
# CVSRevision until the end of our pass where we can look up the
# corresponding SVNRevNum for the closing revs and write these out to
# the SYMBOL_OPENINGS_CLOSINGS.
SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'

# Skeleton version of an svn filesystem.
# (These supersede and will eventually replace the two above.)
# See class SVNRepositoryMirror for how these work.
SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'

# Offsets pointing to the beginning of each SYMBOLIC_NAME in
# SYMBOL_OPENINGS_CLOSINGS_SORTED
SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'

# Maps CVSRevision.unique_key()s to lists of symbolic names, where
# the CVSRevision is the last such that is a source for those symbolic
# names.  For example, if branch B's number is 1.3.0.2 in this CVS
# file, and this file's 1.3 is the latest (by date) revision among
# *all* CVS files that is a source for branch B, then the
# CVSRevision.unique_key() corresponding to this file at 1.3 would
# list at least B in its list.
SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'

# Maps CVSRevision.unique_key() to corresponding line in s-revs.
###PERF Or, we could map to an offset into s-revs, instead of dup'ing
### the s-revs data in this database.
CVS_REVS_DB = 'cvs2svn-cvs-revs.db'

# Lists all symbolic names that are tags.  Keys are strings (symbolic
# names), values are ignorable.
TAGS_DB = 'cvs2svn-tags.db'

# A list of all tags.  Each line consists of the tag name and the number
# of files in which it exists, separated by a space.
TAGS_LIST = 'cvs2svn-tags.txt'

# A list of all branches.  The file is stored as a plain text file
# to make it easy to look at in an editor.  Each line contains the
# branch name, the number of files where the branch is created, the
# commit count, and a list of tags and branches that are defined on
# revisions in the branch.
BRANCHES_LIST = 'cvs2svn-branches.txt'

# These two databases provide a bidirectional mapping between
# CVSRevision.unique_key()s and Subversion revision numbers.
#
# The first maps CVSRevision.unique_key() to a number; the values are
# not unique.
#
# The second maps a number to a list of CVSRevision.unique_key()s.
CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'

# This database maps svn_revnums to tuples of (symbolic_name, date).
#
# The svn_revnums are the revision numbers of all non-primary
# SVNCommits.  No primary SVNCommit has a key in this database.
#
# The date is stored for all commits in this database.
#
# For commits that fill symbolic names, the symbolic_name is stored.
# For commits that are default branch syncs, the symbolic_name is None.
SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'

# This database maps svn_revnums of a default branch synchronization
# commit to the svn_revnum of the primary SVNCommit that motivated it.
#
# (NOTE: Secondary commits that fill branches and tags also have a
# motivating commit, but we do not record it because it is (currently)
# not needed for anything.)
#
# This mapping is used when generating the log message for the commit
# that synchronizes the default branch with trunk.
MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'

# How many bytes to read at a time from a pipe.  128 kiB should be
# large enough to be efficient without wasting too much memory.
PIPE_READ_SIZE = 128 * 1024

# Record the default RCS branches, if any, for CVS filepaths.
#
# The keys are CVS filepaths, relative to the top of the repository
# and with the ",v" stripped off, so they match the cvs paths used in
# Commit.commit().  The values are vendor branch revisions, such as
# '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
# represents the highest vendor branch revision thought to have ever
# been head of the default branch.
#
# The reason we record a specific vendor revision, rather than a
# default branch number, is that there are two cases to handle:
#
# One case is simple.  The RCS file lists a default branch explicitly
# in its header, such as '1.1.1'.  In this case, we know that every
# revision on the vendor branch is to be treated as head of trunk at
# that point in time.
#
# But there's also a degenerate case.  The RCS file does not currently
# have a default branch, yet we can deduce that for some period in the
# past it probably *did* have one.  For example, the file has vendor
# revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
# and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
# case, we should record 1.1.1.96 as the last vendor revision to have
# been the head of the default branch.
DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'

# Records the author and log message for each changeset.
# The keys are author+log digests, the same kind used to identify
# unique revisions in the .revs, etc files.  Each value is a tuple
# of two elements: '(author logmessage)'.
METADATA_DB = "cvs2svn-metadata.db"

# Suffixes appended to DATAFILE at the different processing stages
# (see the DATAFILE comment above).
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
RESYNC_SUFFIX = '.resync'

# Sentinel revision number; presumably means "no valid svn revision" --
# verify against the code that uses it.
SVN_INVALID_REVNUM = -1

COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP   = '-'
OP_ADD    = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY    = 'E'

# sha.digestsize * 2 is the length of a hex-encoded SHA digest.  The
# leading 9 presumably covers the timestamp field and the separating
# space that precede the digest in a revs line -- TODO confirm against
# the code that slices with this index.
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'
 287
 288 def temp(basename):
 289   """Return a path to BASENAME in Ctx().tmpdir.
 290   This is a convenience function to save horizontal space in source."""
 291   return os.path.join(Ctx().tmpdir, basename)
 292
 293 # Since the unofficial set also includes [/\] we need to translate those
 294 # into ones that don't conflict with Subversion limitations.
 295 def _clean_symbolic_name(name):
 296   """Return symbolic name NAME, translating characters that Subversion
 297   does not allow in a pathname."""
 298   name = name.replace('/','++')
 299   name = name.replace('\\','--')
 300   return name
 301
 302 def _path_join(*components):
 303   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 304   Empty component are skipped."""
 305   return string.join(filter(None, components), '/')
 306
 307 def run_command(command):
 308   if os.system(command):
 309     sys.exit('Command failed: "%s"' % command)
 310
 311 def relative_name(cvsroot, fname):
 312   l = len(cvsroot)
 313   if fname[:l] == cvsroot:
 314     if fname[l] == os.sep:
 315       return string.replace(fname[l+1:], os.sep, '/')
 316     return string.replace(fname[l:], os.sep, '/')
 317   sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
 318                    " cvsroot\n" % (error_prefix, cvsroot, fname))
 319   sys.exit(1)
 320
 321 def get_co_pipe(c_rev):
 322   """Return a command string, and the pipe created using that string.
 323   C_REV is a CVSRevision. The pipe returns the text of that CVS Revision."""
 324   ctx = Ctx()
 325   if ctx.use_cvs:
 326     pipe_cmd = 'cvs %s co -r%s -p %s' % \
 327                (ctx.cvs_global_arguments, c_rev.rev,
 328                 escape_shell_arg(ctx.cvs_module + c_rev.cvs_path))
 329   else:
 330     pipe_cmd = 'co -q -x,v -p%s %s' % \
 331                (c_rev.rev, escape_shell_arg(c_rev.rcs_path()))
 332   pipe = Popen3(pipe_cmd, True)
 333   pipe.tochild.close()
 334   return pipe_cmd, pipe
 335
 336 def generate_ignores(c_rev):
 337   # Read in props
 338   pipe_cmd, pipe = get_co_pipe(c_rev)
 339   buf = pipe.fromchild.read(PIPE_READ_SIZE)
 340   raw_ignore_val = ""
 341   while buf:
 342     raw_ignore_val = raw_ignore_val + buf
 343     buf = pipe.fromchild.read(PIPE_READ_SIZE)
 344   pipe.fromchild.close()
 345   error_output = pipe.childerr.read()
 346   exit_status = pipe.wait()
 347   if exit_status:
 348     sys.exit("%s: The command '%s' failed with exit status: %s\n"
 349              "and the following output:\n"
 350              "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
 351
 352   # Tweak props: First, convert any spaces to newlines...
 353   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 354   raw_ignores = raw_ignore_val.split('\n')
 355   ignore_vals = [ ]
 356   for ignore in raw_ignores:
 357     # Reset the list if we encounter a '!'
 358     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 359     if ignore == '!':
 360       ignore_vals = [ ]
 361       continue
 362     # Skip empty lines
 363     if len(ignore) == 0:
 364       continue
 365     ignore_vals.append(ignore)
 366   return ignore_vals
 367
 368 # Return a string that has not been returned by gen_key() before.
 369 gen_key_base = 0L
 370 def gen_key():
 371   global gen_key_base
 372   key = '%x' % gen_key_base
 373   gen_key_base = gen_key_base + 1
 374   return key
 375
 376 if sys.platform == "win32":
 377   def escape_shell_arg(str):
 378     return '"' + string.replace(str, '"', '"^""') + '"'
 379 else:
 380   def escape_shell_arg(str):
 381     return "'" + string.replace(str, "'", "'\\''") + "'"
 382
 383 def format_date(date):
 384   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 385   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 386   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 387
 388 def sort_file(infile, outfile):
 389   # sort the log files
 390
 391   # GNU sort will sort our dates differently (incorrectly!) if our
 392   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 393   # it to 'C'
 394   if os.environ.has_key('LC_ALL'):
 395     lc_all_tmp = os.environ['LC_ALL']
 396   else:
 397     lc_all_tmp = None
 398   os.environ['LC_ALL'] = 'C'
 399   # The -T option to sort has a nice side effect.  The Win32 sort is
 400   # case insensitive and cannot be used, and since it does not
 401   # understand the -T option and dies if we try to use it, there is
 402   # no risk that we use that sort by accident.
 403   run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 404   if lc_all_tmp is None:
 405     del os.environ['LC_ALL']
 406   else:
 407     os.environ['LC_ALL'] = lc_all_tmp
 408
 409 def print_node_tree(tree, root_node, indent_depth=0):
 410   """For debugging purposes.  Prints all nodes in TREE that are
 411   rooted at ROOT_NODE.  INDENT_DEPTH is merely for purposes of
 412   debugging with the print statement in this function."""
 413   if not indent_depth:
 414     print "TREE", "=" * 75
 415   print "TREE:", " " * (indent_depth * 2), root_node, tree[root_node]
 416   for key, value in tree[root_node].items():
 417     if key[0] == '/': #Skip flags
 418       continue
 419     print_node_tree(tree, value, (indent_depth + 1))
 420
 421 def match_regexp_list(regexp_list, string):
 422   """Return 1 if string matches any of the compiled regexps in REGEXP_LIST,
 423   else return None."""
 424   for regexp in regexp_list:
 425     if regexp.match(string):
 426       return 1
 427
 428 # These constants represent the log levels that this script supports
 429 LOG_WARN = -1
 430 LOG_QUIET = 0
 431 LOG_NORMAL = 1
 432 LOG_VERBOSE = 2
 433 class Log:
 434   """A Simple logging facility.  Each line will be timestamped is
 435   self.use_timestamps is TRUE.  This class is a Borg, see
 436   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 437   __shared_state = {}
 438   def __init__(self):
 439     self.__dict__ = self.__shared_state
 440     if self.__dict__:
 441       return
 442     self.log_level = LOG_NORMAL
 443     # Set this to true if you want to see timestamps on each line output.
 444     self.use_timestamps = None
 445     self.logger = sys.stdout
 446
 447   def _timestamp(self):
 448     """Output a detailed timestamp at the beginning of each line output."""
 449     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 450
 451   def write(self, log_level, *args):
 452     """This is the public method to use for writing to a file.  Only
 453     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 454     there are multiple ARGS, they will be separated by a space."""
 455     if log_level > self.log_level:
 456       return
 457     if self.use_timestamps:
 458       self._timestamp()
 459     self.logger.write(' '.join(map(str,args)) + "\n")
 460     # Ensure that log output doesn't get out-of-order with respect to
 461     # stderr output.
 462     self.logger.flush()
 463
 464
 465 class Cleanup:
 466   """This singleton class manages any files created by cvs2svn.  When
 467   you first create a file, call Cleanup.register, passing the
 468   filename, and the last pass that you need the file.  After the end
 469   of that pass, your file will be cleaned up after running an optional
 470   callback.  This class is a Borg, see
 471   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 472
 473   __shared_state = {}
 474   def __init__(self):
 475     self.__dict__ = self.__shared_state
 476     if self.__dict__:
 477       return
 478     self._log = {}
 479     self._callbacks = {}
 480
 481   def register(self, file, which_pass, callback=None):
 482     """Register FILE for cleanup at the end of WHICH_PASS, running
 483     function CALLBACK prior to removal.  Registering a given FILE is
 484     idempotent; you may register as many times as you wish, but it
 485     will only be cleaned up once.
 486
 487     Note that if a file is registered multiple times, only the first
 488     callback registered for that file will be called at cleanup
 489     time.  Also note that if you register a database file you must
 490     close the database before cleanup, e.g. using a callback."""
 491     if not self._log.has_key(which_pass):
 492       self._log[which_pass] = {}
 493     self._log[which_pass][file] = 1
 494     if callback and not self._callbacks.has_key(file):
 495       self._callbacks[file] = callback
 496
 497   def cleanup(self, which_pass):
 498     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 499     if not self._log.has_key(which_pass):
 500       return
 501     for file in self._log[which_pass].keys():
 502       Log().write(LOG_VERBOSE, "Deleting", file)
 503       if self._callbacks.has_key(file):
 504         self._callbacks[file]()
 505       os.unlink(file)
 506
 507
 508 # Always use these constants for opening databases.
 509 DB_OPEN_READ = 'r'
 510 DB_OPEN_NEW = 'n'
 511
 512 # A wrapper for anydbm that uses the marshal module to store items as
 513 # strings.
 514 class Database:
 515   def __init__(self, filename, mode):
 516     # pybsddb3 has a bug which prevents it from working with
 517     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 518     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 519     # for databases protected by lock and transaction support
 520     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 521     #
 522     # Therefore, manually perform the removal (we can do this, because
 523     # we know that for bsddb - but *not* anydbm in general - the database
 524     # consists of one file with the name we specify, rather than several
 525     # based on that name).
 526     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 527       if os.path.isfile(filename):
 528         os.unlink(filename)
 529       mode = 'c'
 530
 531     self.db = anydbm.open(filename, mode)
 532
 533   def has_key(self, key):
 534     return self.db.has_key(key)
 535
 536   def __getitem__(self, key):
 537     return marshal.loads(self.db[key])
 538
 539   def __setitem__(self, key, value):
 540     self.db[key] = marshal.dumps(value)
 541
 542   def __delitem__(self, key):
 543     del self.db[key]
 544
 545   def get(self, key, default):
 546     if self.has_key(key):
 547       return self.__getitem__(key)
 548     return default
 549
 550
 551 class StatsKeeper:
 552   __shared_state = { }
 553   def __init__(self):
 554     self.__dict__ = self.__shared_state
 555     if self.__dict__:
 556       return
 557     self.filename = temp(STATISTICS_FILE)
 558     Cleanup().register(self.filename, pass8)
 559     # This can get kinda large, so we don't store it in our data dict.
 560     self.repos_files = { }
 561
 562     if os.path.exists(self.filename):
 563       self.unarchive()
 564     else:
 565       self.data = { 'cvs_revs_count' : 0,
 566                     'tags': { },
 567                     'branches' : { },
 568                     'repos_size' : 0,
 569                     'repos_file_count' : 0,
 570                     'svn_rev_count' : None,
 571                     'first_rev_date' : 1L<<32,
 572                     'last_rev_date' : 0,
 573                     'pass_timings' : { },
 574                     'start_time' : 0,
 575                     'end_time' : 0,
 576                     }
 577
 578   def log_duration_for_pass(self, duration, pass_num):
 579     self.data['pass_timings'][pass_num] = duration
 580
 581   def set_start_time(self, start):
 582     self.data['start_time'] = start
 583
 584   def set_end_time(self, end):
 585     self.data['end_time'] = end
 586
 587   def _bump_item(self, key, amount=1):
 588     self.data[key] = self.data[key] + amount
 589
 590   def reset_c_rev_info(self):
 591     self.data['cvs_revs_count'] = 0
 592     self.data['tags'] = { }
 593     self.data['branches'] = { }
 594
 595   def record_c_rev(self, c_rev):
 596     self._bump_item('cvs_revs_count')
 597
 598     for tag in c_rev.tags:
 599       self.data['tags'][tag] = None
 600     for branch in c_rev.branches:
 601       self.data['branches'][branch] = None
 602
 603     if c_rev.timestamp < self.data['first_rev_date']:
 604       self.data['first_rev_date'] = c_rev.timestamp
 605
 606     if c_rev.timestamp > self.data['last_rev_date']:
 607       self.data['last_rev_date'] = c_rev.timestamp
 608
 609     # Only add the size if this is the first time we see the file.
 610     if not self.repos_files.has_key(c_rev.fname):
 611       self._bump_item('repos_size', c_rev.file_size)
 612     self.repos_files[c_rev.fname] = None
 613
 614     self.data['repos_file_count'] = len(self.repos_files)
 615
 616   def set_svn_rev_count(self, count):
 617     self.data['svn_rev_count'] = count
 618
 619   def svn_rev_count(self):
 620     return self.data['svn_rev_count']
 621
 622   def archive(self):
 623     open(self.filename, 'w').write(marshal.dumps(self.data))
 624
 625   def unarchive(self):
 626     self.data = marshal.loads(open(self.filename, 'r').read())
 627
 628   def __str__(self):
 629     svn_revs_str = ""
 630     if self.data['svn_rev_count'] is not None:
 631       svn_revs_str = ('Total SVN Commits:      %10s\n'
 632                       % self.data['svn_rev_count'])
 633
 634     return ('\n'                                \
 635             'cvs2svn Statistics:\n'             \
 636             '------------------\n'              \
 637             'Total CVS Files:        %10i\n'    \
 638             'Total CVS Revisions:    %10i\n'    \
 639             'Total Unique Tags:      %10i\n'    \
 640             'Total Unique Branches:  %10i\n'    \
 641             'CVS Repos Size in KB:   %10i\n'    \
 642             '%s'                                \
 643             'First Revision Date:    %s\n'      \
 644             'Last Revision Date:     %s\n'      \
 645             '------------------'                \
 646             % (self.data['repos_file_count'],
 647                self.data['cvs_revs_count'],
 648                len(self.data['tags']),
 649                len(self.data['branches']),
 650                (self.data['repos_size'] / 1024),
 651                svn_revs_str,
 652                time.ctime(self.data['first_rev_date']),
 653                time.ctime(self.data['last_rev_date']),
 654                ))
 655
 656   def timings(self):
 657     passes = self.data['pass_timings'].keys()
 658     passes.sort()
 659     str = 'Timings:\n------------------\n'
 660
 661     def desc(val):
 662       if val == 1: return "second"
 663       return "seconds"
 664
 665     for pass_num in passes:
 666       duration = int(self.data['pass_timings'][pass_num])
 667       p_str = ('pass %d:%6d %s\n'
 668                % (pass_num, duration, desc(duration)))
 669       str = str + p_str
 670
 671     total = int(self.data['end_time'] - self.data['start_time'])
 672     str = str + ('total: %6d %s' % (total, desc(total)))
 673     return str
 674
 675
 676 class LastSymbolicNameDatabase:
 677   """ Passing every CVSRevision in s-revs to this class will result in
 678   a Database whose key is the last CVS Revision a symbolicname was
 679   seen in, and whose value is a list of all symbolicnames that were
 680   last seen in that revision."""
 681   def __init__(self, mode):
 682     self.symbols = {}
 683     self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
 684     Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
 685
 686   # Once we've gone through all the revs,
 687   # symbols.keys() will be a list of all tags and branches, and
 688   # their corresponding values will be a key into the last CVS revision
 689   # that they were used in.
 690   def log_revision(self, c_rev):
 691     # Gather last CVS Revision for symbolic name info and tag info
 692     for tag in c_rev.tags:
 693       self.symbols[tag] = c_rev.unique_key()
 694     if c_rev.op is not OP_DELETE:
 695       for branch in c_rev.branches:
 696         self.symbols[branch] = c_rev.unique_key()
 697
 698   # Creates an inversion of symbols above--a dictionary of lists (key
 699   # = CVS rev unique_key: val = list of symbols that close in that
 700   # rev.
 701   def create_database(self):
 702     for sym, rev_unique_key in self.symbols.items():
 703       if self.symbol_revs_db.has_key(rev_unique_key):
 704         ary = self.symbol_revs_db[rev_unique_key]
 705         ary.append(sym)
 706         self.symbol_revs_db[rev_unique_key] = ary
 707       else:
 708         self.symbol_revs_db[rev_unique_key] = [sym]
 709
 710
 711 class CVSRevisionDatabase:
 712   """A Database to store CVSRevision objects and retrieve them by their
 713   unique_key()."""
 714
 715   def __init__(self, mode):
 716     """Initialize an instance, opening database in MODE (like the MODE
 717     argument to Database or anydbm.open())."""
 718     self.cvs_revs_db = Database(temp(CVS_REVS_DB), mode)
 719     Cleanup().register(temp(CVS_REVS_DB), pass8)
 720
 721   def log_revision(self, c_rev):
 722     """Add C_REV, a CVSRevision, to the database."""
 723     self.cvs_revs_db[c_rev.unique_key()] = str(c_rev)
 724
 725   def get_revision(self, unique_key):
 726     """Return the CVSRevision stored under UNIQUE_KEY."""
 727     return CVSRevision(Ctx(), self.cvs_revs_db[unique_key])
 728
 729
 730 class TagsDatabase(Database):
 731   """A Database to store which symbolic names are tags.
 732   Each key is a tag name.
 733   The value has no meaning, and should be set to None."""
 734   def __init__(self, mode):
 735     Database.__init__(self, temp(TAGS_DB), mode)
 736     Cleanup().register(temp(TAGS_DB), pass8)
 737
 738
 739 class CVSRevision:
 740   def __init__(self, ctx, *args):
 741     """Initialize a new CVSRevision with Ctx object CTX, and ARGS.
 742
 743     If CTX is None, the following members and methods of the
 744     instantiated CVSRevision class object will be unavailable (or
 745     simply will not work correctly, if at all):
 746        cvs_path
 747        svn_path
 748        svn_trunk_path
 749        is_default_branch_revision()
 750
 751     (Note that this class treats CTX as const, because the caller
 752     likely passed in a Borg instance of a Ctx.  The reason this class
 753     takes CTX as as a parameter, instead of just instantiating a Ctx
 754     itself, is that this class should be usable outside cvs2svn.)
 755
 756     If there is one argument in ARGS, it is a string, in the format of
 757     a line from a revs file.  Do *not* include a trailing newline.
 758
 759     If there are multiple ARGS, there must be 16 of them,
 760     comprising a parsed revs line:
 761        timestamp       -->  (int) date stamp for this cvs revision
 762        digest          -->  (string) digest of author+logmsg
 763        prev_timestamp  -->  (int) date stamp for the previous cvs revision
 764        op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
 765        prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
 766        rev             -->  (string) this CVS rev, e.g., "1.3"
 767        next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
 768        file_in_attic   -->  (char or None) true if RCS file is in Attic
 769        file_executable -->  (char or None) true if RCS file has exec bit set.
 770        file_size       -->  (int) size of the RCS file
 771        deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
 772        mode            -->  (string or None) "kkv", "kb", etc.
 773        branch_name     -->  (string or None) branch on which this rev occurred
 774        tags            -->  (list of strings) all tags on this revision
 775        branches        -->  (list of strings) all branches rooted in this rev
 776        fname           -->  (string) relative path of file in CVS repos
 777
 778     The two forms of initialization are equivalent."""
 779
 780     self._ctx = ctx
 781     if len(args) == 16:
 782       (self.timestamp, self.digest, self.prev_timestamp, self.op,
 783        self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
 784        self.file_executable, self.file_size, self.deltatext_code, self.fname,
 785        self.mode, self.branch_name, self.tags, self.branches) = args
 786     elif len(args) == 1:
 787       data = args[0].split(' ', 14)
 788       self.timestamp = int(data[0], 16)
 789       self.digest = data[1]
 790       if data[2] == "*":
 791         self.prev_timestamp = 0
 792       else:
 793         self.prev_timestamp = int(data[2])
 794       self.op = data[3]
 795       self.prev_rev = data[4]
 796       if self.prev_rev == "*":
 797         self.prev_rev = None
 798       self.rev = data[5]
 799       self.next_rev = data[6]
 800       if self.next_rev == "*":
 801         self.next_rev = None
 802       self.file_in_attic = data[7]
 803       if self.file_in_attic == "*":
 804         self.file_in_attic = None
 805       self.file_executable = data[8]
 806       if self.file_executable == "*":
 807         self.file_executable = None
 808       self.file_size = int(data[9])
 809       self.deltatext_code = data[10]
 810       self.mode = data[11]
 811       if self.mode == "*":
 812         self.mode = None
 813       self.branch_name = data[12]
 814       if self.branch_name == "*":
 815         self.branch_name = None
 816       ntags = int(data[13])
 817       tags = data[14].split(' ', ntags + 1)
 818       nbranches = int(tags[ntags])
 819       branches = tags[ntags + 1].split(' ', nbranches)
 820       self.fname = branches[nbranches]
 821       self.tags = tags[:ntags]
 822       self.branches = branches[:nbranches]
 823     else:
 824       raise TypeError, 'CVSRevision() takes 2 or 16 arguments (%d given)' % \
 825           (len(args) + 1)
 826     if ctx is not None:
 827       self.cvs_path = relative_name(self._ctx.cvsroot, self.fname[:-2])
 828       self.svn_path = self._make_path(self.cvs_path, self.branch_name)
 829       self.svn_trunk_path = self._make_path(self.cvs_path)
 830
 831   # The 'primary key' of a CVS Revision is the revision number + the
 832   # filename.  To provide a unique key (say, for a dict), we just glom
 833   # them together in a string.  By passing in self.prev_rev or
 834   # self.next_rev, you can get the unique key for their respective
 835   # CVSRevisions.
 836   def unique_key(self, revnum=None):
 837     if revnum is None:
 838       revnum = self.rev
 839     return revnum + "/" + self.fname
 840
 841   def __str__(self):
 842     return ('%08lx %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s' % (
 843       self.timestamp, self.digest, self.prev_timestamp or "*", self.op,
 844       (self.prev_rev or "*"), self.rev, (self.next_rev or "*"),
 845       (self.file_in_attic or "*"), (self.file_executable or "*"),
 846       self.file_size,
 847       self.deltatext_code, (self.mode or "*"), (self.branch_name or "*"),
 848       len(self.tags), self.tags and " " or "", " ".join(self.tags),
 849       len(self.branches), self.branches and " " or "", " ".join(self.branches),
 850       self.fname, ))
 851
 852   # Returns true if this CVSRevision is the opening CVSRevision for
 853   # NAME (for this RCS file).
 854   def opens_symbolic_name(self, name):
 855     if name in self.tags:
 856       return 1
 857     if name in self.branches:
 858       # If this c_rev opens a branch and our op is OP_DELETE, then
 859       # that means that the file that this c_rev belongs to was
 860       # created on the branch, so for all intents and purposes, this
 861       # c_rev is *technically* not an opening.  See Issue #62 for more
 862       # information.
 863       if self.op != OP_DELETE:
 864         return 1
 865     return 0
 866
 867   def is_default_branch_revision(self):
 868     """Return 1 if SELF.rev of SELF.cvs_path is a default branch
 869     revision according to DEFAULT_BRANCHES_DB (see the conditions
 870     documented there), else return None."""
 871     if self._ctx._default_branches_db.has_key(self.cvs_path):
 872       val = self._ctx._default_branches_db[self.cvs_path]
 873       val_last_dot = val.rindex(".")
 874       our_last_dot = self.rev.rindex(".")
 875       default_branch = val[:val_last_dot]
 876       our_branch = self.rev[:our_last_dot]
 877       default_rev_component = int(val[val_last_dot + 1:])
 878       our_rev_component = int(self.rev[our_last_dot + 1:])
 879       if (default_branch == our_branch
 880           and our_rev_component <= default_rev_component):
 881         return 1
 882     # else
 883     return None
 884
 885   def _make_path(self, path, branch_name = None):
 886     """Return the trunk path or branch path for PATH.
 887
 888     If PATH is None, return None."""
 889     # For a while, we treated each top-level subdir of the CVS
 890     # repository as a "project root" and interpolated the appropriate
 891     # genealogy (trunk|tag|branch) in according to the official
 892     # recommended layout.  For example, the path '/foo/bar/baz.c' on
 893     # branch 'Rel2' would become
 894     #
 895     #   /foo/branches/Rel2/bar/baz.c
 896     #
 897     # and on trunk it would become
 898     #
 899     #   /foo/trunk/bar/baz.c
 900     #
 901     # However, we went back to the older and simpler method of just
 902     # prepending the genealogy to the front, instead of interpolating.
 903     # So now we produce:
 904     #
 905     #   /branches/Rel2/foo/bar/baz.c
 906     #   /trunk/foo/bar/baz.c
 907     #
 908     # Why?  Well, Jack Repenning pointed out that this way is much
 909     # friendlier to "anonymously rooted subtrees" (that's a tree where
 910     # the name of the top level dir doesn't matter, the point is that if
 911     # you cd into it and, say, run 'make', something good will happen).
 912     # By interpolating, we made it impossible to point cvs2svn at some
 913     # subdir in the CVS repository and convert it as a project, because
 914     # we'd treat every subdir underneath it as an independent project
 915     # root, which is probably not what the user wanted.
 916     #
 917     # Also, see Blair Zajac's post
 918     #
 919     #    http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
 920     #
 921     # and the surrounding thread, for why what people really want is a
 922     # way of specifying an in-repository prefix path, not interpolation.
 923     if path is None:
 924       return None
 925
 926     if branch_name:
 927       branch_name = _clean_symbolic_name(branch_name)
 928       return self._ctx.branches_base + '/' + branch_name + '/' + path
 929     else:
 930       return self._ctx.trunk_base + '/' + path
 931
 932   def rcs_path(self):
 933     """Returns the actual filesystem path to the RCS file of this
 934     CVSRevision."""
 935     if self.file_in_attic is None:
 936       return self.fname
 937     else:
 938       basepath, filename = os.path.split(self.fname)
 939       return os.path.join(basepath, 'Attic', filename)
 940
 941   def filename(self):
 942     "Return the last path component of self.fname, minus the ',v'"
 943     return os.path.split(self.fname)[-1][:-2]
 944
 945 class SymbolDatabase:
 946   """This database records information on all symbols in the RCS
 947   files.  It is created in pass 1 and it is used in pass 2."""
 948   def __init__(self):
 949     # A hash that maps tag names to commit counts
 950     self.tags = { }
 951     # A hash that maps branch names to lists of the format
 952     # [ create_count, commit_count, blockers ], where blockers
 953     # is a hash that lists the symbols that depend on the
 954     # the branch.  The blockers hash is used as a set, so the
 955     # values are not used.
 956     self.branches = { }
 957
 958   def register_tag_creation(self, name):
 959     """Register the creation of the tag NAME."""
 960     if not self.tags.has_key(name):
 961       self.tags[name] = 0
 962     self.tags[name] += 1
 963
 964   def _branch(self, name):
 965     """Helper function to get a branch node that will create and
 966     initialize the node if it does not exist."""
 967     if not self.branches.has_key(name):
 968       self.branches[name] = [ 0, 0, { } ]
 969     return self.branches[name]
 970
 971   def register_branch_creation(self, name):
 972     """Register the creation of the branch NAME."""
 973     self._branch(name)[0] += 1
 974
 975   def register_branch_commit(self, name):
 976     """Register a commit on the branch NAME."""
 977     self._branch(name)[1] += 1
 978
 979   def register_branch_blocker(self, name, blocker):
 980     """Register BLOCKER as a blocker on the branch NAME."""
 981     self._branch(name)[2][blocker] = None
 982
 983   def branch_has_commit(self, name):
 984     """Return non-zero if NAME has commits.  Returns 0 if name
 985     is not a branch or if it has no commits."""
 986     return self.branches.has_key(name) and self.branches[name][1]
 987
 988   def find_excluded_symbols(self, regexp_list):
 989     """Returns a hash of all symbols thaht match the regexps in
 990     REGEXP_LISTE.  The hash is used as a set so the values are
 991     not used."""
 992     excludes = { }
 993     for tag in self.tags.keys():
 994       if match_regexp_list(regexp_list, tag):
 995         excludes[tag] = None
 996     for branch in self.branches.keys():
 997       if match_regexp_list(regexp_list, branch):
 998         excludes[branch] = None
 999     return excludes
1000
1001   def find_branch_exclude_blockers(self, branch, excludes):
1002     """Find all blockers of BRANCH, excluding the ones in the hash
1003     EXCLUDES."""
1004     blockers = { }
1005     if excludes.has_key(branch):
1006       for blocker in self.branches[branch][2]:
1007         if not excludes.has_key(blocker):
1008           blockers[blocker] = None
1009     return blockers
1010
1011   def find_blocked_excludes(self, excludes):
1012     """Find all branches not in EXCLUDES that have blocking symbols that
1013     are not themselves excluded.  Return a hash that maps branch names
1014     to a hash of blockers.  The hash of blockes is used as a set so the
1015     values are not used."""
1016     blocked_branches = { }
1017     for branch in self.branches.keys():
1018       blockers = self.find_branch_exclude_blockers(branch, excludes)
1019       if blockers:
1020         blocked_branches[branch] = blockers
1021     return blocked_branches
1022
1023   def find_mismatches(self, excludes=None):
1024     """Find all symbols that are defined as both tags and branches,
1025     excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
1026     the symbol name, tag count, branch count and commit count."""
1027     if excludes is None:
1028       excludes = { }
1029     mismatches = [ ]
1030     for branch in self.branches.keys():
1031       if not excludes.has_key(branch) and self.tags.has_key(branch):
1032         mismatches.append((branch,                    # name
1033                            self.tags[branch],         # tag count
1034                            self.branches[branch][0],  # branch count
1035                            self.branches[branch][1])) # commit count
1036     return mismatches
1037
1038   def read(self):
1039     """Read the symbol database from files."""
1040     f = open(temp(TAGS_LIST))
1041     while 1:
1042       line = f.readline()
1043       if not line:
1044         break
1045       tag, count = line.split()
1046       self.tags[tag] = int(count)
1047
1048     f = open(temp(BRANCHES_LIST))
1049     while 1:
1050       line = f.readline()
1051       if not line:
1052         break
1053       words = line.split()
1054       self.branches[words[0]] = [ int(words[1]), int(words[2]), { } ]
1055       for blocker in words[3:]:
1056         self.branches[words[0]][2][blocker] = None
1057
1058   def write(self):
1059     """Store the symbol database to files."""
1060     f = open(temp(TAGS_LIST), "w")
1061     Cleanup().register(temp(TAGS_LIST), pass2)
1062     for tag, count in self.tags.items():
1063       f.write("%s %d\n" % (tag, count))
1064
1065     f = open(temp(BRANCHES_LIST), "w")
1066     Cleanup().register(temp(BRANCHES_LIST), pass2)
1067     for branch, info in self.branches.items():
1068       f.write("%s %d %d" % (branch, info[0], info[1]))
1069       if info[2]:
1070         f.write(" ")
1071         f.write(" ".join(info[2].keys()))
1072       f.write("\n")
1073
class CollectData(cvs2svn_rcsparse.Sink):
  """A cvs2svn_rcsparse.Sink that collects revision data from RCS files.

  For every file, set_fname() must be called first to reset the
  per-file state; the remaining methods then record what they are
  told about that file (presumably they are the callbacks the RCS
  parser invokes -- confirm against cvs2svn_rcsparse.Sink).  The
  collected data goes to the revs and resync files, the default
  branches and metadata databases, and self.symbol_db."""

  def __init__(self):
    """Open the output files and databases, and register each of them
    with Cleanup."""
    self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
    self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
    self.default_branches_db = Database(temp(DEFAULT_BRANCHES_DB), DB_OPEN_NEW)
    Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
    Cleanup().register(temp(METADATA_DB), pass8)
    self.fatal_errors = []
    self.num_files = 0
    self.symbol_db = SymbolDatabase()

    # 1 if we've collected data for at least one file, None otherwise.
    self.found_valid_file = None

    # See set_fname() for initializations of other variables.

  def set_fname(self, canonical_name, filename):
    """Prepare to receive data for FILENAME.  FILENAME is the absolute
    filesystem path to the file in question, and CANONICAL_NAME is
    FILENAME with the 'Attic' component removed (if the file is indeed
    in the Attic)."""
    self.fname = canonical_name

    # We calculate and save some file metadata here, where we can do
    # it only once per file, instead of waiting until later where we
    # would have to do the same calculations once per CVS *revision*.

    # The [:-2] drops the trailing ',v'.
    self.rel_name = relative_name(Ctx().cvsroot, self.fname)[:-2]

    # If the paths are not the same, then that means that the
    # canonical_name has had the 'Attic' component stripped out.
    self.file_in_attic = None
    if not canonical_name == filename:
      self.file_in_attic = 1

    file_stat = os.stat(filename)
    # The size of our file in bytes
    self.file_size = file_stat[stat.ST_SIZE]

    # Whether or not the executable bit is set.
    self.file_executable = None
    if file_stat[stat.ST_MODE] & stat.S_IXUSR:
      self.file_executable = 1

    # revision -> [timestamp, author, old-timestamp]
    self.rev_data = { }

    # Maps revision number (key) to the revision number of the
    # previous revision along this line of development.
    #
    # For the first revision R on a branch, we consider the revision
    # from which R sprouted to be the 'previous'.
    #
    # Note that this revision can't be determined arithmetically (due
    # to cvsadmin -o, which is why this is necessary).
    self.prev_rev = { }

    # This dict is essentially self.prev_rev with the values mapped in
    # the other direction, so following key -> value will yield you
    # the next revision number
    self.next_rev = { }

    # Track the state of each revision so that in set_revision_info,
    # we can determine if our op is an add/change/delete.  We can do
    # this because in set_revision_info, we'll have all of the
    # revisions for a file at our fingertips, and we need to examine
    # the state of our prev_rev to determine if we're an add or a
    # change--without the state of the prev_rev, we are unable to
    # distinguish between an add and a change.
    self.rev_state = { }

    # Hash mapping branch numbers, like '1.7.2', to branch names,
    # like 'Release_1_0_dev'.
    self.branch_names = { }

    # RCS flags (used for keyword expansion).
    self.mode = None

    # Hash mapping revision numbers, like '1.7', to lists of names
    # indicating which branches sprout from that revision, like
    # ['Release_1_0_dev', 'experimental_driver', ...].
    self.branchlist = { }

    # Like self.branchlist, but the values are lists of tag names that
    # apply to the key revision.
    self.taglist = { }

    # If set, this is an RCS branch number -- rcsparse calls this the
    # "principal branch", but CVS and RCS refer to it as the "default
    # branch", so that's what we call it, even though the rcsparse API
    # setter method is still 'set_principal_branch'.
    self.default_branch = None

    # If the RCS file doesn't have a default branch anymore, but does
    # have vendor revisions, then we make an educated guess that those
    # revisions *were* the head of the default branch up until the
    # commit of 1.2, at which point the file's default branch became
    # trunk.  This records the date at which 1.2 was committed.
    self.first_non_vendor_revision_date = None

    # A list of all symbols defined for the current file.  Used to
    # prevent multiple definitions of a symbol, something which can
    # easily happen when --symbol-transform is used.
    self.defined_symbols = [ ]

  def set_principal_branch(self, branch):
    """Record BRANCH as the default branch of the current file."""
    self.default_branch = branch

  def set_expansion(self, mode):
    """Record MODE as the RCS keyword expansion mode of the current
    file."""
    self.mode = mode

  def set_branch_name(self, branch_number, name):
    """Record that BRANCH_NUMBER is the branch number for branch NAME,
    and that NAME sprouts from the revision that BRANCH_NUMBER is
    rooted in.
    BRANCH_NUMBER is an RCS branch number with an odd number of components,
    for example '1.7.2' (never '1.7.0.2').

    If BRANCH_NUMBER already has a name, print a warning to stderr and
    ignore NAME."""
    if branch_number not in self.branch_names:
      self.branch_names[branch_number] = name
      # The branchlist is keyed on the revision number from which the
      # branch sprouts, so strip off the odd final component.
      sprout_rev = branch_number[:branch_number.rfind(".")]
      if sprout_rev not in self.branchlist:
        self.branchlist[sprout_rev] = []
      self.branchlist[sprout_rev].append(name)
      self.symbol_db.register_branch_creation(name)
    else:
      sys.stderr.write("%s: in '%s':\n"
                       "   branch '%s' already has name '%s',\n"
                       "   cannot also have name '%s', ignoring the latter\n"
                       % (warning_prefix, self.fname, branch_number,
                          self.branch_names[branch_number], name))

  def rev_to_branch_name(self, revision):
    """Return the name of the branch on which REVISION lies.
    REVISION is a non-branch revision number with an even number of
    components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
    For the convenience of callers, REVISION can also be a trunk
    revision such as '1.2', in which case just return None.
    Also return None if no name is recorded for the branch."""
    if trunk_rev.match(revision):
      return None
    return self.branch_names.get(revision[:revision.rindex(".")])

  def add_cvs_branch(self, revision, branch_name):
    """Record the root revision and branch revision for BRANCH_NAME,
    based on REVISION.  REVISION is a CVS branch number having an even
    number of components where the second-to-last is '0'.  For
    example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
    from 1.7 and has branch number 1.7.2."""
    # Drop the second-to-last component: '1.7.0.2' -> '1.7' + '.2'.
    last_dot = revision.rfind(".")
    branch_rev = revision[:last_dot]
    last2_dot = branch_rev.rfind(".")
    branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
    self.set_branch_name(branch_rev, branch_name)

  def define_tag(self, name, revision):
    """Record a bidirectional mapping between symbolic NAME and REVISION.
    REVISION is an unprocessed revision number from the RCS file's
    header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
    This function will determine what kind of symbolic name it is by
    inspection, and record it in the right places.

    NAME is first run through the symbol transforms of the Ctx.  If
    the resulting name was already defined for this file, the error is
    printed and appended to self.fatal_errors, but the symbol is still
    recorded."""
    for (pattern, replacement) in Ctx().symbol_transforms:
      newname = re.sub(pattern, replacement, name)
      if newname != name:
        Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
                    % (name, newname))
        name = newname
    if name in self.defined_symbols:
      err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
                % (error_prefix, name, self.fname)
      sys.stderr.write(err + "\n")
      self.fatal_errors.append(err)
    self.defined_symbols.append(name)
    if branch_tag.match(revision):
      self.add_cvs_branch(revision, name)
    elif vendor_tag.match(revision):
      self.set_branch_name(revision, name)
    else:
      if revision not in self.taglist:
        self.taglist[revision] = []
      self.taglist[revision].append(name)
      self.symbol_db.register_tag_creation(name)

  def define_revision(self, revision, timestamp, author, state,
                      branches, next):
    """Record the tree data of REVISION of the current file.

    TIMESTAMP and AUTHOR are stored in self.rev_data, and STATE (the
    RCS state, e.g. 'dead') in self.rev_state.  BRANCHES lists the
    first revisions of the branches that sprout from REVISION, and
    NEXT is the RCS 'next' revision of REVISION (or a false value)."""

    # Record the state of our revision for later calculations
    self.rev_state[revision] = state

    # store the rev_data as a list in case we have to jigger the timestamp
    self.rev_data[revision] = [int(timestamp), author, None]

    on_trunk = trunk_rev.match(revision)

    # When on trunk, the RCS 'next' revision number points to what
    # humans might consider to be the 'previous' revision number.  For
    # example, 1.3's RCS 'next' is 1.2.
    #
    # However, on a branch, the RCS 'next' revision number really does
    # point to what humans would consider to be the 'next' revision
    # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
    #
    # In other words, in RCS, 'next' always means "where to find the next
    # deltatext that you need this revision to retrieve".
    #
    # That said, we don't *want* RCS's behavior here, so we determine
    # whether we're on trunk or a branch and set self.prev_rev
    # accordingly.
    #
    # One last thing.  Note that if REVISION is a branch revision,
    # instead of mapping REVISION to NEXT, we instead map NEXT to
    # REVISION.  Since we loop over all revisions in the file before
    # doing anything with the data we gather here, this 'reverse
    # assignment' effectively does the following:
    #
    # 1. Gives us no 'prev' value for REVISION (in this
    # iteration... it may have been set in a previous iteration)
    #
    # 2. Sets the 'prev' value for the revision with number NEXT to
    # REVISION.  So when we come around to the branch revision whose
    # revision value is NEXT, its 'prev' and 'prev_rev' are already
    # set.
    if on_trunk:
      self.prev_rev[revision] = next
      self.next_rev[next] = revision
    elif next:
      self.prev_rev[next] = revision
      self.next_rev[revision] = next

    for b in branches:
      self.prev_rev[b] = revision

    # Ratchet up the highest vendor head revision, if necessary.
    if self.default_branch:
      default_branch_root = self.default_branch + "."
      # Equal dot counts rule out revisions on sub-branches of the
      # default branch.
      if (revision.startswith(default_branch_root)
          and (default_branch_root.count('.') == revision.count('.'))):
        # This revision is on the default branch, so record that it is
        # the new highest default branch head revision.
        self.default_branches_db[self.rel_name] = revision
    else:
      # No default branch, so make an educated guess.
      if revision == '1.2':
        # This is probably the time when the file stopped having a
        # default branch, so make a note of it.
        self.first_non_vendor_revision_date = timestamp
      else:
        m = vendor_revision.match(revision)
        if m and ((not self.first_non_vendor_revision_date)
                  or (timestamp < self.first_non_vendor_revision_date)):
          # We're looking at a vendor revision, and it wasn't
          # committed after this file lost its default branch, so bump
          # the maximum trunk vendor revision in the permanent record.
          self.default_branches_db[self.rel_name] = revision

    if not on_trunk:
      # Check for unlabeled branches, record them.  We tried to collect
      # all branch names when we parsed the symbolic name header
      # earlier, of course, but that didn't catch unlabeled branches.
      # If a branch is unlabeled, this is our first encounter with it,
      # so we have to record its data now.
      branch_number = revision[:revision.rindex(".")]
      if branch_number not in self.branch_names:
        branch_name = "unlabeled-" + branch_number
        self.set_branch_name(branch_number, branch_name)

      # Register the commit on this non-trunk branch
      branch_name = self.branch_names[branch_number]
      self.symbol_db.register_branch_commit(branch_name)

  def tree_completed(self):
    """The revision tree has been parsed.  Analyze it for consistency.

    Wherever a revision is not strictly older than its successor, its
    timestamp in self.rev_data is moved back (and the old timestamp
    remembered in the third slot of its entry)."""

    # Our algorithm depends upon the timestamps on the revisions occurring
    # monotonically over time.  That is, we want to see rev 1.34 occur in
    # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
    # sorting), and then tried to insert 1.34, we'd be screwed.

    # to perform the analysis, we'll simply visit all of the 'previous'
    # links that we have recorded and validate that the timestamp on the
    # previous revision is before the specified revision

    # if we have to resync some nodes, then we restart the scan. just keep
    # looping as long as we need to restart.
    while 1:
      for current, prev in self.prev_rev.items():
        if not prev:
          # no previous revision exists (i.e. the initial revision)
          continue
        t_c = self.rev_data[current][0]
        t_p = self.rev_data[prev][0]
        if t_p >= t_c:
          # the previous revision occurred later than the current revision.
          # shove the previous revision back in time (and any before it that
          # may need to shift).

          # We sync backwards and not forwards because any given CVS
          # Revision has only one previous revision.  However, a CVS
          # Revision can *be* a previous revision for many other
          # revisions (e.g., a revision that is the source of multiple
          # branches).  This becomes relevant when we do the secondary
          # synchronization in pass 2--we can make certain that we
          # don't resync a revision earlier than its previous
          # revision, but it would be non-trivial to make sure that we
          # don't resync revision R *after* any revisions that have R
          # as a previous revision.
          while t_p >= t_c:
            self.rev_data[prev][0] = t_c - 1    # new timestamp
            self.rev_data[prev][2] = t_p        # old timestamp
            delta = t_c - 1 - t_p
            msg =  "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
                  % (self.rel_name,
                     prev, time.ctime(t_p), delta)
            Log().write(LOG_VERBOSE, msg)
            if (delta > COMMIT_THRESHOLD
                or delta < (COMMIT_THRESHOLD * -1)):
              # (Named 'fmt' so as not to shadow the builtin str.)
              fmt = "%s: Significant timestamp change for '%s' (%d seconds)"
              Log().write(LOG_WARN, fmt % (warning_prefix, self.rel_name,
                                           delta))
            current = prev
            prev = self.prev_rev[current]
            if not prev:
              break
            t_c = t_c - 1               # self.rev_data[current][0]
            t_p = self.rev_data[prev][0]

          # break from the for-loop
          break
      else:
        # finished the for-loop (no resyncing was performed)
        return

  def set_revision_info(self, revision, log, text):
    """Write out the collected data for REVISION of the current file.

    LOG is the log message of the revision; TEXT is its deltatext, of
    which only the emptiness matters here.  A line for the revision is
    appended to the revs file (and to the resync file, if
    tree_completed() changed its timestamp), and the author and log
    message are stored in the metadata database under their digest."""
    timestamp, author, old_ts = self.rev_data[revision]
    digest = sha.new(log + '\0' + author).hexdigest()
    if old_ts:
      # the timestamp on this revision was changed. log it for later
      # resynchronization of other files' revisions that occurred
      # for this time and log message.
      self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))

    # "...Give back one kadam to honor the Hebrew God whose Ark this is."
    #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
    #
    # If revision 1.1 appears to have been created via 'cvs add'
    # instead of 'cvs import', then this file probably never had a
    # default branch, so retroactively remove its record in the
    # default branches db.  The test is that the log message CVS uses
    # for 1.1 in imports is "Initial revision\n" with no period.
    if revision == '1.1' and log != 'Initial revision\n':
      if self.default_branches_db.has_key(self.rel_name):
        del self.default_branches_db[self.rel_name]

    # Get the timestamp of the previous revision
    prev_rev = self.prev_rev.get(revision, None)
    prev_timestamp = self.rev_data.get(prev_rev, [0, None, None])[0]

    # How to tell if a CVSRevision is an add, a change, or a deletion:
    #
    # It's a delete if RCS state is 'dead'
    #
    # It's an add if RCS state is 'Exp.' and
    #      - we either have no previous revision
    #        or
    #      - we have a previous revision whose state is 'dead'
    #
    # Anything else is a change.
    if self.rev_state[revision] == 'dead':
      op = OP_DELETE
    elif (prev_rev is None) or (self.rev_state[prev_rev] == 'dead'):
      op = OP_ADD
    else:
      op = OP_CHANGE

    if text:
      deltatext_code = DELTATEXT_NONEMPTY
    else:
      deltatext_code = DELTATEXT_EMPTY

    # NOTE: self.prev_rev is indexed directly here, so a REVISION that
    # was never entered into self.prev_rev raises KeyError.
    c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp, op,
                        self.prev_rev[revision], revision,
                        self.next_rev.get(revision),
                        self.file_in_attic, self.file_executable,
                        self.file_size,
                        deltatext_code, self.fname,
                        self.mode, self.rev_to_branch_name(revision),
                        self.taglist.get(revision, []),
                        self.branchlist.get(revision, []))
    self.revs.write(str(c_rev) + "\n")
    StatsKeeper().record_c_rev(c_rev)

    if not self.metadata_db.has_key(digest):
      self.metadata_db[digest] = (author, log)

  def parse_completed(self):
    """Finish the current file: register its symbols as blockers of
    the branches they sit on, and count the file."""
    # Walk through all branches and tags and register them with
    # their parent branch in the symbol database.
    for symbol_map in (self.taglist, self.branchlist):
      for revision, symbols in symbol_map.items():
        # The parent branch depends only on the revision, so look it
        # up once per revision.  Symbols on trunk have no parent.
        name = self.rev_to_branch_name(revision)
        if name is not None:
          for symbol in symbols:
            self.symbol_db.register_branch_blocker(name, symbol)

    self.num_files = self.num_files + 1

  def write_symbol_db(self):
    """Write self.symbol_db to its files."""
    self.symbol_db.write()

1482
class SymbolingsLogger:
  """Record the "openings" and "closings" of symbolic names on disk.

  The lines written here are used later to work out the range of
  SVNRevisions from which each file can be copied when a branch or
  tag is created in Subversion.

  Opening: the CVSRevision from which a given branch/tag sprouts on
  a path.

  Closing: the next CVSRevision on the same line of development as
  the opening, for that same branch/tag and path.

  Example: in 'foo.c', branch BEE has branch number 1.2.2, so it
  sprouts from revision 1.2.  Revision 1.2 is therefore the opening
  for BEE on 'foo.c' and revision 1.3 is the closing.  Any number of
  revisions may fall chronologically between 1.2 and 1.3 (revisions
  on branches of 'foo.c', possibly on BEE itself), but 1.3 is the
  next revision *on the same line* as 1.2, and that is what makes it
  the closing for every symbolic name opened by 1.2.

  The point of all this is to create branches and tags with as few
  copies and deletes as possible.  Suppose revisions 1.2 and 1.3 of
  foo.c correspond to Subversion revisions 17 and 30: then branch BEE
  is best copied from somewhere in 17-30.  If another file, 'bar.c',
  has opening and closing CVSRevisions for BEE that correspond to
  Subversion revisions 24 and 39, the ideal copy source for the
  branch lies between 24 and 29, inclusive.
  """
  def __init__(self):
    """Open both output files for writing and register each of them
    with Cleanup()."""
    # Final output: one line per opening or closing (see self._log).
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
    # Scratch file of closings whose svn_revnum is not yet known; it
    # is read back by self.close().
    self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
    Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)

    # The keys of this dictionary are Subversion repository *source*
    # paths for which an 'opening' has been encountered; each value is
    # the list of symbolic names opened on that path.  The only paths
    # meant to end up in here are those whose corresponding
    # CVSRevision is a default branch revision.
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Log the openings found in C_REV and, when C_REV.next_rev is not
    None, a closing as well.

    Openings are logged with SVN_REVNUM (and only if C_REV is not a
    deletion).  Closings go to the closings file, because their
    revnum can only be determined later (see self.close())."""
    for raw_name in c_rev.tags + c_rev.branches:
      symbol = _clean_symbolic_name(raw_name)
      self._note_default_branch_opening(c_rev, symbol)
      if c_rev.op != OP_DELETE:
        self._log(symbol, svn_revnum, c_rev.svn_path, OPENING)

      if c_rev.next_rev is None:
        # No successor on this line of development, hence no closing.
        continue

      # The successor revision closes this source revision.  Its
      # svn_revnum is still unknown, so only its unique key is
      # remembered for now.
      closing_key = c_rev.unique_key(c_rev.next_rev)
      self.closings.write('%s %s\n' % (symbol, closing_key))

  def _log(self, name, svn_revnum, svn_path, type):
    """Append one line to the symbol_openings_closings file stating
    that SVN_REVNUM of SVN_PATH is the opening or closing (as given
    by TYPE) of the symbolic name NAME.

    TYPE should be one of the global constants OPENING or CLOSING."""
    # The revnum is zero-padded to 8 digits, which covers up to
    # 99,999,999 SVN revs.  That *should* be enough.
    line = '%s %.8d %s %s\n' % (name, svn_revnum, type, svn_path)
    self.symbolings.write(line)

  def close(self):
    """Turn the closings file into proper lines of the symbolings
    file: for each recorded closing CVSRevision, look up its
    svn_revnum and its svn_path and log them.  Both files are closed
    by the time this returns."""
    # Needed to map a rev_key to the c_rev.svn_path of that revision.
    revs_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    for entry in fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP)):
      fields = entry.rstrip().split(" ", 1)
      (symbol, rev_key) = fields
      revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

      closing_rev = revs_db.get_revision(rev_key)
      self._log(symbol, revnum, closing_rev.svn_path, CLOSING)

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Remember SYMBOLIC_NAME as having been opened on
    C_REV.svn_trunk_path.

    Note that this method itself does not check whether C_REV is a
    default branch revision."""
    opened = self.open_paths_with_default_branches
    opened.setdefault(c_rev.svn_trunk_path, [ ]).append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If C_REV.svn_trunk_path has recorded openings in
    self.open_paths_with_default_branches, log a closing at SVN_REVNUM
    for each symbolic name recorded there and forget the path.
    Otherwise do nothing."""
    trunk_path = c_rev.svn_trunk_path
    opened = self.open_paths_with_default_branches
    if not opened.has_key(trunk_path):
      return

    for symbol in opened[trunk_path]:
      self._log(symbol, svn_revnum, trunk_path, CLOSING)
    # These openings are dealt with now, so drop them.
    del opened[trunk_path]
1596
1597
class PersistenceManager:
  """The PersistenceManager allows us to effectively store SVNCommits
  to disk and retrieve them later using only their subversion revision
  number as the key.  It also returns the subversion revision number
  for a given CVSRevision's unique key.

  All information pertinent to each SVNCommit is stored in a series of
  on-disk databases so that SVNCommits can be retrieved on-demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager will initialize a new set of on-disk
  databases and be fully-featured.
  In 'read' mode, PersistenceManager will open existing on-disk databases
  and the set_* methods will raise RuntimeError."""
  def __init__(self, mode):
    """Open the on-disk databases in MODE.

    Raise RuntimeError if MODE is neither DB_OPEN_NEW nor
    DB_OPEN_READ."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
    # str(svn_revnum) -> list of CVSRevision unique keys.
    self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
    Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
    # CVSRevision unique key -> svn_revnum.
    self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
    Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
    # str(svn_revnum) -> (symbolic name, date).
    self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
    Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
    # digest -> (author, log message).  Always opened read-only,
    # whatever MODE is.
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      # These two attributes exist only when symbols are converted.
      self.tags_db = TagsDatabase(DB_OPEN_READ)
      # str(svn_revnum) -> str(motivating svn_revnum).
      self.motivating_revnums = Database(temp(MOTIVATING_REVNUMS), mode)
      Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def _require_writable(self):
    """Raise RuntimeError if this PersistenceManager was opened in
    DB_OPEN_READ mode.  Every set_* method calls this first."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    Raise SVNCommit.SVNCommitInternalInconsistencyError if the stored
    data describes a commit that has both CVSRevisions and a symbolic
    name to fill.
    """
    c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
    if c_rev_keys is None:
      return None

    # Only build the SVNCommit once the revision is known to exist.
    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      # Set the author and log message for this commit by using
      # CVSRevision metadata, but only if haven't done so already.
      if digest is None:
        digest = c_rev.digest
        author, log_msg = self.svn_commit_metadata[digest]
        svn_commit.set_author(author)
        svn_commit.set_log_msg(log_msg)

    # If we're doing a trunk-only conversion, we don't need to do any
    # more work (and self.tags_db / self.motivating_revnums were never
    # opened).
    if Ctx().trunk_only:
      return svn_commit

    name, date = self._get_name_and_date(svn_revnum)
    if name:
      svn_commit.set_symbolic_name(name)
      svn_commit.set_date(date)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
    if motivating_revnum:
      svn_commit.set_motivating_revnum(int(motivating_revnum))
      svn_commit.set_date(date)

    if len(svn_commit.cvs_revs) and name:
      msg = """An SVNCommit cannot have cvs_revisions *and* a
      corresponding symbolic name ('%s') to fill.""" % name
      raise SVNCommit.SVNCommitInternalInconsistencyError(msg)

    return svn_commit

  def set_cvs_revs(self, svn_revnum, cvs_revs):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS.

    Raise RuntimeError if this manager is read-only."""
    self._require_writable()
    for c_rev in cvs_revs:
      Log().write(LOG_VERBOSE, " ", c_rev.unique_key())
    self.svn2cvs_db[str(svn_revnum)] = [x.unique_key() for x in cvs_revs]
    for c_rev in cvs_revs:
      self.cvs2svn_db[c_rev.unique_key()] = svn_revnum

  def set_name_and_date(self, svn_revnum, name, date):
    """Associate symbolic name NAME and DATE with SVN_REVNUM, and note
    SVN_REVNUM as the revision in which NAME was last filled.

    Raise RuntimeError if this manager is read-only."""
    self._require_writable()
    self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
    self.last_filled[name] = svn_revnum

  def _get_name_and_date(self, svn_revnum):
    """Return a tuple containing the symbolic name and date associated
    with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
    associated with it."""
    return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))

  def set_motivating_revnum(self, svn_revnum, motivating_revnum):
    """Store MOTIVATING_REVNUM as the value of SVN_REVNUM.

    Raise RuntimeError if this manager is read-only."""
    self._require_writable()
    self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
1721
1722
1723 class CVSCommit:
1724   """Each instance of this class contains a number of CVS Revisions
1725   that correspond to one or more Subversion Commits.  After all CVS
1726   Revisions are added to the grouping, calling process_revisions will
1727   generate a Subversion Commit (or Commits) for the set of CVS
1728   Revisions in the grouping."""
1729
1730   def __init__(self, digest, author, log):
1731     self.digest = digest
1732     self.author = author
1733     self.log = log
1734
1735     # Symbolic names for which the last source revision has already
1736     # been seen and for which the CVSRevisionAggregator has already
1737     # generated a fill SVNCommit.  See self.process_revisions().
1738     self.done_symbols = [ ]
1739
1740     self.files = { }
1741     # Lists of CVSRevisions
1742     self.changes = [ ]
1743     self.deletes = [ ]
1744
1745     # Start out with a t_min higher than any incoming time T, and a
1746     # t_max lower than any incoming T.  This way the first T will
1747     # push t_min down to T, and t_max up to T, naturally (without any
1748     # special-casing), and successive times will then ratchet them
1749     # outward as appropriate.
1750     self.t_min = 1L<<32
1751     self.t_max = 0
1752
1753     # This will be set to the SVNCommit that occurs in self._commit.
1754     self.motivating_commit = None
1755
1756     # This is a list of all non-primary commits motivated by the main
1757     # commit.  We gather these so that we can set their dates to the
1758     # same date as the primary commit.
1759     self.secondary_commits = [ ]
1760
1761     # State for handling default branches.
1762     #
1763     # Here is a tempting, but ultimately nugatory, bit of logic, which
1764     # I share with you so you may appreciate the less attractive, but
1765     # refreshingly non-nugatory, logic which follows it:
1766     #
1767     # If some of the commits in this txn happened on a non-trunk
1768     # default branch, then those files will have to be copied into
1769     # trunk manually after being changed on the branch (because the
1770     # RCS "default branch" appears as head, i.e., trunk, in practice).
1771     # As long as those copies don't overwrite any trunk paths that
1772     # were also changed in this commit, then we can do the copies in
1773     # the same revision, because they won't cover changes that don't
1774     # appear anywhere/anywhen else.  However, if some of the trunk dst
1775     # paths *did* change in this commit, then immediately copying the
1776     # branch changes would lose those trunk mods forever.  So in this
1777     # case, we need to do at least that copy in its own revision.  And
1778     # for simplicity's sake, if we're creating the new revision for
1779     # even one file, then we just do all such copies together in the
1780     # new revision.
1781     #
1782     # Doesn't that sound nice?
1783     #
1784     # Unfortunately, Subversion doesn't support copies with sources
1785     # in the current txn.  All copies must be based in committed
1786     # revisions.  Therefore, we generate the above-described new
1787     # revision unconditionally.
1788     #
1789     # This is a list of c_revs, and a c_rev is appended for each
1790     # default branch commit that will need to be copied to trunk (or
1791     # deleted from trunk) in some generated revision following the
1792     # "regular" revision.
1793     self.default_branch_cvs_revisions = [ ]
1794
1795   def __cmp__(self, other):
1796     # Commits should be sorted by t_max.  If both self and other have
1797     # the same t_max, break the tie using t_min, and lastly, digest
1798     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
1799             or cmp(self.digest, other.digest))
1800
1801   def has_file(self, fname):
1802     return self.files.has_key(fname)
1803
1804   def revisions(self):
1805     return self.changes + self.deletes
1806
1807   def opens_symbolic_name(self, name):
1808     """Returns true if any CVSRevision in this commit is on a tag or a
1809     branch or is the origin of a tag or branch."""
1810     for c_rev in self.revisions():
1811       if c_rev.opens_symbolic_name(name):
1812         return 1
1813     return 0
1814
1815   def add_revision(self, c_rev):
1816     # Record the time range of this commit.
1817     #
1818     # ### ISSUE: It's possible, though unlikely, that the time range
1819     # of a commit could get gradually expanded to be arbitrarily
1820     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
1821     # problem, and anyway deciding where to break it up would be a
1822     # judgement call.  For now, we just print a warning in commit() if
1823     # this happens.
1824     if c_rev.timestamp < self.t_min:
1825       self.t_min = c_rev.timestamp
1826     if c_rev.timestamp > self.t_max:
1827       self.t_max = c_rev.timestamp
1828
1829     if c_rev.op == OP_DELETE:
1830       self.deletes.append(c_rev)
1831     else:
1832       # OP_CHANGE or OP_ADD
1833       self.changes.append(c_rev)
1834
1835     self.files[c_rev.fname] = 1
1836
1837   def _pre_commit(self):
1838     """Generates any SVNCommits that must exist before the main
1839     commit."""
1840
1841     # There may be multiple c_revs in this commit that would cause
1842     # branch B to be filled, but we only want to fill B once.  On the
1843     # other hand, there might be multiple branches committed on in
1844     # this commit.  Whatever the case, we should count exactly one
1845     # commit per branch, because we only fill a branch once per
1846     # CVSCommit.  This list tracks which branches we've already
1847     # counted.
1848     accounted_for_sym_names = [ ]
1849
1850     def fill_needed(c_rev, pm):
1851       """Return 1 if this is the first commit on a new branch (for
1852       this file) and we need to fill the branch; else return 0
1853       (meaning that some other file's first commit on the branch has
1854       already done the fill for us).
1855
1856       If C_REV.op is OP_ADD, only return 1 if the branch that this
1857       commit is on has no last filled revision.
1858
1859       PM is a PersistenceManager to query.
1860       """
1861
1862       # Different '.' counts indicate that c_rev is now on a different
1863       # line of development (and may need a fill)
1864       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
1865         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
1866         # It should be the case that when we have a file F that
1867         # is added on branch B (thus, F on trunk is in state
1868         # 'dead'), we generate an SVNCommit to fill B iff the branch
1869         # has never been filled before.
1870         #
1871         # If this c_rev.op == OP_ADD, *and* the branch has never
1872         # been filled before, then fill it now.  Otherwise, no need to
1873         # fill it.
1874         if c_rev.op == OP_ADD:
1875           if pm.last_filled.get(c_rev.branch_name, None) is None:
1876             return 1
1877         else:
1878           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
1879             return 1
1880       return 0
1881
1882     for c_rev in self.changes + self.deletes:
1883       # If a commit is on a branch, we must ensure that the branch
1884       # path being committed exists (in HEAD of the Subversion
1885       # repository).  If it doesn't exist, we will need to fill the
1886       # branch.  After the fill, the path on which we're committing
1887       # will exist.
1888       if c_rev.branch_name \
1889           and c_rev.branch_name not in accounted_for_sym_names \
1890           and c_rev.branch_name not in self.done_symbols \
1891           and fill_needed(c_rev, Ctx()._persistence_manager):
1892         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
1893                                % c_rev.branch_name)
1894         svn_commit.set_symbolic_name(c_rev.branch_name)
1895         self.secondary_commits.append(svn_commit)
1896         accounted_for_sym_names.append(c_rev.branch_name)
1897
1898   def _commit(self):
1899     """Generates the primary SVNCommit that corresponds the this
1900     CVSCommit."""
1901     # Generate an SVNCommit unconditionally.  Even if the only change
1902     # in this CVSCommit is a deletion of an already-deleted file (that
1903     # is, a CVS revision in state 'dead' whose predecessor was also in
1904     # state 'dead'), the conversion will still generate a Subversion
1905     # revision containing the log message for the second dead
1906     # revision, because we don't want to lose that information.
1907     svn_commit = SVNCommit("commit")
1908     self.motivating_commit = svn_commit
1909
1910     for c_rev in self.changes:
1911       svn_commit.add_revision(c_rev)
1912       # Only make a change if we need to.  When 1.1.1.1 has an empty
1913       # deltatext, the explanation is almost always that we're looking
1914       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
1915       # such imports, CVS creates an RCS file where 1.1 has the
1916       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
1917       # content as 1.1.  There's no reason to reflect this non-change
1918       # in the repository, so we want to do nothing in this case.  (If
1919       # we were really paranoid, we could make sure 1.1's log message
1920       # is the CVS-generated "Initial revision\n", but I think the
1921       # conditions below are strict enough.)
1922       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
1923               and (c_rev.rev == "1.1.1.1")):
1924         if c_rev.is_default_branch_revision():
1925           self.default_branch_cvs_revisions.append(c_rev)
1926
1927     for c_rev in self.deletes:
1928       # When a file is added on a branch, CVS not only adds the file
1929       # on the branch, but generates a trunk revision (typically
1930       # 1.1) for that file in state 'dead'.  We only want to add
1931       # this revision if the log message is not the standard cvs
1932       # fabricated log message.
1933       if c_rev.prev_rev is None:
1934         # c_rev.branches may be empty if the originating branch
1935         # has been excluded.
1936         if not c_rev.branches:
1937           continue
1938         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
1939                              % (c_rev.filename(),
1940                                 c_rev.branches[0]))
1941         author, log_msg = \
1942             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
1943         if log_msg == cvs_generated_msg:
1944           continue
1945
1946       svn_commit.add_revision(c_rev)
1947       if c_rev.is_default_branch_revision():
1948         self.default_branch_cvs_revisions.append(c_rev)
1949
1950     # There is a slight chance that we didn't actually register any
1951     # CVSRevisions with our SVNCommit (see loop over self.deletes
1952     # above), so if we have no CVSRevisions, we don't flush the
1953     # svn_commit to disk and roll back our revnum.
1954     if len(svn_commit.cvs_revs) > 0:
1955       svn_commit.flush()
1956     else:
1957       # We will not be flushing this SVNCommit, so rollback the
1958       # SVNCommit revision counter.
1959       SVNCommit.revnum = SVNCommit.revnum - 1
1960
1961     if not Ctx().trunk_only:
1962       for c_rev in self.revisions():
1963         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
1964
1965   def _post_commit(self):
1966     """Generates any SVNCommits that we can perform now that _commit
1967     has happened.  That is, handle non-trunk default branches.
1968     Sometimes an RCS file has a non-trunk default branch, so a commit
1969     on that default branch would be visible in a default CVS checkout
1970     of HEAD.  If we don't copy that commit over to Subversion's trunk,
1971     then there will be no Subversion tree which corresponds to that
1972     CVS checkout.  Of course, in order to copy the path over, we may
1973     first need to delete the existing trunk there.  """
1974
1975     # Only generate a commit if we have default branch revs
1976     if len(self.default_branch_cvs_revisions):
1977       # Generate an SVNCommit for all of our default branch c_revs.
1978       svn_commit = SVNCommit("post-commit default branch(es)")
1979       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
1980       for c_rev in self.default_branch_cvs_revisions:
1981         svn_commit.add_revision(c_rev)
1982         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
1983                                                             svn_commit.revnum)
1984       self.secondary_commits.append(svn_commit)
1985
1986   def process_revisions(self, done_symbols):
1987     """Process all the CVSRevisions that this instance has, creating
1988     one or more SVNCommits in the process.  Generate fill SVNCommits
1989     only for symbols not in DONE_SYMBOLS (avoids unnecessary
1990     fills).
1991
1992     Return the primary SVNCommit that corresponds to this CVSCommit.
1993     The returned SVNCommit is the commit that motivated any other
1994     SVNCommits generated in this CVSCommit."""
1995     self.done_symbols = done_symbols
1996     seconds = self.t_max - self.t_min + 1
1997
1998     Log().write(LOG_VERBOSE, '-' * 60)
1999     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2000     if seconds == 1:
2001       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
2002                   % time.ctime(self.t_max))
2003     else:
2004       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
2005       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
2006                   % (time.ctime(self.t_max), seconds))
2007
2008     if seconds > COMMIT_THRESHOLD + 1:
2009       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2010                   % (warning_prefix, COMMIT_THRESHOLD))
2011
2012     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2013       self._commit()
2014       return self.motivating_commit
2015
2016     self._pre_commit()
2017     self._commit()
2018     self._post_commit()
2019
2020     for svn_commit in self.secondary_commits:
2021       svn_commit.set_date(self.motivating_commit.get_date())
2022       svn_commit.flush()
2023
2024     return self.motivating_commit
2025
2026
2027 class SVNCommit:
2028   """This represents one commit to the Subversion Repository.  There
2029   are three types of SVNCommits:
2030
2031   1. Commits one or more CVSRevisions (cannot fill a symbolic name).
2032
2033   2. Creates or fills a symbolic name (cannot commit CVSRevisions).
2034
2035   3. Updates trunk to reflect the contents of a particular branch
2036      (this is to handle RCS default branches)."""
2037
2038   # The revision number to assign to the next new SVNCommit.
2039   # We start at 2 because SVNRepositoryMirror uses the first commit
2040   # to create trunk, tags, and branches.
2041   revnum = 2
2042
2043   class SVNCommitInternalInconsistencyError(Exception):
2044     """Exception raised if we encounter an impossible state in the
2045     SVNCommit Databases."""
2046     pass
2047
2048   def __init__(self, description="", revnum=None, cvs_revs=None):
2049     """Instantiate an SVNCommit.  DESCRIPTION is for debugging only.
2050     If REVNUM, the SVNCommit will correspond to that revision number;
2051     and if CVS_REVS, then they must be the exact set of CVSRevisions for
2052     REVNUM.
2053
2054     It is an error to pass CVS_REVS without REVNUM, but you may pass
2055     REVNUM without CVS_REVS, and then add a revision at a time by
2056     invoking add_revision()."""
2057     self._description = description
2058
2059     # Revprop metadata for this commit.
2060     #
2061     # These initial values are placeholders.  At least the log and the
2062     # date should be different by the time these are used.
2063     #
2064     # They are private because their values should be returned encoded
2065     # in UTF8, but callers aren't required to set them in UTF8.
2066     # Therefore, accessor methods are used to set them, and
2067     # self.get_revprops() is used to to get them, in dictionary form.
2068     self._author = Ctx().username
2069     self._log_msg = "This log message means an SVNCommit was used too soon."
2070     self._max_date = 0  # Latest date seen so far.
2071
2072     self.cvs_revs = cvs_revs or []
2073     if revnum:
2074       self.revnum = revnum
2075     else:
2076       self.revnum = SVNCommit.revnum
2077       SVNCommit.revnum = SVNCommit.revnum + 1
2078
2079     # The symbolic name that is filled in this SVNCommit, if any
2080     self.symbolic_name = None
2081
2082     # If this commit is a default branch synchronization, this
2083     # variable represents the subversion revision number of the
2084     # *primary* commit where the default branch changes actually
2085     # happened.  It is None otherwise.
2086     #
2087     # It is possible for multiple synchronization commits to refer to
2088     # the same motivating commit revision number, and it is possible
2089     # for a single synchronization commit to contain CVSRevisions on
2090     # multiple different default branches.
2091     self.motivating_revnum = None
2092
2093     # is_tag is true only if this commit is a fill of a symbolic name
2094     # that is a tag, None in all other cases.
2095     self.is_tag = None
2096
2097   def set_symbolic_name(self, name):
2098     "Set self.symbolic_name to NAME."
2099     name = _clean_symbolic_name(name)
2100     self.symbolic_name = name
2101
2102   def set_motivating_revnum(self, revnum):
2103     "Set self.motivating_revnum to REVNUM."
2104     self.motivating_revnum = revnum
2105
2106   def set_author(self, author):
2107     """Set this SVNCommit's author to AUTHOR (a locally-encoded string).
2108     This is the only way to set an SVNCommit's author."""
2109     self._author = author
2110
2111   def set_log_msg(self, msg):
2112     """Set this SVNCommit's log message to MSG (a locally-encoded string).
2113     This is the only way to set an SVNCommit's log message."""
2114     self._log_msg = msg
2115
2116   def set_date(self, date):
2117     """Set this SVNCommit's date to DATE (an integer).
2118     Note that self.add_revision() updates this automatically based on
2119     a CVSRevision; so you may not need to call this at all, and even
2120     if you do, the value may be overwritten by a later call to
2121     self.add_revision()."""
2122     self._max_date = date
2123
2124   def get_date(self):
2125     """Returns this SVNCommit's date as an integer."""
2126     return self._max_date
2127
2128   def get_revprops(self):
2129     """Return the Subversion revprops for this SVNCommit."""
2130     date = format_date(self._max_date)
2131     try:
2132       ### FIXME: The 'replace' behavior should be an option, like
2133       ### --encoding is.
2134       utf8_author = None
2135       if self._author is not None:
2136         unicode_author = unicode(self._author, Ctx().encoding, 'replace')
2137         utf8_author = unicode_author.encode('utf8')
2138       unicode_log = unicode(self.get_log_msg(), Ctx().encoding, 'replace')
2139       utf8_log = unicode_log.encode('utf8')
2140       return { 'svn:author' : utf8_author,
2141                'svn:log'    : utf8_log,
2142                'svn:date'   : date }
2143     except UnicodeError:
2144       Log().write(LOG_WARN, '%s: problem encoding author or log message:'
2145                   % warning_prefix)
2146       Log().write(LOG_WARN, "  author: '%s'" % self._author)
2147       Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
2148       Log().write(LOG_WARN, "  date:   '%s'" % date)
2149       Log().write(LOG_WARN, "(subversion rev %s)  Related files:" % self.revnum)
2150       for c_rev in self.cvs_revs:
2151         Log().write(LOG_WARN, " ", c_rev.fname)
2152
2153       Log().write(LOG_WARN, "Consider rerunning with (for example)",
2154                   "'--encoding=latin1'.\n")
2155       # It's better to fall back to the original (unknown encoding) data
2156       # than to either 1) quit or 2) record nothing at all.
2157       return { 'svn:author' : self._author,
2158                'svn:log'    : self.get_log_msg(),
2159                'svn:date'   : date }
2160
2161   def add_revision(self, cvs_rev):
2162     self.cvs_revs.append(cvs_rev)
2163     if cvs_rev.timestamp > self._max_date:
2164       self._max_date = cvs_rev.timestamp
2165
2166   def _is_primary_commit(self):
2167     """Return true if this is a primary SVNCommit, false otherwise."""
2168     return not (self.symbolic_name or self.motivating_revnum)
2169
2170   def flush(self):
2171     Log().write(LOG_NORMAL, "Creating Subversion commit %d (%s)"
2172                 % (self.revnum, self._description))
2173     Ctx()._persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)
2174
2175     if self.motivating_revnum is not None:
2176       Ctx()._persistence_manager.set_motivating_revnum(self.revnum,
2177                                                        self.motivating_revnum)
2178
2179     # If we're not a primary commit, then store our date and/or our
2180     # symbolic_name
2181     if not self._is_primary_commit():
2182       Ctx()._persistence_manager.set_name_and_date(self.revnum,
2183                                                    self.symbolic_name,
2184                                                    self._max_date)
2185
2186   def __str__(self):
2187     """ Print a human-readable description of this SVNCommit.  This
2188     description is not intended to be machine-parseable (although
2189     we're not going to stop you if you try!)"""
2190
2191     ret = "SVNCommit #: " + str(self.revnum) + "\n"
2192     if self.symbolic_name:
2193       ret = ret + "   symbolic name: " +  self.symbolic_name + "\n"
2194     else:
2195       ret = ret + "   NO symbolic name\n"
2196     ret = ret + "   debug description: " + self._description + "\n"
2197     ret = ret + "   cvs_revs:\n"
2198     for c_rev in self.cvs_revs:
2199       ret = ret + "     " + c_rev.unique_key() + "\n"
2200     return ret
2201
2202   def get_log_msg(self):
2203     """Returns the actual log message for a primary commit, and the
2204     appropriate manufactured log message for a secondary commit."""
2205     if self.symbolic_name is not None:
2206       return self._log_msg_for_symbolic_name_commit()
2207     elif self.motivating_revnum is not None:
2208       return self._log_msg_for_default_branch_commit()
2209     else:
2210       return self._log_msg
2211
2212   def _log_msg_for_symbolic_name_commit(self):
2213     """Creates a log message for a manufactured commit that fills
2214     self.symbolic_name.  If self.is_tag is true, write the log message
2215     as though for a tag, else write it as though for a branch."""
2216     type = 'branch'
2217     if self.is_tag:
2218       type = 'tag'
2219
2220     # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
2221     space_or_newline = ' '
2222     if len(self.symbolic_name) >= 13:
2223       space_or_newline = '\n'
2224
2225     return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
2226            % (type, space_or_newline, self.symbolic_name)
2227
2228   def _log_msg_for_default_branch_commit(self):
2229     """Creates a log message for a manufactured commit that
2230     synchronizes a non-trunk default branch with trunk."""
2231     msg = 'This commit was generated by cvs2svn to compensate for '     \
2232           'changes in r%d,\n'                                           \
2233           'which included commits to RCS files with non-trunk default ' \
2234           'branches.\n' % self.motivating_revnum
2235     return msg
2236
class CVSRevisionAggregator:
  """Collects CVSRevisions into CVSCommits, each of which stands for
  at least one SVNCommit."""

  def __init__(self):
    """Open the databases read during aggregation, start out with no
    pending commits or symbols, and install a SymbolingsLogger, a
    PersistenceManager and the default branches database on Ctx()."""
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), DB_OPEN_READ)

    # The CVSCommits that have not been processed yet, by key.  The
    # key is the metadata digest, followed by one or more '-' once the
    # commit has been closed to further changes.
    self.cvs_commits = {}

    # The symbols still waiting to be closed.  Only the keys are
    # meaningful; every value is None.
    self.pending_symbols = {}

    # The symbols whose last source CVSRevision we have already come
    # across.  Their final fill is done, so they never have to be
    # filled again.
    self.done_symbols = [ ]

    # The primary svn_commit created most recently.  It is kept only
    # for its date, which self.attempt_to_commit_symbols() gives to the
    # SVNCommits it creates.
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = Database(temp(DEFAULT_BRANCHES_DB),
                                          DB_OPEN_READ)


  def process_revision(self, c_rev):
    """Add C_REV to the pending CVSCommit that shares its digest
    (creating that commit if need be), after processing every pending
    commit that is too old for C_REV to belong to and closing any
    pending commit that already touches C_REV's file."""
    # Every incoming revision is an occasion to look for accumulated
    # commits that have become ready for processing.
    expired = [ ]
    for key, pending in self.cvs_commits.items():
      if pending.t_max + COMMIT_THRESHOLD < c_rev.timestamp:
        expired.append(pending)
        del self.cvs_commits[key]
      elif pending.has_file(c_rev.fname):
        # The inbound revision is on a file that this pending commit
        # already contains, so close the pending commit to further
        # changes.  It is not flushed yet, because other pending
        # commits may be dated before it.
        # ### ISSUE: this has_file() check is not optimal.  It does
        # fix the dataloss bug where revisions would get lost if
        # checked in too quickly, but it can also break apart the
        # commits.  The correct fix would require tracking the
        # dependencies between change sets and committing them in
        # proper order.
        #
        # Re-file the commit under a key that is not yet in use in
        # self.cvs_commits.
        closed_key = key + '-'
        while self.cvs_commits.has_key(closed_key):
          closed_key = closed_key + '-'
        self.cvs_commits[closed_key] = pending
        del self.cvs_commits[key]

    # File the revision with the still-open commit for its digest.
    digest = c_rev.digest
    if not self.cvs_commits.has_key(digest):
      author, log = self.metadata_db[digest]
      self.cvs_commits[digest] = CVSCommit(digest, author, log)
    self.cvs_commits[digest].add_revision(c_rev)

    # Whatever has expired cannot include this latest revision, so it
    # is processed now, oldest first.
    expired.sort()

    # Symbols must be attempted for this c_rev even when no commit is
    # ready.
    if not expired:
      self.attempt_to_commit_symbols(expired, c_rev)

    for ready_commit in expired[:]:
      self.latest_primary_svn_commit \
          = ready_commit.process_revisions(self.done_symbols)
      expired.remove(ready_commit)
      self.attempt_to_commit_symbols(expired, c_rev)

  def flush(self):
    """Process every commit still held in self.cvs_commits, and then
    tell the SymbolingsLogger that there are no more commits to come."""
    remaining = [ (commit, key) for key, commit in self.cvs_commits.items() ]
    remaining.sort()

    for entry in remaining[:]:
      commit, key = entry
      self.latest_primary_svn_commit = \
        commit.process_revisions(self.done_symbols)
      remaining.remove(entry)
      del self.cvs_commits[key]
      self.attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def attempt_to_commit_symbols(self, queued_commits, c_rev=None):
    """
    Generate one SVNCommit for every symbol in self.pending_symbols
    that has no opening CVSRevision left in QUEUED_COMMITS or in
    self.cvs_commits.values().

    When C_REV is given, the symbols for which C_REV is the last
    CVSRevision are added to self.pending_symbols first.
    """
    # Unless this is a trunk-only conversion, the symbolic names that
    # this c_rev is the last *source* CVSRevision for join the ones
    # left over from earlier passes through the aggregator.
    if c_rev and not Ctx().trunk_only:
      for sym in self.last_revs_db.get(c_rev.unique_key(), []):
        self.pending_symbols[sym] = None

    # Find the symbols that still have *source* CVSRevisions among the
    # commits waiting to be processed.
    waiting_commits = self.cvs_commits.values() + queued_commits
    open_symbols = {}
    for sym in self.pending_symbols.keys():
      for waiting_commit in waiting_commits:
        if waiting_commit.opens_symbolic_name(sym):
          open_symbols[sym] = None
          break

    # Handle the pending symbols in sorted order.  That makes the
    # result independent of the order in which the dictionary yields
    # its keys, so that the tests behave the same on all platforms.
    symbols_in_order = self.pending_symbols.keys()
    symbols_in_order.sort()
    for sym in symbols_in_order:
      if not open_symbols.has_key(sym):
        # Nothing opens sym any more, so close it.
        svn_commit = SVNCommit("closing tag/branch '%s'" % sym)
        svn_commit.set_symbolic_name(sym)
        svn_commit.set_date(self.latest_primary_svn_commit.get_date())
        svn_commit.flush()
        self.done_symbols.append(sym)
        del self.pending_symbols[sym]
2377
2378
class SymbolingsReader:
  """Front end to the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  It does the heavy lifting of locating the
  opening and closing Subversion revision numbers that belong to a
  given symbolic name."""

  def __init__(self):
    """Open SYMBOL_OPENINGS_CLOSINGS_SORTED for reading and copy the
    contents of the offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets database is tiny and is consulted and updated a fair
    # bit, so work on an in-memory copy of it.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    self.offsets = { }
    for symbol in offsets_db.db.keys():
      self.offsets[symbol] = offsets_db[symbol]

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Return a new SymbolicNameFillingGuide for SYMBOLIC_NAME that
    holds the openings and closings recorded up to and including
    SVN_REVNUM.

    An opening may be met in this fill while its closing lies beyond
    SVN_REVNUM.  Such a closing is not handed to the
    SymbolicNameFillingGuide now, and it is discarded when a later
    fill comes across it.  That is fine: the fill is valid without the
    closing, and we always fill what we can as soon as we can."""
    guide = SymbolicNameFillingGuide(symbolic_name)

    # A branch may start with a file that was added on a branch, in
    # which case there is no offset for it and nothing to read.
    if not self.offsets.has_key(symbolic_name):
      return guide

    # Resume reading where the records for symbolic_name start.
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      line_start = self.symbolings.tell()
      record = self.symbolings.readline().rstrip()
      if not record:
        break
      name, revnum, kind, svn_path = record.split(" ", 3)
      revnum = int(revnum)
      if name != symbolic_name or revnum > svn_revnum:
        break
      guide.register(svn_path, revnum, kind)

    # If anything was used, the next fill has to start at the beginning
    # of the line that was read last.
    if not guide.is_empty():
      self.offsets[symbolic_name] = line_start

    guide.make_node_tree()
    return guide
2436
2437
class SymbolicNameFillingGuide:
  """A SymbolicNameFillingGuide is essentially a node tree
  representing the source paths to be copied to fill the symbolic
  name self.name in the current SVNCommit.

  After calling self.register() on a series of openings and closings,
  call self.make_node_tree() to prepare self.node_tree for
  examination.  See the docstring for self.make_node_tree() for
  details on the structure of self.node_tree.

  By walking self.node_tree and calling self.get_best_revnum() on each
  node, the caller can determine what subversion revision number to
  copy the path corresponding to that node from.  self.node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""
  def __init__(self, symbolic_name):
    """Initializes a SymbolicNameFillingGuide for SYMBOLIC_NAME and
    prepares it for receiving openings and closings."""
    self.name = symbolic_name

    # Keys under which a leaf node stores its opening and closing
    # revision.  They start with '/' so that they can never collide
    # with a path component.
    self.opening_key = "/o"
    self.closing_key = "/c"

    # A dictionary of SVN_PATHS and SVN_REVNUMS whose format is:
    #
    # { svn_path : { self.opening_key : svn_revnum,
    #                self.closing_key : svn_revnum }
    #                ...}
    self.things = { }

    # The key for the root node of the node tree
    self.root_key = '0'
    # The dictionary that holds our node tree, seeded with the root key.
    self.node_tree = { self.root_key : { } }

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE (a key of
    self.node_tree).  Returns a tuple (REVNUM, MAX_SCORE) holding that
    revision number and its score.

    PREFERRED_REVNUM is passed to self._best_rev and used to
    calculate the best revnum.

    If no revision can be found, an error is written to stderr and
    the program exits with status 1."""
    # Aggregate openings and closings from the rev tree
    openings = self._list_revnums_for_key(node, self.opening_key)
    closings = self._list_revnums_for_key(node, self.closing_key)

    # Score the lists
    scores = self._score_revisions(self._sum_revnum_counts(openings),
                                   self._sum_revnum_counts(closings))

    revnum, max_score = self._best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      # Report the symbolic name being filled (self.name).
      sys.stderr.write(error_prefix + ": failed to find a revision "
                       + "to copy from when copying %s\n" % self.name)
      sys.exit(1)
    return revnum, max_score


  def _best_rev(self, scores, preferred_rev):
    """Return a tuple (REV, MAX_SCORE) naming the revision with the
    highest score from SCORES, a list returned by _score_revisions().
    When the maximum score is shared by multiple revisions, the oldest
    revision is selected, unless PREFERRED_REV is one of the
    possibilities, in which case, it is selected.

    REV is SVN_INVALID_REVNUM when no entry of SCORES has a score
    greater than zero and PREFERRED_REV does not qualify."""
    max_score = 0
    preferred_rev_score = -1
    rev = SVN_INVALID_REVNUM
    if preferred_rev is None:
      # Comparison order of different types is arbitrary. Do not
      # expect None to compare less than int values below.
      # In Python 2.3 None compares with ints like negative infinity.
      # In Python 2.0 None compares with ints like positive infinity.
      preferred_rev = SVN_INVALID_REVNUM
    for revnum, count in scores:
      if count > max_score:
        max_score = count
        rev = revnum
      # A score holds from its revision up to the next entry, so the
      # last entry at or before PREFERRED_REV gives its score.
      if revnum <= preferred_rev:
        preferred_rev_score = count
    if preferred_rev_score == max_score:
      rev = preferred_rev
    return rev, max_score


  def _score_revisions(self, openings, closings):
    """Return a list of revisions and scores based on OPENINGS and
    CLOSINGS.  The returned list looks like:

       [(REV1 SCORE1), (REV2 SCORE2), ...]

    where REV2 > REV1.  OPENINGS and CLOSINGS are lists of
    (REVNUM, COUNT) tuples in ascending revision order, as returned by
    _sum_revnum_counts(); CLOSINGS may also be None.

    Each score indicates that copying the corresponding revision (or
    any following revision up to the next revision in the list) of the
    object in question would yield that many correct paths at or
    underneath the object.  There may be other paths underneath it
    which are not correct and would need to be deleted or recopied;
    those can only be detected by descending and examining their
    scores.

    If OPENINGS is false, return the empty list."""
    # First look for easy outs.
    if not openings:
      return []

    # Must be able to iterate over closings below.
    if closings is None:
      closings = []

    # Start with a running total of the openings.
    scores = []
    opening_score_accum = 0
    for opening_rev, opening_score in openings:
      opening_score_accum = opening_score_accum + opening_score
      scores.append((opening_rev, opening_score_accum))

    # Then subtract every closing from all scores at or after its
    # revision.  Entries found to precede one closing are skipped for
    # the closings that follow, via START.
    start = 0
    for closing_rev, closing_score in closings:
      done_exact_rev = None
      insert_index = None
      insert_score = None
      for j in range(start, len(scores)):
        score_rev, score = scores[j]
        if score_rev >= closing_rev:
          if not done_exact_rev:
            if score_rev > closing_rev:
              # The closing falls between two entries; it gets an
              # entry of its own, inserted once this scan is over.
              insert_index = j
              insert_score = scores[j-1][1] - closing_score
            done_exact_rev = 1
          scores[j] = (score_rev, score - closing_score)
        else:
          start = j + 1
      if not done_exact_rev:
        # The closing comes after every entry so far.
        scores.append((closing_rev,scores[-1][1] - closing_score))
      if insert_index is not None:
        scores.insert(insert_index, (closing_rev, insert_score))
    return scores

  def _sum_revnum_counts(self, rev_list):
    """Takes an array of revisions (REV_LIST), for example:

      [21, 18, 6, 49, 39, 24, 24, 24, 24, 24, 24, 24]

    and adds up every occurrence of each revision and returns a sorted
    array of tuples containing (svn_revnum, count):

      [(6, 1), (18, 1), (21, 1), (24, 7), (39, 1), (49, 1)]
    """
    counts = {}
    for revnum in rev_list:
      counts[revnum] = counts.get(revnum, 0) + 1
    summed = list(counts.items())
    summed.sort()
    return summed

  def _list_revnums_for_key(self, node, revnum_type_key):
    """Scan self.node_tree and return a list of all the revision
    numbers (including duplicates) contained in REVNUM_TYPE_KEY values
    for all leaf nodes at and under NODE (a key of self.node_tree).

    REVNUM_TYPE_KEY should be either self.opening_key or
    self.closing_key."""
    revnums = []

    # If the node has self.opening_key, it must be a leaf node--all
    # leaf nodes have at least an opening key (although they may not
    # have a closing key).  Fetch revnum and return
    if (self.node_tree[node].has_key(self.opening_key) and
        self.node_tree[node].has_key(revnum_type_key)):
      revnums.append(self.node_tree[node][revnum_type_key])
      return revnums

    for key, child_key in self.node_tree[node].items():
      # Keys starting with '/' are the opening/closing entries, not
      # path components.
      if key[0] == '/':
        continue
      revnums.extend(self._list_revnums_for_key(child_key, revnum_type_key))
    return revnums

  def register(self, svn_path, svn_revnum, type):
    """Collects opening and closing revisions for this
    SymbolicNameFillingGuide.  SVN_PATH is the source path that needs
    to be copied into self.name, and SVN_REVNUM is either the
    first svn revision number that we can copy from (our opening), or
    the last (not inclusive) svn revision number that we can copy from
    (our closing).  TYPE indicates whether this path is an opening or
    a closing.

    The opening for a given SVN_PATH must be passed before the closing
    for it to have any effect... any closing encountered before a
    corresponding opening will be discarded.

    It is not necessary to pass a corresponding closing for every
    opening.
    """
    # Always log an OPENING
    if type == OPENING:
      self.things[svn_path] = {self.opening_key: svn_revnum}
    # Only log a closing if we've already registered the opening for that path.
    elif type == CLOSING and self.things.has_key(svn_path):
      # When we have a non-trunk default branch, we may have multiple
      # closings--only register the first closing we encounter.
      if not self.things[svn_path].has_key(self.closing_key):
        self.things[svn_path][self.closing_key] = svn_revnum

  def make_node_tree(self):
    """Generates the SymbolicNameFillingGuide's node tree from
    self.things.  Each leaf node maps self.opening_key to the earliest
    subversion revision from which this node/path may be copied; and
    optionally map self.closing_key to the subversion revision one
    higher than the last revision from which this node/path may be
    copied.  Intermediate nodes never contain opening or closing
    flags."""

    for svn_path, open_close in self.things.items():
      parent_key = self.root_key

      # Walk down the path from the root, one component at a time,
      # adding the nodes that are not in the tree yet.
      for component in svn_path.split('/'):
        if not self.node_tree[parent_key].has_key(component):
          child_key = gen_key()
          self.node_tree[child_key] = { }
          self.node_tree[parent_key][component] = child_key
        else:
          child_key = self.node_tree[parent_key][component]

        parent_key = child_key
      # Having reached the leaf, attach the value
      self.node_tree[parent_key] = open_close
    #print_node_tree(self.node_tree, self.root_key)

  def is_empty(self):
    """Return true if we haven't accumulated any openings or closings,
    false otherwise."""
    return not len(self.things)
2693
2694
class FillSource:
  """Representation of a fill source used by the symbol filler in
  SVNRepositoryMirror."""
  def __init__(self, prefix, key):
    """Create an unscored fill source with a prefix and a key."""
    self.prefix = prefix
    self.key = key
    # Both stay None until set_score() is called.
    self.score = None
    self.revnum = None

  def set_score(self, score, revnum):
    """Set the SCORE and REVNUM."""
    self.score = score
    self.revnum = revnum

  def __cmp__(self, other):
    """Comparison operator used to sort FillSources in descending
    score order.  Raises TypeError if either FillSource has not been
    given a score yet."""
    if self.score is None or other.score is None:
      raise TypeError('Tried to compare unscored FillSource')
    # The operands are swapped on purpose: a higher score sorts first.
    return cmp(other.score, self.score)
2716
2717
2718 class SVNRepositoryMirror:
2719   """Mirror a Subversion Repository as it is constructed, one
2720   SVNCommit at a time.  The mirror is skeletal; it does not contain
2721   file contents.  The creation of a dumpfile or Subversion repository
2722   is handled by delegates.  See self.add_delegate method for how to
2723   set delegates.
2724
2725   The structure of the repository is kept in two databases and one
2726   hash.  The revs_db database maps revisions to root node keys, and
2727   the nodes_db database maps node keys to nodes.  A node is a hash
2728   from directory names to keys.  Both the revs_db and the nodes_db are
2729   stored on disk and each access is expensive.
2730
2731   The nodes_db database only has the keys for old revisions.  The
2732   revision that is being contructed is kept in memory in the new_nodes
2733   hash which is cheap to access.
2734
2735   You must invoke _start_commit between SVNCommits.
2736
2737   *** WARNING *** All path arguments to methods in this class CANNOT
2738       have leading or trailing slashes.
2739   """
2740
2741   class SVNRepositoryMirrorPathExistsError(Exception):
2742     """Exception raised if an attempt is made to add a path to the
2743     repository mirror and that path already exists in the youngest
2744     revision of the repository."""
2745     pass
2746
2747   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
2748     """Exception raised if a CVSRevision is found to have an unexpected
2749     operation (OP) value."""
2750     pass
2751
2752   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
2753     """Exception raised if an empty SymbolicNameFillingGuide is returned
2754     during a fill where the branch in question already exists."""
2755     pass
2756
2757   def __init__(self):
2758     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
2759     self.delegates = [ ]
2760
2761     # This corresponds to the 'revisions' table in a Subversion fs.
2762     self.revs_db = Database(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
2763     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
2764
2765     # This corresponds to the 'nodes' table in a Subversion fs.  (We
2766     # don't need a 'representations' or 'strings' table because we
2767     # only track metadata, not file contents.)
2768     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
2769     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
2770
2771     # Start at revision 0 without a root node.  It will be created
2772     # by _open_writable_root_node.
2773     self.youngest = 0
2774     self.new_root_key = None
2775     self.new_nodes = { }
2776
2777     if not Ctx().trunk_only:
2778       ###PERF IMPT: Suck this into memory.
2779       self.tags_db = TagsDatabase(DB_OPEN_READ)
2780       self.symbolings_reader = SymbolingsReader()
2781
2782   def _initialize_repository(self, date):
2783     """Initialize the repository by creating the directories for
2784     trunk, tags, and branches.  This method should only be called
2785     after all delegates are added to the repository mirror."""
2786     # Make a 'fake' SVNCommit so we can take advantage of the revprops
2787     # magic therein
2788     svn_commit = SVNCommit("Initialization", 1)
2789     svn_commit.set_date(date)
2790     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
2791
2792     self._start_commit(svn_commit)
2793     self._mkdir(Ctx().trunk_base)
2794     if not Ctx().trunk_only:
2795       self._mkdir(Ctx().branches_base)
2796       self._mkdir(Ctx().tags_base)
2797
2798   def _start_commit(self, svn_commit):
2799     """Start a new commit."""
2800     if self.youngest > 0:
2801       self._end_commit()
2802
2803     self.youngest = svn_commit.revnum
2804     self.new_root_key = None
2805     self.new_nodes = { }
2806
2807     self._invoke_delegates('start_commit', svn_commit)
2808
2809   def _end_commit(self):
2810     """Called at the end of each commit.  This method copies the newly
2811     created nodes to the on-disk nodes db."""
2812     if self.new_root_key is None:
2813       # No changes were made in this revision, so we make the root node
2814       # of the new revision be the same as the last one.
2815       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
2816     else:
2817       self.revs_db[str(self.youngest)] = self.new_root_key
2818       # Copy the new nodes to the nodes_db
2819       for key, value in self.new_nodes.items():
2820         self.nodes_db[key] = value
2821
2822   def _get_node(self, key):
2823     """Returns the node contents for KEY which may refer to either
2824     self.nodes_db or self.new_nodes."""
2825     if self.new_nodes.has_key(key):
2826       return self.new_nodes[key]
2827     else:
2828       return self.nodes_db[key]
2829
2830   def _open_readonly_node(self, path, revnum):
2831     """Open a readonly node for PATH at revision REVNUM.  Returns the
2832     node key and node contents if the path exists, else (None, None)."""
2833     # Get the root key
2834     if revnum == self.youngest:
2835       if self.new_root_key is None:
2836         node_key = self.revs_db[str(self.youngest - 1)]
2837       else:
2838         node_key = self.new_root_key
2839     else:
2840       node_key = self.revs_db[str(revnum)]
2841
2842     for component in path.split('/'):
2843       node_contents = self._get_node(node_key)
2844       if not node_contents.has_key(component):
2845         return None
2846       node_key = node_contents[component]
2847
2848     return node_key
2849
2850   def _open_writable_root_node(self):
2851     """Open a writable root node.  The current root node is returned
2852     immeditely if it is already writable.  If not, create a new one by
2853     copying the contents of the root node of the previous version."""
2854     if self.new_root_key is not None:
2855       return self.new_root_key, self.new_nodes[self.new_root_key]
2856
2857     if self.youngest < 2:
2858       new_contents = { }
2859     else:
2860       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
2861     self.new_root_key = gen_key()
2862     self.new_nodes = { self.new_root_key: new_contents }
2863
2864     return self.new_root_key, new_contents
2865
2866   def _open_writable_node(self, svn_path, create):
2867     """Open a writable node for the path SVN_PATH, creating SVN_PATH
2868     and any missing directories if CREATE is True."""
2869     parent_key, parent_contents = self._open_writable_root_node()
2870
2871     # Walk up the path, one node at a time.
2872     path_so_far = None
2873     components = svn_path.split('/')
2874     for i in range(len(components)):
2875       component = components[i]
2876       this_key = this_contents = None
2877       path_so_far = _path_join(path_so_far, component)
2878       if parent_contents.has_key(component):
2879         # The component exists.
2880         this_key = parent_contents[component]
2881         if self.new_nodes.has_key(this_key):
2882           this_contents = self.new_nodes[this_key]
2883         else:
2884           # Suck the node from the nodes_db, but update the key
2885           this_contents = self.nodes_db[this_key]
2886           this_key = gen_key()
2887           self.new_nodes[this_key] = this_contents
2888           parent_contents[component] = this_key
2889       elif create:
2890         # The component does not exists, so we create it.
2891         this_contents = { }
2892         this_key = gen_key()
2893         self.new_nodes[this_key] = this_contents
2894         parent_contents[component] = this_key
2895         if i < len(components) - 1:
2896           self._invoke_delegates('mkdir', path_so_far)
2897       else:
2898         # The component does not exists and we are not instructed to
2899         # create it, so we give up.
2900         return None, None
2901
2902       parent_key = this_key
2903       parent_contents = this_contents
2904
2905     return this_key, this_contents
2906
2907   def _path_exists(self, path):
2908     """If PATH exists in self.youngest of the svn repository mirror,
2909     return true, else return None.
2910
2911     PATH must not start with '/'."""
2912     return self._open_readonly_node(path, self.youngest) is not None
2913
2914   def _fast_delete_path(self, parent_path, parent_contents, component):
2915     """Delete COMPONENT from the parent direcory PARENT_PATH with the
2916     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
2917     in PARENT_CONTENTS."""
2918     if parent_contents.has_key(component):
2919       del parent_contents[component]
2920       self._invoke_delegates('delete_path', _path_join(parent_path, component))
2921
2922   def _delete_path(self, svn_path, should_prune=False):
2923     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
2924     all ancestor directories that are made empty when SVN_PATH is deleted.
2925     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
2926
2927     NOTE: This function does *not* allow you delete top-level entries
2928     (like /trunk, /branches, /tags), nor does it prune upwards beyond
2929     those entries."""
2930     pos = svn_path.rfind('/')
2931     parent_path = svn_path[:pos]
2932     entry = svn_path[pos+1:]
2933     parent_key, parent_contents = self._open_writable_node(parent_path, False)
2934     if parent_key is not None:
2935       self._fast_delete_path(parent_path, parent_contents, entry)
2936       # The following recursion makes pruning an O(n^2) operation in the
2937       # worst case (where n is the depth of SVN_PATH), but the worst case
2938       # is probably rare, and the constant cost is pretty low.  Another
2939       # drawback is that we issue a delete for each path and not just
2940       # a single delete for the topmost directory pruned.
2941       if (should_prune and len(parent_contents) == 0 and
2942           parent_path.find('/') != -1):
2943         self._delete_path(parent_path, True)
2944
2945   def _mkdir(self, path):
2946     """Create PATH in the repository mirror at the youngest revision."""
2947     self._open_writable_node(path, True)
2948     self._invoke_delegates('mkdir', path)
2949
2950   def _change_path(self, cvs_rev):
2951     """Register a change in self.youngest for the CVS_REV's svn_path
2952     in the repository mirror."""
2953     # We do not have to update the nodes because our mirror is only
2954     # concerned with the presence or absence of paths, and a file
2955     # content change does not cause any path changes.
2956     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, 0))
2957
2958   def _add_path(self, cvs_rev):
2959     """Add the CVS_REV's svn_path to the repository mirror."""
2960     self._open_writable_node(cvs_rev.svn_path, True)
2961     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, 1))
2962
2963   def _copy_path(self, src_path, dest_path, src_revnum):
2964     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
2965     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
2966     parent *must* exist, but DEST_PATH *cannot* exist.
2967
2968     Return the node key and the contents of the new node at DEST_PATH
2969     as a dictionary."""
2970     # get the contents of the node of our src_path
2971     src_key = self._open_readonly_node(src_path, src_revnum)
2972     src_contents = self._get_node(src_key)
2973
2974     # Get the parent path and the base path of the dest_path
2975     pos = dest_path.rindex('/')
2976     dest_parent = dest_path[:pos]
2977     dest_basename = dest_path[pos+1:]
2978     dest_parent_key, dest_parent_contents = \
2979                    self._open_writable_node(dest_parent, False)
2980
2981     if dest_parent_contents.has_key(dest_basename):
2982       msg = "Attempt to add path '%s' to repository mirror " % dest_path
2983       msg = msg + "when it already exists in the mirror."
2984       raise self.SVNRepositoryMirrorPathExistsError, msg
2985
2986     dest_parent_contents[dest_basename] = src_key
2987     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
2988
2989     # Yes sir, src_key and src_contents are also the contents of the
2990     # destination.  This is a cheap copy, remember!  :-)
2991     return src_key, src_contents
2992
2993   def _fill_symbolic_name(self, svn_commit):
2994     """Performs all copies necessary to create as much of the the tag
2995     or branch SVN_COMMIT.symbolic_name as possible given the current
2996     revision of the repository mirror.
2997
2998     The symbolic name is guaranteed to exist in the Subversion
2999     repository by the end of this call, even if there are no paths
3000     under it."""
3001     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3002       svn_commit.symbolic_name, self.youngest)
3003
3004     # Create the list of sources for the symbolic name.  All source
3005     # prefixes must be direct sources for the destination, i.e. we
3006     # must have 'trunk' and 'branches/my_branch' and not just
3007     # 'branches'.
3008     sources = []
3009     for entry, key in symbol_fill.node_tree[symbol_fill.root_key].items():
3010       if entry == Ctx().trunk_base:
3011         sources.append(FillSource(entry, key))
3012       elif entry == Ctx().branches_base:
3013         for entry2, key2 in symbol_fill.node_tree[key].items():
3014           sources.append(FillSource(entry + '/' + entry2, key2))
3015       else:
3016         raise # Should never happen
3017     if self.tags_db.has_key(svn_commit.symbolic_name):
3018       dest_prefix = _path_join(Ctx().tags_base, svn_commit.symbolic_name)
3019     else:
3020       dest_prefix = _path_join(Ctx().branches_base,
3021                                svn_commit.symbolic_name)
3022
3023     if sources:
3024       dest_key = self._open_writable_node(dest_prefix, False)[0]
3025       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3026     else:
3027       # We can only get here for a branch whose first commit is an add
3028       # (as opposed to a copy).
3029       dest_path = Ctx().branches_base + '/' + symbol_fill.name
3030       if not self._path_exists(dest_path):
3031         # If our symbol_fill was empty, that means that our first
3032         # commit on the branch was to a file added on the branch, and
3033         # that this is our first fill of that branch.
3034         #
3035         # This case is covered by test 16.
3036         #
3037         # ...we create the branch by copying trunk from the our
3038         # current revision number minus 1
3039         source_path = Ctx().trunk_base
3040         entries = self._copy_path(source_path, dest_path,
3041                                   svn_commit.revnum - 1)[1]
3042         # Now since we've just copied trunk to a branch that's
3043         # *supposed* to be empty, we delete any entries in the
3044         # copied directory.
3045         for entry in entries.keys():
3046           del_path = dest_path + '/' + entry
3047           # Delete but don't prune.
3048           self._delete_path(del_path)
3049       else:
3050         msg = "Error filling branch '" + symbol_fill.name + "'.\n"
3051         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3052         msg = msg + "attempted to create a branch that already exists."
3053         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3054
3055   def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
3056             path = None, parent_source_prefix = None,
3057             preferred_revnum = None, prune_ok = None):
3058     """Fill the tag or branch at DEST_PREFIX + PATH with items from
3059     SOURCES, and recurse into the child items.
3060
3061     DEST_PREFIX is the prefix of the destination directory, e.g.
3062     '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
3063     FillSource classes that are candidates to be copied to the
3064     destination.  DEST_KEY is the key in self.nodes_db to the
3065     destination, or None if the destination does not yet exist.
3066
3067     PATH is the path relative to DEST_PREFIX.  If PATH is None, we
3068     are at the top level, e.g. '/tags/my_tag'.
3069
3070     PARENT_SOURCE_PREFIX is the source prefix that was used to copy
3071     the parent directory, and PREFERRED_REVNUM is an int which is the
3072     source revision number that the caller (who may have copied KEY's
3073     parent) used to perform its copy.  If PREFERRED_REVNUM is None,
3074     then no revision is preferable to any other (which probably means
3075     that no copies have happened yet).
3076
3077     PRUNE_OK means that a copy has been made in this recursion, and
3078     it's safe to prune directories that are not in
3079     SYMBOL_FILL.node_tree, provided that said directory has a source
3080     prefix of one of the PARENT_SOURCE_PREFIX.
3081
3082     PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
3083     should only be passed in by recursive calls."""
3084     # Calculate scores and revnums for all sources
3085     for source in sources:
3086       src_revnum, score = symbol_fill.get_best_revnum(source.key,
3087                                                       preferred_revnum)
3088       source.set_score(score, src_revnum)
3089
3090     # Sort the sources in descending score order so that we will make
3091     # a eventual copy from the source with the highest score.
3092     sources.sort()
3093     copy_source = sources[0]
3094
3095     src_path = _path_join(copy_source.prefix, path)
3096     dest_path = _path_join(dest_prefix, path)
3097
3098     # Figure out if we shall copy to this destination and delete any
3099     # destination path that is in the way.
3100     do_copy = 0
3101     if dest_key is None:
3102       do_copy = 1
3103     elif prune_ok and (parent_source_prefix != copy_source.prefix or
3104                        copy_source.revnum != preferred_revnum):
3105       # We are about to replace the destination, so we need to remove
3106       # it before we perform the copy.
3107       self._delete_path(dest_path)
3108       do_copy = 1
3109
3110     if do_copy:
3111       dest_key, dest_entries = self._copy_path(src_path, dest_path,
3112                                                copy_source.revnum)
3113       prune_ok = 1
3114     else:
3115       dest_entries = self._get_node(dest_key)
3116
3117     # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
3118     # elements and the values are lists of FillSource classes where
3119     # this path element exists.
3120     src_entries = {}
3121     for source in sources:
3122       for entry, key in symbol_fill.node_tree[source.key].items():
3123         if entry[0] == '/': # Skip flags
3124           continue
3125         if not src_entries.has_key(entry):
3126           src_entries[entry] = []
3127         src_entries[entry].append(FillSource(source.prefix, key))
3128
3129     if prune_ok:
3130       # Delete the entries in DEST_ENTRIES that are not in src_entries.
3131       delete_list = [ ]
3132       for entry in dest_entries.keys():
3133         if not src_entries.has_key(entry):
3134           delete_list.append(entry)
3135       if delete_list:
3136         if not self.new_nodes.has_key(dest_key):
3137           dest_key, dest_entries = self._open_writable_node(dest_path, True)
3138         # Sort the delete list to get "diffable" dumpfiles.
3139         delete_list.sort()
3140         for entry in delete_list:
3141           self._fast_delete_path(dest_path, dest_entries, entry)
3142
3143     # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
3144     src_keys = src_entries.keys()
3145     src_keys.sort()
3146     for src_key in src_keys:
3147       if dest_entries.has_key(src_key):
3148         next_dest_key = dest_entries[src_key]
3149       else:
3150         next_dest_key = None
3151       self._fill(symbol_fill, dest_prefix, next_dest_key,
3152                  src_entries[src_key], _path_join(path, src_key),
3153                  copy_source.prefix, sources[0].revnum, prune_ok)
3154
3155   def _synchronize_default_branch(self, svn_commit):
3156     """Propagate any changes that happened on a non-trunk default
3157     branch to the trunk of the repository.  See
3158     CVSCommit._post_commit() for details on why this is necessary."""
3159     for cvs_rev in svn_commit.cvs_revs:
3160       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3161         if self._path_exists(cvs_rev.svn_trunk_path):
3162           # Delete the path on trunk...
3163           self._delete_path(cvs_rev.svn_trunk_path)
3164         # ...and copy over from branch
3165         self._copy_path(cvs_rev.svn_path, cvs_rev.svn_trunk_path,
3166                         svn_commit.motivating_revnum)
3167       elif cvs_rev.op == OP_DELETE:
3168         # delete trunk path
3169         self._delete_path(cvs_rev.svn_trunk_path)
3170       else:
3171         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3172                % cvs_rev.op)
3173         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3174
3175   def commit(self, svn_commit):
3176     """Add an SVNCommit to the SVNRepository, incrementing the
3177     Repository revision number, and changing the repository.  Invoke
3178     the delegates' _start_commit() method."""
3179
3180     if svn_commit.revnum == 2:
3181       self._initialize_repository(svn_commit.get_date())
3182
3183     self._start_commit(svn_commit)
3184
3185     if svn_commit.symbolic_name:
3186       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3187                   svn_commit.symbolic_name)
3188       self._fill_symbolic_name(svn_commit)
3189     elif svn_commit.motivating_revnum:
3190       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3191                   % svn_commit.motivating_revnum)
3192       self._synchronize_default_branch(svn_commit)
3193     else: # This actually commits CVSRevisions
3194       if len(svn_commit.cvs_revs) > 1: plural = "s"
3195       else: plural = ""
3196       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3197                   % (len(svn_commit.cvs_revs), plural))
3198       for cvs_rev in svn_commit.cvs_revs:
3199         # See comment in CVSCommit._commit() for what this is all
3200         # about.  Note that although asking self._path_exists() is
3201         # somewhat expensive, we only do it if the first two (cheap)
3202         # tests succeed first.
3203         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3204                 and (cvs_rev.rev == "1.1.1.1")
3205                 and self._path_exists(cvs_rev.svn_path)):
3206           if cvs_rev.op == OP_ADD:
3207             self._add_path(cvs_rev)
3208           elif cvs_rev.op == OP_CHANGE:
3209             # Fix for Issue #74:
3210             #
3211             # Here's the scenario.  You have file FOO that is imported
3212             # on a non-trunk vendor branch.  So in r1.1 and r1.1.1.1,
3213             # the file exists.
3214             #
3215             # Moving forward in time, FOO is deleted on the default
3216             # branch (r1.1.1.2).  cvs2svn determines that this delete
3217             # also needs to happen on trunk, so FOO is deleted on
3218             # trunk.
3219             #
3220             # Along come r1.2, whose op is OP_CHANGE (because r1.1 is
3221             # not 'dead', we assume it's a change).  However, since
3222             # our trunk file has been deleted, svnadmin blows up--you
3223             # can't change a file that doesn't exist!
3224             #
3225             # Soooo... we just check the path, and if it doesn't
3226             # exist, we do an add... if the path does exist, it's
3227             # business as usual.
3228             if not self._path_exists(cvs_rev.svn_path):
3229               self._add_path(cvs_rev)
3230             else:
3231               self._change_path(cvs_rev)
3232
3233         if cvs_rev.op == OP_DELETE:
3234           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3235
3236   def cleanup(self):
3237     """Callback for the Cleanup.register in self.__init__."""
3238     self.revs_db = None
3239     self.nodes_db = None
3240
3241   def add_delegate(self, delegate):
3242     """Adds DELEGATE to self.delegates.
3243
3244     For every delegate you add, as soon as SVNRepositoryMirror
3245     performs a repository action method, SVNRepositoryMirror will call
3246     the delegate's corresponding repository action method.  Multiple
3247     delegates will be called in the order that they are added.  See
3248     SVNRepositoryMirrorDelegate for more information."""
3249     self.delegates.append(delegate)
3250
3251   def _invoke_delegates(self, method, *args):
3252     """Iterate through each of our delegates, in the order that they
3253     were added, and call the delegate's method named METHOD with the
3254     arguments in ARGS."""
3255     for delegate in self.delegates:
3256       getattr(delegate, method)(*args)
3257
3258   def finish(self):
3259     """Calls the delegate finish method."""
3260     self._end_commit()
3261     self._invoke_delegates('finish')
3262     self.cleanup()
3263
3264
class SVNCommitItem:
  """A wrapper class for CVSRevision objects upon which
  Subversion-related data (such as properties) may be hung."""

  def __init__(self, c_rev, make_svn_props):
    """Wrap the CVSRevision C_REV and compute self.svn_props.

    If Ctx().cvs_revnums is true, the 'cvs2svn:cvs-rev' property is
    set to the CVS revision number and MAKE_SVN_PROPS is forced on.
    If MAKE_SVN_PROPS is true, the 'svn:executable', 'svn:keywords',
    'svn:mime-type' and 'svn:eol-style' properties are filled in as
    appropriate for C_REV and the options in Ctx()."""
    self.c_rev = c_rev
    self.set_cvs_revnum_properties = Ctx().cvs_revnums
    self.eol_from_mime_type = Ctx().eol_from_mime_type
    self.no_default_eol = Ctx().no_default_eol
    self.keywords_off = Ctx().keywords_off
    self.mime_mapper = Ctx().mime_mapper

    # We begin with only a "CVS revision" property.
    self.svn_props = { }
    if self.set_cvs_revnum_properties:
      self.svn_props['cvs2svn:cvs-rev'] = c_rev.rev
      make_svn_props = True

    # Nothing more to do unless the 'svn:' properties were asked for.
    if not make_svn_props:
      return

    # Tack on the executableness, if any.
    if c_rev.file_executable:
      self.svn_props['svn:executable'] = '*'

    # Set the svn:keywords property, if appropriate.  See issue #2.
    if c_rev.mode in (None, 'kv', 'kvl') and not self.keywords_off:
      self.svn_props['svn:keywords'] = 'Author Date Id Revision'

    # Mime-type and eol-style are intertwingled; follow the
    # conditionals carefully.  See also issue #39.
    mime_type = None
    if self.mime_mapper:
      mime_type = self.mime_mapper.get_type_from_filename(c_rev.cvs_path)

    eol_style = None
    if c_rev.mode == 'b':
      # The file is kb; fall back to a generic binary type if no other
      # mime type was determined.
      if mime_type is None:
        mime_type = 'application/octet-stream'
    elif not self.no_default_eol:
      eol_style = 'native'
    elif (mime_type and self.eol_from_mime_type
          and mime_type.startswith("text/")):
      eol_style = 'native'

    if mime_type:
      self.svn_props['svn:mime-type'] = mime_type

    if eol_style:
      self.svn_props['svn:eol-style'] = eol_style
3319
3320
class SVNRepositoryMirrorDelegate:
  """Abstract superclass for any delegate to SVNRepositoryMirror.
  Subclasses must implement all of the methods below; every method
  here just raises NotImplementedError.

  Each method stands for the Subversion operation that its name
  implies, and a subclass carries that operation out in its own way.
  For example, for the add_path method, the DumpfileDelegate would
  write out a "Node-add:" command to a Subversion dumpfile, the
  StdoutDelegate would merely print that the path is being added to
  the repository, and the RepositoryDelegate would actually cause the
  path to be added to the Subversion repository that it is creating.
  """

  def start_commit(self, svn_commit):
    """Begin the SVNCommit SVN_COMMIT, doing whatever the subclass
    needs at that point; see subclass implementation for details."""
    raise NotImplementedError()

  def mkdir(self, path):
    """Handle the creation of the directory PATH (a string); see
    subclass implementation for details."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """Handle the addition of S_ITEM (an SVNCommitItem); see subclass
    implementation for details."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """Handle a change to S_ITEM (an SVNCommitItem); see subclass
    implementation for details."""
    raise NotImplementedError()

  def delete_path(self, path):
    """Handle the deletion of PATH (a string); see subclass
    implementation for details."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """Handle a copy of SRC_PATH at SRC_REVNUM to DEST_PATH.  Both
    paths are strings and SRC_REVNUM is a subversion revision number
    (int); see subclass implementation for details."""
    raise NotImplementedError()

  def finish(self):
    """Do whatever cleanup is needed once all revisions have been
    committed."""
    raise NotImplementedError()
3368
3369
3370 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
3371   """Create a Subversion dumpfile."""
3372
3373   def __init__(self, dumpfile_path=None):
3374     """Return a new DumpfileDelegate instance, attached to a dumpfile
3375     DUMPFILE_PATH (Ctx().dumpfile, if None), using Ctx().encoding.
3376
3377     If Ctx().cvs_revnums is true, then set the 'cvs2svn:cvs-revnum'
3378     property on files, when they are changed due to a corresponding
3379     CVS revision.
3380
3381     If Ctx().mime_mapper is not None, then it is a MimeMapper
3382     instance, used to determine whether or not to set the
3383     'svn:mime-type' property on files.  But even if Ctx().mime_mapper
3384     is None, files marked with the CVS 'kb' flag will receive a mime
3385     type of "application/octet-stream".
3386
3387     Unless Ctx().no_default_eol is true, set 'svn:eol-style' to
3388     'native' for files not marked with the CVS 'kb' flag, except as
3389     superseded by Ctx().eol_from_mime_type (see below).
3390
3391     If Ctx().eol_from_mime_type is not None, then set 'svn:eol-style'
3392     to 'native' for all files to which Ctx().mime_mapper assigns a
3393     mime type beginning with "text/", and don't set 'svn:eol-style'
3394     for files assigned a type not beginning with "text/".
3395     """
3396     if dumpfile_path:
3397       self.dumpfile_path = dumpfile_path
3398     else:
3399       self.dumpfile_path = Ctx().dumpfile
3400     self.path_encoding = Ctx().encoding
3401
3402     self.dumpfile = open(self.dumpfile_path, 'wb')
3403     self._write_dumpfile_header(self.dumpfile)
3404
3405   def _write_dumpfile_header(self, dumpfile):
3406     # Initialize the dumpfile with the standard headers.
3407     #
3408     # Since the CVS repository doesn't have a UUID, and the Subversion
3409     # repository will be created with one anyway, we don't specify a
3410     # UUID in the dumpflie
3411     dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
3412
3413   def _utf8_path(self, path):
3414     """Return a copy of PATH encoded in UTF-8.  PATH is assumed to be
3415     encoded in self.path_encoding."""
3416     try:
3417       # Log messages can be converted with the 'replace' strategy,
3418       # but we can't afford any lossiness here.
3419       unicode_path = unicode(path, self.path_encoding, 'strict')
3420       return unicode_path.encode('utf-8')
3421     except UnicodeError:
3422       print "Unable to convert a path '%s' to internal encoding." % path
3423       print "Consider rerunning with (for example) '--encoding=latin1'"
3424       sys.exit(1)
3425
3426   def start_commit(self, svn_commit):
3427     """Emit the start of SVN_COMMIT (an SVNCommit)."""
3428
3429     self.revision = svn_commit.revnum
3430
3431     # The start of a new commit typically looks like this:
3432     #
3433     #   Revision-number: 1
3434     #   Prop-content-length: 129
3435     #   Content-length: 129
3436     #
3437     #   K 7
3438     #   svn:log
3439     #   V 27
3440     #   Log message for revision 1.
3441     #   K 10
3442     #   svn:author
3443     #   V 7
3444     #   jrandom
3445     #   K 8
3446     #   svn:date
3447     #   V 27
3448     #   2003-04-22T22:57:58.132837Z
3449     #   PROPS-END
3450     #
3451     # Notice that the length headers count everything -- not just the
3452     # length of the data but also the lengths of the lengths, including
3453     # the 'K ' or 'V ' prefixes.
3454     #
3455     # The reason there are both Prop-content-length and Content-length
3456     # is that the former includes just props, while the latter includes
3457     # everything.  That's the generic header form for any entity in a
3458     # dumpfile.  But since revisions only have props, the two lengths
3459     # are always the same for revisions.
3460
3461     # Calculate the total length of the props section.
3462     props = svn_commit.get_revprops()
3463     prop_names = props.keys()
3464     prop_names.sort()
3465     total_len = 10  # len('PROPS-END\n')
3466     for propname in prop_names:
3467       if props[propname] is None:
3468         continue
3469       klen = len(propname)
3470       klen_len = len('K %d' % klen)
3471       vlen = len(props[propname])
3472       vlen_len = len('V %d' % vlen)
3473       # + 4 for the four newlines within a given property's section
3474       total_len = total_len + klen + klen_len + vlen + vlen_len + 4
3475
3476     # Print the revision header and props
3477     self.dumpfile.write('Revision-number: %d\n'
3478                         'Prop-content-length: %d\n'
3479                         'Content-length: %d\n'
3480                         '\n'
3481                         % (self.revision, total_len, total_len))
3482
3483     for propname in prop_names:
3484       if props[propname] is None:
3485         continue
3486       self.dumpfile.write('K %d\n'
3487                           '%s\n'
3488                           'V %d\n'
3489                           '%s\n' % (len(propname),
3490                                     propname,
3491                                     len(props[propname]),
3492                                     props[propname]))
3493
3494     self.dumpfile.write('PROPS-END\n')
3495     self.dumpfile.write('\n')
3496
3497   def mkdir(self, path):
3498     """Emit the creation of directory PATH."""
3499     self.dumpfile.write("Node-path: %s\n"
3500                         "Node-kind: dir\n"
3501                         "Node-action: add\n"
3502                         "Content-length: 10\n"
3503                         "\n"
3504                         "\n" % self._utf8_path(path))
3505
3506   def _add_or_change_path(self, s_item, op):
3507     """Emit the addition or change corresponding to S_ITEM.
3508     OP is either the constant OP_ADD or OP_CHANGE."""
3509
3510     # Validation stuffs
3511     if op == OP_ADD:
3512       action = 'add'
3513     elif op == OP_CHANGE:
3514       action = 'change'
3515     else:
3516       sys.stderr.write("%s: _add_or_change_path() called with bad op ('%s')"
3517                        % (error_prefix, op))
3518       sys.exit(1)
3519
3520     # Convenience variables
3521     c_rev = s_item.c_rev
3522     svn_props = s_item.svn_props
3523
3524     # The property handling here takes advantage of an undocumented
3525     # but IMHO consistent feature of the Subversion dumpfile-loading
3526     # code.  When a node's properties aren't mentioned (that is, the
3527     # "Prop-content-length:" header is absent, no properties are
3528     # listed at all, and there is no "PROPS-END\n" line) then no
3529     # change is made to the node's properties.
3530     #
3531     # This is consistent with the way dumpfiles behave w.r.t. text
3532     # content changes, so I'm comfortable relying on it.  If you
3533     # commit a change to *just* the properties of some node that
3534     # already has text contents from a previous revision, then in the
3535     # dumpfile output for the prop change, no "Text-content-length:"
3536     # nor "Text-content-md5:" header will be present, and the text of
3537     # the file will not be given.  But this does not cause the file's
3538     # text to be erased!  It simply remains unchanged.
3539     #
3540     # This works out great for cvs2svn, due to lucky coincidences:
3541     #
3542     # For files, the only properties we ever set are set in the first
3543     # revision; all other revisions (including on branches) inherit
3544     # from that.  After the first revision, we never change file
3545     # properties, therefore, there is no need to remember the full set
3546     # of properties on a given file once we've set it.
3547     #
3548     # For directories, the only property we set is "svn:ignore", and
3549     # while we may change it after the first revision, we always do so
3550     # based on the contents of a ".cvsignore" file -- in other words,
3551     # CVS is doing the remembering for us, so we still don't have to
3552     # preserve the previous value of the property ourselves.
3553
3554     # Calculate the (sorted-by-name) property string and length, if any.
3555     prop_contents = ''
3556     prop_names = svn_props.keys()
3557     prop_names.sort()
3558     for pname in prop_names:
3559       pval = svn_props[pname]
3560       prop_contents = prop_contents + \
3561                       'K %d\n%s\nV %d\n%s\n' \
3562                       % (len(pname), pname, len(pval), pval)
3563     if prop_contents:
3564       prop_contents = prop_contents + 'PROPS-END\n'
3565       props_len = len(prop_contents)
3566     else:
3567       props_len = 0
3568
3569     props_header = ''
3570     if props_len:
3571       props_header = 'Prop-content-length: %d\n' % props_len
3572
3573     # treat .cvsignore as a directory property
3574     dir_path, basename = os.path.split(c_rev.svn_path)
3575     if basename == ".cvsignore":
3576       ignore_vals = generate_ignores(c_rev)
3577       ignore_contents = '\n'.join(ignore_vals)
3578       ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
3579                          (len(ignore_contents), ignore_contents))
3580       ignore_contents = ignore_contents + 'PROPS-END\n'
3581       ignore_len = len(ignore_contents)
3582
3583       # write headers, then props
3584       self.dumpfile.write('Node-path: %s\n'
3585                           'Node-kind: dir\n'
3586                           'Node-action: change\n'
3587                           'Prop-content-length: %d\n'
3588                           'Content-length: %d\n'
3589                           '\n'
3590                           '%s'
3591                           % (self._utf8_path(dir_path), ignore_len,
3592                              ignore_len, ignore_contents))
3593
3594     pipe_cmd, pipe = get_co_pipe(c_rev)
3595     self.dumpfile.write('Node-path: %s\n'
3596                         'Node-kind: file\n'
3597                         'Node-action: %s\n'
3598                         '%s'  # no property header if no props
3599                         'Text-content-length: '
3600                         % (self._utf8_path(c_rev.svn_path),
3601                            action, props_header))
3602
3603     pos = self.dumpfile.tell()
3604
3605     self.dumpfile.write('0000000000000000\n'
3606                         'Text-content-md5: 00000000000000000000000000000000\n'
3607                         'Content-length: 0000000000000000\n'
3608                         '\n')
3609
3610     if prop_contents:
3611       self.dumpfile.write(prop_contents)
3612
3613     # Insert the rev contents, calculating length and checksum as we go.
3614     checksum = md5.new()
3615     length = 0
3616     normalize_crlf = sys.platform == "win32" \
3617                      and svn_props.has_key('svn:eol-style')
3618     trailing_cr = ""
3619     buf = pipe.fromchild.read(PIPE_READ_SIZE)
3620     while buf:
3621       if normalize_crlf:
3622         buf = string.replace(buf,"\r\n","\n")
3623         if buf[-1] == "\r":
3624           trailing_cr = "\r"
3625           buf = buf[:-1]
3626         else:
3627           trailing_cr = ""
3628       checksum.update(buf)
3629       length = length + len(buf)
3630       self.dumpfile.write(buf)
3631       # optimize because of python's immutable strings
3632       if trailing_cr:
3633         buf = trailing_cr + pipe.fromchild.read(PIPE_READ_SIZE)
3634       else:
3635         buf = pipe.fromchild.read(PIPE_READ_SIZE)
3636     pipe.fromchild.close()
3637     error_output = pipe.childerr.read()
3638     exit_status = pipe.wait()
3639     if exit_status:
3640       sys.exit("%s: The command '%s' failed with exit status: %s\n"
3641                "and the following output:\n"
3642                "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
3643
3644     # Go back to patch up the length and checksum headers:
3645     self.dumpfile.seek(pos, 0)
3646     # We left 16 zeros for the text length; replace them with the real
3647     # length, padded on the left with spaces:
3648     self.dumpfile.write('%16d' % length)
3649     # 16... + 1 newline + len('Text-content-md5: ') == 35
3650     self.dumpfile.seek(pos + 35, 0)
3651     self.dumpfile.write(checksum.hexdigest())
3652     # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
3653     self.dumpfile.seek(pos + 84, 0)
3654     # The content length is the length of property data, text data,
3655     # and any metadata around/inside around them.
3656     self.dumpfile.write('%16d' % (length + props_len))
3657     # Jump back to the end of the stream
3658     self.dumpfile.seek(0, 2)
3659
3660     # This record is done (write two newlines -- one to terminate
3661     # contents that weren't themselves newline-termination, one to
3662     # provide a blank line for readability.
3663     self.dumpfile.write('\n\n')
3664
3665   def add_path(self, s_item):
3666     """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""
3667     self._add_or_change_path(s_item, OP_ADD)
3668
3669   def change_path(self, s_item):
3670     """Emit the change corresponding to S_ITEM, an SVNCommitItem."""
3671     self._add_or_change_path(s_item, OP_CHANGE)
3672
3673   def delete_path(self, path):
3674     """Emit the deletion of PATH."""
3675     self.dumpfile.write('Node-path: %s\n'
3676                         'Node-action: delete\n'
3677                         '\n' % self._utf8_path(path))
3678
3679   def copy_path(self, src_path, dest_path, src_revnum):
3680     """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
3681     # We don't need to include "Node-kind:" for copies; the loader
3682     # ignores it anyway and just uses the source kind instead.
3683     self.dumpfile.write('Node-path: %s\n'
3684                         'Node-action: add\n'
3685                         'Node-copyfrom-rev: %d\n'
3686                         'Node-copyfrom-path: /%s\n'
3687                         '\n'
3688                         % (self._utf8_path(dest_path),
3689                            src_revnum,
3690                            self._utf8_path(src_path)))
3691
3692   def finish(self):
3693     """Perform any cleanup necessary after all revisions have been
3694     committed."""
3695     self.dumpfile.close()
3696
3697
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting.

  Every commit is first written to a temporary dumpfile and then fed
  to a single long-running 'svnadmin load' process."""

  def __init__(self):
    """Create the target repository (unless Ctx().existing_svnrepos is
    set), open the temporary dumpfile and start 'svnadmin load'.

    Exits the program if the dumpfile header cannot be written to the
    loader."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      if Ctx().fs_type and Ctx().fs_type != 'bdb':
        # User specified something other than bdb.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--fs-type=%s" % Ctx().fs_type,
                                           self.target))
      elif Ctx().fs_type:
        # User explicitly specified bdb.
        #
        # Since this is a BDB repository, pass --bdb-txn-nosync,
        # because it gives us a 4-5x speed boost (if cvs2svn is
        # creating the repository, cvs2svn should be the only program
        # accessing the svn repository (until cvs is done, at least)).
        # But we'll turn no-sync off in self.finish(), unless
        # instructed otherwise.
        run_command('%s create %s %s "%s"' % (self.svnadmin,
                                              "--fs-type=bdb",
                                              "--bdb-txn-nosync",
                                              self.target))
      else:
        # User didn't say what kind repository (bdb, fsfs, etc).
        # We still pass --bdb-txn-nosync.  It's a no-op if the default
        # repository type doesn't support it, but we definitely want
        # it if BDB is the default.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--bdb-txn-nosync",
                                           self.target))


    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.
    DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = Popen3('%s load -q "%s"' % (self.svnadmin, self.target),
                              True)
    self.loader_pipe.fromchild.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.tochild)
    except IOError:
      self._die_on_loader_failure()

  def _die_on_loader_failure(self):
    """Write svnadmin's stderr output to our stderr and exit with
    status 1.  Called when writing to the loader pipe fails."""
    sys.stderr.write("%s: svnadmin failed with the following output while "
                     "loading the dumpfile:\n" % (error_prefix))
    sys.stderr.write(self.loader_pipe.childerr.read())
    sys.exit(1)

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe.  Exits the program if the pipe cannot be written."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not len(data):
        break
      try:
        self.loader_pipe.tochild.write(data)
      except IOError:
        self._die_on_loader_failure()

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, first
    feed its dumpfile contents to the svnadmin loader.  Then empty the
    dumpfile and let DumpfileDelegate.start_commit() begin the new
    revision in it."""
    if self._commit_in_progress:
      self._feed_pipe()
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Load the last commit into the repository, shut down the loader
    and remove the temporary dumpfile.

    Exits the program if 'svnadmin load' returns a non-zero status."""
    self._feed_pipe()
    self.dumpfile.close()
    self.loader_pipe.tochild.close()
    error_output = self.loader_pipe.childerr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      sys.exit('%s: svnadmin load failed with exit status: %s\n'
               'and the following output:\n'
               '%s' % (error_prefix, exit_status, error_output))
    os.remove(self.dumpfile_path)

    # If this is a BDB repository, and we created the repository, and
    # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
    # line in the DB_CONFIG file, because txn syncing should be on by
    # default in BDB repositories.
    #
    # We determine if this is a BDB repository by looking for the
    # DB_CONFIG file, which doesn't exist in FSFS, rather than by
    # checking Ctx().fs_type.  That way this code will Do The Right
    # Thing in all circumstances.
    db_config = os.path.join(self.target, "db/DB_CONFIG")
    if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
        and os.path.exists(db_config)):
      no_sync = 'set_flags DB_TXN_NOSYNC\n'

      config_file = open(db_config, 'r')
      try:
        contents = config_file.readlines()
      finally:
        config_file.close()

      # Leave the file alone if the expected line is not there, instead
      # of dying with a ValueError after an otherwise successful load.
      if no_sync in contents:
        contents[contents.index(no_sync)] = '# ' + no_sync
        config_file = open(db_config, 'w')
        try:
          config_file.writelines(contents)
        finally:
          config_file.close()
3811
3812
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """Reports, through Log(), what the SVNRepositoryMirror is doing
  without making any changes to the disk.  Every method merely
  announces the operation it is named after; nothing is actually
  performed here."""

  def __init__(self, total_revs):
    # Total number of revisions, shown in the progress line of
    # start_commit().
    self.total_revs = total_revs

  def start_commit(self, svn_commit):
    """Announce the Subversion revision number of the commit that is
    about to start, together with the total number of revisions."""
    separator = "=" * 60
    Log().write(LOG_VERBOSE, separator)
    progress = "Starting Subversion commit %d / %d" % (svn_commit.revnum,
                                                       self.total_revs)
    Log().write(LOG_NORMAL, progress)

  def mkdir(self, path):
    """Announce the creation of directory PATH."""
    Log().write(LOG_VERBOSE, "  New Directory", path)

  def add_path(self, s_item):
    """Announce the addition of s_item.c_rev.svn_path."""
    svn_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Adding", svn_path)

  def change_path(self, s_item):
    """Announce the modification of s_item.c_rev.svn_path."""
    svn_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Changing", svn_path)

  def delete_path(self, path):
    """Announce the deletion of PATH."""
    Log().write(LOG_VERBOSE, "  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Announce the copy of revision SRC_REVNUM of SRC_PATH to
    DEST_PATH (two log lines: source, then destination)."""
    Log().write(LOG_VERBOSE, "  Copying revision", src_revnum, "of", src_path)
    Log().write(LOG_VERBOSE, "                to", dest_path)

  def finish(self):
    """Announce that the conversion output is complete."""
    Log().write(LOG_VERBOSE, "Finished creating Subversion repository.")
    Log().write(LOG_QUIET, "Done.")
3855
# This should be a local to pass1,
# but Python 2.0 does not support nested scopes.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
def pass1():
  """Pass 1: walk Ctx().cvsroot and parse every ',v' file found.

  Each file is fed to a CollectData sink through
  cvs2svn_rcsparse.parse().  Files that cannot be parsed, and files
  present both in a directory and in its Attic, are recorded as fatal
  errors.  The program exits after the walk if any fatal error was
  recorded or if no ',v' file was found at all."""
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    # os.path.walk() callback; BATON is the CollectData instance.
    cd = baton
    attic_suffix_len = len(OS_SEP_PLUS_ATTIC)
    for fname in files:
      if fname[-2:] != ',v':
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname[-attic_suffix_len:] == OS_SEP_PLUS_ATTIC:
        # drop the 'Attic' portion from the pathname for the canonical name.
        cd.set_fname(os.path.join(dirname[:-attic_suffix_len], fname),
                     pathname)
      else:
        # If this file also exists in the attic, it's a fatal error
        attic_path = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_path):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, pathname, attic_path)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        cd.set_fname(pathname, pathname)
      Log().write(LOG_NORMAL, pathname)
      # Close each RCS file explicitly instead of leaving the handle to
      # the garbage collector.  The open() stays inside the inner try so
      # that a failure to open is still logged by the catch-all handler.
      rcs_file = None
      try:
        try:
          rcs_file = open(pathname, 'rb')
          cvs2svn_rcsparse.parse(rcs_file, cd)
        except (cvs2svn_rcsparse.common.RCSParseError, ValueError,
                RuntimeError):
          err = "%s: '%s' is not a valid ,v file" \
                % (error_prefix, pathname)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        except:
          Log().write(LOG_WARN,
                      "Exception occurred while parsing %s" % pathname)
          raise
      finally:
        if rcs_file is not None:
          rcs_file.close()

  os.path.walk(Ctx().cvsroot, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if len(cd.fatal_errors) > 0:
    sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
             + "Error summary:\n"
             + "\n".join(cd.fatal_errors)
             + "\nExited due to fatal error(s).")

  if cd.found_valid_file is None:
    sys.exit("\nNo RCS files found in your CVS Repository!\n"
             + "Are you absolutely certain you are pointing cvs2svn\n"
             + "at a CVS repository?\n"
             + "\nExited due to fatal error(s).")

  StatsKeeper().reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
3914
def pass2():
  """Pass 2: clean up the revision information.

  Checks the symbol exclusions and forced tags/branches held in Ctx()
  against the symbol database (exiting with status 1 if any check
  fails), creates the tags database, and then rewrites the revisions
  file into the clean-revisions file: revisions on excluded branches
  are dropped, excluded symbols are removed, forced tags/branches are
  converted, and timestamps are resynchronized from the resync file."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Convert the list of regexps to the set of excluded symbol names
  # (used below as a mapping keyed by symbol name).
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags.keys():
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    "Read the .resync file into memory."

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list because
    # we will dynamically expand the lower/upper bound as we find commits
    # that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    for line in fileinput.FileInput(fname):
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      if resync.has_key(digest):
        resync[digest].append([t1_l, t1_u, t2])
      else:
        resync[digest] = [ [t1_l, t1_u, t2] ]

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    digests = resync.keys()
    for digest in digests:
      (resync[digest]).sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  # Filter used to remove all references to excluded tags and branches.
  # EXCLUDES is bound as a default argument because Python 2.0 does not
  # support nested scopes.
  def not_excluded(symbol, excludes=excludes):
    return not excludes.has_key(symbol)

  # process the revisions file, looking for items to clean up
  for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
    c_rev = CVSRevision(Ctx(), line[:-1])

    # Skip this entire revision if it's on an excluded branch
    if excludes.has_key(c_rev.branch_name):
      continue

    # Remove all references to excluded tags and branches
    c_rev.branches = filter(not_excluded, c_rev.branches)
    c_rev.tags = filter(not_excluded, c_rev.tags)

    # Convert all branches that are forced to be tags
    for forced_tag in Ctx().forced_tags:
      if forced_tag in c_rev.branches:
        c_rev.branches.remove(forced_tag)
        c_rev.tags.append(forced_tag)

    # Convert all tags that are forced to be branches
    for forced_branch in Ctx().forced_branches:
      if forced_branch in c_rev.tags:
        c_rev.tags.remove(forced_branch)
        c_rev.branches.append(forced_branch)

    # see if this is "near" any of the resync records we
    # have recorded for this digest [of the log message].
    for record in resync.get(c_rev.digest, []):
      if record[0] <= c_rev.timestamp <= record[1]:
        # bingo! remap the time on this (record[2] is the new time).

        # adjust the time range. we want the COMMIT_THRESHOLD from the
        # bounds of the earlier/latest commit in this group.
        record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
        record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

        # By default this will be the new timestamp
        new_timestamp = record[2]
        # If the new timestamp is earlier than that of our previous revision
        if record[2] < c_rev.prev_timestamp:
          desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                  + " to time %s, which is before previous the time of"
                  + " revision %s (%s):")
          Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                        c_rev.cvs_path, record[2],
                                        c_rev.prev_rev, c_rev.prev_timestamp))
          # If resyncing our rev to c_rev.prev_timestamp + 1 will place
          # the timestamp of c_rev within COMMIT_THRESHOLD of the
          # attempted sync time, then sync back to c_rev.prev_timestamp
          # + 1...
          if (c_rev.prev_timestamp - record[2]) < COMMIT_THRESHOLD:
            new_timestamp = c_rev.prev_timestamp + 1
            Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                          new_timestamp))
          # ...otherwise, make no change
          else:
            new_timestamp = c_rev.timestamp
            Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                        warning_prefix)

        # Report the delta that is actually applied (new_timestamp), which
        # differs from record[2] when the timestamp was adjusted above.
        msg = "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
              % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                 new_timestamp - c_rev.timestamp)
        Log().write(LOG_VERBOSE, msg)

        c_rev.timestamp = new_timestamp

        # stop looking for hits
        break

    output.write(str(c_rev) + "\n")

  # Flush and close explicitly: the next pass sorts this file, so its
  # contents must be completely on disk when we return.
  output.close()
  Log().write(LOG_QUIET, "Done")
4107
def pass3():
  """Pass 3: sort the clean-revisions file into the sorted-revisions
  file and register the result for cleanup."""
  Log().write(LOG_QUIET, "Sorting CVS revisions...")
  clean_revs = temp(DATAFILE + CLEAN_REVS_SUFFIX)
  sorted_revs = temp(DATAFILE + SORTED_REVS_SUFFIX)
  sort_file(clean_revs, sorted_revs)
  Cleanup().register(temp(DATAFILE + SORTED_REVS_SUFFIX), pass5)
  Log().write(LOG_QUIET, "Done")
4114
def pass4():
  """Pass 4: copy the sorted revisions into the CVSRevisionDatabase.

  Unless this is a trunk-only conversion, also build the
  LastSymbolicNameDatabase, which contains the last CVSRevision
  that is a source for each tag or branch.
  """
  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)
  if Ctx().trunk_only:
    # Stand-in whose methods do nothing, so that the loop below does not
    # have to test Ctx().trunk_only for every revision.
    class NullLastSymbolicNameDB:
      def noop(*args): pass
      log_revision = noop
      create_database = noop
    last_sym_name_db = NullLastSymbolicNameDB()
  else:
    Log().write(LOG_QUIET,
        "and finding last CVS revisions for all symbolic names...")
    last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for line in sorted_revs:
    # line[:-1] drops the trailing newline.
    c_rev = CVSRevision(Ctx(), line[:-1])
    cvs_revs_db.log_revision(c_rev)
    last_sym_name_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  last_sym_name_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4145
def pass5():
  """
  Pass 5: generate the SVNCommit <-> CVSRevision mapping databases.

  Every sorted CVSRevision is handed to a CVSRevisionAggregator; in a
  trunk-only conversion, revisions that have a branch name are skipped.
  CVSCommit._commit also calls SymbolingsLogger to register
  CVSRevisions that represent an opening or closing for a path on a
  branch or tag.  See SymbolingsLogger for more details.
  """
  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for line in sorted_revs:
    # line[:-1] drops the trailing newline.
    c_rev = CVSRevision(Ctx(), line[:-1])
    if Ctx().trunk_only and c_rev.branch_name is not None:
      continue
    aggregator.process_revision(c_rev)
  aggregator.flush()

  StatsKeeper().set_svn_rev_count(SVNCommit.revnum - 1)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4165
def pass6():
  """Pass 6: sort the symbol openings/closings file.

  Nothing is sorted in a trunk-only conversion."""
  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  if not Ctx().trunk_only:
    unsorted_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    sorted_path = temp(SYMBOL_OPENINGS_CLOSINGS_SORTED)
    sort_file(unsorted_path, sorted_path)
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), pass8)
  Log().write(LOG_QUIET, "Done")
4174
def pass7():
  """Pass 7: record, for every symbolic name, the file offset at which
  it first appears in the sorted openings/closings file.

  Nothing is done in a trunk-only conversion."""
  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      while 1:
        # Remember where this line starts before reading it.
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only SYM is used; the three-way unpack is kept so that a
        # malformed line still raises ValueError.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if not sym == old_sym:
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4208
def pass8():
  """Pass 8: replay every stored SVNCommit through an
  SVNRepositoryMirror.

  Output goes to a RepositoryDelegate when Ctx().target is set and to
  a DumpfileDelegate otherwise; neither is attached in a dry run.  A
  StdoutDelegate is always attached."""
  svncounter = 2 # Repository initialization is 1.
  repos = SVNRepositoryMirror()
  persistence_manager = PersistenceManager(DB_OPEN_READ)

  if (Ctx().target):
    output_delegate = RepositoryDelegate
    banner = "Starting Subversion Repository."
  else:
    output_delegate = DumpfileDelegate
    banner = "Starting Subversion Dumpfile."
  if not Ctx().dry_run:
    repos.add_delegate(output_delegate())
  Log().write(LOG_QUIET, banner)

  repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  # Commit revisions in order until the persistence manager has no
  # commit for the next number.
  svn_commit = persistence_manager.get_svn_commit(svncounter)
  while svn_commit:
    repos.commit(svn_commit)
    svncounter += 1
    svn_commit = persistence_manager.get_svn_commit(svncounter)

  repos.finish()
4233
# The conversion passes in execution order; pass N is at index N - 1.
_passes = [pass1, pass2, pass3, pass4,
           pass5, pass6, pass7, pass8]
4244
4245
class Ctx:
  """Session state for this run of cvs2svn.  For example, run-time
  options are stored here.  This class is a Borg, see
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531.

  All instances share a single attribute dictionary, so an attribute
  set through one Ctx() is visible through every other Ctx().
  """
  __shared_state = { }
  def __init__(self):
    self.__dict__ = self.__shared_state
    if not self.__dict__:
      # Very first instantiation: fill the shared state with defaults.
      self.__dict__.update({
        'cvsroot': None,
        'target': None,
        'dumpfile': DUMPFILE,
        'tmpdir': '.',
        'verbose': 0,
        'quiet': 0,
        'prune': 1,
        'existing_svnrepos': 0,
        'dump_only': 0,
        'dry_run': 0,
        'trunk_only': 0,
        'trunk_base': "trunk",
        'tags_base': "tags",
        'branches_base': "branches",
        'encoding': "ascii",
        'mime_types_file': None,
        'mime_mapper': None,
        'no_default_eol': 0,
        'eol_from_mime_type': 0,
        'keywords_off': 0,
        'use_cvs': None,
        'svnadmin': "svnadmin",
        'username': None,
        'print_help': 0,
        'skip_cleanup': 0,
        'cvs_revnums': 0,
        'bdb_txn_nosync': 0,
        'fs_type': None,
        'forced_branches': [],
        'forced_tags': [],
        'excludes': [],
        'symbol_transforms': [],
        })
4289
class MimeMapper:
  """A class that provides mappings from file names to MIME types.
  Note that we should really be using Python's 'mimetypes' module.
  See http://cvs2svn.tigris.org/servlets/ReadMsg?list=dev&msgNo=266
  for more."""

  def __init__(self):
    # Maps a file extension (without the dot) or a whole base name to
    # a MIME type string.
    self.mappings = { }

  def set_mime_types_file(self, mime_types_file):
    """Add the mappings found in the file named MIME_TYPES_FILE.

    Lines starting with '#' and lines with fewer than two
    whitespace-separated fields are ignored.  On every other line the
    first field is the MIME type and the remaining fields are the
    extensions mapped to it.  When an extension is mapped to two
    different types, a warning is written to stderr and the later
    mapping wins."""
    # Use a private FileInput instance, as the rest of this file does:
    # fileinput.input() relies on module-global state and fails if
    # another fileinput.input() is still active.
    mime_types = fileinput.FileInput(mime_types_file)
    try:
      for line in mime_types:
        if line.startswith("#"):
          continue

        # format of a line is something like
        # text/plain c h cpp
        fields = line.split()
        if len(fields) < 2:
          continue
        mime_type = fields[0]
        for ext in fields[1:]:
          previous = self.mappings.get(ext)
          if previous is not None and previous != mime_type:
            sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n" \
                             % (warning_prefix, ext, previous, mime_type))
          self.mappings[ext] = mime_type
    finally:
      mime_types.close()


  def get_type_from_filename(self, filename):
    """Return the MIME type mapped to FILENAME, or None if there is no
    mapping for it."""
    basename, extension = os.path.splitext(os.path.basename(filename))

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename
    return self.mappings.get(extension)
4332
4333
4334 def convert(start_pass, end_pass):
4335   "Convert a CVS repository to an SVN repository."
4336
4337   cleanup = Cleanup()
4338   times = [ None ] * (end_pass + 1)
4339   times[start_pass - 1] = time.time()
4340   StatsKeeper().set_start_time(time.time())
4341   for i in range(start_pass - 1, end_pass):
4342     Log().write(LOG_QUIET, '----- pass %d -----' % (i + 1))
4343     _passes[i]()
4344     times[i + 1] = time.time()
4345     StatsKeeper().log_duration_for_pass(times[i + 1] - times[i], i + 1)
4346     # Dispose of items in Ctx() not intended to live past the end of the pass
4347     # (Identified by exactly one leading underscore)
4348     for attr in dir(Ctx()):
4349       if (len(attr) > 2 and attr[0] == '_' and attr[1] != '_'
4350           and not attr[:6] == "_Ctx__"):
4351         delattr(Ctx(), attr)
4352     if not Ctx().skip_cleanup:
4353       cleanup.cleanup(_passes[i])
4354     StatsKeeper().set_end_time(time.time())
4355
4356   Log().write(LOG_QUIET, StatsKeeper())
4357   if end_pass < 4:
4358     Log().write(LOG_QUIET, '(These are unaltered CVS repository stats and do not\n'
4359                 + ' reflect tags or branches excluded via --exclude)\n')
4360   print StatsKeeper().timings()
4361
4362
4363 def usage():
4364   print 'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path' \
4365         % os.path.basename(sys.argv[0])
4366   print '  --help, -h           print this usage message and exit with success'
4367   print '  --version            print the version number'
4368   print '  -q                   quiet'
4369   print '  -v                   verbose'
4370   print '  -s PATH              path for SVN repos'
4371   print '  -p START[:END]       start at pass START, end at pass END of %d' % len(_passes)
4372   print '                       If only START is given, run only pass START'
4373   print '                       (implicitly enables --skip-cleanup)'
4374   print '  --existing-svnrepos  load into existing SVN repository'
4375   print '  --dumpfile=PATH      name of intermediate svn dumpfile'
4376   print '  --tmpdir=PATH        directory to use for tmp data (default to cwd)'
4377   print '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)'
4378   print '  --dry-run            do not create a repository or a dumpfile;'
4379   print '                       just print what would happen.'
4380   print '  --use-cvs            use CVS instead of RCS \'co\' to extract data'
4381   print '                       (only use this if having problems with RCS)'
4382   print '  --svnadmin=PATH      path to the svnadmin program'
4383   print '  --trunk-only         convert only trunk commits, not tags nor branches'
4384   print '  --trunk=PATH         path for trunk (default: %s)'    \
4385         % Ctx().trunk_base
4386   print '  --branches=PATH      path for branches (default: %s)' \
4387         % Ctx().branches_base
4388   print '  --tags=PATH          path for tags (default: %s)'     \
4389         % Ctx().tags_base
4390   print '  --no-prune           don\'t prune empty directories'
4391   print '  --dump-only          just produce a dumpfile, don\'t commit to a repos'
4392   print '  --encoding=ENC       encoding of log messages in CVS repos (default: %s)' \
4393         % Ctx().encoding
4394   print '  --force-branch=NAME  force NAME to be a branch'
4395   print '  --force-tag=NAME     force NAME to be a tag'
4396   print '  --exclude=REGEXP     exclude branches and tags matching REGEXP'
4397   print '  --symbol-transform=P:S transform symbol names from P to S where P and S'
4398   print '                       use Python regexp and reference syntax respectively'
4399   print '  --username=NAME      username for cvs2svn-synthesized commits'
4400   print '  --skip-cleanup       prevent the deletion of intermediate files'
4401   print '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"'
4402   print '  --fs-type=TYPE       pass --fs-type=TYPE to "svnadmin create"'
4403   print '  --cvs-revnums        record CVS revision numbers as file properties'
4404   print '  --mime-types=FILE    specify an apache-style mime.types file for\n' \
4405         '                       setting svn:mime-type'
4406   print '  --eol-from-mime-type set svn:eol-style by mime type (only with --mime-types)'
4407   print '  --no-default-eol     don\'t set svn:eol-style by CVS defaults'
4408   print '  --keywords-off       don\'t set svn:keywords on any files (cvs2svn sets'
4409   print '                       "svn:keywords to author date id" on non-binary files'
4410   print '                       by default)'
4411
4412 def main():
4413   # Convenience var, so we don't have to keep instantiating this Borg.
4414   ctx = Ctx()
4415
4416   profiling = None
4417   start_pass = 1
4418   end_pass = len(_passes)
4419
4420   try:
4421     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
4422                                [ "help", "create", "trunk=",
4423                                  "username=", "existing-svnrepos",
4424                                  "branches=", "tags=", "encoding=",
4425                                  "force-branch=", "force-tag=", "exclude=",
4426                                  "use-cvs", "mime-types=",
4427                                  "eol-from-mime-type", "no-default-eol",
4428                                  "trunk-only", "no-prune", "dry-run",
4429                                  "dump-only", "dumpfile=", "tmpdir=",
4430                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
4431                                  "bdb-txn-nosync", "fs-type=",
4432                                  "version", "profile",
4433                                  "keywords-off", "symbol-transform="])
4434   except getopt.GetoptError, e:
4435     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
4436     usage()
4437     sys.exit(1)
4438
4439   for opt, value in opts:
4440     if opt == '--version':
4441         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
4442         sys.exit(0)
4443     elif opt == '-p':
4444       # Don't cleanup if we're doing incrementals.
4445       ctx.skip_cleanup = 1
4446       if value.find(':') > 0:
4447         start_pass, end_pass = map(int, value.split(':'))
4448       else:
4449         end_pass = start_pass = int(value)
4450       if start_pass > len(_passes) or start_pass < 1:
4451         print '%s: illegal value (%d) for starting pass. '\
4452               'must be 1 through %d.' % (error_prefix, int(start_pass),
4453                                          len(_passes))
4454         sys.exit(1)
4455       if end_pass < start_pass or end_pass > len(_passes):
4456         print '%s: illegal value (%d) for ending pass. ' \
4457               'must be %d through %d.' % (error_prefix, int(end_pass),
4458                                           int(start_pass), len(_passes))
4459         sys.exit(1)
4460     elif (opt == '--help') or (opt == '-h'):
4461       ctx.print_help = 1
4462     elif opt == '-v':
4463       Log().log_level = LOG_VERBOSE
4464       ctx.verbose = 1
4465     elif opt == '-q':
4466       Log().log_level = LOG_QUIET
4467       ctx.quiet = 1
4468     elif opt == '-s':
4469       ctx.target = value
4470     elif opt == '--existing-svnrepos':
4471       ctx.existing_svnrepos = 1
4472     elif opt == '--dumpfile':
4473       ctx.dumpfile = value
4474     elif opt == '--tmpdir':
4475       ctx.tmpdir = value
4476     elif opt == '--use-cvs':
4477       ctx.use_cvs = 1
4478     elif opt == '--svnadmin':
4479       ctx.svnadmin = value
4480     elif opt == '--trunk-only':
4481       ctx.trunk_only = 1
4482     elif opt == '--trunk':
4483       if not value:
4484         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4485       ctx.trunk_base = value
4486     elif opt == '--branches':
4487       if not value:
4488         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4489       ctx.branches_base = value
4490     elif opt == '--tags':
4491       if not value:
4492         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4493       ctx.tags_base = value
4494     elif opt == '--no-prune':
4495       ctx.prune = None
4496     elif opt == '--dump-only':
4497       ctx.dump_only = 1
4498     elif opt == '--dry-run':
4499       ctx.dry_run = 1
4500     elif opt == '--encoding':
4501       ctx.encoding = value
4502     elif opt == '--force-branch':
4503       ctx.forced_branches.append(value)
4504     elif opt == '--force-tag':
4505       ctx.forced_tags.append(value)
4506     elif opt == '--exclude':
4507       try:
4508         ctx.excludes.append(re.compile('^' + value + '$'))
4509       except re.error, e:
4510         sys.exit(error_prefix + ": '%s' is not a valid regexp.\n" % (value))
4511     elif opt == '--mime-types':
4512       ctx.mime_types_file = value
4513     elif opt == '--eol-from-mime-type':
4514       ctx.eol_from_mime_type = 1
4515     elif opt == '--no-default-eol':
4516       ctx.no_default_eol = 1
4517     elif opt == '--keywords-off':
4518       ctx.keywords_off = 1
4519     elif opt == '--username':
4520       ctx.username = value
4521     elif opt == '--skip-cleanup':
4522       ctx.skip_cleanup = 1
4523     elif opt == '--cvs-revnums':
4524       ctx.cvs_revnums = 1
4525     elif opt == '--bdb-txn-nosync':
4526       ctx.bdb_txn_nosync = 1
4527     elif opt == '--fs-type':
4528       ctx.fs_type = value
4529     elif opt == '--create':
4530       sys.stderr.write(warning_prefix +
4531           ': The behaviour produced by the --create option is now the '
4532           'default,\nand passing the option is deprecated.\n')
4533     elif opt == '--profile':
4534       profiling = 1
4535     elif opt == '--symbol-transform':
4536       ctx.symbol_transforms.append(value.split(":"))
4537
4538   if ctx.print_help:
4539     usage()
4540     sys.exit(0)
4541
4542   # Consistency check for options and arguments.
4543   if len(args) == 0:
4544     usage()
4545     sys.exit(1)
4546
4547   if len(args) > 1:
4548     sys.stderr.write(error_prefix +
4549                      ": must pass only one CVS repository.\n")
4550     usage()
4551     sys.exit(1)
4552
4553   ctx.cvsroot = args[0]
4554
4555   if not os.path.isdir(ctx.cvsroot):
4556     sys.stderr.write(error_prefix +
4557                      ": the given CVS repository path '%s' is not an "
4558                      "existing directory.\n" % ctx.cvsroot)
4559     sys.exit(1)
4560
4561   if ctx.use_cvs:
4562     # Ascend above the specified root if necessary, to find the cvs_repository
4563     # (a directory containing a CVSROOT directory) and the cvs_module (the
4564     # path of the conversion root within the cvs repository)
4565     # NB: cvs_module must be seperated by '/' *not* by os.sep .
4566     ctx.cvs_repository = os.path.abspath(ctx.cvsroot)
4567     prev_cvs_repository = None
4568     ctx.cvs_module = ""
4569     while prev_cvs_repository != ctx.cvs_repository:
4570       if os.path.isdir(os.path.join(ctx.cvs_repository, 'CVSROOT')):
4571         break
4572       prev_cvs_repository = ctx.cvs_repository
4573       ctx.cvs_repository, module_component = os.path.split(ctx.cvs_repository)
4574       ctx.cvs_module = module_component + "/" + ctx.cvs_module
4575     else:
4576       # Hit the root (of the drive, on Windows) without finding a CVSROOT dir.
4577       sys.stderr.write(error_prefix +
4578                        ": the path '%s' is not a CVS repository, nor a path " \
4579                        "within a CVS repository.  A CVS repository contains " \
4580                        "a CVSROOT directory within its root directory.\n" \
4581                        % ctx.cvsroot)
4582       sys.exit(1)
4583     os.environ['CVSROOT'] = ctx.cvs_repository
4584
4585   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
4586     sys.stderr.write(error_prefix +
4587                      ": must pass one of '-s' or '--dump-only'.\n")
4588     sys.exit(1)
4589
4590   def not_both(opt1val, opt1name, opt2val, opt2name):
4591     if opt1val and opt2val:
4592       sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
4593                        % (opt1name, opt2name))
4594       sys.exit(1)
4595
4596   not_both(ctx.target, '-s',
4597            ctx.dump_only, '--dump-only')
4598
4599   not_both(ctx.dump_only, '--dump-only',
4600            ctx.existing_svnrepos, '--existing-svnrepos')
4601
4602   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
4603            ctx.existing_svnrepos, '--existing-svnrepos')
4604
4605   not_both(ctx.dump_only, '--dump-only',
4606            ctx.bdb_txn_nosync, '--bdb-txn-nosync')
4607
4608   not_both(ctx.quiet, '-q',
4609            ctx.verbose, '-v')
4610
4611   not_both(ctx.fs_type, '--fs-type',
4612            ctx.existing_svnrepos, '--existing-svnrepos')
4613
4614   if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
4615     sys.stderr.write(error_prefix +
4616                      ": cannot pass --bdb-txn-nosync with --fs-type=%s.\n" \
4617                      % ctx.fs_type)
4618     sys.exit(1)
4619
4620   if ((string.find(ctx.trunk_base, '/') > -1)
4621       or (string.find(ctx.tags_base, '/') > -1)
4622       or (string.find(ctx.branches_base, '/') > -1)):
4623     sys.stderr.write("%s: cannot pass multicomponent path to "
4624                      "--trunk, --tags, or --branches yet.\n"
4625                      "  See http://cvs2svn.tigris.org/issues/show_bug.cgi?"
4626                      "id=7 for details.\n" % error_prefix)
4627     sys.exit(1)
4628
4629   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
4630     sys.stderr.write(error_prefix +
4631                      ": the svn-repos-path '%s' is not an "
4632                      "existing directory.\n" % ctx.target)
4633     sys.exit(1)
4634
4635   if not ctx.dump_only and not ctx.existing_svnrepos \
4636      and (not ctx.dry_run) and os.path.exists(ctx.target):
4637     sys.stderr.write(error_prefix +
4638                      ": the svn-repos-path '%s' exists.\nRemove it, or pass "
4639                      "'--existing-svnrepos'.\n" % ctx.target)
4640     sys.exit(1)
4641
4642   if ctx.mime_types_file:
4643     ctx.mime_mapper = MimeMapper()
4644     ctx.mime_mapper.set_mime_types_file(ctx.mime_types_file)
4645
4646   # Make sure the tmp directory exists.  Note that we don't check if
4647   # it's empty -- we want to be able to use, for example, "." to hold
4648   # tempfiles.  But if we *did* want check if it were empty, we'd do
4649   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
4650   if not os.path.exists(ctx.tmpdir):
4651     os.mkdir(ctx.tmpdir)
4652   elif not os.path.isdir(ctx.tmpdir):
4653     sys.stderr.write(error_prefix +
4654        ": cvs2svn tried to use '%s' for temporary files, but that path\n"
4655        "  exists and is not a directory.  Please make it be a directory,\n"
4656        "  or specify some other directory for temporary files.\n" \
4657                      % ctx.tmpdir)
4658     sys.exit(1)
4659
4660   if ctx.use_cvs:
4661     def cvs_ok():
4662       pipe = Popen3('cvs %s --version' % Ctx().cvs_global_arguments, True)
4663       pipe.tochild.close()
4664       pipe.fromchild.read()
4665       errmsg = pipe.childerr.read()
4666       status = pipe.wait()
4667       ok = len(errmsg) == 0 and status == 0
4668       return (ok, status, errmsg)
4669
4670     ctx.cvs_global_arguments = "-q -R"
4671     ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4672     if not ok:
4673       ctx.cvs_global_arguments = "-q"
4674       ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4675
4676     if not ok:
4677       sys.stderr.write(error_prefix +
4678                        ": error executing CVS: status %s, error output:\n" \
4679                        % (cvs_exitstatus) + cvs_errmsg)
4680
4681   # But do lock the tmpdir, to avoid process clash.
4682   try:
4683     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4684   except OSError, e:
4685     if e.errno == errno.EACCES:
4686       sys.stderr.write(error_prefix + ": Permission denied:"
4687                        + " No write access to output directory.\n")
4688       sys.exit(1)
4689     if e.errno == errno.EEXIST:
4690       sys.stderr.write(error_prefix +
4691           ": cvs2svn is using directory '%s' for temporary files, but\n"
4692           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
4693           "  cvs2svn process is currently using '%s' as its temporary\n"
4694           "  workspace.  If you are certain that is not the case,\n"
4695           "  then remove the '%s/cvs2svn.lock' subdirectory.\n" \
4696                        % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir))
4697       sys.exit(1)
4698     raise
4699   try:
4700     if profiling:
4701       import hotshot
4702       prof = hotshot.Profile('cvs2svn.hotshot')
4703       prof.runcall(convert, start_pass, end_pass)
4704       prof.close()
4705     else:
4706       convert(start_pass, end_pass)
4707   finally:
4708     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4709     except: pass
4710
# Run the converter only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
  main()