cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
  20 VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 import cvs2svn_rcsparse
  23 import os
  24 import sys
  25 import sha
  26 import re
  27 import time
  28 import fileinput
  29 import string
  30 import getopt
  31 import stat
  32 import md5
  33 import marshal
  34 import errno
  35 import popen2
  36
  37 # Warnings and errors start with these strings.  They are typically
  38 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  39 warning_prefix = "WARNING"
  40 error_prefix = "ERROR"
  41
  42 # Make sure this Python is recent enough.
  43 if sys.hexversion < 0x2000000:
  44   sys.stderr.write("'%s: Python 2.0 or higher required, "
  45                    "see www.python.org.\n" % error_prefix)
  46   sys.exit(1)
  47
  48 # Pretend we have true booleans on older python versions
  49 try:
  50   True
  51 except:
  52   True = 1
  53   False = 0
  54
  55 # Minimal, incomplete, version of popen2.Popen3 for those platforms
  56 # for which popen2 does not provide it.
  57 try:
  58   Popen3 = popen2.Popen3
  59 except AttributeError:
  60   class Popen3:
  61     def __init__(self, cmd, capturestderr):
  62       if type(cmd) != str:
  63         cmd = " ".join(cmd)
  64       self.fromchild, self.tochild, self.childerr = popen2.popen3(cmd,
  65                                                                   mode='b')
  66     def wait(self):
  67       return self.fromchild.close() or self.tochild.close() or \
  68              self.childerr.close()
  69
  70 # DBM module selection
  71
  72 # 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
  73 #    so that the dbhash module used by anydbm will use bsddb3.
  74 try:
  75   import bsddb3
  76   sys.modules['bsddb'] = sys.modules['bsddb3']
  77 except ImportError:
  78   pass
  79
  80 # 2. These DBM modules are not good for cvs2svn.
  81 import anydbm
  82 if (anydbm._defaultmod.__name__ == 'dumbdbm'
  83     or anydbm._defaultmod.__name__ == 'dbm'):
  84   print 'ERROR: your installation of Python does not contain a suitable'
  85   print '  DBM module. This script cannot continue.'
  86   print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  87   print '  for details.'
  88   sys.exit(1)
  89
  90 # 3. If we are using the old bsddb185 module, then try prefer gdbm instead.
  91 #    Unfortunately, gdbm appears not to be trouble free, either.
  92 if hasattr(anydbm._defaultmod, 'bsddb') \
  93     and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  94   try:
  95     gdbm = __import__('gdbm')
  96   except ImportError:
  97     sys.stderr.write(warning_prefix +
  98         ': The version of the bsddb module found '
  99         'on your computer has been reported to malfunction on some datasets, '
 100         'causing KeyError exceptions. You may wish to upgrade your Python to '
 101         'version 2.3 or later.\n')
 102   else:
 103     anydbm._defaultmod = gdbm
 104
 105 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
 106 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
 107 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
 108
 109 # This really only matches standard '1.1.1.*'-style vendor revisions.
 110 # One could conceivably have a file whose default branch is 1.1.3 or
 111 # whatever, or was that at some point in time, with vendor revisions
 112 # 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
 113 # is the only time this regexp gets used), we'd have no basis for
 114 # assuming that the non-standard vendor branch had ever been the
 115 # default branch anyway, so we don't want this to match them anyway.
 116 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 117
 118 # If this run's output is a repository, then (in the tmpdir) we use
 119 # a dumpfile of this name for repository loads.
 120 #
  121 # If this run's output is a dumpfile, then this is the default name of
 122 # that dumpfile, but in the current directory (unless the user has
 123 # specified a dumpfile path, of course, in which case it will be
 124 # wherever the user said).
 125 DUMPFILE = 'cvs2svn-dump'
 126
 127 # This file appears with different suffixes at different stages of
 128 # processing.  CVS revisions are cleaned and sorted here, for commit
 129 # grouping.  See design-notes.txt for details.
 130 DATAFILE = 'cvs2svn-data'
 131
 132 # This file contains a marshalled copy of all the statistics that we
  133 # gather throughout the various runs of cvs2svn.  The data is stored as a
 134 # marshalled dictionary.
 135 STATISTICS_FILE = 'cvs2svn-statistics'
 136
 137 # This text file contains records (1 per line) that describe svn
 138 # filesystem paths that are the opening and closing source revisions
 139 # for copies to tags and branches.  The format is as follows:
 140 #
 141 # SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
 142 #
 143 # Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
 144 # SVN_REVNUM are the primary and secondary sorting criteria for
 145 # creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
 146 SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
 147 # A sorted version of the above file.
 148 SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'
 149
 150 # This file is a temporary file for storing symbolic_name -> closing
 151 # CVSRevision until the end of our pass where we can look up the
 152 # corresponding SVNRevNum for the closing revs and write these out to
 153 # the SYMBOL_OPENINGS_CLOSINGS.
 154 SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'
 155
 156 # Skeleton version of an svn filesystem.
 157 # (These supersede and will eventually replace the two above.)
 158 # See class SVNRepositoryMirror for how these work.
 159 SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
 160 SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'
 161
 162 # Offsets pointing to the beginning of each SYMBOLIC_NAME in
 163 # SYMBOL_OPENINGS_CLOSINGS_SORTED
 164 SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'
 165
 166 # Maps CVSRevision.unique_key()s to lists of symbolic names, where
 167 # the CVSRevision is the last such that is a source for those symbolic
 168 # names.  For example, if branch B's number is 1.3.0.2 in this CVS
 169 # file, and this file's 1.3 is the latest (by date) revision among
 170 # *all* CVS files that is a source for branch B, then the
 171 # CVSRevision.unique_key() corresponding to this file at 1.3 would
 172 # list at least B in its list.
 173 SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'
 174
 175 # Maps CVSRevision.unique_key() to corresponding line in s-revs.
 176 ###PERF Or, we could map to an offset into s-revs, instead of dup'ing
 177 ### the s-revs data in this database.
 178 CVS_REVS_DB = 'cvs2svn-cvs-revs.db'
 179
 180 # Lists all symbolic names that are tags.  Keys are strings (symbolic
 181 # names), values are ignorable.
 182 TAGS_DB = 'cvs2svn-tags.db'
 183
  184 # A list of all tags.  Each line consists of the tag name and the number
 185 # of files in which it exists, separated by a space.
 186 TAGS_LIST = 'cvs2svn-tags.txt'
 187
 188 # A list of all branches.  The file is stored as a plain text file
 189 # to make it easy to look at in an editor.  Each line contains the
 190 # branch name, the number of files where the branch is created, the
 191 # commit count, and a list of tags and branches that are defined on
 192 # revisions in the branch.
 193 BRANCHES_LIST = 'cvs2svn-branches.txt'
 194
 195 # These two databases provide a bidirectional mapping between
 196 # CVSRevision.unique_key()s and Subversion revision numbers.
 197 #
 198 # The first maps CVSRevision.unique_key() to a number; the values are
 199 # not unique.
 200 #
 201 # The second maps a number to a list of CVSRevision.unique_key()s.
 202 CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
 203 SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'
 204
 205 # This database maps svn_revnums to tuples of (symbolic_name, date).
 206 #
 207 # The svn_revnums are the revision numbers of all non-primary
 208 # SVNCommits.  No primary SVNCommit has a key in this database.
 209 #
 210 # The date is stored for all commits in this database.
 211 #
 212 # For commits that fill symbolic names, the symbolic_name is stored.
  213 # For commits that are default branch syncs, the symbolic_name is None.
 214 SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'
 215
 216 # This database maps svn_revnums of a default branch synchronization
 217 # commit to the svn_revnum of the primary SVNCommit that motivated it.
 218 #
 219 # (NOTE: Secondary commits that fill branches and tags also have a
 220 # motivating commit, but we do not record it because it is (currently)
 221 # not needed for anything.)
 222 #
 223 # This mapping is used when generating the log message for the commit
 224 # that synchronizes the default branch with trunk.
 225 MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'
 226
 227 # How many bytes to read at a time from a pipe.  128 kiB should be
 228 # large enough to be efficient without wasting too much memory.
 229 PIPE_READ_SIZE = 128 * 1024
 230
 231 # Record the default RCS branches, if any, for CVS filepaths.
 232 #
 233 # The keys are CVS filepaths, relative to the top of the repository
 234 # and with the ",v" stripped off, so they match the cvs paths used in
 235 # Commit.commit().  The values are vendor branch revisions, such as
 236 # '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
 237 # represents the highest vendor branch revision thought to have ever
 238 # been head of the default branch.
 239 #
 240 # The reason we record a specific vendor revision, rather than a
 241 # default branch number, is that there are two cases to handle:
 242 #
 243 # One case is simple.  The RCS file lists a default branch explicitly
 244 # in its header, such as '1.1.1'.  In this case, we know that every
 245 # revision on the vendor branch is to be treated as head of trunk at
 246 # that point in time.
 247 #
 248 # But there's also a degenerate case.  The RCS file does not currently
 249 # have a default branch, yet we can deduce that for some period in the
 250 # past it probably *did* have one.  For example, the file has vendor
 251 # revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
 252 # and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
 253 # case, we should record 1.1.1.96 as the last vendor revision to have
 254 # been the head of the default branch.
 255 DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'
 256
 257 # Records the author and log message for each changeset.
 258 # The keys are author+log digests, the same kind used to identify
 259 # unique revisions in the .revs, etc files.  Each value is a tuple
 260 # of two elements: '(author logmessage)'.
 261 METADATA_DB = "cvs2svn-metadata.db"
 262
 263 REVS_SUFFIX = '.revs'
 264 CLEAN_REVS_SUFFIX = '.c-revs'
 265 SORTED_REVS_SUFFIX = '.s-revs'
 266 RESYNC_SUFFIX = '.resync'
 267
 268 SVN_INVALID_REVNUM = -1
 269
 270 COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs
 271
 272 # Things that can happen to a file.
 273 OP_NOOP   = '-'
 274 OP_ADD    = 'A'
 275 OP_DELETE = 'D'
 276 OP_CHANGE = 'C'
 277
 278 # A deltatext either does or doesn't represent some change.
 279 DELTATEXT_NONEMPTY = 'N'
 280 DELTATEXT_EMPTY    = 'E'
 281
 282 DIGEST_END_IDX = 9 + (sha.digestsize * 2)
 283
 284 # Constants used in SYMBOL_OPENINGS_CLOSINGS
 285 OPENING = 'O'
 286 CLOSING = 'C'
 287
 288 def temp(basename):
 289   """Return a path to BASENAME in Ctx().tmpdir.
 290   This is a convenience function to save horizontal space in source."""
 291   return os.path.join(Ctx().tmpdir, basename)
 292
 293 # Since the unofficial set also includes [/\] we need to translate those
 294 # into ones that don't conflict with Subversion limitations.
 295 def _clean_symbolic_name(name):
 296   """Return symbolic name NAME, translating characters that Subversion
 297   does not allow in a pathname."""
 298   name = name.replace('/','++')
 299   name = name.replace('\\','--')
 300   return name
 301
 302 def _path_join(*components):
 303   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 304   Empty component are skipped."""
 305   return string.join(filter(None, components), '/')
 306
 307 def run_command(command):
 308   if os.system(command):
 309     sys.exit('Command failed: "%s"' % command)
 310
 311 def relative_name(cvsroot, fname):
 312   l = len(cvsroot)
 313   if fname[:l] == cvsroot:
 314     if fname[l] == os.sep:
 315       return string.replace(fname[l+1:], os.sep, '/')
 316     return string.replace(fname[l:], os.sep, '/')
 317   sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
 318                    " cvsroot\n" % (error_prefix, cvsroot, fname))
 319   sys.exit(1)
 320
 321 def get_co_pipe(c_rev, extra_arguments=''):
 322   """Return a command string, and the pipe created using that string.
 323   C_REV is a CVSRevision, and EXTRA_ARGUMENTS is used to add extra
 324   arguments.  The pipe returns the text of that CVS Revision."""
 325   ctx = Ctx()
 326   if ctx.use_cvs:
 327     pipe_cmd = 'cvs %s co -r%s -p %s %s' % \
 328                (ctx.cvs_global_arguments, c_rev.rev, extra_arguments,
 329                 escape_shell_arg(ctx.cvs_module + c_rev.cvs_path))
 330   else:
 331     pipe_cmd = 'co -q -x,v -p%s %s %s' % \
 332                (c_rev.rev, extra_arguments, escape_shell_arg(c_rev.rcs_path()))
 333   pipe = Popen3(pipe_cmd, True)
 334   pipe.tochild.close()
 335   return pipe_cmd, pipe
 336
 337 def generate_ignores(c_rev):
 338   # Read in props
 339   pipe_cmd, pipe = get_co_pipe(c_rev)
 340   buf = pipe.fromchild.read(PIPE_READ_SIZE)
 341   raw_ignore_val = ""
 342   while buf:
 343     raw_ignore_val = raw_ignore_val + buf
 344     buf = pipe.fromchild.read(PIPE_READ_SIZE)
 345   pipe.fromchild.close()
 346   error_output = pipe.childerr.read()
 347   exit_status = pipe.wait()
 348   if exit_status:
 349     sys.exit("%s: The command '%s' failed with exit status: %s\n"
 350              "and the following output:\n"
 351              "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
 352
 353   # Tweak props: First, convert any spaces to newlines...
 354   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 355   raw_ignores = raw_ignore_val.split('\n')
 356   ignore_vals = [ ]
 357   for ignore in raw_ignores:
 358     # Reset the list if we encounter a '!'
 359     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 360     if ignore == '!':
 361       ignore_vals = [ ]
 362       continue
 363     # Skip empty lines
 364     if len(ignore) == 0:
 365       continue
 366     ignore_vals.append(ignore)
 367   return ignore_vals
 368
 369 # Return a string that has not been returned by gen_key() before.
 370 gen_key_base = 0L
 371 def gen_key():
 372   global gen_key_base
 373   key = '%x' % gen_key_base
 374   gen_key_base = gen_key_base + 1
 375   return key
 376
 377 # ============================================================================
 378 # This code is copied with a few modifications from:
 379 #   subversion/subversion/bindings/swig/python/svn/core.py
 380
 381 if sys.platform == "win32":
 382   _escape_shell_arg_re = re.compile(r'(\\+)(\"|$)')
 383
 384   def escape_shell_arg(arg):
 385     # The (very strange) parsing rules used by the C runtime library are
 386     # described at:
 387     # http://msdn.microsoft.com/library/en-us/vclang/html/_pluslang_Parsing_C.2b2b_.Command.2d.Line_Arguments.asp
 388
 389     # double up slashes, but only if they are followed by a quote character
 390     arg = re.sub(_escape_shell_arg_re, r'\1\1\2', arg)
 391
 392     # surround by quotes and escape quotes inside
 393     arg = '"' + string.replace(arg, '"', '"^""') + '"'
 394     return arg
 395
 396
 397   def argv_to_command_string(argv):
 398     """Flatten a list of command line arguments into a command string.
 399
 400     The resulting command string is expected to be passed to the system
 401     shell which os functions like popen() and system() invoke internally.
 402     """
 403
 404     # According cmd's usage notes (cmd /?), it parses the command line by
 405     # "seeing if the first character is a quote character and if so, stripping
 406     # the leading character and removing the last quote character."
 407     # So to prevent the argument string from being changed we add an extra set
 408     # of quotes around it here.
 409     return '"' + string.join(map(escape_shell_arg, argv), " ") + '"'
 410
 411 else:
 412   def escape_shell_arg(str):
 413     return "'" + string.replace(str, "'", "'\\''") + "'"
 414
 415   def argv_to_command_string(argv):
 416     """Flatten a list of command line arguments into a command string.
 417
 418     The resulting command string is expected to be passed to the system
 419     shell which os functions like popen() and system() invoke internally.
 420     """
 421
 422     return string.join(map(escape_shell_arg, argv), " ")
 423 # ============================================================================
 424
 425 def format_date(date):
 426   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 427   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 428   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 429
 430 def sort_file(infile, outfile):
 431   # sort the log files
 432
 433   # GNU sort will sort our dates differently (incorrectly!) if our
 434   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 435   # it to 'C'
 436   if os.environ.has_key('LC_ALL'):
 437     lc_all_tmp = os.environ['LC_ALL']
 438   else:
 439     lc_all_tmp = None
 440   os.environ['LC_ALL'] = 'C'
 441   # The -T option to sort has a nice side effect.  The Win32 sort is
 442   # case insensitive and cannot be used, and since it does not
 443   # understand the -T option and dies if we try to use it, there is
 444   # no risk that we use that sort by accident.
 445   run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 446   if lc_all_tmp is None:
 447     del os.environ['LC_ALL']
 448   else:
 449     os.environ['LC_ALL'] = lc_all_tmp
 450
 451 def print_node_tree(tree, root_node, indent_depth=0):
 452   """For debugging purposes.  Prints all nodes in TREE that are
 453   rooted at ROOT_NODE.  INDENT_DEPTH is merely for purposes of
 454   debugging with the print statement in this function."""
 455   if not indent_depth:
 456     print "TREE", "=" * 75
 457   print "TREE:", " " * (indent_depth * 2), root_node, tree[root_node]
 458   for key, value in tree[root_node].items():
 459     if key[0] == '/': #Skip flags
 460       continue
 461     print_node_tree(tree, value, (indent_depth + 1))
 462
 463 def match_regexp_list(regexp_list, string):
 464   """Test whether STRING matches any of the compiled regexps in REGEXP_LIST."""
 465   for regexp in regexp_list:
 466     if regexp.match(string):
 467       return True
 468   return False
 469
 470 class LF_EOL_Filter:
 471   """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
 472   into LFs only."""
 473   def __init__(self, stream):
 474     self.stream = stream
 475     self.carry_cr = False
 476     self.eof = False
 477
 478   def read(self, size):
 479     while True:
 480       buf = self.stream.read(size)
 481       self.eof = len(buf) == 0
 482       if self.carry_cr:
 483         buf = '\r' + buf
 484         self.carry_cr = False
 485       if not self.eof and buf[-1] == '\r':
 486         self.carry_cr = True
 487         buf = buf[:-1]
 488       buf = string.replace(buf, '\r\n', '\n')
 489       buf = string.replace(buf, '\r', '\n')
 490       if len(buf) > 0 or self.eof:
 491         return buf
 492
 493
 494 # These constants represent the log levels that this script supports
 495 LOG_WARN = -1
 496 LOG_QUIET = 0
 497 LOG_NORMAL = 1
 498 LOG_VERBOSE = 2
 499 class Log:
 500   """A Simple logging facility.  Each line will be timestamped is
 501   self.use_timestamps is TRUE.  This class is a Borg, see
 502   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 503   __shared_state = {}
 504   def __init__(self):
 505     self.__dict__ = self.__shared_state
 506     if self.__dict__:
 507       return
 508     self.log_level = LOG_NORMAL
 509     # Set this to true if you want to see timestamps on each line output.
 510     self.use_timestamps = None
 511     self.logger = sys.stdout
 512
 513   def _timestamp(self):
 514     """Output a detailed timestamp at the beginning of each line output."""
 515     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 516
 517   def write(self, log_level, *args):
 518     """This is the public method to use for writing to a file.  Only
 519     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 520     there are multiple ARGS, they will be separated by a space."""
 521     if log_level > self.log_level:
 522       return
 523     if self.use_timestamps:
 524       self._timestamp()
 525     self.logger.write(' '.join(map(str,args)) + "\n")
 526     # Ensure that log output doesn't get out-of-order with respect to
 527     # stderr output.
 528     self.logger.flush()
 529
 530
 531 class Cleanup:
 532   """This singleton class manages any files created by cvs2svn.  When
 533   you first create a file, call Cleanup.register, passing the
 534   filename, and the last pass that you need the file.  After the end
 535   of that pass, your file will be cleaned up after running an optional
 536   callback.  This class is a Borg, see
 537   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 538
 539   __shared_state = {}
 540   def __init__(self):
 541     self.__dict__ = self.__shared_state
 542     if self.__dict__:
 543       return
 544     self._log = {}
 545     self._callbacks = {}
 546
 547   def register(self, file, which_pass, callback=None):
 548     """Register FILE for cleanup at the end of WHICH_PASS, running
 549     function CALLBACK prior to removal.  Registering a given FILE is
 550     idempotent; you may register as many times as you wish, but it
 551     will only be cleaned up once.
 552
 553     Note that if a file is registered multiple times, only the first
 554     callback registered for that file will be called at cleanup
 555     time.  Also note that if you register a database file you must
 556     close the database before cleanup, e.g. using a callback."""
 557     if not self._log.has_key(which_pass):
 558       self._log[which_pass] = {}
 559     self._log[which_pass][file] = 1
 560     if callback and not self._callbacks.has_key(file):
 561       self._callbacks[file] = callback
 562
 563   def cleanup(self, which_pass):
 564     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 565     if not self._log.has_key(which_pass):
 566       return
 567     for file in self._log[which_pass].keys():
 568       Log().write(LOG_VERBOSE, "Deleting", file)
 569       if self._callbacks.has_key(file):
 570         self._callbacks[file]()
 571       os.unlink(file)
 572
 573
 574 # Always use these constants for opening databases.
 575 DB_OPEN_READ = 'r'
 576 DB_OPEN_NEW = 'n'
 577
 578 # A wrapper for anydbm that uses the marshal module to store items as
 579 # strings.
 580 class Database:
 581   def __init__(self, filename, mode):
 582     # pybsddb3 has a bug which prevents it from working with
 583     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 584     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 585     # for databases protected by lock and transaction support
 586     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 587     #
 588     # Therefore, manually perform the removal (we can do this, because
 589     # we know that for bsddb - but *not* anydbm in general - the database
 590     # consists of one file with the name we specify, rather than several
 591     # based on that name).
 592     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 593       if os.path.isfile(filename):
 594         os.unlink(filename)
 595       mode = 'c'
 596
 597     self.db = anydbm.open(filename, mode)
 598
 599   def has_key(self, key):
 600     return self.db.has_key(key)
 601
 602   def __getitem__(self, key):
 603     return marshal.loads(self.db[key])
 604
 605   def __setitem__(self, key, value):
 606     self.db[key] = marshal.dumps(value)
 607
 608   def __delitem__(self, key):
 609     del self.db[key]
 610
 611   def get(self, key, default):
 612     if self.has_key(key):
 613       return self.__getitem__(key)
 614     return default
 615
 616
 617 class StatsKeeper:
 618   __shared_state = { }
 619   def __init__(self):
 620     self.__dict__ = self.__shared_state
 621     if self.__dict__:
 622       return
 623     self.filename = temp(STATISTICS_FILE)
 624     Cleanup().register(self.filename, pass8)
 625     # This can get kinda large, so we don't store it in our data dict.
 626     self.repos_files = { }
 627
 628     if os.path.exists(self.filename):
 629       self.unarchive()
 630     else:
 631       self.data = { 'cvs_revs_count' : 0,
 632                     'tags': { },
 633                     'branches' : { },
 634                     'repos_size' : 0,
 635                     'repos_file_count' : 0,
 636                     'svn_rev_count' : None,
 637                     'first_rev_date' : 1L<<32,
 638                     'last_rev_date' : 0,
 639                     'pass_timings' : { },
 640                     'start_time' : 0,
 641                     'end_time' : 0,
 642                     }
 643
 644   def log_duration_for_pass(self, duration, pass_num):
 645     self.data['pass_timings'][pass_num] = duration
 646
 647   def set_start_time(self, start):
 648     self.data['start_time'] = start
 649
 650   def set_end_time(self, end):
 651     self.data['end_time'] = end
 652
 653   def _bump_item(self, key, amount=1):
 654     self.data[key] = self.data[key] + amount
 655
 656   def reset_c_rev_info(self):
 657     self.data['cvs_revs_count'] = 0
 658     self.data['tags'] = { }
 659     self.data['branches'] = { }
 660
 661   def record_c_rev(self, c_rev):
 662     self._bump_item('cvs_revs_count')
 663
 664     for tag in c_rev.tags:
 665       self.data['tags'][tag] = None
 666     for branch in c_rev.branches:
 667       self.data['branches'][branch] = None
 668
 669     if c_rev.timestamp < self.data['first_rev_date']:
 670       self.data['first_rev_date'] = c_rev.timestamp
 671
 672     if c_rev.timestamp > self.data['last_rev_date']:
 673       self.data['last_rev_date'] = c_rev.timestamp
 674
 675     # Only add the size if this is the first time we see the file.
 676     if not self.repos_files.has_key(c_rev.fname):
 677       self._bump_item('repos_size', c_rev.file_size)
 678     self.repos_files[c_rev.fname] = None
 679
 680     self.data['repos_file_count'] = len(self.repos_files)
 681
 682   def set_svn_rev_count(self, count):
 683     self.data['svn_rev_count'] = count
 684
 685   def svn_rev_count(self):
 686     return self.data['svn_rev_count']
 687
 688   def archive(self):
 689     open(self.filename, 'w').write(marshal.dumps(self.data))
 690
 691   def unarchive(self):
 692     self.data = marshal.loads(open(self.filename, 'r').read())
 693
 694   def __str__(self):
 695     svn_revs_str = ""
 696     if self.data['svn_rev_count'] is not None:
 697       svn_revs_str = ('Total SVN Commits:      %10s\n'
 698                       % self.data['svn_rev_count'])
 699
 700     return ('\n'                                \
 701             'cvs2svn Statistics:\n'             \
 702             '------------------\n'              \
 703             'Total CVS Files:        %10i\n'    \
 704             'Total CVS Revisions:    %10i\n'    \
 705             'Total Unique Tags:      %10i\n'    \
 706             'Total Unique Branches:  %10i\n'    \
 707             'CVS Repos Size in KB:   %10i\n'    \
 708             '%s'                                \
 709             'First Revision Date:    %s\n'      \
 710             'Last Revision Date:     %s\n'      \
 711             '------------------'                \
 712             % (self.data['repos_file_count'],
 713                self.data['cvs_revs_count'],
 714                len(self.data['tags']),
 715                len(self.data['branches']),
 716                (self.data['repos_size'] / 1024),
 717                svn_revs_str,
 718                time.ctime(self.data['first_rev_date']),
 719                time.ctime(self.data['last_rev_date']),
 720                ))
 721
 722   def timings(self):
 723     passes = self.data['pass_timings'].keys()
 724     passes.sort()
 725     str = 'Timings:\n------------------\n'
 726
 727     def desc(val):
 728       if val == 1: return "second"
 729       return "seconds"
 730
 731     for pass_num in passes:
 732       duration = int(self.data['pass_timings'][pass_num])
 733       p_str = ('pass %d:%6d %s\n'
 734                % (pass_num, duration, desc(duration)))
 735       str = str + p_str
 736
 737     total = int(self.data['end_time'] - self.data['start_time'])
 738     str = str + ('total: %6d %s' % (total, desc(total)))
 739     return str
 740
 741
 742 class LastSymbolicNameDatabase:
 743   """ Passing every CVSRevision in s-revs to this class will result in
 744   a Database whose key is the last CVS Revision a symbolicname was
 745   seen in, and whose value is a list of all symbolicnames that were
 746   last seen in that revision."""
 747   def __init__(self, mode):
 748     self.symbols = {}
 749     self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
 750     Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
 751
 752   # Once we've gone through all the revs,
 753   # symbols.keys() will be a list of all tags and branches, and
 754   # their corresponding values will be a key into the last CVS revision
 755   # that they were used in.
 756   def log_revision(self, c_rev):
 757     # Gather last CVS Revision for symbolic name info and tag info
 758     for tag in c_rev.tags:
 759       self.symbols[tag] = c_rev.unique_key()
 760     if c_rev.op is not OP_DELETE:
 761       for branch in c_rev.branches:
 762         self.symbols[branch] = c_rev.unique_key()
 763
 764   # Creates an inversion of symbols above--a dictionary of lists (key
 765   # = CVS rev unique_key: val = list of symbols that close in that
 766   # rev.
 767   def create_database(self):
 768     for sym, rev_unique_key in self.symbols.items():
 769       if self.symbol_revs_db.has_key(rev_unique_key):
 770         ary = self.symbol_revs_db[rev_unique_key]
 771         ary.append(sym)
 772         self.symbol_revs_db[rev_unique_key] = ary
 773       else:
 774         self.symbol_revs_db[rev_unique_key] = [sym]
 775
 776
 777 class CVSRevisionDatabase:
 778   """A Database to store CVSRevision objects and retrieve them by their
 779   unique_key()."""
 780
 781   def __init__(self, mode):
 782     """Initialize an instance, opening database in MODE (like the MODE
 783     argument to Database or anydbm.open())."""
 784     self.cvs_revs_db = Database(temp(CVS_REVS_DB), mode)
 785     Cleanup().register(temp(CVS_REVS_DB), pass8)
 786
 787   def log_revision(self, c_rev):
 788     """Add C_REV, a CVSRevision, to the database."""
 789     self.cvs_revs_db[c_rev.unique_key()] = str(c_rev)
 790
 791   def get_revision(self, unique_key):
 792     """Return the CVSRevision stored under UNIQUE_KEY."""
 793     return CVSRevision(Ctx(), self.cvs_revs_db[unique_key])
 794
 795
 796 class TagsDatabase(Database):
 797   """A Database to store which symbolic names are tags.
 798   Each key is a tag name.
 799   The value has no meaning, and should be set to None."""
 800   def __init__(self, mode):
 801     Database.__init__(self, temp(TAGS_DB), mode)
 802     Cleanup().register(temp(TAGS_DB), pass8)
 803
 804
 805 class CVSRevision:
 806   def __init__(self, ctx, *args):
 807     """Initialize a new CVSRevision with Ctx object CTX, and ARGS.
 808
 809     If CTX is None, the following members and methods of the
 810     instantiated CVSRevision class object will be unavailable (or
 811     simply will not work correctly, if at all):
 812        cvs_path
 813        svn_path
 814        svn_trunk_path
 815        is_default_branch_revision()
 816
 817     (Note that this class treats CTX as const, because the caller
 818     likely passed in a Borg instance of a Ctx.  The reason this class
 819     takes CTX as as a parameter, instead of just instantiating a Ctx
 820     itself, is that this class should be usable outside cvs2svn.)
 821
 822     If there is one argument in ARGS, it is a string, in the format of
 823     a line from a revs file.  Do *not* include a trailing newline.
 824
 825     If there are multiple ARGS, there must be 16 of them,
 826     comprising a parsed revs line:
 827        timestamp       -->  (int) date stamp for this cvs revision
 828        digest          -->  (string) digest of author+logmsg
 829        prev_timestamp  -->  (int) date stamp for the previous cvs revision
 830        op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
 831        prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
 832        rev             -->  (string) this CVS rev, e.g., "1.3"
 833        next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
 834        file_in_attic   -->  (char or None) true if RCS file is in Attic
 835        file_executable -->  (char or None) true if RCS file has exec bit set.
 836        file_size       -->  (int) size of the RCS file
 837        deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
 838        mode            -->  (string or None) "kkv", "kb", etc.
 839        branch_name     -->  (string or None) branch on which this rev occurred
 840        tags            -->  (list of strings) all tags on this revision
 841        branches        -->  (list of strings) all branches rooted in this rev
 842        fname           -->  (string) relative path of file in CVS repos
 843
 844     The two forms of initialization are equivalent."""
 845
 846     self._ctx = ctx
 847     if len(args) == 16:
 848       (self.timestamp, self.digest, self.prev_timestamp, self.op,
 849        self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
 850        self.file_executable, self.file_size, self.deltatext_code,
 851        self.fname,
 852        self.mode, self.branch_name, self.tags, self.branches) = args
 853     elif len(args) == 1:
 854       data = args[0].split(' ', 14)
 855       (self.timestamp, self.digest, self.prev_timestamp, self.op,
 856        self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
 857        self.file_executable, self.file_size, self.deltatext_code,
 858        self.mode, self.branch_name, numtags, remainder) = data
 859       # Patch up data items which are not simple strings
 860       self.timestamp = int(self.timestamp, 16)
 861       if self.prev_timestamp == "*":
 862         self.prev_timestamp = 0
 863       else:
 864         self.prev_timestamp = int(self.prev_timestamp)
 865       if self.prev_rev == "*":
 866         self.prev_rev = None
 867       if self.next_rev == "*":
 868         self.next_rev = None
 869       if self.file_in_attic == "*":
 870         self.file_in_attic = None
 871       if self.file_executable == "*":
 872         self.file_executable = None
 873       self.file_size = int(self.file_size)
 874       if self.mode == "*":
 875         self.mode = None
 876       if self.branch_name == "*":
 877         self.branch_name = None
 878       numtags = int(numtags)
 879       tags_and_numbranches_and_remainder = remainder.split(' ', numtags + 1)
 880       self.tags = tags_and_numbranches_and_remainder[:-2]
 881       numbranches = int(tags_and_numbranches_and_remainder[-2])
 882       remainder = tags_and_numbranches_and_remainder[-1]
 883       branches_and_fname = remainder.split(' ', numbranches)
 884       self.branches = branches_and_fname[:-1]
 885       self.fname = branches_and_fname[-1]
 886     else:
 887       raise TypeError, 'CVSRevision() takes 2 or 16 arguments (%d given)' % \
 888           (len(args) + 1)
 889     if ctx is not None:
 890       self.cvs_path = relative_name(self._ctx.cvsroot, self.fname[:-2])
 891       self.svn_path = self._make_path(self.cvs_path, self.branch_name)
 892       self.svn_trunk_path = self._make_path(self.cvs_path)
 893
 894   # The 'primary key' of a CVS Revision is the revision number + the
 895   # filename.  To provide a unique key (say, for a dict), we just glom
 896   # them together in a string.  By passing in self.prev_rev or
 897   # self.next_rev, you can get the unique key for their respective
 898   # CVSRevisions.
 899   def unique_key(self, revnum=None):
 900     if revnum is None:
 901       revnum = self.rev
 902     return revnum + "/" + self.fname
 903
 904   def __str__(self):
 905     return ('%08lx %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s' % (
 906       self.timestamp, self.digest, self.prev_timestamp or "*", self.op,
 907       (self.prev_rev or "*"), self.rev, (self.next_rev or "*"),
 908       (self.file_in_attic or "*"), (self.file_executable or "*"),
 909       self.file_size,
 910       self.deltatext_code, (self.mode or "*"), (self.branch_name or "*"),
 911       len(self.tags), self.tags and " " or "", " ".join(self.tags),
 912       len(self.branches), self.branches and " " or "", " ".join(self.branches),
 913       self.fname, ))
 914
 915   # Returns true if this CVSRevision is the opening CVSRevision for
 916   # NAME (for this RCS file).
 917   def opens_symbolic_name(self, name):
 918     if name in self.tags:
 919       return 1
 920     if name in self.branches:
 921       # If this c_rev opens a branch and our op is OP_DELETE, then
 922       # that means that the file that this c_rev belongs to was
 923       # created on the branch, so for all intents and purposes, this
 924       # c_rev is *technically* not an opening.  See Issue #62 for more
 925       # information.
 926       if self.op != OP_DELETE:
 927         return 1
 928     return 0
 929
 930   def is_default_branch_revision(self):
 931     """Return 1 if SELF.rev of SELF.cvs_path is a default branch
 932     revision according to DEFAULT_BRANCHES_DB (see the conditions
 933     documented there), else return None."""
 934     if self._ctx._default_branches_db.has_key(self.cvs_path):
 935       val = self._ctx._default_branches_db[self.cvs_path]
 936       val_last_dot = val.rindex(".")
 937       our_last_dot = self.rev.rindex(".")
 938       default_branch = val[:val_last_dot]
 939       our_branch = self.rev[:our_last_dot]
 940       default_rev_component = int(val[val_last_dot + 1:])
 941       our_rev_component = int(self.rev[our_last_dot + 1:])
 942       if (default_branch == our_branch
 943           and our_rev_component <= default_rev_component):
 944         return 1
 945     # else
 946     return None
 947
 948   def _make_path(self, path, branch_name = None):
 949     """Return the trunk path or branch path for PATH.
 950
 951     If PATH is None, return None."""
 952     # For a while, we treated each top-level subdir of the CVS
 953     # repository as a "project root" and interpolated the appropriate
 954     # genealogy (trunk|tag|branch) in according to the official
 955     # recommended layout.  For example, the path '/foo/bar/baz.c' on
 956     # branch 'Rel2' would become
 957     #
 958     #   /foo/branches/Rel2/bar/baz.c
 959     #
 960     # and on trunk it would become
 961     #
 962     #   /foo/trunk/bar/baz.c
 963     #
 964     # However, we went back to the older and simpler method of just
 965     # prepending the genealogy to the front, instead of interpolating.
 966     # So now we produce:
 967     #
 968     #   /branches/Rel2/foo/bar/baz.c
 969     #   /trunk/foo/bar/baz.c
 970     #
 971     # Why?  Well, Jack Repenning pointed out that this way is much
 972     # friendlier to "anonymously rooted subtrees" (that's a tree where
 973     # the name of the top level dir doesn't matter, the point is that if
 974     # you cd into it and, say, run 'make', something good will happen).
 975     # By interpolating, we made it impossible to point cvs2svn at some
 976     # subdir in the CVS repository and convert it as a project, because
 977     # we'd treat every subdir underneath it as an independent project
 978     # root, which is probably not what the user wanted.
 979     #
 980     # Also, see Blair Zajac's post
 981     #
 982     #    http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
 983     #
 984     # and the surrounding thread, for why what people really want is a
 985     # way of specifying an in-repository prefix path, not interpolation.
 986     if path is None:
 987       return None
 988
 989     if branch_name:
 990       branch_name = _clean_symbolic_name(branch_name)
 991       return self._ctx.branches_base + '/' + branch_name + '/' + path
 992     else:
 993       return self._ctx.trunk_base + '/' + path
 994
 995   def rcs_path(self):
 996     """Returns the actual filesystem path to the RCS file of this
 997     CVSRevision."""
 998     if self.file_in_attic is None:
 999       return self.fname
1000     else:
1001       basepath, filename = os.path.split(self.fname)
1002       return os.path.join(basepath, 'Attic', filename)
1003
1004   def filename(self):
1005     "Return the last path component of self.fname, minus the ',v'"
1006     return os.path.split(self.fname)[-1][:-2]
1007
class SymbolDatabase:
  """This database records information on all symbols in the RCS
  files.  It is created in pass 1 and it is used in pass 2."""

  def __init__(self):
    """Create an empty symbol database."""
    # A hash that maps tag names to commit counts
    self.tags = { }
    # A hash that maps branch names to lists of the format
    # [ create_count, commit_count, blockers ], where blockers
    # is a hash that lists the symbols that depend on the
    # branch.  The blockers hash is used as a set, so the
    # values are not used.
    self.branches = { }

  def register_tag_creation(self, name):
    """Register the creation of the tag NAME."""
    if name not in self.tags:
      self.tags[name] = 0
    self.tags[name] += 1

  def _branch(self, name):
    """Helper function to get a branch node that will create and
    initialize the node if it does not exist."""
    if name not in self.branches:
      self.branches[name] = [ 0, 0, { } ]
    return self.branches[name]

  def register_branch_creation(self, name):
    """Register the creation of the branch NAME."""
    self._branch(name)[0] += 1

  def register_branch_commit(self, name):
    """Register a commit on the branch NAME."""
    self._branch(name)[1] += 1

  def register_branch_blocker(self, name, blocker):
    """Register BLOCKER as a blocker on the branch NAME."""
    self._branch(name)[2][blocker] = None

  def branch_has_commit(self, name):
    """Return non-zero if NAME has commits.  Returns a false value if
    NAME is not a branch or if it has no commits."""
    return name in self.branches and self.branches[name][1]

  def find_excluded_symbols(self, regexp_list):
    """Returns a hash of all symbols that match the regexps in
    REGEXP_LIST.  The hash is used as a set so the values are
    not used."""
    excludes = { }
    for tag in self.tags.keys():
      if match_regexp_list(regexp_list, tag):
        excludes[tag] = None
    for branch in self.branches.keys():
      if match_regexp_list(regexp_list, branch):
        excludes[branch] = None
    return excludes

  def find_branch_exclude_blockers(self, branch, excludes):
    """Find all blockers of BRANCH, excluding the ones in the hash
    EXCLUDES.  If BRANCH itself is not in EXCLUDES, nothing blocks it
    and an empty hash is returned.  The returned hash is used as a
    set so the values are not used."""
    blockers = { }
    if branch in excludes:
      for blocker in self.branches[branch][2]:
        if blocker not in excludes:
          blockers[blocker] = None
    return blockers

  def find_blocked_excludes(self, excludes):
    """Find all branches in EXCLUDES that have blocking symbols that
    are not themselves excluded.  Return a hash that maps branch names
    to a hash of blockers.  The hash of blockers is used as a set so
    the values are not used."""
    blocked_branches = { }
    for branch in self.branches.keys():
      blockers = self.find_branch_exclude_blockers(branch, excludes)
      if blockers:
        blocked_branches[branch] = blockers
    return blocked_branches

  def find_mismatches(self, excludes=None):
    """Find all symbols that are defined as both tags and branches,
    excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
    the symbol name, tag count, branch count and commit count."""
    if excludes is None:
      excludes = { }
    mismatches = [ ]
    for branch in self.branches.keys():
      if branch not in excludes and branch in self.tags:
        mismatches.append((branch,                    # name
                           self.tags[branch],         # tag count
                           self.branches[branch][0],  # branch count
                           self.branches[branch][1])) # commit count
    return mismatches

  def read(self):
    """Read the symbol database from files.

    The tags file holds one 'NAME COUNT' pair per line.  The branches
    file holds 'NAME CREATE_COUNT COMMIT_COUNT [BLOCKER ...]' per
    line.  Entries read are added to SELF.tags and SELF.branches."""
    f = open(temp(TAGS_LIST))
    try:
      for line in f.readlines():
        tag, count = line.split()
        self.tags[tag] = int(count)
    finally:
      # Close explicitly instead of waiting for garbage collection.
      f.close()

    f = open(temp(BRANCHES_LIST))
    try:
      for line in f.readlines():
        words = line.split()
        blockers = { }
        for blocker in words[3:]:
          blockers[blocker] = None
        self.branches[words[0]] = [ int(words[1]), int(words[2]), blockers ]
    finally:
      f.close()

  def write(self):
    """Store the symbol database to files, in the format that read()
    expects, and register both files with Cleanup."""
    f = open(temp(TAGS_LIST), "w")
    Cleanup().register(temp(TAGS_LIST), pass2)
    try:
      for tag, count in self.tags.items():
        f.write("%s %d\n" % (tag, count))
    finally:
      # Close explicitly so the data is flushed before anyone reads it.
      f.close()

    f = open(temp(BRANCHES_LIST), "w")
    Cleanup().register(temp(BRANCHES_LIST), pass2)
    try:
      for branch, info in self.branches.items():
        f.write("%s %d %d" % (branch, info[0], info[1]))
        if info[2]:
          f.write(" ")
          f.write(" ".join(info[2].keys()))
        f.write("\n")
    finally:
      f.close()
1135       f.write("\n")
1136
1137 class CollectData(cvs2svn_rcsparse.Sink):
1138   def __init__(self):
1139     self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
1140     Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
1141     self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
1142     Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
1143     self.default_branches_db = Database(temp(DEFAULT_BRANCHES_DB), DB_OPEN_NEW)
1144     Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
1145     self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
1146     Cleanup().register(temp(METADATA_DB), pass8)
1147     self.fatal_errors = []
1148     self.num_files = 0
1149     self.symbol_db = SymbolDatabase()
1150
1151     # 1 if we've collected data for at least one file, None otherwise.
1152     self.found_valid_file = None
1153
1154     # See set_fname() for initializations of other variables.
1155
1156   def set_fname(self, canonical_name, filename):
1157     """Prepare to receive data for FILENAME.  FILENAME is the absolute
1158     filesystem path to the file in question, and CANONICAL_NAME is
1159     FILENAME with the 'Attic' component removed (if the file is indeed
1160     in the Attic) ."""
1161     self.fname = canonical_name
1162
1163     # We calculate and save some file metadata here, where we can do
1164     # it only once per file, instead of waiting until later where we
1165     # would have to do the same calculations once per CVS *revision*.
1166
1167     self.rel_name = relative_name(Ctx().cvsroot, self.fname)[:-2]
1168
1169     # If the paths are not the same, then that means that the
1170     # canonical_name has had the 'Attic' component stripped out.
1171     self.file_in_attic = None
1172     if not canonical_name == filename:
1173       self.file_in_attic = 1
1174
1175     file_stat = os.stat(filename)
1176     # The size of our file in bytes
1177     self.file_size = file_stat[stat.ST_SIZE]
1178
1179     # Whether or not the executable bit is set.
1180     self.file_executable = None
1181     if file_stat[0] & stat.S_IXUSR:
1182       self.file_executable = 1
1183
1184     # revision -> [timestamp, author, old-timestamp]
1185     self.rev_data = { }
1186
1187     # Maps revision number (key) to the revision number of the
1188     # previous revision along this line of development.
1189     #
1190     # For the first revision R on a branch, we consider the revision
1191     # from which R sprouted to be the 'previous'.
1192     #
1193     # Note that this revision can't be determined arithmetically (due
1194     # to cvsadmin -o, which is why this is necessary).
1195     self.prev_rev = { }
1196
1197     # This dict is essentially self.prev_rev with the values mapped in
1198     # the other direction, so following key -> value will yield you
1199     # the next revision number
1200     self.next_rev = { }
1201
1202     # Track the state of each revision so that in set_revision_info,
1203     # we can determine if our op is an add/change/delete.  We can do
1204     # this because in set_revision_info, we'll have all of the
1205     # revisions for a file at our fingertips, and we need to examine
1206     # the state of our prev_rev to determine if we're an add or a
1207     # change--without the state of the prev_rev, we are unable to
1208     # distinguish between an add and a change.
1209     self.rev_state = { }
1210
1211     # Hash mapping branch numbers, like '1.7.2', to branch names,
1212     # like 'Release_1_0_dev'.
1213     self.branch_names = { }
1214
1215     # RCS flags (used for keyword expansion).
1216     self.mode = None
1217
1218     # Hash mapping revision numbers, like '1.7', to lists of names
1219     # indicating which branches sprout from that revision, like
1220     # ['Release_1_0_dev', 'experimental_driver', ...].
1221     self.branchlist = { }
1222
1223     # Like self.branchlist, but the values are lists of tag names that
1224     # apply to the key revision.
1225     self.taglist = { }
1226
1227     # If set, this is an RCS branch number -- rcsparse calls this the
1228     # "principal branch", but CVS and RCS refer to it as the "default
1229     # branch", so that's what we call it, even though the rcsparse API
1230     # setter method is still 'set_principal_branch'.
1231     self.default_branch = None
1232
1233     # If the RCS file doesn't have a default branch anymore, but does
1234     # have vendor revisions, then we make an educated guess that those
1235     # revisions *were* the head of the default branch up until the
1236     # commit of 1.2, at which point the file's default branch became
1237     # trunk.  This records the date at which 1.2 was committed.
1238     self.first_non_vendor_revision_date = None
1239
1240     # A list of all symbols defined for the current file.  Used to
1241     # prevent multiple definitions of a symbol, something which can
1242     # easily happen when --symbol-transform is used.
1243     self.defined_symbols = [ ]
1244
  def set_principal_branch(self, branch):
    """Record BRANCH as the default branch of the current file.

    The method name follows rcsparse, which calls this the "principal
    branch"; the value is kept in self.default_branch."""
    self.default_branch = branch
1247
  def set_expansion(self, mode):
    """Record MODE, the RCS flags used for keyword expansion, for the
    current file."""
    self.mode = mode
1250
1251   def set_branch_name(self, branch_number, name):
1252     """Record that BRANCH_NUMBER is the branch number for branch NAME,
1253     and that NAME sprouts from BRANCH_NUMBER .
1254     BRANCH_NUMBER is an RCS branch number with an odd number of components,
1255     for example '1.7.2' (never '1.7.0.2')."""
1256     if not self.branch_names.has_key(branch_number):
1257       self.branch_names[branch_number] = name
1258       # The branchlist is keyed on the revision number from which the
1259       # branch sprouts, so strip off the odd final component.
1260       sprout_rev = branch_number[:branch_number.rfind(".")]
1261       if not self.branchlist.has_key(sprout_rev):
1262         self.branchlist[sprout_rev] = []
1263       self.branchlist[sprout_rev].append(name)
1264       self.symbol_db.register_branch_creation(name)
1265     else:
1266       sys.stderr.write("%s: in '%s':\n"
1267                        "   branch '%s' already has name '%s',\n"
1268                        "   cannot also have name '%s', ignoring the latter\n"
1269                        % (warning_prefix, self.fname, branch_number,
1270                           self.branch_names[branch_number], name))
1271
1272   def rev_to_branch_name(self, revision):
1273     """Return the name of the branch on which REVISION lies.
1274     REVISION is a non-branch revision number with an even number of,
1275     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1276     For the convenience of callers, REVISION can also be a trunk
1277     revision such as '1.2', in which case just return None."""
1278     if trunk_rev.match(revision):
1279       return None
1280     return self.branch_names.get(revision[:revision.rindex(".")])
1281
1282   def add_cvs_branch(self, revision, branch_name):
1283     """Record the root revision and branch revision for BRANCH_NAME,
1284     based on REVISION.  REVISION is a CVS branch number having an even
1285     number of components where the second-to-last is '0'.  For
1286     example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
1287     from 1.7 and has branch number 1.7.2."""
1288     last_dot = revision.rfind(".")
1289     branch_rev = revision[:last_dot]
1290     last2_dot = branch_rev.rfind(".")
1291     branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
1292     self.set_branch_name(branch_rev, branch_name)
1293
1294   def define_tag(self, name, revision):
1295     """Record a bidirectional mapping between symbolic NAME and REVISION.
1296     REVISION is an unprocessed revision number from the RCS file's
1297     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
1298     This function will determine what kind of symbolic name it is by
1299     inspection, and record it in the right places."""
1300     for (pattern, replacement) in Ctx().symbol_transforms:
1301       newname = re.sub(pattern, replacement, name)
1302       if newname != name:
1303         Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
1304                     % (name, newname))
1305         name = newname
1306     if name in self.defined_symbols:
1307       err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
1308                 % (error_prefix, name, self.fname)
1309       sys.stderr.write(err + "\n")
1310       self.fatal_errors.append(err)
1311     self.defined_symbols.append(name)
1312     if branch_tag.match(revision):
1313       self.add_cvs_branch(revision, name)
1314     elif vendor_tag.match(revision):
1315       self.set_branch_name(revision, name)
1316     else:
1317       if not self.taglist.has_key(revision):
1318         self.taglist[revision] = []
1319       self.taglist[revision].append(name)
1320       self.symbol_db.register_tag_creation(name)
1321
1322   def define_revision(self, revision, timestamp, author, state,
1323                       branches, next):
1324
1325     # Record the state of our revision for later calculations
1326     self.rev_state[revision] = state
1327
1328     # store the rev_data as a list in case we have to jigger the timestamp
1329     self.rev_data[revision] = [int(timestamp), author, None]
1330
1331     # When on trunk, the RCS 'next' revision number points to what
1332     # humans might consider to be the 'previous' revision number.  For
1333     # example, 1.3's RCS 'next' is 1.2.
1334     #
1335     # However, on a branch, the RCS 'next' revision number really does
1336     # point to what humans would consider to be the 'next' revision
1337     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
1338     #
1339     # In other words, in RCS, 'next' always means "where to find the next
1340     # deltatext that you need this revision to retrieve.
1341     #
1342     # That said, we don't *want* RCS's behavior here, so we determine
1343     # whether we're on trunk or a branch and set self.prev_rev
1344     # accordingly.
1345     #
1346     # One last thing.  Note that if REVISION is a branch revision,
1347     # instead of mapping REVISION to NEXT, we instead map NEXT to
1348     # REVISION.  Since we loop over all revisions in the file before
1349     # doing anything with the data we gather here, this 'reverse
1350     # assignment' effectively does the following:
1351     #
1352     # 1. Gives us no 'prev' value for REVISION (in this
1353     # iteration... it may have been set in a previous iteration)
1354     #
1355     # 2. Sets the 'prev' value for the revision with number NEXT to
1356     # REVISION.  So when we come around to the branch revision whose
1357     # revision value is NEXT, its 'prev' and 'prev_rev' are already
1358     # set.
1359     if trunk_rev.match(revision):
1360       self.prev_rev[revision] = next
1361       self.next_rev[next] = revision
1362     elif next:
1363       self.prev_rev[next] = revision
1364       self.next_rev[revision] = next
1365
1366     for b in branches:
1367       self.prev_rev[b] = revision
1368
1369     # Ratchet up the highest vendor head revision, if necessary.
1370     if self.default_branch:
1371       default_branch_root = self.default_branch + "."
1372       if ((revision.find(default_branch_root) == 0)
1373           and (default_branch_root.count('.') == revision.count('.'))):
1374         # This revision is on the default branch, so record that it is
1375         # the new highest default branch head revision.
1376         self.default_branches_db[self.rel_name] = revision
1377     else:
1378       # No default branch, so make an educated guess.
1379       if revision == '1.2':
1380         # This is probably the time when the file stopped having a
1381         # default branch, so make a note of it.
1382         self.first_non_vendor_revision_date = timestamp
1383       else:
1384         m = vendor_revision.match(revision)
1385         if m and ((not self.first_non_vendor_revision_date)
1386                   or (timestamp < self.first_non_vendor_revision_date)):
1387           # We're looking at a vendor revision, and it wasn't
1388           # committed after this file lost its default branch, so bump
1389           # the maximum trunk vendor revision in the permanent record.
1390           self.default_branches_db[self.rel_name] = revision
1391
1392     if not trunk_rev.match(revision):
1393       # Check for unlabeled branches, record them.  We tried to collect
1394       # all branch names when we parsed the symbolic name header
1395       # earlier, of course, but that didn't catch unlabeled branches.
1396       # If a branch is unlabeled, this is our first encounter with it,
1397       # so we have to record its data now.
1398       branch_number = revision[:revision.rindex(".")]
1399       if not self.branch_names.has_key(branch_number):
1400         branch_name = "unlabeled-" + branch_number
1401         self.set_branch_name(branch_number, branch_name)
1402
1403       # Register the commit on this non-trunk branch
1404       branch_name = self.branch_names[branch_number]
1405       self.symbol_db.register_branch_commit(branch_name)
1406
1407   def tree_completed(self):
1408     "The revision tree has been parsed.  Analyze it for consistency."
1409
1410     # Our algorithm depends upon the timestamps on the revisions occuring
1411     # monotonically over time.  That is, we want to see rev 1.34 occur in
1412     # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
1413     # sorting), and then tried to insert 1.34, we'd be screwed.
1414
1415     # to perform the analysis, we'll simply visit all of the 'previous'
1416     # links that we have recorded and validate that the timestamp on the
1417     # previous revision is before the specified revision
1418
1419     # if we have to resync some nodes, then we restart the scan. just keep
1420     # looping as long as we need to restart.
1421     while 1:
1422       for current, prev in self.prev_rev.items():
1423         if not prev:
1424           # no previous revision exists (i.e. the initial revision)
1425           continue
1426         t_c = self.rev_data[current][0]
1427         t_p = self.rev_data[prev][0]
1428         if t_p >= t_c:
1429           # the previous revision occurred later than the current revision.
1430           # shove the previous revision back in time (and any before it that
1431           # may need to shift).
1432
1433           # We sync backwards and not forwards because any given CVS
1434           # Revision has only one previous revision.  However, a CVS
1435           # Revision can *be* a previous revision for many other
1436           # revisions (e.g., a revision that is the source of multiple
1437           # branches).  This becomes relevant when we do the secondary
1438           # synchronization in pass 2--we can make certain that we
1439           # don't resync a revision earlier than it's previous
1440           # revision, but it would be non-trivial to make sure that we
1441           # don't resync revision R *after* any revisions that have R
1442           # as a previous revision.
1443           while t_p >= t_c:
1444             self.rev_data[prev][0] = t_c - 1    # new timestamp
1445             self.rev_data[prev][2] = t_p        # old timestamp
1446             delta = t_c - 1 - t_p
1447             msg =  "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
1448                   % (self.rel_name,
1449                      prev, time.ctime(t_p), delta)
1450             Log().write(LOG_VERBOSE, msg)
1451             if (delta > COMMIT_THRESHOLD
1452                 or delta < (COMMIT_THRESHOLD * -1)):
1453               str = "%s: Significant timestamp change for '%s' (%d seconds)"
1454               Log().write(LOG_WARN, str % (warning_prefix, self.rel_name,
1455                                            delta))
1456             current = prev
1457             prev = self.prev_rev[current]
1458             if not prev:
1459               break
1460             t_c = t_c - 1               # self.rev_data[current][0]
1461             t_p = self.rev_data[prev][0]
1462
1463           # break from the for-loop
1464           break
1465       else:
1466         # finished the for-loop (no resyncing was performed)
1467         return
1468
1469   def set_revision_info(self, revision, log, text):
1470     timestamp, author, old_ts = self.rev_data[revision]
1471     digest = sha.new(log + '\0' + author).hexdigest()
1472     if old_ts:
1473       # the timestamp on this revision was changed. log it for later
1474       # resynchronization of other files's revisions that occurred
1475       # for this time and log message.
1476       self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
1477
1478     # "...Give back one kadam to honor the Hebrew God whose Ark this is."
1479     #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
1480     #
1481     # If revision 1.1 appears to have been created via 'cvs add'
1482     # instead of 'cvs import', then this file probably never had a
1483     # default branch, so retroactively remove its record in the
1484     # default branches db.  The test is that the log message CVS uses
1485     # for 1.1 in imports is "Initial revision\n" with no period.
1486     if revision == '1.1' and log != 'Initial revision\n':
1487       if self.default_branches_db.has_key(self.rel_name):
1488         del self.default_branches_db[self.rel_name]
1489
1490     # Get the timestamp of the previous revision
1491     prev_rev = self.prev_rev.get(revision, None)
1492     prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])
1493
1494     # How to tell if a CVSRevision is an add, a change, or a deletion:
1495     #
1496     # It's a delete if RCS state is 'dead'
1497     #
1498     # It's an add if RCS state is 'Exp.' and
1499     #      - we either have no previous revision
1500     #        or
1501     #      - we have a previous revision whose state is 'dead'
1502     #
1503     # Anything else is a change.
1504     if self.rev_state[revision] == 'dead':
1505       op = OP_DELETE
1506     elif ((self.prev_rev.get(revision, None) is None)
1507           or (self.rev_state[self.prev_rev[revision]] == 'dead')):
1508       op = OP_ADD
1509     else:
1510       op = OP_CHANGE
1511
1512     if text:
1513       deltatext_code = DELTATEXT_NONEMPTY
1514     else:
1515       deltatext_code = DELTATEXT_EMPTY
1516
1517     c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp, op,
1518                         self.prev_rev[revision], revision,
1519                         self.next_rev.get(revision),
1520                         self.file_in_attic, self.file_executable,
1521                         self.file_size,
1522                         deltatext_code, self.fname,
1523                         self.mode, self.rev_to_branch_name(revision),
1524                         self.taglist.get(revision, []),
1525                         self.branchlist.get(revision, []))
1526     self.revs.write(str(c_rev) + "\n")
1527     StatsKeeper().record_c_rev(c_rev)
1528
1529     if not self.metadata_db.has_key(digest):
1530       self.metadata_db[digest] = (author, log)
1531
1532   def parse_completed(self):
1533     # Walk through all branches and tags and register them with
1534     # their parent branch in the symbol database.
1535     for revision, symbols in self.taglist.items() + self.branchlist.items():
1536       for symbol in symbols:
1537         name = self.rev_to_branch_name(revision)
1538         if name is not None:
1539           self.symbol_db.register_branch_blocker(name, symbol)
1540
1541     self.num_files = self.num_files + 1
1542
1543   def write_symbol_db(self):
1544     self.symbol_db.write()
1545
class SymbolingsLogger:
  """Maintain the file of symbol "openings" and "closings".

  The recorded data is used later to work out the ranges of
  SVNRevisions from which a file can be copied when a branch or tag is
  created in Subversion.  That is done by finding an "Opening" and a
  "Closing" for every file copied onto a branch or tag.

  The "Opening" is the CVSRevision from which a given branch/tag
  sprouts on a path.

  The "Closing" for that branch/tag and path is the next CVSRevision
  on the same line of development as the opening.

  Example: in file 'foo.c', branch BEE has branch number 1.2.2 and so
  sprouts from revision 1.2.  Revision 1.2 is therefore the opening for
  BEE on path 'foo.c', and 1.3 is the closing for BEE on path 'foo.c'.
  Many revisions may lie chronologically between 1.2 and 1.3 (revisions
  on branches of 'foo.c', possibly on BEE itself), but 1.3 is the next
  revision *on the same line* as 1.2, which makes it the closing
  revision for every symbolic name whose opening is 1.2.

  All of this exists to make branch and tag creation as cheap as
  possible, by keeping the number of copies and deletes per creation
  low.  Suppose revisions 1.2 and 1.3 of foo.c correspond to
  Subversion revisions 17 and 30: when creating branch BEE there is
  then a reason to copy from one of 17-30.  If another file, 'bar.c',
  has opening and closing CVSRevisions for BEE that correspond to
  Subversion revisions 24 and 39, the ideal source for the branch copy
  lies somewhere between 24 and 29, inclusive.
  """
  def __init__(self):
    """Open the symbolings file and the temporary closings file for
    writing, and register both with Cleanup."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)

    self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
    Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)

    # Maps a Subversion repository *source* path for which an
    # 'opening' has been encountered to the list of symbolic names
    # that the path has opened.  Only paths whose corresponding
    # CVSRevision is a default branch revision are meant to be in
    # this dictionary.
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Log the openings found in C_REV and, when C_REV.next_rev is not
    None, a closing for each of its symbolic names.

    Openings are written with SVN_REVNUM.  Closings are only noted in
    the temporary closings file; their revision numbers are resolved
    later, in close()."""
    for raw_name in c_rev.tags + c_rev.branches:
      symbol = _clean_symbolic_name(raw_name)
      self._note_default_branch_opening(c_rev, symbol)

      # A deleted revision does not open anything.
      if c_rev.op != OP_DELETE:
        self._log(symbol, svn_revnum, c_rev.svn_path, OPENING)

      # The next revision, if any, closes this source revision.  Its
      # svn_revnum is not known yet, so only its unique key is saved
      # for later processing.
      if c_rev.next_rev is not None:
        closing_key = c_rev.unique_key(c_rev.next_rev)
        self.closings.write('%s %s\n' % (symbol, closing_key))

  def _log(self, name, svn_revnum, svn_path, type):
    """Append one line to the symbolings file stating that SVN_REVNUM
    of SVN_PATH is the opening or the closing (TYPE) of the symbolic
    name NAME.

    TYPE should only be one of the global constants OPENING or
    CLOSING."""
    # The revision number is zero-padded to at least 8 digits, which
    # covers revisions up to 99,999,999.  That *should* be enough.
    record = '%s %.8d %s %s\n' % (name, svn_revnum, type, svn_path)
    self.symbolings.write(record)

  def close(self):
    """Resolve every entry of the closings file to its svn_revnum,
    write the corresponding closing line to the symbolings file, and
    close the symbolings file."""
    # Needed to obtain c_rev.svn_path from a revision key.
    revisions_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    for entry in fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP)):
      # Each entry is "<name> <revision key>"; the key may itself
      # contain spaces, so split only once.
      (symbol, rev_key) = entry.rstrip().split(" ", 1)
      closing_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

      closing_rev = revisions_db.get_revision(rev_key)
      self._log(symbol, closing_revnum, closing_rev.svn_path, CLOSING)

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Remember SYMBOLIC_NAME as opened by C_REV.svn_trunk_path.

    The name is appended to the list kept for that path in
    self.open_paths_with_default_branches; no check is made here that
    C_REV is a default branch revision."""
    opened = self.open_paths_with_default_branches.setdefault(
        c_rev.svn_trunk_path, [ ])
    opened.append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If self.open_paths_with_default_branches contains
    C_REV.svn_trunk_path, log each name recorded for that path as a
    closing with SVN_REVNUM as the closing revision number, then
    forget the path."""
    trunk_path = c_rev.svn_trunk_path
    open_paths = self.open_paths_with_default_branches
    if not open_paths.has_key(trunk_path):
      return

    for symbol in open_paths[trunk_path]:
      self._log(symbol, svn_revnum, trunk_path, CLOSING)

    # These openings have been dealt with; drop them.
    del open_paths[trunk_path]
1659
1660
class PersistenceManager:
  """The PersistenceManager allows us to effectively store SVNCommits
  to disk and retrieve them later using only their subversion revision
  number as the key.  It also returns the subversion revision number
  for a given CVSRevision's unique key.

  All information pertinent to each SVNCommit is stored in a series of
  on-disk databases so that SVNCommits can be retrieved on-demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager will initialize a new set of on-disk
  databases and be fully-featured.
  In 'read' mode, PersistenceManager will open existing on-disk databases
  and the set_* methods will be unavailable."""
  def __init__(self, mode):
    """Open the on-disk databases in MODE.

    Raises RuntimeError if MODE is neither DB_OPEN_NEW nor
    DB_OPEN_READ.  The metadata database and the CVSRevision database
    are always opened read-only.  The tags database and the
    motivating-revnum database are only opened when the conversion is
    not trunk-only."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
    # str(svn_revnum) -> list of CVSRevision unique keys.
    self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
    Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
    # CVSRevision unique key -> svn_revnum.
    self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
    Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
    # str(svn_revnum) -> (symbolic name, date).
    self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
    Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
    # digest -> (author, log message).
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      self.tags_db = TagsDatabase(DB_OPEN_READ)
      # str(svn_revnum) -> str(motivating svn_revnum).
      self.motivating_revnums = Database(temp(MOTIVATING_REVNUMS), mode)
      Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def _require_writable(self):
    """Raise RuntimeError if this manager was opened in read mode.

    Every set_* method calls this before touching a database."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    This method can throw SVNCommitInternalInconsistencyError.
    """
    # Look the revision up before building anything: constructing an
    # SVNCommit with a false REVNUM would allocate a fresh revision
    # number from the global SVNCommit counter, which must not happen
    # for a revision that turns out not to exist.
    c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
    if c_rev_keys is None:
      return None

    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      # Set the author and log message for this commit by using
      # CVSRevision metadata, but only if haven't done so already.
      if digest is None:
        digest = c_rev.digest
        author, log_msg = self.svn_commit_metadata[digest]
        svn_commit.set_author(author)
        svn_commit.set_log_msg(log_msg)

    # If we're doing a trunk-only conversion, we don't need to do any
    # more work (the tags and motivating-revnum databases are not even
    # open in that case).
    if Ctx().trunk_only:
      return svn_commit

    name, date = self._get_name_and_date(svn_revnum)
    if name:
      svn_commit.set_symbolic_name(name)
      svn_commit.set_date(date)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
    if motivating_revnum:
      svn_commit.set_motivating_revnum(int(motivating_revnum))
      # DATE is the one read from the name-and-date record above.
      svn_commit.set_date(date)

    if svn_commit.cvs_revs and name:
      msg = """An SVNCommit cannot have cvs_revisions *and* a
      corresponding symbolic name ('%s') to fill.""" % name
      raise SVNCommit.SVNCommitInternalInconsistencyError(msg)

    return svn_commit

  def set_cvs_revs(self, svn_revnum, cvs_revs):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS.

    Raises RuntimeError if this manager was opened in read mode."""
    self._require_writable()
    # Compute each unique key once and reuse it for logging and for
    # both directions of the mapping.
    unique_keys = [c_rev.unique_key() for c_rev in cvs_revs]
    for key in unique_keys:
      Log().write(LOG_VERBOSE, " ", key)
    self.svn2cvs_db[str(svn_revnum)] = unique_keys
    for key in unique_keys:
      self.cvs2svn_db[key] = svn_revnum

  def set_name_and_date(self, svn_revnum, name, date):
    """Associate symbolic name NAME and DATE with SVN_REVNUM, and note
    SVN_REVNUM as the revision in which NAME was last filled.

    Raises RuntimeError if this manager was opened in read mode."""
    self._require_writable()
    self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
    self.last_filled[name] = svn_revnum

  def _get_name_and_date(self, svn_revnum):
    """Return a tuple containing the symbolic name and date associated
    with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
    associated with it."""
    return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))

  def set_motivating_revnum(self, svn_revnum, motivating_revnum):
    """Store MOTIVATING_REVNUM as the value of SVN_REVNUM.

    Raises RuntimeError if this manager was opened in read mode."""
    self._require_writable()
    self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
1784
1785
1786 class CVSCommit:
1787   """Each instance of this class contains a number of CVS Revisions
1788   that correspond to one or more Subversion Commits.  After all CVS
1789   Revisions are added to the grouping, calling process_revisions will
1790   generate a Subversion Commit (or Commits) for the set of CVS
1791   Revisions in the grouping."""
1792
1793   def __init__(self, digest, author, log):
1794     self.digest = digest
1795     self.author = author
1796     self.log = log
1797
1798     # Symbolic names for which the last source revision has already
1799     # been seen and for which the CVSRevisionAggregator has already
1800     # generated a fill SVNCommit.  See self.process_revisions().
1801     self.done_symbols = [ ]
1802
1803     self.files = { }
1804     # Lists of CVSRevisions
1805     self.changes = [ ]
1806     self.deletes = [ ]
1807
1808     # Start out with a t_min higher than any incoming time T, and a
1809     # t_max lower than any incoming T.  This way the first T will
1810     # push t_min down to T, and t_max up to T, naturally (without any
1811     # special-casing), and successive times will then ratchet them
1812     # outward as appropriate.
1813     self.t_min = 1L<<32
1814     self.t_max = 0
1815
1816     # This will be set to the SVNCommit that occurs in self._commit.
1817     self.motivating_commit = None
1818
1819     # This is a list of all non-primary commits motivated by the main
1820     # commit.  We gather these so that we can set their dates to the
1821     # same date as the primary commit.
1822     self.secondary_commits = [ ]
1823
1824     # State for handling default branches.
1825     #
1826     # Here is a tempting, but ultimately nugatory, bit of logic, which
1827     # I share with you so you may appreciate the less attractive, but
1828     # refreshingly non-nugatory, logic which follows it:
1829     #
1830     # If some of the commits in this txn happened on a non-trunk
1831     # default branch, then those files will have to be copied into
1832     # trunk manually after being changed on the branch (because the
1833     # RCS "default branch" appears as head, i.e., trunk, in practice).
1834     # As long as those copies don't overwrite any trunk paths that
1835     # were also changed in this commit, then we can do the copies in
1836     # the same revision, because they won't cover changes that don't
1837     # appear anywhere/anywhen else.  However, if some of the trunk dst
1838     # paths *did* change in this commit, then immediately copying the
1839     # branch changes would lose those trunk mods forever.  So in this
1840     # case, we need to do at least that copy in its own revision.  And
1841     # for simplicity's sake, if we're creating the new revision for
1842     # even one file, then we just do all such copies together in the
1843     # new revision.
1844     #
1845     # Doesn't that sound nice?
1846     #
1847     # Unfortunately, Subversion doesn't support copies with sources
1848     # in the current txn.  All copies must be based in committed
1849     # revisions.  Therefore, we generate the above-described new
1850     # revision unconditionally.
1851     #
1852     # This is a list of c_revs, and a c_rev is appended for each
1853     # default branch commit that will need to be copied to trunk (or
1854     # deleted from trunk) in some generated revision following the
1855     # "regular" revision.
1856     self.default_branch_cvs_revisions = [ ]
1857
1858   def __cmp__(self, other):
1859     # Commits should be sorted by t_max.  If both self and other have
1860     # the same t_max, break the tie using t_min, and lastly, digest
1861     return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
1862             or cmp(self.digest, other.digest))
1863
1864   def has_file(self, fname):
1865     return self.files.has_key(fname)
1866
1867   def revisions(self):
1868     return self.changes + self.deletes
1869
1870   def opens_symbolic_name(self, name):
1871     """Returns true if any CVSRevision in this commit is on a tag or a
1872     branch or is the origin of a tag or branch."""
1873     for c_rev in self.revisions():
1874       if c_rev.opens_symbolic_name(name):
1875         return 1
1876     return 0
1877
1878   def add_revision(self, c_rev):
1879     # Record the time range of this commit.
1880     #
1881     # ### ISSUE: It's possible, though unlikely, that the time range
1882     # of a commit could get gradually expanded to be arbitrarily
1883     # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
1884     # problem, and anyway deciding where to break it up would be a
1885     # judgement call.  For now, we just print a warning in commit() if
1886     # this happens.
1887     if c_rev.timestamp < self.t_min:
1888       self.t_min = c_rev.timestamp
1889     if c_rev.timestamp > self.t_max:
1890       self.t_max = c_rev.timestamp
1891
1892     if c_rev.op == OP_DELETE:
1893       self.deletes.append(c_rev)
1894     else:
1895       # OP_CHANGE or OP_ADD
1896       self.changes.append(c_rev)
1897
1898     self.files[c_rev.fname] = 1
1899
1900   def _pre_commit(self):
1901     """Generates any SVNCommits that must exist before the main
1902     commit."""
1903
1904     # There may be multiple c_revs in this commit that would cause
1905     # branch B to be filled, but we only want to fill B once.  On the
1906     # other hand, there might be multiple branches committed on in
1907     # this commit.  Whatever the case, we should count exactly one
1908     # commit per branch, because we only fill a branch once per
1909     # CVSCommit.  This list tracks which branches we've already
1910     # counted.
1911     accounted_for_sym_names = [ ]
1912
1913     def fill_needed(c_rev, pm):
1914       """Return 1 if this is the first commit on a new branch (for
1915       this file) and we need to fill the branch; else return 0
1916       (meaning that some other file's first commit on the branch has
1917       already done the fill for us).
1918
1919       If C_REV.op is OP_ADD, only return 1 if the branch that this
1920       commit is on has no last filled revision.
1921
1922       PM is a PersistenceManager to query.
1923       """
1924
1925       # Different '.' counts indicate that c_rev is now on a different
1926       # line of development (and may need a fill)
1927       if c_rev.rev.count('.') != c_rev.prev_rev.count('.'):
1928         svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
1929         # It should be the case that when we have a file F that
1930         # is added on branch B (thus, F on trunk is in state
1931         # 'dead'), we generate an SVNCommit to fill B iff the branch
1932         # has never been filled before.
1933         #
1934         # If this c_rev.op == OP_ADD, *and* the branch has never
1935         # been filled before, then fill it now.  Otherwise, no need to
1936         # fill it.
1937         if c_rev.op == OP_ADD:
1938           if pm.last_filled.get(c_rev.branch_name, None) is None:
1939             return 1
1940         else:
1941           if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
1942             return 1
1943       return 0
1944
1945     for c_rev in self.changes + self.deletes:
1946       # If a commit is on a branch, we must ensure that the branch
1947       # path being committed exists (in HEAD of the Subversion
1948       # repository).  If it doesn't exist, we will need to fill the
1949       # branch.  After the fill, the path on which we're committing
1950       # will exist.
1951       if c_rev.branch_name \
1952           and c_rev.branch_name not in accounted_for_sym_names \
1953           and c_rev.branch_name not in self.done_symbols \
1954           and fill_needed(c_rev, Ctx()._persistence_manager):
1955         svn_commit = SVNCommit("pre-commit symbolic name '%s'"
1956                                % c_rev.branch_name)
1957         svn_commit.set_symbolic_name(c_rev.branch_name)
1958         self.secondary_commits.append(svn_commit)
1959         accounted_for_sym_names.append(c_rev.branch_name)
1960
1961   def _commit(self):
1962     """Generates the primary SVNCommit that corresponds the this
1963     CVSCommit."""
1964     # Generate an SVNCommit unconditionally.  Even if the only change
1965     # in this CVSCommit is a deletion of an already-deleted file (that
1966     # is, a CVS revision in state 'dead' whose predecessor was also in
1967     # state 'dead'), the conversion will still generate a Subversion
1968     # revision containing the log message for the second dead
1969     # revision, because we don't want to lose that information.
1970     svn_commit = SVNCommit("commit")
1971     self.motivating_commit = svn_commit
1972
1973     for c_rev in self.changes:
1974       svn_commit.add_revision(c_rev)
1975       # Only make a change if we need to.  When 1.1.1.1 has an empty
1976       # deltatext, the explanation is almost always that we're looking
1977       # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
1978       # such imports, CVS creates an RCS file where 1.1 has the
1979       # content, and 1.1.1.1 has an empty deltatext, i.e, the same
1980       # content as 1.1.  There's no reason to reflect this non-change
1981       # in the repository, so we want to do nothing in this case.  (If
1982       # we were really paranoid, we could make sure 1.1's log message
1983       # is the CVS-generated "Initial revision\n", but I think the
1984       # conditions below are strict enough.)
1985       if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
1986               and (c_rev.rev == "1.1.1.1")):
1987         if c_rev.is_default_branch_revision():
1988           self.default_branch_cvs_revisions.append(c_rev)
1989
1990     for c_rev in self.deletes:
1991       # When a file is added on a branch, CVS not only adds the file
1992       # on the branch, but generates a trunk revision (typically
1993       # 1.1) for that file in state 'dead'.  We only want to add
1994       # this revision if the log message is not the standard cvs
1995       # fabricated log message.
1996       if c_rev.prev_rev is None:
1997         # c_rev.branches may be empty if the originating branch
1998         # has been excluded.
1999         if not c_rev.branches:
2000           continue
2001         cvs_generated_msg = ('file %s was initially added on branch %s.\n'
2002                              % (c_rev.filename(),
2003                                 c_rev.branches[0]))
2004         author, log_msg = \
2005             Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
2006         if log_msg == cvs_generated_msg:
2007           continue
2008
2009       svn_commit.add_revision(c_rev)
2010       if c_rev.is_default_branch_revision():
2011         self.default_branch_cvs_revisions.append(c_rev)
2012
2013     # There is a slight chance that we didn't actually register any
2014     # CVSRevisions with our SVNCommit (see loop over self.deletes
2015     # above), so if we have no CVSRevisions, we don't flush the
2016     # svn_commit to disk and roll back our revnum.
2017     if len(svn_commit.cvs_revs) > 0:
2018       svn_commit.flush()
2019     else:
2020       # We will not be flushing this SVNCommit, so rollback the
2021       # SVNCommit revision counter.
2022       SVNCommit.revnum = SVNCommit.revnum - 1
2023
2024     if not Ctx().trunk_only:
2025       for c_rev in self.revisions():
2026         Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)
2027
2028   def _post_commit(self):
2029     """Generates any SVNCommits that we can perform now that _commit
2030     has happened.  That is, handle non-trunk default branches.
2031     Sometimes an RCS file has a non-trunk default branch, so a commit
2032     on that default branch would be visible in a default CVS checkout
2033     of HEAD.  If we don't copy that commit over to Subversion's trunk,
2034     then there will be no Subversion tree which corresponds to that
2035     CVS checkout.  Of course, in order to copy the path over, we may
2036     first need to delete the existing trunk there.  """
2037
2038     # Only generate a commit if we have default branch revs
2039     if len(self.default_branch_cvs_revisions):
2040       # Generate an SVNCommit for all of our default branch c_revs.
2041       svn_commit = SVNCommit("post-commit default branch(es)")
2042       svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
2043       for c_rev in self.default_branch_cvs_revisions:
2044         svn_commit.add_revision(c_rev)
2045         Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
2046                                                             svn_commit.revnum)
2047       self.secondary_commits.append(svn_commit)
2048
2049   def process_revisions(self, done_symbols):
2050     """Process all the CVSRevisions that this instance has, creating
2051     one or more SVNCommits in the process.  Generate fill SVNCommits
2052     only for symbols not in DONE_SYMBOLS (avoids unnecessary
2053     fills).
2054
2055     Return the primary SVNCommit that corresponds to this CVSCommit.
2056     The returned SVNCommit is the commit that motivated any other
2057     SVNCommits generated in this CVSCommit."""
2058     self.done_symbols = done_symbols
2059     seconds = self.t_max - self.t_min + 1
2060
2061     Log().write(LOG_VERBOSE, '-' * 60)
2062     Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
2063     if seconds == 1:
2064       Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
2065                   % time.ctime(self.t_max))
2066     else:
2067       Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
2068       Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
2069                   % (time.ctime(self.t_max), seconds))
2070
2071     if seconds > COMMIT_THRESHOLD + 1:
2072       Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
2073                   % (warning_prefix, COMMIT_THRESHOLD))
2074
2075     if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
2076       self._commit()
2077       return self.motivating_commit
2078
2079     self._pre_commit()
2080     self._commit()
2081     self._post_commit()
2082
2083     for svn_commit in self.secondary_commits:
2084       svn_commit.set_date(self.motivating_commit.get_date())
2085       svn_commit.flush()
2086
2087     return self.motivating_commit
2088
2089
class SVNCommit:
  """One commit to the Subversion repository.

  Every SVNCommit is exactly one of three kinds:

  1. A primary commit: commits one or more CVSRevisions (and cannot
     fill a symbolic name).

  2. A symbolic name commit: creates or fills a symbolic name (and
     cannot commit CVSRevisions).

  3. A default branch synchronization commit: updates trunk to
     reflect the contents of a particular branch (this is how RCS
     default branches are handled)."""

  # Revision number that the next SVNCommit created without an
  # explicit REVNUM will receive.  Numbering begins at 2 because
  # SVNRepositoryMirror spends the first commit on creating trunk,
  # tags, and branches.
  revnum = 2

  class SVNCommitInternalInconsistencyError(Exception):
    """Raised when the SVNCommit databases are found in a state that
    ought to be impossible."""
    pass

  def __init__(self, description="", revnum=None, cvs_revs=None):
    """Create an SVNCommit.

    DESCRIPTION is a free-form string used only for debugging output.

    REVNUM, if true, is the Subversion revision number this commit
    corresponds to; otherwise the next unused number is taken from
    the class-wide SVNCommit.revnum counter, which is then advanced.

    CVS_REVS, if given, must be the exact set of CVSRevisions for
    REVNUM.  Passing CVS_REVS without REVNUM is an error; passing
    REVNUM alone and then calling add_revision() once per revision
    is fine."""
    self._description = description

    # Revprop metadata.  The values assigned here are placeholders; at
    # least the log message and the date are expected to be replaced
    # before anybody reads them.
    #
    # The attributes are private because readers must receive them
    # encoded in UTF8 while writers are free to supply them in the
    # local encoding.  Use the set_*() methods to store them and
    # self.get_revprops() to fetch them as a dictionary.
    self._author = Ctx().username
    self._log_msg = "This log message means an SVNCommit was used too soon."
    self._max_date = 0  # Latest date seen so far.

    if not cvs_revs:
      cvs_revs = []
    self.cvs_revs = cvs_revs

    if not revnum:
      # No explicit number requested: claim the next one.
      revnum = SVNCommit.revnum
      SVNCommit.revnum = revnum + 1
    self.revnum = revnum

    # Name of the tag or branch filled by this commit, if any.
    self.symbolic_name = None

    # For a default branch synchronization commit, the Subversion
    # revision number of the *primary* commit in which the default
    # branch changes really happened; None for every other kind of
    # commit.
    #
    # Several synchronization commits may share one motivating
    # revision number, and one synchronization commit may contain
    # CVSRevisions from several different default branches.
    self.motivating_revnum = None

    # True only when this commit fills a symbolic name that is a tag;
    # None in all other cases.
    self.is_tag = None

  def set_symbolic_name(self, name):
    "Store the cleaned-up form of NAME in self.symbolic_name."
    self.symbolic_name = _clean_symbolic_name(name)

  def set_motivating_revnum(self, revnum):
    "Store REVNUM in self.motivating_revnum."
    self.motivating_revnum = revnum

  def set_author(self, author):
    """Record AUTHOR (a locally-encoded string) as the author of this
    SVNCommit.  There is no other way to set the author."""
    self._author = author

  def set_log_msg(self, msg):
    """Record MSG (a locally-encoded string) as the log message of
    this SVNCommit.  There is no other way to set the log message."""
    self._log_msg = msg

  def set_date(self, date):
    """Record DATE (an integer) as the date of this SVNCommit.

    self.add_revision() already keeps the date current from the
    CVSRevisions it is given, so calling this may be unnecessary; and
    a later call to self.add_revision() may replace the value stored
    here."""
    self._max_date = date

  def get_date(self):
    """Return the date of this SVNCommit as an integer."""
    return self._max_date

  def get_revprops(self):
    """Return a dictionary holding the Subversion revprops
    ('svn:author', 'svn:log' and 'svn:date') of this SVNCommit.

    The author and the log message are converted from Ctx().encoding
    to UTF8.  Should that conversion raise UnicodeError, warnings are
    logged and the unconverted values are returned instead."""
    date = format_date(self._max_date)
    try:
      ### FIXME: The 'replace' behavior should be an option, like
      ### --encoding is.
      utf8_author = None
      if self._author is not None:
        utf8_author = unicode(self._author, Ctx().encoding,
                              'replace').encode('utf8')
      utf8_log = unicode(self.get_log_msg(), Ctx().encoding,
                         'replace').encode('utf8')
      revprops = { 'svn:author' : utf8_author,
                   'svn:log'    : utf8_log,
                   'svn:date'   : date }
      return revprops
    except UnicodeError:
      warnings = [
        '%s: problem encoding author or log message:' % warning_prefix,
        "  author: '%s'" % self._author,
        "  log:    '%s'" % self.get_log_msg().rstrip(),
        "  date:   '%s'" % date,
        "(subversion rev %s)  Related files:" % self.revnum,
        ]
      for text in warnings:
        Log().write(LOG_WARN, text)
      for c_rev in self.cvs_revs:
        Log().write(LOG_WARN, " ", c_rev.fname)

      Log().write(LOG_WARN, "Consider rerunning with (for example)",
                  "'--encoding=latin1'.\n")
      # Handing back the original data (in its unknown encoding) beats
      # both alternatives: giving up, or recording nothing at all.
      fallback = { 'svn:author' : self._author,
                   'svn:log'    : self.get_log_msg(),
                   'svn:date'   : date }
      return fallback

  def add_revision(self, cvs_rev):
    """Append CVS_REV to self.cvs_revs and move this commit's date
    forward to CVS_REV's timestamp if that timestamp is later."""
    self.cvs_revs.append(cvs_rev)
    self._max_date = max(self._max_date, cvs_rev.timestamp)

  def _is_primary_commit(self):
    """Return true if this SVNCommit has neither a symbolic name nor a
    motivating revision number (i.e. it is a primary commit), false
    otherwise."""
    return not self.symbolic_name and not self.motivating_revnum

  def flush(self):
    """Log the creation of this commit and hand its data to the
    persistence manager found in Ctx()."""
    Log().write(LOG_NORMAL, "Creating Subversion commit %d (%s)"
                % (self.revnum, self._description))
    Ctx()._persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)

    if self.motivating_revnum is not None:
      Ctx()._persistence_manager.set_motivating_revnum(self.revnum,
                                                       self.motivating_revnum)

    # Only secondary commits need their date and/or symbolic name
    # stored.
    if self._is_primary_commit():
      return
    Ctx()._persistence_manager.set_name_and_date(self.revnum,
                                                 self.symbolic_name,
                                                 self._max_date)

  def __str__(self):
    """Return a description of this SVNCommit meant for human eyes.
    The format is not designed to be parsed by programs (not that we
    will stand in your way)."""
    pieces = ["SVNCommit #: ", str(self.revnum), "\n"]
    if self.symbolic_name:
      pieces.extend(["   symbolic name: ", self.symbolic_name, "\n"])
    else:
      pieces.append("   NO symbolic name\n")
    pieces.extend(["   debug description: ", self._description, "\n"])
    pieces.append("   cvs_revs:\n")
    for c_rev in self.cvs_revs:
      pieces.extend(["     ", c_rev.unique_key(), "\n"])
    return "".join(pieces)

  def get_log_msg(self):
    """Return the stored log message for a primary commit, or the
    matching manufactured log message for a secondary commit."""
    if self.symbolic_name is not None:
      return self._log_msg_for_symbolic_name_commit()
    if self.motivating_revnum is not None:
      return self._log_msg_for_default_branch_commit()
    return self._log_msg

  def _log_msg_for_symbolic_name_commit(self):
    """Build the log message of a manufactured commit that fills
    self.symbolic_name.  The message speaks of a tag when self.is_tag
    is true and of a branch otherwise."""
    if self.is_tag:
      kind = 'tag'
    else:
      kind = 'branch'

    # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
    # Names of 13 or more characters are moved onto a line of their own.
    if len(self.symbolic_name) >= 13:
      separator = '\n'
    else:
      separator = ' '

    return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
           % (kind, separator, self.symbolic_name)

  def _log_msg_for_default_branch_commit(self):
    """Build the log message of a manufactured commit that
    synchronizes a non-trunk default branch with trunk."""
    return ('This commit was generated by cvs2svn to compensate for '
            'changes in r%d,\n'
            'which included commits to RCS files with non-trunk default '
            'branches.\n') % self.motivating_revnum
2299
class CVSRevisionAggregator:
  """Collects CVSRevisions into CVSCommits, each of which stands for
  at least one SVNCommit."""
  def __init__(self):
    """Open the databases the aggregator reads from and install the
    symbolings logger, persistence manager and default branches
    database in Ctx()."""
    # CVSCommits still accepting (or waiting behind) revisions, keyed
    # by digest.
    self.cvs_commits = {}
    # Symbols whose last source CVSRevision has been seen but which
    # have not been closed yet.  Used as a set; the values are unused.
    self.pending_symbols = {}
    # Symbols for which the last CVSRevision that is a source for the
    # symbol has already been encountered.  Their final fill is done
    # and they never need to be filled again.
    self.done_symbols = [ ]

    # The most recently created primary svn_commit object.  It is
    # remembered solely for its date, which is copied onto the
    # SVNCommits created in self.attempt_to_commit_symbols().
    self.latest_primary_svn_commit = None

    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), DB_OPEN_READ)

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = Database(temp(DEFAULT_BRANCHES_DB),
                                          DB_OPEN_READ)


  def process_revision(self, c_rev):
    """Add C_REV to the pending CVSCommit with the same digest
    (creating that CVSCommit if necessary), after first processing
    every pending CVSCommit that C_REV is too late to join."""
    # Every incoming revision is an occasion to look over the commits
    # gathered so far and pick out those that can be processed now.
    ready_queue = [ ]
    for digest, pending in self.cvs_commits.items():
      if pending.t_max + COMMIT_THRESHOLD < c_rev.timestamp:
        ready_queue.append(pending)
        del self.cvs_commits[digest]
      elif pending.has_file(c_rev.fname):
        # The inbound revision touches a file that this pending commit
        # already contains, so close the pending commit to further
        # changes.  It is not flushed yet, because other pending
        # commits may be dated before it.
        # ### ISSUE: the has_file() check above is not optimal.
        # It does fix the dataloss bug where revisions would get lost
        # if checked in too quickly, but it can also break apart the
        # commits.  The correct fix would require tracking the
        # dependencies between change sets and committing them in
        # proper order.
        #
        # Re-file the commit under a key that is not yet present in
        # self.cvs_commits, so no later revision can find it by digest.
        retired_key = digest + '-'
        while self.cvs_commits.has_key(retired_key):
          retired_key = retired_key + '-'
        del self.cvs_commits[digest]
        self.cvs_commits[retired_key] = pending

    # File the new revision under its digest among the commits that
    # are still open.
    if not self.cvs_commits.has_key(c_rev.digest):
      author, log = self.metadata_db[c_rev.digest]
      self.cvs_commits[c_rev.digest] = CVSCommit(c_rev.digest,
                                                 author, log)
    self.cvs_commits[c_rev.digest].add_revision(c_rev)

    # Whatever sits in ready_queue now cannot possibly include this
    # latest revision, so it gets processed, in sorted order.
    ready_queue.sort()

    # Symbols must be attempted for this c_rev even when there is no
    # commit to process.
    if not ready_queue:
      self.attempt_to_commit_symbols(ready_queue, c_rev)

    for ready_commit in list(ready_queue):
      primary = ready_commit.process_revisions(self.done_symbols)
      self.latest_primary_svn_commit = primary
      ready_queue.remove(ready_commit)
      self.attempt_to_commit_symbols(ready_queue, c_rev)

  def flush(self):
    """Process every commit remaining in self.cvs_commits, and then
    tell the SymbolingsLogger that there are no more commits."""
    ready_queue = [(commit, key) for key, commit in self.cvs_commits.items()]
    ready_queue.sort()

    for entry in list(ready_queue):
      cvs_commit, digest_key = entry
      self.latest_primary_svn_commit = \
        cvs_commit.process_revisions(self.done_symbols)
      ready_queue.remove(entry)
      del self.cvs_commits[digest_key]
      self.attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def attempt_to_commit_symbols(self, queued_commits, c_rev=None):
    """
    Create one SVNCommit for every symbol in self.pending_symbols
    that has no opening CVSRevision in either QUEUED_COMMITS or
    self.cvs_commits.values().

    If C_REV is not None, the symbols for which C_REV is the last
    CVSRevision are added to self.pending_symbols beforehand.
    """
    # Unless this is a trunk-only conversion, look up the symbolic
    # names for which this c_rev is the last *source* CVSRevision and
    # merge them with whatever earlier passes through the aggregator
    # left behind.
    if c_rev and not Ctx().trunk_only:
      for sym in self.last_revs_db.get(c_rev.unique_key(), []):
        self.pending_symbols[sym] = None

    # Determine which pending symbols still have *source*
    # CVSRevisions among the commits waiting to be processed.
    candidates = self.cvs_commits.values() + queued_commits
    still_open = {}
    for sym in self.pending_symbols.keys():
      for cvs_commit in candidates:
        if cvs_commit.opens_symbolic_name(sym):
          still_open[sym] = None
          break

    # Walk the pending symbols in sorted order so that the outcome
    # does not depend on the order in which the dict happens to hand
    # back its keys; this keeps test results identical across
    # platforms.
    names = self.pending_symbols.keys()
    names.sort()
    for sym in names:
      if not still_open.has_key(sym):
        # Nothing can open sym any more, so close it.
        svn_commit = SVNCommit("closing tag/branch '%s'" % sym)
        svn_commit.set_symbolic_name(sym)
        svn_commit.set_date(self.latest_primary_svn_commit.get_date())
        svn_commit.flush()
        self.done_symbols.append(sym)
        del self.pending_symbols[sym]
2440
2441
class SymbolingsReader:
  """Interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  It does the heavy lifting of locating the
  opening and closing Subversion revision numbers that belong to a
  given symbolic name."""
  def __init__(self):
    """Open SYMBOL_OPENINGS_CLOSINGS_SORTED for reading and load the
    whole offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets database is tiny and gets read and updated a fair
    # bit, so an in-memory copy is kept.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    offsets = { }
    for key in offsets_db.db.keys():
      offsets[key] = offsets_db[key]
    self.offsets = offsets

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Return a new SymbolicNameFillingGuide object for SYMBOLIC_NAME,
    fed with the openings and closings recorded for it up to and
    including SVN_REVNUM.

    If an opening rev is encountered in this fill while the matching
    closing rev lies beyond SVN_REVNUM, the closing is not handed to
    the SymbolicNameFillingGuide in this fill (and it is discarded
    when met in a later fill).  That is perfectly fine: a valid fill
    does not need the closing, and we always fill whatever we can as
    early as we can."""
    symbol_fill = SymbolicNameFillingGuide(symbolic_name)

    # A branch may start with a file that was added on a branch, in
    # which case nothing is recorded for the symbol.
    if not self.offsets.has_key(symbolic_name):
      return symbol_fill

    # Resume reading where the previous fill of this symbol stopped.
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      fpos = self.symbolings.tell()
      line = self.symbolings.readline().rstrip()
      if not line:
        break
      name, revnum, kind, svn_path = line.split(" ", 3)
      revnum = int(revnum)
      if not (revnum <= svn_revnum and name == symbolic_name):
        break
      symbol_fill.register(svn_path, revnum, kind)

    # fpos is the start of the last line read, i.e. of the first line
    # that was not consumed.  Remember it for the next fill, but only
    # if something was actually consumed this time.
    if not symbol_fill.is_empty():
      self.offsets[symbolic_name] = fpos

    symbol_fill.make_node_tree()
    return symbol_fill
2499
2500
class SymbolicNameFillingGuide:
  """A SymbolicNameFillingGuide is essentially a node tree
  representing the source paths to be copied to fill
  self.name in the current SVNCommit.

  After calling self.register() on a series of openings and closings,
  call self.make_node_tree() to prepare self.node_tree for
  examination.  See the docstring for self.make_node_tree() for
  details on the structure of self.node_tree.

  By walking self.node_tree and calling self.get_best_revnum() on each
  node, the caller can determine what subversion revision number to
  copy the path corresponding to that node from.  self.node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""
  def __init__(self, symbolic_name):
    """Initializes a SymbolicNameFillingGuide for SYMBOLIC_NAME and
    prepares it for receiving openings and closings."""
    self.name = symbolic_name

    # Keys under which a leaf node stores its opening and closing
    # revision numbers.  They start with '/' so that they can be told
    # apart from path components (see _list_revnums_for_key).
    self.opening_key = "/o"
    self.closing_key = "/c"

    # A dictionary of SVN_PATHS and SVN_REVNUMS whose format is:
    #
    # { svn_path : { self.opening_key : svn_revnum,
    #                self.closing_key : svn_revnum }
    #                ...}
    self.things = { }

    # The key for the root node of the node tree
    self.root_key = '0'
    # The dictionary that holds our node tree, seeded with the root key.
    self.node_tree = { self.root_key : { } }

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE.  Returns a tuple
    (REVNUM, MAX_SCORE): the chosen subversion revision number and the
    score it achieved.

    PREFERRED_REVNUM is passed to self._best_rev and used to
    calculate the best revnum.

    If no revision can be found, an error is written to stderr and the
    program exits with status 1."""
    # Aggregate openings and closings from the rev tree
    openings = self._list_revnums_for_key(node, self.opening_key)
    closings = self._list_revnums_for_key(node, self.closing_key)

    # Score the lists
    scores = self._score_revisions(self._sum_revnum_counts(openings),
                                   self._sum_revnum_counts(closings))

    revnum, max_score = self._best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      # The symbol's name lives on the instance; a bare 'name' is not
      # defined in this scope and would raise NameError here.
      sys.stderr.write(error_prefix + ": failed to find a revision "
                       + "to copy from when copying %s\n" % self.name)
      sys.exit(1)
    return revnum, max_score


  def _best_rev(self, scores, preferred_rev):
    """Return a tuple (REV, MAX_SCORE) naming the revision with the
    highest score from SCORES, a list returned by _score_revisions().
    When the maximum score is shared by multiple revisions, the oldest
    revision is selected, unless PREFERRED_REV is one of the
    possibilities, in which case, it is selected."""
    max_score = 0
    preferred_rev_score = -1
    rev = SVN_INVALID_REVNUM
    if preferred_rev is None:
      # Comparison order of different types is arbitrary. Do not
      # expect None to compare less than int values below.
      # In Python 2.3 None compares with ints like negative infinity.
      # In Python 2.0 None compares with ints like positive infinity.
      preferred_rev = SVN_INVALID_REVNUM
    for revnum, count in scores:
      # Strictly greater, so the oldest revision wins a tie.
      if count > max_score:
        max_score = count
        rev = revnum
      # Track the score of the last entry at or before PREFERRED_REV.
      if revnum <= preferred_rev:
        preferred_rev_score = count
    if preferred_rev_score == max_score:
      rev = preferred_rev
    return rev, max_score


  def _score_revisions(self, openings, closings):
    """Return a list of revisions and scores based on OPENINGS and
    CLOSINGS.  The returned list looks like:

       [(REV1 SCORE1), (REV2 SCORE2), ...]

    where REV2 > REV1.  OPENINGS and CLOSINGS are the values of
    self.opening_key and self.closing_key from some file or
    directory node, or else None.

    Each score indicates that copying the corresponding revision (or
    any following revision up to the next revision in the list) of the
    object in question would yield that many correct paths at or
    underneath the object.  There may be other paths underneath it
    which are not correct and would need to be deleted or recopied;
    those can only be detected by descending and examining their
    scores.

    If OPENINGS is false, return the empty list."""
    # First look for easy outs.
    if not openings:
      return []

    # Must be able to iterate over closings below.
    if closings is None:
      closings = []

    # No easy out, so wish for lexical closures and calculate the scores :-).
    # Each opening raises the running score from its revision onwards.
    scores = []
    opening_score_accum = 0
    for opening_rev, opening_score in openings:
      opening_score_accum = opening_score_accum + opening_score
      scores.append((opening_rev, opening_score_accum))

    # Each closing lowers the score from its revision onwards.
    # search_start skips the entries already known to lie before the
    # current closing.
    search_start = 0
    for closing_rev, closing_score in closings:
      done_exact_rev = None
      insert_index = None
      insert_score = None
      for j in range(search_start, len(scores)):
        score_rev, score = scores[j]
        if score_rev >= closing_rev:
          if not done_exact_rev:
            if score_rev > closing_rev:
              # The closing falls between two existing entries; note
              # where a new entry for it has to be inserted.
              insert_index = j
              insert_score = scores[j-1][1] - closing_score
            done_exact_rev = 1
          scores[j] = (score_rev, score - closing_score)
        else:
          search_start = j + 1
      if not done_exact_rev:
        # The closing lies after every entry seen so far.
        scores.append((closing_rev, scores[-1][1] - closing_score))
      if insert_index is not None:
        scores.insert(insert_index, (closing_rev, insert_score))
    return scores

  def _sum_revnum_counts(self, rev_list):
    """Takes an array of revisions (REV_LIST), for example:

      [21, 18, 6, 49, 39, 24, 24, 24, 24, 24, 24, 24]

    and adds up every occurrence of each revision and returns a sorted
    array of tuples containing (svn_revnum, count):

      [(6, 1), (18, 1), (21, 1), (24, 7), (39, 1), (49, 1)]
    """
    counts = {}
    for revnum in rev_list: # Add up the occurrences
      counts[revnum] = counts.get(revnum, 0) + 1
    pairs = list(counts.items())
    pairs.sort()
    return pairs

  def _list_revnums_for_key(self, node, revnum_type_key):
    """Scan self.node_tree and return a list of all the revision
    numbers (including duplicates) contained in REVNUM_TYPE_KEY values
    for all leaf nodes at and under NODE.

    REVNUM_TYPE_KEY should be either self.opening_key or
    self.closing_key."""
    revnums = []

    # If the node has self.opening_key, it must be a leaf node--all
    # leaf nodes have at least an opening key (although they may not
    # have a closing key).  Fetch revnum and return
    if (self.node_tree[node].has_key(self.opening_key) and
        self.node_tree[node].has_key(revnum_type_key)):
      revnums.append(self.node_tree[node][revnum_type_key])
      return revnums

    for key, node_contents in self.node_tree[node].items():
      # Entries whose key starts with '/' are the opening/closing
      # revnums, not child nodes.
      if key[0] == '/':
        continue
      revnums = revnums + \
          self._list_revnums_for_key(node_contents, revnum_type_key)
    return revnums

  def register(self, svn_path, svn_revnum, type):
    """Collects opening and closing revisions for this
    SymbolicNameFillingGuide.  SVN_PATH is the source path that needs
    to be copied into self.name, and SVN_REVNUM is either the
    first svn revision number that we can copy from (our opening), or
    the last (not inclusive) svn revision number that we can copy from
    (our closing).  TYPE indicates whether this path is an opening or
    a closing.

    The opening for a given SVN_PATH must be passed before the closing
    for it to have any effect... any closing encountered before a
    corresponding opening will be discarded.

    It is not necessary to pass a corresponding closing for every
    opening.
    """
    # Always log an OPENING
    if type == OPENING:
      self.things[svn_path] = {self.opening_key: svn_revnum}
    # Only log a closing if we've already registered the opening for that path.
    elif type == CLOSING and self.things.has_key(svn_path):
      # When we have a non-trunk default branch, we may have multiple
      # closings--only register the first closing we encounter.
      if not self.things[svn_path].has_key(self.closing_key):
        self.things[svn_path][self.closing_key] = svn_revnum

  def make_node_tree(self):
    """Generates the SymbolicNameFillingGuide's node tree from
    self.things.  Each leaf node maps self.opening_key to the earliest
    subversion revision from which this node/path may be copied; and
    optionally map self.closing_key to the subversion revision one
    higher than the last revision from which this node/path may be
    copied.  Intermediate nodes never contain opening or closing
    flags."""

    for svn_path, open_close in self.things.items():
      parent_key = self.root_key

      # Walk down the path, one component at a time, creating the
      # nodes that do not exist yet.
      for component in svn_path.split('/'):
        siblings = self.node_tree[parent_key]
        if siblings.has_key(component):
          child_key = siblings[component]
        else:
          child_key = gen_key()
          self.node_tree[child_key] = { }
          siblings[component] = child_key

        parent_key = child_key
      # Having reached the leaf, attach the value
      self.node_tree[parent_key] = open_close
    #print_node_tree(self.node_tree, self.root_key)

  def is_empty(self):
    """Return true if we haven't accumulated any openings or closings,
    false otherwise."""
    return not self.things
2756
2757
class FillSource:
  """Representation of a fill source used by the symbol filler in
  SVNRepositoryMirror."""
  def __init__(self, prefix, key):
    """Create an unscored fill source with a prefix and a key.  Both
    self.score and self.revnum stay None until set_score() is
    called."""
    self.prefix = prefix
    self.key = key
    self.score = None
    self.revnum = None

  def set_score(self, score, revnum):
    """Set the SCORE and REVNUM."""
    self.score = score
    self.revnum = revnum

  def __cmp__(self, other):
    """Comparison operator used to sort FillSources in descending
    score order.  Raises TypeError if either FillSource has not been
    scored yet."""
    if self.score is None or other.score is None:
      # Call form of raise: accepted by every Python version, unlike
      # the legacy 'raise Class, value' statement.
      raise TypeError('Tried to compare unscored FillSource')
    # Operands are swapped on purpose to get descending order.
    return cmp(other.score, self.score)
2779
2780
2781 class SVNRepositoryMirror:
2782   """Mirror a Subversion Repository as it is constructed, one
2783   SVNCommit at a time.  The mirror is skeletal; it does not contain
2784   file contents.  The creation of a dumpfile or Subversion repository
2785   is handled by delegates.  See self.add_delegate method for how to
2786   set delegates.
2787
2788   The structure of the repository is kept in two databases and one
2789   hash.  The revs_db database maps revisions to root node keys, and
2790   the nodes_db database maps node keys to nodes.  A node is a hash
2791   from directory names to keys.  Both the revs_db and the nodes_db are
2792   stored on disk and each access is expensive.
2793
2794   The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
2796   hash which is cheap to access.
2797
2798   You must invoke _start_commit between SVNCommits.
2799
2800   *** WARNING *** All path arguments to methods in this class CANNOT
2801       have leading or trailing slashes.
2802   """
2803
2804   class SVNRepositoryMirrorPathExistsError(Exception):
2805     """Exception raised if an attempt is made to add a path to the
2806     repository mirror and that path already exists in the youngest
2807     revision of the repository."""
2808     pass
2809
2810   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
2811     """Exception raised if a CVSRevision is found to have an unexpected
2812     operation (OP) value."""
2813     pass
2814
2815   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
2816     """Exception raised if an empty SymbolicNameFillingGuide is returned
2817     during a fill where the branch in question already exists."""
2818     pass
2819
2820   def __init__(self):
2821     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
2822     self.delegates = [ ]
2823
2824     # This corresponds to the 'revisions' table in a Subversion fs.
2825     self.revs_db = Database(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
2826     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
2827
2828     # This corresponds to the 'nodes' table in a Subversion fs.  (We
2829     # don't need a 'representations' or 'strings' table because we
2830     # only track metadata, not file contents.)
2831     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
2832     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
2833
2834     # Start at revision 0 without a root node.  It will be created
2835     # by _open_writable_root_node.
2836     self.youngest = 0
2837     self.new_root_key = None
2838     self.new_nodes = { }
2839
2840     if not Ctx().trunk_only:
2841       ###PERF IMPT: Suck this into memory.
2842       self.tags_db = TagsDatabase(DB_OPEN_READ)
2843       self.symbolings_reader = SymbolingsReader()
2844
2845   def _initialize_repository(self, date):
2846     """Initialize the repository by creating the directories for
2847     trunk, tags, and branches.  This method should only be called
2848     after all delegates are added to the repository mirror."""
2849     # Make a 'fake' SVNCommit so we can take advantage of the revprops
2850     # magic therein
2851     svn_commit = SVNCommit("Initialization", 1)
2852     svn_commit.set_date(date)
2853     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
2854
2855     self._start_commit(svn_commit)
2856     self._mkdir(Ctx().trunk_base)
2857     if not Ctx().trunk_only:
2858       self._mkdir(Ctx().branches_base)
2859       self._mkdir(Ctx().tags_base)
2860
2861   def _start_commit(self, svn_commit):
2862     """Start a new commit."""
2863     if self.youngest > 0:
2864       self._end_commit()
2865
2866     self.youngest = svn_commit.revnum
2867     self.new_root_key = None
2868     self.new_nodes = { }
2869
2870     self._invoke_delegates('start_commit', svn_commit)
2871
2872   def _end_commit(self):
2873     """Called at the end of each commit.  This method copies the newly
2874     created nodes to the on-disk nodes db."""
2875     if self.new_root_key is None:
2876       # No changes were made in this revision, so we make the root node
2877       # of the new revision be the same as the last one.
2878       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
2879     else:
2880       self.revs_db[str(self.youngest)] = self.new_root_key
2881       # Copy the new nodes to the nodes_db
2882       for key, value in self.new_nodes.items():
2883         self.nodes_db[key] = value
2884
2885   def _get_node(self, key):
2886     """Returns the node contents for KEY which may refer to either
2887     self.nodes_db or self.new_nodes."""
2888     if self.new_nodes.has_key(key):
2889       return self.new_nodes[key]
2890     else:
2891       return self.nodes_db[key]
2892
2893   def _open_readonly_node(self, path, revnum):
2894     """Open a readonly node for PATH at revision REVNUM.  Returns the
2895     node key and node contents if the path exists, else (None, None)."""
2896     # Get the root key
2897     if revnum == self.youngest:
2898       if self.new_root_key is None:
2899         node_key = self.revs_db[str(self.youngest - 1)]
2900       else:
2901         node_key = self.new_root_key
2902     else:
2903       node_key = self.revs_db[str(revnum)]
2904
2905     for component in path.split('/'):
2906       node_contents = self._get_node(node_key)
2907       if not node_contents.has_key(component):
2908         return None
2909       node_key = node_contents[component]
2910
2911     return node_key
2912
2913   def _open_writable_root_node(self):
2914     """Open a writable root node.  The current root node is returned
2915     immeditely if it is already writable.  If not, create a new one by
2916     copying the contents of the root node of the previous version."""
2917     if self.new_root_key is not None:
2918       return self.new_root_key, self.new_nodes[self.new_root_key]
2919
2920     if self.youngest < 2:
2921       new_contents = { }
2922     else:
2923       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
2924     self.new_root_key = gen_key()
2925     self.new_nodes = { self.new_root_key: new_contents }
2926
2927     return self.new_root_key, new_contents
2928
2929   def _open_writable_node(self, svn_path, create):
2930     """Open a writable node for the path SVN_PATH, creating SVN_PATH
2931     and any missing directories if CREATE is True."""
2932     parent_key, parent_contents = self._open_writable_root_node()
2933
2934     # Walk up the path, one node at a time.
2935     path_so_far = None
2936     components = svn_path.split('/')
2937     for i in range(len(components)):
2938       component = components[i]
2939       this_key = this_contents = None
2940       path_so_far = _path_join(path_so_far, component)
2941       if parent_contents.has_key(component):
2942         # The component exists.
2943         this_key = parent_contents[component]
2944         if self.new_nodes.has_key(this_key):
2945           this_contents = self.new_nodes[this_key]
2946         else:
2947           # Suck the node from the nodes_db, but update the key
2948           this_contents = self.nodes_db[this_key]
2949           this_key = gen_key()
2950           self.new_nodes[this_key] = this_contents
2951           parent_contents[component] = this_key
2952       elif create:
2953         # The component does not exists, so we create it.
2954         this_contents = { }
2955         this_key = gen_key()
2956         self.new_nodes[this_key] = this_contents
2957         parent_contents[component] = this_key
2958         if i < len(components) - 1:
2959           self._invoke_delegates('mkdir', path_so_far)
2960       else:
2961         # The component does not exists and we are not instructed to
2962         # create it, so we give up.
2963         return None, None
2964
2965       parent_key = this_key
2966       parent_contents = this_contents
2967
2968     return this_key, this_contents
2969
2970   def _path_exists(self, path):
2971     """If PATH exists in self.youngest of the svn repository mirror,
2972     return true, else return None.
2973
2974     PATH must not start with '/'."""
2975     return self._open_readonly_node(path, self.youngest) is not None
2976
2977   def _fast_delete_path(self, parent_path, parent_contents, component):
2978     """Delete COMPONENT from the parent direcory PARENT_PATH with the
2979     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
2980     in PARENT_CONTENTS."""
2981     if parent_contents.has_key(component):
2982       del parent_contents[component]
2983       self._invoke_delegates('delete_path', _path_join(parent_path, component))
2984
2985   def _delete_path(self, svn_path, should_prune=False):
2986     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
2987     all ancestor directories that are made empty when SVN_PATH is deleted.
2988     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
2989
2990     NOTE: This function does *not* allow you delete top-level entries
2991     (like /trunk, /branches, /tags), nor does it prune upwards beyond
2992     those entries."""
2993     pos = svn_path.rfind('/')
2994     parent_path = svn_path[:pos]
2995     entry = svn_path[pos+1:]
2996     parent_key, parent_contents = self._open_writable_node(parent_path, False)
2997     if parent_key is not None:
2998       self._fast_delete_path(parent_path, parent_contents, entry)
2999       # The following recursion makes pruning an O(n^2) operation in the
3000       # worst case (where n is the depth of SVN_PATH), but the worst case
3001       # is probably rare, and the constant cost is pretty low.  Another
3002       # drawback is that we issue a delete for each path and not just
3003       # a single delete for the topmost directory pruned.
3004       if (should_prune and len(parent_contents) == 0 and
3005           parent_path.find('/') != -1):
3006         self._delete_path(parent_path, True)
3007
3008   def _mkdir(self, path):
3009     """Create PATH in the repository mirror at the youngest revision."""
3010     self._open_writable_node(path, True)
3011     self._invoke_delegates('mkdir', path)
3012
3013   def _change_path(self, cvs_rev):
3014     """Register a change in self.youngest for the CVS_REV's svn_path
3015     in the repository mirror."""
3016     # We do not have to update the nodes because our mirror is only
3017     # concerned with the presence or absence of paths, and a file
3018     # content change does not cause any path changes.
3019     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, 0))
3020
3021   def _add_path(self, cvs_rev):
3022     """Add the CVS_REV's svn_path to the repository mirror."""
3023     self._open_writable_node(cvs_rev.svn_path, True)
3024     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, 1))
3025
3026   def _copy_path(self, src_path, dest_path, src_revnum):
3027     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
3028     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
3029     parent *must* exist, but DEST_PATH *cannot* exist.
3030
3031     Return the node key and the contents of the new node at DEST_PATH
3032     as a dictionary."""
3033     # get the contents of the node of our src_path
3034     src_key = self._open_readonly_node(src_path, src_revnum)
3035     src_contents = self._get_node(src_key)
3036
3037     # Get the parent path and the base path of the dest_path
3038     pos = dest_path.rindex('/')
3039     dest_parent = dest_path[:pos]
3040     dest_basename = dest_path[pos+1:]
3041     dest_parent_key, dest_parent_contents = \
3042                    self._open_writable_node(dest_parent, False)
3043
3044     if dest_parent_contents.has_key(dest_basename):
3045       msg = "Attempt to add path '%s' to repository mirror " % dest_path
3046       msg = msg + "when it already exists in the mirror."
3047       raise self.SVNRepositoryMirrorPathExistsError, msg
3048
3049     dest_parent_contents[dest_basename] = src_key
3050     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
3051
3052     # Yes sir, src_key and src_contents are also the contents of the
3053     # destination.  This is a cheap copy, remember!  :-)
3054     return src_key, src_contents
3055
3056   def _fill_symbolic_name(self, svn_commit):
3057     """Performs all copies necessary to create as much of the the tag
3058     or branch SVN_COMMIT.symbolic_name as possible given the current
3059     revision of the repository mirror.
3060
3061     The symbolic name is guaranteed to exist in the Subversion
3062     repository by the end of this call, even if there are no paths
3063     under it."""
3064     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3065       svn_commit.symbolic_name, self.youngest)
3066
3067     # Create the list of sources for the symbolic name.  All source
3068     # prefixes must be direct sources for the destination, i.e. we
3069     # must have 'trunk' and 'branches/my_branch' and not just
3070     # 'branches'.
3071     sources = []
3072     for entry, key in symbol_fill.node_tree[symbol_fill.root_key].items():
3073       if entry == Ctx().trunk_base:
3074         sources.append(FillSource(entry, key))
3075       elif entry == Ctx().branches_base:
3076         for entry2, key2 in symbol_fill.node_tree[key].items():
3077           sources.append(FillSource(entry + '/' + entry2, key2))
3078       else:
3079         raise # Should never happen
3080     if self.tags_db.has_key(svn_commit.symbolic_name):
3081       dest_prefix = _path_join(Ctx().tags_base, svn_commit.symbolic_name)
3082     else:
3083       dest_prefix = _path_join(Ctx().branches_base,
3084                                svn_commit.symbolic_name)
3085
3086     if sources:
3087       dest_key = self._open_writable_node(dest_prefix, False)[0]
3088       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3089     else:
3090       # We can only get here for a branch whose first commit is an add
3091       # (as opposed to a copy).
3092       dest_path = Ctx().branches_base + '/' + symbol_fill.name
3093       if not self._path_exists(dest_path):
3094         # If our symbol_fill was empty, that means that our first
3095         # commit on the branch was to a file added on the branch, and
3096         # that this is our first fill of that branch.
3097         #
3098         # This case is covered by test 16.
3099         #
3100         # ...we create the branch by copying trunk from the our
3101         # current revision number minus 1
3102         source_path = Ctx().trunk_base
3103         entries = self._copy_path(source_path, dest_path,
3104                                   svn_commit.revnum - 1)[1]
3105         # Now since we've just copied trunk to a branch that's
3106         # *supposed* to be empty, we delete any entries in the
3107         # copied directory.
3108         for entry in entries.keys():
3109           del_path = dest_path + '/' + entry
3110           # Delete but don't prune.
3111           self._delete_path(del_path)
3112       else:
3113         msg = "Error filling branch '" + symbol_fill.name + "'.\n"
3114         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3115         msg = msg + "attempted to create a branch that already exists."
3116         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3117
  def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
            path = None, parent_source_prefix = None,
            preferred_revnum = None, prune_ok = None):
    """Fill the tag or branch at DEST_PREFIX + PATH with items from
    SOURCES, and recurse into the child items.

    SYMBOL_FILL is the filling guide for the symbol being filled; its
    node_tree and get_best_revnum() are consulted here.

    DEST_PREFIX is the prefix of the destination directory, e.g.
    '/tags/my_tag' or '/branches/my_branch', and SOURCES is a list of
    FillSource classes that are candidates to be copied to the
    destination.  DEST_KEY is the key in self.nodes_db to the
    destination, or None if the destination does not yet exist.

    PATH is the path relative to DEST_PREFIX.  If PATH is None, we
    are at the top level, e.g. '/tags/my_tag'.

    PARENT_SOURCE_PREFIX is the source prefix that was used to copy
    the parent directory, and PREFERRED_REVNUM is an int which is the
    source revision number that the caller (who may have copied the
    parent of PATH) used to perform its copy.  If PREFERRED_REVNUM is
    None, then no revision is preferable to any other (which probably
    means that no copies have happened yet).

    PRUNE_OK means that a copy has been made in this recursion, and
    it's safe to prune directories that are not in
    SYMBOL_FILL.node_tree, provided that said directory has a source
    prefix of one of the PARENT_SOURCE_PREFIX.

    PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
    should only be passed in by recursive calls.

    Nothing is returned.  Note that SOURCES is scored and sorted in
    place, and that SOURCES must not be empty."""
    # Calculate scores and revnums for all sources
    for source in sources:
      src_revnum, score = symbol_fill.get_best_revnum(source.key,
                                                      preferred_revnum)
      source.set_score(score, src_revnum)

    # Sort the sources in descending score order so that we will make
    # an eventual copy from the source with the highest score.  (The
    # ordering itself comes from FillSource's comparison, which is
    # defined elsewhere.)
    sources.sort()
    copy_source = sources[0]

    src_path = _path_join(copy_source.prefix, path)
    dest_path = _path_join(dest_prefix, path)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    do_copy = 0
    if dest_key is None:
      # The destination does not exist yet, so it has to be copied.
      do_copy = 1
    elif prune_ok and (parent_source_prefix != copy_source.prefix or
                       copy_source.revnum != preferred_revnum):
      # The destination exists (it came along with a copy of an
      # ancestor), but the best source for it differs from what the
      # ancestor was copied from.
      #
      # We are about to replace the destination, so we need to remove
      # it before we perform the copy.
      self._delete_path(dest_path)
      do_copy = 1

    if do_copy:
      dest_key, dest_entries = self._copy_path(src_path, dest_path,
                                               copy_source.revnum)
      # From here on down, everything stems from a copy made in this
      # recursion.
      prune_ok = 1
    else:
      dest_entries = self._get_node(dest_key)

    # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
    # elements and the values are lists of FillSource classes where
    # this path element exists.
    src_entries = {}
    for source in sources:
      for entry, key in symbol_fill.node_tree[source.key].items():
        if entry[0] == '/': # Skip flags
          continue
        if not src_entries.has_key(entry):
          src_entries[entry] = []
        src_entries[entry].append(FillSource(source.prefix, key))

    if prune_ok:
      # Delete the entries in DEST_ENTRIES that are not in src_entries.
      delete_list = [ ]
      for entry in dest_entries.keys():
        if not src_entries.has_key(entry):
          delete_list.append(entry)
      if delete_list:
        # The destination node must be writable in this commit before
        # entries can be removed from it.
        if not self.new_nodes.has_key(dest_key):
          dest_key, dest_entries = self._open_writable_node(dest_path, True)
        # Sort the delete list to get "diffable" dumpfiles.
        delete_list.sort()
        for entry in delete_list:
          self._fast_delete_path(dest_path, dest_entries, entry)

    # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
    src_keys = src_entries.keys()
    src_keys.sort()
    for src_key in src_keys:
      # A child that is absent from the destination is passed down as
      # None, which makes the recursive call copy it.
      if dest_entries.has_key(src_key):
        next_dest_key = dest_entries[src_key]
      else:
        next_dest_key = None
      self._fill(symbol_fill, dest_prefix, next_dest_key,
                 src_entries[src_key], _path_join(path, src_key),
                 copy_source.prefix, sources[0].revnum, prune_ok)
3217
3218   def _synchronize_default_branch(self, svn_commit):
3219     """Propagate any changes that happened on a non-trunk default
3220     branch to the trunk of the repository.  See
3221     CVSCommit._post_commit() for details on why this is necessary."""
3222     for cvs_rev in svn_commit.cvs_revs:
3223       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3224         if self._path_exists(cvs_rev.svn_trunk_path):
3225           # Delete the path on trunk...
3226           self._delete_path(cvs_rev.svn_trunk_path)
3227         # ...and copy over from branch
3228         self._copy_path(cvs_rev.svn_path, cvs_rev.svn_trunk_path,
3229                         svn_commit.motivating_revnum)
3230       elif cvs_rev.op == OP_DELETE:
3231         # delete trunk path
3232         self._delete_path(cvs_rev.svn_trunk_path)
3233       else:
3234         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3235                % cvs_rev.op)
3236         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3237
3238   def commit(self, svn_commit):
3239     """Add an SVNCommit to the SVNRepository, incrementing the
3240     Repository revision number, and changing the repository.  Invoke
3241     the delegates' _start_commit() method."""
3242
3243     if svn_commit.revnum == 2:
3244       self._initialize_repository(svn_commit.get_date())
3245
3246     self._start_commit(svn_commit)
3247
3248     if svn_commit.symbolic_name:
3249       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3250                   svn_commit.symbolic_name)
3251       self._fill_symbolic_name(svn_commit)
3252     elif svn_commit.motivating_revnum:
3253       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3254                   % svn_commit.motivating_revnum)
3255       self._synchronize_default_branch(svn_commit)
3256     else: # This actually commits CVSRevisions
3257       if len(svn_commit.cvs_revs) > 1: plural = "s"
3258       else: plural = ""
3259       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3260                   % (len(svn_commit.cvs_revs), plural))
3261       for cvs_rev in svn_commit.cvs_revs:
3262         # See comment in CVSCommit._commit() for what this is all
3263         # about.  Note that although asking self._path_exists() is
3264         # somewhat expensive, we only do it if the first two (cheap)
3265         # tests succeed first.
3266         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3267                 and (cvs_rev.rev == "1.1.1.1")
3268                 and self._path_exists(cvs_rev.svn_path)):
3269           if cvs_rev.op == OP_ADD:
3270             self._add_path(cvs_rev)
3271           elif cvs_rev.op == OP_CHANGE:
3272             # Fix for Issue #74:
3273             #
3274             # Here's the scenario.  You have file FOO that is imported
3275             # on a non-trunk vendor branch.  So in r1.1 and r1.1.1.1,
3276             # the file exists.
3277             #
3278             # Moving forward in time, FOO is deleted on the default
3279             # branch (r1.1.1.2).  cvs2svn determines that this delete
3280             # also needs to happen on trunk, so FOO is deleted on
3281             # trunk.
3282             #
3283             # Along come r1.2, whose op is OP_CHANGE (because r1.1 is
3284             # not 'dead', we assume it's a change).  However, since
3285             # our trunk file has been deleted, svnadmin blows up--you
3286             # can't change a file that doesn't exist!
3287             #
3288             # Soooo... we just check the path, and if it doesn't
3289             # exist, we do an add... if the path does exist, it's
3290             # business as usual.
3291             if not self._path_exists(cvs_rev.svn_path):
3292               self._add_path(cvs_rev)
3293             else:
3294               self._change_path(cvs_rev)
3295
3296         if cvs_rev.op == OP_DELETE:
3297           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3298
3299   def cleanup(self):
3300     """Callback for the Cleanup.register in self.__init__."""
3301     self.revs_db = None
3302     self.nodes_db = None
3303
3304   def add_delegate(self, delegate):
3305     """Adds DELEGATE to self.delegates.
3306
3307     For every delegate you add, as soon as SVNRepositoryMirror
3308     performs a repository action method, SVNRepositoryMirror will call
3309     the delegate's corresponding repository action method.  Multiple
3310     delegates will be called in the order that they are added.  See
3311     SVNRepositoryMirrorDelegate for more information."""
3312     self.delegates.append(delegate)
3313
3314   def _invoke_delegates(self, method, *args):
3315     """Iterate through each of our delegates, in the order that they
3316     were added, and call the delegate's method named METHOD with the
3317     arguments in ARGS."""
3318     for delegate in self.delegates:
3319       getattr(delegate, method)(*args)
3320
3321   def finish(self):
3322     """Calls the delegate finish method."""
3323     self._end_commit()
3324     self._invoke_delegates('finish')
3325     self.cleanup()
3326
3327
class SVNCommitItem:
  """Pairs a CVSRevision object with the Subversion-related data
  (such as properties) that belongs to it."""

  def __init__(self, c_rev, make_svn_props):
    """Wrap C_REV, a CVSRevision.

    If MAKE_SVN_PROPS is true -- or if Ctx().cvs_revnums is set, which
    forces it -- self.svn_props is filled with the 'svn:' properties
    that apply to the file.  Independently of that,
    self.needs_eol_filter and self.has_keywords record whether the
    file gets a native eol-style and keywords, respectively."""
    self.c_rev = c_rev

    # Snapshot the relevant conversion options.
    self.mime_mapper = Ctx().mime_mapper
    self.keywords_off = Ctx().keywords_off
    self.no_default_eol = Ctx().no_default_eol
    self.eol_from_mime_type = Ctx().eol_from_mime_type
    self.set_cvs_revnum_properties = Ctx().cvs_revnums

    # We begin with only a "CVS revision" property.
    self.svn_props = { }
    if self.set_cvs_revnum_properties:
      self.svn_props['cvs2svn:cvs-rev'] = c_rev.rev
      make_svn_props = True

    # Mime-type and eol-style are intertwingled; follow the
    # conditionals carefully.  See also issue #39.
    mime_type = None
    if self.mime_mapper:
      mime_type = self.mime_mapper.get_type_from_filename(c_rev.cvs_path)

    eol_style = None
    if c_rev.mode == 'b':
      if mime_type is None:
        # file is kb, and no other mimetype specified
        mime_type = 'application/octet-stream'
    elif not self.no_default_eol:
      eol_style = 'native'
    elif (mime_type and self.eol_from_mime_type
          and mime_type.startswith("text/")):
      eol_style = 'native'

    # Decide whether svn:keywords is appropriate.  See issue #2.
    keywords = None
    if not self.keywords_off and c_rev.mode in (None, 'kv', 'kvl'):
      keywords = 'Author Date Id Revision'

    # Remember if we need to filter the EOLs.  self.svn_props can't
    # tell us, because the properties are only set on the first
    # revision whereas every revision has to be filtered.
    self.needs_eol_filter = (eol_style == 'native')

    # Remember if this file has svn:keywords set
    self.has_keywords = (keywords is not None)

    if not make_svn_props:
      return

    # We were asked to fill in the Subversion ('svn:') properties.
    if c_rev.file_executable:
      self.svn_props['svn:executable'] = '*'

    if keywords:
      self.svn_props['svn:keywords'] = 'Author Date Id Revision'

    if mime_type:
      self.svn_props['svn:mime-type'] = mime_type

    if eol_style:
      self.svn_props['svn:eol-style'] = eol_style
3395
3396
class SVNRepositoryMirrorDelegate:
  """Abstract superclass for any delegate to SVNRepositoryMirror.
  Every method below raises NotImplementedError, so subclasses must
  implement all of them.

  Each method stands for the Subversion operation that its name
  implies, and a subclass carries that operation out in its own way.
  Take add_path as an example: the DumpfileDelegate would write out a
  "Node-add:" command to a Subversion dumpfile, the StdoutDelegate
  would merely print that the path is being added to the repository,
  and the RepositoryDelegate would actually cause the path to be added
  to the Subversion repository that it is creating.
  """

  def start_commit(self, svn_commit):
    """Do whatever is needed to start SVN_COMMIT, an SVNCommit; see
    subclass implementation for details."""
    raise NotImplementedError()

  def mkdir(self, path):
    """PATH is a string naming the directory to create; see subclass
    implementation for details."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """S_ITEM is the SVNCommitItem being added; see subclass
    implementation for details."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """S_ITEM is the SVNCommitItem being changed; see subclass
    implementation for details."""
    raise NotImplementedError()

  def delete_path(self, path):
    """PATH is a string naming the path to delete; see subclass
    implementation for details."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """SRC_PATH and DEST_PATH are strings and SRC_REVNUM is a
    subversion revision number (int); see subclass implementation for
    details."""
    raise NotImplementedError()

  def finish(self):
    """Do whatever cleanup is needed once all revisions have been
    committed."""
    raise NotImplementedError()
3444
3445
3446 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
3447   """Create a Subversion dumpfile."""
3448
3449   def __init__(self, dumpfile_path=None):
3450     """Return a new DumpfileDelegate instance, attached to a dumpfile
3451     DUMPFILE_PATH (Ctx().dumpfile, if None), using Ctx().encoding.
3452
3453     If Ctx().cvs_revnums is true, then set the 'cvs2svn:cvs-revnum'
3454     property on files, when they are changed due to a corresponding
3455     CVS revision.
3456
3457     If Ctx().mime_mapper is not None, then it is a MimeMapper
3458     instance, used to determine whether or not to set the
3459     'svn:mime-type' property on files.  But even if Ctx().mime_mapper
3460     is None, files marked with the CVS 'kb' flag will receive a mime
3461     type of "application/octet-stream".
3462
3463     Unless Ctx().no_default_eol is true, set 'svn:eol-style' to
3464     'native' for files not marked with the CVS 'kb' flag, except as
3465     superseded by Ctx().eol_from_mime_type (see below).
3466
3467     If Ctx().eol_from_mime_type is not None, then set 'svn:eol-style'
3468     to 'native' for all files to which Ctx().mime_mapper assigns a
3469     mime type beginning with "text/", and don't set 'svn:eol-style'
3470     for files assigned a type not beginning with "text/".
3471     """
3472     if dumpfile_path:
3473       self.dumpfile_path = dumpfile_path
3474     else:
3475       self.dumpfile_path = Ctx().dumpfile
3476     self.path_encoding = Ctx().encoding
3477
3478     self.dumpfile = open(self.dumpfile_path, 'wb')
3479     self._write_dumpfile_header(self.dumpfile)
3480
3481   def _write_dumpfile_header(self, dumpfile):
3482     # Initialize the dumpfile with the standard headers.
3483     #
3484     # Since the CVS repository doesn't have a UUID, and the Subversion
3485     # repository will be created with one anyway, we don't specify a
3486     # UUID in the dumpflie
3487     dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
3488
3489   def _utf8_path(self, path):
3490     """Return a copy of PATH encoded in UTF-8.  PATH is assumed to be
3491     encoded in self.path_encoding."""
3492     try:
3493       # Log messages can be converted with the 'replace' strategy,
3494       # but we can't afford any lossiness here.
3495       unicode_path = unicode(path, self.path_encoding, 'strict')
3496       return unicode_path.encode('utf-8')
3497     except UnicodeError:
3498       print "Unable to convert a path '%s' to internal encoding." % path
3499       print "Consider rerunning with (for example) '--encoding=latin1'"
3500       sys.exit(1)
3501
3502   def start_commit(self, svn_commit):
3503     """Emit the start of SVN_COMMIT (an SVNCommit)."""
3504
3505     self.revision = svn_commit.revnum
3506
3507     # The start of a new commit typically looks like this:
3508     #
3509     #   Revision-number: 1
3510     #   Prop-content-length: 129
3511     #   Content-length: 129
3512     #
3513     #   K 7
3514     #   svn:log
3515     #   V 27
3516     #   Log message for revision 1.
3517     #   K 10
3518     #   svn:author
3519     #   V 7
3520     #   jrandom
3521     #   K 8
3522     #   svn:date
3523     #   V 27
3524     #   2003-04-22T22:57:58.132837Z
3525     #   PROPS-END
3526     #
3527     # Notice that the length headers count everything -- not just the
3528     # length of the data but also the lengths of the lengths, including
3529     # the 'K ' or 'V ' prefixes.
3530     #
3531     # The reason there are both Prop-content-length and Content-length
3532     # is that the former includes just props, while the latter includes
3533     # everything.  That's the generic header form for any entity in a
3534     # dumpfile.  But since revisions only have props, the two lengths
3535     # are always the same for revisions.
3536
3537     # Calculate the total length of the props section.
3538     props = svn_commit.get_revprops()
3539     prop_names = props.keys()
3540     prop_names.sort()
3541     total_len = 10  # len('PROPS-END\n')
3542     for propname in prop_names:
3543       if props[propname] is None:
3544         continue
3545       klen = len(propname)
3546       klen_len = len('K %d' % klen)
3547       vlen = len(props[propname])
3548       vlen_len = len('V %d' % vlen)
3549       # + 4 for the four newlines within a given property's section
3550       total_len = total_len + klen + klen_len + vlen + vlen_len + 4
3551
3552     # Print the revision header and props
3553     self.dumpfile.write('Revision-number: %d\n'
3554                         'Prop-content-length: %d\n'
3555                         'Content-length: %d\n'
3556                         '\n'
3557                         % (self.revision, total_len, total_len))
3558
3559     for propname in prop_names:
3560       if props[propname] is None:
3561         continue
3562       self.dumpfile.write('K %d\n'
3563                           '%s\n'
3564                           'V %d\n'
3565                           '%s\n' % (len(propname),
3566                                     propname,
3567                                     len(props[propname]),
3568                                     props[propname]))
3569
3570     self.dumpfile.write('PROPS-END\n')
3571     self.dumpfile.write('\n')
3572
3573   def mkdir(self, path):
3574     """Emit the creation of directory PATH."""
3575     self.dumpfile.write("Node-path: %s\n"
3576                         "Node-kind: dir\n"
3577                         "Node-action: add\n"
3578                         "Content-length: 10\n"
3579                         "\n"
3580                         "\n" % self._utf8_path(path))
3581
3582   def _add_or_change_path(self, s_item, op):
3583     """Emit the addition or change corresponding to S_ITEM.
3584     OP is either the constant OP_ADD or OP_CHANGE."""
3585
3586     # Validation stuffs
3587     if op == OP_ADD:
3588       action = 'add'
3589     elif op == OP_CHANGE:
3590       action = 'change'
3591     else:
3592       sys.stderr.write("%s: _add_or_change_path() called with bad op ('%s')"
3593                        % (error_prefix, op))
3594       sys.exit(1)
3595
3596     # Convenience variables
3597     c_rev = s_item.c_rev
3598     svn_props = s_item.svn_props
3599
3600     # The property handling here takes advantage of an undocumented
3601     # but IMHO consistent feature of the Subversion dumpfile-loading
3602     # code.  When a node's properties aren't mentioned (that is, the
3603     # "Prop-content-length:" header is absent, no properties are
3604     # listed at all, and there is no "PROPS-END\n" line) then no
3605     # change is made to the node's properties.
3606     #
3607     # This is consistent with the way dumpfiles behave w.r.t. text
3608     # content changes, so I'm comfortable relying on it.  If you
3609     # commit a change to *just* the properties of some node that
3610     # already has text contents from a previous revision, then in the
3611     # dumpfile output for the prop change, no "Text-content-length:"
3612     # nor "Text-content-md5:" header will be present, and the text of
3613     # the file will not be given.  But this does not cause the file's
3614     # text to be erased!  It simply remains unchanged.
3615     #
3616     # This works out great for cvs2svn, due to lucky coincidences:
3617     #
3618     # For files, the only properties we ever set are set in the first
3619     # revision; all other revisions (including on branches) inherit
3620     # from that.  After the first revision, we never change file
3621     # properties, therefore, there is no need to remember the full set
3622     # of properties on a given file once we've set it.
3623     #
3624     # For directories, the only property we set is "svn:ignore", and
3625     # while we may change it after the first revision, we always do so
3626     # based on the contents of a ".cvsignore" file -- in other words,
3627     # CVS is doing the remembering for us, so we still don't have to
3628     # preserve the previous value of the property ourselves.
3629
3630     # Calculate the (sorted-by-name) property string and length, if any.
3631     prop_contents = ''
3632     prop_names = svn_props.keys()
3633     prop_names.sort()
3634     for pname in prop_names:
3635       pval = svn_props[pname]
3636       prop_contents = prop_contents + \
3637                       'K %d\n%s\nV %d\n%s\n' \
3638                       % (len(pname), pname, len(pval), pval)
3639     if prop_contents:
3640       prop_contents = prop_contents + 'PROPS-END\n'
3641       props_len = len(prop_contents)
3642     else:
3643       props_len = 0
3644
3645     props_header = ''
3646     if props_len:
3647       props_header = 'Prop-content-length: %d\n' % props_len
3648
3649     # treat .cvsignore as a directory property
3650     dir_path, basename = os.path.split(c_rev.svn_path)
3651     if basename == ".cvsignore":
3652       ignore_vals = generate_ignores(c_rev)
3653       ignore_contents = '\n'.join(ignore_vals)
3654       ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
3655                          (len(ignore_contents), ignore_contents))
3656       ignore_contents = ignore_contents + 'PROPS-END\n'
3657       ignore_len = len(ignore_contents)
3658
3659       # write headers, then props
3660       self.dumpfile.write('Node-path: %s\n'
3661                           'Node-kind: dir\n'
3662                           'Node-action: change\n'
3663                           'Prop-content-length: %d\n'
3664                           'Content-length: %d\n'
3665                           '\n'
3666                           '%s'
3667                           % (self._utf8_path(dir_path), ignore_len,
3668                              ignore_len, ignore_contents))
3669
3670     # If the file has keywords, we must use -kk to prevent CVS/RCS from
3671     # expanding the keywords because they must be unexpanded in the
3672     # repository, or Subversion will get confused.
3673     if s_item.has_keywords:
3674       pipe_cmd, pipe = get_co_pipe(c_rev, '-kk')
3675     else:
3676       pipe_cmd, pipe = get_co_pipe(c_rev)
3677
3678     self.dumpfile.write('Node-path: %s\n'
3679                         'Node-kind: file\n'
3680                         'Node-action: %s\n'
3681                         '%s'  # no property header if no props
3682                         'Text-content-length: '
3683                         % (self._utf8_path(c_rev.svn_path),
3684                            action, props_header))
3685
3686     pos = self.dumpfile.tell()
3687
3688     self.dumpfile.write('0000000000000000\n'
3689                         'Text-content-md5: 00000000000000000000000000000000\n'
3690                         'Content-length: 0000000000000000\n'
3691                         '\n')
3692
3693     if prop_contents:
3694       self.dumpfile.write(prop_contents)
3695
3696     # Insert a filter to convert all EOLs to LFs if neccessary
3697     if s_item.needs_eol_filter:
3698       data_reader = LF_EOL_Filter(pipe.fromchild)
3699     else:
3700       data_reader = pipe.fromchild
3701
3702     # Insert the rev contents, calculating length and checksum as we go.
3703     checksum = md5.new()
3704     length = 0
3705     while True:
3706       buf = data_reader.read(PIPE_READ_SIZE)
3707       if buf == '':
3708         break
3709       checksum.update(buf)
3710       length = length + len(buf)
3711       self.dumpfile.write(buf)
3712
3713     pipe.fromchild.close()
3714     error_output = pipe.childerr.read()
3715     exit_status = pipe.wait()
3716     if exit_status:
3717       sys.exit("%s: The command '%s' failed with exit status: %s\n"
3718                "and the following output:\n"
3719                "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
3720
3721     # Go back to patch up the length and checksum headers:
3722     self.dumpfile.seek(pos, 0)
3723     # We left 16 zeros for the text length; replace them with the real
3724     # length, padded on the left with spaces:
3725     self.dumpfile.write('%16d' % length)
3726     # 16... + 1 newline + len('Text-content-md5: ') == 35
3727     self.dumpfile.seek(pos + 35, 0)
3728     self.dumpfile.write(checksum.hexdigest())
3729     # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
3730     self.dumpfile.seek(pos + 84, 0)
3731     # The content length is the length of property data, text data,
3732     # and any metadata around/inside around them.
3733     self.dumpfile.write('%16d' % (length + props_len))
3734     # Jump back to the end of the stream
3735     self.dumpfile.seek(0, 2)
3736
3737     # This record is done (write two newlines -- one to terminate
3738     # contents that weren't themselves newline-termination, one to
3739     # provide a blank line for readability.
3740     self.dumpfile.write('\n\n')
3741
3742   def add_path(self, s_item):
3743     """Emit the addition corresponding to S_ITEM, an SVNCommitItem."""
3744     self._add_or_change_path(s_item, OP_ADD)
3745
3746   def change_path(self, s_item):
3747     """Emit the change corresponding to S_ITEM, an SVNCommitItem."""
3748     self._add_or_change_path(s_item, OP_CHANGE)
3749
3750   def delete_path(self, path):
3751     """Emit the deletion of PATH."""
3752     self.dumpfile.write('Node-path: %s\n'
3753                         'Node-action: delete\n'
3754                         '\n' % self._utf8_path(path))
3755
3756   def copy_path(self, src_path, dest_path, src_revnum):
3757     """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
3758     # We don't need to include "Node-kind:" for copies; the loader
3759     # ignores it anyway and just uses the source kind instead.
3760     self.dumpfile.write('Node-path: %s\n'
3761                         'Node-action: add\n'
3762                         'Node-copyfrom-rev: %d\n'
3763                         'Node-copyfrom-path: /%s\n'
3764                         '\n'
3765                         % (self._utf8_path(dest_path),
3766                            src_revnum,
3767                            self._utf8_path(src_path)))
3768
3769   def finish(self):
3770     """Perform any cleanup necessary after all revisions have been
3771     committed."""
3772     self.dumpfile.close()
3773
3774
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting.

  Each revision is written to a temporary dumpfile which is then fed
  to an 'svnadmin load' child process through a pipe."""

  def __init__(self):
    """Create the target repository (unless Ctx().existing_svnrepos is
    set), open the temporary dumpfile, start 'svnadmin load' and send
    it the dumpfile header."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      if Ctx().fs_type and Ctx().fs_type != 'bdb':
        # User specified something other than bdb.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--fs-type=%s" % Ctx().fs_type,
                                           self.target))
      elif Ctx().fs_type:
        # User explicitly specified bdb.
        #
        # Since this is a BDB repository, pass --bdb-txn-nosync,
        # because it gives us a 4-5x speed boost (if cvs2svn is
        # creating the repository, cvs2svn should be the only program
        # accessing the svn repository (until cvs is done, at least)).
        # But we'll turn no-sync off in self.finish(), unless
        # instructed otherwise.
        run_command('%s create %s %s "%s"' % (self.svnadmin,
                                              "--fs-type=bdb",
                                              "--bdb-txn-nosync",
                                              self.target))
      else:
        # User didn't say what kind repository (bdb, fsfs, etc).
        # We still pass --bdb-txn-nosync.  It's a no-op if the default
        # repository type doesn't support it, but we definitely want
        # it if BDB is the default.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--bdb-txn-nosync",
                                           self.target))


    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.
    DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    # Opened for reading as well as writing: _feed_pipe() reads back
    # what the DumpfileDelegate methods have written.
    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = Popen3('%s load -q "%s"' % (self.svnadmin, self.target),
                              True)
    self.loader_pipe.fromchild.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.tochild)
    except IOError:
      self._loader_failed()

  def _loader_failed(self):
    """Report that writing to the svnadmin load pipe failed, relay
    whatever svnadmin printed on its stderr, and exit with status 1."""
    sys.stderr.write("%s: svnadmin failed with the following output while "
                     "loading the dumpfile:\n" % (error_prefix))
    sys.stderr.write(self.loader_pipe.childerr.read())
    sys.exit(1)

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not len(data):
        break
      try:
        self.loader_pipe.tochild.write(data)
      except IOError:
        self._loader_failed()

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, first
    load the dumpfile's current contents into the svn repository.
    Then empty the dumpfile and write the header of SVN_COMMIT into
    it."""
    if self._commit_in_progress:
      self._feed_pipe()
    # The same temporary file is reused for every revision.
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Loads the last commit into the repository, waits for svnadmin
    load to finish (exiting with an error message if it failed),
    removes the temporary dumpfile, and re-enables transaction syncing
    for a BDB repository that we created ourselves."""
    self._feed_pipe()
    self.dumpfile.close()
    self.loader_pipe.tochild.close()
    error_output = self.loader_pipe.childerr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      sys.exit('%s: svnadmin load failed with exit status: %s\n'
               'and the following output:\n'
               '%s' % (error_prefix, exit_status, error_output))
    os.remove(self.dumpfile_path)

    # If this is a BDB repository, and we created the repository, and
    # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
    # line in the DB_CONFIG file, because txn syncing should be on by
    # default in BDB repositories.
    #
    # We determine if this is a BDB repository by looking for the
    # DB_CONFIG file, which doesn't exist in FSFS, rather than by
    # checking Ctx().fs_type.  That way this code will Do The Right
    # Thing in all circumstances.
    db_config = os.path.join(self.target, "db/DB_CONFIG")
    if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
        and os.path.exists(db_config)):
      self._comment_out_txn_nosync(db_config)

  def _comment_out_txn_nosync(self, db_config):
    """Comment out the 'set_flags DB_TXN_NOSYNC' line in the file
    DB_CONFIG.  The file is left untouched if it has no such line."""
    no_sync = 'set_flags DB_TXN_NOSYNC\n'

    infile = open(db_config, 'r')
    try:
      contents = infile.readlines()
    finally:
      infile.close()

    try:
      index = contents.index(no_sync)
    except ValueError:
      # The flag is not set in this file, so there is nothing to turn
      # off.  Don't abort a conversion that has otherwise succeeded.
      return
    contents[index] = '# ' + no_sync

    outfile = open(db_config, 'w')
    try:
      outfile.writelines(contents)
    finally:
      outfile.close()
3888
3889
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """A delegate that changes nothing on disk: every callback merely
  reports, through Log(), what the SVNRepositoryMirror is doing.  The
  messages say that something is being done while the only thing
  actually done is the saying.  Kind of zen, really."""

  def __init__(self, total_revs):
    """Remember TOTAL_REVS, the figure shown as the total in the
    progress message of start_commit()."""
    self.total_revs = total_revs

  def start_commit(self, svn_commit):
    """Report the Subversion revision number of the commit that is
    being started, together with the total number of revisions."""
    progress = (svn_commit.revnum, self.total_revs)
    Log().write(LOG_VERBOSE, 60 * "=")
    Log().write(LOG_NORMAL, "Starting Subversion commit %d / %d" % progress)

  def mkdir(self, path):
    """Report that directory PATH is being created."""
    Log().write(LOG_VERBOSE, "  New Directory", path)

  def add_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being added."""
    svn_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Adding", svn_path)

  def change_path(self, s_item):
    """Report that s_item.c_rev.svn_path is being changed."""
    svn_path = s_item.c_rev.svn_path
    Log().write(LOG_VERBOSE, "  Changing", svn_path)

  def delete_path(self, path):
    """Report that PATH is being deleted."""
    Log().write(LOG_VERBOSE, "  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report, on two lines, that revision SRC_REVNUM of SRC_PATH is
    being copied to DEST_PATH."""
    Log().write(LOG_VERBOSE, "  Copying revision", src_revnum, "of", src_path)
    Log().write(LOG_VERBOSE, "                to", dest_path)

  def finish(self):
    """Report that the creation of the repository is complete."""
    Log().write(LOG_VERBOSE, "Finished creating Subversion repository.")
    Log().write(LOG_QUIET, "Done.")
3932
# This should be a local to pass1,
# but Python 2.0 does not support nested scopes.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
def pass1():
  """Pass 1: walk Ctx().cvsroot and parse every ',v' file found,
  feeding the parse results to a CollectData instance.

  Files that cannot be parsed, and files that exist both in a
  directory and in its Attic, are recorded as fatal errors; after the
  walk the symbol database is written and, if any fatal error was
  recorded or no ',v' file was found at all, the program exits with an
  explanatory message."""
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    # Callback for os.path.walk(); BATON is the CollectData instance.
    cd = baton
    attic_len = len(OS_SEP_PLUS_ATTIC)
    for fname in files:
      if fname[-2:] != ',v':
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname[-attic_len:] == OS_SEP_PLUS_ATTIC:
        # drop the 'Attic' portion from the pathname for the canonical name.
        cd.set_fname(os.path.join(dirname[:-attic_len], fname), pathname)
      else:
        # If this file also exists in the attic, it's a fatal error
        attic_path = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_path):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, pathname, attic_path)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        cd.set_fname(pathname, pathname)
      Log().write(LOG_NORMAL, pathname)
      # Close each RCS file explicitly once it has been parsed rather
      # than leaving that to the garbage collector; a repository can
      # contain a very large number of files.
      rcs_file = None
      try:
        try:
          rcs_file = open(pathname, 'rb')
          cvs2svn_rcsparse.parse(rcs_file, cd)
        except (cvs2svn_rcsparse.common.RCSParseError, ValueError,
                RuntimeError):
          err = "%s: '%s' is not a valid ,v file" \
                % (error_prefix, pathname)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        except:
          # Say which file was being parsed, then let the exception
          # propagate unchanged.
          Log().write(LOG_WARN, "Exception occurred while parsing %s" % pathname)
          raise
      finally:
        if rcs_file is not None:
          rcs_file.close()

  os.path.walk(Ctx().cvsroot, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if len(cd.fatal_errors) > 0:
    sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
             + "Error summary:\n"
             + "\n".join(cd.fatal_errors)
             + "\nExited due to fatal error(s).")

  if cd.found_valid_file is None:
    sys.exit("\nNo RCS files found in your CVS Repository!\n"
             + "Are you absolutely certain you are pointing cvs2svn\n"
             + "at a CVS repository?\n"
             + "\nExited due to fatal error(s).")

  StatsKeeper().reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
3991
def pass2():
  """Pass 2: clean up the revision information.

  First validate the symbol-related options (blocked exclusions,
  forced tags that have commits, tag/branch mismatches) and exit with
  status 1 if any check fails.  Then create the tags database and
  rewrite the revisions file into the clean-revisions file, dropping
  revisions on excluded branches, removing excluded symbols, applying
  forced tag/branch conversions and resynchronizing timestamps
  according to the resync data file."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Convert the list of regexps into the collection of excluded symbol
  # names (queried with has_key() below).
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags.keys():
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    "Read the .resync file into memory."

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list because
    # we will dynamically expand the lower/upper bound as we find commits
    # that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    for line in fileinput.FileInput(fname):
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      if resync.has_key(digest):
        resync[digest].append([t1_l, t1_u, t2])
      else:
        resync[digest] = [ [t1_l, t1_u, t2] ]

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    digests = resync.keys()
    for digest in digests:
      (resync[digest]).sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  # Predicate used to remove all references to excluded tags and
  # branches.  EXCLUDES is bound as a default argument because Python
  # 2.0 has no nested scopes; it does not change while revisions are
  # processed, so the function is defined once, outside the loop.
  def not_excluded(symbol, excludes=excludes):
    return not excludes.has_key(symbol)

  # Make sure the output file is flushed and closed before we return:
  # the next pass reads it back from disk.
  try:
    # process the revisions file, looking for items to clean up
    for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
      c_rev = CVSRevision(Ctx(), line[:-1])

      # Skip this entire revision if it's on an excluded branch
      if excludes.has_key(c_rev.branch_name):
        continue

      # Remove all references to excluded tags and branches
      c_rev.branches = filter(not_excluded, c_rev.branches)
      c_rev.tags = filter(not_excluded, c_rev.tags)

      # Convert all branches that are forced to be tags
      for forced_tag in Ctx().forced_tags:
        if forced_tag in c_rev.branches:
          c_rev.branches.remove(forced_tag)
          c_rev.tags.append(forced_tag)

      # Convert all tags that are forced to be branches
      for forced_branch in Ctx().forced_branches:
        if forced_branch in c_rev.tags:
          c_rev.tags.remove(forced_branch)
          c_rev.branches.append(forced_branch)

      # see if this is "near" any of the resync records we
      # have recorded for this digest [of the log message].
      for record in resync.get(c_rev.digest, []):
        if record[0] <= c_rev.timestamp <= record[1]:
          # bingo! remap the time on this (record[2] is the new time).

          # adjust the time range. we want the COMMIT_THRESHOLD from the
          # bounds of the earlier/latest commit in this group.
          record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
          record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

          # By default this will be the new timestamp
          new_timestamp = record[2]
          # If the new timestamp is earlier than that of our previous revision
          if record[2] < c_rev.prev_timestamp:
            desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                    + " to time %s, which is before previous the time of"
                    + " revision %s (%s):")
            Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                          c_rev.cvs_path, record[2],
                                          c_rev.prev_rev, c_rev.prev_timestamp))
            # If resyncing our rev to c_rev.prev_timestamp + 1 will place
            # the timestamp of c_rev within COMMIT_THRESHOLD of the
            # attempted sync time, then sync back to c_rev.prev_timestamp
            # + 1...
            if (c_rev.prev_timestamp - record[2]) < COMMIT_THRESHOLD:
              new_timestamp = c_rev.prev_timestamp + 1
              Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                            new_timestamp))
            # ...otherwise, make no change
            else:
              new_timestamp = c_rev.timestamp
              Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                          warning_prefix)

          # Note that the delta reported here is relative to the
          # resync record's time (record[2]), not to new_timestamp.
          msg = "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
                % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                   record[2] - c_rev.timestamp)
          Log().write(LOG_VERBOSE, msg)

          c_rev.timestamp = new_timestamp

          # stop looking for hits
          break

      output.write(str(c_rev) + "\n")
  finally:
    output.close()
  Log().write(LOG_QUIET, "Done")
4184
def pass3():
  """Pass 3: sort the clean-revisions file into the sorted-revisions
  file, and register the sorted file with Cleanup()."""
  Log().write(LOG_QUIET, "Sorting CVS revisions...")
  clean_revs_name = DATAFILE + CLEAN_REVS_SUFFIX
  sorted_revs_name = DATAFILE + SORTED_REVS_SUFFIX
  sort_file(temp(clean_revs_name), temp(sorted_revs_name))
  Cleanup().register(temp(sorted_revs_name), pass5)
  Log().write(LOG_QUIET, "Done")
4191
def pass4():
  """Pass 4: read the sorted revisions file and log every CVSRevision
  in a CVSRevisionDatabase.

  Unless this is a trunk-only conversion, the revisions are also
  logged in a LastSymbolicNameDatabase, which records the last
  CVSRevision that is a source for each tag or branch.
  """
  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  cvs_revs_db = CVSRevisionDatabase(DB_OPEN_NEW)
  if Ctx().trunk_only:
    # A do-nothing stand-in, so that Ctx().trunk_only need not be
    # tested every time around the loop below.
    class DummyLSNDB:
      def log_revision(*args):
        pass
      def create_database(*args):
        pass
    last_sym_name_db = DummyLSNDB()
  else:
    Log().write(LOG_QUIET,
        "and finding last CVS revisions for all symbolic names...")
    last_sym_name_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = temp(DATAFILE + SORTED_REVS_SUFFIX)
  for line in fileinput.FileInput(sorted_revs):
    # line[:-1] drops the last character of the line (the newline).
    c_rev = CVSRevision(Ctx(), line[:-1])
    cvs_revs_db.log_revision(c_rev)
    last_sym_name_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  last_sym_name_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4222
def pass5():
  """
  Pass 5: generate the SVNCommit <-> CVSRevision mapping databases by
  feeding the sorted revisions to a CVSRevisionAggregator.
  CVSCommit._commit also calls SymbolingsLogger to register
  CVSRevisions that represent an opening or closing for a path on a
  branch or tag.  See SymbolingsLogger for more details.
  """
  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = temp(DATAFILE + SORTED_REVS_SUFFIX)
  for line in fileinput.FileInput(sorted_revs):
    c_rev = CVSRevision(Ctx(), line[:-1])
    # In a trunk-only conversion, revisions that have a branch name
    # are not processed.
    if not Ctx().trunk_only or c_rev.branch_name is None:
      aggregator.process_revision(c_rev)
  aggregator.flush()

  svn_rev_count = SVNCommit.revnum - 1
  StatsKeeper().set_svn_rev_count(svn_rev_count)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4242
def pass6():
  """Pass 6: sort the symbol openings/closings file and register the
  sorted file with Cleanup().  The sorting is skipped entirely in a
  trunk-only conversion."""
  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  if not Ctx().trunk_only:
    unsorted_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    sorted_path = temp(SYMBOL_OPENINGS_CLOSINGS_SORTED)
    sort_file(unsorted_path, sorted_path)
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), pass8)
  Log().write(LOG_QUIET, "Done")
4251
def pass7():
  """Pass 7: record, for every symbolic name, the file offset at which
  its first line appears in the sorted symbol openings/closings file.
  Nothing is generated in a trunk-only conversion."""
  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    # Use a name that does not shadow the builtin 'file', and close
    # the file explicitly when done.
    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      while 1:
        # The offset has to be taken before the line is read, so that
        # it points at the start of the line.
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only the symbol name is used, but unpacking all three fields
        # also rejects lines that do not have them.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4285
def pass8():
  """Replay every stored SVNCommit, in revision order, into the
  repository mirror.

  The mirror gets a RepositoryDelegate when Ctx().target is set and a
  DumpfileDelegate otherwise (neither for a dry run), plus a
  StdoutDelegate in all cases."""
  ctx = Ctx()
  mirror = SVNRepositoryMirror()
  commit_store = PersistenceManager(DB_OPEN_READ)

  if ctx.target:
    delegate_class = RepositoryDelegate
    banner = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    banner = "Starting Subversion Dumpfile."
  if not ctx.dry_run:
    mirror.add_delegate(delegate_class())
  Log().write(LOG_QUIET, banner)

  mirror.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  # Repository initialization is revision 1, so commits start at 2.
  revnum = 2
  svn_commit = commit_store.get_svn_commit(revnum)
  while svn_commit:
    mirror.commit(svn_commit)
    revnum += 1
    svn_commit = commit_store.get_svn_commit(revnum)

  mirror.finish()
4310
# The conversion passes, in execution order.  Pass number N (1-based, as
# given to the -p option) is _passes[N - 1].
_passes = [
  pass1, pass2, pass3, pass4,
  pass5, pass6, pass7, pass8,
  ]
4321
4322
class Ctx:
  """Session state for this run of cvs2svn.  For example, run-time
  options are stored here.  This class is a Borg, see
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531.

  Every instance shares one attribute dictionary, so a value set
  through any instance is visible through all of them.  The defaults
  below are installed only by the first instantiation.
  """
  __shared_state = { }

  def __init__(self):
    shared = self.__shared_state
    self.__dict__ = shared
    if shared:
      # An earlier instance already installed the defaults.
      return

    # Input and output locations.
    self.cvsroot = None
    self.target = None
    self.dumpfile = DUMPFILE
    self.tmpdir = '.'
    self.existing_svnrepos = 0
    self.dump_only = 0
    self.dry_run = 0

    # Layout of the resulting repository.
    self.trunk_only = 0
    self.trunk_base = "trunk"
    self.tags_base = "tags"
    self.branches_base = "branches"
    self.prune = 1

    # Symbol handling.
    self.forced_branches = []
    self.forced_tags = []
    self.excludes = []
    self.symbol_transforms = []

    # File properties and log messages.
    self.encoding = "ascii"
    self.mime_types_file = None
    self.mime_mapper = None
    self.no_default_eol = 0
    self.eol_from_mime_type = 0
    self.keywords_off = 0
    self.cvs_revnums = 0
    self.username = None

    # External programs and repository creation.
    self.use_cvs = None
    self.svnadmin = "svnadmin"
    self.bdb_txn_nosync = 0
    self.fs_type = None

    # Behaviour of this run.
    self.verbose = 0
    self.quiet = 0
    self.print_help = 0
    self.skip_cleanup = 0
4366
class MimeMapper:
  """A class that provides mappings from file names to MIME types.
  Note that we should really be using Python's 'mimetypes' module.
  See http://cvs2svn.tigris.org/servlets/ReadMsg?list=dev&msgNo=266
  for more."""

  def __init__(self):
    # Maps a file extension (without the dot) or a whole extensionless
    # base name to a MIME type string.
    self.mappings = { }

  def set_mime_types_file(self, mime_types_file):
    """Add the mappings found in MIME_TYPES_FILE to self.mappings.

    Each useful line names a MIME type followed by one or more
    extensions.  Lines starting with '#' and lines with fewer than two
    fields are ignored.  When an extension is mapped to two different
    types, a warning is written to stderr and the later type wins."""
    # Use a private FileInput rather than the module-global
    # fileinput.input(), which refuses to start while another
    # fileinput.input() is still active.
    lines = fileinput.FileInput(mime_types_file)
    try:
      for line in lines:
        if line.startswith("#"):
          continue

        # format of a line is something like
        # text/plain c h cpp
        fields = line.split()
        if len(fields) < 2:
          continue
        mime_type = fields[0]
        for ext in fields[1:]:
          known_type = self.mappings.get(ext)
          if known_type is not None and known_type != mime_type:
            sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n" \
                             % (warning_prefix, ext, known_type, mime_type))
          self.mappings[ext] = mime_type
    finally:
      lines.close()

  def get_type_from_filename(self, filename):
    """Return the MIME type mapped to FILENAME, or None if there is no
    mapping for it."""
    basename, extension = os.path.splitext(os.path.basename(filename))

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename
    return self.mappings.get(extension)
4409
4410
def convert(start_pass, end_pass):
  """Convert a CVS repository to an SVN repository.

  Run passes START_PASS through END_PASS (1-based, inclusive) from
  _passes.  After each pass its duration is recorded with StatsKeeper,
  the per-pass attributes of Ctx() are discarded, and (unless
  Ctx().skip_cleanup is set) Cleanup is run for that pass.  The
  statistics are logged and the timings printed at the end."""
  cleanup = Cleanup()
  previous_mark = time.time()
  StatsKeeper().set_start_time(time.time())

  for pass_number in range(start_pass, end_pass + 1):
    run_pass = _passes[pass_number - 1]
    Log().write(LOG_QUIET, '----- pass %d -----' % pass_number)
    run_pass()

    # A pass is timed from the end of the previous one (or from the
    # start of the conversion, for the first pass run).
    mark = time.time()
    StatsKeeper().log_duration_for_pass(mark - previous_mark, pass_number)
    previous_mark = mark

    # Dispose of items in Ctx() not intended to live past the end of the
    # pass (identified by exactly one leading underscore).
    ctx = Ctx()
    for attr in dir(ctx):
      if len(attr) <= 2 or attr[0] != '_' or attr[1] == '_':
        continue
      if attr[:6] == "_Ctx__":
        # A name-mangled private of the Ctx class itself; keep it.
        continue
      delattr(ctx, attr)

    if not ctx.skip_cleanup:
      cleanup.cleanup(run_pass)
    StatsKeeper().set_end_time(time.time())

  Log().write(LOG_QUIET, StatsKeeper())
  if end_pass < 4:
    Log().write(LOG_QUIET,
                '(These are unaltered CVS repository stats and do not\n'
                ' reflect tags or branches excluded via --exclude)\n')
  print(StatsKeeper().timings())
4438
4439
def usage():
  """Print the command synopsis and the list of options to stdout.

  The defaults shown for --trunk, --branches, --tags and --encoding are
  the values currently held by Ctx()."""
  ctx = Ctx()
  help_lines = [
    'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path'
    % os.path.basename(sys.argv[0]),
    '  --help, -h           print this usage message and exit with success',
    '  --version            print the version number',
    '  -q                   quiet',
    '  -v                   verbose',
    '  -s PATH              path for SVN repos',
    '  -p START[:END]       start at pass START, end at pass END of %d'
    % len(_passes),
    '                       If only START is given, run only pass START',
    '                       (implicitly enables --skip-cleanup)',
    '  --existing-svnrepos  load into existing SVN repository',
    '  --dumpfile=PATH      name of intermediate svn dumpfile',
    '  --tmpdir=PATH        directory to use for tmp data (default to cwd)',
    '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)',
    '  --dry-run            do not create a repository or a dumpfile;',
    '                       just print what would happen.',
    '  --use-cvs            use CVS instead of RCS \'co\' to extract data',
    '                       (only use this if having problems with RCS)',
    '  --svnadmin=PATH      path to the svnadmin program',
    '  --trunk-only         convert only trunk commits, not tags nor branches',
    '  --trunk=PATH         path for trunk (default: %s)' % ctx.trunk_base,
    '  --branches=PATH      path for branches (default: %s)'
    % ctx.branches_base,
    '  --tags=PATH          path for tags (default: %s)' % ctx.tags_base,
    '  --no-prune           don\'t prune empty directories',
    '  --dump-only          just produce a dumpfile, don\'t commit to a repos',
    '  --encoding=ENC       encoding of log messages in CVS repos (default: %s)'
    % ctx.encoding,
    '  --force-branch=NAME  force NAME to be a branch',
    '  --force-tag=NAME     force NAME to be a tag',
    '  --exclude=REGEXP     exclude branches and tags matching REGEXP',
    '  --symbol-transform=P:S transform symbol names from P to S where P and S',
    '                       use Python regexp and reference syntax respectively',
    '  --username=NAME      username for cvs2svn-synthesized commits',
    '  --skip-cleanup       prevent the deletion of intermediate files',
    '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"',
    '  --fs-type=TYPE       pass --fs-type=TYPE to "svnadmin create"',
    '  --cvs-revnums        record CVS revision numbers as file properties',
    '  --mime-types=FILE    specify an apache-style mime.types file for',
    '                       setting svn:mime-type',
    '  --eol-from-mime-type set svn:eol-style by mime type (only with --mime-types)',
    '  --no-default-eol     don\'t set svn:eol-style by CVS defaults',
    '  --keywords-off       don\'t set svn:keywords on any files (cvs2svn sets',
    '                       "svn:keywords to author date id" on non-binary files',
    '                       by default)',
    ]
  # One print of the joined lines emits the same text as printing each
  # line separately.
  print('\n'.join(help_lines))
4488
4489 def main():
4490   # Convenience var, so we don't have to keep instantiating this Borg.
4491   ctx = Ctx()
4492
4493   profiling = None
4494   start_pass = 1
4495   end_pass = len(_passes)
4496
4497   try:
4498     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
4499                                [ "help", "create", "trunk=",
4500                                  "username=", "existing-svnrepos",
4501                                  "branches=", "tags=", "encoding=",
4502                                  "force-branch=", "force-tag=", "exclude=",
4503                                  "use-cvs", "mime-types=",
4504                                  "eol-from-mime-type", "no-default-eol",
4505                                  "trunk-only", "no-prune", "dry-run",
4506                                  "dump-only", "dumpfile=", "tmpdir=",
4507                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
4508                                  "bdb-txn-nosync", "fs-type=",
4509                                  "version", "profile",
4510                                  "keywords-off", "symbol-transform="])
4511   except getopt.GetoptError, e:
4512     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
4513     usage()
4514     sys.exit(1)
4515
4516   for opt, value in opts:
4517     if opt == '--version':
4518         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
4519         sys.exit(0)
4520     elif opt == '-p':
4521       # Don't cleanup if we're doing incrementals.
4522       ctx.skip_cleanup = 1
4523       if value.find(':') > 0:
4524         start_pass, end_pass = map(int, value.split(':'))
4525       else:
4526         end_pass = start_pass = int(value)
4527       if start_pass > len(_passes) or start_pass < 1:
4528         print '%s: illegal value (%d) for starting pass. '\
4529               'must be 1 through %d.' % (error_prefix, int(start_pass),
4530                                          len(_passes))
4531         sys.exit(1)
4532       if end_pass < start_pass or end_pass > len(_passes):
4533         print '%s: illegal value (%d) for ending pass. ' \
4534               'must be %d through %d.' % (error_prefix, int(end_pass),
4535                                           int(start_pass), len(_passes))
4536         sys.exit(1)
4537     elif (opt == '--help') or (opt == '-h'):
4538       ctx.print_help = 1
4539     elif opt == '-v':
4540       Log().log_level = LOG_VERBOSE
4541       ctx.verbose = 1
4542     elif opt == '-q':
4543       Log().log_level = LOG_QUIET
4544       ctx.quiet = 1
4545     elif opt == '-s':
4546       ctx.target = value
4547     elif opt == '--existing-svnrepos':
4548       ctx.existing_svnrepos = 1
4549     elif opt == '--dumpfile':
4550       ctx.dumpfile = value
4551     elif opt == '--tmpdir':
4552       ctx.tmpdir = value
4553     elif opt == '--use-cvs':
4554       ctx.use_cvs = 1
4555     elif opt == '--svnadmin':
4556       ctx.svnadmin = value
4557     elif opt == '--trunk-only':
4558       ctx.trunk_only = 1
4559     elif opt == '--trunk':
4560       if not value:
4561         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4562       ctx.trunk_base = value
4563     elif opt == '--branches':
4564       if not value:
4565         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4566       ctx.branches_base = value
4567     elif opt == '--tags':
4568       if not value:
4569         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4570       ctx.tags_base = value
4571     elif opt == '--no-prune':
4572       ctx.prune = None
4573     elif opt == '--dump-only':
4574       ctx.dump_only = 1
4575     elif opt == '--dry-run':
4576       ctx.dry_run = 1
4577     elif opt == '--encoding':
4578       ctx.encoding = value
4579     elif opt == '--force-branch':
4580       ctx.forced_branches.append(value)
4581     elif opt == '--force-tag':
4582       ctx.forced_tags.append(value)
4583     elif opt == '--exclude':
4584       try:
4585         ctx.excludes.append(re.compile('^' + value + '$'))
4586       except re.error, e:
4587         sys.exit(error_prefix + ": '%s' is not a valid regexp.\n" % (value))
4588     elif opt == '--mime-types':
4589       ctx.mime_types_file = value
4590     elif opt == '--eol-from-mime-type':
4591       ctx.eol_from_mime_type = 1
4592     elif opt == '--no-default-eol':
4593       ctx.no_default_eol = 1
4594     elif opt == '--keywords-off':
4595       ctx.keywords_off = 1
4596     elif opt == '--username':
4597       ctx.username = value
4598     elif opt == '--skip-cleanup':
4599       ctx.skip_cleanup = 1
4600     elif opt == '--cvs-revnums':
4601       ctx.cvs_revnums = 1
4602     elif opt == '--bdb-txn-nosync':
4603       ctx.bdb_txn_nosync = 1
4604     elif opt == '--fs-type':
4605       ctx.fs_type = value
4606     elif opt == '--create':
4607       sys.stderr.write(warning_prefix +
4608           ': The behaviour produced by the --create option is now the '
4609           'default,\nand passing the option is deprecated.\n')
4610     elif opt == '--profile':
4611       profiling = 1
4612     elif opt == '--symbol-transform':
4613       ctx.symbol_transforms.append(value.split(":"))
4614
4615   if ctx.print_help:
4616     usage()
4617     sys.exit(0)
4618
4619   # Consistency check for options and arguments.
4620   if len(args) == 0:
4621     usage()
4622     sys.exit(1)
4623
4624   if len(args) > 1:
4625     sys.stderr.write(error_prefix +
4626                      ": must pass only one CVS repository.\n")
4627     usage()
4628     sys.exit(1)
4629
4630   ctx.cvsroot = args[0]
4631
4632   if not os.path.isdir(ctx.cvsroot):
4633     sys.stderr.write(error_prefix +
4634                      ": the given CVS repository path '%s' is not an "
4635                      "existing directory.\n" % ctx.cvsroot)
4636     sys.exit(1)
4637
4638   if ctx.use_cvs:
4639     # Ascend above the specified root if necessary, to find the cvs_repository
4640     # (a directory containing a CVSROOT directory) and the cvs_module (the
4641     # path of the conversion root within the cvs repository)
4642     # NB: cvs_module must be seperated by '/' *not* by os.sep .
4643     ctx.cvs_repository = os.path.abspath(ctx.cvsroot)
4644     prev_cvs_repository = None
4645     ctx.cvs_module = ""
4646     while prev_cvs_repository != ctx.cvs_repository:
4647       if os.path.isdir(os.path.join(ctx.cvs_repository, 'CVSROOT')):
4648         break
4649       prev_cvs_repository = ctx.cvs_repository
4650       ctx.cvs_repository, module_component = os.path.split(ctx.cvs_repository)
4651       ctx.cvs_module = module_component + "/" + ctx.cvs_module
4652     else:
4653       # Hit the root (of the drive, on Windows) without finding a CVSROOT dir.
4654       sys.stderr.write(error_prefix +
4655                        ": the path '%s' is not a CVS repository, nor a path " \
4656                        "within a CVS repository.  A CVS repository contains " \
4657                        "a CVSROOT directory within its root directory.\n" \
4658                        % ctx.cvsroot)
4659       sys.exit(1)
4660     os.environ['CVSROOT'] = ctx.cvs_repository
4661
4662   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
4663     sys.stderr.write(error_prefix +
4664                      ": must pass one of '-s' or '--dump-only'.\n")
4665     sys.exit(1)
4666
4667   def not_both(opt1val, opt1name, opt2val, opt2name):
4668     if opt1val and opt2val:
4669       sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
4670                        % (opt1name, opt2name))
4671       sys.exit(1)
4672
4673   not_both(ctx.target, '-s',
4674            ctx.dump_only, '--dump-only')
4675
4676   not_both(ctx.dump_only, '--dump-only',
4677            ctx.existing_svnrepos, '--existing-svnrepos')
4678
4679   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
4680            ctx.existing_svnrepos, '--existing-svnrepos')
4681
4682   not_both(ctx.dump_only, '--dump-only',
4683            ctx.bdb_txn_nosync, '--bdb-txn-nosync')
4684
4685   not_both(ctx.quiet, '-q',
4686            ctx.verbose, '-v')
4687
4688   not_both(ctx.fs_type, '--fs-type',
4689            ctx.existing_svnrepos, '--existing-svnrepos')
4690
4691   if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
4692     sys.stderr.write(error_prefix +
4693                      ": cannot pass --bdb-txn-nosync with --fs-type=%s.\n" \
4694                      % ctx.fs_type)
4695     sys.exit(1)
4696
4697   if ((string.find(ctx.trunk_base, '/') > -1)
4698       or (string.find(ctx.tags_base, '/') > -1)
4699       or (string.find(ctx.branches_base, '/') > -1)):
4700     sys.stderr.write("%s: cannot pass multicomponent path to "
4701                      "--trunk, --tags, or --branches yet.\n"
4702                      "  See http://cvs2svn.tigris.org/issues/show_bug.cgi?"
4703                      "id=7 for details.\n" % error_prefix)
4704     sys.exit(1)
4705
4706   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
4707     sys.stderr.write(error_prefix +
4708                      ": the svn-repos-path '%s' is not an "
4709                      "existing directory.\n" % ctx.target)
4710     sys.exit(1)
4711
4712   if not ctx.dump_only and not ctx.existing_svnrepos \
4713      and (not ctx.dry_run) and os.path.exists(ctx.target):
4714     sys.stderr.write(error_prefix +
4715                      ": the svn-repos-path '%s' exists.\nRemove it, or pass "
4716                      "'--existing-svnrepos'.\n" % ctx.target)
4717     sys.exit(1)
4718
4719   if ctx.mime_types_file:
4720     ctx.mime_mapper = MimeMapper()
4721     ctx.mime_mapper.set_mime_types_file(ctx.mime_types_file)
4722
4723   # Make sure the tmp directory exists.  Note that we don't check if
4724   # it's empty -- we want to be able to use, for example, "." to hold
4725   # tempfiles.  But if we *did* want check if it were empty, we'd do
4726   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
4727   if not os.path.exists(ctx.tmpdir):
4728     os.mkdir(ctx.tmpdir)
4729   elif not os.path.isdir(ctx.tmpdir):
4730     sys.stderr.write(error_prefix +
4731        ": cvs2svn tried to use '%s' for temporary files, but that path\n"
4732        "  exists and is not a directory.  Please make it be a directory,\n"
4733        "  or specify some other directory for temporary files.\n" \
4734                      % ctx.tmpdir)
4735     sys.exit(1)
4736
4737   if ctx.use_cvs:
4738     def cvs_ok():
4739       pipe = Popen3('cvs %s --version' % Ctx().cvs_global_arguments, True)
4740       pipe.tochild.close()
4741       pipe.fromchild.read()
4742       errmsg = pipe.childerr.read()
4743       status = pipe.wait()
4744       ok = len(errmsg) == 0 and status == 0
4745       return (ok, status, errmsg)
4746
4747     ctx.cvs_global_arguments = "-q -R"
4748     ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4749     if not ok:
4750       ctx.cvs_global_arguments = "-q"
4751       ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4752
4753     if not ok:
4754       sys.stderr.write(error_prefix +
4755                        ": error executing CVS: status %s, error output:\n" \
4756                        % (cvs_exitstatus) + cvs_errmsg)
4757
4758   # But do lock the tmpdir, to avoid process clash.
4759   try:
4760     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4761   except OSError, e:
4762     if e.errno == errno.EACCES:
4763       sys.stderr.write(error_prefix + ": Permission denied:"
4764                        + " No write access to output directory.\n")
4765       sys.exit(1)
4766     if e.errno == errno.EEXIST:
4767       sys.stderr.write(error_prefix +
4768           ": cvs2svn is using directory '%s' for temporary files, but\n"
4769           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
4770           "  cvs2svn process is currently using '%s' as its temporary\n"
4771           "  workspace.  If you are certain that is not the case,\n"
4772           "  then remove the '%s/cvs2svn.lock' subdirectory.\n" \
4773                        % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir))
4774       sys.exit(1)
4775     raise
4776   try:
4777     if profiling:
4778       import hotshot
4779       prof = hotshot.Profile('cvs2svn.hotshot')
4780       prof.runcall(convert, start_pass, end_pass)
4781       prof.close()
4782     else:
4783       convert(start_pass, end_pass)
4784   finally:
4785     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4786     except: pass
4787
# Run the conversion only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
  main()