cvs2svn

   1 #!/usr/bin/env python
   2 # (Be in -*- python -*- mode.)
   3 #
   4 # cvs2svn: ...
   5 #
   6 # ====================================================================
   7 # Copyright (c) 2000-2004 CollabNet.  All rights reserved.
   8 #
   9 # This software is licensed as described in the file COPYING, which
  10 # you should have received as part of this distribution.  The terms
  11 # are also available at http://subversion.tigris.org/license-1.html.
  12 # If newer versions of this license are posted there, you may use a
  13 # newer version instead, at your option.
  14 #
  15 # This software consists of voluntary contributions made by many
  16 # individuals.  For exact contribution history, see the revision
  17 # history and logs, available at http://cvs2svn.tigris.org/.
  18 # ====================================================================
  19
# The string below is a Subversion keyword.  Once expanded it reads
# "LastChangedRevision: NNNN" wrapped in dollar signs, and the slice
# [22:-2] keeps just the NNNN part.  If the keyword is not expanded
# the slice is empty and VERSION is simply 'r'.
VERSION = 'r' + "$LastChangedRevision$"[22:-2]
  21
  22 import cvs2svn_rcsparse
  23 import os
  24 import sys
  25 import sha
  26 import re
  27 import time
  28 import fileinput
  29 import string
  30 import getopt
  31 import stat
  32 import md5
  33 import marshal
  34 import errno
  35 import popen2
  36
  37 # Warnings and errors start with these strings.  They are typically
  38 # followed by a colon and a space, as in "%s: " ==> "WARNING: ".
  39 warning_prefix = "WARNING"
  40 error_prefix = "ERROR"
  41
  42 # Make sure this Python is recent enough.
  43 if sys.hexversion < 0x2000000:
  44   sys.stderr.write("'%s: Python 2.0 or higher required, "
  45                    "see www.python.org.\n" % error_prefix)
  46   sys.exit(1)
  47
  48 # Pretend we have true booleans on older python versions
  49 try:
  50   True
  51 except:
  52   True = 1
  53   False = 0
  54
  55 # Minimal, incomplete, version of popen2.Popen3 for those platforms
  56 # for which popen2 does not provide it.
  57 try:
  58   Popen3 = popen2.Popen3
  59 except AttributeError:
  60   class Popen3:
  61     def __init__(self, cmd, capturestderr):
  62       if type(cmd) != str:
  63         cmd = " ".join(cmd)
  64       self.fromchild, self.tochild, self.childerr = popen2.popen3(cmd,
  65                                                                   mode='b')
  66     def wait(self):
  67       return self.fromchild.close() or self.tochild.close() or \
  68              self.childerr.close()
  69
# DBM module selection
#
# The three steps below run at import time and depend on their order.

# 1. If we have bsddb3, it is probably newer than bsddb.  Fake bsddb = bsddb3,
#    so that the dbhash module used by anydbm will use bsddb3.  The alias
#    is installed in sys.modules before anydbm is imported in step 2.
try:
  import bsddb3
  sys.modules['bsddb'] = sys.modules['bsddb3']
except ImportError:
  pass

# 2. These DBM modules are not good for cvs2svn.  If anydbm picked
#    dumbdbm or dbm as its default, report the problem and exit.
import anydbm
if (anydbm._defaultmod.__name__ == 'dumbdbm'
    or anydbm._defaultmod.__name__ == 'dbm'):
  print 'ERROR: your installation of Python does not contain a suitable'
  print '  DBM module. This script cannot continue.'
  print '  to solve: see http://python.org/doc/current/lib/module-anydbm.html'
  print '  for details.'
  sys.exit(1)

# 3. If we are using the old bsddb185 module, then try to prefer gdbm
#    instead.  Unfortunately, gdbm appears not to be trouble free, either.
#    The old module is detected by its bsddb attribute lacking a
#    __version__.  If gdbm cannot be imported we only warn and carry on
#    with the module anydbm already chose.
if hasattr(anydbm._defaultmod, 'bsddb') \
    and not hasattr(anydbm._defaultmod.bsddb, '__version__'):
  try:
    gdbm = __import__('gdbm')
  except ImportError:
    sys.stderr.write(warning_prefix +
        ': The version of the bsddb module found '
        'on your computer has been reported to malfunction on some datasets, '
        'causing KeyError exceptions. You may wish to upgrade your Python to '
        'version 2.3 or later.\n')
  else:
    anydbm._defaultmod = gdbm
 104
 105 trunk_rev = re.compile('^[0-9]+\\.[0-9]+$')
 106 branch_tag = re.compile('^[0-9.]+\\.0\\.[0-9]+$')
 107 vendor_tag = re.compile('^[0-9]+\\.[0-9]+\\.[0-9]+$')
 108
 109 SVN_KEYWORDS_VALUE = 'Author Date Id Revision'
 110
 111 # This really only matches standard '1.1.1.*'-style vendor revisions.
 112 # One could conceivably have a file whose default branch is 1.1.3 or
 113 # whatever, or was that at some point in time, with vendor revisions
 114 # 1.1.3.1, 1.1.3.2, etc.  But with the default branch gone now (which
 115 # is the only time this regexp gets used), we'd have no basis for
 116 # assuming that the non-standard vendor branch had ever been the
 117 # default branch anyway, so we don't want this to match them anyway.
 118 vendor_revision = re.compile('^(1\\.1\\.1)\\.([0-9])+$')
 119
# If this run's output is a repository, then (in the tmpdir) we use
# a dumpfile of this name for repository loads.
#
# If this run's output is a dumpfile, then this is the default name of
# that dumpfile, but in the current directory (unless the user has
# specified a dumpfile path, of course, in which case it will be
# wherever the user said).
DUMPFILE = 'cvs2svn-dump'

# This file appears with different suffixes at different stages of
# processing.  CVS revisions are cleaned and sorted here, for commit
# grouping.  See design-notes.txt for details.
DATAFILE = 'cvs2svn-data'

# This file contains a marshalled copy of all the statistics that we
# gather throughout the various runs of cvs2svn.  The data is stored
# as a marshalled dictionary.
STATISTICS_FILE = 'cvs2svn-statistics'

# This text file contains records (1 per line) that describe svn
# filesystem paths that are the opening and closing source revisions
# for copies to tags and branches.  The format is as follows:
#
# SYMBOL_NAME SVN_REVNUM TYPE SVN_PATH
#
# Where type is either OPENING or CLOSING.  The SYMBOL_NAME and
# SVN_REVNUM are the primary and secondary sorting criteria for
# creating SYMBOL_OPENINGS_CLOSINGS_SORTED.
SYMBOL_OPENINGS_CLOSINGS = 'cvs2svn-symbolic-names.txt'
# A sorted version of the above file.
SYMBOL_OPENINGS_CLOSINGS_SORTED = 'cvs2svn-symbolic-names-s.txt'

# This file is a temporary file for storing symbolic_name -> closing
# CVSRevision until the end of our pass where we can look up the
# corresponding SVNRevNum for the closing revs and write these out to
# the SYMBOL_OPENINGS_CLOSINGS.
SYMBOL_CLOSINGS_TMP = 'cvs2svn-symbolic-names-closings-tmp.txt'

# Skeleton version of an svn filesystem.
# (These supersede and will eventually replace the two above.)
# See class SVNRepositoryMirror for how these work.
SVN_MIRROR_REVISIONS_DB = 'cvs2svn-svn-revisions.db'
SVN_MIRROR_NODES_DB = 'cvs2svn-svn-nodes.db'

# Offsets pointing to the beginning of each SYMBOLIC_NAME in
# SYMBOL_OPENINGS_CLOSINGS_SORTED
SYMBOL_OFFSETS_DB = 'cvs2svn-symbolic-name-offsets.db'
 167
# Maps CVSRevision.unique_key()s to lists of symbolic names, where
# the CVSRevision is the last such that is a source for those symbolic
# names.  For example, if branch B's number is 1.3.0.2 in this CVS
# file, and this file's 1.3 is the latest (by date) revision among
# *all* CVS files that is a source for branch B, then the
# CVSRevision.unique_key() corresponding to this file at 1.3 would
# list at least B in its list.
SYMBOL_LAST_CVS_REVS_DB = 'cvs2svn-symbol-last-cvs-revs.db'

# Maps CVSRevision.unique_key() to corresponding line in s-revs.
###PERF Or, we could map to an offset into s-revs, instead of dup'ing
### the s-revs data in this database.
CVS_REVS_DB = 'cvs2svn-cvs-revs.db'

# Lists all symbolic names that are tags.  Keys are strings (symbolic
# names), values are ignorable.
TAGS_DB = 'cvs2svn-tags.db'

# A list of all tags.  Each line consists of the tag name and the
# number of files in which it exists, separated by a space.
TAGS_LIST = 'cvs2svn-tags.txt'

# A list of all branches.  The file is stored as a plain text file
# to make it easy to look at in an editor.  Each line contains the
# branch name, the number of files where the branch is created, the
# commit count, and a list of tags and branches that are defined on
# revisions in the branch.
BRANCHES_LIST = 'cvs2svn-branches.txt'

# These two databases provide a bidirectional mapping between
# CVSRevision.unique_key()s and Subversion revision numbers.
#
# The first maps CVSRevision.unique_key() to a number; the values are
# not unique.
#
# The second maps a number to a list of CVSRevision.unique_key()s.
CVS_REVS_TO_SVN_REVNUMS = 'cvs2svn-cvs-revs-to-svn-revnums.db'
SVN_REVNUMS_TO_CVS_REVS = 'cvs2svn-svn-revnums-to-cvs-revs.db'

# This database maps svn_revnums to tuples of (symbolic_name, date).
#
# The svn_revnums are the revision numbers of all non-primary
# SVNCommits.  No primary SVNCommit has a key in this database.
#
# The date is stored for all commits in this database.
#
# For commits that fill symbolic names, the symbolic_name is stored.
# For commits that are default branch syncs, the symbolic_name is None.
SVN_COMMIT_NAMES_DATES = 'cvs2svn-svn-commit-names-and-dates.db'

# This database maps svn_revnums of a default branch synchronization
# commit to the svn_revnum of the primary SVNCommit that motivated it.
#
# (NOTE: Secondary commits that fill branches and tags also have a
# motivating commit, but we do not record it because it is (currently)
# not needed for anything.)
#
# This mapping is used when generating the log message for the commit
# that synchronizes the default branch with trunk.
MOTIVATING_REVNUMS = 'cvs2svn-svn-motivating-commit-revnums.db'
 228
# How many bytes to read at a time from a pipe.  128 kiB should be
# large enough to be efficient without wasting too much memory.
PIPE_READ_SIZE = 128 * 1024

# Record the default RCS branches, if any, for CVS filepaths.
#
# The keys are CVS filepaths, relative to the top of the repository
# and with the ",v" stripped off, so they match the cvs paths used in
# Commit.commit().  The values are vendor branch revisions, such as
# '1.1.1.1', or '1.1.1.2', or '1.1.1.96'.  The vendor branch revision
# represents the highest vendor branch revision thought to have ever
# been head of the default branch.
#
# The reason we record a specific vendor revision, rather than a
# default branch number, is that there are two cases to handle:
#
# One case is simple.  The RCS file lists a default branch explicitly
# in its header, such as '1.1.1'.  In this case, we know that every
# revision on the vendor branch is to be treated as head of trunk at
# that point in time.
#
# But there's also a degenerate case.  The RCS file does not currently
# have a default branch, yet we can deduce that for some period in the
# past it probably *did* have one.  For example, the file has vendor
# revisions 1.1.1.1 -> 1.1.1.96, all of which are dated before 1.2,
# and then it has 1.1.1.97 -> 1.1.1.100 dated after 1.2.  In this
# case, we should record 1.1.1.96 as the last vendor revision to have
# been the head of the default branch.
DEFAULT_BRANCHES_DB = 'cvs2svn-default-branches.db'

# Records the author and log message for each changeset.
# The keys are author+log digests, the same kind used to identify
# unique revisions in the .revs, etc files.  Each value is a tuple
# of two elements: '(author logmessage)'.
METADATA_DB = "cvs2svn-metadata.db"

# Filename suffixes.  NOTE(review): presumably these are the suffixes
# DATAFILE takes on at the different processing stages -- confirm
# against the passes that use them.
REVS_SUFFIX = '.revs'
CLEAN_REVS_SUFFIX = '.c-revs'
SORTED_REVS_SUFFIX = '.s-revs'
RESYNC_SUFFIX = '.resync'

# NOTE(review): presumably a sentinel meaning "no valid Subversion
# revision number" -- confirm against its users.
SVN_INVALID_REVNUM = -1

COMMIT_THRESHOLD = 5 * 60       # flush a commit if a 5 minute gap occurs

# Things that can happen to a file.
OP_NOOP   = '-'
OP_ADD    = 'A'
OP_DELETE = 'D'
OP_CHANGE = 'C'

# A deltatext either does or doesn't represent some change.
DELTATEXT_NONEMPTY = 'N'
DELTATEXT_EMPTY    = 'E'

# sha.digestsize * 2 is the length of a hex-encoded SHA digest.
# NOTE(review): the 9 is presumably the offset at which the digest
# starts within a revs-file line -- confirm against the revs format.
DIGEST_END_IDX = 9 + (sha.digestsize * 2)

# Constants used in SYMBOL_OPENINGS_CLOSINGS
OPENING = 'O'
CLOSING = 'C'
 289
 290 def temp(basename):
 291   """Return a path to BASENAME in Ctx().tmpdir.
 292   This is a convenience function to save horizontal space in source."""
 293   return os.path.join(Ctx().tmpdir, basename)
 294
 295 # Since the unofficial set also includes [/\] we need to translate those
 296 # into ones that don't conflict with Subversion limitations.
 297 def _clean_symbolic_name(name):
 298   """Return symbolic name NAME, translating characters that Subversion
 299   does not allow in a pathname."""
 300   name = name.replace('/','++')
 301   name = name.replace('\\','--')
 302   return name
 303
 304 def _path_join(*components):
 305   """Join two or more pathname COMPONENTS, inserting '/' as needed.
 306   Empty component are skipped."""
 307   return string.join(filter(None, components), '/')
 308
 309 def run_command(command):
 310   if os.system(command):
 311     sys.exit('Command failed: "%s"' % command)
 312
 313 def relative_name(cvsroot, fname):
 314   l = len(cvsroot)
 315   if fname[:l] == cvsroot:
 316     if fname[l] == os.sep:
 317       return string.replace(fname[l+1:], os.sep, '/')
 318     return string.replace(fname[l:], os.sep, '/')
 319   sys.stderr.write("%s: relative_path('%s', '%s'): fname is not a sub-path of"
 320                    " cvsroot\n" % (error_prefix, cvsroot, fname))
 321   sys.exit(1)
 322
 323 def get_co_pipe(c_rev, extra_arguments=''):
 324   """Return a command string, and the pipe created using that string.
 325   C_REV is a CVSRevision, and EXTRA_ARGUMENTS is used to add extra
 326   arguments.  The pipe returns the text of that CVS Revision."""
 327   ctx = Ctx()
 328   if ctx.use_cvs:
 329     pipe_cmd = 'cvs %s co -r%s -p %s %s' % \
 330                (ctx.cvs_global_arguments, c_rev.rev, extra_arguments,
 331                 escape_shell_arg(ctx.cvs_module + c_rev.cvs_path))
 332   else:
 333     pipe_cmd = 'co -q -x,v -p%s %s %s' % \
 334                (c_rev.rev, extra_arguments, escape_shell_arg(c_rev.rcs_path()))
 335   pipe = Popen3(pipe_cmd, True)
 336   pipe.tochild.close()
 337   return pipe_cmd, pipe
 338
 339 def generate_ignores(c_rev):
 340   # Read in props
 341   pipe_cmd, pipe = get_co_pipe(c_rev)
 342   buf = pipe.fromchild.read(PIPE_READ_SIZE)
 343   raw_ignore_val = ""
 344   while buf:
 345     raw_ignore_val = raw_ignore_val + buf
 346     buf = pipe.fromchild.read(PIPE_READ_SIZE)
 347   pipe.fromchild.close()
 348   error_output = pipe.childerr.read()
 349   exit_status = pipe.wait()
 350   if exit_status:
 351     sys.exit("%s: The command '%s' failed with exit status: %s\n"
 352              "and the following output:\n"
 353              "%s" % (error_prefix, pipe_cmd, exit_status, error_output))
 354
 355   # Tweak props: First, convert any spaces to newlines...
 356   raw_ignore_val = '\n'.join(raw_ignore_val.split())
 357   raw_ignores = raw_ignore_val.split('\n')
 358   ignore_vals = [ ]
 359   for ignore in raw_ignores:
 360     # Reset the list if we encounter a '!'
 361     # See http://cvsbook.red-bean.com/cvsbook.html#cvsignore
 362     if ignore == '!':
 363       ignore_vals = [ ]
 364       continue
 365     # Skip empty lines
 366     if len(ignore) == 0:
 367       continue
 368     ignore_vals.append(ignore)
 369   return ignore_vals
 370
 371 # Return a string that has not been returned by gen_key() before.
 372 gen_key_base = 0L
 373 def gen_key():
 374   global gen_key_base
 375   key = '%x' % gen_key_base
 376   gen_key_base = gen_key_base + 1
 377   return key
 378
 379 # ============================================================================
 380 # This code is copied with a few modifications from:
 381 #   subversion/subversion/bindings/swig/python/svn/core.py
 382
 383 if sys.platform == "win32":
 384   _escape_shell_arg_re = re.compile(r'(\\+)(\"|$)')
 385
 386   def escape_shell_arg(arg):
 387     # The (very strange) parsing rules used by the C runtime library are
 388     # described at:
 389     # http://msdn.microsoft.com/library/en-us/vclang/html/_pluslang_Parsing_C.2b2b_.Command.2d.Line_Arguments.asp
 390
 391     # double up slashes, but only if they are followed by a quote character
 392     arg = re.sub(_escape_shell_arg_re, r'\1\1\2', arg)
 393
 394     # surround by quotes and escape quotes inside
 395     arg = '"' + string.replace(arg, '"', '"^""') + '"'
 396     return arg
 397
 398
 399   def argv_to_command_string(argv):
 400     """Flatten a list of command line arguments into a command string.
 401
 402     The resulting command string is expected to be passed to the system
 403     shell which os functions like popen() and system() invoke internally.
 404     """
 405
 406     # According cmd's usage notes (cmd /?), it parses the command line by
 407     # "seeing if the first character is a quote character and if so, stripping
 408     # the leading character and removing the last quote character."
 409     # So to prevent the argument string from being changed we add an extra set
 410     # of quotes around it here.
 411     return '"' + string.join(map(escape_shell_arg, argv), " ") + '"'
 412
 413 else:
 414   def escape_shell_arg(str):
 415     return "'" + string.replace(str, "'", "'\\''") + "'"
 416
 417   def argv_to_command_string(argv):
 418     """Flatten a list of command line arguments into a command string.
 419
 420     The resulting command string is expected to be passed to the system
 421     shell which os functions like popen() and system() invoke internally.
 422     """
 423
 424     return string.join(map(escape_shell_arg, argv), " ")
 425 # ============================================================================
 426
 427 def format_date(date):
 428   """Return an svn-compatible date string for DATE (seconds since epoch)."""
 429   # A Subversion date looks like "2002-09-29T14:44:59.000000Z"
 430   return time.strftime("%Y-%m-%dT%H:%M:%S.000000Z", time.gmtime(date))
 431
 432 def sort_file(infile, outfile):
 433   # sort the log files
 434
 435   # GNU sort will sort our dates differently (incorrectly!) if our
 436   # LC_ALL is anything but 'C', so if LC_ALL is set, temporarily set
 437   # it to 'C'
 438   if os.environ.has_key('LC_ALL'):
 439     lc_all_tmp = os.environ['LC_ALL']
 440   else:
 441     lc_all_tmp = None
 442   os.environ['LC_ALL'] = 'C'
 443   # The -T option to sort has a nice side effect.  The Win32 sort is
 444   # case insensitive and cannot be used, and since it does not
 445   # understand the -T option and dies if we try to use it, there is
 446   # no risk that we use that sort by accident.
 447   run_command('sort -T %s %s > %s' % (Ctx().tmpdir, infile, outfile))
 448   if lc_all_tmp is None:
 449     del os.environ['LC_ALL']
 450   else:
 451     os.environ['LC_ALL'] = lc_all_tmp
 452
 453 def print_node_tree(tree, root_node, indent_depth=0):
 454   """For debugging purposes.  Prints all nodes in TREE that are
 455   rooted at ROOT_NODE.  INDENT_DEPTH is merely for purposes of
 456   debugging with the print statement in this function."""
 457   if not indent_depth:
 458     print "TREE", "=" * 75
 459   print "TREE:", " " * (indent_depth * 2), root_node, tree[root_node]
 460   for key, value in tree[root_node].items():
 461     if key[0] == '/': #Skip flags
 462       continue
 463     print_node_tree(tree, value, (indent_depth + 1))
 464
 465 def match_regexp_list(regexp_list, string):
 466   """Test whether STRING matches any of the compiled regexps in REGEXP_LIST."""
 467   for regexp in regexp_list:
 468     if regexp.match(string):
 469       return True
 470   return False
 471
 472 class LF_EOL_Filter:
 473   """Filter a stream and convert all end-of-line markers (CRLF, CR or LF)
 474   into LFs only."""
 475   def __init__(self, stream):
 476     self.stream = stream
 477     self.carry_cr = False
 478     self.eof = False
 479
 480   def read(self, size):
 481     while True:
 482       buf = self.stream.read(size)
 483       self.eof = len(buf) == 0
 484       if self.carry_cr:
 485         buf = '\r' + buf
 486         self.carry_cr = False
 487       if not self.eof and buf[-1] == '\r':
 488         self.carry_cr = True
 489         buf = buf[:-1]
 490       buf = string.replace(buf, '\r\n', '\n')
 491       buf = string.replace(buf, '\r', '\n')
 492       if len(buf) > 0 or self.eof:
 493         return buf
 494
 495
 496 # These constants represent the log levels that this script supports
 497 LOG_WARN = -1
 498 LOG_QUIET = 0
 499 LOG_NORMAL = 1
 500 LOG_VERBOSE = 2
 501 class Log:
 502   """A Simple logging facility.  Each line will be timestamped is
 503   self.use_timestamps is TRUE.  This class is a Borg, see
 504   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 505   __shared_state = {}
 506   def __init__(self):
 507     self.__dict__ = self.__shared_state
 508     if self.__dict__:
 509       return
 510     self.log_level = LOG_NORMAL
 511     # Set this to true if you want to see timestamps on each line output.
 512     self.use_timestamps = None
 513     self.logger = sys.stdout
 514
 515   def _timestamp(self):
 516     """Output a detailed timestamp at the beginning of each line output."""
 517     self.logger.write(time.strftime('[%Y-%m-%d %I:%m:%S %Z] - '))
 518
 519   def write(self, log_level, *args):
 520     """This is the public method to use for writing to a file.  Only
 521     messages whose LOG_LEVEL is <= self.log_level will be printed.  If
 522     there are multiple ARGS, they will be separated by a space."""
 523     if log_level > self.log_level:
 524       return
 525     if self.use_timestamps:
 526       self._timestamp()
 527     self.logger.write(' '.join(map(str,args)) + "\n")
 528     # Ensure that log output doesn't get out-of-order with respect to
 529     # stderr output.
 530     self.logger.flush()
 531
 532
 533 class Cleanup:
 534   """This singleton class manages any files created by cvs2svn.  When
 535   you first create a file, call Cleanup.register, passing the
 536   filename, and the last pass that you need the file.  After the end
 537   of that pass, your file will be cleaned up after running an optional
 538   callback.  This class is a Borg, see
 539   http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531."""
 540
 541   __shared_state = {}
 542   def __init__(self):
 543     self.__dict__ = self.__shared_state
 544     if self.__dict__:
 545       return
 546     self._log = {}
 547     self._callbacks = {}
 548
 549   def register(self, file, which_pass, callback=None):
 550     """Register FILE for cleanup at the end of WHICH_PASS, running
 551     function CALLBACK prior to removal.  Registering a given FILE is
 552     idempotent; you may register as many times as you wish, but it
 553     will only be cleaned up once.
 554
 555     Note that if a file is registered multiple times, only the first
 556     callback registered for that file will be called at cleanup
 557     time.  Also note that if you register a database file you must
 558     close the database before cleanup, e.g. using a callback."""
 559     if not self._log.has_key(which_pass):
 560       self._log[which_pass] = {}
 561     self._log[which_pass][file] = 1
 562     if callback and not self._callbacks.has_key(file):
 563       self._callbacks[file] = callback
 564
 565   def cleanup(self, which_pass):
 566     """Clean up all files, and invoke callbacks, for pass WHICH_PASS."""
 567     if not self._log.has_key(which_pass):
 568       return
 569     for file in self._log[which_pass].keys():
 570       Log().write(LOG_VERBOSE, "Deleting", file)
 571       if self._callbacks.has_key(file):
 572         self._callbacks[file]()
 573       os.unlink(file)
 574
 575
 576 # Always use these constants for opening databases.
 577 DB_OPEN_READ = 'r'
 578 DB_OPEN_NEW = 'n'
 579
 580 # A wrapper for anydbm that uses the marshal module to store items as
 581 # strings.
 582 class Database:
 583   def __init__(self, filename, mode):
 584     # pybsddb3 has a bug which prevents it from working with
 585     # Berkeley DB 4.2 if you open the db with 'n' ("new").  This
 586     # causes the DB_TRUNCATE flag to be passed, which is disallowed
 587     # for databases protected by lock and transaction support
 588     # (bsddb databases use locking from bsddb version 4.2.4 onwards).
 589     #
 590     # Therefore, manually perform the removal (we can do this, because
 591     # we know that for bsddb - but *not* anydbm in general - the database
 592     # consists of one file with the name we specify, rather than several
 593     # based on that name).
 594     if mode == 'n' and anydbm._defaultmod.__name__ == 'dbhash':
 595       if os.path.isfile(filename):
 596         os.unlink(filename)
 597       mode = 'c'
 598
 599     self.db = anydbm.open(filename, mode)
 600
 601   def has_key(self, key):
 602     return self.db.has_key(key)
 603
 604   def __getitem__(self, key):
 605     return marshal.loads(self.db[key])
 606
 607   def __setitem__(self, key, value):
 608     self.db[key] = marshal.dumps(value)
 609
 610   def __delitem__(self, key):
 611     del self.db[key]
 612
 613   def get(self, key, default):
 614     if self.has_key(key):
 615       return self.__getitem__(key)
 616     return default
 617
 618
 619 class StatsKeeper:
 620   __shared_state = { }
 621   def __init__(self):
 622     self.__dict__ = self.__shared_state
 623     if self.__dict__:
 624       return
 625     self.filename = temp(STATISTICS_FILE)
 626     Cleanup().register(self.filename, pass8)
 627     # This can get kinda large, so we don't store it in our data dict.
 628     self.repos_files = { }
 629
 630     if os.path.exists(self.filename):
 631       self.unarchive()
 632     else:
 633       self.data = { 'cvs_revs_count' : 0,
 634                     'tags': { },
 635                     'branches' : { },
 636                     'repos_size' : 0,
 637                     'repos_file_count' : 0,
 638                     'svn_rev_count' : None,
 639                     'first_rev_date' : 1L<<32,
 640                     'last_rev_date' : 0,
 641                     'pass_timings' : { },
 642                     'start_time' : 0,
 643                     'end_time' : 0,
 644                     }
 645
 646   def log_duration_for_pass(self, duration, pass_num):
 647     self.data['pass_timings'][pass_num] = duration
 648
 649   def set_start_time(self, start):
 650     self.data['start_time'] = start
 651
 652   def set_end_time(self, end):
 653     self.data['end_time'] = end
 654
 655   def _bump_item(self, key, amount=1):
 656     self.data[key] = self.data[key] + amount
 657
 658   def reset_c_rev_info(self):
 659     self.data['cvs_revs_count'] = 0
 660     self.data['tags'] = { }
 661     self.data['branches'] = { }
 662
 663   def record_c_rev(self, c_rev):
 664     self._bump_item('cvs_revs_count')
 665
 666     for tag in c_rev.tags:
 667       self.data['tags'][tag] = None
 668     for branch in c_rev.branches:
 669       self.data['branches'][branch] = None
 670
 671     if c_rev.timestamp < self.data['first_rev_date']:
 672       self.data['first_rev_date'] = c_rev.timestamp
 673
 674     if c_rev.timestamp > self.data['last_rev_date']:
 675       self.data['last_rev_date'] = c_rev.timestamp
 676
 677     # Only add the size if this is the first time we see the file.
 678     if not self.repos_files.has_key(c_rev.fname):
 679       self._bump_item('repos_size', c_rev.file_size)
 680     self.repos_files[c_rev.fname] = None
 681
 682     self.data['repos_file_count'] = len(self.repos_files)
 683
 684   def set_svn_rev_count(self, count):
 685     self.data['svn_rev_count'] = count
 686
 687   def svn_rev_count(self):
 688     return self.data['svn_rev_count']
 689
 690   def archive(self):
 691     open(self.filename, 'w').write(marshal.dumps(self.data))
 692
 693   def unarchive(self):
 694     self.data = marshal.loads(open(self.filename, 'r').read())
 695
 696   def __str__(self):
 697     svn_revs_str = ""
 698     if self.data['svn_rev_count'] is not None:
 699       svn_revs_str = ('Total SVN Commits:      %10s\n'
 700                       % self.data['svn_rev_count'])
 701
 702     return ('\n'                                \
 703             'cvs2svn Statistics:\n'             \
 704             '------------------\n'              \
 705             'Total CVS Files:        %10i\n'    \
 706             'Total CVS Revisions:    %10i\n'    \
 707             'Total Unique Tags:      %10i\n'    \
 708             'Total Unique Branches:  %10i\n'    \
 709             'CVS Repos Size in KB:   %10i\n'    \
 710             '%s'                                \
 711             'First Revision Date:    %s\n'      \
 712             'Last Revision Date:     %s\n'      \
 713             '------------------'                \
 714             % (self.data['repos_file_count'],
 715                self.data['cvs_revs_count'],
 716                len(self.data['tags']),
 717                len(self.data['branches']),
 718                (self.data['repos_size'] / 1024),
 719                svn_revs_str,
 720                time.ctime(self.data['first_rev_date']),
 721                time.ctime(self.data['last_rev_date']),
 722                ))
 723
 724   def timings(self):
 725     passes = self.data['pass_timings'].keys()
 726     passes.sort()
 727     str = 'Timings:\n------------------\n'
 728
 729     def desc(val):
 730       if val == 1: return "second"
 731       return "seconds"
 732
 733     for pass_num in passes:
 734       duration = int(self.data['pass_timings'][pass_num])
 735       p_str = ('pass %d:%6d %s\n'
 736                % (pass_num, duration, desc(duration)))
 737       str = str + p_str
 738
 739     total = int(self.data['end_time'] - self.data['start_time'])
 740     str = str + ('total: %6d %s' % (total, desc(total)))
 741     return str
 742
 743
 744 class LastSymbolicNameDatabase:
 745   """ Passing every CVSRevision in s-revs to this class will result in
 746   a Database whose key is the last CVS Revision a symbolicname was
 747   seen in, and whose value is a list of all symbolicnames that were
 748   last seen in that revision."""
 749   def __init__(self, mode):
 750     self.symbols = {}
 751     self.symbol_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), mode)
 752     Cleanup().register(temp(SYMBOL_LAST_CVS_REVS_DB), pass5)
 753
 754   # Once we've gone through all the revs,
 755   # symbols.keys() will be a list of all tags and branches, and
 756   # their corresponding values will be a key into the last CVS revision
 757   # that they were used in.
 758   def log_revision(self, c_rev):
 759     # Gather last CVS Revision for symbolic name info and tag info
 760     for tag in c_rev.tags:
 761       self.symbols[tag] = c_rev.unique_key()
 762     if c_rev.op is not OP_DELETE:
 763       for branch in c_rev.branches:
 764         self.symbols[branch] = c_rev.unique_key()
 765
 766   # Creates an inversion of symbols above--a dictionary of lists (key
 767   # = CVS rev unique_key: val = list of symbols that close in that
 768   # rev.
 769   def create_database(self):
 770     for sym, rev_unique_key in self.symbols.items():
 771       if self.symbol_revs_db.has_key(rev_unique_key):
 772         ary = self.symbol_revs_db[rev_unique_key]
 773         ary.append(sym)
 774         self.symbol_revs_db[rev_unique_key] = ary
 775       else:
 776         self.symbol_revs_db[rev_unique_key] = [sym]
 777
 778
 779 class CVSRevisionDatabase:
 780   """A Database to store CVSRevision objects and retrieve them by their
 781   unique_key()."""
 782
 783   def __init__(self, mode):
 784     """Initialize an instance, opening database in MODE (like the MODE
 785     argument to Database or anydbm.open())."""
 786     self.cvs_revs_db = Database(temp(CVS_REVS_DB), mode)
 787     Cleanup().register(temp(CVS_REVS_DB), pass8)
 788
 789   def log_revision(self, c_rev):
 790     """Add C_REV, a CVSRevision, to the database."""
 791     self.cvs_revs_db[c_rev.unique_key()] = str(c_rev)
 792
 793   def get_revision(self, unique_key):
 794     """Return the CVSRevision stored under UNIQUE_KEY."""
 795     return CVSRevision(Ctx(), self.cvs_revs_db[unique_key])
 796
 797
class TagsDatabase(Database):
  """A Database to store which symbolic names are tags.
  Each key is a tag name.
  The value has no meaning, and should be set to None."""
  def __init__(self, mode):
    """Open the tags Database in MODE, which is handed unchanged to
    Database.__init__(), and register its file with Cleanup()."""
    # The backing file is temp(TAGS_DB).  pass8 is given to Cleanup
    # along with it -- presumably the pass after which the file may be
    # removed (confirm against Cleanup.register).
    Database.__init__(self, temp(TAGS_DB), mode)
    Cleanup().register(temp(TAGS_DB), pass8)
 805
 806
 807 class CVSRevision:
 808   def __init__(self, ctx, *args):
 809     """Initialize a new CVSRevision with Ctx object CTX, and ARGS.
 810
 811     If CTX is None, the following members and methods of the
 812     instantiated CVSRevision class object will be unavailable (or
 813     simply will not work correctly, if at all):
 814        cvs_path
 815        svn_path
 816        svn_trunk_path
 817        is_default_branch_revision()
 818
 819     (Note that this class treats CTX as const, because the caller
 820     likely passed in a Borg instance of a Ctx.  The reason this class
 821     takes CTX as as a parameter, instead of just instantiating a Ctx
 822     itself, is that this class should be usable outside cvs2svn.)
 823
 824     If there is one argument in ARGS, it is a string, in the format of
 825     a line from a revs file.  Do *not* include a trailing newline.
 826
 827     If there are multiple ARGS, there must be 16 of them,
 828     comprising a parsed revs line:
 829        timestamp       -->  (int) date stamp for this cvs revision
 830        digest          -->  (string) digest of author+logmsg
 831        prev_timestamp  -->  (int) date stamp for the previous cvs revision
 832        op              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
 833        prev_rev        -->  (string or None) previous CVS rev, e.g., "1.2"
 834        rev             -->  (string) this CVS rev, e.g., "1.3"
 835        next_rev        -->  (string or None) next CVS rev, e.g., "1.4"
 836        file_in_attic   -->  (char or None) true if RCS file is in Attic
 837        file_executable -->  (char or None) true if RCS file has exec bit set.
 838        file_size       -->  (int) size of the RCS file
 839        deltatext_code  -->  (char) 'N' if non-empty deltatext, else 'E'
 840        mode            -->  (string or None) "kkv", "kb", etc.
 841        branch_name     -->  (string or None) branch on which this rev occurred
 842        tags            -->  (list of strings) all tags on this revision
 843        branches        -->  (list of strings) all branches rooted in this rev
 844        fname           -->  (string) relative path of file in CVS repos
 845
 846     The two forms of initialization are equivalent."""
 847
 848     self._ctx = ctx
 849     if len(args) == 16:
 850       (self.timestamp, self.digest, self.prev_timestamp, self.op,
 851        self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
 852        self.file_executable, self.file_size, self.deltatext_code,
 853        self.fname,
 854        self.mode, self.branch_name, self.tags, self.branches) = args
 855     elif len(args) == 1:
 856       data = args[0].split(' ', 14)
 857       (self.timestamp, self.digest, self.prev_timestamp, self.op,
 858        self.prev_rev, self.rev, self.next_rev, self.file_in_attic,
 859        self.file_executable, self.file_size, self.deltatext_code,
 860        self.mode, self.branch_name, numtags, remainder) = data
 861       # Patch up data items which are not simple strings
 862       self.timestamp = int(self.timestamp, 16)
 863       if self.prev_timestamp == "*":
 864         self.prev_timestamp = 0
 865       else:
 866         self.prev_timestamp = int(self.prev_timestamp)
 867       if self.prev_rev == "*":
 868         self.prev_rev = None
 869       if self.next_rev == "*":
 870         self.next_rev = None
 871       if self.file_in_attic == "*":
 872         self.file_in_attic = None
 873       if self.file_executable == "*":
 874         self.file_executable = None
 875       self.file_size = int(self.file_size)
 876       if self.mode == "*":
 877         self.mode = None
 878       if self.branch_name == "*":
 879         self.branch_name = None
 880       numtags = int(numtags)
 881       tags_and_numbranches_and_remainder = remainder.split(' ', numtags + 1)
 882       self.tags = tags_and_numbranches_and_remainder[:-2]
 883       numbranches = int(tags_and_numbranches_and_remainder[-2])
 884       remainder = tags_and_numbranches_and_remainder[-1]
 885       branches_and_fname = remainder.split(' ', numbranches)
 886       self.branches = branches_and_fname[:-1]
 887       self.fname = branches_and_fname[-1]
 888     else:
 889       raise TypeError, 'CVSRevision() takes 2 or 16 arguments (%d given)' % \
 890           (len(args) + 1)
 891     if ctx is not None:
 892       self.cvs_path = relative_name(self._ctx.cvsroot, self.fname[:-2])
 893       self.svn_path = self._make_path(self.cvs_path, self.branch_name)
 894       self.svn_trunk_path = self._make_path(self.cvs_path)
 895
 896   # The 'primary key' of a CVS Revision is the revision number + the
 897   # filename.  To provide a unique key (say, for a dict), we just glom
 898   # them together in a string.  By passing in self.prev_rev or
 899   # self.next_rev, you can get the unique key for their respective
 900   # CVSRevisions.
 901   def unique_key(self, revnum=None):
 902     if revnum is None:
 903       revnum = self.rev
 904     return revnum + "/" + self.fname
 905
 906   def __str__(self):
 907     return ('%08lx %s %s %s %s %s %s %s %s %d %s %s %s %d%s%s %d%s%s %s' % (
 908       self.timestamp, self.digest, self.prev_timestamp or "*", self.op,
 909       (self.prev_rev or "*"), self.rev, (self.next_rev or "*"),
 910       (self.file_in_attic or "*"), (self.file_executable or "*"),
 911       self.file_size,
 912       self.deltatext_code, (self.mode or "*"), (self.branch_name or "*"),
 913       len(self.tags), self.tags and " " or "", " ".join(self.tags),
 914       len(self.branches), self.branches and " " or "", " ".join(self.branches),
 915       self.fname, ))
 916
 917   # Returns true if this CVSRevision is the opening CVSRevision for
 918   # NAME (for this RCS file).
 919   def opens_symbolic_name(self, name):
 920     if name in self.tags:
 921       return 1
 922     if name in self.branches:
 923       # If this c_rev opens a branch and our op is OP_DELETE, then
 924       # that means that the file that this c_rev belongs to was
 925       # created on the branch, so for all intents and purposes, this
 926       # c_rev is *technically* not an opening.  See Issue #62 for more
 927       # information.
 928       if self.op != OP_DELETE:
 929         return 1
 930     return 0
 931
 932   def is_default_branch_revision(self):
 933     """Return 1 if SELF.rev of SELF.cvs_path is a default branch
 934     revision according to DEFAULT_BRANCHES_DB (see the conditions
 935     documented there), else return None."""
 936     if self._ctx._default_branches_db.has_key(self.cvs_path):
 937       val = self._ctx._default_branches_db[self.cvs_path]
 938       val_last_dot = val.rindex(".")
 939       our_last_dot = self.rev.rindex(".")
 940       default_branch = val[:val_last_dot]
 941       our_branch = self.rev[:our_last_dot]
 942       default_rev_component = int(val[val_last_dot + 1:])
 943       our_rev_component = int(self.rev[our_last_dot + 1:])
 944       if (default_branch == our_branch
 945           and our_rev_component <= default_rev_component):
 946         return 1
 947     # else
 948     return None
 949
 950   def _make_path(self, path, branch_name = None):
 951     """Return the trunk path or branch path for PATH.
 952
 953     If PATH is None, return None."""
 954     # For a while, we treated each top-level subdir of the CVS
 955     # repository as a "project root" and interpolated the appropriate
 956     # genealogy (trunk|tag|branch) in according to the official
 957     # recommended layout.  For example, the path '/foo/bar/baz.c' on
 958     # branch 'Rel2' would become
 959     #
 960     #   /foo/branches/Rel2/bar/baz.c
 961     #
 962     # and on trunk it would become
 963     #
 964     #   /foo/trunk/bar/baz.c
 965     #
 966     # However, we went back to the older and simpler method of just
 967     # prepending the genealogy to the front, instead of interpolating.
 968     # So now we produce:
 969     #
 970     #   /branches/Rel2/foo/bar/baz.c
 971     #   /trunk/foo/bar/baz.c
 972     #
 973     # Why?  Well, Jack Repenning pointed out that this way is much
 974     # friendlier to "anonymously rooted subtrees" (that's a tree where
 975     # the name of the top level dir doesn't matter, the point is that if
 976     # you cd into it and, say, run 'make', something good will happen).
 977     # By interpolating, we made it impossible to point cvs2svn at some
 978     # subdir in the CVS repository and convert it as a project, because
 979     # we'd treat every subdir underneath it as an independent project
 980     # root, which is probably not what the user wanted.
 981     #
 982     # Also, see Blair Zajac's post
 983     #
 984     #    http://subversion.tigris.org/servlets/ReadMsg?list=dev&msgNo=38965
 985     #
 986     # and the surrounding thread, for why what people really want is a
 987     # way of specifying an in-repository prefix path, not interpolation.
 988     if path is None:
 989       return None
 990
 991     if branch_name:
 992       branch_name = _clean_symbolic_name(branch_name)
 993       return self._ctx.branches_base + '/' + branch_name + '/' + path
 994     else:
 995       return self._ctx.trunk_base + '/' + path
 996
 997   def rcs_path(self):
 998     """Returns the actual filesystem path to the RCS file of this
 999     CVSRevision."""
1000     if self.file_in_attic is None:
1001       return self.fname
1002     else:
1003       basepath, filename = os.path.split(self.fname)
1004       return os.path.join(basepath, 'Attic', filename)
1005
1006   def filename(self):
1007     "Return the last path component of self.fname, minus the ',v'"
1008     return os.path.split(self.fname)[-1][:-2]
1009
class SymbolDatabase:
  """This database records information on all symbols in the RCS
  files.  It is created in pass 1 and it is used in pass 2."""
  def __init__(self):
    # A hash that maps tag names to commit counts
    self.tags = { }
    # A hash that maps branch names to lists of the format
    # [ create_count, commit_count, blockers ], where blockers
    # is a hash that lists the symbols that depend on the
    # the branch.  The blockers hash is used as a set, so the
    # values are not used.
    self.branches = { }

  def register_tag_creation(self, name):
    """Register the creation of the tag NAME."""
    self.tags[name] = self.tags.get(name, 0) + 1

  def _branch(self, name):
    """Helper function to get a branch node that will create and
    initialize the node if it does not exist."""
    if name not in self.branches:
      self.branches[name] = [ 0, 0, { } ]
    return self.branches[name]

  def register_branch_creation(self, name):
    """Register the creation of the branch NAME."""
    self._branch(name)[0] += 1

  def register_branch_commit(self, name):
    """Register a commit on the branch NAME."""
    self._branch(name)[1] += 1

  def register_branch_blocker(self, name, blocker):
    """Register BLOCKER as a blocker on the branch NAME."""
    self._branch(name)[2][blocker] = None

  def branch_has_commit(self, name):
    """Return non-zero if NAME has commits.  Returns a false value if
    NAME is not a branch or if it has no commits."""
    return name in self.branches and self.branches[name][1]

  def find_excluded_symbols(self, regexp_list):
    """Returns a hash of all symbols that match the regexps in
    REGEXP_LIST.  The hash is used as a set so the values are
    not used."""
    excludes = { }
    for tag in self.tags.keys():
      if match_regexp_list(regexp_list, tag):
        excludes[tag] = None
    for branch in self.branches.keys():
      if match_regexp_list(regexp_list, branch):
        excludes[branch] = None
    return excludes

  def find_branch_exclude_blockers(self, branch, excludes):
    """Find all blockers of BRANCH that are not themselves in the hash
    EXCLUDES.  The result is a hash used as a set.

    The result is empty unless BRANCH itself is in EXCLUDES, since
    only an excluded branch can be blocked.  It is also empty if
    BRANCH is not a known branch (for example, an excluded tag)."""
    blockers = { }
    if branch in excludes and branch in self.branches:
      for blocker in self.branches[branch][2]:
        if blocker not in excludes:
          blockers[blocker] = None
    return blockers

  def find_blocked_excludes(self, excludes):
    """Find all branches in EXCLUDES that have blocking symbols that
    are not themselves excluded.  Return a hash that maps branch names
    to a hash of blockers.  The hash of blockers is used as a set so
    the values are not used."""
    blocked_branches = { }
    for branch in self.branches.keys():
      blockers = self.find_branch_exclude_blockers(branch, excludes)
      if blockers:
        blocked_branches[branch] = blockers
    return blocked_branches

  def find_mismatches(self, excludes=None):
    """Find all symbols that are defined as both tags and branches,
    excluding the ones in EXCLUDES.  Returns a list of 4-tuples with
    the symbol name, tag count, branch count and commit count."""
    if excludes is None:
      excludes = { }
    mismatches = [ ]
    for branch in self.branches.keys():
      if branch not in excludes and branch in self.tags:
        mismatches.append((branch,                    # name
                           self.tags[branch],         # tag count
                           self.branches[branch][0],  # branch count
                           self.branches[branch][1])) # commit count
    return mismatches

  def read(self):
    """Read the symbol database from files.

    The tags file holds one "NAME COUNT" line per tag.  The branches
    file holds one "NAME CREATE_COUNT COMMIT_COUNT [BLOCKER ...]" line
    per branch.  Both are the formats produced by write()."""
    f = open(temp(TAGS_LIST))
    try:
      for line in f.readlines():
        tag, count = line.split()
        self.tags[tag] = int(count)
    finally:
      # Close explicitly instead of relying on garbage collection.
      f.close()

    f = open(temp(BRANCHES_LIST))
    try:
      for line in f.readlines():
        words = line.split()
        blockers = { }
        for blocker in words[3:]:
          blockers[blocker] = None
        self.branches[words[0]] = [ int(words[1]), int(words[2]), blockers ]
    finally:
      f.close()

  def write(self):
    """Store the symbol database to files, and register both files
    with Cleanup()."""
    f = open(temp(TAGS_LIST), "w")
    try:
      Cleanup().register(temp(TAGS_LIST), pass2)
      for tag, count in self.tags.items():
        f.write("%s %d\n" % (tag, count))
    finally:
      # Closing flushes the data, so that a later read() sees all of
      # it no matter when the file object gets collected.
      f.close()

    f = open(temp(BRANCHES_LIST), "w")
    try:
      Cleanup().register(temp(BRANCHES_LIST), pass2)
      for branch, info in self.branches.items():
        f.write("%s %d %d" % (branch, info[0], info[1]))
        if info[2]:
          f.write(" ")
          f.write(" ".join(info[2].keys()))
        f.write("\n")
    finally:
      f.close()
1138
1139 class CollectData(cvs2svn_rcsparse.Sink):
  def __init__(self):
    """Open the output files and databases that data collection writes
    to, and initialize the state that spans all RCS files.

    Every temporary file is registered with Cleanup() right after it
    has been opened, together with a pass identifier -- presumably the
    pass after which the file may be removed (confirm against
    Cleanup.register)."""
    # Plain files, opened for writing.  set_revision_info() writes the
    # timestamp adjustments into self.resync.
    self.revs = open(temp(DATAFILE + REVS_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + REVS_SUFFIX), pass2)
    self.resync = open(temp(DATAFILE + RESYNC_SUFFIX), 'w')
    Cleanup().register(temp(DATAFILE + RESYNC_SUFFIX), pass2)
    # Databases, both opened with DB_OPEN_NEW.
    self.default_branches_db = Database(temp(DEFAULT_BRANCHES_DB), DB_OPEN_NEW)
    Cleanup().register(temp(DEFAULT_BRANCHES_DB), pass5)
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_NEW)
    Cleanup().register(temp(METADATA_DB), pass8)
    # Error messages collected while parsing (define_tag() appends to
    # this list).
    self.fatal_errors = []
    self.num_files = 0
    # Tag and branch statistics, filled in by set_branch_name(),
    # define_tag() and define_revision().
    self.symbol_db = SymbolDatabase()

    # 1 if we've collected data for at least one file, None otherwise.
    self.found_valid_file = None

    # See set_fname() for initializations of other variables.
1157
1158   def set_fname(self, canonical_name, filename):
1159     """Prepare to receive data for FILENAME.  FILENAME is the absolute
1160     filesystem path to the file in question, and CANONICAL_NAME is
1161     FILENAME with the 'Attic' component removed (if the file is indeed
1162     in the Attic) ."""
1163     self.fname = canonical_name
1164
1165     # We calculate and save some file metadata here, where we can do
1166     # it only once per file, instead of waiting until later where we
1167     # would have to do the same calculations once per CVS *revision*.
1168
1169     self.rel_name = relative_name(Ctx().cvsroot, self.fname)[:-2]
1170
1171     # If the paths are not the same, then that means that the
1172     # canonical_name has had the 'Attic' component stripped out.
1173     self.file_in_attic = None
1174     if not canonical_name == filename:
1175       self.file_in_attic = 1
1176
1177     file_stat = os.stat(filename)
1178     # The size of our file in bytes
1179     self.file_size = file_stat[stat.ST_SIZE]
1180
1181     # Whether or not the executable bit is set.
1182     self.file_executable = None
1183     if file_stat[0] & stat.S_IXUSR:
1184       self.file_executable = 1
1185
1186     # revision -> [timestamp, author, old-timestamp]
1187     self.rev_data = { }
1188
1189     # Maps revision number (key) to the revision number of the
1190     # previous revision along this line of development.
1191     #
1192     # For the first revision R on a branch, we consider the revision
1193     # from which R sprouted to be the 'previous'.
1194     #
1195     # Note that this revision can't be determined arithmetically (due
1196     # to cvsadmin -o, which is why this is necessary).
1197     self.prev_rev = { }
1198
1199     # This dict is essentially self.prev_rev with the values mapped in
1200     # the other direction, so following key -> value will yield you
1201     # the next revision number
1202     self.next_rev = { }
1203
1204     # Track the state of each revision so that in set_revision_info,
1205     # we can determine if our op is an add/change/delete.  We can do
1206     # this because in set_revision_info, we'll have all of the
1207     # revisions for a file at our fingertips, and we need to examine
1208     # the state of our prev_rev to determine if we're an add or a
1209     # change--without the state of the prev_rev, we are unable to
1210     # distinguish between an add and a change.
1211     self.rev_state = { }
1212
1213     # Hash mapping branch numbers, like '1.7.2', to branch names,
1214     # like 'Release_1_0_dev'.
1215     self.branch_names = { }
1216
1217     # RCS flags (used for keyword expansion).
1218     self.mode = None
1219
1220     # Hash mapping revision numbers, like '1.7', to lists of names
1221     # indicating which branches sprout from that revision, like
1222     # ['Release_1_0_dev', 'experimental_driver', ...].
1223     self.branchlist = { }
1224
1225     # Like self.branchlist, but the values are lists of tag names that
1226     # apply to the key revision.
1227     self.taglist = { }
1228
1229     # If set, this is an RCS branch number -- rcsparse calls this the
1230     # "principal branch", but CVS and RCS refer to it as the "default
1231     # branch", so that's what we call it, even though the rcsparse API
1232     # setter method is still 'set_principal_branch'.
1233     self.default_branch = None
1234
1235     # If the RCS file doesn't have a default branch anymore, but does
1236     # have vendor revisions, then we make an educated guess that those
1237     # revisions *were* the head of the default branch up until the
1238     # commit of 1.2, at which point the file's default branch became
1239     # trunk.  This records the date at which 1.2 was committed.
1240     self.first_non_vendor_revision_date = None
1241
1242     # A list of all symbols defined for the current file.  Used to
1243     # prevent multiple definitions of a symbol, something which can
1244     # easily happen when --symbol-transform is used.
1245     self.defined_symbols = [ ]
1246
1247   def set_principal_branch(self, branch):
1248     self.default_branch = branch
1249
1250   def set_expansion(self, mode):
1251     self.mode = mode
1252
1253   def set_branch_name(self, branch_number, name):
1254     """Record that BRANCH_NUMBER is the branch number for branch NAME,
1255     and that NAME sprouts from BRANCH_NUMBER .
1256     BRANCH_NUMBER is an RCS branch number with an odd number of components,
1257     for example '1.7.2' (never '1.7.0.2')."""
1258     if not self.branch_names.has_key(branch_number):
1259       self.branch_names[branch_number] = name
1260       # The branchlist is keyed on the revision number from which the
1261       # branch sprouts, so strip off the odd final component.
1262       sprout_rev = branch_number[:branch_number.rfind(".")]
1263       if not self.branchlist.has_key(sprout_rev):
1264         self.branchlist[sprout_rev] = []
1265       self.branchlist[sprout_rev].append(name)
1266       self.symbol_db.register_branch_creation(name)
1267     else:
1268       sys.stderr.write("%s: in '%s':\n"
1269                        "   branch '%s' already has name '%s',\n"
1270                        "   cannot also have name '%s', ignoring the latter\n"
1271                        % (warning_prefix, self.fname, branch_number,
1272                           self.branch_names[branch_number], name))
1273
1274   def rev_to_branch_name(self, revision):
1275     """Return the name of the branch on which REVISION lies.
1276     REVISION is a non-branch revision number with an even number of,
1277     components, for example '1.7.2.1' (never '1.7.2' nor '1.7.0.2').
1278     For the convenience of callers, REVISION can also be a trunk
1279     revision such as '1.2', in which case just return None."""
1280     if trunk_rev.match(revision):
1281       return None
1282     return self.branch_names.get(revision[:revision.rindex(".")])
1283
1284   def add_cvs_branch(self, revision, branch_name):
1285     """Record the root revision and branch revision for BRANCH_NAME,
1286     based on REVISION.  REVISION is a CVS branch number having an even
1287     number of components where the second-to-last is '0'.  For
1288     example, if it's '1.7.0.2', then record that BRANCH_NAME sprouts
1289     from 1.7 and has branch number 1.7.2."""
1290     last_dot = revision.rfind(".")
1291     branch_rev = revision[:last_dot]
1292     last2_dot = branch_rev.rfind(".")
1293     branch_rev = branch_rev[:last2_dot] + revision[last_dot:]
1294     self.set_branch_name(branch_rev, branch_name)
1295
1296   def define_tag(self, name, revision):
1297     """Record a bidirectional mapping between symbolic NAME and REVISION.
1298     REVISION is an unprocessed revision number from the RCS file's
1299     header, for example: '1.7', '1.7.0.2', or '1.1.1' or '1.1.1.1'.
1300     This function will determine what kind of symbolic name it is by
1301     inspection, and record it in the right places."""
1302     for (pattern, replacement) in Ctx().symbol_transforms:
1303       newname = re.sub(pattern, replacement, name)
1304       if newname != name:
1305         Log().write(LOG_WARN, "   symbol '%s' transformed to '%s'"
1306                     % (name, newname))
1307         name = newname
1308     if name in self.defined_symbols:
1309       err = "%s: Multiple definitions of the symbol '%s' in '%s'" \
1310                 % (error_prefix, name, self.fname)
1311       sys.stderr.write(err + "\n")
1312       self.fatal_errors.append(err)
1313     self.defined_symbols.append(name)
1314     if branch_tag.match(revision):
1315       self.add_cvs_branch(revision, name)
1316     elif vendor_tag.match(revision):
1317       self.set_branch_name(revision, name)
1318     else:
1319       if not self.taglist.has_key(revision):
1320         self.taglist[revision] = []
1321       self.taglist[revision].append(name)
1322       self.symbol_db.register_tag_creation(name)
1323
1324   def define_revision(self, revision, timestamp, author, state,
1325                       branches, next):
1326
1327     # Record the state of our revision for later calculations
1328     self.rev_state[revision] = state
1329
1330     # store the rev_data as a list in case we have to jigger the timestamp
1331     self.rev_data[revision] = [int(timestamp), author, None]
1332
1333     # When on trunk, the RCS 'next' revision number points to what
1334     # humans might consider to be the 'previous' revision number.  For
1335     # example, 1.3's RCS 'next' is 1.2.
1336     #
1337     # However, on a branch, the RCS 'next' revision number really does
1338     # point to what humans would consider to be the 'next' revision
1339     # number.  For example, 1.1.2.1's RCS 'next' would be 1.1.2.2.
1340     #
1341     # In other words, in RCS, 'next' always means "where to find the next
1342     # deltatext that you need this revision to retrieve.
1343     #
1344     # That said, we don't *want* RCS's behavior here, so we determine
1345     # whether we're on trunk or a branch and set self.prev_rev
1346     # accordingly.
1347     #
1348     # One last thing.  Note that if REVISION is a branch revision,
1349     # instead of mapping REVISION to NEXT, we instead map NEXT to
1350     # REVISION.  Since we loop over all revisions in the file before
1351     # doing anything with the data we gather here, this 'reverse
1352     # assignment' effectively does the following:
1353     #
1354     # 1. Gives us no 'prev' value for REVISION (in this
1355     # iteration... it may have been set in a previous iteration)
1356     #
1357     # 2. Sets the 'prev' value for the revision with number NEXT to
1358     # REVISION.  So when we come around to the branch revision whose
1359     # revision value is NEXT, its 'prev' and 'prev_rev' are already
1360     # set.
1361     if trunk_rev.match(revision):
1362       self.prev_rev[revision] = next
1363       self.next_rev[next] = revision
1364     elif next:
1365       self.prev_rev[next] = revision
1366       self.next_rev[revision] = next
1367
1368     for b in branches:
1369       self.prev_rev[b] = revision
1370
1371     # Ratchet up the highest vendor head revision, if necessary.
1372     if self.default_branch:
1373       default_branch_root = self.default_branch + "."
1374       if ((revision.find(default_branch_root) == 0)
1375           and (default_branch_root.count('.') == revision.count('.'))):
1376         # This revision is on the default branch, so record that it is
1377         # the new highest default branch head revision.
1378         self.default_branches_db[self.rel_name] = revision
1379     else:
1380       # No default branch, so make an educated guess.
1381       if revision == '1.2':
1382         # This is probably the time when the file stopped having a
1383         # default branch, so make a note of it.
1384         self.first_non_vendor_revision_date = timestamp
1385       else:
1386         m = vendor_revision.match(revision)
1387         if m and ((not self.first_non_vendor_revision_date)
1388                   or (timestamp < self.first_non_vendor_revision_date)):
1389           # We're looking at a vendor revision, and it wasn't
1390           # committed after this file lost its default branch, so bump
1391           # the maximum trunk vendor revision in the permanent record.
1392           self.default_branches_db[self.rel_name] = revision
1393
1394     if not trunk_rev.match(revision):
1395       # Check for unlabeled branches, record them.  We tried to collect
1396       # all branch names when we parsed the symbolic name header
1397       # earlier, of course, but that didn't catch unlabeled branches.
1398       # If a branch is unlabeled, this is our first encounter with it,
1399       # so we have to record its data now.
1400       branch_number = revision[:revision.rindex(".")]
1401       if not self.branch_names.has_key(branch_number):
1402         branch_name = "unlabeled-" + branch_number
1403         self.set_branch_name(branch_number, branch_name)
1404
1405       # Register the commit on this non-trunk branch
1406       branch_name = self.branch_names[branch_number]
1407       self.symbol_db.register_branch_commit(branch_name)
1408
1409   def tree_completed(self):
1410     "The revision tree has been parsed.  Analyze it for consistency."
1411
1412     # Our algorithm depends upon the timestamps on the revisions occuring
1413     # monotonically over time.  That is, we want to see rev 1.34 occur in
1414     # time before rev 1.35.  If we inserted 1.35 *first* (due to the time-
1415     # sorting), and then tried to insert 1.34, we'd be screwed.
1416
1417     # to perform the analysis, we'll simply visit all of the 'previous'
1418     # links that we have recorded and validate that the timestamp on the
1419     # previous revision is before the specified revision
1420
1421     # if we have to resync some nodes, then we restart the scan. just keep
1422     # looping as long as we need to restart.
1423     while 1:
1424       for current, prev in self.prev_rev.items():
1425         if not prev:
1426           # no previous revision exists (i.e. the initial revision)
1427           continue
1428         t_c = self.rev_data[current][0]
1429         t_p = self.rev_data[prev][0]
1430         if t_p >= t_c:
1431           # the previous revision occurred later than the current revision.
1432           # shove the previous revision back in time (and any before it that
1433           # may need to shift).
1434
1435           # We sync backwards and not forwards because any given CVS
1436           # Revision has only one previous revision.  However, a CVS
1437           # Revision can *be* a previous revision for many other
1438           # revisions (e.g., a revision that is the source of multiple
1439           # branches).  This becomes relevant when we do the secondary
1440           # synchronization in pass 2--we can make certain that we
1441           # don't resync a revision earlier than it's previous
1442           # revision, but it would be non-trivial to make sure that we
1443           # don't resync revision R *after* any revisions that have R
1444           # as a previous revision.
1445           while t_p >= t_c:
1446             self.rev_data[prev][0] = t_c - 1    # new timestamp
1447             self.rev_data[prev][2] = t_p        # old timestamp
1448             delta = t_c - 1 - t_p
1449             msg =  "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
1450                   % (self.rel_name,
1451                      prev, time.ctime(t_p), delta)
1452             Log().write(LOG_VERBOSE, msg)
1453             if (delta > COMMIT_THRESHOLD
1454                 or delta < (COMMIT_THRESHOLD * -1)):
1455               str = "%s: Significant timestamp change for '%s' (%d seconds)"
1456               Log().write(LOG_WARN, str % (warning_prefix, self.rel_name,
1457                                            delta))
1458             current = prev
1459             prev = self.prev_rev[current]
1460             if not prev:
1461               break
1462             t_c = t_c - 1               # self.rev_data[current][0]
1463             t_p = self.rev_data[prev][0]
1464
1465           # break from the for-loop
1466           break
1467       else:
1468         # finished the for-loop (no resyncing was performed)
1469         return
1470
1471   def set_revision_info(self, revision, log, text):
1472     timestamp, author, old_ts = self.rev_data[revision]
1473     digest = sha.new(log + '\0' + author).hexdigest()
1474     if old_ts:
1475       # the timestamp on this revision was changed. log it for later
1476       # resynchronization of other files's revisions that occurred
1477       # for this time and log message.
1478       self.resync.write('%08lx %s %08lx\n' % (old_ts, digest, timestamp))
1479
1480     # "...Give back one kadam to honor the Hebrew God whose Ark this is."
1481     #       -- Imam to Indy and Sallah, in 'Raiders of the Lost Ark'
1482     #
1483     # If revision 1.1 appears to have been created via 'cvs add'
1484     # instead of 'cvs import', then this file probably never had a
1485     # default branch, so retroactively remove its record in the
1486     # default branches db.  The test is that the log message CVS uses
1487     # for 1.1 in imports is "Initial revision\n" with no period.
1488     if revision == '1.1' and log != 'Initial revision\n':
1489       if self.default_branches_db.has_key(self.rel_name):
1490         del self.default_branches_db[self.rel_name]
1491
1492     # Get the timestamp of the previous revision
1493     prev_rev = self.prev_rev.get(revision, None)
1494     prev_timestamp, ign, ign = self.rev_data.get(prev_rev, [0, None, None])
1495
1496     # How to tell if a CVSRevision is an add, a change, or a deletion:
1497     #
1498     # It's a delete if RCS state is 'dead'
1499     #
1500     # It's an add if RCS state is 'Exp.' and
1501     #      - we either have no previous revision
1502     #        or
1503     #      - we have a previous revision whose state is 'dead'
1504     #
1505     # Anything else is a change.
1506     if self.rev_state[revision] == 'dead':
1507       op = OP_DELETE
1508     elif ((self.prev_rev.get(revision, None) is None)
1509           or (self.rev_state[self.prev_rev[revision]] == 'dead')):
1510       op = OP_ADD
1511     else:
1512       op = OP_CHANGE
1513
1514     if text:
1515       deltatext_code = DELTATEXT_NONEMPTY
1516     else:
1517       deltatext_code = DELTATEXT_EMPTY
1518
1519     c_rev = CVSRevision(Ctx(), timestamp, digest, prev_timestamp, op,
1520                         self.prev_rev[revision], revision,
1521                         self.next_rev.get(revision),
1522                         self.file_in_attic, self.file_executable,
1523                         self.file_size,
1524                         deltatext_code, self.fname,
1525                         self.mode, self.rev_to_branch_name(revision),
1526                         self.taglist.get(revision, []),
1527                         self.branchlist.get(revision, []))
1528     self.revs.write(str(c_rev) + "\n")
1529     StatsKeeper().record_c_rev(c_rev)
1530
1531     if not self.metadata_db.has_key(digest):
1532       self.metadata_db[digest] = (author, log)
1533
1534   def parse_completed(self):
1535     # Walk through all branches and tags and register them with
1536     # their parent branch in the symbol database.
1537     for revision, symbols in self.taglist.items() + self.branchlist.items():
1538       for symbol in symbols:
1539         name = self.rev_to_branch_name(revision)
1540         if name is not None:
1541           self.symbol_db.register_branch_blocker(name, symbol)
1542
1543     self.num_files = self.num_files + 1
1544
1545   def write_symbol_db(self):
1546     self.symbol_db.write()
1547
class SymbolingsLogger:
  """Write the file of symbol "openings" and "closings".

  The lines recorded here are used later to determine the ranges of
  SVNRevisions from which a file can be copied when a branch or tag is
  created in Subversion.  That is done by finding an opening and a
  closing for each file copied onto a branch or tag:

    * An "opening" is the CVSRevision from which a given branch/tag
      sprouts on a path.

    * The "closing" for that branch/tag and path is the next
      CVSRevision on the same line of development as the opening.

  Example: on file 'foo.c', branch BEE has branch number 1.2.2 and so
  sprouts from revision 1.2.  Hence 1.2 is the opening for BEE on path
  'foo.c', and 1.3 is the closing for BEE on path 'foo.c'.  Many
  revisions may lie chronologically between 1.2 and 1.3 (revisions on
  branches of 'foo.c', possibly on BEE itself), but 1.3 is the next
  revision *on the same line* as 1.2, which is what makes it the
  closing revision for the symbolic names that 1.2 opens.

  The point of all this is to make branch and tag creation as
  efficient as possible by minimizing the number of copies and deletes
  per creation.  Suppose revisions 1.2 and 1.3 of foo.c correspond to
  Subversion revisions 17 and 30: when creating branch BEE there is
  some motivation to copy from one of 17-30.  If another file,
  'bar.c', has opening and closing CVSRevisions for BEE corresponding
  to Subversion revisions 24 and 39, then the ideal source for the
  branch copy lies somewhere between 24 and 29, inclusive.
  """
  def __init__(self):
    """Open both output files for writing and register them with
    Cleanup()."""
    # The openings/closings file proper.
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS), 'w')
    Cleanup().register(temp(SYMBOL_OPENINGS_CLOSINGS), pass6)
    # Scratch file of closings whose svn revnum is not yet known; it is
    # read back and resolved by close().
    self.closings = open(temp(SYMBOL_CLOSINGS_TMP), 'w')
    Cleanup().register(temp(SYMBOL_CLOSINGS_TMP), pass5)

    # Maps a Subversion repository *source* path for which an
    # 'opening' has been encountered to the list of symbolic names
    # that the path has opened.  Only paths whose corresponding
    # CVSRevision is a default branch revision are meant to appear
    # here.
    self.open_paths_with_default_branches = { }

  def log_revision(self, c_rev, svn_revnum):
    """Record the openings found in C_REV and, when C_REV.next_rev is
    not None, a pending closing for each of its symbols.

    Openings are written with SVN_REVNUM.  The revnum of a closing is
    not known yet, so closings go to the temporary closings file and
    are resolved in close()."""
    for raw_name in c_rev.tags + c_rev.branches:
      symbol = _clean_symbolic_name(raw_name)
      self._note_default_branch_opening(c_rev, symbol)
      if c_rev.op != OP_DELETE:
        self._log(symbol, svn_revnum, c_rev.svn_path, OPENING)

      # Without a next_rev there is no closing revision for this
      # source revision.
      if c_rev.next_rev is None:
        continue
      closing_key = c_rev.unique_key(c_rev.next_rev)
      self.closings.write('%s %s\n' % (symbol, closing_key))

  def _log(self, name, svn_revnum, svn_path, type):
    """Append one line to the symbol_openings_closings file stating
    that SVN_REVNUM of SVN_PATH is the opening or closing (TYPE) of
    the symbolic name NAME.

    TYPE should only be one of the global constants OPENING or
    CLOSING."""
    # The revnum is zero-padded to 8 digits, which covers revisions up
    # to 99,999,999.  That *should* be enough.
    fields = (name, svn_revnum, type, svn_path)
    self.symbolings.write('%s %.8d %s %s\n' % fields)

  def close(self):
    """Resolve the pending closings and finish the symbolings file.

    Each line of the temporary closings file names a symbol and a
    CVSRevision key; look up the svn revnum and svn_path for that key
    and write a proper closing line to the symbolings file."""
    # Needed to obtain c_rev.svn_path from a revision key.
    cvs_revs_db = CVSRevisionDatabase(DB_OPEN_READ)

    self.closings.close()
    pending = fileinput.FileInput(temp(SYMBOL_CLOSINGS_TMP))
    for line in pending:
      name, rev_key = line.rstrip().split(" ", 1)
      svn_revnum = Ctx()._persistence_manager.get_svn_revnum(rev_key)

      svn_path = cvs_revs_db.get_revision(rev_key).svn_path
      self._log(name, svn_revnum, svn_path, CLOSING)

    self.symbolings.close()

  def _note_default_branch_opening(self, c_rev, symbolic_name):
    """Remember SYMBOLIC_NAME as opened on C_REV.svn_trunk_path.

    The name is appended to the list kept for that path in
    self.open_paths_with_default_branches; the list is created on
    first use."""
    opened = self.open_paths_with_default_branches
    opened.setdefault(c_rev.svn_trunk_path, [ ]).append(symbolic_name)

  def log_default_branch_closing(self, c_rev, svn_revnum):
    """If self.open_paths_with_default_branches has an entry for
    C_REV.svn_trunk_path, log every name in that entry as a closing at
    SVN_REVNUM and then drop the entry.  Otherwise do nothing."""
    opened = self.open_paths_with_default_branches
    trunk_path = c_rev.svn_trunk_path
    if not opened.has_key(trunk_path):
      return
    for symbol in opened[trunk_path]:
      self._log(symbol, svn_revnum, trunk_path, CLOSING)
    # These names are now closed; forget them.
    del opened[trunk_path]
1661
1662
class PersistenceManager:
  """The PersistenceManager allows us to effectively store SVNCommits
  to disk and retrieve them later using only their subversion revision
  number as the key.  It also returns the subversion revision number
  for a given CVSRevision's unique key.

  All information pertinent to each SVNCommit is stored in a series of
  on-disk databases so that SVNCommits can be retrieved on-demand.

  MODE is one of the constants DB_OPEN_NEW or DB_OPEN_READ.
  In 'new' mode, PersistenceManager will initialize a new set of on-disk
  databases and be fully-featured.
  In 'read' mode, PersistenceManager will open existing on-disk databases
  and the set_* methods will raise RuntimeError."""
  def __init__(self, mode):
    """Open the on-disk databases in MODE.

    Raises RuntimeError if MODE is neither DB_OPEN_NEW nor
    DB_OPEN_READ."""
    self.mode = mode
    if mode not in (DB_OPEN_NEW, DB_OPEN_READ):
      raise RuntimeError("Invalid 'mode' argument to PersistenceManager")
    self.svn2cvs_db = Database(temp(SVN_REVNUMS_TO_CVS_REVS), mode)
    Cleanup().register(temp(SVN_REVNUMS_TO_CVS_REVS), pass8)
    self.cvs2svn_db = Database(temp(CVS_REVS_TO_SVN_REVNUMS), mode)
    Cleanup().register(temp(CVS_REVS_TO_SVN_REVNUMS), pass8)
    self.svn_commit_names_dates = Database(temp(SVN_COMMIT_NAMES_DATES), mode)
    Cleanup().register(temp(SVN_COMMIT_NAMES_DATES), pass8)
    # The metadata and CVS revision databases are always opened
    # read-only, whatever MODE is.
    self.svn_commit_metadata = Database(temp(METADATA_DB), DB_OPEN_READ)
    self.cvs_revisions = CVSRevisionDatabase(DB_OPEN_READ)
    ###PERF kff Elsewhere there are comments about sucking the tags db
    ### into memory.  That seems like a good idea.
    if not Ctx().trunk_only:
      # These two attributes exist only for non-trunk-only conversions.
      self.tags_db = TagsDatabase(DB_OPEN_READ)
      self.motivating_revnums = Database(temp(MOTIVATING_REVNUMS), mode)
      Cleanup().register(temp(MOTIVATING_REVNUMS), pass8)

    # "branch_name" -> svn_revnum in which branch was last filled.
    # This is used by CVSCommit._pre_commit, to prevent creating a fill
    # revision which would have nothing to do.
    self.last_filled = {}

  def _check_writable(self):
    """Raise RuntimeError if this PersistenceManager was opened with
    DB_OPEN_READ; the set_* methods call this before writing."""
    if self.mode == DB_OPEN_READ:
      raise RuntimeError(
          'Write operation attempted on read-only PersistenceManager')

  def get_svn_revnum(self, cvs_rev_unique_key):
    """Return the Subversion revision number in which
    CVS_REV_UNIQUE_KEY was committed, or SVN_INVALID_REVNUM if there
    is no mapping for CVS_REV_UNIQUE_KEY."""
    return int(self.cvs2svn_db.get(cvs_rev_unique_key, SVN_INVALID_REVNUM))

  def get_svn_commit(self, svn_revnum):
    """Return an SVNCommit that corresponds to SVN_REVNUM.

    If no SVNCommit exists for revnum SVN_REVNUM, then return None.

    This method can throw SVNCommitInternalInconsistencyError.
    """
    # Look the revision up *before* building an SVNCommit: constructing
    # one for a revision that does not exist is wasted work, and the
    # SVNCommit constructor advances the global SVNCommit.revnum
    # counter whenever it is handed a false revnum.
    c_rev_keys = self.svn2cvs_db.get(str(svn_revnum), None)
    if c_rev_keys is None:
      return None

    svn_commit = SVNCommit("Retrieved from disk", svn_revnum)

    digest = None
    for key in c_rev_keys:
      c_rev = self.cvs_revisions.get_revision(key)
      svn_commit.add_revision(c_rev)
      # Set the author and log message for this commit by using
      # CVSRevision metadata, but only if haven't done so already.
      if digest is None:
        digest = c_rev.digest
        author, log_msg = self.svn_commit_metadata[digest]
        svn_commit.set_author(author)
        svn_commit.set_log_msg(log_msg)

    # If we're doing a trunk-only conversion, we don't need to do any more work.
    if Ctx().trunk_only:
      return svn_commit

    name, date = self._get_name_and_date(svn_revnum)
    if name:
      svn_commit.set_symbolic_name(name)
      svn_commit.set_date(date)
      if self.tags_db.has_key(name):
        svn_commit.is_tag = 1

    motivating_revnum = self.motivating_revnums.get(str(svn_revnum), None)
    if motivating_revnum:
      svn_commit.set_motivating_revnum(int(motivating_revnum))
      svn_commit.set_date(date)

    if len(svn_commit.cvs_revs) and name:
      msg = """An SVNCommit cannot have cvs_revisions *and* a
      corresponding symbolic name ('%s') to fill.""" % name
      raise SVNCommit.SVNCommitInternalInconsistencyError(msg)

    return svn_commit

  def set_cvs_revs(self, svn_revnum, cvs_revs):
    """Record the bidirectional mapping between SVN_REVNUM and
    CVS_REVS.

    Raises RuntimeError if this PersistenceManager is read-only."""
    self._check_writable()
    # Compute each unique key once and reuse it for the log output and
    # for both directions of the mapping.
    keys = [c_rev.unique_key() for c_rev in cvs_revs]
    for key in keys:
      Log().write(LOG_VERBOSE, " ", key)
    self.svn2cvs_db[str(svn_revnum)] = keys
    for key in keys:
      self.cvs2svn_db[key] = svn_revnum

  def set_name_and_date(self, svn_revnum, name, date):
    """Associate symbolic name NAME and DATE with SVN_REVNUM, and
    remember SVN_REVNUM as the revision in which NAME was last filled.

    Raises RuntimeError if this PersistenceManager is read-only."""
    self._check_writable()
    self.svn_commit_names_dates[str(svn_revnum)] = (name, date)
    self.last_filled[name] = svn_revnum

  def _get_name_and_date(self, svn_revnum):
    """Return a tuple containing the symbolic name and date associated
    with SVN_REVNUM, or (None, None) if SVN_REVNUM has no such data
    associated with it."""
    return self.svn_commit_names_dates.get(str(svn_revnum), (None, None))

  def set_motivating_revnum(self, svn_revnum, motivating_revnum):
    """Store MOTIVATING_REVNUM as the value of SVN_REVNUM.

    Raises RuntimeError if this PersistenceManager is read-only."""
    self._check_writable()
    self.motivating_revnums[str(svn_revnum)] = str(motivating_revnum)
1786
1787
class CVSCommit:
  """Each instance of this class contains a number of CVS Revisions
  that correspond to one or more Subversion Commits.  After all CVS
  Revisions are added to the grouping, calling process_revisions will
  generate a Subversion Commit (or Commits) for the set of CVS
  Revisions in the grouping."""

  def __init__(self, digest, author, log):
    """Create an empty grouping identified by DIGEST, with AUTHOR and
    LOG as its author and log message."""
    self.digest = digest
    self.author = author
    self.log = log

    # Symbolic names for which the last source revision has already
    # been seen and for which the CVSRevisionAggregator has already
    # generated a fill SVNCommit.  See self.process_revisions().
    self.done_symbols = [ ]

    # Keys are the fnames of the CVSRevisions added so far.
    self.files = { }
    # Lists of CVSRevisions
    self.changes = [ ]
    self.deletes = [ ]

    # Start out with a t_min higher than any incoming time T, and a
    # t_max lower than any incoming T.  This way the first T will
    # push t_min down to T, and t_max up to T, naturally (without any
    # special-casing), and successive times will then ratchet them
    # outward as appropriate.
    self.t_min = long(1) << 32
    self.t_max = 0

    # This will be set to the SVNCommit that occurs in self._commit.
    self.motivating_commit = None

    # This is a list of all non-primary commits motivated by the main
    # commit.  We gather these so that we can set their dates to the
    # same date as the primary commit.
    self.secondary_commits = [ ]

    # State for handling default branches.
    #
    # Here is a tempting, but ultimately nugatory, bit of logic, which
    # I share with you so you may appreciate the less attractive, but
    # refreshingly non-nugatory, logic which follows it:
    #
    # If some of the commits in this txn happened on a non-trunk
    # default branch, then those files will have to be copied into
    # trunk manually after being changed on the branch (because the
    # RCS "default branch" appears as head, i.e., trunk, in practice).
    # As long as those copies don't overwrite any trunk paths that
    # were also changed in this commit, then we can do the copies in
    # the same revision, because they won't cover changes that don't
    # appear anywhere/anywhen else.  However, if some of the trunk dst
    # paths *did* change in this commit, then immediately copying the
    # branch changes would lose those trunk mods forever.  So in this
    # case, we need to do at least that copy in its own revision.  And
    # for simplicity's sake, if we're creating the new revision for
    # even one file, then we just do all such copies together in the
    # new revision.
    #
    # Doesn't that sound nice?
    #
    # Unfortunately, Subversion doesn't support copies with sources
    # in the current txn.  All copies must be based in committed
    # revisions.  Therefore, we generate the above-described new
    # revision unconditionally.
    #
    # This is a list of c_revs, and a c_rev is appended for each
    # default branch commit that will need to be copied to trunk (or
    # deleted from trunk) in some generated revision following the
    # "regular" revision.
    self.default_branch_cvs_revisions = [ ]

  def __cmp__(self, other):
    """Order commits by t_max, then by t_min, and lastly by digest."""
    return (cmp(self.t_max, other.t_max) or cmp(self.t_min, other.t_min)
            or cmp(self.digest, other.digest))

  def has_file(self, fname):
    """Return true if a CVSRevision of FNAME has been added to this
    commit."""
    return self.files.has_key(fname)

  def revisions(self):
    """Return a new list of all CVSRevisions in this commit: the
    changes followed by the deletes."""
    return self.changes + self.deletes

  def opens_symbolic_name(self, name):
    """Return 1 if any CVSRevision in this commit answers true to
    opens_symbolic_name(NAME), else 0."""
    for c_rev in self.revisions():
      if c_rev.opens_symbolic_name(name):
        return 1
    return 0

  def add_revision(self, c_rev):
    """Add C_REV to this grouping, widening the [t_min, t_max] time
    range to include its timestamp and noting its file name."""
    # Record the time range of this commit.
    #
    # ### ISSUE: It's possible, though unlikely, that the time range
    # of a commit could get gradually expanded to be arbitrarily
    # longer than COMMIT_THRESHOLD.  I'm not sure this is a huge
    # problem, and anyway deciding where to break it up would be a
    # judgement call.  For now, we just print a warning in
    # process_revisions() if this happens.
    if c_rev.timestamp < self.t_min:
      self.t_min = c_rev.timestamp
    if c_rev.timestamp > self.t_max:
      self.t_max = c_rev.timestamp

    if c_rev.op == OP_DELETE:
      self.deletes.append(c_rev)
    else:
      # OP_CHANGE or OP_ADD
      self.changes.append(c_rev)

    self.files[c_rev.fname] = 1

  def _pre_commit(self):
    """Generates any SVNCommits that must exist before the main
    commit."""

    # There may be multiple c_revs in this commit that would cause
    # branch B to be filled, but we only want to fill B once.  On the
    # other hand, there might be multiple branches committed on in
    # this commit.  Whatever the case, we should count exactly one
    # commit per branch, because we only fill a branch once per
    # CVSCommit.  This list tracks which branches we've already
    # counted.
    accounted_for_sym_names = [ ]

    def fill_needed(c_rev, pm):
      """Return 1 if this is the first commit on a new branch (for
      this file) and we need to fill the branch; else return 0
      (meaning that some other file's first commit on the branch has
      already done the fill for us).

      If C_REV.op is OP_ADD, only return 1 if the branch that this
      commit is on has no last filled revision.

      PM is a PersistenceManager to query.
      """

      # Equal '.' counts mean that c_rev is on the same line of
      # development as its predecessor, so no fill can be needed.
      if c_rev.rev.count('.') == c_rev.prev_rev.count('.'):
        return 0

      # It should be the case that when we have a file F that
      # is added on branch B (thus, F on trunk is in state
      # 'dead'), we generate an SVNCommit to fill B iff the branch
      # has never been filled before.
      #
      # If this c_rev.op == OP_ADD, *and* the branch has never
      # been filled before, then fill it now.  Otherwise, no need to
      # fill it.
      if c_rev.op == OP_ADD:
        if pm.last_filled.get(c_rev.branch_name, None) is None:
          return 1
        return 0

      # Only this path needs the revnum of the previous revision, so
      # the database lookup is done here rather than up front.
      svn_revnum = pm.get_svn_revnum(c_rev.unique_key(c_rev.prev_rev))
      if svn_revnum > pm.last_filled.get(c_rev.branch_name, 0):
        return 1
      return 0

    for c_rev in self.changes + self.deletes:
      # If a commit is on a branch, we must ensure that the branch
      # path being committed exists (in HEAD of the Subversion
      # repository).  If it doesn't exist, we will need to fill the
      # branch.  After the fill, the path on which we're committing
      # will exist.
      if c_rev.branch_name \
          and c_rev.branch_name not in accounted_for_sym_names \
          and c_rev.branch_name not in self.done_symbols \
          and fill_needed(c_rev, Ctx()._persistence_manager):
        svn_commit = SVNCommit("pre-commit symbolic name '%s'"
                               % c_rev.branch_name)
        svn_commit.set_symbolic_name(c_rev.branch_name)
        self.secondary_commits.append(svn_commit)
        accounted_for_sym_names.append(c_rev.branch_name)

  def _commit(self):
    """Generates the primary SVNCommit that corresponds to this
    CVSCommit."""
    # Generate an SVNCommit unconditionally.  Even if the only change
    # in this CVSCommit is a deletion of an already-deleted file (that
    # is, a CVS revision in state 'dead' whose predecessor was also in
    # state 'dead'), the conversion will still generate a Subversion
    # revision containing the log message for the second dead
    # revision, because we don't want to lose that information.
    svn_commit = SVNCommit("commit")
    self.motivating_commit = svn_commit

    for c_rev in self.changes:
      svn_commit.add_revision(c_rev)
      # Only make a change if we need to.  When 1.1.1.1 has an empty
      # deltatext, the explanation is almost always that we're looking
      # at an imported file whose 1.1 and 1.1.1.1 are identical.  On
      # such imports, CVS creates an RCS file where 1.1 has the
      # content, and 1.1.1.1 has an empty deltatext, i.e, the same
      # content as 1.1.  There's no reason to reflect this non-change
      # in the repository, so we want to do nothing in this case.  (If
      # we were really paranoid, we could make sure 1.1's log message
      # is the CVS-generated "Initial revision\n", but I think the
      # conditions below are strict enough.)
      if not ((c_rev.deltatext_code == DELTATEXT_EMPTY)
              and (c_rev.rev == "1.1.1.1")):
        if c_rev.is_default_branch_revision():
          self.default_branch_cvs_revisions.append(c_rev)

    for c_rev in self.deletes:
      # When a file is added on a branch, CVS not only adds the file
      # on the branch, but generates a trunk revision (typically
      # 1.1) for that file in state 'dead'.  We only want to add
      # this revision if the log message is not the standard cvs
      # fabricated log message.
      if c_rev.prev_rev is None:
        # c_rev.branches may be empty if the originating branch
        # has been excluded.
        if not c_rev.branches:
          continue
        cvs_generated_msg = ('file %s was initially added on branch %s.\n'
                             % (c_rev.filename(),
                                c_rev.branches[0]))
        author, log_msg = \
            Ctx()._persistence_manager.svn_commit_metadata[c_rev.digest]
        if log_msg == cvs_generated_msg:
          continue

      svn_commit.add_revision(c_rev)
      if c_rev.is_default_branch_revision():
        self.default_branch_cvs_revisions.append(c_rev)

    # There is a slight chance that we didn't actually register any
    # CVSRevisions with our SVNCommit (see loop over self.deletes
    # above), so if we have no CVSRevisions, we don't flush the
    # svn_commit to disk and roll back our revnum.
    if svn_commit.cvs_revs:
      svn_commit.flush()
    else:
      # We will not be flushing this SVNCommit, so rollback the
      # SVNCommit revision counter.
      SVNCommit.revnum = SVNCommit.revnum - 1

    if not Ctx().trunk_only:
      for c_rev in self.revisions():
        Ctx()._symbolings_logger.log_revision(c_rev, svn_commit.revnum)

  def _post_commit(self):
    """Generates any SVNCommits that we can perform now that _commit
    has happened.  That is, handle non-trunk default branches.
    Sometimes an RCS file has a non-trunk default branch, so a commit
    on that default branch would be visible in a default CVS checkout
    of HEAD.  If we don't copy that commit over to Subversion's trunk,
    then there will be no Subversion tree which corresponds to that
    CVS checkout.  Of course, in order to copy the path over, we may
    first need to delete the existing trunk there.  """

    # Only generate a commit if we have default branch revs
    if self.default_branch_cvs_revisions:
      # Generate an SVNCommit for all of our default branch c_revs.
      svn_commit = SVNCommit("post-commit default branch(es)")
      svn_commit.set_motivating_revnum(self.motivating_commit.revnum)
      for c_rev in self.default_branch_cvs_revisions:
        svn_commit.add_revision(c_rev)
        Ctx()._symbolings_logger.log_default_branch_closing(c_rev,
                                                            svn_commit.revnum)
      self.secondary_commits.append(svn_commit)

  def process_revisions(self, done_symbols):
    """Process all the CVSRevisions that this instance has, creating
    one or more SVNCommits in the process.  Generate fill SVNCommits
    only for symbols not in DONE_SYMBOLS (avoids unnecessary
    fills).

    Return the primary SVNCommit that corresponds to this CVSCommit.
    The returned SVNCommit is the commit that motivated any other
    SVNCommits generated in this CVSCommit."""
    self.done_symbols = done_symbols
    # Both endpoints are included, hence the + 1.
    seconds = self.t_max - self.t_min + 1

    Log().write(LOG_VERBOSE, '-' * 60)
    Log().write(LOG_VERBOSE, 'CVS Revision grouping:')
    if seconds == 1:
      Log().write(LOG_VERBOSE, '  Start time: %s (duration: 1 second)'
                  % time.ctime(self.t_max))
    else:
      Log().write(LOG_VERBOSE, '  Start time: %s' % time.ctime(self.t_min))
      Log().write(LOG_VERBOSE, '  End time:   %s (duration: %d seconds)'
                  % (time.ctime(self.t_max), seconds))

    if seconds > COMMIT_THRESHOLD + 1:
      Log().write(LOG_WARN, '%s: grouping spans more than %d seconds'
                  % (warning_prefix, COMMIT_THRESHOLD))

    if Ctx().trunk_only: # Only do the primary commit if we're trunk-only
      self._commit()
      return self.motivating_commit

    self._pre_commit()
    self._commit()
    self._post_commit()

    # Secondary commits take the date of the primary commit and are
    # flushed only now, after the primary commit has been made.
    for svn_commit in self.secondary_commits:
      svn_commit.set_date(self.motivating_commit.get_date())
      svn_commit.flush()

    return self.motivating_commit
2090
2091
class SVNCommit:
  """One commit to the Subversion repository.

  Every SVNCommit is of exactly one of three kinds:

  1. It commits one or more CVSRevisions (and cannot fill a symbolic
     name).

  2. It creates or fills a symbolic name (and cannot commit
     CVSRevisions).

  3. It updates trunk to reflect the contents of a particular branch
     (this is how RCS default branches are handled)."""

  # Revision number that the next SVNCommit created without an
  # explicit REVNUM will receive.  Counting starts at 2 because
  # SVNRepositoryMirror uses the first commit to create trunk, tags,
  # and branches.
  revnum = 2

  class SVNCommitInternalInconsistencyError(Exception):
    """Raised when an impossible state is encountered in the SVNCommit
    databases."""

  def __init__(self, description="", revnum=None, cvs_revs=None):
    """Create an SVNCommit.

    DESCRIPTION is used for debugging output only.  If REVNUM is
    true, this SVNCommit corresponds to that revision number;
    otherwise it takes the next number from SVNCommit.revnum, which is
    then incremented.  CVS_REVS, if given, must be the exact set of
    CVSRevisions for REVNUM.

    Callers must not pass CVS_REVS without REVNUM (this is not checked
    here).  Passing REVNUM without CVS_REVS is fine; revisions can
    then be added one at a time with add_revision()."""
    self._description = description
    self.cvs_revs = cvs_revs or []

    # Revprop metadata for this commit.
    #
    # The values assigned here are placeholders; at least the log and
    # the date should have been replaced by the time they are used.
    #
    # The attributes are private because they have to be handed out
    # encoded in UTF8 even though callers are free to set them in the
    # local encoding.  They are therefore written through the set_*()
    # accessors and read, in dictionary form, via self.get_revprops().
    self._author = Ctx().username
    self._log_msg = "This log message means an SVNCommit was used too soon."
    self._max_date = 0  # Latest date seen so far.

    if not revnum:
      # No explicit number: claim the next unused one.
      self.revnum = SVNCommit.revnum
      SVNCommit.revnum = self.revnum + 1
    else:
      self.revnum = revnum

    # Name of the symbol filled by this SVNCommit, if there is one.
    self.symbolic_name = None

    # For a default branch synchronization commit, the subversion
    # revision number of the *primary* commit in which the default
    # branch changes really happened; None for every other commit.
    #
    # Several synchronization commits may point at the same motivating
    # revision number, and one synchronization commit may contain
    # CVSRevisions from several different default branches.
    self.motivating_revnum = None

    # True only when this commit fills a symbolic name that is a tag;
    # None in every other case.
    self.is_tag = None

  def set_symbolic_name(self, name):
    "Store the cleaned-up form of NAME in self.symbolic_name."
    self.symbolic_name = _clean_symbolic_name(name)

  def set_motivating_revnum(self, revnum):
    "Store REVNUM in self.motivating_revnum."
    self.motivating_revnum = revnum

  def set_author(self, author):
    """Make AUTHOR (a locally-encoded string) the author of this
    SVNCommit.  There is no other way to set the author."""
    self._author = author

  def set_log_msg(self, msg):
    """Make MSG (a locally-encoded string) the log message of this
    SVNCommit.  There is no other way to set the log message."""
    self._log_msg = msg

  def set_date(self, date):
    """Make DATE (an integer) the date of this SVNCommit.

    self.add_revision() keeps the date current from the CVSRevisions
    it is given, so calling this may be unnecessary, and a value set
    here can still be replaced by a later self.add_revision()."""
    self._max_date = date

  def get_date(self):
    """Return the date of this SVNCommit as an integer."""
    return self._max_date

  def get_revprops(self):
    """Return a dictionary with the 'svn:author', 'svn:log' and
    'svn:date' revprops of this SVNCommit.

    Author and log message are decoded with Ctx().encoding (replacing
    undecodable characters) and returned as UTF8.  Should that raise
    UnicodeError, warnings are logged and the unconverted values are
    returned instead."""
    date = format_date(self._max_date)
    try:
      ### FIXME: The 'replace' behavior should be an option, like
      ### --encoding is.
      if self._author is None:
        utf8_author = None
      else:
        utf8_author = unicode(self._author, Ctx().encoding,
                              'replace').encode('utf8')
      utf8_log = unicode(self.get_log_msg(), Ctx().encoding,
                         'replace').encode('utf8')
    except UnicodeError:
      Log().write(LOG_WARN, '%s: problem encoding author or log message:'
                  % warning_prefix)
      Log().write(LOG_WARN, "  author: '%s'" % self._author)
      Log().write(LOG_WARN, "  log:    '%s'" % self.get_log_msg().rstrip())
      Log().write(LOG_WARN, "  date:   '%s'" % date)
      Log().write(LOG_WARN, "(subversion rev %s)  Related files:" % self.revnum)
      for c_rev in self.cvs_revs:
        Log().write(LOG_WARN, " ", c_rev.fname)

      Log().write(LOG_WARN, "Consider rerunning with (for example)",
                  "'--encoding=latin1'.\n")
      # Handing back the original data (in its unknown encoding) beats
      # both giving up and recording nothing at all.
      return { 'svn:author' : self._author,
               'svn:log'    : self.get_log_msg(),
               'svn:date'   : date }

    return { 'svn:author' : utf8_author,
             'svn:log'    : utf8_log,
             'svn:date'   : date }

  def add_revision(self, cvs_rev):
    """Append CVS_REV to self.cvs_revs and move this commit's date
    forward to CVS_REV's timestamp if that is later."""
    self.cvs_revs.append(cvs_rev)
    if self._max_date < cvs_rev.timestamp:
      self._max_date = cvs_rev.timestamp

  def _is_primary_commit(self):
    """Return true if this is a primary SVNCommit, i.e. one with
    neither a symbolic name nor a motivating revnum; false otherwise."""
    return not self.symbolic_name and not self.motivating_revnum

  def flush(self):
    """Log the creation of this commit and record it with the
    persistence manager: always its CVSRevisions, its motivating
    revnum if it has one, and, for non-primary commits, its symbolic
    name and date."""
    Log().write(LOG_NORMAL, "Creating Subversion commit %d (%s)"
                % (self.revnum, self._description))
    persistence_manager = Ctx()._persistence_manager
    persistence_manager.set_cvs_revs(self.revnum, self.cvs_revs)

    if self.motivating_revnum is not None:
      persistence_manager.set_motivating_revnum(self.revnum,
                                                self.motivating_revnum)

    # Primary commits need nothing more; any other commit also has its
    # date and/or symbolic name stored.
    if self._is_primary_commit():
      return
    persistence_manager.set_name_and_date(self.revnum,
                                          self.symbolic_name,
                                          self._max_date)

  def __str__(self):
    """Return a human-readable description of this SVNCommit.  The
    format is not meant to be machine-parseable (although we're not
    going to stop you if you try!)"""
    if self.symbolic_name:
      name_line = "   symbolic name: " + self.symbolic_name + "\n"
    else:
      name_line = "   NO symbolic name\n"

    parts = [ "SVNCommit #: " + str(self.revnum) + "\n",
              name_line,
              "   debug description: " + self._description + "\n",
              "   cvs_revs:\n" ]
    for c_rev in self.cvs_revs:
      parts.append("     " + c_rev.unique_key() + "\n")
    return "".join(parts)

  def get_log_msg(self):
    """Return the real log message for a primary commit, or the
    matching manufactured message for a secondary commit."""
    if self.symbolic_name is not None:
      return self._log_msg_for_symbolic_name_commit()
    if self.motivating_revnum is not None:
      return self._log_msg_for_default_branch_commit()
    return self._log_msg

  def _log_msg_for_symbolic_name_commit(self):
    """Return the log message for a manufactured commit that fills
    self.symbolic_name, worded for a tag if self.is_tag is true and
    for a branch otherwise."""
    if self.is_tag:
      kind = 'tag'
    else:
      kind = 'branch'

    # In Python 2.2.3, we could use textwrap.fill().  Oh well :-).
    # Names of 13 or more characters go on a line of their own.
    if len(self.symbolic_name) < 13:
      separator = ' '
    else:
      separator = '\n'

    return "This commit was manufactured by cvs2svn to create %s%s'%s'." \
           % (kind, separator, self.symbolic_name)

  def _log_msg_for_default_branch_commit(self):
    """Return the log message for a manufactured commit that
    synchronizes a non-trunk default branch with trunk."""
    template = ('This commit was generated by cvs2svn to compensate for '
                'changes in r%d,\n'
                'which included commits to RCS files with non-trunk default '
                'branches.\n')
    return template % self.motivating_revnum
2301
class CVSRevisionAggregator:
  """Groups CVSRevisions into CVSCommits, each of which represents at
  least one SVNCommit."""

  def __init__(self):
    """Open the databases the aggregator reads from and install a
    SymbolingsLogger, a PersistenceManager and the default branches
    database on Ctx()."""
    self.metadata_db = Database(temp(METADATA_DB), DB_OPEN_READ)
    if not Ctx().trunk_only:
      self.last_revs_db = Database(temp(SYMBOL_LAST_CVS_REVS_DB), DB_OPEN_READ)

    # CVSCommits that are still accumulating revisions, keyed by
    # digest (with '-' characters appended once a commit has been
    # closed to further changes; see process_revision()).
    self.cvs_commits = {}

    # Symbols waiting to be closed.  Only the keys matter; every value
    # is None.
    self.pending_symbols = {}

    # Symbols for which the last CVSRevision that is a source for the
    # symbol has already been seen.  Their final fill is done, so they
    # never need to be filled again.
    self.done_symbols = [ ]

    # The most recently created primary svn_commit object.  It is kept
    # around only for its date, which is given to the SVNCommits
    # created in self.attempt_to_commit_symbols().
    self.latest_primary_svn_commit = None

    Ctx()._symbolings_logger = SymbolingsLogger()
    Ctx()._persistence_manager = PersistenceManager(DB_OPEN_NEW)
    Ctx()._default_branches_db = Database(temp(DEFAULT_BRANCHES_DB),
                                          DB_OPEN_READ)


  def process_revision(self, c_rev):
    """Add C_REV to the pending CVSCommit for its digest, then process
    every pending CVSCommit that C_REV can no longer belong to.

    A pending commit is ready once its t_max lies more than
    COMMIT_THRESHOLD before C_REV's timestamp.  Ready commits are
    processed in sorted order, and after each one (or once, if none is
    ready) an attempt is made to commit pending symbols."""
    # Every new revision is a chance to see whether any of the commits
    # accumulated so far are ready for processing.
    ready_queue = [ ]
    for digest_key, pending in self.cvs_commits.items():
      if pending.t_max + COMMIT_THRESHOLD < c_rev.timestamp:
        ready_queue.append(pending)
        del self.cvs_commits[digest_key]
      elif pending.has_file(c_rev.fname):
        # The inbound revision touches a file that this pending commit
        # already contains, so close the pending commit to further
        # changes.  It is not flushed yet, because other pending
        # commits may be dated before it.
        # ### ISSUE: the has_file() check above is not optimal.
        # It does fix the dataloss bug where revisions would get lost
        # if checked in too quickly, but it can also break apart the
        # commits.  The correct fix would require tracking the
        # dependencies between change sets and committing them in
        # proper order.
        #
        # Re-key the commit under a string that is not yet a key in
        # self.cvs_commits, so the digest is free for a new commit.
        closed_key = digest_key + '-'
        while self.cvs_commits.has_key(closed_key):
          closed_key = closed_key + '-'
        self.cvs_commits[closed_key] = pending
        del self.cvs_commits[digest_key]

    # File the revision under its digest, starting a fresh CVSCommit
    # if none is open for that digest.
    if not self.cvs_commits.has_key(c_rev.digest):
      author, log = self.metadata_db[c_rev.digest]
      self.cvs_commits[c_rev.digest] = CVSCommit(c_rev.digest,
                                                 author, log)
    self.cvs_commits[c_rev.digest].add_revision(c_rev)

    # Whatever landed in the ready queue must be processed now, since
    # this latest rev cannot possibly be part of it.  Process in
    # time-order.
    ready_queue.sort()

    # Symbols have to be attempted for this c_rev even when no commit
    # is ready.
    if not ready_queue:
      self.attempt_to_commit_symbols(ready_queue, c_rev)

    for ready_commit in ready_queue[:]:
      self.latest_primary_svn_commit \
          = ready_commit.process_revisions(self.done_symbols)
      ready_queue.remove(ready_commit)
      self.attempt_to_commit_symbols(ready_queue, c_rev)

  def flush(self):
    """Commit everything still held in self.cvs_commits, then tell the
    SymbolingsLogger that all commits are done (unless this is a
    trunk-only conversion)."""
    ready_queue = [ (cvs_commit, digest_key)
                    for digest_key, cvs_commit in self.cvs_commits.items() ]
    ready_queue.sort()

    for entry in ready_queue[:]:
      cvs_commit, digest_key = entry
      self.latest_primary_svn_commit = \
        cvs_commit.process_revisions(self.done_symbols)
      ready_queue.remove(entry)
      del self.cvs_commits[digest_key]
      self.attempt_to_commit_symbols([])

    if not Ctx().trunk_only:
      Ctx()._symbolings_logger.close()

  def attempt_to_commit_symbols(self, queued_commits, c_rev=None):
    """
    Generate one SVNCommit for each symbol in self.pending_symbols
    that has no opening CVSRevision left in either QUEUED_COMMITS or
    self.cvs_commits.values().

    If C_REV is not None, the symbols for which C_REV is the last
    CVSRevision are added to self.pending_symbols first.
    """
    # Unless this is a trunk-only conversion, look up the symbolic
    # names for which this c_rev is the last *source* CVSRevision and
    # add them to whatever is left over from earlier passes through
    # the aggregator.
    if c_rev and not Ctx().trunk_only:
      for sym in self.last_revs_db.get(c_rev.unique_key(), []):
        self.pending_symbols[sym] = None

    # Collect the symbols that still have *source* CVSRevisions in a
    # commit that has not been processed yet.
    still_open = {}
    for sym in self.pending_symbols.keys():
      for cvs_commit in self.cvs_commits.values() + queued_commits:
        if cvs_commit.opens_symbolic_name(sym):
          still_open[sym] = None
          break

    # Walk the pending symbols in sorted order so that the result does
    # not depend on the order in which the dict hashing algorithm
    # hands them out; this keeps test results identical across
    # platforms.
    symbols = self.pending_symbols.keys()
    symbols.sort()
    for sym in symbols:
      if not still_open.has_key(sym):
        # Nothing can add to this symbol any more, so close it.
        closing_commit = SVNCommit("closing tag/branch '%s'" % sym)
        closing_commit.set_symbolic_name(sym)
        closing_commit.set_date(self.latest_primary_svn_commit.get_date())
        closing_commit.flush()
        self.done_symbols.append(sym)
        del self.pending_symbols[sym]
2442
2443
class SymbolingsReader:
  """Interface to the SYMBOL_OPENINGS_CLOSINGS_SORTED file and the
  SYMBOL_OFFSETS_DB.  It does the heavy lifting of finding and
  returning the right opening and closing Subversion revision numbers
  for a given symbolic name."""

  def __init__(self):
    """Open SYMBOL_OPENINGS_CLOSINGS_SORTED for reading and load the
    whole offsets database into memory."""
    self.symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    # The offsets database is really small and gets read and written a
    # fair bit, so it is copied into a plain dictionary.
    on_disk_offsets = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_READ)
    self.offsets = { }
    for symbol in on_disk_offsets.db.keys():
      self.offsets[symbol] = on_disk_offsets[symbol]

  def filling_guide_for_symbol(self, symbolic_name, svn_revnum):
    """Return a new SymbolicNameFillingGuide for SYMBOLIC_NAME holding
    the openings and closings recorded up to and including SVN_REVNUM.

    If an opening rev is encountered in this fill while its closing
    rev takes place later than SVN_REVNUM, the closing is not passed
    to the SymbolicNameFillingGuide in this fill (and is discarded
    when encountered in a later fill).  That is perfectly fine: a
    valid fill is still possible without the closing--we always try
    to fill what we can as soon as we can."""
    symbol_fill = SymbolicNameFillingGuide(symbolic_name)

    # A branch may start with a file that was added on a branch, in
    # which case there is no offset for it and the guide stays empty.
    if not self.offsets.has_key(symbolic_name):
      return symbol_fill

    # Continue reading where the records for symbolic_name begin (or
    # where the previous fill stopped).
    self.symbolings.seek(self.offsets[symbolic_name])

    while 1:
      line_start = self.symbolings.tell()
      record = self.symbolings.readline().rstrip()
      if not record:
        break
      name, revnum, kind, svn_path = record.split(" ", 3)
      revnum = int(revnum)
      if name == symbolic_name and revnum <= svn_revnum:
        symbol_fill.register(svn_path, revnum, kind)
      else:
        break

    # If anything that was read got used, remember the start of the
    # last line read (the first one not consumed) so the next fill
    # resumes from there.
    if not symbol_fill.is_empty():
      self.offsets[symbolic_name] = line_start

    symbol_fill.make_node_tree()
    return symbol_fill
2501
2502
class SymbolicNameFillingGuide:
  """A SymbolicNameFillingGuide is essentially a node tree
  representing the source paths to be copied to fill self.name in the
  current SVNCommit.

  After calling self.register() on a series of openings and closings,
  call self.make_node_tree() to prepare self.node_tree for
  examination.  See the docstring for self.make_node_tree() for
  details on the structure of self.node_tree.

  By walking self.node_tree and calling self.get_best_revnum() on each
  node, the caller can determine what subversion revision number to
  copy the path corresponding to that node from.  self.node_tree
  should be treated as read-only.

  The caller can then descend to sub-nodes to see if their "best
  revnum" differs from their parents' and if it does, take appropriate
  actions to "patch up" the subtrees."""

  def __init__(self, symbolic_name):
    """Initialize a SymbolicNameFillingGuide for SYMBOLIC_NAME, ready
    to receive openings and closings through self.register()."""
    self.name = symbolic_name

    # Keys under which a leaf node stores its opening and closing
    # revnum.  They start with '/', which sets them apart from path
    # components when a node is scanned.
    self.opening_key = "/o"
    self.closing_key = "/c"

    # A dictionary of SVN_PATHS and SVN_REVNUMS whose format is:
    #
    # { svn_path : { self.opening_key : svn_revnum,
    #                self.closing_key : svn_revnum }
    #                ...}
    self.things = { }

    # The key for the root node of the node tree
    self.root_key = '0'
    # The dictionary that holds our node tree, seeded with the root key.
    self.node_tree = { self.root_key : { } }

  def get_best_revnum(self, node, preferred_revnum):
    """Determine the best subversion revision number to use when
    copying the source tree beginning at NODE (a key in
    self.node_tree).

    Return a tuple (REVNUM, SCORE), where SCORE is the score that
    self._score_revisions() assigned to the winning revision.

    PREFERRED_REVNUM is passed to self._best_rev and used to
    calculate the best revnum.

    If no revision can be found, write an error message to stderr and
    exit with status 1."""
    # Aggregate openings and closings from the rev tree
    openings = self._list_revnums_for_key(node, self.opening_key)
    closings = self._list_revnums_for_key(node, self.closing_key)

    # Score the lists
    scores = self._score_revisions(self._sum_revnum_counts(openings),
                                   self._sum_revnum_counts(closings))

    revnum, max_score = self._best_rev(scores, preferred_revnum)

    if revnum == SVN_INVALID_REVNUM:
      # Report the symbol being filled; it is stored in self.name.
      sys.stderr.write(error_prefix + ": failed to find a revision "
                       + "to copy from when copying %s\n" % self.name)
      sys.exit(1)
    return revnum, max_score


  def _best_rev(self, scores, preferred_rev):
    """Return a tuple (REV, MAX_SCORE) naming the revision with the
    highest score from SCORES, a list returned by _score_revisions().
    When the maximum score is shared by multiple revisions, the oldest
    revision is selected, unless PREFERRED_REV is one of the
    possibilities, in which case, it is selected.

    REV is SVN_INVALID_REVNUM if no revision has a score above zero
    and PREFERRED_REV does not qualify."""
    max_score = 0
    preferred_rev_score = -1
    rev = SVN_INVALID_REVNUM
    if preferred_rev is None:
      # Comparison order of different types is arbitrary. Do not
      # expect None to compare less than int values below.
      # In Python 2.3 None compares with ints like negative infinity.
      # In Python 2.0 None compares with ints like positive infinity.
      preferred_rev = SVN_INVALID_REVNUM
    for revnum, count in scores:
      if count > max_score:
        max_score = count
        rev = revnum
      # A score holds from its revision up to the next entry, so the
      # last entry at or below PREFERRED_REV gives PREFERRED_REV's
      # score.
      if revnum <= preferred_rev:
        preferred_rev_score = count
    if preferred_rev_score == max_score:
      rev = preferred_rev
    return rev, max_score


  def _score_revisions(self, openings, closings):
    """Return a list of revisions and scores based on OPENINGS and
    CLOSINGS.  The returned list looks like:

       [(REV1 SCORE1), (REV2 SCORE2), ...]

    where REV2 > REV1.  OPENINGS and CLOSINGS are lists of
    (REVNUM, COUNT) tuples as returned by self._sum_revnum_counts()
    for the opening and closing revnums of some file or directory
    node; CLOSINGS may also be None.

    Each score indicates that copying the corresponding revision (or
    any following revision up to the next revision in the list) of the
    object in question would yield that many correct paths at or
    underneath the object.  There may be other paths underneath it
    which are not correct and would need to be deleted or recopied;
    those can only be detected by descending and examining their
    scores.

    If OPENINGS is false, return the empty list."""
    # First look for easy outs.
    if not openings:
      return []

    # Must be able to iterate over closings below.
    if closings is None:
      closings = []

    # Every opening raises the running score from its revision on.
    scores = []
    opening_score_accum = 0
    for opening_rev, opening_score in openings:
      opening_score_accum = opening_score_accum + opening_score
      scores.append((opening_rev, opening_score_accum))

    # Every closing lowers the score from its revision on.  SCAN_START
    # skips the entries already found to lie before an earlier
    # closing (this assumes CLOSINGS is sorted ascending, which is how
    # self._sum_revnum_counts() produces it).
    scan_start = 0
    for closing_rev, closing_score in closings:
      done_exact_rev = None
      insert_index = None
      insert_score = None
      for j in range(scan_start, len(scores)):
        score_rev, score = scores[j]
        if score_rev >= closing_rev:
          if not done_exact_rev:
            if score_rev > closing_rev:
              # The closing falls between two entries, so it needs an
              # entry of its own, scored from the entry before it.
              # NOTE(review): for j == 0 this reads the last entry;
              # that would take a closing older than every opening.
              insert_index = j
              insert_score = scores[j-1][1] - closing_score
            done_exact_rev = 1
          scores[j] = (score_rev, score - closing_score)
        else:
          scan_start = j + 1
      if not done_exact_rev:
        # The closing is newer than every entry so far.
        scores.append((closing_rev, scores[-1][1] - closing_score))
      if insert_index is not None:
        scores.insert(insert_index, (closing_rev, insert_score))
    return scores

  def _sum_revnum_counts(self, rev_list):
    """Takes an array of revisions (REV_LIST), for example:

      [21, 18, 6, 49, 39, 24, 24, 24, 24, 24, 24, 24]

    and adds up every occurrence of each revision and returns a sorted
    array of tuples containing (svn_revnum, count):

      [(6, 1), (18, 1), (21, 1), (24, 7), (39, 1), (49, 1)]
    """
    counts = {}
    for revnum in rev_list:
      counts[revnum] = counts.get(revnum, 0) + 1
    # list() makes sure we sort a real list whatever items() returns.
    pairs = list(counts.items())
    pairs.sort()
    return pairs

  def _list_revnums_for_key(self, node, revnum_type_key):
    """Scan self.node_tree and return a list of all the revision
    numbers (including duplicates) contained in REVNUM_TYPE_KEY values
    for all leaf nodes at and under NODE.

    REVNUM_TYPE_KEY should be either self.opening_key or
    self.closing_key."""
    contents = self.node_tree[node]

    # A node that has self.opening_key must be a leaf node--all leaf
    # nodes have at least an opening key (although they may not have a
    # closing key).  A leaf contributes its own revnum, if it has one
    # of the requested type.
    try:
      contents[self.opening_key]
      return [contents[revnum_type_key]]
    except KeyError:
      pass

    revnums = []
    for component, child_key in contents.items():
      if component[0] == '/':
        # An opening or closing entry, not a child node.
        continue
      revnums = revnums + \
          self._list_revnums_for_key(child_key, revnum_type_key)
    return revnums

  def register(self, svn_path, svn_revnum, type):
    """Collects opening and closing revisions for this
    SymbolicNameFillingGuide.  SVN_PATH is the source path that needs
    to be copied into self.name, and SVN_REVNUM is either the first
    svn revision number that we can copy from (our opening), or the
    last (not inclusive) svn revision number that we can copy from
    (our closing).  TYPE indicates whether this path is an opening or
    a closing.

    The opening for a given SVN_PATH must be passed before the closing
    for it to have any effect... any closing encountered before a
    corresponding opening will be discarded.

    It is not necessary to pass a corresponding closing for every
    opening.
    """
    # Always log an OPENING; it replaces anything known about the path.
    if type == OPENING:
      self.things[svn_path] = {self.opening_key: svn_revnum}
    elif type == CLOSING:
      # Only log a closing if we've already registered the opening for
      # that path.
      entry = self.things.get(svn_path)
      if entry is not None:
        # When we have a non-trunk default branch, we may have multiple
        # closings--only register the first closing we encounter.
        entry.setdefault(self.closing_key, svn_revnum)

  def make_node_tree(self):
    """Generates the SymbolicNameFillingGuide's node tree from
    self.things.  Each leaf node maps self.opening_key to the earliest
    subversion revision from which this node/path may be copied; and
    optionally map self.closing_key to the subversion revision one
    higher than the last revision from which this node/path may be
    copied.  Intermediate nodes never contain opening or closing
    flags."""
    for svn_path, open_close in self.things.items():
      parent_key = self.root_key

      # Walk down the path, one node at a time, creating the nodes
      # that do not exist yet.
      for component in svn_path.split('/'):
        siblings = self.node_tree[parent_key]
        try:
          child_key = siblings[component]
        except KeyError:
          child_key = gen_key()
          self.node_tree[child_key] = { }
          siblings[component] = child_key

        parent_key = child_key
      # Having reached the leaf, attach the value
      self.node_tree[parent_key] = open_close

  def is_empty(self):
    """Return true if we haven't accumulated any openings or closings,
    false otherwise."""
    return not self.things
2758
2759
class FillSource:
  """Representation of a fill source used by the symbol filler in
  SVNRepositoryMirror.

  A FillSource starts out unscored; it can only be compared (and
  therefore sorted) after set_score() has been called."""

  def __init__(self, prefix, key):
    """Create an unscored fill source with a PREFIX and a KEY."""
    self.prefix = prefix
    self.key = key
    # Both stay None until set_score() is called.
    self.score = None
    self.revnum = None

  def set_score(self, score, revnum):
    """Set the SCORE and REVNUM."""
    self.score = score
    self.revnum = revnum

  def __cmp__(self, other):
    """Comparison operator used to sort FillSources in descending
    score order.

    Raise TypeError if either FillSource has not been scored yet."""
    if self.score is None or other.score is None:
      # The call form of raise is accepted by every Python version,
      # unlike the older "raise Class, value" statement form.
      raise TypeError('Tried to compare unscored FillSource')
    # The operands are swapped on purpose: a higher score sorts first.
    return cmp(other.score, self.score)
2781
2782
2783 class SVNRepositoryMirror:
2784   """Mirror a Subversion Repository as it is constructed, one
2785   SVNCommit at a time.  The mirror is skeletal; it does not contain
2786   file contents.  The creation of a dumpfile or Subversion repository
2787   is handled by delegates.  See self.add_delegate method for how to
2788   set delegates.
2789
2790   The structure of the repository is kept in two databases and one
2791   hash.  The revs_db database maps revisions to root node keys, and
2792   the nodes_db database maps node keys to nodes.  A node is a hash
2793   from directory names to keys.  Both the revs_db and the nodes_db are
2794   stored on disk and each access is expensive.
2795
2796   The nodes_db database only has the keys for old revisions.  The
  revision that is being constructed is kept in memory in the new_nodes
2798   hash which is cheap to access.
2799
2800   You must invoke _start_commit between SVNCommits.
2801
2802   *** WARNING *** All path arguments to methods in this class CANNOT
2803       have leading or trailing slashes.
2804   """
2805
2806   class SVNRepositoryMirrorPathExistsError(Exception):
2807     """Exception raised if an attempt is made to add a path to the
2808     repository mirror and that path already exists in the youngest
2809     revision of the repository."""
2810     pass
2811
2812   class SVNRepositoryMirrorUnexpectedOperationError(Exception):
2813     """Exception raised if a CVSRevision is found to have an unexpected
2814     operation (OP) value."""
2815     pass
2816
2817   class SVNRepositoryMirrorInvalidFillOperationError(Exception):
2818     """Exception raised if an empty SymbolicNameFillingGuide is returned
2819     during a fill where the branch in question already exists."""
2820     pass
2821
2822   def __init__(self):
2823     """Set up the SVNRepositoryMirror and prepare it for SVNCommits."""
2824     self.delegates = [ ]
2825
2826     # This corresponds to the 'revisions' table in a Subversion fs.
2827     self.revs_db = Database(temp(SVN_MIRROR_REVISIONS_DB), DB_OPEN_NEW)
2828     Cleanup().register(temp(SVN_MIRROR_REVISIONS_DB), pass8)
2829
2830     # This corresponds to the 'nodes' table in a Subversion fs.  (We
2831     # don't need a 'representations' or 'strings' table because we
2832     # only track metadata, not file contents.)
2833     self.nodes_db = Database(temp(SVN_MIRROR_NODES_DB), DB_OPEN_NEW)
2834     Cleanup().register(temp(SVN_MIRROR_NODES_DB), pass8)
2835
2836     # Start at revision 0 without a root node.  It will be created
2837     # by _open_writable_root_node.
2838     self.youngest = 0
2839     self.new_root_key = None
2840     self.new_nodes = { }
2841
2842     if not Ctx().trunk_only:
2843       ###PERF IMPT: Suck this into memory.
2844       self.tags_db = TagsDatabase(DB_OPEN_READ)
2845       self.symbolings_reader = SymbolingsReader()
2846
2847   def _initialize_repository(self, date):
2848     """Initialize the repository by creating the directories for
2849     trunk, tags, and branches.  This method should only be called
2850     after all delegates are added to the repository mirror."""
2851     # Make a 'fake' SVNCommit so we can take advantage of the revprops
2852     # magic therein
2853     svn_commit = SVNCommit("Initialization", 1)
2854     svn_commit.set_date(date)
2855     svn_commit.set_log_msg("New repository initialized by cvs2svn.")
2856
2857     self._start_commit(svn_commit)
2858     self._mkdir(Ctx().trunk_base)
2859     if not Ctx().trunk_only:
2860       self._mkdir(Ctx().branches_base)
2861       self._mkdir(Ctx().tags_base)
2862
2863   def _start_commit(self, svn_commit):
2864     """Start a new commit."""
2865     if self.youngest > 0:
2866       self._end_commit()
2867
2868     self.youngest = svn_commit.revnum
2869     self.new_root_key = None
2870     self.new_nodes = { }
2871
2872     self._invoke_delegates('start_commit', svn_commit)
2873
2874   def _end_commit(self):
2875     """Called at the end of each commit.  This method copies the newly
2876     created nodes to the on-disk nodes db."""
2877     if self.new_root_key is None:
2878       # No changes were made in this revision, so we make the root node
2879       # of the new revision be the same as the last one.
2880       self.revs_db[str(self.youngest)] = self.revs_db[str(self.youngest - 1)]
2881     else:
2882       self.revs_db[str(self.youngest)] = self.new_root_key
2883       # Copy the new nodes to the nodes_db
2884       for key, value in self.new_nodes.items():
2885         self.nodes_db[key] = value
2886
2887   def _get_node(self, key):
2888     """Returns the node contents for KEY which may refer to either
2889     self.nodes_db or self.new_nodes."""
2890     if self.new_nodes.has_key(key):
2891       return self.new_nodes[key]
2892     else:
2893       return self.nodes_db[key]
2894
2895   def _open_readonly_node(self, path, revnum):
2896     """Open a readonly node for PATH at revision REVNUM.  Returns the
2897     node key and node contents if the path exists, else (None, None)."""
2898     # Get the root key
2899     if revnum == self.youngest:
2900       if self.new_root_key is None:
2901         node_key = self.revs_db[str(self.youngest - 1)]
2902       else:
2903         node_key = self.new_root_key
2904     else:
2905       node_key = self.revs_db[str(revnum)]
2906
2907     for component in path.split('/'):
2908       node_contents = self._get_node(node_key)
2909       if not node_contents.has_key(component):
2910         return None
2911       node_key = node_contents[component]
2912
2913     return node_key
2914
2915   def _open_writable_root_node(self):
2916     """Open a writable root node.  The current root node is returned
2917     immeditely if it is already writable.  If not, create a new one by
2918     copying the contents of the root node of the previous version."""
2919     if self.new_root_key is not None:
2920       return self.new_root_key, self.new_nodes[self.new_root_key]
2921
2922     if self.youngest < 2:
2923       new_contents = { }
2924     else:
2925       new_contents = self.nodes_db[self.revs_db[str(self.youngest - 1)]]
2926     self.new_root_key = gen_key()
2927     self.new_nodes = { self.new_root_key: new_contents }
2928
2929     return self.new_root_key, new_contents
2930
2931   def _open_writable_node(self, svn_path, create):
2932     """Open a writable node for the path SVN_PATH, creating SVN_PATH
2933     and any missing directories if CREATE is True."""
2934     parent_key, parent_contents = self._open_writable_root_node()
2935
2936     # Walk up the path, one node at a time.
2937     path_so_far = None
2938     components = svn_path.split('/')
2939     for i in range(len(components)):
2940       component = components[i]
2941       this_key = this_contents = None
2942       path_so_far = _path_join(path_so_far, component)
2943       if parent_contents.has_key(component):
2944         # The component exists.
2945         this_key = parent_contents[component]
2946         if self.new_nodes.has_key(this_key):
2947           this_contents = self.new_nodes[this_key]
2948         else:
2949           # Suck the node from the nodes_db, but update the key
2950           this_contents = self.nodes_db[this_key]
2951           this_key = gen_key()
2952           self.new_nodes[this_key] = this_contents
2953           parent_contents[component] = this_key
2954       elif create:
2955         # The component does not exists, so we create it.
2956         this_contents = { }
2957         this_key = gen_key()
2958         self.new_nodes[this_key] = this_contents
2959         parent_contents[component] = this_key
2960         if i < len(components) - 1:
2961           self._invoke_delegates('mkdir', path_so_far)
2962       else:
2963         # The component does not exists and we are not instructed to
2964         # create it, so we give up.
2965         return None, None
2966
2967       parent_key = this_key
2968       parent_contents = this_contents
2969
2970     return this_key, this_contents
2971
2972   def _path_exists(self, path):
2973     """If PATH exists in self.youngest of the svn repository mirror,
2974     return true, else return None.
2975
2976     PATH must not start with '/'."""
2977     return self._open_readonly_node(path, self.youngest) is not None
2978
2979   def _fast_delete_path(self, parent_path, parent_contents, component):
2980     """Delete COMPONENT from the parent direcory PARENT_PATH with the
2981     contents PARENT_CONTENTS.  Do nothing if COMPONENT does not exist
2982     in PARENT_CONTENTS."""
2983     if parent_contents.has_key(component):
2984       del parent_contents[component]
2985       self._invoke_delegates('delete_path', _path_join(parent_path, component))
2986
2987   def _delete_path(self, svn_path, should_prune=False):
2988     """Delete PATH from the tree.  If SHOULD_PRUNE is true, then delete
2989     all ancestor directories that are made empty when SVN_PATH is deleted.
2990     In other words, SHOULD_PRUNE is like the -P option to 'cvs checkout'.
2991
2992     NOTE: This function does *not* allow you delete top-level entries
2993     (like /trunk, /branches, /tags), nor does it prune upwards beyond
2994     those entries."""
2995     pos = svn_path.rfind('/')
2996     parent_path = svn_path[:pos]
2997     entry = svn_path[pos+1:]
2998     parent_key, parent_contents = self._open_writable_node(parent_path, False)
2999     if parent_key is not None:
3000       self._fast_delete_path(parent_path, parent_contents, entry)
3001       # The following recursion makes pruning an O(n^2) operation in the
3002       # worst case (where n is the depth of SVN_PATH), but the worst case
3003       # is probably rare, and the constant cost is pretty low.  Another
3004       # drawback is that we issue a delete for each path and not just
3005       # a single delete for the topmost directory pruned.
3006       if (should_prune and len(parent_contents) == 0 and
3007           parent_path.find('/') != -1):
3008         self._delete_path(parent_path, True)
3009
3010   def _mkdir(self, path):
3011     """Create PATH in the repository mirror at the youngest revision."""
3012     self._open_writable_node(path, True)
3013     self._invoke_delegates('mkdir', path)
3014
3015   def _change_path(self, cvs_rev):
3016     """Register a change in self.youngest for the CVS_REV's svn_path
3017     in the repository mirror."""
3018     # We do not have to update the nodes because our mirror is only
3019     # concerned with the presence or absence of paths, and a file
3020     # content change does not cause any path changes.
3021     self._invoke_delegates('change_path', SVNCommitItem(cvs_rev, 0))
3022
3023   def _add_path(self, cvs_rev):
3024     """Add the CVS_REV's svn_path to the repository mirror."""
3025     self._open_writable_node(cvs_rev.svn_path, True)
3026     self._invoke_delegates('add_path', SVNCommitItem(cvs_rev, 1))
3027
3028   def _copy_path(self, src_path, dest_path, src_revnum):
3029     """Copy SRC_PATH at subversion revision number SRC_REVNUM to
3030     DEST_PATH. In the youngest revision of the repository, DEST_PATH's
3031     parent *must* exist, but DEST_PATH *cannot* exist.
3032
3033     Return the node key and the contents of the new node at DEST_PATH
3034     as a dictionary."""
3035     # get the contents of the node of our src_path
3036     src_key = self._open_readonly_node(src_path, src_revnum)
3037     src_contents = self._get_node(src_key)
3038
3039     # Get the parent path and the base path of the dest_path
3040     pos = dest_path.rindex('/')
3041     dest_parent = dest_path[:pos]
3042     dest_basename = dest_path[pos+1:]
3043     dest_parent_key, dest_parent_contents = \
3044                    self._open_writable_node(dest_parent, False)
3045
3046     if dest_parent_contents.has_key(dest_basename):
3047       msg = "Attempt to add path '%s' to repository mirror " % dest_path
3048       msg = msg + "when it already exists in the mirror."
3049       raise self.SVNRepositoryMirrorPathExistsError, msg
3050
3051     dest_parent_contents[dest_basename] = src_key
3052     self._invoke_delegates('copy_path', src_path, dest_path, src_revnum)
3053
3054     # Yes sir, src_key and src_contents are also the contents of the
3055     # destination.  This is a cheap copy, remember!  :-)
3056     return src_key, src_contents
3057
3058   def _fill_symbolic_name(self, svn_commit):
3059     """Performs all copies necessary to create as much of the the tag
3060     or branch SVN_COMMIT.symbolic_name as possible given the current
3061     revision of the repository mirror.
3062
3063     The symbolic name is guaranteed to exist in the Subversion
3064     repository by the end of this call, even if there are no paths
3065     under it."""
3066     symbol_fill = self.symbolings_reader.filling_guide_for_symbol(
3067       svn_commit.symbolic_name, self.youngest)
3068
3069     # Create the list of sources for the symbolic name.  All source
3070     # prefixes must be direct sources for the destination, i.e. we
3071     # must have 'trunk' and 'branches/my_branch' and not just
3072     # 'branches'.
3073     sources = []
3074     for entry, key in symbol_fill.node_tree[symbol_fill.root_key].items():
3075       if entry == Ctx().trunk_base:
3076         sources.append(FillSource(entry, key))
3077       elif entry == Ctx().branches_base:
3078         for entry2, key2 in symbol_fill.node_tree[key].items():
3079           sources.append(FillSource(entry + '/' + entry2, key2))
3080       else:
3081         raise # Should never happen
3082     if self.tags_db.has_key(svn_commit.symbolic_name):
3083       dest_prefix = _path_join(Ctx().tags_base, svn_commit.symbolic_name)
3084     else:
3085       dest_prefix = _path_join(Ctx().branches_base,
3086                                svn_commit.symbolic_name)
3087
3088     if sources:
3089       dest_key = self._open_writable_node(dest_prefix, False)[0]
3090       self._fill(symbol_fill, dest_prefix, dest_key, sources)
3091     else:
3092       # We can only get here for a branch whose first commit is an add
3093       # (as opposed to a copy).
3094       dest_path = Ctx().branches_base + '/' + symbol_fill.name
3095       if not self._path_exists(dest_path):
3096         # If our symbol_fill was empty, that means that our first
3097         # commit on the branch was to a file added on the branch, and
3098         # that this is our first fill of that branch.
3099         #
3100         # This case is covered by test 16.
3101         #
3102         # ...we create the branch by copying trunk from the our
3103         # current revision number minus 1
3104         source_path = Ctx().trunk_base
3105         entries = self._copy_path(source_path, dest_path,
3106                                   svn_commit.revnum - 1)[1]
3107         # Now since we've just copied trunk to a branch that's
3108         # *supposed* to be empty, we delete any entries in the
3109         # copied directory.
3110         for entry in entries.keys():
3111           del_path = dest_path + '/' + entry
3112           # Delete but don't prune.
3113           self._delete_path(del_path)
3114       else:
3115         msg = "Error filling branch '" + symbol_fill.name + "'.\n"
3116         msg = msg + "Received an empty SymbolicNameFillingGuide and\n"
3117         msg = msg + "attempted to create a branch that already exists."
3118         raise self.SVNRepositoryMirrorInvalidFillOperationError, msg
3119
  def _fill(self, symbol_fill, dest_prefix, dest_key, sources,
            path = None, parent_source_prefix = None,
            preferred_revnum = None, prune_ok = None):
    """Fill the tag or branch at DEST_PREFIX + PATH with items from
    SOURCES, and recurse into the child items.

    DEST_PREFIX is the prefix of the destination directory, e.g.
    'tags/my_tag' or 'branches/my_branch', and SOURCES is a list of
    FillSource classes that are candidates to be copied to the
    destination.  DEST_KEY is the node key of the destination, or
    None if the destination does not yet exist.

    PATH is the path relative to DEST_PREFIX.  If PATH is None, we
    are at the top level, e.g. 'tags/my_tag'.

    PARENT_SOURCE_PREFIX is the source prefix that was used to copy
    the parent directory, and PREFERRED_REVNUM is an int which is the
    source revision number that the caller (who may have copied KEY's
    parent) used to perform its copy.  If PREFERRED_REVNUM is None,
    then no revision is preferable to any other (which probably means
    that no copies have happened yet).

    PRUNE_OK means that a copy has been made in this recursion, and
    it's safe to prune directories that are not in
    SYMBOL_FILL.node_tree, provided that said directory has a source
    prefix of one of the PARENT_SOURCE_PREFIX.

    PATH, PARENT_SOURCE_PREFIX, PRUNE_OK, and PREFERRED_REVNUM
    should only be passed in by recursive calls.

    SOURCES is sorted in place, and each of its members gets a new
    score and revision number."""
    # Calculate scores and revnums for all sources
    for source in sources:
      src_revnum, score = symbol_fill.get_best_revnum(source.key,
                                                      preferred_revnum)
      source.set_score(score, src_revnum)

    # Sort the sources in descending score order so that we will make
    # an eventual copy from the source with the highest score.
    # NOTE(review): the ordering is whatever FillSource's own
    # comparison defines; it is not visible here -- confirm that the
    # best source really sorts first.
    sources.sort()
    copy_source = sources[0]

    src_path = _path_join(copy_source.prefix, path)
    dest_path = _path_join(dest_prefix, path)

    # Figure out if we shall copy to this destination and delete any
    # destination path that is in the way.
    do_copy = 0
    if dest_key is None:
      # The destination does not exist yet, so it has to be copied.
      do_copy = 1
    elif prune_ok and (parent_source_prefix != copy_source.prefix or
                       copy_source.revnum != preferred_revnum):
      # The destination exists (it came along with a copy made higher
      # up in this recursion), but the best source for it differs from
      # the parent's copy in prefix or in revision.
      #
      # We are about to replace the destination, so we need to remove
      # it before we perform the copy.
      self._delete_path(dest_path)
      do_copy = 1

    if do_copy:
      dest_key, dest_entries = self._copy_path(src_path, dest_path,
                                               copy_source.revnum)
      # From here on a copy has been made, so pruning is allowed in
      # this call and in the recursive calls below.
      prune_ok = 1
    else:
      dest_entries = self._get_node(dest_key)

    # Create the SRC_ENTRIES hash from SOURCES.  The keys are path
    # elements and the values are lists of FillSource classes where
    # this path element exists.
    src_entries = {}
    for source in sources:
      for entry, key in symbol_fill.node_tree[source.key].items():
        # Entries whose name starts with '/' are not path elements
        # (presumably bookkeeping stored by the filling guide --
        # TODO confirm against SymbolicNameFillingGuide).
        if entry[0] == '/': # Skip flags
          continue
        if not src_entries.has_key(entry):
          src_entries[entry] = []
        src_entries[entry].append(FillSource(source.prefix, key))

    if prune_ok:
      # Delete the entries in DEST_ENTRIES that are not in src_entries.
      delete_list = [ ]
      for entry in dest_entries.keys():
        if not src_entries.has_key(entry):
          delete_list.append(entry)
      if delete_list:
        # _copy_path hands back the source's own node (a cheap copy),
        # so the destination must be made writable before any of its
        # entries are removed.
        if not self.new_nodes.has_key(dest_key):
          dest_key, dest_entries = self._open_writable_node(dest_path, True)
        # Sort the delete list to get "diffable" dumpfiles.
        delete_list.sort()
        for entry in delete_list:
          self._fast_delete_path(dest_path, dest_entries, entry)

    # Recurse into the SRC_ENTRIES keys sorted in alphabetical order.
    src_keys = src_entries.keys()
    src_keys.sort()
    for src_key in src_keys:
      # The child may already exist in the destination (e.g. because
      # it came along with the copy above); if not, the recursive call
      # is told so with a key of None and will copy it.
      if dest_entries.has_key(src_key):
        next_dest_key = dest_entries[src_key]
      else:
        next_dest_key = None
      self._fill(symbol_fill, dest_prefix, next_dest_key,
                 src_entries[src_key], _path_join(path, src_key),
                 copy_source.prefix, sources[0].revnum, prune_ok)
3219
3220   def _synchronize_default_branch(self, svn_commit):
3221     """Propagate any changes that happened on a non-trunk default
3222     branch to the trunk of the repository.  See
3223     CVSCommit._post_commit() for details on why this is necessary."""
3224     for cvs_rev in svn_commit.cvs_revs:
3225       if cvs_rev.op == OP_ADD or cvs_rev.op == OP_CHANGE:
3226         if self._path_exists(cvs_rev.svn_trunk_path):
3227           # Delete the path on trunk...
3228           self._delete_path(cvs_rev.svn_trunk_path)
3229         # ...and copy over from branch
3230         self._copy_path(cvs_rev.svn_path, cvs_rev.svn_trunk_path,
3231                         svn_commit.motivating_revnum)
3232       elif cvs_rev.op == OP_DELETE:
3233         # delete trunk path
3234         self._delete_path(cvs_rev.svn_trunk_path)
3235       else:
3236         msg = ("Unknown CVSRevision operation '%s' in default branch sync."
3237                % cvs_rev.op)
3238         raise self.SVNRepositoryMirrorUnexpectedOperationError, msg
3239
3240   def commit(self, svn_commit):
3241     """Add an SVNCommit to the SVNRepository, incrementing the
3242     Repository revision number, and changing the repository.  Invoke
3243     the delegates' _start_commit() method."""
3244
3245     if svn_commit.revnum == 2:
3246       self._initialize_repository(svn_commit.get_date())
3247
3248     self._start_commit(svn_commit)
3249
3250     if svn_commit.symbolic_name:
3251       Log().write(LOG_VERBOSE, "Filling symbolic name:",
3252                   svn_commit.symbolic_name)
3253       self._fill_symbolic_name(svn_commit)
3254     elif svn_commit.motivating_revnum:
3255       Log().write(LOG_VERBOSE, "Synchronizing default_branch motivated by %d"
3256                   % svn_commit.motivating_revnum)
3257       self._synchronize_default_branch(svn_commit)
3258     else: # This actually commits CVSRevisions
3259       if len(svn_commit.cvs_revs) > 1: plural = "s"
3260       else: plural = ""
3261       Log().write(LOG_VERBOSE, "Committing %d CVSRevision%s"
3262                   % (len(svn_commit.cvs_revs), plural))
3263       for cvs_rev in svn_commit.cvs_revs:
3264         # See comment in CVSCommit._commit() for what this is all
3265         # about.  Note that although asking self._path_exists() is
3266         # somewhat expensive, we only do it if the first two (cheap)
3267         # tests succeed first.
3268         if not ((cvs_rev.deltatext_code == DELTATEXT_EMPTY)
3269                 and (cvs_rev.rev == "1.1.1.1")
3270                 and self._path_exists(cvs_rev.svn_path)):
3271           if cvs_rev.op == OP_ADD:
3272             self._add_path(cvs_rev)
3273           elif cvs_rev.op == OP_CHANGE:
3274             # Fix for Issue #74:
3275             #
3276             # Here's the scenario.  You have file FOO that is imported
3277             # on a non-trunk vendor branch.  So in r1.1 and r1.1.1.1,
3278             # the file exists.
3279             #
3280             # Moving forward in time, FOO is deleted on the default
3281             # branch (r1.1.1.2).  cvs2svn determines that this delete
3282             # also needs to happen on trunk, so FOO is deleted on
3283             # trunk.
3284             #
3285             # Along come r1.2, whose op is OP_CHANGE (because r1.1 is
3286             # not 'dead', we assume it's a change).  However, since
3287             # our trunk file has been deleted, svnadmin blows up--you
3288             # can't change a file that doesn't exist!
3289             #
3290             # Soooo... we just check the path, and if it doesn't
3291             # exist, we do an add... if the path does exist, it's
3292             # business as usual.
3293             if not self._path_exists(cvs_rev.svn_path):
3294               self._add_path(cvs_rev)
3295             else:
3296               self._change_path(cvs_rev)
3297
3298         if cvs_rev.op == OP_DELETE:
3299           self._delete_path(cvs_rev.svn_path, Ctx().prune)
3300
3301   def cleanup(self):
3302     """Callback for the Cleanup.register in self.__init__."""
3303     self.revs_db = None
3304     self.nodes_db = None
3305
3306   def add_delegate(self, delegate):
3307     """Adds DELEGATE to self.delegates.
3308
3309     For every delegate you add, as soon as SVNRepositoryMirror
3310     performs a repository action method, SVNRepositoryMirror will call
3311     the delegate's corresponding repository action method.  Multiple
3312     delegates will be called in the order that they are added.  See
3313     SVNRepositoryMirrorDelegate for more information."""
3314     self.delegates.append(delegate)
3315
3316   def _invoke_delegates(self, method, *args):
3317     """Iterate through each of our delegates, in the order that they
3318     were added, and call the delegate's method named METHOD with the
3319     arguments in ARGS."""
3320     for delegate in self.delegates:
3321       getattr(delegate, method)(*args)
3322
3323   def finish(self):
3324     """Calls the delegate finish method."""
3325     self._end_commit()
3326     self._invoke_delegates('finish')
3327     self.cleanup()
3328
3329
class SVNCommitItem:
  """Pairs a CVSRevision object with the Subversion-related data
  (such as properties) that goes along with it."""

  def __init__(self, c_rev, make_svn_props):
    """Wrap the CVSRevision C_REV.

    If MAKE_SVN_PROPS is true (or if Ctx().cvs_revnums is set), the
    'svn:' properties for C_REV are filled into self.svn_props.
    self.needs_eol_filter and self.has_keywords are always set."""
    ctx = Ctx()
    self.c_rev = c_rev
    self.set_cvs_revnum_properties = ctx.cvs_revnums
    self.eol_from_mime_type = ctx.eol_from_mime_type
    self.no_default_eol = ctx.no_default_eol
    self.keywords_off = ctx.keywords_off
    self.mime_mapper = ctx.mime_mapper

    # At first the only property is the "CVS revision" one, and asking
    # for it implies that the other properties are wanted as well.
    self.svn_props = { }
    if self.set_cvs_revnum_properties:
      self.svn_props['cvs2svn:cvs-rev'] = c_rev.rev
      make_svn_props = True

    # Work out the mime-type and the eol-style.  The two properties
    # are intertwingled; follow the conditionals carefully.  See also
    # issue #39.
    mime_type = None
    if self.mime_mapper:
      mime_type = self.mime_mapper.get_type_from_filename(c_rev.cvs_path)

    eol_style = None
    if c_rev.mode == 'b':
      if mime_type is None:
        # file is kb, and no other mimetype specified
        mime_type = 'application/octet-stream'
    elif not self.no_default_eol:
      eol_style = 'native'
    elif (mime_type and self.eol_from_mime_type
          and mime_type.startswith("text/")):
      eol_style = 'native'

    # Work out the svn:keywords property, if appropriate.  See issue #2.
    keywords = None
    if not self.keywords_off and c_rev.mode in (None, 'kv', 'kvl'):
      keywords = SVN_KEYWORDS_VALUE

    # Remember whether the EOLs have to be filtered.  self.svn_props
    # can't be used for this, because the properties are only set on
    # the first revision, while every revision has to be filtered.
    self.needs_eol_filter = eol_style == 'native'

    # Remember whether this file has svn:keywords set.
    self.has_keywords = keywords is not None

    if not make_svn_props:
      return

    # We were asked to fill in the Subversion ('svn:') properties.
    if c_rev.file_executable:
      self.svn_props['svn:executable'] = '*'
    if keywords:
      self.svn_props['svn:keywords'] = keywords
    if mime_type:
      self.svn_props['svn:mime-type'] = mime_type
    if eol_style:
      self.svn_props['svn:eol-style'] = eol_style
3397
3398
class SVNRepositoryMirrorDelegate:
  """Abstract base class of all delegates of SVNRepositoryMirror.
  Every method below must be implemented by a subclass.

  Each method stands for the Subversion operation that its name
  implies, and a subclass carries the operation out in its own way.
  Take add_path: the DumpfileDelegate would write out a "Node-add:"
  command to a Subversion dumpfile, the StdoutDelegate would merely
  print that the path is being added to the repository, and the
  RepositoryDelegate would actually cause the path to be added to the
  Subversion repository that it is creating.
  """

  def start_commit(self, svn_commit):
    """Do whatever is needed to start the SVNCommit SVN_COMMIT; see
    subclass implementation for details."""
    raise NotImplementedError()

  def mkdir(self, path):
    """PATH is a string; see subclass implementation for details."""
    raise NotImplementedError()

  def add_path(self, s_item):
    """S_ITEM is an SVNCommitItem; see subclass implementation for
    details."""
    raise NotImplementedError()

  def change_path(self, s_item):
    """S_ITEM is an SVNCommitItem; see subclass implementation for
    details."""
    raise NotImplementedError()

  def delete_path(self, path):
    """PATH is a string; see subclass implementation for details."""
    raise NotImplementedError()

  def copy_path(self, src_path, dest_path, src_revnum):
    """SRC_PATH and DEST_PATH are both strings, and SRC_REVNUM is a
    subversion revision number (int); see subclass implementation for
    details."""
    raise NotImplementedError()

  def finish(self):
    """Do whatever cleanup is needed once all revisions have been
    committed."""
    raise NotImplementedError()
3446
3447
3448 class DumpfileDelegate(SVNRepositoryMirrorDelegate):
3449   """Create a Subversion dumpfile."""
3450
3451   def __init__(self, dumpfile_path=None):
3452     """Return a new DumpfileDelegate instance, attached to a dumpfile
3453     DUMPFILE_PATH (Ctx().dumpfile, if None), using Ctx().encoding.
3454
3455     If Ctx().cvs_revnums is true, then set the 'cvs2svn:cvs-revnum'
3456     property on files, when they are changed due to a corresponding
3457     CVS revision.
3458
3459     If Ctx().mime_mapper is not None, then it is a MimeMapper
3460     instance, used to determine whether or not to set the
3461     'svn:mime-type' property on files.  But even if Ctx().mime_mapper
3462     is None, files marked with the CVS 'kb' flag will receive a mime
3463     type of "application/octet-stream".
3464
3465     Unless Ctx().no_default_eol is true, set 'svn:eol-style' to
3466     'native' for files not marked with the CVS 'kb' flag, except as
3467     superseded by Ctx().eol_from_mime_type (see below).
3468
3469     If Ctx().eol_from_mime_type is not None, then set 'svn:eol-style'
3470     to 'native' for all files to which Ctx().mime_mapper assigns a
3471     mime type beginning with "text/", and don't set 'svn:eol-style'
3472     for files assigned a type not beginning with "text/".
3473     """
3474     if dumpfile_path:
3475       self.dumpfile_path = dumpfile_path
3476     else:
3477       self.dumpfile_path = Ctx().dumpfile
3478     self.path_encoding = Ctx().encoding
3479
3480     self.dumpfile = open(self.dumpfile_path, 'wb')
3481     self._write_dumpfile_header(self.dumpfile)
3482
3483   def _write_dumpfile_header(self, dumpfile):
3484     # Initialize the dumpfile with the standard headers.
3485     #
3486     # Since the CVS repository doesn't have a UUID, and the Subversion
3487     # repository will be created with one anyway, we don't specify a
3488     # UUID in the dumpflie
3489     dumpfile.write('SVN-fs-dump-format-version: 2\n\n')
3490
3491   def _utf8_path(self, path):
3492     """Return a copy of PATH encoded in UTF-8.  PATH is assumed to be
3493     encoded in self.path_encoding."""
3494     try:
3495       # Log messages can be converted with the 'replace' strategy,
3496       # but we can't afford any lossiness here.
3497       unicode_path = unicode(path, self.path_encoding, 'strict')
3498       return unicode_path.encode('utf-8')
3499     except UnicodeError:
3500       print "Unable to convert a path '%s' to internal encoding." % path
3501       print "Consider rerunning with (for example) '--encoding=latin1'"
3502       sys.exit(1)
3503
3504   def start_commit(self, svn_commit):
3505     """Emit the start of SVN_COMMIT (an SVNCommit)."""
3506
3507     self.revision = svn_commit.revnum
3508
3509     # The start of a new commit typically looks like this:
3510     #
3511     #   Revision-number: 1
3512     #   Prop-content-length: 129
3513     #   Content-length: 129
3514     #
3515     #   K 7
3516     #   svn:log
3517     #   V 27
3518     #   Log message for revision 1.
3519     #   K 10
3520     #   svn:author
3521     #   V 7
3522     #   jrandom
3523     #   K 8
3524     #   svn:date
3525     #   V 27
3526     #   2003-04-22T22:57:58.132837Z
3527     #   PROPS-END
3528     #
3529     # Notice that the length headers count everything -- not just the
3530     # length of the data but also the lengths of the lengths, including
3531     # the 'K ' or 'V ' prefixes.
3532     #
3533     # The reason there are both Prop-content-length and Content-length
3534     # is that the former includes just props, while the latter includes
3535     # everything.  That's the generic header form for any entity in a
3536     # dumpfile.  But since revisions only have props, the two lengths
3537     # are always the same for revisions.
3538
3539     # Calculate the total length of the props section.
3540     props = svn_commit.get_revprops()
3541     prop_names = props.keys()
3542     prop_names.sort()
3543     total_len = 10  # len('PROPS-END\n')
3544     for propname in prop_names:
3545       if props[propname] is None:
3546         continue
3547       klen = len(propname)
3548       klen_len = len('K %d' % klen)
3549       vlen = len(props[propname])
3550       vlen_len = len('V %d' % vlen)
3551       # + 4 for the four newlines within a given property's section
3552       total_len = total_len + klen + klen_len + vlen + vlen_len + 4
3553
3554     # Print the revision header and props
3555     self.dumpfile.write('Revision-number: %d\n'
3556                         'Prop-content-length: %d\n'
3557                         'Content-length: %d\n'
3558                         '\n'
3559                         % (self.revision, total_len, total_len))
3560
3561     for propname in prop_names:
3562       if props[propname] is None:
3563         continue
3564       self.dumpfile.write('K %d\n'
3565                           '%s\n'
3566                           'V %d\n'
3567                           '%s\n' % (len(propname),
3568                                     propname,
3569                                     len(props[propname]),
3570                                     props[propname]))
3571
3572     self.dumpfile.write('PROPS-END\n')
3573     self.dumpfile.write('\n')
3574
3575   def mkdir(self, path):
3576     """Emit the creation of directory PATH."""
3577     self.dumpfile.write("Node-path: %s\n"
3578                         "Node-kind: dir\n"
3579                         "Node-action: add\n"
3580                         "Content-length: 10\n"
3581                         "\n"
3582                         "\n" % self._utf8_path(path))
3583
  def _add_or_change_path(self, s_item, op):
    """Emit the addition or change corresponding to S_ITEM.
    OP is either the constant OP_ADD or OP_CHANGE.

    S_ITEM supplies the CVS revision (S_ITEM.c_rev), the properties to
    write (S_ITEM.svn_props) and the has_keywords / needs_eol_filter
    flags.  The file text is read from the pipe returned by
    get_co_pipe() and copied into the dumpfile in PIPE_READ_SIZE
    chunks.  Because the text length and MD5 checksum are only known
    once the copy is finished, zero-filled placeholder headers are
    written first and then overwritten in place, so self.dumpfile must
    support tell() and seek().

    If the file is named ".cvsignore", a property change that sets
    svn:ignore on its parent directory is emitted before the file
    record itself.

    The program exits if OP is neither OP_ADD nor OP_CHANGE, or if the
    checkout command returns a non-zero exit status."""

    # Validation stuffs
    if op == OP_ADD:
      action = 'add'
    elif op == OP_CHANGE:
      action = 'change'
    else:
      sys.stderr.write("%s: _add_or_change_path() called with bad op ('%s')"
                       % (error_prefix, op))
      sys.exit(1)

    # Convenience variables
    c_rev = s_item.c_rev
    svn_props = s_item.svn_props

    # The property handling here takes advantage of an undocumented
    # but IMHO consistent feature of the Subversion dumpfile-loading
    # code.  When a node's properties aren't mentioned (that is, the
    # "Prop-content-length:" header is absent, no properties are
    # listed at all, and there is no "PROPS-END\n" line) then no
    # change is made to the node's properties.
    #
    # This is consistent with the way dumpfiles behave w.r.t. text
    # content changes, so I'm comfortable relying on it.  If you
    # commit a change to *just* the properties of some node that
    # already has text contents from a previous revision, then in the
    # dumpfile output for the prop change, no "Text-content-length:"
    # nor "Text-content-md5:" header will be present, and the text of
    # the file will not be given.  But this does not cause the file's
    # text to be erased!  It simply remains unchanged.
    #
    # This works out great for cvs2svn, due to lucky coincidences:
    #
    # For files, the only properties we ever set are set in the first
    # revision; all other revisions (including on branches) inherit
    # from that.  After the first revision, we never change file
    # properties, therefore, there is no need to remember the full set
    # of properties on a given file once we've set it.
    #
    # For directories, the only property we set is "svn:ignore", and
    # while we may change it after the first revision, we always do so
    # based on the contents of a ".cvsignore" file -- in other words,
    # CVS is doing the remembering for us, so we still don't have to
    # preserve the previous value of the property ourselves.

    # Calculate the (sorted-by-name) property string and length, if any.
    prop_contents = ''
    prop_names = svn_props.keys()
    prop_names.sort()
    for pname in prop_names:
      pval = svn_props[pname]
      prop_contents = prop_contents + \
                      'K %d\n%s\nV %d\n%s\n' \
                      % (len(pname), pname, len(pval), pval)
    if prop_contents:
      # 'PROPS-END' is only written (and counted) when there is at
      # least one property; otherwise the props section is omitted.
      prop_contents = prop_contents + 'PROPS-END\n'
      props_len = len(prop_contents)
    else:
      props_len = 0

    # The header is left empty when there are no properties, so that
    # the record does not mention properties at all.
    props_header = ''
    if props_len:
      props_header = 'Prop-content-length: %d\n' % props_len

    # Treat .cvsignore as a directory property: emit a separate node
    # record that sets svn:ignore on the directory containing the file.
    dir_path, basename = os.path.split(c_rev.svn_path)
    if basename == ".cvsignore":
      ignore_vals = generate_ignores(c_rev)
      ignore_contents = '\n'.join(ignore_vals)
      ignore_contents = ('K 10\nsvn:ignore\nV %d\n%s\n' % \
                         (len(ignore_contents), ignore_contents))
      ignore_contents = ignore_contents + 'PROPS-END\n'
      ignore_len = len(ignore_contents)

      # write headers, then props
      self.dumpfile.write('Node-path: %s\n'
                          'Node-kind: dir\n'
                          'Node-action: change\n'
                          'Prop-content-length: %d\n'
                          'Content-length: %d\n'
                          '\n'
                          '%s'
                          % (self._utf8_path(dir_path), ignore_len,
                             ignore_len, ignore_contents))

    # If the file has keywords, we must use -kk to prevent CVS/RCS from
    # expanding the keywords because they must be unexpanded in the
    # repository, or Subversion will get confused.
    if s_item.has_keywords:
      pipe_cmd, pipe = get_co_pipe(c_rev, '-kk')
    else:
      pipe_cmd, pipe = get_co_pipe(c_rev)

    self.dumpfile.write('Node-path: %s\n'
                        'Node-kind: file\n'
                        'Node-action: %s\n'
                        '%s'  # no property header if no props
                        'Text-content-length: '
                        % (self._utf8_path(c_rev.svn_path),
                           action, props_header))

    # Remember where the placeholder values start; the real values are
    # written over them once the file contents have been copied.
    pos = self.dumpfile.tell()

    self.dumpfile.write('0000000000000000\n'
                        'Text-content-md5: 00000000000000000000000000000000\n'
                        'Content-length: 0000000000000000\n'
                        '\n')

    if prop_contents:
      self.dumpfile.write(prop_contents)

    # Insert a filter to convert all EOLs to LFs if necessary
    if s_item.needs_eol_filter:
      data_reader = LF_EOL_Filter(pipe.fromchild)
    else:
      data_reader = pipe.fromchild

    # Insert the rev contents, calculating length and checksum as we go.
    checksum = md5.new()
    length = 0
    while True:
      buf = data_reader.read(PIPE_READ_SIZE)
      if buf == '':
        break
      checksum.update(buf)
      length = length + len(buf)
      self.dumpfile.write(buf)

    # Collect the child's error output and exit status; a non-zero
    # status aborts the whole program.
    pipe.fromchild.close()
    error_output = pipe.childerr.read()
    exit_status = pipe.wait()
    if exit_status:
      sys.exit("%s: The command '%s' failed with exit status: %s\n"
               "and the following output:\n"
               "%s" % (error_prefix, pipe_cmd, exit_status, error_output))

    # Go back to patch up the length and checksum headers:
    self.dumpfile.seek(pos, 0)
    # We left 16 zeros for the text length; replace them with the real
    # length, padded on the left with spaces:
    self.dumpfile.write('%16d' % length)
    # 16... + 1 newline + len('Text-content-md5: ') == 35
    self.dumpfile.seek(pos + 35, 0)
    self.dumpfile.write(checksum.hexdigest())
    # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
    self.dumpfile.seek(pos + 84, 0)
    # The content length is the combined length of the property section
    # (including its 'K'/'V' lines and 'PROPS-END') and the text data.
    self.dumpfile.write('%16d' % (length + props_len))
    # Jump back to the end of the stream
    self.dumpfile.seek(0, 2)

    # This record is done (write two newlines -- one to terminate
    # contents that weren't themselves newline-terminated, one to
    # provide a blank line for readability).
    self.dumpfile.write('\n\n')
3743
  def add_path(self, s_item):
    """Emit the addition corresponding to S_ITEM, an SVNCommitItem.

    Thin wrapper that calls _add_or_change_path() with OP_ADD."""
    self._add_or_change_path(s_item, OP_ADD)
3747
  def change_path(self, s_item):
    """Emit the change corresponding to S_ITEM, an SVNCommitItem.

    Thin wrapper that calls _add_or_change_path() with OP_CHANGE."""
    self._add_or_change_path(s_item, OP_CHANGE)
3751
3752   def delete_path(self, path):
3753     """Emit the deletion of PATH."""
3754     self.dumpfile.write('Node-path: %s\n'
3755                         'Node-action: delete\n'
3756                         '\n' % self._utf8_path(path))
3757
3758   def copy_path(self, src_path, dest_path, src_revnum):
3759     """Emit the copying of SRC_PATH at SRC_REV to DEST_PATH."""
3760     # We don't need to include "Node-kind:" for copies; the loader
3761     # ignores it anyway and just uses the source kind instead.
3762     self.dumpfile.write('Node-path: %s\n'
3763                         'Node-action: add\n'
3764                         'Node-copyfrom-rev: %d\n'
3765                         'Node-copyfrom-path: /%s\n'
3766                         '\n'
3767                         % (self._utf8_path(dest_path),
3768                            src_revnum,
3769                            self._utf8_path(src_path)))
3770
  def finish(self):
    """Perform any cleanup necessary after all revisions have been
    committed.  For this delegate that means closing the dumpfile."""
    self.dumpfile.close()
3775
3776
class RepositoryDelegate(DumpfileDelegate):
  """Creates a new Subversion Repository.  DumpfileDelegate does all
  of the heavy lifting.

  Each revision is written to a temporary dumpfile.  Its contents are
  fed to an 'svnadmin load' child process when the next commit starts,
  and once more from finish() for the last commit."""

  def __init__(self):
    """Create the target repository (unless Ctx().existing_svnrepos is
    set), open the temporary dumpfile and start the 'svnadmin load'
    process, sending it the dumpfile header."""
    self.svnadmin = Ctx().svnadmin
    self.target = Ctx().target
    if not Ctx().existing_svnrepos:
      Log().write(LOG_NORMAL,"Creating new repository '%s'" % (self.target))
      if Ctx().fs_type and Ctx().fs_type != 'bdb':
        # User specified something other than bdb.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--fs-type=%s" % Ctx().fs_type,
                                           self.target))
      elif Ctx().fs_type:
        # User explicitly specified bdb.
        #
        # Since this is a BDB repository, pass --bdb-txn-nosync,
        # because it gives us a 4-5x speed boost (if cvs2svn is
        # creating the repository, cvs2svn should be the only program
        # accessing the svn repository (until cvs is done, at least)).
        # But we'll turn no-sync off in self.finish(), unless
        # instructed otherwise.
        run_command('%s create %s %s "%s"' % (self.svnadmin,
                                              "--fs-type=bdb",
                                              "--bdb-txn-nosync",
                                              self.target))
      else:
        # User didn't say what kind repository (bdb, fsfs, etc).
        # We still pass --bdb-txn-nosync.  It's a no-op if the default
        # repository type doesn't support it, but we definitely want
        # it if BDB is the default.
        run_command('%s create %s "%s"' % (self.svnadmin,
                                           "--bdb-txn-nosync",
                                           self.target))


    # Since the output of this run is a repository, not a dumpfile,
    # the temporary dumpfiles we create should go in the tmpdir.
    DumpfileDelegate.__init__(self, temp(Ctx().dumpfile))

    # This is 1 if a commit is in progress, otherwise None.
    self._commit_in_progress = None

    # Open the dumpfile for writing and reading ('w+b'): _feed_pipe()
    # reads back what the commit methods have written.
    self.dumpfile = open(self.dumpfile_path, 'w+b')
    self.loader_pipe = Popen3('%s load -q "%s"' % (self.svnadmin, self.target),
                              True)
    self.loader_pipe.fromchild.close()
    try:
      self._write_dumpfile_header(self.loader_pipe.tochild)
    except IOError:
      self._loader_failed()

  def _loader_failed(self):
    """Print the error output of the 'svnadmin load' process and exit.
    Called when writing to the loader pipe raises IOError."""
    sys.stderr.write("%s: svnadmin failed with the following output while "
                     "loading the dumpfile:\n" % (error_prefix))
    sys.stderr.write(self.loader_pipe.childerr.read())
    sys.exit(1)

  def _feed_pipe(self):
    """Feed the revision stored in the dumpfile to the svnadmin
    load pipe."""
    self.dumpfile.seek(0)
    while 1:
      data = self.dumpfile.read(128*1024) # Chunk size is arbitrary
      if not len(data):
        break
      try:
        self.loader_pipe.tochild.write(data)
      except IOError:
        self._loader_failed()

  def start_commit(self, svn_commit):
    """Start a new commit.  If a commit is already in progress, first
    load the dumpfile holding it into the svn repository.  Then empty
    the dumpfile and write the revision header for SVN_COMMIT into
    it."""
    if self._commit_in_progress:
      self._feed_pipe()
    self.dumpfile.seek(0)
    self.dumpfile.truncate()
    DumpfileDelegate.start_commit(self, svn_commit)
    self._commit_in_progress = 1

  def finish(self):
    """Loads the last commit into the repository, waits for the
    loader to finish (exiting if it reports failure), removes the
    temporary dumpfile and, where applicable, re-enables BDB
    transaction syncing."""
    self._feed_pipe()
    self.dumpfile.close()
    self.loader_pipe.tochild.close()
    error_output = self.loader_pipe.childerr.read()
    exit_status = self.loader_pipe.wait()
    if exit_status:
      sys.exit('%s: svnadmin load failed with exit status: %s\n'
               'and the following output:\n'
               '%s' % (error_prefix, exit_status, error_output))
    os.remove(self.dumpfile_path)

    # If this is a BDB repository, and we created the repository, and
    # --bdb-no-sync wasn't passed, then comment out the DB_TXN_NOSYNC
    # line in the DB_CONFIG file, because txn syncing should be on by
    # default in BDB repositories.
    #
    # We determine if this is a BDB repository by looking for the
    # DB_CONFIG file, which doesn't exist in FSFS, rather than by
    # checking Ctx().fs_type.  That way this code will Do The Right
    # Thing in all circumstances.
    db_config = os.path.join(self.target, "db/DB_CONFIG")
    if (not Ctx().existing_svnrepos and not Ctx().bdb_txn_nosync
        and os.path.exists(db_config)):
      no_sync = 'set_flags DB_TXN_NOSYNC\n'

      config_file = open(db_config, 'r')
      try:
        contents = config_file.readlines()
      finally:
        config_file.close()

      # Only rewrite the file if the flag line is actually there; if
      # it is missing there is nothing to comment out, and that must
      # not abort a conversion that has already succeeded.
      if no_sync in contents:
        contents[contents.index(no_sync)] = '# ' + no_sync
        config_file = open(db_config, 'w')
        try:
          config_file.writelines(contents)
        finally:
          config_file.close()
3890
3891
class StdoutDelegate(SVNRepositoryMirrorDelegate):
  """A delegate that touches nothing on disk: it only reports, through
  Log(), what the SVNRepositoryMirror is doing.  The messages claim
  that something is being done when all that really happens is the
  claiming itself.  Kind of zen, really."""

  def __init__(self, total_revs):
    """TOTAL_REVS is the number shown after the '/' in the progress
    line printed by start_commit()."""
    self.total_revs = total_revs

  def _verbose(self, *words):
    """Pass WORDS to Log().write() at LOG_VERBOSE level."""
    Log().write(LOG_VERBOSE, *words)

  def start_commit(self, svn_commit):
    """Announce the Subversion revision number of the commit that is
    about to start."""
    self._verbose("=" * 60)
    progress = "Starting Subversion commit %d / %d" % (svn_commit.revnum,
                                                       self.total_revs)
    Log().write(LOG_NORMAL, progress)

  def mkdir(self, path):
    """Report the creation of directory PATH."""
    self._verbose("  New Directory", path)

  def add_path(self, s_item):
    """Report the 'adding' of s_item.c_rev.svn_path."""
    self._verbose("  Adding", s_item.c_rev.svn_path)

  def change_path(self, s_item):
    """Report the 'changing' of s_item.c_rev.svn_path."""
    self._verbose("  Changing", s_item.c_rev.svn_path)

  def delete_path(self, path):
    """Report the 'deleting' of PATH."""
    self._verbose("  Deleting", path)

  def copy_path(self, src_path, dest_path, src_revnum):
    """Report the 'copying' of revision SRC_REVNUM of SRC_PATH to
    DEST_PATH."""
    self._verbose("  Copying revision", src_revnum, "of", src_path)
    self._verbose("                to", dest_path)

  def finish(self):
    """Report that the repository creation is complete."""
    self._verbose("Finished creating Subversion repository.")
    Log().write(LOG_QUIET, "Done.")
3934
# This should be a local to pass1,
# but Python 2.0 does not support nested scopes.
OS_SEP_PLUS_ATTIC = os.sep + 'Attic'
def pass1():
  """Pass 1: walk Ctx().cvsroot and parse every ',v' file found.

  Each file is parsed with cvs2svn_rcsparse into a CollectData sink,
  after which the symbol database is written.  Files that cannot be
  parsed, and files present both in a directory and in its Attic, are
  recorded as fatal errors.  The program exits after the walk if any
  fatal error was recorded or if no ',v' file was found at all."""
  Log().write(LOG_QUIET, "Examining all CVS ',v' files...")
  cd = CollectData()

  def visit_file(baton, dirname, files):
    """os.path.walk() callback: process the ',v' files among FILES in
    DIRNAME.  BATON is the CollectData instance."""
    cd = baton
    attic_len = len(OS_SEP_PLUS_ATTIC)
    for fname in files:
      if fname[-2:] != ',v':
        continue
      cd.found_valid_file = 1
      pathname = os.path.join(dirname, fname)
      if dirname[-attic_len:] == OS_SEP_PLUS_ATTIC:
        # drop the 'Attic' portion from the pathname for the canonical name.
        cd.set_fname(os.path.join(dirname[:-attic_len], fname), pathname)
      else:
        # If this file also exists in the attic, it's a fatal error
        attic_path = os.path.join(dirname, 'Attic', fname)
        if os.path.exists(attic_path):
          err = "%s: A CVS repository cannot contain both %s and %s" \
                % (error_prefix, pathname, attic_path)
          sys.stderr.write(err + '\n')
          cd.fatal_errors.append(err)
        cd.set_fname(pathname, pathname)
      Log().write(LOG_NORMAL, pathname)
      try:
        rcs_file = open(pathname, 'rb')
        try:
          cvs2svn_rcsparse.parse(rcs_file, cd)
        finally:
          # Close the handle ourselves rather than waiting for it to
          # be garbage collected; a large repository has many files.
          rcs_file.close()
      except (cvs2svn_rcsparse.common.RCSParseError, ValueError, RuntimeError):
        err = "%s: '%s' is not a valid ,v file" \
              % (error_prefix, pathname)
        sys.stderr.write(err + '\n')
        cd.fatal_errors.append(err)
      except:
        # Anything else is unexpected: say which file was being parsed
        # and let the exception propagate.
        Log().write(LOG_WARN, "Exception occurred while parsing %s" % pathname)
        raise

  os.path.walk(Ctx().cvsroot, visit_file, cd)
  Log().write(LOG_VERBOSE, 'Processed', cd.num_files, 'files')

  cd.write_symbol_db()

  if len(cd.fatal_errors) > 0:
    sys.exit("Pass 1 complete.\n" + "=" * 75 + "\n"
             + "Error summary:\n"
             + "\n".join(cd.fatal_errors)
             + "\nExited due to fatal error(s).")

  if cd.found_valid_file is None:
    sys.exit("\nNo RCS files found in your CVS Repository!\n"
             + "Are you absolutely certain you are pointing cvs2svn\n"
             + "at a CVS repository?\n"
             + "\nExited due to fatal error(s).")

  StatsKeeper().reset_c_rev_info()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
3993
def pass2():
  """Pass 2: clean up the revision information.

  First the requested exclusions and forced tags/branches are checked
  against the symbol database; the program exits if any check fails.
  Then the tags database is created, and the revisions file is
  rewritten into the "clean" revisions file: revisions on excluded
  branches are dropped, excluded symbols are removed, forced
  tags/branches are converted, and timestamps are adjusted according
  to the resync data file."""

  symbol_db = SymbolDatabase()
  symbol_db.read()

  # Convert the list of regexps to the set of excluded symbol names
  # (the result is queried with has_key() below).
  excludes = symbol_db.find_excluded_symbols(Ctx().excludes)

  error_detected = 0

  Log().write(LOG_QUIET, "Checking for blocked exclusions...")
  blocked_excludes = symbol_db.find_blocked_excludes(excludes)
  if blocked_excludes:
    for branch, blockers in blocked_excludes.items():
      sys.stderr.write(error_prefix + ": The branch '%s' cannot be "
                       "excluded because the following symbols depend "
                       "on it:\n" % (branch))
      for blocker in blockers:
        sys.stderr.write("    '%s'\n" % (blocker))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for forced tags with commits...")
  invalid_forced_tags = [ ]
  for forced_tag in Ctx().forced_tags:
    if excludes.has_key(forced_tag):
      continue
    if symbol_db.branch_has_commit(forced_tag):
      invalid_forced_tags.append(forced_tag)
  if invalid_forced_tags:
    sys.stderr.write(error_prefix + ": The following branches cannot be "
                     "forced to be tags because they have commits:\n")
    for tag in invalid_forced_tags:
      sys.stderr.write("    '%s'\n" % (tag))
    sys.stderr.write("\n")
    error_detected = 1

  Log().write(LOG_QUIET, "Checking for tag/branch mismatches...")
  mismatches = symbol_db.find_mismatches(excludes)
  def is_not_forced(mismatch):
    name = mismatch[0]
    return not (name in Ctx().forced_tags or name in Ctx().forced_branches)
  mismatches = filter(is_not_forced, mismatches)
  if mismatches:
    sys.stderr.write(error_prefix + ": The following symbols are tags "
                     "in some files and branches in others.\nUse "
                     "--force-tag, --force-branch and/or --exclude to "
                     "resolve the symbols.\n")
    for name, tag_count, branch_count, commit_count in mismatches:
      sys.stderr.write("    '%s' is a tag in %d files, a branch in "
                       "%d files and has commits in %d files.\n"
                       % (name, tag_count, branch_count, commit_count))
    error_detected = 1

  # Bail out now if we found errors
  if error_detected:
    sys.exit(1)

  # Create the tags database
  tags_db = TagsDatabase(DB_OPEN_NEW)
  for tag in symbol_db.tags.keys():
    if tag not in Ctx().forced_branches:
      tags_db[tag] = None
  for tag in Ctx().forced_tags:
    tags_db[tag] = None

  Log().write(LOG_QUIET, "Re-synchronizing CVS revision timestamps...")

  # We may have recorded some changes in revisions' timestamp.  We need to
  # scan for any other files which may have had the same log message and
  # occurred at "the same time" and change their timestamps, too.

  # read the resync data file
  def read_resync(fname):
    "Read the .resync file into memory."

    ### note that we assume that we can hold the entire resync file in
    ### memory. really large repositories with whacky timestamps could
    ### bust this assumption. should that ever happen, then it is possible
    ### to split the resync file into pieces and make multiple passes,
    ### using each piece.

    #
    # A digest maps to a sequence of lists which specify a lower and upper
    # time bound for matching up the commit.  We keep a sequence of these
    # because a number of checkins with the same log message (e.g. an empty
    # log message) could need to be remapped.  We also make them a list because
    # we will dynamically expand the lower/upper bound as we find commits
    # that fall into a particular msg and time range.
    #
    # resync == digest -> [ [old_time_lower, old_time_upper, new_time], ... ]
    #
    resync = { }

    for line in fileinput.FileInput(fname):
      t1 = int(line[:8], 16)
      digest = line[9:DIGEST_END_IDX]
      t2 = int(line[DIGEST_END_IDX+1:], 16)
      t1_l = t1 - COMMIT_THRESHOLD/2
      t1_u = t1 + COMMIT_THRESHOLD/2
      if resync.has_key(digest):
        resync[digest].append([t1_l, t1_u, t2])
      else:
        resync[digest] = [ [t1_l, t1_u, t2] ]

    # For each digest, sort the resync items in it in increasing order,
    # based on the lower time bound.
    digests = resync.keys()
    for digest in digests:
      (resync[digest]).sort()

    return resync

  resync = read_resync(temp(DATAFILE + RESYNC_SUFFIX))

  # Predicate used to strip excluded tags and branches from each
  # revision.  EXCLUDES is bound as a default argument because nested
  # scopes are not available in Python 2.0; the function is defined
  # once here rather than once per revision.
  def not_excluded(symbol, excludes=excludes):
    return not excludes.has_key(symbol)

  output = open(temp(DATAFILE + CLEAN_REVS_SUFFIX), 'w')
  Cleanup().register(temp(DATAFILE + CLEAN_REVS_SUFFIX), pass3)

  try:
    # process the revisions file, looking for items to clean up
    for line in fileinput.FileInput(temp(DATAFILE + REVS_SUFFIX)):
      c_rev = CVSRevision(Ctx(), line[:-1])

      # Skip this entire revision if it's on an excluded branch
      if excludes.has_key(c_rev.branch_name):
        continue

      # Remove all references to excluded tags and branches
      c_rev.branches = filter(not_excluded, c_rev.branches)
      c_rev.tags = filter(not_excluded, c_rev.tags)

      # Convert all branches that are forced to be tags
      for forced_tag in Ctx().forced_tags:
        if forced_tag in c_rev.branches:
          c_rev.branches.remove(forced_tag)
          c_rev.tags.append(forced_tag)

      # Convert all tags that are forced to be branches
      for forced_branch in Ctx().forced_branches:
        if forced_branch in c_rev.tags:
          c_rev.tags.remove(forced_branch)
          c_rev.branches.append(forced_branch)

      # see if this is "near" any of the resync records we
      # have recorded for this digest [of the log message].
      for record in resync.get(c_rev.digest, []):
        if record[0] <= c_rev.timestamp <= record[1]:
          # bingo! remap the time on this (record[2] is the new time).

          # adjust the time range. we want the COMMIT_THRESHOLD from the
          # bounds of the earlier/latest commit in this group.
          record[0] = min(record[0], c_rev.timestamp - COMMIT_THRESHOLD/2)
          record[1] = max(record[1], c_rev.timestamp + COMMIT_THRESHOLD/2)

          # By default this will be the new timestamp
          new_timestamp = record[2]
          # If the new timestamp is earlier than that of our previous revision
          if record[2] < c_rev.prev_timestamp:
            desc = ("%s: Attempt to set timestamp of revision %s on file %s"
                    + " to time %s, which is before previous the time of"
                    + " revision %s (%s):")
            Log().write(LOG_WARN, desc % (warning_prefix, c_rev.rev,
                                          c_rev.cvs_path, record[2],
                                          c_rev.prev_rev,
                                          c_rev.prev_timestamp))
            # If resyncing our rev to c_rev.prev_timestamp + 1 will place
            # the timestamp of c_rev within COMMIT_THRESHOLD of the
            # attempted sync time, then sync back to c_rev.prev_timestamp
            # + 1...
            if (c_rev.prev_timestamp - record[2]) < COMMIT_THRESHOLD:
              new_timestamp = c_rev.prev_timestamp + 1
              Log().write(LOG_WARN, "%s: Time set to %s" % (warning_prefix,
                                                            new_timestamp))
            # ...otherwise, make no change
            else:
              new_timestamp = c_rev.timestamp
              Log().write(LOG_WARN, "%s: Timestamp left untouched" %
                          warning_prefix)

          # Report the adjustment that is actually applied, which is
          # not necessarily record[2] (see the branches above).
          msg = "RESYNC: '%s' (%s): old time='%s' delta=%ds" \
                % (c_rev.cvs_path, c_rev.rev, time.ctime(c_rev.timestamp),
                   new_timestamp - c_rev.timestamp)
          Log().write(LOG_VERBOSE, msg)

          c_rev.timestamp = new_timestamp

          # stop looking for hits
          break

      output.write(str(c_rev) + "\n")
  finally:
    # Make sure everything is flushed to disk before the next pass
    # reads the file, whatever happened above.
    output.close()
  Log().write(LOG_QUIET, "Done")
4186
def pass3():
  """Pass 3: sort the clean revisions file into the sorted revisions
  file, and register the sorted file with Cleanup() against pass5."""
  Log().write(LOG_QUIET, "Sorting CVS revisions...")
  clean_revs_path = temp(DATAFILE + CLEAN_REVS_SUFFIX)
  sorted_revs_path = temp(DATAFILE + SORTED_REVS_SUFFIX)
  sort_file(clean_revs_path, sorted_revs_path)
  Cleanup().register(sorted_revs_path, pass5)
  Log().write(LOG_QUIET, "Done")
4193
def pass4():
  """Pass 4: load the sorted revisions into a CVSRevisionDatabase.

  Unless this is a trunk-only conversion, every revision is also
  logged to a LastSymbolicNameDatabase, which records the last
  CVSRevision that is a source for each tag or branch.
  """
  Log().write(LOG_QUIET,
      "Copying CVS revision data from flat file to database...")
  revisions_db = CVSRevisionDatabase(DB_OPEN_NEW)

  if Ctx().trunk_only:
    # A do-nothing stand-in, so that the loop below need not test
    # Ctx().trunk_only for every single revision.
    class _NullSymbolNameDB:
      def log_revision(*args):
        pass
      def create_database(*args):
        pass
    symbol_names_db = _NullSymbolNameDB()
  else:
    Log().write(LOG_QUIET,
        "and finding last CVS revisions for all symbolic names...")
    symbol_names_db = LastSymbolicNameDatabase(DB_OPEN_NEW)

  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for line in sorted_revs:
    # line[:-1] drops the last character of the line (its newline).
    c_rev = CVSRevision(Ctx(), line[:-1])
    revisions_db.log_revision(c_rev)
    symbol_names_db.log_revision(c_rev)
    StatsKeeper().record_c_rev(c_rev)

  symbol_names_db.create_database()
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4224
def pass5():
  """
  Pass 5: build the SVNCommit <-> CVSRevision mapping databases by
  feeding the sorted revisions to a CVSRevisionAggregator.
  CVSCommit._commit also calls SymbolingsLogger to register
  CVSRevisions that represent an opening or closing for a path on a
  branch or tag.  See SymbolingsLogger for more details.
  """
  Log().write(LOG_QUIET, "Mapping CVS revisions to Subversion commits...")

  aggregator = CVSRevisionAggregator()
  sorted_revs = fileinput.FileInput(temp(DATAFILE + SORTED_REVS_SUFFIX))
  for line in sorted_revs:
    c_rev = CVSRevision(Ctx(), line[:-1])
    # In a trunk-only conversion, revisions that belong to a branch
    # are not processed.
    if Ctx().trunk_only and c_rev.branch_name is not None:
      continue
    aggregator.process_revision(c_rev)
  aggregator.flush()

  svn_rev_count = SVNCommit.revnum - 1
  StatsKeeper().set_svn_rev_count(svn_rev_count)
  StatsKeeper().archive()
  Log().write(LOG_QUIET, "Done")
4244
def pass6():
  """Pass 6: sort the symbol openings/closings file.  Nothing is
  sorted in a trunk-only conversion."""
  Log().write(LOG_QUIET, "Sorting symbolic name source revisions...")

  if not Ctx().trunk_only:
    unsorted_path = temp(SYMBOL_OPENINGS_CLOSINGS)
    sorted_path = temp(SYMBOL_OPENINGS_CLOSINGS_SORTED)
    sort_file(unsorted_path, sorted_path)
    Cleanup().register(sorted_path, pass8)
  Log().write(LOG_QUIET, "Done")
4253
def pass7():
  """Pass 7: record, for every symbolic name, the offset of its first
  line in the sorted openings/closings file.  Nothing is done in a
  trunk-only conversion."""
  Log().write(LOG_QUIET, "Determining offsets for all symbolic names...")

  def generate_offsets_for_symbolings():
    """This function iterates through all the lines in
    SYMBOL_OPENINGS_CLOSINGS_SORTED, writing out a file mapping
    SYMBOLIC_NAME to the file offset in SYMBOL_OPENINGS_CLOSINGS_SORTED
    where SYMBOLIC_NAME is first encountered.  This will allow us to
    seek to the various offsets in the file and sequentially read only
    the openings and closings that we need."""

    ###PERF This is a fine example of a db that can be in-memory and
    #just flushed to disk when we're done.  Later, it can just be sucked
    #back into memory.
    offsets_db = Database(temp(SYMBOL_OFFSETS_DB), DB_OPEN_NEW)
    Cleanup().register(temp(SYMBOL_OFFSETS_DB), pass8)

    symbolings = open(temp(SYMBOL_OPENINGS_CLOSINGS_SORTED), 'r')
    try:
      old_sym = ""
      # Use an explicit tell()/readline() loop: the offset of each
      # line has to be captured before the line is read.
      while 1:
        fpos = symbolings.tell()
        line = symbolings.readline()
        if not line:
          break
        # Only the symbol name is needed here; unpacking all three
        # fields also rejects lines that do not have the expected shape.
        sym, svn_revnum, cvs_rev_key = line.split(" ", 2)
        if sym != old_sym:
          Log().write(LOG_VERBOSE, " ", sym)
          old_sym = sym
          offsets_db[sym] = fpos
    finally:
      symbolings.close()

  if not Ctx().trunk_only:
    generate_offsets_for_symbolings()
  Log().write(LOG_QUIET, "Done.")
4287
def pass8():
  """Replay every stored SVNCommit, in revision order, into the
  SVNRepositoryMirror and its output delegates.

  With a target repository (Ctx().target) a RepositoryDelegate is
  attached, otherwise a DumpfileDelegate; neither is attached on a
  dry run.  A StdoutDelegate is always attached."""
  repos = SVNRepositoryMirror()
  persistence_manager = PersistenceManager(DB_OPEN_READ)

  if Ctx().target:
    delegate_class = RepositoryDelegate
    banner = "Starting Subversion Repository."
  else:
    delegate_class = DumpfileDelegate
    banner = "Starting Subversion Dumpfile."

  if not Ctx().dry_run:
    repos.add_delegate(delegate_class())
  Log().write(LOG_QUIET, banner)

  repos.add_delegate(StdoutDelegate(StatsKeeper().svn_rev_count()))

  # Repository initialization is revision 1, so replay starts at 2 and
  # continues until the persistence manager has no further commit.
  revnum = 2
  svn_commit = persistence_manager.get_svn_commit(revnum)
  while svn_commit:
    repos.commit(svn_commit)
    revnum = revnum + 1
    svn_commit = persistence_manager.get_svn_commit(revnum)

  repos.finish()
4312
# The conversion passes, in execution order.  convert() runs pass
# number N by calling _passes[N - 1], and usage() and main() use the
# length of this list as the number of the last pass.
_passes = [
  pass1,
  pass2,
  pass3,
  pass4,
  pass5,
  pass6,
  pass7,
  pass8,
  ]
4323
4324
class Ctx:
  """Session state for this run of cvs2svn.  For example, run-time
  options are stored here.  This class is a Borg, see
  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66531.

  Every instance shares one attribute dictionary, so a value set
  through any Ctx() is visible through all the others.  The defaults
  below are installed only by the first instantiation, while that
  shared dictionary is still empty.
  """
  __shared_state = { }
  def __init__(self):
    self.__dict__ = self.__shared_state
    if not self.__dict__:
      # First instantiation: populate the shared state with defaults.
      self.__dict__.update({
        'cvsroot': None,
        'target': None,
        'dumpfile': DUMPFILE,
        'tmpdir': '.',
        'verbose': 0,
        'quiet': 0,
        'prune': 1,
        'existing_svnrepos': 0,
        'dump_only': 0,
        'dry_run': 0,
        'trunk_only': 0,
        'trunk_base': "trunk",
        'tags_base': "tags",
        'branches_base': "branches",
        'encoding': "ascii",
        'mime_types_file': None,
        'mime_mapper': None,
        'no_default_eol': 0,
        'eol_from_mime_type': 0,
        'keywords_off': 0,
        'use_cvs': None,
        'svnadmin': "svnadmin",
        'username': None,
        'print_help': 0,
        'skip_cleanup': 0,
        'cvs_revnums': 0,
        'bdb_txn_nosync': 0,
        'fs_type': None,
        'forced_branches': [],
        'forced_tags': [],
        'excludes': [],
        'symbol_transforms': [],
        })
4368
class MimeMapper:
  """A class that provides mappings from file names to MIME types.
  Note that we should really be using Python's 'mimetypes' module.
  See http://cvs2svn.tigris.org/servlets/ReadMsg?list=dev&msgNo=266
  for more.

  self.mappings maps a file extension (without the dot), or a whole
  extensionless base name, to a MIME type string."""

  def __init__(self):
    self.mappings = { }

  def set_mime_types_file(self, mime_types_file):
    """Add the mappings found in MIME_TYPES_FILE to self.mappings.

    Each useful line looks something like

       text/plain c h cpp

    i.e. a MIME type followed by one or more extensions.  Lines
    starting with '#' and lines with fewer than two fields are
    ignored.  If an extension is given two different types, a warning
    is written to stderr and the later type wins."""
    # Use a private FileInput rather than the module-global
    # fileinput.input() state, and always close it, so that a failure
    # part-way through cannot leave a global input stream active.
    mime_types = fileinput.FileInput(mime_types_file)
    try:
      for line in mime_types:
        if line.startswith("#"):
          continue

        fields = line.split()
        if len(fields) < 2:
          continue
        mime_type = fields[0]
        for ext in fields[1:]:
          previous = self.mappings.get(ext)
          if previous is not None and previous != mime_type:
            sys.stderr.write("%s: ambiguous MIME mapping for *.%s (%s or %s)\n" \
                             % (warning_prefix, ext, previous, mime_type))
          self.mappings[ext] = mime_type
    finally:
      mime_types.close()


  def get_type_from_filename(self, filename):
    """Return the MIME type mapped to FILENAME, or None if there is
    no mapping for it.  The lookup is case-sensitive."""
    basename, extension = os.path.splitext(os.path.basename(filename))

    # Extension includes the dot, so strip it (will leave extension
    # empty if filename ends with a dot, which is ok):
    extension = extension[1:]

    # If there is no extension (or the file ends with a period), use
    # the base name for mapping.  This allows us to set mappings for
    # files such as README or Makefile:
    if not extension:
      extension = basename
    return self.mappings.get(extension)
4411
4412
def convert(start_pass, end_pass):
  """Convert a CVS repository to an SVN repository.

  Runs passes START_PASS through END_PASS (1-based, inclusive) from
  _passes, recording the duration of each with StatsKeeper, then logs
  the collected statistics and prints the timings."""

  cleanup = Cleanup()
  # times[n] holds the moment pass n finished; times[start_pass - 1]
  # holds the moment the whole run began.
  times = [ None ] * (end_pass + 1)
  times[start_pass - 1] = time.time()
  StatsKeeper().set_start_time(time.time())

  for pass_index in range(start_pass - 1, end_pass):
    pass_number = pass_index + 1
    current_pass = _passes[pass_index]

    Log().write(LOG_QUIET, '----- pass %d -----' % pass_number)
    current_pass()
    times[pass_number] = time.time()
    StatsKeeper().log_duration_for_pass(
      times[pass_number] - times[pass_index], pass_number)

    # Dispose of items in Ctx() not intended to live past the end of the
    # pass (identified by exactly one leading underscore).
    for attr in dir(Ctx()):
      if len(attr) <= 2:
        continue
      if attr[0] != '_' or attr[1] == '_':
        continue
      if attr[:6] == "_Ctx__":
        # Name-mangled private attribute of Ctx itself; keep it.
        continue
      delattr(Ctx(), attr)

    if not Ctx().skip_cleanup:
      cleanup.cleanup(current_pass)
    StatsKeeper().set_end_time(time.time())

  Log().write(LOG_QUIET, StatsKeeper())
  if end_pass < 4:
    Log().write(LOG_QUIET,
                '(These are unaltered CVS repository stats and do not\n'
                ' reflect tags or branches excluded via --exclude)\n')
  print(StatsKeeper().timings())
4440
4441
def usage():
  """Print the command-line synopsis and the option summary to stdout.

  Defaults shown for --trunk, --branches, --tags and --encoding are
  the values currently held by Ctx()."""
  ctx = Ctx()
  lines = [
    'USAGE: %s [-v] [-s svn-repos-path] [-p pass] cvs-repos-path'
      % os.path.basename(sys.argv[0]),
    '  --help, -h           print this usage message and exit with success',
    '  --version            print the version number',
    '  -q                   quiet',
    '  -v                   verbose',
    '  -s PATH              path for SVN repos',
    '  -p START[:END]       start at pass START, end at pass END of %d'
      % len(_passes),
    '                       If only START is given, run only pass START',
    '                       (implicitly enables --skip-cleanup)',
    '  --existing-svnrepos  load into existing SVN repository',
    '  --dumpfile=PATH      name of intermediate svn dumpfile',
    '  --tmpdir=PATH        directory to use for tmp data (default to cwd)',
    '  --profile            profile with \'hotshot\' (into file cvs2svn.hotshot)',
    '  --dry-run            do not create a repository or a dumpfile;',
    '                       just print what would happen.',
    '  --use-cvs            use CVS instead of RCS \'co\' to extract data',
    '                       (only use this if having problems with RCS)',
    '  --svnadmin=PATH      path to the svnadmin program',
    '  --trunk-only         convert only trunk commits, not tags nor branches',
    '  --trunk=PATH         path for trunk (default: %s)'
      % ctx.trunk_base,
    '  --branches=PATH      path for branches (default: %s)'
      % ctx.branches_base,
    '  --tags=PATH          path for tags (default: %s)'
      % ctx.tags_base,
    '  --no-prune           don\'t prune empty directories',
    '  --dump-only          just produce a dumpfile, don\'t commit to a repos',
    '  --encoding=ENC       encoding of log messages in CVS repos (default: %s)'
      % ctx.encoding,
    '  --force-branch=NAME  force NAME to be a branch',
    '  --force-tag=NAME     force NAME to be a tag',
    '  --exclude=REGEXP     exclude branches and tags matching REGEXP',
    '  --symbol-transform=P:S transform symbol names from P to S where P and S',
    '                       use Python regexp and reference syntax respectively',
    '  --username=NAME      username for cvs2svn-synthesized commits',
    '  --skip-cleanup       prevent the deletion of intermediate files',
    '  --bdb-txn-nosync     pass --bdb-txn-nosync to "svnadmin create"',
    '  --fs-type=TYPE       pass --fs-type=TYPE to "svnadmin create"',
    '  --cvs-revnums        record CVS revision numbers as file properties',
    '  --mime-types=FILE    specify an apache-style mime.types file for\n'
    '                       setting svn:mime-type',
    '  --eol-from-mime-type set svn:eol-style by mime type (only with --mime-types)',
    '  --no-default-eol     don\'t set svn:eol-style by CVS defaults',
    '  --keywords-off       don\'t set svn:keywords on any files (by default,',
    '                       cvs2svn sets svn:keywords on non-binary files to',
    '                       "%s")' % SVN_KEYWORDS_VALUE,
    ]
  # One print of the joined text emits the same bytes as printing each
  # line separately.
  print('\n'.join(lines))
4490
4491 def main():
4492   # Convenience var, so we don't have to keep instantiating this Borg.
4493   ctx = Ctx()
4494
4495   profiling = None
4496   start_pass = 1
4497   end_pass = len(_passes)
4498
4499   try:
4500     opts, args = getopt.getopt(sys.argv[1:], 'p:s:qvh',
4501                                [ "help", "create", "trunk=",
4502                                  "username=", "existing-svnrepos",
4503                                  "branches=", "tags=", "encoding=",
4504                                  "force-branch=", "force-tag=", "exclude=",
4505                                  "use-cvs", "mime-types=",
4506                                  "eol-from-mime-type", "no-default-eol",
4507                                  "trunk-only", "no-prune", "dry-run",
4508                                  "dump-only", "dumpfile=", "tmpdir=",
4509                                  "svnadmin=", "skip-cleanup", "cvs-revnums",
4510                                  "bdb-txn-nosync", "fs-type=",
4511                                  "version", "profile",
4512                                  "keywords-off", "symbol-transform="])
4513   except getopt.GetoptError, e:
4514     sys.stderr.write(error_prefix + ': ' + str(e) + '\n\n')
4515     usage()
4516     sys.exit(1)
4517
4518   for opt, value in opts:
4519     if opt == '--version':
4520         print '%s version %s' % (os.path.basename(sys.argv[0]), VERSION)
4521         sys.exit(0)
4522     elif opt == '-p':
4523       # Don't cleanup if we're doing incrementals.
4524       ctx.skip_cleanup = 1
4525       if value.find(':') > 0:
4526         start_pass, end_pass = map(int, value.split(':'))
4527       else:
4528         end_pass = start_pass = int(value)
4529       if start_pass > len(_passes) or start_pass < 1:
4530         print '%s: illegal value (%d) for starting pass. '\
4531               'must be 1 through %d.' % (error_prefix, int(start_pass),
4532                                          len(_passes))
4533         sys.exit(1)
4534       if end_pass < start_pass or end_pass > len(_passes):
4535         print '%s: illegal value (%d) for ending pass. ' \
4536               'must be %d through %d.' % (error_prefix, int(end_pass),
4537                                           int(start_pass), len(_passes))
4538         sys.exit(1)
4539     elif (opt == '--help') or (opt == '-h'):
4540       ctx.print_help = 1
4541     elif opt == '-v':
4542       Log().log_level = LOG_VERBOSE
4543       ctx.verbose = 1
4544     elif opt == '-q':
4545       Log().log_level = LOG_QUIET
4546       ctx.quiet = 1
4547     elif opt == '-s':
4548       ctx.target = value
4549     elif opt == '--existing-svnrepos':
4550       ctx.existing_svnrepos = 1
4551     elif opt == '--dumpfile':
4552       ctx.dumpfile = value
4553     elif opt == '--tmpdir':
4554       ctx.tmpdir = value
4555     elif opt == '--use-cvs':
4556       ctx.use_cvs = 1
4557     elif opt == '--svnadmin':
4558       ctx.svnadmin = value
4559     elif opt == '--trunk-only':
4560       ctx.trunk_only = 1
4561     elif opt == '--trunk':
4562       if not value:
4563         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4564       ctx.trunk_base = value
4565     elif opt == '--branches':
4566       if not value:
4567         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4568       ctx.branches_base = value
4569     elif opt == '--tags':
4570       if not value:
4571         sys.exit("%s: cannot pass an empty path to %s." % (error_prefix, opt))
4572       ctx.tags_base = value
4573     elif opt == '--no-prune':
4574       ctx.prune = None
4575     elif opt == '--dump-only':
4576       ctx.dump_only = 1
4577     elif opt == '--dry-run':
4578       ctx.dry_run = 1
4579     elif opt == '--encoding':
4580       ctx.encoding = value
4581     elif opt == '--force-branch':
4582       ctx.forced_branches.append(value)
4583     elif opt == '--force-tag':
4584       ctx.forced_tags.append(value)
4585     elif opt == '--exclude':
4586       try:
4587         ctx.excludes.append(re.compile('^' + value + '$'))
4588       except re.error, e:
4589         sys.exit(error_prefix + ": '%s' is not a valid regexp.\n" % (value))
4590     elif opt == '--mime-types':
4591       ctx.mime_types_file = value
4592     elif opt == '--eol-from-mime-type':
4593       ctx.eol_from_mime_type = 1
4594     elif opt == '--no-default-eol':
4595       ctx.no_default_eol = 1
4596     elif opt == '--keywords-off':
4597       ctx.keywords_off = 1
4598     elif opt == '--username':
4599       ctx.username = value
4600     elif opt == '--skip-cleanup':
4601       ctx.skip_cleanup = 1
4602     elif opt == '--cvs-revnums':
4603       ctx.cvs_revnums = 1
4604     elif opt == '--bdb-txn-nosync':
4605       ctx.bdb_txn_nosync = 1
4606     elif opt == '--fs-type':
4607       ctx.fs_type = value
4608     elif opt == '--create':
4609       sys.stderr.write(warning_prefix +
4610           ': The behaviour produced by the --create option is now the '
4611           'default,\nand passing the option is deprecated.\n')
4612     elif opt == '--profile':
4613       profiling = 1
4614     elif opt == '--symbol-transform':
4615       ctx.symbol_transforms.append(value.split(":"))
4616
4617   if ctx.print_help:
4618     usage()
4619     sys.exit(0)
4620
4621   # Consistency check for options and arguments.
4622   if len(args) == 0:
4623     usage()
4624     sys.exit(1)
4625
4626   if len(args) > 1:
4627     sys.stderr.write(error_prefix +
4628                      ": must pass only one CVS repository.\n")
4629     usage()
4630     sys.exit(1)
4631
4632   ctx.cvsroot = args[0]
4633
4634   if not os.path.isdir(ctx.cvsroot):
4635     sys.stderr.write(error_prefix +
4636                      ": the given CVS repository path '%s' is not an "
4637                      "existing directory.\n" % ctx.cvsroot)
4638     sys.exit(1)
4639
4640   if ctx.use_cvs:
4641     # Ascend above the specified root if necessary, to find the cvs_repository
4642     # (a directory containing a CVSROOT directory) and the cvs_module (the
4643     # path of the conversion root within the cvs repository)
4644     # NB: cvs_module must be seperated by '/' *not* by os.sep .
4645     ctx.cvs_repository = os.path.abspath(ctx.cvsroot)
4646     prev_cvs_repository = None
4647     ctx.cvs_module = ""
4648     while prev_cvs_repository != ctx.cvs_repository:
4649       if os.path.isdir(os.path.join(ctx.cvs_repository, 'CVSROOT')):
4650         break
4651       prev_cvs_repository = ctx.cvs_repository
4652       ctx.cvs_repository, module_component = os.path.split(ctx.cvs_repository)
4653       ctx.cvs_module = module_component + "/" + ctx.cvs_module
4654     else:
4655       # Hit the root (of the drive, on Windows) without finding a CVSROOT dir.
4656       sys.stderr.write(error_prefix +
4657                        ": the path '%s' is not a CVS repository, nor a path " \
4658                        "within a CVS repository.  A CVS repository contains " \
4659                        "a CVSROOT directory within its root directory.\n" \
4660                        % ctx.cvsroot)
4661       sys.exit(1)
4662     os.environ['CVSROOT'] = ctx.cvs_repository
4663
4664   if (not ctx.target) and (not ctx.dump_only) and (not ctx.dry_run):
4665     sys.stderr.write(error_prefix +
4666                      ": must pass one of '-s' or '--dump-only'.\n")
4667     sys.exit(1)
4668
4669   def not_both(opt1val, opt1name, opt2val, opt2name):
4670     if opt1val and opt2val:
4671       sys.stderr.write(error_prefix + ": cannot pass both '%s' and '%s'.\n" \
4672                        % (opt1name, opt2name))
4673       sys.exit(1)
4674
4675   not_both(ctx.target, '-s',
4676            ctx.dump_only, '--dump-only')
4677
4678   not_both(ctx.dump_only, '--dump-only',
4679            ctx.existing_svnrepos, '--existing-svnrepos')
4680
4681   not_both(ctx.bdb_txn_nosync, '--bdb-txn-nosync',
4682            ctx.existing_svnrepos, '--existing-svnrepos')
4683
4684   not_both(ctx.dump_only, '--dump-only',
4685            ctx.bdb_txn_nosync, '--bdb-txn-nosync')
4686
4687   not_both(ctx.quiet, '-q',
4688            ctx.verbose, '-v')
4689
4690   not_both(ctx.fs_type, '--fs-type',
4691            ctx.existing_svnrepos, '--existing-svnrepos')
4692
4693   if ctx.fs_type and ctx.fs_type != 'bdb' and ctx.bdb_txn_nosync:
4694     sys.stderr.write(error_prefix +
4695                      ": cannot pass --bdb-txn-nosync with --fs-type=%s.\n" \
4696                      % ctx.fs_type)
4697     sys.exit(1)
4698
4699   if ((string.find(ctx.trunk_base, '/') > -1)
4700       or (string.find(ctx.tags_base, '/') > -1)
4701       or (string.find(ctx.branches_base, '/') > -1)):
4702     sys.stderr.write("%s: cannot pass multicomponent path to "
4703                      "--trunk, --tags, or --branches yet.\n"
4704                      "  See http://cvs2svn.tigris.org/issues/show_bug.cgi?"
4705                      "id=7 for details.\n" % error_prefix)
4706     sys.exit(1)
4707
4708   if ctx.existing_svnrepos and not os.path.isdir(ctx.target):
4709     sys.stderr.write(error_prefix +
4710                      ": the svn-repos-path '%s' is not an "
4711                      "existing directory.\n" % ctx.target)
4712     sys.exit(1)
4713
4714   if not ctx.dump_only and not ctx.existing_svnrepos \
4715      and (not ctx.dry_run) and os.path.exists(ctx.target):
4716     sys.stderr.write(error_prefix +
4717                      ": the svn-repos-path '%s' exists.\nRemove it, or pass "
4718                      "'--existing-svnrepos'.\n" % ctx.target)
4719     sys.exit(1)
4720
4721   if ctx.mime_types_file:
4722     ctx.mime_mapper = MimeMapper()
4723     ctx.mime_mapper.set_mime_types_file(ctx.mime_types_file)
4724
4725   # Make sure the tmp directory exists.  Note that we don't check if
4726   # it's empty -- we want to be able to use, for example, "." to hold
4727   # tempfiles.  But if we *did* want check if it were empty, we'd do
4728   # something like os.stat(ctx.tmpdir)[stat.ST_NLINK], of course :-).
4729   if not os.path.exists(ctx.tmpdir):
4730     os.mkdir(ctx.tmpdir)
4731   elif not os.path.isdir(ctx.tmpdir):
4732     sys.stderr.write(error_prefix +
4733        ": cvs2svn tried to use '%s' for temporary files, but that path\n"
4734        "  exists and is not a directory.  Please make it be a directory,\n"
4735        "  or specify some other directory for temporary files.\n" \
4736                      % ctx.tmpdir)
4737     sys.exit(1)
4738
4739   if ctx.use_cvs:
4740     def cvs_ok():
4741       pipe = Popen3('cvs %s --version' % Ctx().cvs_global_arguments, True)
4742       pipe.tochild.close()
4743       pipe.fromchild.read()
4744       errmsg = pipe.childerr.read()
4745       status = pipe.wait()
4746       ok = len(errmsg) == 0 and status == 0
4747       return (ok, status, errmsg)
4748
4749     ctx.cvs_global_arguments = "-q -R"
4750     ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4751     if not ok:
4752       ctx.cvs_global_arguments = "-q"
4753       ok, cvs_exitstatus, cvs_errmsg = cvs_ok()
4754
4755     if not ok:
4756       sys.stderr.write(error_prefix +
4757                        ": error executing CVS: status %s, error output:\n" \
4758                        % (cvs_exitstatus) + cvs_errmsg)
4759
4760   # But do lock the tmpdir, to avoid process clash.
4761   try:
4762     os.mkdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4763   except OSError, e:
4764     if e.errno == errno.EACCES:
4765       sys.stderr.write(error_prefix + ": Permission denied:"
4766                        + " No write access to output directory.\n")
4767       sys.exit(1)
4768     if e.errno == errno.EEXIST:
4769       sys.stderr.write(error_prefix +
4770           ": cvs2svn is using directory '%s' for temporary files, but\n"
4771           "  subdirectory '%s/cvs2svn.lock' exists, indicating that another\n"
4772           "  cvs2svn process is currently using '%s' as its temporary\n"
4773           "  workspace.  If you are certain that is not the case,\n"
4774           "  then remove the '%s/cvs2svn.lock' subdirectory.\n" \
4775                        % (ctx.tmpdir, ctx.tmpdir, ctx.tmpdir, ctx.tmpdir))
4776       sys.exit(1)
4777     raise
4778   try:
4779     if profiling:
4780       import hotshot
4781       prof = hotshot.Profile('cvs2svn.hotshot')
4782       prof.runcall(convert, start_pass, end_pass)
4783       prof.close()
4784     else:
4785       convert(start_pass, end_pass)
4786   finally:
4787     try: os.rmdir(os.path.join(ctx.tmpdir, 'cvs2svn.lock'))
4788     except: pass
4789
# Run the converter only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
  main()